├── .gitignore
├── LICENSE.txt
├── README.md
├── build.sbt
├── docker
│   ├── README.md
│   ├── beeline
│   ├── build
│   ├── files
│   │   ├── Dockerfile
│   │   ├── bootstrap.sh
│   │   ├── core-site.xml
│   │   ├── hadoop-env.sh
│   │   ├── hdfs-site.xml
│   │   ├── hive-site.xml
│   │   ├── mapred-site.xml
│   │   └── yarn-site.xml
│   ├── inspect
│   ├── login
│   ├── spark-shell
│   ├── start
│   └── stop
├── project
│   ├── build.properties
│   └── plugins.sbt
├── shaded-dependencies
│   ├── build.sbt
│   └── project
│       ├── build.properties
│       └── plugins.sbt
├── src
├── it
│ └── scala
│ │ └── com
│ │ └── qubole
│ │ └── spark
│ │ └── hiveacid
│ │ ├── LockSuite.scala
│ │ ├── MergeSuite.scala
│ │ ├── ReadSuite.scala
│ │ ├── Table.scala
│ │ ├── TestHelper.scala
│ │ ├── TestHiveClient.java
│ │ ├── TestSparkSession.scala
│ │ ├── UpdateDeleteSuite.scala
│ │ ├── WriteSuite.scala
│ │ └── streaming
│ │ ├── HiveAcidSinkLogSuite.scala
│ │ ├── HiveAcidSinkOptionsSuite.scala
│ │ ├── HiveAcidSinkSuite.scala
│ │ ├── HiveAcidStreamingFunSuite.scala
│ │ └── StreamingTestHelper.scala
├── main
│ ├── antlr4
│ │ └── com
│ │ │ └── qubole
│ │ │ └── spark
│ │ │ └── datasources
│ │ │ └── hiveacid
│ │ │ └── sql
│ │ │ └── catalyst
│ │ │ └── parser
│ │ │ └── SqlHive.g4
│ ├── resources
│ │ └── META-INF
│ │ │ └── services
│ │ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ └── scala
│ │ ├── com
│ │ └── qubole
│ │ │ ├── shaded
│ │ │ └── hadoop
│ │ │ │ └── hive
│ │ │ │ └── ql
│ │ │ │ └── io
│ │ │ │ └── orc
│ │ │ │ └── OrcAcidUtil.scala
│ │ │ └── spark
│ │ │ ├── datasources
│ │ │ └── hiveacid
│ │ │ │ └── sql
│ │ │ │ ├── HiveAnalysisException.scala
│ │ │ │ ├── catalyst
│ │ │ │ ├── parser
│ │ │ │ │ ├── AstBuilder.scala
│ │ │ │ │ └── ParseDriver.scala
│ │ │ │ └── plans
│ │ │ │ │ └── command
│ │ │ │ │ ├── DeleteCommand.scala
│ │ │ │ │ ├── MergeCommand.scala
│ │ │ │ │ └── UpdateCommand.scala
│ │ │ │ └── execution
│ │ │ │ ├── SparkAcidSqlParser.scala
│ │ │ │ └── SparkSqlAstBuilder.scala
│ │ │ └── hiveacid
│ │ │ ├── .gitignore
│ │ │ ├── AcidOperationDelegate.scala
│ │ │ ├── HiveAcidAutoConvert.scala
│ │ │ ├── HiveAcidErrors.scala
│ │ │ ├── HiveAcidOperation.scala
│ │ │ ├── HiveAcidTable.scala
│ │ │ ├── SparkAcidConf.scala
│ │ │ ├── datasource
│ │ │ ├── HiveAcidDataSource.scala
│ │ │ └── HiveAcidRelation.scala
│ │ │ ├── hive
│ │ │ ├── .gitignore
│ │ │ ├── HiveAcidMetadata.scala
│ │ │ └── HiveConverter.scala
│ │ │ ├── merge
│ │ │ ├── MergeImpl.scala
│ │ │ └── MergeWhenClause.scala
│ │ │ ├── package.scala
│ │ │ ├── rdd
│ │ │ ├── EmptyRDD.scala
│ │ │ ├── HiveAcidRDD.scala
│ │ │ └── HiveAcidUnionRDD.scala
│ │ │ ├── reader
│ │ │ ├── .gitignore
│ │ │ ├── Reader.scala
│ │ │ ├── ReaderOptions.scala
│ │ │ ├── TableReader.scala
│ │ │ └── hive
│ │ │ │ ├── HiveAcidPartitionComputer.scala
│ │ │ │ ├── HiveAcidReader.scala
│ │ │ │ ├── HiveAcidReaderOptions.scala
│ │ │ │ └── HiveAcidSearchArgument.scala
│ │ │ ├── streaming
│ │ │ ├── HiveAcidSink.scala
│ │ │ ├── HiveAcidSinkLog.scala
│ │ │ ├── HiveAcidSinkOptions.scala
│ │ │ └── HiveAcidStreamingCommitProtocol.scala
│ │ │ ├── transaction
│ │ │ ├── HiveAcidTxn.scala
│ │ │ └── HiveAcidTxnManager.scala
│ │ │ ├── util
│ │ │ ├── .gitignore
│ │ │ ├── HiveAcidKyroRegistrator.scala
│ │ │ ├── SerializableConfiguration.scala
│ │ │ ├── SerializableWritable.scala
│ │ │ └── Util.scala
│ │ │ └── writer
│ │ │ ├── TableWriter.scala
│ │ │ ├── Writer.scala
│ │ │ ├── WriterOptions.scala
│ │ │ └── hive
│ │ │ ├── HiveAcidWriter.scala
│ │ │ └── HiveAcidWriterOptions.scala
│ │ └── org
│ │ └── apache
│ │ └── spark
│ │ └── sql
│ │ ├── SqlUtils.scala
│ │ ├── catalyst
│ │ └── parser
│ │ │ └── plans
│ │ │ └── logical
│ │ │ └── MergePlan.scala
│ │ └── hive
│ │ ├── Hive3Inspectors.scala
│ │ └── HiveAcidUtils.scala
└── test
│ └── scala
│ ├── com
│ └── qubole
│ │ └── spark
│ │ └── hiveacid
│ │ ├── merge
│ │ └── MergeClauseSuite.scala
│ │ └── sql
│ │ └── catalyst
│ │ └── parser
│ │ └── MergeParserSuite.scala
│ └── org
│ └── apache
│ └── spark
│ └── sql
│ └── catalyst
│ └── parser
│ └── plans
│ └── logical
│ └── MergePlanSuite.scala
└── version.sbt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 | *.iml
4 | *.ipr
5 | *.iws
6 | .idea
7 | out
8 | .cache/
9 | .history/
10 | .lib/
11 | dist/*
12 | target/
13 | bin/
14 | libexec/
15 | lib_managed/
16 | src_managed/
17 | project/boot/
18 | project/plugins/project/
19 | logs/
20 | project/*-shim.sbt
21 | project/project/
22 | project/target/
23 | target/
24 | .scala_dependencies
25 | .worksheet
26 | shaded_dependencies
27 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | name := "spark-acid"
21 |
22 | organization:= "com.qubole"
23 |
24 | /*******************
25 | * Scala settings
26 | */
27 |
28 | crossScalaVersions := Seq("2.11.12")
29 |
30 | scalaVersion := crossScalaVersions.value.head
31 |
32 | scalacOptions ++= Seq(
33 | "-Xlint",
34 | "-Xfatal-warnings",
35 | "-deprecation",
36 | "-unchecked",
37 | "-optimise",
38 | "-Yinline-warnings"
39 | )
40 |
41 | scalacOptions in (Compile, doc) ++= Seq(
42 | "-no-link-warnings" // Suppresses problems with Scaladoc @throws links
43 | )
44 |
45 | /**************************
46 | * Spark package settings
47 | */
48 | sparkVersion := sys.props.getOrElse("spark.version", "2.4.3")
49 |
50 | spIncludeMaven := true
51 |
52 | spIgnoreProvided := true
53 |
54 |
55 | /************************
56 | * Library Dependencies
57 | */
58 |
59 | libraryDependencies ++= Seq(
60 | // Adding test classifier seems to break transitive resolution of the core dependencies
61 | "org.apache.spark" %% "spark-hive" % sparkVersion.value % "provided" excludeAll(
62 | ExclusionRule("org.apache", "hadoop-common"),
63 | ExclusionRule("org.apache", "hadoop-hdfs")),
64 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided" excludeAll(
65 | ExclusionRule("org.apache", "hadoop-common"),
66 | ExclusionRule("org.apache", "hadoop-hdfs")),
67 | "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided" excludeAll(
68 | ExclusionRule("org.apache", "hadoop-common"),
69 | ExclusionRule("org.apache", "hadoop-hdfs")),
70 | "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided" excludeAll(
71 | ExclusionRule("org.apache", "hadoop-common"),
72 | ExclusionRule("org.apache", "hadoop-hdfs")),
73 | "org.apache.hadoop" % "hadoop-common" % "2.8.1" % "provided",
74 | "org.apache.hadoop" % "hadoop-hdfs" % "2.8.1" % "provided",
75 | "org.apache.commons" % "commons-lang3" % "3.3.5" % "provided",
76 | // antlr-runtime
77 | "org.antlr" % "antlr4-runtime" % "4.7.2" % "provided"
78 | )
79 |
80 | lazy val scalatest = "org.scalatest" %% "scalatest" % "3.0.5"
81 |
82 | // Dependencies for Test
83 | libraryDependencies ++= Seq(
84 | "org.apache.hadoop" % "hadoop-common" % "2.8.1" % "provided",
85 | "org.apache.hadoop" % "hadoop-hdfs" % "2.8.1" % "provided",
86 | "org.apache.commons" % "commons-lang3" % "3.3.5" % "provided",
87 | // Dependencies for tests
88 | //
89 | "org.scalatest" %% "scalatest" % "3.0.5" % "test",
90 | "junit" % "junit" % "4.12" % "it,test",
91 | "com.novocode" % "junit-interface" % "0.11" % "it,test",
92 | "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "test" classifier "tests",
93 | "org.apache.spark" %% "spark-core" % sparkVersion.value % "test" classifier "tests",
94 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % "test" classifier "tests"
95 | )
96 |
97 | // Shaded jar dependency
98 | libraryDependencies ++= Seq(
99 | // intransitive() because we don't want to include any transitive dependencies of shaded-dependencies jar in main jar
100 | // ideally all such dependencies should be shaded inside shaded-dependencies jar
101 | "com.qubole" %% "spark-acid-shaded-dependencies" % sys.props.getOrElse("package.version", "0.1") intransitive()
102 | )
103 |
104 | /**************************************
105 |  * Remove Shaded Dependency from POM
106 | */
107 |
108 | import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _}
109 | import scala.xml.transform.{RewriteRule, RuleTransformer}
110 |
111 | pomPostProcess := { (node: XmlNode) =>
112 | new RuleTransformer(new RewriteRule {
113 | override def transform(node: XmlNode): XmlNodeSeq = node match {
114 | case e: Elem if e.label == "dependency" && e.child.filter(_.label == "groupId").text.mkString == "com.qubole" =>
115 | val organization = e.child.filter(_.label == "groupId").flatMap(_.text).mkString
116 | val artifact = e.child.filter(_.label == "artifactId").flatMap(_.text).mkString
117 | val version = e.child.filter(_.label == "version").flatMap(_.text).mkString
118 | Comment(s"dependency $organization#$artifact;$version has been omitted")
119 | case _ => node
120 | }
121 | }).transform(node).head
122 | }
123 |
124 | excludeDependencies ++= Seq (
125 | // hive
126 | "org.apache.hive" % "hive-exec",
127 | "org.apache.hive" % "hive-metastore",
128 | "org.apache.hive" % "hive-jdbc",
129 | "org.apache.hive" % "hive-service",
130 | "org.apache.hive" % "hive-serde",
131 | "org.apache.hive" % "hive-common",
132 |
133 | // orc
134 | "org.apache.orc" % "orc-core",
135 | "org.apache.orc" % "orc-mapreduce",
136 |
137 | "org.slf4j" % "slf4j-api"
138 | )
139 |
140 | // do not run test at assembly
141 | test in assembly := {}
142 |
143 | // Spark Package Section
144 | spName := "qubole/spark-acid"
145 |
146 | spShade := true
147 |
148 | spAppendScalaVersion := true
149 |
150 | credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials")
151 |
152 | licenses += "Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")
153 |
154 | pomExtra :=
155 |   <url>https://github.com/qubole/spark-acid</url>
156 |   <scm>
157 |     <url>git@github.com:qubole/spark-acid.git</url>
158 |     <connection>scm:git:git@github.com:qubole/spark-acid.git</connection>
159 |   </scm>
160 |   <developers>
161 |     <developer>
162 |       <id>amoghmargoor</id>
163 |       <name>Amogh Margoor</name>
164 |       <url>https://github.com/amoghmargoor</url>
165 |     </developer>
166 |     <developer>
167 |       <id>citrusraj</id>
168 |       <name>Rajkumar Iyer</name>
169 |       <url>https://github.com/citrusraj</url>
170 |     </developer>
171 |     <developer>
172 |       <id>somani</id>
173 |       <name>Abhishek Somani</name>
174 |       <url>https://github.com/somani</url>
175 |     </developer>
176 |     <developer>
177 |       <id>prakharjain09</id>
178 |       <name>Prakhar Jain</name>
179 |       <url>https://github.com/prakharjain09</url>
180 |     </developer>
181 |     <developer>
182 |       <id>sourabh912</id>
183 |       <name>Sourabh Goyal</name>
184 |       <url>https://github.com/sourabh912</url>
185 |     </developer>
186 |   </developers>
187 |
188 |
189 | publishMavenStyle := true
190 |
191 | bintrayReleaseOnPublish := false
192 |
193 | import ReleaseTransformations._
194 |
195 | // Add publishing to spark packages as another step.
196 | releaseProcess := Seq[ReleaseStep](
197 | checkSnapshotDependencies,
198 | inquireVersions,
199 | setReleaseVersion,
200 | commitReleaseVersion,
201 | tagRelease,
202 | pushChanges,
203 | releaseStepTask(spDist),
204 | releaseStepTask(spPublish)
205 | )
206 |
207 | /**
208 | * Antlr settings
209 | */
210 | antlr4Settings
211 | antlr4PackageName in Antlr4 := Some("com.qubole.spark.datasources.hiveacid.sql.catalyst.parser")
212 | antlr4GenListener in Antlr4 := true
213 | antlr4GenVisitor in Antlr4 := true
214 | antlr4Version := "4.7.2"
215 |
216 |
217 | /*******************
218 | * Test settings
219 | */
220 |
221 | parallelExecution in IntegrationTest := false
222 |
223 | // do not run test at assembly
224 | test in assembly := {}
225 |
226 | // do not add scala in fat jar
227 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
228 |
229 | //Integration test
230 | lazy val root = (project in file("."))
231 | .configs(IntegrationTest)
232 | .settings(
233 | Defaults.itSettings,
234 | libraryDependencies += scalatest % "it"
235 | )
236 |
237 | // exclude antlr classes from assembly since those
238 | // are available in spark at runtime
239 | // any other classes to be excluded from assembly
240 | // should be added here
241 | assemblyExcludedJars in assembly := {
242 | val cp = (fullClasspath in assembly).value
243 | cp filter {_.data.getName.contains("antlr")}
244 | }
245 |
246 | /***********************
247 | * Release settings
248 | */
249 |
250 | publishMavenStyle := true
251 |
252 | bintrayReleaseOnPublish := false
253 |
254 | import ReleaseTransformations._
255 |
256 | // Add publishing to spark packages as another step.
257 | releaseProcess := Seq[ReleaseStep](
258 | checkSnapshotDependencies,
259 | inquireVersions,
260 | setReleaseVersion,
261 | commitReleaseVersion,
262 | tagRelease,
263 | pushChanges
264 | )
265 |
--------------------------------------------------------------------------------
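The `pomPostProcess` rule in the build.sbt above rewrites the generated POM so that the `com.qubole` shaded-dependencies artifact (which is folded into the fat jar) never appears as a dependency of the published package. As a rough illustration, here is a self-contained sketch of the same kind of `RewriteRule` applied outside sbt; the sample POM fragment and object name are invented for the example, and the scala-xml module is assumed to be on the classpath.

```scala
import scala.xml.{Comment, Elem, Node, NodeSeq}
import scala.xml.transform.{RewriteRule, RuleTransformer}

object PomPostProcessSketch extends App {
  // Hypothetical POM fragment: one shaded-dependencies entry and one ordinary dependency.
  val samplePom: Node =
    <project>
      <dependencies>
        <dependency>
          <groupId>com.qubole</groupId>
          <artifactId>spark-acid-shaded-dependencies_2.11</artifactId>
          <version>0.1</version>
        </dependency>
        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-sql_2.11</artifactId>
          <version>2.4.3</version>
        </dependency>
      </dependencies>
    </project>

  // Same shape as the rule in build.sbt: replace any com.qubole <dependency> with a comment.
  val dropShaded = new RuleTransformer(new RewriteRule {
    override def transform(node: Node): NodeSeq = node match {
      case e: Elem if e.label == "dependency" &&
          e.child.filter(_.label == "groupId").text == "com.qubole" =>
        Comment(" dependency on com.qubole shaded-dependencies has been omitted ")
      case other => other
    }
  })

  // Prints the rewritten POM; only the org.apache.spark dependency remains as an element.
  println(dropShaded.transform(samplePom).head)
}
```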
/docker/README.md:
--------------------------------------------------------------------------------
1 | A pseudo-distributed Hadoop image for testing the Spark ACID datasource, based on:
2 | 1. CentOS 6
3 | 2. [Hadoop 3.1.1](https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz)
4 | 3. [Hive 3.1.1](http://mirrors.estointernet.in/apache/hive/hive-3.1.1/apache-hive-3.1.1-bin.tar.gz)
5 | 4. [MySQL 5.6.44](http://repo.mysql.com/mysql-community-release-el6-5.noarch.rpm)
6 |
7 | # Setup
8 |
9 | Refer to [Install Docker](https://docs.docker.com/v17.12/install/) to install Docker.
10 |
11 | # Build
12 |
13 | To build the Docker image:
14 | ```bash
15 | ./build
16 | ```
17 |
18 | # Start
19 |
20 | _NB: Configure Docker to run with at least 4 GB of memory. On macOS this can be configured in Docker Desktop._
21 |
22 | To start the Docker container:
23 | ```bash
24 | ./start
25 | ```
26 |
27 | # Stop
28 |
29 | To stop and remove the container:
30 | ```bash
31 | ./stop
32 | ```
33 |
34 |
--------------------------------------------------------------------------------
/docker/beeline:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | name="spark-hiveacid-test-container"
4 |
5 | docker exec -it $name /bin/bash -c "\
6 | . ~/.bashrc; \
7 | export HADOOP_HOME=/hadoop; \
8 | hive/bin/beeline -n root -p root -u jdbc:hive2://0.0.0.0:10001/default"
9 |
--------------------------------------------------------------------------------
/docker/build:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | docker build -t centos6/spark-hadoop3-hive3 files/.
3 |
--------------------------------------------------------------------------------
/docker/files/Dockerfile:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | FROM centos:6
14 | MAINTAINER rajkumar@qubole.com
15 |
16 |
17 | RUN yum -y update
18 | RUN yum -y install epel-release
19 |
20 |
21 | RUN yum -y install java-1.8.0-openjdk-devel java-1.8.0-openjdk
22 | RUN ln -s /usr/lib/jvm//java-1.8.0-openjdk-amd64/ /usr/lib/jvm/java-1.8.0
23 | RUN ln -s /usr/lib/jvm//java-1.7.0-openjdk-amd64/ /usr/lib/jvm/java-1.7.0
24 |
25 | #RUN yum -y install vim
26 | RUN yum -y install wget tar sudo rsync
27 |
28 | RUN yum -y install initscripts httpd
29 |
30 | RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
31 | RUN tar -xvzf hadoop-3.1.1.tar.gz
32 | RUN ln -sf /hadoop-3.1.1 /hadoop
33 |
34 | RUN wget https://archive.apache.org/dist/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz
35 | RUN tar -xvzf apache-hive-3.1.2-bin.tar.gz
36 | RUN ln -sf /apache-hive-3.1.2-bin /hive
37 |
38 | RUN yum -y install \
39 | mysql-server mysql-connector-java \
40 | && yum -y clean all && rm -rf /tmp/* /var/tmp/* \
41 | && ln -s /usr/share/java/mysql-connector-java.jar apache-hive-3.1.2-bin/lib/mysql-connector-java.jar
42 |
43 | # Setup sock proxy
44 | RUN yum install -y openssh openssh-clients openssh-server
45 |
46 | # passwordless ssh
47 | RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
48 | RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys
49 |
50 | RUN chmod 755 /root && chmod 700 /root/.ssh
51 | RUN passwd --unlock root
52 |
53 | RUN yum install -y vim mlocate unzip
54 |
55 | RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.0/hadoop-2.7.0.tar.gz
56 | RUN tar -xvzf hadoop-2.7.0.tar.gz
57 |
58 |
59 | # Copy configuration files
60 | RUN mkdir /conf
61 | COPY core-site.xml /conf/core-site.xml
62 | COPY hdfs-site.xml /conf/hdfs-site.xml
63 | COPY hadoop-env.sh /conf/hadoop-env.sh
64 | COPY yarn-site.xml /conf/yarn-site.xml
65 |
66 | COPY mapred-site.xml /conf/mapred-site.xml
67 | COPY hive-site.xml /conf/hive-site.xml
68 | COPY bootstrap.sh /bootstrap.sh
69 |
70 | # HDFS ports
71 | EXPOSE 1004 1006 8020 9866 9867 9870 9864 50470 9000
72 |
73 | # YARN ports
74 | EXPOSE 8030 8031 8032 8033 8040 8041 8042 8088 10020 19888
75 |
76 | # HIVE ports
77 | EXPOSE 9083 10000
78 |
79 | # SOCKS port
80 | EXPOSE 1180
81 |
82 | # mysql expose
83 | EXPOSE 3306
84 |
85 | # HDFS datanode
86 | EXPOSE 9866
87 |
--------------------------------------------------------------------------------
/docker/files/bootstrap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | eg='\033[0;32m'
4 | enc='\033[0m'
5 | echoe () {
6 | OIFS=${IFS}
7 | IFS='%'
8 | echo -e $@
9 | IFS=${OIFS}
10 | }
11 |
12 | gprn() {
13 | echoe "${eg} >> ${1}${enc}"
14 | }
15 |
16 |
17 | ## Setup ENV variables
18 |
19 | export JAVA_HOME="/usr/lib/jvm/java-openjdk"
20 |
21 | export HDFS_NAMENODE_USER="root"
22 | export HDFS_SECONDARYNAMENODE_USER="root"
23 | export HDFS_DATANODE_USER="root"
24 | export YARN_RESOURCEMANAGER_USER="root"
25 | export YARN_NODEMANAGER_USER="root"
26 |
27 | export HADOOP_HOME="/hadoop"
28 | export HADOOP_ROOT_LOGGER=DEBUG
29 | export HADOOP_COMMON_LIB_NATIVE_DIR="/hadoop/lib/native"
30 |
31 | ## Add it to bashrc for starting hadoop
32 | echo 'export JAVA_HOME="/usr/lib/jvm/java-openjdk"' >> ~/.bashrc
33 | echo 'export HADOOP_HOME="/hadoop"' >> ~/.bashrc
34 |
35 |
36 | rm /hadoop
37 | ln -sf /hadoop-3.1.1 /hadoop
38 |
39 | cp /conf/core-site.xml /hadoop/etc/hadoop
40 | cp /conf/hdfs-site.xml /hadoop/etc/hadoop
41 | cp /conf/hadoop-env.sh /hadoop/etc/hadoop
42 | cp /conf/mapred-site.xml /hadoop/etc/hadoop
43 | cp /conf/yarn-site.xml /hadoop/etc/hadoop
44 | cp /conf/hive-site.xml /hive/conf/
45 |
46 |
47 | gprn "set up mysql"
48 | service mysqld start
49 |
50 | # Set root password
51 | mysql -uroot -e "set password = PASSWORD('root');"
52 | mysql -uroot -e "grant all privileges on *.* to 'root'@'%' identified by 'root';"
53 | service sshd start
54 |
55 | gprn "start yarn"
56 | hadoop/sbin/start-yarn.sh &
57 | sleep 5
58 |
59 | gprn "Formatting name node"
60 | hadoop/bin/hdfs namenode -format
61 |
62 | gprn "Start hdfs"
63 | hadoop/sbin/start-dfs.sh
64 |
65 | jps
66 |
67 | mkdir -p /hive/warehouse
68 |
69 |
70 | gprn "Set up metastore DB"
71 | hive/bin/schematool -dbType mysql -initSchemaTo 3.1.0
72 |
73 | gprn "Start HMS server"
74 | hive/bin/hive --service metastore -p 10000 &
75 |
76 | gprn "Sleep and wait for HMS to be up and running"
77 | sleep 20
78 |
79 | gprn "Start HiveServer2"
80 | hive/bin/hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10001 --hiveconf hive.execution.engine=mr
81 |
--------------------------------------------------------------------------------
/docker/files/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!-- Apache License, Version 2.0 header -->
36 | <configuration>
37 |
38 |   <property>
39 |     <name>fs.defaultFS</name>
40 |     <value>hdfs://0.0.0.0:9000</value>
41 |   </property>
42 |
43 |   <property>
44 |     <name>hadoop.proxyuser.root.hosts</name>
45 |     <value>*</value>
46 |   </property>
47 |
48 |   <property>
49 |     <name>hadoop.proxyuser.root.groups</name>
50 |     <value>*</value>
51 |   </property>
52 |
53 | </configuration>
--------------------------------------------------------------------------------
/docker/files/hadoop-env.sh:
--------------------------------------------------------------------------------
1 | # The maximum amount of heap to use, in MB. Default is 1000.
2 | export HADOOP_HEAPSIZE=1024
3 |
4 | # Extra Java runtime options. Empty by default.
5 | export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Xmx512m"
6 | export YARN_OPTS="$YARN_OPTS -Xmx256m"
7 |
--------------------------------------------------------------------------------
/docker/files/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!-- Apache License, Version 2.0 header -->
36 | <configuration>
37 |   <property>
38 |     <name>dfs.replication</name>
39 |     <value>1</value>
40 |   </property>
41 |   <property>
42 |     <name>dfs.permissions.enabled</name>
43 |     <value>false</value>
44 |   </property>
45 |   <property>
46 |     <name>dfs.datanode.address</name>
47 |     <value>0.0.0.0:9866</value>
48 |   </property>
53 |   <property>
54 |     <name>dfs.client.datanode-restart.timeout</name>
55 |     <value>30</value>
56 |   </property>
57 | </configuration>
58 |
--------------------------------------------------------------------------------
/docker/files/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!-- Apache License, Version 2.0 header -->
22 | <configuration>
23 |
24 |   <property>
25 |     <name>javax.jdo.option.ConnectionURL</name>
26 |     <value>jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true</value>
27 |   </property>
28 |   <property>
29 |     <name>javax.jdo.option.ConnectionDriverName</name>
30 |     <value>com.mysql.jdbc.Driver</value>
31 |   </property>
32 |   <property>
33 |     <name>javax.jdo.option.ConnectionUserName</name>
34 |     <value>root</value>
35 |   </property>
36 |   <property>
37 |     <name>javax.jdo.option.ConnectionPassword</name>
38 |     <value>root</value>
39 |   </property>
40 |
41 |
42 |   <property>
43 |     <name>hive.metastore.uris</name>
44 |     <value>thrift://0.0.0.0:10000</value>
45 |   </property>
46 |
47 |   <property>
48 |     <name>hive.metastore.event.db.notification.api.auth</name>
49 |     <value>false</value>
50 |   </property>
51 |
52 |
53 |   <property>
54 |     <name>hive.support.concurrency</name>
55 |     <value>true</value>
56 |   </property>
57 |
58 |   <property>
59 |     <name>hive.exec.dynamic.partition.mode</name>
60 |     <value>nonstrict</value>
61 |   </property>
62 |
63 |   <property>
64 |     <name>hive.compactor.initiator.on</name>
65 |     <value>true</value>
66 |   </property>
67 |
68 |   <property>
69 |     <name>hive.txn.manager</name>
70 |     <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
71 |   </property>
72 |
73 |
74 |   <property>
75 |     <name>hive.server2.thrift.http.port</name>
76 |     <value>10001</value>
77 |   </property>
78 |
79 |   <property>
80 |     <name>hive.execution.engine</name>
81 |     <value>mr</value>
82 |   </property>
83 |
84 |   <property>
85 |     <name>hive.input.format</name>
86 |     <value>org.apache.hadoop.hive.ql.io.HiveInputFormat</value>
87 |   </property>
88 |
89 |
90 |   <property>
91 |     <name>hive.auto.convert.join</name>
92 |     <value>false</value>
93 |   </property>
94 |
95 |   <property>
96 |     <name>hive.stats.autogather</name>
97 |     <value>false</value>
98 |   </property>
99 |
100 |   <property>
101 |     <name>hive.metastore.client.capability.check</name>
102 |     <value>false</value>
103 |   </property>
104 |
105 | </configuration>
106 |
--------------------------------------------------------------------------------
/docker/files/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!-- Apache License, Version 2.0 header -->
36 | <configuration>
37 |   <property>
38 |     <name>mapreduce.framework.name</name>
39 |     <value>yarn</value>
40 |   </property>
41 |   <property>
42 |     <name>yarn.app.mapreduce.am.env</name>
43 |     <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
44 |   </property>
45 |   <property>
46 |     <name>mapreduce.map.env</name>
47 |     <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
48 |   </property>
49 |   <property>
50 |     <name>mapreduce.reduce.env</name>
51 |     <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
52 |   </property>
53 |   <property>
54 |     <name>mapreduce.map.memory.mb</name>
55 |     <value>2048</value>
56 |   </property>
57 |   <property>
58 |     <name>mapreduce.reduce.memory.mb</name>
59 |     <value>2048</value>
60 |   </property>
61 |
62 | </configuration>
63 |
--------------------------------------------------------------------------------
/docker/files/yarn-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <!-- Apache License, Version 2.0 header -->
34 | <configuration>
35 |   <property>
36 |     <name>yarn.nodemanager.aux-services</name>
37 |     <value>mapreduce_shuffle</value>
38 |   </property>
39 |   <property>
40 |     <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
41 |     <value>org.apache.hadoop.mapred.ShuffleHandler</value>
42 |   </property>
43 | </configuration>
44 |
--------------------------------------------------------------------------------
/docker/inspect:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | prn_row() {
3 | printf '%-32s | %-10s | %-16s | %-128s\n' "${1}" "${2}" "${3}" "${4}"
4 | }
5 | prn_row "DOCKER_NAME" "RUNNING" "IP" "PORT_MAPPING"
6 | id=spark-hiveacid-test-container
7 | NAME=`docker inspect --format='{{.Name}}' $id`
8 | RUNNING=`docker inspect --format='{{.State.Running}}' $id`
9 | IP=`docker inspect --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $id`
10 | PORT_MAPPING=`docker inspect --format='{{range $p, $conf := .NetworkSettings.Ports}} {{$p}} -> {{(index $conf 0).HostPort}} {{end}}' $id | sed -e 's/\/tcp//g'`
11 | prn_row "$NAME" "${RUNNING}" "${IP}" "${PORT_MAPPING}"
12 |
--------------------------------------------------------------------------------
/docker/login:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | docker exec -it spark-hiveacid-test-container /bin/bash
3 |
--------------------------------------------------------------------------------
/docker/spark-shell:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | if [ -z ${2} ]
3 | then
4 | echo "Specify the spark-acid jar location"
5 | echo "spark-shell ~/codeline/TOT ~/codeline/TOT/acid-ds/target/scala-2.11/spark-acid-qds-assembly-0.4.3.jar"
6 | exit
7 | fi
8 | if [ -z ${1} ]
9 | then
10 | echo "Specify the spark code base directory"
11 | echo "spark-shell ~/codeline/TOT ~/codeline/TOT/acid-ds/target/scala-2.11/spark-acid-qds-assembly-0.4.3.jar"
12 | exit
13 | fi
14 |
15 | shellenv() {
16 | export QENV_LOCAL_CODELINE="${1}"
17 | export QENV_LOCAL_CONF="${QENV_LOCAL_CODELINE}/conf"
18 | export HADOOP_SRC="${QENV_LOCAL_CODELINE}/hadoop2"
19 | export SPARK_SRC="${QENV_LOCAL_CODELINE}/spark"
20 | export HUSTLER_SRC="${QENV_LOCAL_CODELINE}/hustler"
21 | export HIVE_SRC="${QENV_LOCAL_CODELINE}/hive"
22 | export ZEPPELIN_SRC="${QENV_LOCAL_CODELINE}/zeppelin"
23 | }
24 |
25 | hsnapshot() {
26 | HADOOP_SNAPSHOT=`ls ${HADOOP_SRC}/hadoop-dist/target/hadoop* | grep SNAPSHOT: | cut -d':' -f1`
27 | }
28 |
29 | hivesnapshot() {
30 | loc=`ls ${HIVE_SRC}/packaging/target/apache-hive* |grep bin |grep -v ':'`
31 | HIVE_SNAPSHOT=${HIVE_SRC}/packaging/target/${loc}/${loc}/
32 | }
33 |
34 | run_spark_shelllocal() {
35 |
36 | # Setup writes into the spark-env file. Run spark-shell after it.
37 | echo "Update Spark Conf based on Hadoop Build Version --> ${SPARK_SRC}/conf/spark-env.sh"
38 | hsnapshot
39 | hivesnapshot
40 |
41 | str="export SPARK_YARN_USER_ENV=CLASSPATH=${QENV_LOCAL_CONF}/"
42 | echo ${str} > ${SPARK_SRC}/conf/spark-env.sh
43 |
44 | if [ -n "${HADOOP_SNAPSHOT}" ]
45 | then
46 |
47 | str="export SPARK_DIST_CLASSPATH=${QENV_LOCAL_CONF}/:${HADOOP_SNAPSHOT}/share/hadoop/common/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/common/*:${HADOOP_SNAPSHOT}/share/hadoop/hdfs:${HADOOP_SNAPSHOT}/share/hadoop/hdfs/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/hdfs/*:${HADOOP_SNAPSHOT}/share/hadoop/yarn/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/yarn/*:${HADOOP_SNAPSHOT}/share/hadoop/mapreduce/*:/share/hadoop/tools:${HADOOP_SNAPSHOT}/share/hadoop/tools/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/tools/*:/share/hadoop/qubole:${HADOOP_SNAPSHOT}/share/hadoop/qubole/*"
48 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh
49 | fi
50 |
51 | if [ -n "${HIVE_SNAPSHOT}" ]
52 | then
53 | str="export SPARK_DIST_CLASSPATH=\${SPARK_DIST_CLASSPATH}:${HIVE_SNAPSHOT}/lib/*"
54 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh
55 | fi
56 |
57 | str="export HADOOP_CONF_DIR=${QENV_LOCAL_CONF}/"
58 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh
59 |
60 | $SPARK_SRC/bin/spark-shell $@
61 | }
62 |
63 |
64 | shellenv ${1}
65 | shift
66 | run_spark_shelllocal --jars $@ --conf spark.sql.extensions=com.qubole.spark.datasources.hiveacid.HiveAcidAutoConvertExtension --conf spark.hadoop.hive.metastore.uris=thrift://localhost:10000 --conf spark.sql.catalogImplementation=hive
67 |
--------------------------------------------------------------------------------
/docker/start:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | name="spark-hiveacid-test-container"
4 |
5 | RUNNING=`docker inspect --format "{{ .State.Running}}" ${name} 2>/dev/null`
6 | if [[ $? -eq 0 ]]
7 | then
8 | if [[ "${RUNNING}" == "true" ]]
9 | then
10 | echo "$name already running"
11 | exit
12 | fi
13 | else
14 | docker run --name ${name} --hostname localhost -P -p9866:9866 -p10000:10000 -p10001:10001 -p9000:9000 -p3306:3306 -p50070:50070 -p50030:50030 -it -d centos6/spark-hadoop3-hive3 /bin/bash -c "/bootstrap.sh >/tmp/bootstrap.log"
15 | fi
16 |
17 |
--------------------------------------------------------------------------------
/docker/stop:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | name="spark-hiveacid-test-container"
4 | docker kill ${name}
5 | docker rm ${name}
6 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2019 Qubole, Inc. All rights reserved.
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | sbt.version = 0.13.16
21 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 |
2 | resolvers += "spark-packages" at sys.props.getOrElse("spark.repo", "https://repos.spark-packages.org/")
3 |
4 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6")
5 |
6 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11")
7 |
8 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9")
9 |
10 | addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4")
11 |
12 | addSbtPlugin("com.simplytyped" % "sbt-antlr4" % "0.7.13")
13 |
14 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")
15 |
--------------------------------------------------------------------------------
/shaded-dependencies/build.sbt:
--------------------------------------------------------------------------------
1 | name := "spark-acid-shaded-dependencies"
2 |
3 | version := sys.props.getOrElse("package.version", "0.1")
4 |
5 | organization:= "com.qubole"
6 |
7 | scalaVersion := "2.11.12"
8 |
9 | scalacOptions ++= Seq(
10 | "-Xlint",
11 | "-Xfatal-warnings",
12 | "-deprecation",
13 | "-unchecked",
14 | "-optimise",
15 | "-Yinline-warnings"
16 | )
17 |
18 | scalacOptions in (Compile, doc) ++= Seq(
19 | "-no-link-warnings" // Suppresses problems with Scaladoc @throws links
20 | )
21 |
22 | // do not run test at assembly
23 | test in assembly := {}
24 |
25 | publishArtifact in (Compile, packageDoc) := false
26 |
27 | publishArtifact in (Compile, packageSrc) := false
28 |
29 | publishArtifact in (Compile, packageBin) := false
30 |
31 | val hive_version = sys.props.getOrElse("hive.version", "3.1.2")
32 |
33 | val orc_version = sys.props.getOrElse("orc.version", "1.5.6")
34 |
35 | resolvers += "Additional Maven Repository" at sys.props.getOrElse("hive.repo", "https://repo1.maven.org/maven2/")
36 |
37 | // Shaded dependency
38 | libraryDependencies ++= Seq(
39 | // Hive/Orc core dependencies packed.
40 | "org.apache.hive" % "hive-metastore" % hive_version intransitive(),
41 | "org.apache.hive" % "hive-exec" % hive_version intransitive(),
42 | "org.apache.orc" % "orc-core" % orc_version intransitive(),
43 | "org.apache.orc" % "orc-mapreduce" % orc_version intransitive(),
44 |
45 | // Only for hive3 client in tests.. but packing it in shaded jars.
46 | "org.apache.hive" % "hive-jdbc" % hive_version intransitive(),
47 | "org.apache.hive" % "hive-service" % hive_version intransitive(),
48 | "org.apache.hive" % "hive-serde" % hive_version intransitive(),
49 | "org.apache.hive" % "hive-common" % hive_version intransitive(),
50 |
51 | // To deal with hive3 metastore library 0.9.3 vs zeppelin thirft
52 |   // library version 0.9.1 conflict when running Notebooks.
53 | "org.apache.thrift" % "libfb303" % "0.9.3",
54 | "org.apache.thrift" % "libthrift" % "0.9.3"
55 | )
56 |
57 |
58 | assemblyShadeRules in assembly := Seq(
59 | ShadeRule.rename("org.apache.hadoop.hive.**" -> "com.qubole.shaded.hadoop.hive.@1").inAll,
60 | ShadeRule.rename("org.apache.hive.**" -> "com.qubole.shaded.hive.@1").inAll,
61 | ShadeRule.rename("org.apache.orc.**" -> "com.qubole.shaded.orc.@1").inAll,
62 | ShadeRule.rename("org.apache.commons.**" -> "com.qubole.shaded.commons.@1").inAll,
63 | ShadeRule.rename("org.apache.avro.**" -> "com.qubole.shaded.avro.@1").inAll,
64 | ShadeRule.rename("org.apache.parquet.**" -> "com.qubole.shaded.parquet.@1").inAll,
65 | ShadeRule.rename("org.apache.http.**" -> "com.qubole.shaded.http.@1").inAll,
66 | ShadeRule.rename("org.apache.tez.**" -> "com.qubole.shaded.tez.@1").inAll,
67 |
68 | ShadeRule.rename("com.google.**" -> "com.qubole.shaded.@1").inAll,
69 | ShadeRule.rename("com.facebook.fb303.**" -> "com.qubole.shaded.facebook.fb303.@1").inAll,
70 | ShadeRule.rename("org.apache.thrift.**" -> "com.qubole.shaded.thrift.@1").inAll,
71 |
72 | ShadeRule.rename("org.codehaus.jackson.**" -> "com.qubole.shaded.jackson.@1").inAll,
73 | ShadeRule.rename("org.joda.**" -> "com.qubole.shaded.joda.@1").inAll,
74 | ShadeRule.rename("org.json.**" -> "com.qubole.shaded.json.@1").inAll,
75 |
76 | ShadeRule.rename("jodd.**" -> "com.qubole.shaded.jodd.@1").inAll,
77 | ShadeRule.rename("javaewah.**" -> "com.qubole.shaded.javaewah.@1").inAll,
78 | ShadeRule.rename("io.airlift.**" -> "com.qubole.shaded.io.airlift.@1").inAll,
79 |
80 | ShadeRule.rename("org.openx.data.**" -> "com.qubole.shaded.openx.data.@1").inAll,
81 | ShadeRule.rename("au.com.bytecode.opencsv.**" -> "com.qubole.shaded.au.com.bytecode.opencsv.@1").inAll,
82 | ShadeRule.rename("com.readytalk.metrics.**" -> "com.qubole.shaded.readytalk.metrics.@1").inAll
83 | )
84 |
85 | import sbtassembly.AssemblyPlugin.autoImport.{ ShadeRule}
86 | import sbtassembly.MergeStrategy
87 | val distinctAndReplace: sbtassembly.MergeStrategy = new sbtassembly.MergeStrategy {
88 | val name = "distinctAndReplace"
89 | def apply(tempDir: File, path: String, files: Seq[File]): Either[String, Seq[(File, String)]] = {
90 | val lines = files flatMap (IO.readLines(_, IO.utf8))
91 | val unique = lines.distinct
92 | val replaced = unique.map { x => x.replace("org.apache.hadoop.hive", "com.qubole.shaded.hadoop.hive")}
93 | val file = sbtassembly.MergeStrategy.createMergeTarget(tempDir, path)
94 | IO.writeLines(file, replaced, IO.utf8)
95 | Right(Seq(file -> path))
96 | }
97 | }
98 |
99 |
100 | assemblyMergeStrategy in assembly := {
101 | // all discarded classes first
102 | case PathList("javax", xs @ _*) => MergeStrategy.discard
103 | case PathList("javolution", xs @_*) => MergeStrategy.discard
104 | // discard non shaded classes in hadoop and qubole packages
105 | case PathList("org", "apache", "hadoop", xs @_*) => MergeStrategy.discard
106 | case PathList("org", "apache", "log4j", xs @ _*) => MergeStrategy.last
107 | case PathList("com", "google", xs @ _*) => MergeStrategy.last
108 | case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
109 | case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
110 | case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
111 | case PathList("org","aopalliance", xs @ _*) => MergeStrategy.last
112 | case PathList("com","zaxxer", xs @ _*) => MergeStrategy.last
113 | case PathList("org","apache", "logging", "log4j", xs @ _*) => MergeStrategy.last
114 | case PathList("io","netty", xs @ _*) => MergeStrategy.last
115 | case PathList("org","datanucleus", xs @ _*) => MergeStrategy.last
116 | case PathList("org", "apache", "arrow", xs @ _*) => MergeStrategy.last
117 | case PathList("org", "apache", "commons", "lang3", xs @ _*) => MergeStrategy.last
118 | case PathList("org", "apache", "commons", "lang3", "builder", xs @ _*) => MergeStrategy.last
119 | case PathList("org", "apache", "commons", "lang3", "concurrent", xs @ _*) => MergeStrategy.last
120 | case PathList("org", "apache", "commons", "lang3", "event", xs @ _*) => MergeStrategy.last
121 | case PathList("org", "apache", "commons", "lang3", "exception", xs @ _*) => MergeStrategy.last
122 | case PathList("org", "apache", "commons", "lang3", "math", xs @ _*) => MergeStrategy.last
123 | case PathList("org", "apache", "commons", "lang3", "mutable", xs @ _*) => MergeStrategy.last
124 | case PathList("org", "apache", "commons", "lang3", "reflect", xs @ _*) => MergeStrategy.last
125 | case PathList("org", "apache", "commons", "lang3", "text", xs @ _*) => MergeStrategy.last
126 | case PathList("org", "apache", "commons", "lang3", "time", xs @ _*) => MergeStrategy.last
127 | case PathList("org", "apache", "commons", "lang3", "tuple", xs @ _*) => MergeStrategy.last
128 | case PathList("com", "qubole", "shaded", "orc", xs @ _*) => MergeStrategy.last
129 | case PathList("org", "slf4j", "impl", xs @ _*) => MergeStrategy.last
130 | case PathList("org", "slf4j", "helpers", xs @ _*) => MergeStrategy.last
131 | case PathList("org", "slf4j", xs @ _*) => MergeStrategy.last
132 |
133 | // discard package.jdo because objects defined inside it are not shaded.
134 | // So removing for now
135 | case "package.jdo" => MergeStrategy.discard
136 |
137 | case PathList("META-INF", "services", xs @ _*) => distinctAndReplace
138 | // case "about.html" => MergeStrategy.rename
139 | case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
140 | case "META-INF/mailcap" => MergeStrategy.last
141 | case "META-INF/mimetypes.default" => MergeStrategy.last
142 | case "plugin.properties" => MergeStrategy.last
143 | case "log4j.properties" => MergeStrategy.last
144 | case "Log4j2Plugins.dat" => MergeStrategy.last
145 | case "git.properties" => MergeStrategy.last
146 | case "plugin.xml" => MergeStrategy.last
147 | case "META-INF/io.netty.versions.properties" => MergeStrategy.last
148 | case "META-INF/org/apache/logging/log4j/core/config/plugins/Log4j2Plugins.dat" => MergeStrategy.last
149 | case "codegen/config.fmpp" => MergeStrategy.first
150 |
151 | case x =>
152 | val oldStrategy = (assemblyMergeStrategy in assembly).value
153 | oldStrategy(x)
154 | }
155 |
156 | // do not add scala in fat jar
157 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
158 |
159 | // For publishing assembly locally
160 | publishMavenStyle := false
161 |
162 | artifact in (Compile, assembly) := {
163 | val art = (artifact in (Compile, assembly)).value
164 | art.withClassifier(None)
165 | }
166 |
167 | addArtifact(artifact in (Compile, assembly), assembly)
168 |
169 |
--------------------------------------------------------------------------------
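The custom `distinctAndReplace` merge strategy defined above is applied to `META-INF/services` files during assembly: duplicate service-provider lines coming from different Hive/ORC jars are collapsed, and provider class names are rewritten to their shaded package. A minimal sketch of that line-level transformation, with made-up input lines, looks like this:

```scala
object DistinctAndReplaceSketch extends App {
  // Hypothetical provider lines as they might appear in META-INF/services files from two jars.
  val serviceFileLines = Seq(
    "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",  // from one jar
    "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",  // duplicate from another jar
    "org.apache.orc.impl.SomeProvider"                  // unrelated provider, kept as-is
  )

  // Same two steps as the merge strategy: dedupe, then rewrite to the shaded package.
  val merged = serviceFileLines
    .distinct
    .map(_.replace("org.apache.hadoop.hive", "com.qubole.shaded.hadoop.hive"))

  merged.foreach(println)
  // com.qubole.shaded.hadoop.hive.ql.io.orc.OrcInputFormat
  // org.apache.orc.impl.SomeProvider
}
```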
/shaded-dependencies/project/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2019 Qubole, Inc. All rights reserved.
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | sbt.version = 1.2.8
21 |
--------------------------------------------------------------------------------
/shaded-dependencies/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9")
2 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")
3 |
--------------------------------------------------------------------------------
/src/it/scala/com/qubole/spark/hiveacid/LockSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.qubole.spark.hiveacid
19 |
20 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
21 | import com.qubole.spark.hiveacid.transaction.HiveAcidTxn
22 | import org.apache.log4j.{Level, LogManager, Logger}
23 | import org.apache.spark.sql.SparkSession
24 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
25 |
26 | import scala.util.control.NonFatal
27 |
28 | class LockSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
29 | val log: Logger = LogManager.getLogger(this.getClass)
30 | log.setLevel(Level.INFO)
31 |
32 | var helper: TestHelper = _
33 | val isDebug = true
34 |
35 | val DEFAULT_DBNAME = "HiveTestLockDB"
36 | val cols: Map[String, String] = Map(
37 | ("intCol","int"),
38 | ("doubleCol","double"),
39 | ("floatCol","float"),
40 | ("booleanCol","boolean")
41 | )
42 | val partitionedTable = new Table(DEFAULT_DBNAME, "partitioned",
43 | cols, Table.orcPartitionedFullACIDTable, true)
44 | val normalTable = new Table(DEFAULT_DBNAME, "nonPartitioned",
45 | cols, Table.orcFullACIDTable, false)
46 |
47 | override def beforeAll() {
48 | try {
49 | helper = new TestLockHelper
50 | if (isDebug) {
51 | log.setLevel(Level.DEBUG)
52 | }
53 | helper.init(isDebug)
54 |
55 | // DB
56 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE")
57 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME)
58 | helper.recreate(partitionedTable)
59 | helper.recreate(normalTable)
60 | helper.hiveExecute(partitionedTable.insertIntoHiveTableKeyRange(11, 25))
61 | } catch {
62 | case NonFatal(e) => log.info("failed " + e)
63 | }
64 | }
65 |
66 | override protected def afterAll(): Unit = {
67 | helper.hiveExecute(s"DROP TABLE IF EXISTS ${normalTable.hiveTname}")
68 | helper.hiveExecute(s"DROP TABLE IF EXISTS ${partitionedTable.hiveTname}")
69 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE")
70 | helper.destroy()
71 | }
72 |
73 | case class TestLockOperation(whichTransaction: Int,
74 | operationType: HiveAcidOperation.OperationType,
75 | partition: Seq[String],
76 | willFail: Boolean = false)
77 |
78 | test("test lock wait timeout exception") {
79 | val lockOps = Seq(
80 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass
81 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()), // similar operation on first trans will pass
82 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq(), true)) // second transaction will wait and fail in 100ms
83 | testLockOps(lockOps)
84 | }
85 |
86 | test("test locks within same transaction is allowed") {
87 | val lockOps = Seq(
88 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass
89 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()), // similar operation on first trans will pass
90 | TestLockOperation(1, HiveAcidOperation.READ, Seq()), // READ on same transaction will pass
91 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, Seq()))
92 | testLockOps(lockOps)
93 | }
94 |
95 | test("test READ after UPDATE/DELETE is allowed") {
96 | val lockOps = Seq(
97 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass
98 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()),
99 | TestLockOperation(2, HiveAcidOperation.READ, Seq())) // second transaction READ need not wait
100 | testLockOps(lockOps)
101 | }
102 |
103 | test("test DELETE/READ after INSERT OVERWRITE is not allowed") {
104 | val lockOps = Seq(
105 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, Seq()),
106 | TestLockOperation(2, HiveAcidOperation.UPDATE, Seq(), true),
107 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq(), true),
108 | TestLockOperation(2, HiveAcidOperation.READ, Seq(), true))
109 | testLockOps(lockOps)
110 | }
111 |
112 | test("test INSERT_OVERWRITE and DELETE/UPDATE/READ on different partition is allowed") {
113 | val lockOps = Seq(
114 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, Seq("ptnCol=0")),
115 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq("ptnCol=1")),
116 | TestLockOperation(2, HiveAcidOperation.UPDATE, Seq("ptnCol=1")),
117 | TestLockOperation(2, HiveAcidOperation.READ, Seq("ptnCol=1")))
118 | testLockOps(lockOps)
119 | }
120 |
121 | def testLockOps(lockOps: Seq[TestLockOperation]): Unit = {
122 | val tableName = DEFAULT_DBNAME + "." + "nonPartitioned"
123 | val hiveAcidMetadata = HiveAcidMetadata.fromSparkSession(helper.spark,
124 | tableName)
125 |
126 | // Just try 2 attempts for lock acquisition and fail if it cannot.
127 | helper.spark.sessionState.conf.setConfString("spark.hiveAcid.lock.max.retries", "2")
128 | val sparkConf = SparkAcidConf(helper.spark, Map())
129 | val hTxn1 = new HiveAcidTxn(helper.spark)
130 | val hTxn2 = new HiveAcidTxn(helper.spark)
131 |
132 | def executeOp(lockOp: TestLockOperation) {
133 | val txn = lockOp.whichTransaction match {
134 | case 1 => hTxn1
135 | case 2 => hTxn2
136 | case _ => throw new IllegalArgumentException("Only 1 or 2 are supported for whichTransaction field")
137 | }
138 | if (lockOp.willFail) {
139 | val thrown = intercept[RuntimeException] {
140 | txn.acquireLocks(hiveAcidMetadata, lockOp.operationType, lockOp.partition, sparkConf)
141 | }
142 | assert(thrown.getMessage.contains("Could not acquire lock. Lock State: WAITING"))
143 | } else {
144 | txn.acquireLocks(hiveAcidMetadata, lockOp.operationType, lockOp.partition, sparkConf)
145 | }
146 |
147 | }
148 |
149 | try {
150 | hTxn1.begin()
151 | hTxn2.begin()
152 | lockOps.foreach(executeOp(_))
153 | } finally {
154 | helper.spark.sessionState.conf.unsetConf("spark.hiveAcid.lock.max.retries")
155 | hTxn1.end(true)
156 | hTxn2.end(true)
157 | }
158 | }
159 |
160 | test("test HeartBeatRunner is running") {
161 | val hTxn1 = new HiveAcidTxn(helper.spark)
162 | hTxn1.begin()
163 | // Sleep for 4 seconds
164 | Thread.sleep(4 * 1000)
165 | val txn = HiveAcidTxn.txnManager.showOpenTrans().find(ti => ti.getId == hTxn1.txnId)
166 | assert(txn.isDefined, "Transaction is expected to be open")
167 | val seconds = (txn.get.getLastHeartbeatTime() - txn.get.getStartedTime()) / 1000
168 | assert(seconds >= 2, "getLastHeartBeatTime should " +
169 | "be at least 2 seconds after transaction was opened")
170 | hTxn1.end(true)
171 | }
172 | }
173 |
174 | class TestLockHelper extends TestHelper {
175 | // Create spark session with txn timeout config as that needs to be set
176 | // before the start of spark session
177 | override def getSparkSession(): SparkSession = {
178 | SparkSession.builder().appName("Hive-acid-test")
179 | .master("local[*]")
180 | .config("spark.hadoop.hive.metastore.uris", "thrift://0.0.0.0:10000")
181 | .config("spark.sql.warehouse.dir", "/tmp")
182 | .config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension")
183 | .config("spark.hadoop.hive.txn.timeout", "6")
184 | //.config("spark.ui.enabled", "true")
185 | //.config("spark.ui.port", "4041")
186 | .enableHiveSupport()
187 | .getOrCreate()
188 | }
189 | }
--------------------------------------------------------------------------------
/src/it/scala/com/qubole/spark/hiveacid/TestHiveClient.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid;
21 |
22 |
23 | import java.sql.Connection;
24 | import java.sql.DriverManager;
25 | import java.sql.ResultSet;
26 | import java.sql.ResultSetMetaData;
27 | import java.sql.SQLException;
28 | import java.sql.Statement;
29 |
30 | import java.io.StringWriter;
31 |
32 | public class TestHiveClient {
33 | private static Connection con = null;
34 | private static Statement stmt = null;
35 |
36 | TestHiveClient() {
37 | try {
38 | // Before running this docker container with HS2 / HMS / Hadoop running
39 | String driverName = "com.qubole.shaded.hive.jdbc.HiveDriver";
40 | Class.forName(driverName);
41 | } catch (ClassNotFoundException e) {
42 | e.printStackTrace();
43 | System.exit(1);
44 | }
45 | try {
46 | con = DriverManager.getConnection("jdbc:hive2://0.0.0.0:10001?allowMultiQueries=true", "root", "root");
47 | stmt = con.createStatement();
48 | }
49 | catch (Exception e) {
50 | System.out.println("Failed to create statement "+ e);
51 | }
52 | }
53 |
54 | public String executeQuery(String cmd) throws Exception {
55 | // Start Hive txn
56 | ResultSet rs = null;
57 | String resStr = null;
58 | try {
59 | rs = stmt.executeQuery(cmd);
60 | resStr = resultStr(rs);
61 | // close hive txn
62 | rs.close();
63 | rs = null;
64 |
65 | } catch (Exception e) {
66 |             System.out.println("Failed to execute query statement \""+ cmd +"\" Error:"+ e);
67 | if (rs != null ) {
68 | rs.close();
69 | }
70 | }
71 | return resStr;
72 | }
73 |
74 | public void execute(String cmd) throws SQLException {
75 | try {
76 | stmt.execute(cmd);
77 | } catch (Exception e) {
78 |             System.out.println("Failed to execute statement \""+ cmd +"\" Error:"+ e);
79 | }
80 | }
81 |
82 | private String resultStr(ResultSet rs) throws SQLException {
83 | StringWriter outputWriter = new StringWriter();
84 | ResultSetMetaData rsmd = rs.getMetaData();
85 | int columnsNumber = rsmd.getColumnCount();
86 | int rowNumber = 0;
87 | while (rs.next()) {
88 | if (rowNumber != 0) {
89 | outputWriter.append("\n");
90 | }
91 | rowNumber++;
92 | for (int i = 1; i <= columnsNumber; i++) {
93 | if (i > 1) outputWriter.append(",");
94 | String columnValue = rs.getString(i);
95 | // outputWriter.append(rsmd.getColumnName(i)+ "=" + columnValue);
96 | outputWriter.append(columnValue);
97 | }
98 | }
99 | return outputWriter.toString();
100 | }
101 |
102 | public void teardown() throws SQLException {
103 | if (stmt != null) {
104 | stmt.close();
105 | stmt = null;
106 | }
107 | if (con != null) {
108 | con.close();
109 | con = null;
110 | }
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
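`TestHiveClient` above gives the integration tests a plain JDBC path into the dockerized HiveServer2 (port 10001), independent of Spark. A minimal usage sketch, assuming the container from `/docker` is running and using hypothetical database and table names, might look like:

```scala
package com.qubole.spark.hiveacid

// Hypothetical driver for TestHiveClient; names below are made up for illustration.
object TestHiveClientUsageSketch {
  def main(args: Array[String]): Unit = {
    val hive = new TestHiveClient()
    try {
      hive.execute("CREATE DATABASE IF NOT EXISTS sketch_db")
      hive.execute(
        "CREATE TABLE IF NOT EXISTS sketch_db.t (key int) " +
          "STORED AS ORC TBLPROPERTIES ('transactional' = 'true')")
      hive.execute("INSERT INTO sketch_db.t VALUES (1), (2)")
      // executeQuery returns the result set rendered as comma-separated rows.
      println(hive.executeQuery("SELECT count(*) FROM sketch_db.t"))
    } finally {
      hive.teardown()
    }
  }
}
```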
/src/it/scala/com/qubole/spark/hiveacid/TestSparkSession.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 | package com.qubole.spark.hiveacid
20 |
21 | import org.apache.spark.sql.SparkSession
22 |
23 | private[hiveacid] object TestSparkSession {
24 |
25 | def getSession: SparkSession = {
26 | val spark: SparkSession = SparkSession.builder().appName("Hive-acid-test")
27 | .master("local[*]")
28 | .config("spark.hadoop.hive.metastore.uris", "thrift://0.0.0.0:10000")
29 | .config("spark.sql.warehouse.dir", "/tmp")
30 | .config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension")
31 | //.config("spark.ui.enabled", "true")
32 | //.config("spark.ui.port", "4041")
33 | .enableHiveSupport()
34 | .getOrCreate()
35 | spark.sparkContext.setLogLevel("WARN")
36 | spark
37 | }
38 |
39 | def close(spark: SparkSession): Unit = {
40 | spark.close()
41 | SparkSession.clearActiveSession()
42 | SparkSession.clearDefaultSession()
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
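`TestSparkSession` builds the session the integration tests run against: it points at the dockerized metastore (`thrift://0.0.0.0:10000`) and enables `HiveAcidAutoConvertExtension`, so plain `spark.sql` queries on transactional tables are routed through the ACID datasource. A minimal sketch of using it, with a hypothetical table name, could be:

```scala
package com.qubole.spark.hiveacid

// Hypothetical usage of TestSparkSession; the table name is made up for illustration.
object TestSparkSessionUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = TestSparkSession.getSession
    try {
      // With the extension enabled, this read goes through the Hive ACID datasource.
      spark.sql("SELECT count(*) FROM some_db.some_acid_table").show()
    } finally {
      TestSparkSession.close(spark)
    }
  }
}
```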
/src/it/scala/com/qubole/spark/hiveacid/UpdateDeleteSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid
21 |
22 |
23 | import org.apache.log4j.{Level, LogManager, Logger}
24 | import org.scalatest._
25 |
26 | import scala.util.control.NonFatal
27 |
28 | class UpdateDeleteSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
29 |
30 | val log: Logger = LogManager.getLogger(this.getClass)
31 | log.setLevel(Level.INFO)
32 |
33 | var helper: TestHelper = _
34 | val isDebug = true
35 |
36 | val DEFAULT_DBNAME = "HiveTestUpdateDeleteDB"
37 | val cols: Map[String, String] = Map(
38 | ("intCol","int"),
39 | ("doubleCol","double"),
40 | ("floatCol","float"),
41 | ("booleanCol","boolean")
42 | )
43 |
44 | override def beforeAll() {
45 | try {
46 | helper = new TestHelper
47 | if (isDebug) {
48 | log.setLevel(Level.DEBUG)
49 | }
50 | helper.init(isDebug)
51 |
52 | // DB
53 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE")
54 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME)
55 | } catch {
56 | case NonFatal(e) => log.info("failed " + e)
57 | }
58 | }
59 |
60 | override protected def afterAll(): Unit = {
61 | helper.destroy()
62 | }
63 |
64 | val testTables = List(
65 | // Positive Test
66 | (Table.orcFullACIDTable, false, true),
67 | (Table.orcPartitionedFullACIDTable, true, true),
68 | // Negative Test
69 | (Table.orcTable, false, false),
70 | (Table.orcPartitionedTable, true, false),
71 | (Table.orcBucketedTable, false, false), (Table.orcBucketedPartitionedTable, true, false))
72 | // Test Run
73 | updateTestForFullAcidTables(testTables)
74 | deleteTestForFullAcidTables(testTables)
75 |
76 | // Update test for full acid tables
77 | def updateTestForFullAcidTables(tTypes: List[(String, Boolean, Boolean)]): Unit = {
78 | tTypes.foreach { case (tType, isPartitioned, positiveTest) =>
79 | val tableNameSpark = "tSparkUpdate"
80 | val testName = s"Update Test for $tableNameSpark type $tType"
81 | test(testName) {
82 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned)
83 | def code(): Unit = {
84 |
85 | if (positiveTest) {
86 | helper.recreate(tableSpark)
87 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(11, 20))
88 | val expectedRows = 10
89 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count))
90 | val expectedUpdateValue = helper.sparkCollect(tableSpark.selectExpectedUpdateCol(11))
91 | helper.sparkSQL(tableSpark.updateInHiveTableKey(11))
92 | val updatedVal = helper.sparkCollect(tableSpark.selectUpdateCol(11))
93 | helper.compareResult(expectedUpdateValue, updatedVal)
94 | } else {
95 | intercept[RuntimeException] {
96 | helper.recreate(tableSpark)
97 | helper.sparkSQL(tableSpark.updateInHiveTableKey(11))
98 | }
99 | }
100 | }
101 | helper.myRun(testName, code)
102 | }
103 | }
104 | }
105 |
106 | // Delete test for full acid tables
107 | def deleteTestForFullAcidTables(tTypes: List[(String, Boolean, Boolean)]): Unit = {
108 | tTypes.foreach { case (tType, isPartitioned, positiveTest) =>
109 | val tableNameSpark = "tSparkDelete"
110 | val testName = s"Delete Test for $tableNameSpark type $tType"
111 | test(testName) {
112 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned)
113 | def code(): Unit = {
114 | if (positiveTest) {
115 | helper.recreate(tableSpark)
116 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(11, 20))
117 | var expectedRows = 10
118 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count))
119 |
120 | // delete 1 row
121 | helper.sparkSQL(tableSpark.deleteFromHiveTableKey(11))
122 | expectedRows = 9
123 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count))
124 |
125 | // Delete all but 1 using predicates
126 | helper.sparkSQL(tableSpark.deleteFromHiveTableGreaterThanKey(15))
127 | helper.sparkSQL(tableSpark.deleteFromHiveTableLesserThanKey(15))
128 | expectedRows = 1
129 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count))
130 |
131 | // No OP Delete
132 | helper.sparkCollect(tableSpark.deleteFromHiveTableGreaterThanKey(20))
133 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count))
134 | } else {
135 | intercept[RuntimeException] {
136 | helper.recreate(tableSpark)
137 | // delete 1 row
138 | helper.sparkSQL(tableSpark.deleteFromHiveTableKey(11))
139 | }
140 | }
141 | }
142 | helper.myRun(testName, code)
143 | }
144 | }
145 | }
146 |
147 | test("Test Update on Partition Columns is not allowed") {
148 | val tableNameSpark = "tUpdateNeg"
149 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols,
150 | Table.orcPartitionedFullACIDTable, true)
151 | helper.recreate(tableSpark,false)
152 | val thrown = intercept[AnalysisException] {
153 | helper.sparkSQL(s"UPDATE ${DEFAULT_DBNAME}.${tableNameSpark} set ptnCol = 2 where intCol > 10")
154 | }
155 | assert(thrown.getMessage.contains("UPDATE on the partition columns are not allowed"))
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/src/it/scala/com/qubole/spark/hiveacid/WriteSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid
21 |
22 |
23 | import org.apache.log4j.{Level, LogManager, Logger}
24 | import org.scalatest._
25 |
26 | import scala.util.control.NonFatal
27 |
28 | @Ignore
29 | class WriteSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
30 |
31 | val log: Logger = LogManager.getLogger(this.getClass)
32 | log.setLevel(Level.INFO)
33 |
34 | var helper: TestHelper = _
35 | val isDebug = true
36 |
37 | val DEFAULT_DBNAME = "HiveTestDB"
38 | val defaultPred = " intCol < 5 "
39 | val cols: Map[String, String] = Map(
40 | ("intCol","int"),
41 | ("doubleCol","double"),
42 | ("floatCol","float"),
43 | ("booleanCol","boolean")
44 | // TODO: Requires spark.sql.hive.convertMetastoreOrc=false to run
45 | // ("dateCol","date")
46 | )
47 |
48 | override def beforeAll() {
49 | try {
50 |
51 | helper = new TestHelper
52 | if (isDebug) {
53 | log.setLevel(Level.DEBUG)
54 | }
55 | helper.init(isDebug)
56 |
57 | // DB
58 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE")
59 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME)
60 | } catch {
61 | case NonFatal(e) => log.info("failed " + e)
62 | }
63 | }
64 |
65 | override protected def afterAll(): Unit = {
66 | helper.destroy()
67 | }
68 |
69 |
70 | // Test Run
71 | insertIntoOverwriteTestForFullAcidTables(Table.allFullAcidTypes())
72 |
73 | // TODO: Currently requires compatibility check to be disabled in HMS to run clean
74 | // hive.metastore.client.capability.check=false
75 | // insertIntoOverwriteTestForInsertOnlyTables(Table.allInsertOnlyTypes())
76 |
77 | // Insert Into/Overwrite test for full acid tables
78 | def insertIntoOverwriteTestForFullAcidTables(tTypes: List[(String,Boolean)]): Unit = {
79 | tTypes.foreach { case (tType, isPartitioned) =>
80 | val tableNameHive = "tHive"
81 | val tableNameSpark = "tSpark"
82 | val testName = s"Simple InsertInto Test for $tableNameHive/$tableNameSpark type $tType"
83 | test(testName) {
84 | val tableHive = new Table(DEFAULT_DBNAME, tableNameHive, cols, tType, isPartitioned)
85 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned)
86 | def code(): Unit = {
87 | helper.recreate(tableHive)
88 | helper.recreate(tableSpark)
89 |
90 | // Insert into rows in both tables from Hive and Spark
91 | helper.hiveExecute(tableHive.insertIntoHiveTableKeyRange(11, 20))
92 | helper.sparkSQL(tableSpark.insertIntoSparkTableKeyRange(11, 20))
93 | var expectedRows = 10
94 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Into", expectedRows)
95 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Into", expectedRows)
96 |
97 | // Insert overwrite rows in both tables from Hive and Spark
98 | helper.hiveExecute(tableHive.insertOverwriteHiveTableKeyRange(16, 25))
99 | helper.sparkSQL(tableSpark.insertOverwriteSparkTableKeyRange(16, 25))
100 | expectedRows = if (tableHive.isPartitioned) 15 else 10
101 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Overwrite", expectedRows)
102 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Overwrite", expectedRows)
103 |
104 | // Insert overwrite rows in both tables - add rows in hive table from spark and vice versa
105 | helper.hiveExecute(tableSpark.insertOverwriteHiveTableKeyRange(24, 27))
106 | helper.sparkSQL(tableHive.insertOverwriteSparkTableKeyRange(24, 27))
107 | expectedRows = if (tableHive.isPartitioned) expectedRows + 2 else 4
108 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Overwrite", expectedRows)
109 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Overwrite", expectedRows)
110 |
111 | // Insert into rows in both tables - add rows in hive table from spark and vice versa
112 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(24, 27))
113 | helper.sparkSQL(tableHive.insertIntoSparkTableKeyRange(24, 27))
114 | expectedRows = expectedRows + 4
115 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Into", expectedRows)
116 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Into", expectedRows)
117 |
118 | }
119 | helper.myRun(testName, code)
120 | }
121 | }
122 | }
123 |
124 | def insertIntoOverwriteTestForInsertOnlyTables(tTypes: List[(String,Boolean)]): Unit = {
125 | tTypes.foreach { case (tType, isPartitioned) =>
126 | val tableNameSpark = "tSpark"
127 | val testName = s"Simple InsertInto Test for $tableNameSpark type $tType"
128 | test(testName) {
129 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned)
130 | def code() = {
131 | helper.recreate(tableSpark)
132 | }
133 | helper.myRun(testName, code)
134 | }
135 | }
136 | }
137 |
138 | }
139 |
--------------------------------------------------------------------------------
/src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkOptionsSuite.scala:
--------------------------------------------------------------------------------
1 | package com.qubole.spark.hiveacid.streaming
2 |
3 | import java.util.Locale
4 |
5 | import com.qubole.spark.hiveacid.Table
6 | import org.apache.spark.sql.streaming.OutputMode
7 |
8 |
9 | class HiveAcidSinkOptionsSuite extends HiveAcidStreamingFunSuite {
10 |
11 | import HiveAcidSinkOptions._
12 |
13 | test("bad sink options") {
14 |
15 | def testBadOptions(options: List[(String, String)])(expectedMsg: String): Unit = {
16 |
17 | val tableName = "tempTable"
18 | val tType = Table.orcFullACIDTable
19 | val cols = Map(
20 | ("value1","int"),
21 | ("value2", "int")
22 | )
23 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false)
24 |
25 | // creating table
26 | helper.recreate(tableHive)
27 | val errorMessage = intercept[IllegalArgumentException] {
28 | helper.runStreaming(
29 | tableHive.hiveTname, OutputMode.Append(), tableHive.getColMap.keys.toSeq, Range(1, 4), options)
30 | }.getMessage
31 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
32 |
33 | }
34 |
35 | testBadOptions(List(CLEANUP_DELAY_KEY -> "-2"))("Invalid value '-2' " +
36 | s"for option '$CLEANUP_DELAY_KEY', must be a positive integer")
37 | testBadOptions(List(COMPACT_INTERVAL_KEY -> "-5"))("Invalid value '-5' " +
38 | s"for option '$COMPACT_INTERVAL_KEY', must be a positive integer")
39 | testBadOptions(List(MIN_BATCHES_TO_RETAIN_KEY -> "-5"))("Invalid value '-5' " +
40 | s"for option '$MIN_BATCHES_TO_RETAIN_KEY', must be a positive integer")
41 | testBadOptions(List(LOG_DELETION_KEY -> "x"))("Invalid value 'x' " +
42 | s"for option '$LOG_DELETION_KEY', must be true or false")
43 |
44 | }
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.streaming
21 |
22 | import java.util.Locale
23 |
24 | import com.qubole.shaded.hadoop.hive.ql.metadata.InvalidTableException
25 | import com.qubole.spark.hiveacid.{AnalysisException, Table}
26 | import com.qubole.spark.hiveacid.datasource.HiveAcidDataSource
27 | import org.apache.spark.sql.Row
28 | import org.apache.spark.sql.streaming.OutputMode
29 |
30 |
31 | class HiveAcidSinkSuite extends HiveAcidStreamingFunSuite {
32 |
33 | override protected def afterAll(): Unit = {
34 | helper.destroy()
35 | }
36 |
37 | test("table not created") {
38 | val ds = new HiveAcidDataSource()
39 | val tableName = "tempTable"
40 | val options = Map("table" -> s"$tableName")
41 |
42 | val errorMessage = intercept[InvalidTableException] {
43 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append())
44 | }.getMessage()
45 | val expectedMsg = s"""table not found $tableName"""
46 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
47 |
48 | }
49 |
50 | test("table not acid table") {
51 | val ds = new HiveAcidDataSource()
52 | val tableName = s"tempTable"
53 | val options = Map("table" -> s"$DEFAULT_DBNAME.$tableName")
54 |
55 | val tType = Table.orcTable
56 | val cols = Map(
57 | ("value1","int"),
58 | ("value2", "int")
59 | )
60 |
61 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false)
62 |
63 | helper.recreate(tableHive, false)
64 |
65 | val errorMessage = intercept[IllegalArgumentException] {
66 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append())
67 | }.getMessage()
68 | val expectedMsg = s"""table ${tableHive.hiveTname} is not an acid table"""
69 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
70 |
71 | }
72 |
73 | test("table is bucketed") {
74 | val ds = new HiveAcidDataSource()
75 | val tableName = s"tempTable"
76 | val options = Map("table" -> s"$DEFAULT_DBNAME.$tableName")
77 |
78 | val tType = Table.orcBucketedFullACIDTable
79 | val cols = Map(
80 | ("value1","int"),
81 | ("value2", "int")
82 | )
83 |
84 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false)
85 |
86 | helper.recreate(tableHive, false)
87 |
88 | val errorMessage = intercept[RuntimeException] {
89 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append())
90 | }.getMessage()
91 | val expectedMsg = s"""Unsupported operation type - Streaming Write for Bucketed table """
92 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
93 |
94 | }
95 |
96 | test("partitionBy is specified with Acid Streaming") {
97 | val ds = new HiveAcidDataSource()
98 | val options = Map("table" -> "dummyTable")
99 | val errorMessage = intercept[UnsupportedOperationException] {
100 | ds.createSink(helper.spark.sqlContext, options, Seq("col1", "col2"), OutputMode.Append())
101 | }.getMessage()
102 |
103 | val expectedMsg = "Unsupported Function - partitionBy with HiveAcidSink"
104 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
105 |
106 | }
107 |
108 | test("incorrect output mode is used with Acid Streaming") {
109 | val ds = new HiveAcidDataSource()
110 | val options = Map("table" -> "dummyTable")
111 | val errorMessage = intercept[AnalysisException] {
112 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Update())
113 | }.getMessage()
114 | val expectedMsg = "mode is Update: Hive Acid Sink supports only Append as OutputMode"
115 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
116 |
117 | }
118 |
119 | test("table not specified") {
120 | val ds = new HiveAcidDataSource()
121 | val options = Map.empty[String, String]
122 | val errorMessage = intercept[IllegalArgumentException] {
123 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append())
124 | }.getMessage()
125 | val expectedMsg = """Table Name is not specified"""
126 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT)))
127 |
128 | }
129 |
130 | // Test Run
131 | streamingTestForAcidTables(Table.allNonBucketedFullAcidTypes())
132 | streamingTestForAcidTables(Table.allNonBucketedInsertOnlyTypes())
133 |
134 | def streamingTestForAcidTables(tTypes: List[(String,Boolean)]): Unit = {
135 | tTypes.foreach { case (tType, isPartitioned) =>
136 | val tableNameHive = "tHive"
137 | val testName = s"Simple Streaming Query Append for $tableNameHive type $tType"
138 | test(testName) {
139 | val cols: Map[String, String] = {
140 | if(!isPartitioned) {
141 | Map(
142 | ("value1","int"),
143 | ("value2","int")
144 | )
145 | } else {
146 | Map(
147 | ("value","int")
148 | )
149 | }
150 | }
151 |
152 | val tableHive = new Table(DEFAULT_DBNAME, tableNameHive, cols, tType, isPartitioned)
153 | def code(): Unit = {
154 | helper.recreate(tableHive)
155 |
156 | helper.runStreaming(tableHive.hiveTname, OutputMode.Append(), tableHive.getColMap.keys.toSeq, Range(1, 4))
157 | val resDf = helper.sparkGetDF(tableHive)
158 | val resultRow = (Row(100, 10, 1) :: Row(200, 20, 2) :: Row(300, 30, 3) :: Nil).toArray
159 | helper.compareResult(resDf._1.collect(), resultRow)
160 | helper.compareResult(resDf._2.collect(), resultRow)
161 | helper.compare(tableHive, "compare via hive")
162 | }
163 | helper.myRun(testName, code)
164 | }
165 | }
166 | }
167 |
168 | }
169 |
--------------------------------------------------------------------------------
/src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidStreamingFunSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 |
21 | package com.qubole.spark.hiveacid.streaming
22 |
23 | import org.apache.log4j.{Level, LogManager, Logger}
24 | import org.scalatest.{BeforeAndAfterAll, FunSuite}
25 |
26 | import scala.util.control.NonFatal
27 |
28 |
29 | abstract class HiveAcidStreamingFunSuite extends FunSuite with BeforeAndAfterAll {
30 |
31 | protected val log: Logger = LogManager.getLogger(this.getClass)
32 | log.setLevel(Level.INFO)
33 |
34 | protected var helper: StreamingTestHelper = _
35 | protected val isDebug = true
36 |
37 | protected val DEFAULT_DBNAME = "HiveTestDB"
38 |
39 | override protected def beforeAll() {
40 | try {
41 |
42 | helper = new StreamingTestHelper
43 | if (isDebug) {
44 | log.setLevel(Level.DEBUG)
45 | }
46 | helper.init(isDebug)
47 |
48 | // DB
49 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE")
50 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME)
51 | } catch {
52 | case NonFatal(e) => log.info("failed " + e)
53 | }
54 | }
55 |
56 | override protected def afterAll(): Unit = {
57 | helper.destroy()
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/it/scala/com/qubole/spark/hiveacid/streaming/StreamingTestHelper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 |
21 | package com.qubole.spark.hiveacid.streaming
22 |
23 | import java.io.{File, IOException}
24 | import java.util.UUID
25 |
26 | import com.qubole.spark.hiveacid.TestHelper
27 |
28 | import org.apache.spark.network.util.JavaUtils
29 | import org.apache.spark.sql.execution.streaming.MemoryStream
30 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}
31 | import org.scalatest.concurrent.TimeLimits
32 | import org.scalatest.time.SpanSugar
33 |
34 | class StreamingTestHelper extends TestHelper with TimeLimits {
35 |
36 | import StreamingTestHelper._
37 |
38 |
39 | def runStreaming(tableName: String,
40 | outputMode: OutputMode,
41 | cols: Seq[String],
42 | inputRange: Range,
43 | options: List[(String, String)] = List.empty): Unit = {
44 |
45 | val inputData = MemoryStream[Int]
46 | val ds = inputData.toDS()
47 |
48 | val checkpointDir = createCheckpointDir(namePrefix = "stream.checkpoint").getCanonicalPath
49 |
50 | var query: StreamingQuery = null
51 |
52 | try {
53 | // Starting streaming query
54 | val writerDf =
55 | ds.map(i => (i*100, i*10, i))
56 | .toDF(cols:_*)
57 | .writeStream
58 | .format("HiveAcid")
59 | .option("table", tableName)
60 | .outputMode(outputMode)
61 | .option("checkpointLocation", checkpointDir)
62 | //.start()
63 |
64 |       query = options.foldLeft(writerDf) { case (writer, (key, value)) =>
65 |         writer.option(key, value)
66 |       }.start()
67 |
68 | // Adding data for streaming query
69 | inputData.addData(inputRange)
70 | failAfter(STREAMING_TIMEOUT) {
71 | query.processAllAvailable()
72 | }
73 | } finally {
74 | if (query != null) {
75 | // Terminating streaming query
76 | query.stop()
77 | deleteCheckpointDir(checkpointDir)
78 | }
79 | }
80 | }
81 |
82 | def deleteCheckpointDir(fileStr: String): Unit = {
83 | val file = new File(fileStr)
84 | if (file != null) {
85 | JavaUtils.deleteRecursively(file)
86 | }
87 | }
88 |
89 | def createCheckpointDir(root: String = System.getProperty("java.io.tmpdir"),
90 | namePrefix: String = "spark"): File = {
91 |
92 | var attempts = 0
93 | val maxAttempts = MAX_DIR_CREATION_ATTEMPTS
94 | var dir: File = null
95 | while (dir == null) {
96 | attempts += 1
97 | if (attempts > maxAttempts) {
98 | throw new IOException("Failed to create a temp directory (under " + root + ") after " +
99 | maxAttempts + " attempts!")
100 | }
101 | try {
102 | dir = new File(root, namePrefix + "-" + UUID.randomUUID.toString)
103 | if (dir.exists() || !dir.mkdirs()) {
104 | dir = null
105 | }
106 |     } catch { case _: SecurityException => dir = null }
107 | }
108 | dir.getCanonicalFile
109 | }
110 |
111 | }
112 |
113 | object StreamingTestHelper extends TestHelper with SpanSugar {
114 |
115 | val MAX_DIR_CREATION_ATTEMPTS = 10
116 | val STREAMING_TIMEOUT = 60.seconds
117 |
118 | }
119 |
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2019 Qubole, Inc. All rights reserved.
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | com.qubole.spark.hiveacid.datasource.HiveAcidDataSource
21 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/shaded/hadoop/hive/ql/io/orc/OrcAcidUtil.scala:
--------------------------------------------------------------------------------
1 | package com.qubole.shaded.hadoop.hive.ql.io.orc
2 |
3 | import java.util.regex.Pattern
4 |
5 | import com.qubole.shaded.hadoop.hive.ql.io.AcidUtils
6 | import org.apache.hadoop.fs.Path
7 |
8 | object OrcAcidUtil {
9 | val BUCKET_PATTERN = Pattern.compile("bucket_[0-9]{5}$")
10 |
11 | def getDeleteDeltaPaths(orcSplit: OrcSplit): Array[Path] = {
12 | assert(BUCKET_PATTERN.matcher(orcSplit.getPath.getName).matches())
13 | val bucket = AcidUtils.parseBucketId(orcSplit.getPath)
14 | assert(bucket != -1)
15 |     val deleteDeltaDirPaths = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(orcSplit)
16 | deleteDeltaDirPaths.map(deleteDir => AcidUtils.createBucketFile(deleteDir, bucket))
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/datasources/hiveacid/sql/HiveAnalysisException.scala:
--------------------------------------------------------------------------------
1 | package com.qubole.spark.datasources.hiveacid.sql
2 |
3 | import org.apache.spark.sql.AnalysisException
4 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
5 |
6 | class HiveAnalysisException(
7 | override val message: String,
8 | override val line: Option[Int] = None,
9 | override val startPosition: Option[Int] = None,
10 | // Some plans fail to serialize due to bugs in scala collections.
11 | @transient override val plan: Option[LogicalPlan] = None,
12 | override val cause: Option[Throwable] = None) extends AnalysisException(message, line, startPosition, plan, cause) {
13 | }
14 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/parser/ParseDriver.scala:
--------------------------------------------------------------------------------
1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.parser
2 |
3 | import com.qubole.spark.datasources.hiveacid.sql.catalyst.parser.{SqlHiveParser => SqlBaseParser}
4 | import org.antlr.v4.runtime._
5 | import org.antlr.v4.runtime.misc.Interval
6 | import org.antlr.v4.runtime.tree.TerminalNodeImpl
7 | import org.apache.spark.sql.catalyst.expressions.AttributeReference
8 | import org.apache.spark.sql.types.StructType
9 |
10 | /**
11 | * A copy of [[org.apache.spark.sql.catalyst.parser.UpperCaseCharStream]]
12 | */
13 | class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream {
14 | override def consume(): Unit = wrapped.consume
15 | override def getSourceName(): String = wrapped.getSourceName
16 | override def index(): Int = wrapped.index
17 | override def mark(): Int = wrapped.mark
18 | override def release(marker: Int): Unit = wrapped.release(marker)
19 | override def seek(where: Int): Unit = wrapped.seek(where)
20 | override def size(): Int = wrapped.size
21 |
22 | override def getText(interval: Interval): String = {
23 | // ANTLR 4.7's CodePointCharStream implementations have bugs when
24 | // getText() is called with an empty stream, or intervals where
25 | // the start > end. See
26 | // https://github.com/antlr/antlr4/commit/ac9f7530 for one fix
27 | // that is not yet in a released ANTLR artifact.
28 | if (size() > 0 && (interval.b - interval.a >= 0)) wrapped.getText(interval) else ""
29 | }
30 |
31 | override def LA(i: Int): Int = {
32 | val la = wrapped.LA(i)
33 | if (la == 0 || la == IntStream.EOF) la
34 | else Character.toUpperCase(la)
35 | }
36 | }
37 |
38 | /**
39 | * An adaptation of [[org.apache.spark.sql.catalyst.parser.PostProcessor]]
40 | */
41 | case object PostProcessor extends SqlHiveBaseListener {
42 |
43 | /** Remove the back ticks from an Identifier. */
44 | override def exitQuotedIdentifier(ctx: SqlBaseParser.QuotedIdentifierContext): Unit = {
45 | replaceTokenByIdentifier(ctx, 1) { token =>
46 | // Remove the double back ticks in the string.
47 | token.setText(token.getText.replace("``", "`"))
48 | token
49 | }
50 | }
51 |
52 | /** Treat non-reserved keywords as Identifiers. */
53 | override def exitNonReserved(ctx: SqlBaseParser.NonReservedContext): Unit = {
54 | replaceTokenByIdentifier(ctx, 0)(identity)
55 | }
56 |
57 | private def replaceTokenByIdentifier(
58 | ctx: ParserRuleContext,
59 | stripMargins: Int)(
60 | f: CommonToken => CommonToken = identity): Unit = {
61 | val parent = ctx.getParent
62 | parent.removeLastChild()
63 | val token = ctx.getChild(0).getPayload.asInstanceOf[Token]
64 | val newToken = new CommonToken(
65 | new org.antlr.v4.runtime.misc.Pair(token.getTokenSource, token.getInputStream),
66 | SqlBaseParser.IDENTIFIER,
67 | token.getChannel,
68 | token.getStartIndex + stripMargins,
69 | token.getStopIndex - stripMargins)
70 | parent.addChild(new TerminalNodeImpl(f(newToken)))
71 | }
72 | }
73 |
74 | /**
75 | * An adaptation of [[org.apache.spark.util.random.RandomSampler]]
76 | */
77 | object RandomSampler {
78 | /**
79 | * Sampling fraction arguments may be results of computation, and subject to floating
80 | * point jitter. I check the arguments with this epsilon slop factor to prevent spurious
81 | * warnings for cases such as summing some numbers to get a sampling fraction of 1.000000001
82 | */
83 | val roundingEpsilon = 1e-6
84 | }
85 |
86 | object SparkAdaptation {
87 | /**
88 | * An adaptation of [[org.apache.spark.sql.types.StructType#toAttributes]]
89 | */
90 | def toAttributes(structType: StructType): Seq[AttributeReference] =
91 | structType.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)())
92 | }
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/DeleteCommand.scala:
--------------------------------------------------------------------------------
1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command
2 |
3 | import com.qubole.spark.hiveacid.HiveAcidErrors
4 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
5 | import org.apache.spark.sql.{Column, Row, SparkSession}
6 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
7 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
8 | import org.apache.spark.sql.execution.command.RunnableCommand
9 | import org.apache.spark.sql.execution.datasources.LogicalRelation
10 |
11 | case class DeleteCommand(
12 | table: LogicalPlan,
13 | condition: Expression)
14 | extends RunnableCommand {
15 |
16 | // We don't want `table` in children as sometimes we don't want to transform it.
17 | override def children: Seq[LogicalPlan] = Seq(table)
18 | override def output: Seq[Attribute] = Seq.empty
19 | override lazy val resolved: Boolean = childrenResolved
20 | override def run(sparkSession: SparkSession): Seq[Row] = {
21 | if (children.size != 1) {
22 | throw new IllegalArgumentException("DELETE command should specify exactly one table, whereas this has: "
23 | + children.size)
24 | }
25 | children(0) match {
26 | case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => {
27 | relation.delete(new Column(condition))
28 | }
29 | case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
30 | }
31 | Seq.empty[Row]
32 | }
33 | }
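For orientation, DeleteCommand is what the extension's parser produces for a DELETE statement against an ACID table; run() resolves the LogicalRelation and delegates to HiveAcidRelation.delete. A hedged sketch of the statement shape that reaches it, assuming a SparkSession `spark` configured with HiveAcidAutoConvertExtension (table and column names are hypothetical):

// DELETE against an ACID table; parsed by SparkAcidSqlParser into DeleteCommand.
spark.sql("DELETE FROM mydb.acid_tbl WHERE intCol = 11")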
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/MergeCommand.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command
19 |
20 | import com.qubole.spark.hiveacid.HiveAcidErrors
21 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
22 | import com.qubole.spark.hiveacid.merge.{MergeCondition, MergeWhenClause, MergeWhenNotInsert}
23 | import org.apache.spark.sql.catalyst.AliasIdentifier
24 | import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases
25 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation}
26 | import org.apache.spark.sql.{Row, SparkSession, SqlUtils}
27 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
28 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
29 | import org.apache.spark.sql.execution.command.RunnableCommand
30 | import org.apache.spark.sql.execution.datasources.LogicalRelation
31 |
32 | case class MergeCommand(targetTable: LogicalPlan,
33 | sourceTable: LogicalPlan,
34 | matched: Seq[MergeWhenClause],
35 | notMatched: Option[MergeWhenClause],
36 | mergeCondition: MergeCondition,
37 | sourceAlias: Option[AliasIdentifier],
38 | targetAlias: Option[AliasIdentifier])
39 | extends RunnableCommand {
40 |
41 | override def children: Seq[LogicalPlan] = Seq(targetTable, sourceTable)
42 | override def output: Seq[Attribute] = Seq.empty
43 | override lazy val resolved: Boolean = childrenResolved
44 | override def run(sparkSession: SparkSession): Seq[Row] = {
45 | val insertClause: Option[MergeWhenNotInsert] = notMatched match {
46 | case Some(i: MergeWhenNotInsert) => Some(i)
47 | case None => None
48 | case _ => throw HiveAcidErrors.mergeValidationError("WHEN NOT Clause has to be INSERT CLAUSE")
49 | }
50 |
51 | val targetRelation = children.head
52 | val sourceRelation = children.last
53 |
54 | val sourceTableFullyQualifiedName = SqlUtils.removeTopSubqueryAlias(sourceRelation) match {
55 | case hiveTable: HiveTableRelation =>
56 | Some(hiveTable.tableMeta.qualifiedName)
57 | case LogicalRelation(acidRelation: HiveAcidRelation, _, _, _) =>
58 | Some(acidRelation.fullyQualifiedTableName)
59 | case LogicalRelation(_, _, catalogTable: Option[CatalogTable], _) if catalogTable.isDefined =>
60 | Some(catalogTable.get.qualifiedName)
61 | case _ => None
62 | }
63 |
64 | val (_, sourceDf) = SqlUtils.getDFQualified(sparkSession,
65 | SqlUtils.logicalPlanToDataFrame(sparkSession, sourceTable),
66 | sourceTableFullyQualifiedName.getOrElse(""))
67 |
68 | SqlUtils.removeTopSubqueryAlias(targetRelation) match {
69 | case LogicalRelation(relation: HiveAcidRelation, _, _, _) =>
70 | relation.merge(sourceDf,
71 | mergeCondition.expression, matched, insertClause, sourceAlias, targetAlias)
72 | case _ => throw HiveAcidErrors.tableNotAcidException(targetTable.toString())
73 | }
74 |
75 | Seq.empty
76 | }
77 | }
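A hedged sketch of the MERGE statement shape this command backs, again assuming a SparkSession `spark` with the extension installed; the table, alias, and column names are hypothetical, and the clause structure mirrors the matched/not-matched split in the case class above:

// MERGE with one matched (UPDATE) clause and one not-matched (INSERT) clause.
spark.sql(
  """MERGE INTO mydb.target AS t
    |USING mydb.source AS s
    |ON t.key = s.key
    |WHEN MATCHED THEN UPDATE SET value = s.value
    |WHEN NOT MATCHED THEN INSERT VALUES (s.key, s.value)
    |""".stripMargin)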
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/UpdateCommand.scala:
--------------------------------------------------------------------------------
1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command
2 |
3 | import com.qubole.spark.hiveacid.HiveAcidErrors
4 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation
5 | import org.apache.spark.sql.{Column, Row, SparkSession}
6 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
7 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
8 | import org.apache.spark.sql.execution.command.RunnableCommand
9 | import org.apache.spark.sql.execution.datasources.LogicalRelation
10 |
11 | case class UpdateCommand(
12 | table: LogicalPlan,
13 | setExpressions: Map[String, Expression],
14 | condition: Option[Expression])
15 | extends RunnableCommand {
16 |
17 | override def children: Seq[LogicalPlan] = Seq(table)
18 | override def output: Seq[Attribute] = Seq.empty
19 | override lazy val resolved: Boolean = childrenResolved
20 |
21 | override def run(sparkSession: SparkSession): Seq[Row] = {
22 | if (children.size != 1) {
23 | throw new IllegalArgumentException("UPDATE command should have one table to update, whereas this has: "
24 | + children.size)
25 | }
26 | children(0) match {
27 | case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => {
28 | val setColumns = setExpressions.mapValues(expr => new Column(expr))
29 | val updateFilterColumn = condition.map(new Column(_))
30 | relation.update(updateFilterColumn, setColumns)
31 | }
32 | case LogicalRelation(_, _, Some(catalogTable), _) =>
33 | throw HiveAcidErrors.tableNotAcidException(catalogTable.qualifiedName)
34 | case _ => throw HiveAcidErrors.tableNotAcidException(table.toString())
35 | }
36 | Seq.empty[Row]
37 | }
38 | }
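The corresponding UPDATE shape, mirroring the statement exercised in UpdateDeleteSuite ("UPDATE ... SET ... WHERE ..."); as before, `spark` is assumed to carry the extension and the identifiers are hypothetical:

// UPDATE against an ACID table; parsed into UpdateCommand with one SET expression.
spark.sql("UPDATE mydb.acid_tbl SET doubleCol = doubleCol + 1.0 WHERE intCol > 10")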
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/datasources/hiveacid/sql/execution/SparkAcidSqlParser.scala:
--------------------------------------------------------------------------------
1 | package com.qubole.spark.datasources.hiveacid.sql.execution
2 |
3 | import com.qubole.spark.datasources.hiveacid.sql.catalyst.parser._
4 | import org.antlr.v4.runtime._
5 | import org.antlr.v4.runtime.atn.PredictionMode
6 | import org.apache.spark.internal.Logging
7 | import org.apache.spark.sql.catalyst.expressions.Expression
8 | import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
9 | import org.apache.spark.sql.{AnalysisException, SparkSession}
10 | import org.apache.spark.sql.catalyst.parser.{ParseErrorListener, ParseException, ParserInterface}
11 | import org.apache.spark.sql.catalyst.plans.logical._
12 | import org.apache.spark.sql.catalyst.trees.Origin
13 | import org.apache.spark.sql.execution.SparkSqlParser
14 | import org.apache.spark.sql.internal.{SQLConf, VariableSubstitution}
15 | import org.apache.spark.sql.types.{DataType, StructType}
16 |
17 | /**
18 | * Concrete parser for Hive SQL statements.
19 | */
20 | case class SparkAcidSqlParser(sparkParser: ParserInterface) extends ParserInterface with Logging {
21 |
22 | override def parseExpression(sqlText: String): Expression = sparkParser.parseExpression(sqlText)
23 |
24 | override def parseTableIdentifier(sqlText: String): TableIdentifier = sparkParser.parseTableIdentifier(sqlText)
25 |
26 | override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = sparkParser.parseFunctionIdentifier(sqlText)
27 |
28 | override def parseTableSchema(sqlText: String): StructType = sparkParser.parseTableSchema(sqlText)
29 |
30 | override def parseDataType(sqlText: String): DataType = sparkParser.parseDataType(sqlText)
31 |
32 | private val substitutor: VariableSubstitution = {
33 | val field = classOf[SparkSqlParser].getDeclaredField("substitutor")
34 | field.setAccessible(true)
35 | field.get(sparkParser).asInstanceOf[VariableSubstitution]
36 | }
37 |
38 | // FIXME scala reflection would be better
39 | private val conf: SQLConf = {
40 | val field = classOf[VariableSubstitution].getDeclaredField("org$apache$spark$sql$internal$VariableSubstitution$$conf")
41 | field.setAccessible(true)
42 | field.get(substitutor).asInstanceOf[SQLConf]
43 | }
44 |
45 | private val sparkAcidAstBuilder = new SparkSqlAstBuilder(conf)
46 |
47 | override def parsePlan(sqlText: String): LogicalPlan = {
48 | try {
49 | parse(sqlText) { parser =>
50 | sparkAcidAstBuilder.visitSingleStatement(parser.singleStatement()) match {
51 | case plan: LogicalPlan => plan
52 | case _ => sparkParser.parsePlan(sqlText)
53 | }
54 | }
55 | } catch {
56 | case e: AcidParseException => throw e.parseException
57 | case _: ParseException => sparkParser.parsePlan(sqlText)
58 | }
59 | }
60 |
61 | /**
62 | * An adaptation of [[org.apache.spark.sql.execution.SparkSqlParser#parse]]
63 | * and [[org.apache.spark.sql.catalyst.parser.AbstractSqlParser#parse]]
64 | */
65 | protected def parse[T](sqlText: String)(toResult: SqlHiveParser => T): T = {
66 | val command = substitutor.substitute(sqlText)
67 | logDebug(s"Parsing command: $command")
68 |
69 |
70 | val lexer = new SqlHiveLexer(new UpperCaseCharStream(CharStreams.fromString(command)))
71 | lexer.removeErrorListeners()
72 | lexer.addErrorListener(ParseErrorListener)
73 | lexer.legacy_setops_precedence_enbled = SQLConf.get.setOpsPrecedenceEnforced
74 |
75 | val tokenStream = new CommonTokenStream(lexer)
76 | val acidSpecific = checkIfAcidSpecific(tokenStream)
77 | tokenStream.seek(0) //reset stream to first token
78 | val parser = new SqlHiveParser(tokenStream)
79 | parser.addParseListener(PostProcessor)
80 | parser.removeErrorListeners()
81 | parser.addErrorListener(ParseErrorListener)
82 | parser.legacy_setops_precedence_enbled = SQLConf.get.setOpsPrecedenceEnforced
83 | try {
84 | parser.getInterpreter.setPredictionMode(PredictionMode.LL)
85 | toResult(parser)
86 | } catch {
87 | case e: ParseException if e.command.isDefined =>
88 | throw wrapParseException(e, acidSpecific)
89 | case e: ParseException =>
90 | throw wrapParseException(e.withCommand(command), acidSpecific)
91 | case e: AnalysisException =>
92 | val position = Origin(e.line, e.startPosition)
93 | val pe = new ParseException(Option(command), e.message, position, position)
94 | throw wrapParseException(pe, acidSpecific)
95 | }
96 | }
97 |
98 | /**
99 |    * Denotes an ACID-specific ParseException.
100 |    * @param parseException the underlying ParseException raised while parsing an ACID statement
101 | */
102 | class AcidParseException(val parseException: ParseException) extends Exception
103 |
104 | def wrapParseException(e: ParseException, acidSpecific: Boolean): Throwable = {
105 | if (acidSpecific) {
106 | new AcidParseException(e)
107 | } else {
108 | e
109 | }
110 | }
111 | def checkIfAcidSpecific(tokStream: TokenStream): Boolean = {
112 | tokStream.LA(1) match {
113 | case SqlHiveParser.DELETE | SqlHiveParser.MERGE | SqlHiveParser.UPDATE => true
114 | case _ => false
115 | }
116 | }
117 | }
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/.gitignore:
--------------------------------------------------------------------------------
1 | *.scalae
2 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/HiveAcidAutoConvert.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid
21 |
22 | import java.util.Locale
23 |
24 | import com.qubole.spark.datasources.hiveacid.sql.execution.SparkAcidSqlParser
25 | import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}
26 | import org.apache.spark.sql.catalyst.catalog.HiveTableRelation
27 | import org.apache.spark.sql.catalyst.plans.logical.{Filter, InsertIntoTable, LogicalPlan}
28 | import org.apache.spark.sql.catalyst.rules.Rule
29 | import org.apache.spark.sql.execution.command.DDLUtils
30 | import org.apache.spark.sql.execution.datasources.LogicalRelation
31 | import com.qubole.spark.hiveacid.datasource.HiveAcidDataSource
32 |
33 |
34 | /**
35 |  * Analyzer rule to convert a transactional HiveTableRelation
36 |  * into a LogicalRelation backed by HiveAcidRelation.
37 | * @param spark - spark session
38 | */
39 | case class HiveAcidAutoConvert(spark: SparkSession) extends Rule[LogicalPlan] {
40 |
41 | private def isConvertible(relation: HiveTableRelation): Boolean = {
42 | val serde = relation.tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT)
43 | relation.tableMeta.properties.getOrElse("transactional", "false").toBoolean
44 | }
45 |
46 | private def convert(relation: HiveTableRelation): LogicalRelation = {
47 | val options = relation.tableMeta.properties ++
48 | relation.tableMeta.storage.properties ++ Map("table" -> relation.tableMeta.qualifiedName)
49 |
50 | val newRelation = new HiveAcidDataSource().createRelation(spark.sqlContext, options)
51 | LogicalRelation(newRelation, isStreaming = false)
52 | }
53 |
54 | override def apply(plan: LogicalPlan): LogicalPlan = {
55 | plan resolveOperators {
56 | // Write path
57 | case InsertIntoTable(r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists)
58 | if query.resolved && DDLUtils.isHiveTable(r.tableMeta) && isConvertible(r) =>
59 | InsertIntoTable(convert(r), partition, query, overwrite, ifPartitionNotExists)
60 |
61 | // Read path
62 | case relation: HiveTableRelation
63 | if DDLUtils.isHiveTable(relation.tableMeta) && isConvertible(relation) =>
64 | convert(relation)
65 | }
66 | }
67 | }
68 |
69 | class HiveAcidAutoConvertExtension extends (SparkSessionExtensions => Unit) {
70 | def apply(extension: SparkSessionExtensions): Unit = {
71 | extension.injectResolutionRule(HiveAcidAutoConvert.apply)
72 | extension.injectParser { (session, parser) =>
73 | SparkAcidSqlParser(parser)
74 | }
75 | }
76 | }
77 |
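HiveAcidAutoConvertExtension registers both the resolution rule and the ACID SQL parser on a session. A minimal sketch of enabling it through spark.sql.extensions, mirroring the configuration used by the integration-test session earlier in this dump (the app name and other settings are illustrative):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("acid-enabled-session")
  .config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension")
  .enableHiveSupport()
  .getOrCreate()

// From here on, DELETE/UPDATE/MERGE SQL and reads of transactional Hive tables
// are routed through the classes in this package.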
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/HiveAcidErrors.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid
21 |
22 | import org.apache.spark.sql.{SaveMode, SqlUtils}
23 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
24 |
25 | object HiveAcidErrors {
26 |
27 | def formatColumn(colName: String): String = s"`$colName`"
28 |
29 | def formatColumnList(colNames: Seq[String]): String =
30 | colNames.map(formatColumn).mkString("[", ", ", "]")
31 |
32 | def tableNotSpecifiedException(): Throwable = {
33 | new IllegalArgumentException("'table' is not specified in parameters")
34 | }
35 |
36 | def unsupportedFunction(function: String, caller: String): Throwable = {
37 | new java.lang.UnsupportedOperationException(s"Unsupported Function - $function with $caller")
38 | }
39 |
40 | def invalidOperationType(operation: String): Throwable = {
41 | new RuntimeException(s"Invalid operation type - $operation")
42 | }
43 |
44 | def unsupportedSaveMode(saveMode: SaveMode): Throwable = {
45 | new RuntimeException(s"Unsupported save mode - $saveMode")
46 | }
47 |
48 | def unsupportedOperationTypeInsertOnlyTable(operation: String, tableName: String): Throwable = {
49 | new RuntimeException(s"Unsupported operation type - $operation for InsertOnly table " + tableName)
50 | }
51 |
52 | def unsupportedOperationTypeBucketedTable(operation: String, tableName: String): Throwable = {
53 | new RuntimeException(s"Unsupported operation type - $operation for Bucketed table " + tableName)
54 | }
55 |
56 | def tableNotAcidException(tableName: String): Throwable = {
57 | new IllegalArgumentException(s"table $tableName is not an ACID table")
58 | }
59 |
60 | def couldNotAcquireLockException(exception: Exception = null): Throwable = {
61 | new RuntimeException(s"Could not acquire lock.", exception)
62 | }
63 |
64 | def couldNotAcquireLockException(state: String): Throwable = {
65 | new RuntimeException(s"Could not acquire lock. Lock State: $state")
66 | }
67 |
68 | def txnAlreadyClosed(txnId: Long): Throwable = {
69 | new RuntimeException(s"Transaction $txnId is already closed")
70 | }
71 |
72 | def txnAlreadyOpen(txnId: Long): Throwable = {
73 | new RuntimeException(s"Transaction already opened. Existing txnId: $txnId")
74 | }
75 |
76 | def txnNotStarted(table: String): Throwable = {
77 | new RuntimeException(s"Transaction on $table not started")
78 | }
79 |
80 | def txnNoTransaction(): Throwable = {
81 | new RuntimeException(s"No transaction found")
82 | }
83 |
84 | def tableSnapshotNonExistent(snapshotId: Long): Throwable = {
85 | new RuntimeException(s"Table snapshost $snapshotId does not exist")
86 | }
87 |
88 | def tableWriteIdRequestedBeforeTxnStart(table: String): Throwable = {
89 | new RuntimeException(s"Write id requested for table $table before txn was started")
90 | }
91 |
92 | def repeatedTxnId(txnId: Long, activeTxns: Seq[Long]): Throwable = {
93 | new RuntimeException(
94 | s"Repeated transaction id $txnId, active transactions are [${activeTxns.mkString(",")}]")
95 | }
96 |
97 | def unsupportedStreamingOutputMode(mode: String): Throwable = {
98 | new AnalysisException(
99 | s"mode is $mode: Hive Acid Sink supports only Append as OutputMode")
100 | }
101 |
102 | def updateSetColumnNotFound(col: String, colList: Seq[String]): Throwable = {
103 | new AnalysisException(
104 | s"SET column ${formatColumn(col)} not found among columns: ${formatColumnList(colList)}.")
105 | }
106 |
107 | def updateOnPartition(cols: Seq[String], table: String): Throwable = {
108 | val message = if (cols.length == 1) {
109 | s"SET column: ${cols.head} is partition column in table: ${table}"
110 | } else {
111 | s"SET columns: ${cols.mkString(",")} are partition columns in table: ${table}"
112 | }
113 | new AnalysisException(
114 | s"UPDATE on the partition columns are not allowed. $message"
115 | )
116 | }
117 |
118 | def txnOutdated(txnId: Long, tableName: String): Throwable = {
119 | new TransactionInvalidException(
120 | s"Transaction is $txnId is no longer valid for table $tableName", txnId, tableName)
121 | }
122 |
123 | def unexpectedReadError(cause: String): Throwable = {
124 | throw new RuntimeException(
125 | s"Unexpected error while reading the Hive Acid Data: $cause")
126 | }
127 |
128 | def mergeValidationError(cause: String): Throwable = {
129 | SqlUtils.analysisException(s"MERGE Validation Error: $cause")
130 | }
131 |
132 | def mergeResolutionError(cause: String): Throwable = {
133 | SqlUtils.analysisException(cause)
134 | }
135 |
136 | def mergeUnsupportedError(cause: String): Throwable = {
137 | throw new RuntimeException(cause)
138 | }
139 | }
140 |
141 | class TransactionInvalidException(val message:String,
142 | val txnId: Long,
143 | val tableName : String)
144 | extends Exception(message) {
145 | override def getMessage: String = {
146 | message
147 | }
148 | }
149 |
150 | class AnalysisException(
151 | val message: String,
152 | val line: Option[Int] = None,
153 | val startPosition: Option[Int] = None,
154 | // Some plans fail to serialize due to bugs in scala collections.
155 | @transient val plan: Option[LogicalPlan] = None,
156 | val cause: Option[Throwable] = None)
157 | extends Exception(message, cause.orNull) with Serializable {
158 |
159 | def withPosition(line: Option[Int], startPosition: Option[Int]): AnalysisException = {
160 | val newException = new AnalysisException(message, line, startPosition)
161 | newException.setStackTrace(getStackTrace)
162 | newException
163 | }
164 |
165 | override def getMessage: String = {
166 | val planAnnotation = Option(plan).flatten.map(p => s";\n$p").getOrElse("")
167 | getSimpleMessage + planAnnotation
168 | }
169 |
170 | // Outputs an exception without the logical plan.
171 | // For testing only
172 | def getSimpleMessage: String = {
173 | val lineAnnotation = line.map(l => s" line $l").getOrElse("")
174 | val positionAnnotation = startPosition.map(p => s" pos $p").getOrElse("")
175 | s"$message;$lineAnnotation$positionAnnotation"
176 | }
177 | }
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/HiveAcidOperation.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid
21 |
22 | private[hiveacid] object HiveAcidOperation extends Enumeration {
23 | type OperationType = Value
24 | val READ, INSERT_INTO, INSERT_OVERWRITE, DELETE, UPDATE, MERGE = Value
25 | }
26 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/SparkAcidConf.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid
21 |
22 | import org.apache.spark.sql.SparkSession
23 |
24 | /**
25 | * Spark specific configuration container to be used by Hive Acid module
26 | */
27 | case class SparkAcidConfigEntry[T](configName: String /* Name of the config */ ,
28 | defaultValue: String /* Default value of config in String*/ ,
29 | description: String /* Description of the config*/ ,
30 | converter: Option[(String, String) => T] /* function to convert from String to Config's Type T*/)
31 |
32 |
33 | case class SparkAcidConfigBuilder[T](configName: String) {
34 | private var defaultValue: Option[String] = None
35 | def defaultValue(value: String): SparkAcidConfigBuilder[T] = {
36 | defaultValue = Some(value)
37 | this
38 | }
39 |
40 | private var description = ""
41 | def description(desc : String): SparkAcidConfigBuilder[T] = {
42 | description = desc
43 | this
44 | }
45 |
46 | private var converter: Option[(String, String) => T] = None
47 | def converter(func: (String, String) => T): SparkAcidConfigBuilder[T] = {
48 | converter = Some(func)
49 | this
50 | }
51 |
52 | def create(): SparkAcidConfigEntry[T] = {
53 | require(defaultValue.isDefined, "Default value for the Spark ACID config needs to be specified")
54 | new SparkAcidConfigEntry[T](configName, defaultValue.get, description, converter)
55 | }
56 | }
57 |
58 | case class SparkAcidConf(@transient sparkSession: SparkSession, @transient parameters: Map[String, String]) {
59 | @transient val configMap = sparkSession.sessionState.conf.getAllConfs
60 |
61 | val predicatePushdownEnabled = getConf(SparkAcidConf.PREDICATE_PUSHDOWN_CONF)
62 | val maxSleepBetweenLockRetries = getConf(SparkAcidConf.MAX_SLEEP_BETWEEN_LOCK_RETRIES)
63 | val lockNumRetries = getConf(SparkAcidConf.LOCK_NUM_RETRIES)
64 | val metastorePartitionPruningEnabled = sparkSession.sessionState.conf.metastorePartitionPruning
65 | val includeRowIds = parameters.getOrElse("includeRowIds", "false").toBoolean
66 | val parallelPartitionComputationThreshold = getConf(SparkAcidConf.PARALLEL_PARTITION_THRESHOLD)
67 |
68 | def getConf[T](configEntry: SparkAcidConfigEntry[T]): T = {
69 | val value = configMap.getOrElse(configEntry.configName, configEntry.defaultValue)
70 | configEntry.converter match {
71 | case Some(f) => f(value, configEntry.configName)
72 | case None => value.asInstanceOf[T]
73 | }
74 | }
75 | }
76 |
77 | object SparkAcidConf {
78 | val PREDICATE_PUSHDOWN_CONF = SparkAcidConfigBuilder[Boolean]("spark.sql.hiveAcid.enablePredicatePushdown")
79 | .defaultValue("true")
80 | .converter(toBoolean)
81 | .description("Configuration to enable Predicate PushDown for Hive Acid Reader")
82 | .create()
83 |
84 | val SPARK_READER = SparkAcidConfigBuilder[Boolean]("spark.sql.hiveAcid.enableSparkReader")
85 | .defaultValue("false")
86 | .converter(toBoolean)
87 | .description("Configuration to enable the Spark readers." +
88 | " When disabled, Hive Acid Readers in this DataSource are used." +
89 | " On enabling Spark readers will be used to read the Hive Table readers")
90 | .create()
91 |
92 | val MAX_SLEEP_BETWEEN_LOCK_RETRIES = SparkAcidConfigBuilder[Long]("spark.hiveAcid.lock.max.sleep.between.retries")
93 | .defaultValue("60000")
94 | .converter(toLong)
95 | .description("Maximum sleep time between lock retries in milliseconds; " +
96 | "Lock retries are based on exponential backoff" +
97 | " and start with 50 milliseconds and increases to the maximum time defined by this configuration")
98 | .create()
99 |
100 | // Retries use exponential backoff that starts at 50 milliseconds.
101 | // The default of 13 makes the total wait roughly 5 minutes, with the maximum sleep capped at 60 seconds.
102 | val LOCK_NUM_RETRIES = SparkAcidConfigBuilder[Int]("spark.hiveAcid.lock.max.retries")
103 | .defaultValue("13")
104 | .converter(toInt)
105 | .description("Maximum retries to acquire a lock; Lock retries are based on exponential backoff " +
106 | "that start with 50 milliseconds")
107 | .create()
108 |
109 | val PARALLEL_PARTITION_THRESHOLD = SparkAcidConfigBuilder[Long]("spark.hiveAcid.parallel.partitioning.threshold")
110 | .defaultValue("10")
111 | .converter(toInt)
112 | .description("Threshold for number of RDDs for a partitioned table," +
113 | " after which Spark Job will be spawn to compute RDD splits(i.e., partitions) in parallel" +
114 | " Note that every partition in a table becomes one RDD ")
115 | .create()
116 |
117 | def toBoolean(s: String, key: String): Boolean = {
118 | try {
119 | s.trim.toBoolean
120 | } catch {
121 | case _: IllegalArgumentException =>
122 | throw new IllegalArgumentException(s"$key should be boolean, but was $s")
123 | }
124 | }
125 |
126 | def toLong(s: String, key: String): Long = {
127 | try {
128 | s.trim.toLong
129 | } catch {
130 | case _: IllegalArgumentException =>
131 | throw new IllegalArgumentException(s"$key should be Long, but was $s")
132 | }
133 | }
134 |
135 | def toInt(s: String, key: String): Int = {
136 | try {
137 | s.trim.toInt
138 | } catch {
139 | case _: IllegalArgumentException =>
140 | throw new IllegalArgumentException(s"$key should be Int, but was $s")
141 | }
142 | }
143 | }
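
Editor's note: a minimal usage sketch for the configuration entries above, assuming a SparkSession named spark with Hive support. Session-level keys flow in through the session conf, per-read keys (such as includeRowIds) through the datasource parameters map.

import org.apache.spark.sql.SparkSession
import com.qubole.spark.hiveacid.SparkAcidConf

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

// Session-level overrides; the defaults declared above apply otherwise.
spark.conf.set("spark.sql.hiveAcid.enablePredicatePushdown", "false")
spark.conf.set("spark.hiveAcid.lock.max.retries", "20")

// Per-read options are passed as datasource parameters.
val acidConf = SparkAcidConf(spark, Map("includeRowIds" -> "true"))
acidConf.predicatePushdownEnabled // false
acidConf.lockNumRetries           // 20
acidConf.includeRowIds            // true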
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/datasource/HiveAcidDataSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.datasource
21 |
22 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable}
23 | import com.qubole.spark.hiveacid.streaming.HiveAcidSink
24 |
25 | import org.apache.spark.internal.Logging
26 | import org.apache.spark.sql._
27 | import org.apache.spark.sql.execution.streaming.Sink
28 | import org.apache.spark.sql.sources._
29 | import org.apache.spark.sql.streaming.OutputMode
30 |
31 | /**
32 | * HiveAcid Data source implementation.
33 | */
34 | class HiveAcidDataSource
35 | extends RelationProvider // USING HiveAcid
36 | with CreatableRelationProvider // Insert into/overwrite
37 | with DataSourceRegister // FORMAT("HiveAcid")
38 | with StreamSinkProvider
39 | with Logging {
40 |
41 | // Returns the relation for the passed-in table name.
42 | override def createRelation(sqlContext: SQLContext,
43 | parameters: Map[String, String]): BaseRelation = {
44 | HiveAcidRelation(sqlContext.sparkSession, getFullyQualifiedTableName(parameters), parameters)
45 | }
46 |
47 | // Returns the relation after writing the passed-in data frame. The table name is part of the parameters.
48 | override def createRelation(sqlContext: SQLContext,
49 | mode: SaveMode,
50 | parameters: Map[String, String],
51 | df: DataFrame): BaseRelation = {
52 |
53 | val hiveAcidTable: HiveAcidTable = HiveAcidTable.fromSparkSession(
54 | sqlContext.sparkSession,
55 | getFullyQualifiedTableName(parameters),
56 | parameters)
57 |
58 | mode match {
59 | case SaveMode.Overwrite =>
60 | hiveAcidTable.insertOverwrite(df)
61 | case SaveMode.Append =>
62 | hiveAcidTable.insertInto(df)
63 | // TODO: Add support for these
64 | case SaveMode.ErrorIfExists | SaveMode.Ignore =>
65 | throw HiveAcidErrors.unsupportedSaveMode(mode)
66 | }
67 | createRelation(sqlContext, parameters)
68 | }
69 |
70 | override def shortName(): String = {
71 | HiveAcidDataSource.NAME
72 | }
73 |
74 | override def createSink(sqlContext: SQLContext,
75 | parameters: Map[String, String],
76 | partitionColumns: Seq[String],
77 | outputMode: OutputMode): Sink = {
78 |
79 | tableSinkAssertions(partitionColumns, outputMode)
80 |
81 | new HiveAcidSink(sqlContext.sparkSession, parameters)
82 | }
83 |
84 | private def tableSinkAssertions(partitionColumns: Seq[String], outputMode: OutputMode): Unit = {
85 |
86 | if (partitionColumns.nonEmpty) {
87 | throw HiveAcidErrors.unsupportedFunction("partitionBy", "HiveAcidSink")
88 | }
89 | if (outputMode != OutputMode.Append) {
90 | throw HiveAcidErrors.unsupportedStreamingOutputMode(s"$outputMode")
91 | }
92 |
93 | }
94 |
95 | private def getFullyQualifiedTableName(parameters: Map[String, String]): String = {
96 | parameters.getOrElse("table", {
97 | throw HiveAcidErrors.tableNotSpecifiedException()
98 | })
99 | }
100 | }
101 |
102 | object HiveAcidDataSource {
103 | val NAME = "HiveAcid"
104 | }
105 |
106 |
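Editor's note: a batch read/write sketch through this data source, assuming a SparkSession named spark and an existing transactional table default.acid_tbl. Per createRelation above, SaveMode.Append maps to insertInto and SaveMode.Overwrite to insertOverwrite.

import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

// Read path: RelationProvider ("table" is mandatory).
val df = spark.read.format("HiveAcid").option("table", "default.acid_tbl").load()

// Write path: CreatableRelationProvider.
df.write.format("HiveAcid")
  .option("table", "default.acid_tbl")
  .mode(SaveMode.Append)   // or SaveMode.Overwrite
  .save()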
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/datasource/HiveAcidRelation.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.datasource
21 |
22 | import org.apache.spark.internal.Logging
23 | import org.apache.spark.rdd.RDD
24 | import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession}
25 | import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan}
26 | import org.apache.spark.sql.types._
27 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf}
28 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
29 | import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert}
30 | import org.apache.spark.sql.catalyst.AliasIdentifier
31 | import org.apache.spark.sql.catalyst.expressions.Expression
32 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
33 |
34 | import collection.JavaConversions._
35 |
36 | /**
37 | * Container for all metadata, configuration and schema to perform operations on
38 | * Hive ACID datasource. This class provides the plumbing; most of the heavy lifting is
39 | * performed inside HiveAcidTable.
40 | *
41 | * @param sparkSession Spark Session object
42 | * @param fullyQualifiedTableName Table name for the data source.
43 | * @param parameters user provided parameters required for reading and writing,
44 | * including configuration
45 | */
46 | case class HiveAcidRelation(sparkSession: SparkSession,
47 | fullyQualifiedTableName: String,
48 | parameters: Map[String, String])
49 | extends BaseRelation
50 | with InsertableRelation
51 | with PrunedFilteredScan
52 | with Logging {
53 |
54 | private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession(
55 | sparkSession,
56 | fullyQualifiedTableName
57 | )
58 | private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession,
59 | hiveAcidMetadata, parameters)
60 |
61 | private val readOptions = SparkAcidConf(sparkSession, parameters)
62 |
63 | override def sqlContext: SQLContext = sparkSession.sqlContext
64 |
65 | override val schema: StructType = if (readOptions.includeRowIds) {
66 | hiveAcidMetadata.tableSchemaWithRowId
67 | } else {
68 | hiveAcidMetadata.tableSchema
69 | }
70 |
71 | override def insert(data: DataFrame, overwrite: Boolean): Unit = {
72 | // sql insert into and overwrite
73 | if (overwrite) {
74 | hiveAcidTable.insertOverwrite(data)
75 | } else {
76 | hiveAcidTable.insertInto(data)
77 | }
78 | }
79 |
80 | def update(condition: Option[Column], newValues: Map[String, Column]): Unit = {
81 | hiveAcidTable.update(condition, newValues)
82 | }
83 |
84 | def delete(condition: Column): Unit = {
85 | hiveAcidTable.delete(condition)
86 | }
87 | override def sizeInBytes: Long = {
88 | val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
89 | (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong
90 | }
91 |
92 | def merge(sourceDf: DataFrame,
93 | mergeExpression: Expression,
94 | matchedClause: Seq[MergeWhenClause],
95 | notMatched: Option[MergeWhenNotInsert],
96 | sourceAlias: Option[AliasIdentifier],
97 | targetAlias: Option[AliasIdentifier]): Unit = {
98 | hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause,
99 | notMatched, sourceAlias, targetAlias)
100 | }
101 |
102 | def getHiveAcidTable(): HiveAcidTable = {
103 | hiveAcidTable
104 | }
105 |
106 | // FIXME: should this be true or false? The recommendation seems to
107 | // be to leave it as true.
108 | override val needConversion: Boolean = false
109 |
110 | override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
111 | val readOptions = SparkAcidConf(sparkSession, parameters)
112 | // sql "select *"
113 | hiveAcidTable.getRdd(requiredColumns, filters, readOptions)
114 | }
115 | }
116 |
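Editor's note: the update/delete entry points above delegate to HiveAcidTable. A hedged sketch of driving them programmatically, assuming HiveAcidTable.fromSparkSession accepts (session, tableName, parameters) as used elsewhere in this repo and that default.acid_tbl is an existing full ACID table.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import com.qubole.spark.hiveacid.HiveAcidTable

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
val table = HiveAcidTable.fromSparkSession(spark, "default.acid_tbl", Map.empty)

// DELETE ... WHERE key < 10
table.delete(col("key") < 10)

// UPDATE ... SET value = value + 1 WHERE key = 42
table.update(Some(col("key") === 42), Map("value" -> (col("value") + 1)))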
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/hive/.gitignore:
--------------------------------------------------------------------------------
1 | *.scalae
2 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/hive/HiveAcidMetadata.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.qubole.spark.hiveacid.hive
19 |
20 | import java.lang.reflect.InvocationTargetException
21 | import java.util.Locale
22 |
23 | import scala.collection.JavaConversions._
24 | import scala.collection.mutable
25 | import com.qubole.shaded.hadoop.hive.conf.HiveConf
26 | import com.qubole.shaded.hadoop.hive.ql.io.RecordIdentifier
27 | import com.qubole.shaded.hadoop.hive.ql.metadata
28 | import com.qubole.shaded.hadoop.hive.ql.metadata.Hive
29 | import com.qubole.shaded.hadoop.hive.ql.plan.TableDesc
30 | import com.qubole.spark.hiveacid.util.Util
31 | import com.qubole.spark.hiveacid.HiveAcidErrors
32 | import org.apache.hadoop.fs.Path
33 | import org.apache.hadoop.hive.metastore.api.MetaException
34 | import org.apache.hadoop.io.Writable
35 | import org.apache.hadoop.mapred.{InputFormat, OutputFormat}
36 | import org.apache.spark.internal.Logging
37 | import org.apache.spark.sql._
38 | import org.apache.spark.sql.types._
39 |
40 | /**
41 | * Represents metadata for hive acid table and exposes API to perform operations on top of it
42 | * @param sparkSession - spark session object
43 | * @param fullyQualifiedTableName - the fully qualified hive acid table name
44 | */
45 | class HiveAcidMetadata(sparkSession: SparkSession,
46 | fullyQualifiedTableName: String) extends Logging {
47 |
48 | // hive conf
49 | private val hiveConf: HiveConf = HiveConverter.getHiveConf(sparkSession.sparkContext)
50 |
51 | // a hive representation of the table
52 | val hTable: metadata.Table = {
53 | val hive: Hive = Hive.get(hiveConf)
54 | val table = sparkSession.sessionState.sqlParser.parseTableIdentifier(fullyQualifiedTableName)
55 | val hTable = hive.getTable(
56 | table.database match {
57 | case Some(database) => database
58 | case None => HiveAcidMetadata.DEFAULT_DATABASE
59 | }, table.identifier)
60 | Hive.closeCurrent()
61 | hTable
62 | }
63 |
64 | if (hTable.getParameters.get("transactional") != "true") {
65 | throw HiveAcidErrors.tableNotAcidException(hTable.getFullyQualifiedName)
66 | }
67 |
68 | val isFullAcidTable: Boolean = hTable.getParameters.containsKey("transactional_properties") &&
69 | !hTable.getParameters.get("transactional_properties").equals("insert_only")
70 | val isInsertOnlyTable: Boolean = !isFullAcidTable
71 | val isBucketed: Boolean = hTable.getBucketCols() != null && hTable.getBucketCols.size() > 0
72 |
73 | // Table properties
74 | val isPartitioned: Boolean = hTable.isPartitioned
75 | val rootPath: Path = hTable.getDataLocation
76 | val dbName: String = hTable.getDbName
77 | val tableName: String = hTable.getTableName
78 | val fullyQualifiedName: String = hTable.getFullyQualifiedName
79 |
80 | // Schema properties
81 | val dataSchema = StructType(hTable.getSd.getCols.toList.map(
82 | HiveConverter.getCatalystStructField).toArray)
83 |
84 | val partitionSchema = StructType(hTable.getPartitionKeys.toList.map(
85 | HiveConverter.getCatalystStructField).toArray)
86 |
87 | val tableSchema: StructType = {
88 | val overlappedPartCols = mutable.Map.empty[String, StructField]
89 | partitionSchema.foreach { partitionField =>
90 | if (dataSchema.exists(getColName(_) == getColName(partitionField))) {
91 | overlappedPartCols += getColName(partitionField) -> partitionField
92 | }
93 | }
94 | StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++
95 | partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f))))
96 | }
97 |
98 | val tableSchemaWithRowId: StructType = {
99 | StructType(
100 | Seq(
101 | StructField(HiveAcidMetadata.rowIdCol, HiveAcidMetadata.rowIdSchema)
102 | ) ++ tableSchema.fields)
103 | }
104 |
105 | lazy val tableDesc: TableDesc = {
106 | val inputFormatClass: Class[InputFormat[Writable, Writable]] =
107 | Util.classForName(hTable.getInputFormatClass.getName,
108 | loadShaded = true).asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]]
109 | val outputFormatClass: Class[OutputFormat[Writable, Writable]] =
110 | Util.classForName(hTable.getOutputFormatClass.getName,
111 | loadShaded = true).asInstanceOf[java.lang.Class[OutputFormat[Writable, Writable]]]
112 | new TableDesc(
113 | inputFormatClass,
114 | outputFormatClass,
115 | hTable.getMetadata)
116 | }
117 |
118 | /**
119 | * Returns list of partitions satisfying partition predicates
120 | * @param partitionFilters - filters to apply
121 | */
122 | def getRawPartitions(partitionFilters: Option[String] = None): Seq[metadata.Partition] = {
123 | val hive: Hive = Hive.get(hiveConf)
124 | val prunedPartitions = try {
125 | partitionFilters match {
126 | case Some(filter) => hive.getPartitionsByFilter(hTable, filter)
127 | case None => hive.getPartitions(hTable)
128 | }
129 | } finally {
130 | Hive.closeCurrent()
131 | }
132 | logDebug(s"partition count = ${prunedPartitions.size()}")
133 | prunedPartitions.toSeq
134 | }
135 |
136 | private def getColName(field: StructField): String = {
137 | HiveAcidMetadata.getColName(sparkSession, field)
138 | }
139 | }
140 |
141 | object HiveAcidMetadata {
142 | val DEFAULT_DATABASE = "default"
143 |
144 | val rowIdCol = "rowId"
145 | val rowIdSchema: StructType = {
146 | StructType(
147 | RecordIdentifier.Field.values().map {
148 | field =>
149 | StructField(
150 | name = field.name(),
151 | dataType = HiveConverter.getCatalystType(field.fieldType.getTypeName),
152 | nullable = true)
153 | }
154 | )
155 | }
156 |
157 | def fromSparkSession(sparkSession: SparkSession,
158 | fullyQualifiedTableName: String): HiveAcidMetadata = {
159 | new HiveAcidMetadata(
160 | sparkSession,
161 | fullyQualifiedTableName)
162 | }
163 |
164 | def getColName(sparkSession: SparkSession, field: StructField): String = {
165 | if (sparkSession.sessionState.conf.caseSensitiveAnalysis) {
166 | field.name
167 | } else {
168 | field.name.toLowerCase(Locale.ROOT)
169 | }
170 | }
171 |
172 | def getColNames(sparkSession: SparkSession, schema: StructType): Seq[String] = {
173 | schema.map(getColName(sparkSession, _))
174 | }
175 | }
176 |
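Editor's note: tableSchema above overlays partition columns onto the data schema, replacing any data column with the same (case-normalized) name and appending the remaining partition columns. A self-contained sketch of that merge with illustrative column names (case normalization is omitted for brevity):

import org.apache.spark.sql.types._
import scala.collection.mutable

val dataSchema      = StructType(Seq(StructField("id", LongType), StructField("dt", StringType)))
val partitionSchema = StructType(Seq(StructField("dt", StringType), StructField("region", StringType)))

val overlapped = mutable.Map.empty[String, StructField]
partitionSchema.foreach { pf =>
  if (dataSchema.exists(_.name == pf.name)) overlapped += pf.name -> pf
}
val tableSchema = StructType(
  dataSchema.map(f => overlapped.getOrElse(f.name, f)) ++
    partitionSchema.filterNot(f => overlapped.contains(f.name)))
// Resulting columns: id, dt (taken from partitionSchema), region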
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/hive/HiveConverter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.qubole.spark.hiveacid.hive
19 |
20 | import java.sql.{Date, Timestamp}
21 | import java.util.Locale
22 |
23 | import com.qubole.shaded.hadoop.hive.conf.HiveConf
24 | import com.qubole.shaded.hadoop.hive.metastore.api.FieldSchema
25 | import org.apache.commons.lang3.StringUtils
26 | import org.apache.spark.internal.Logging
27 | import org.apache.spark.{SparkContext, SparkException}
28 | import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
29 | import org.apache.spark.sql.sources._
30 | import org.apache.spark.sql.types._
31 |
32 | import scala.collection.JavaConversions._
33 |
34 | /**
35 | * Encapsulates everything (extensions, workarounds, quirks) to handle the
36 | * SQL dialect conversion between catalyst and hive.
37 | */
38 | private[hiveacid] object HiveConverter extends Logging {
39 |
40 | def getCatalystStructField(hc: FieldSchema): StructField = {
41 | val columnType = getCatalystType(hc.getType)
42 | val metadata = if (hc.getType != columnType.catalogString) {
43 | new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build()
44 | } else {
45 | Metadata.empty
46 | }
47 |
48 | val field = StructField(
49 | name = hc.getName,
50 | dataType = columnType,
51 | nullable = true,
52 | metadata = metadata)
53 | Option(hc.getComment).map(field.withComment).getOrElse(field)
54 | }
55 |
56 | def getCatalystType(dataType: String): DataType = {
57 | try {
58 | CatalystSqlParser.parseDataType(dataType)
59 | } catch {
60 | case e: ParseException =>
61 | throw new SparkException("Cannot recognize hive type string: " + dataType, e)
62 | }
63 | }
64 |
65 | def getHiveConf(sparkContext: SparkContext): HiveConf = {
66 | val hiveConf = new HiveConf()
67 | (sparkContext.hadoopConfiguration.iterator().map(kv => kv.getKey -> kv.getValue)
68 | ++ sparkContext.getConf.getAll.toMap).foreach { case (k, v) =>
69 | logDebug(
70 | s"""
71 | |Applying Hadoop/Hive/Spark and extra properties to Hive Conf:
72 | |$k=${if (k.toLowerCase(Locale.ROOT).contains("password")) "xxx" else v}
73 | """.stripMargin)
74 | hiveConf.set(k, v)
75 | }
76 | hiveConf
77 | }
78 |
79 | /**
80 | * Escape special characters in SQL string literals.
81 | *
82 | * @param value The string to be escaped.
83 | * @return Escaped string.
84 | */
85 | private def escapeSql(value: String): String = {
86 | // TODO: how to handle null
87 | StringUtils.replace(value, "'", "''")
88 | }
89 |
90 | /**
91 | * Converts value to SQL expression.
92 | * @param value The value to be converted.
93 | * @return Converted value.
94 | */
95 | private def compileValue(value: Any): Any = value match {
96 | case stringValue: String => s"'${escapeSql(stringValue)}'"
97 | case timestampValue: Timestamp => "'" + timestampValue + "'"
98 | case dateValue: Date => "'" + dateValue + "'"
99 | case arrayValue: Array[Any] => arrayValue.map(compileValue).mkString(", ")
100 | case _ => value
101 | }
102 |
103 | /**
104 | * Turns a single Filter into a String representing a SQL expression.
105 | * Returns None for an unhandled filter.
106 | */
107 | def compileFilter(f: Filter): Option[String] = Option(f match {
108 | case EqualTo(attr, value) => s"$attr = ${compileValue(value)}"
109 | case EqualNullSafe(attr, value) =>
110 | val col = attr
111 | s"(NOT ($col != ${compileValue(value)} OR $col = 'NULL' OR " +
112 | s"${compileValue(value)} = 'NULL') OR " +
113 | s"($col = 'NULL' AND ${compileValue(value)} = 'NULL'))"
114 | case LessThan(attr, value) => s"$attr < ${compileValue(value)}"
115 | case GreaterThan(attr, value) => s"$attr > ${compileValue(value)}"
116 | case LessThanOrEqual(attr, value) => s"$attr <= ${compileValue(value)}"
117 | case GreaterThanOrEqual(attr, value) => s"$attr >= ${compileValue(value)}"
118 | // These clauses throw in Hive MS when filtering the partitions
119 | //case IsNull(attr) => s"$attr = 'NULL'"
120 | //case IsNotNull(attr) => s"$attr != 'NULL'"
121 | case StringStartsWith(attr, value) => s"$attr LIKE '$value%'"
122 | case StringEndsWith(attr, value) => s"$attr LIKE '%$value'"
123 | case StringContains(attr, value) => s"$attr LIKE '%$value%'"
124 | case In(attr, value) => s"$attr IN (${compileValue(value)})"
125 | case Not(f1) => compileFilter(f1).map(p => s"(NOT ($p))").orNull
126 | case Or(f1, f2) =>
127 | // We can't compile Or filter unless both sub-filters are compiled successfully.
128 | // It applies too for the following And filter.
129 | // If we can make sure compileFilter supports all filters, we can remove this check.
130 | val or = Seq(f1, f2) flatMap compileFilter
131 | if (or.size == 2) {
132 | or.map(p => s"($p)").mkString(" OR ")
133 | } else null
134 | case And(f1, f2) =>
135 | val and = Seq(f1, f2).flatMap(compileFilter)
136 | if (and.size == 2) {
137 | and.map(p => s"($p)").mkString(" AND ")
138 | } else null
139 | case _ => null
140 | })
141 |
142 |
143 | def compileFilters(filters: Seq[Filter]): String = {
144 | val str = filters.flatMap(compileFilter).mkString(" and ")
145 | logDebug(str)
146 | str
147 | }
148 | }
149 |
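Editor's note: compileFilter/compileFilters above translate Spark Filter objects into a Hive partition-filter string. A sketch of the expected output; HiveConverter is private[hiveacid], so this assumes code living under the com.qubole.spark.hiveacid package, and FilterDemo is a hypothetical helper for illustration only.

package com.qubole.spark.hiveacid.hive

import org.apache.spark.sql.sources.{EqualTo, In}

object FilterDemo {
  def main(args: Array[String]): Unit = {
    val filters = Seq(EqualTo("dt", "2020-01-01"), In("region", Array[Any]("us", "eu")))
    // Prints: dt = '2020-01-01' and region IN ('us', 'eu')
    println(HiveConverter.compileFilters(filters))
  }
}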
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 | package com.qubole.spark
20 |
21 | import org.apache.spark.sql._
22 |
23 | package object hiveacid {
24 | implicit class HiveAcidDataFrameReader(reader: DataFrameReader) {
25 | def hiveacid(table: String, options: Map[String, String] = Map.empty): DataFrame = {
26 | reader.format("HiveAcid").option("table", table)
27 | .options(options).load()
28 | }
29 | }
30 |
31 | implicit class HiveAcidDataFrameWriter[T](writer: DataFrameWriter[T]) {
32 | def hiveacid(table: String, saveMode: String, options: Map[String, String] = Map.empty): Unit = {
33 | writer.format("HiveAcid").option("table", table)
34 | .options(options).mode(saveMode).save()
35 | }
36 | }
37 | }
38 |
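Editor's note: a usage sketch for the two implicits above, assuming a SparkSession named spark and an existing ACID table default.acid_tbl.

import org.apache.spark.sql.SparkSession
import com.qubole.spark.hiveacid._   // brings the implicit reader/writer classes into scope

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

val df = spark.read.hiveacid("default.acid_tbl", Map("includeRowIds" -> "false"))
df.write.hiveacid("default.acid_tbl", "append")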
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/rdd/EmptyRDD.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.rdd
21 |
22 | import scala.reflect.ClassTag
23 |
24 | import org.apache.spark.{Partition, SparkContext, TaskContext}
25 | import org.apache.spark.rdd.RDD
26 |
27 | private[hiveacid] class EmptyRDD[T: ClassTag](sc: SparkContext) extends RDD[T](sc, Nil) {
28 |
29 | override def getPartitions: Array[Partition] = Array.empty
30 |
31 | override def compute(split: Partition, context: TaskContext): Iterator[T] = {
32 | throw new UnsupportedOperationException("empty RDD")
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/rdd/HiveAcidUnionRDD.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.rdd
21 |
22 | import com.qubole.spark.hiveacid.SparkAcidConf
23 | import com.qubole.spark.hiveacid.reader.hive.HiveAcidPartitionComputer
24 |
25 | import scala.reflect.ClassTag
26 | import org.apache.spark._
27 | import org.apache.spark.rdd.{RDD, UnionRDD}
28 |
29 | /**
30 | * A HiveAcidRDD is created for each Hive partition of the table, but buildScan is supposed to
31 | * return a single RDD for the entire table, so we create a UnionRDD over them.
32 | *
33 | * This class extends UnionRDD and makes sure that the read lock is acquired once for all the
34 | * partitions of the table.
35 | *
36 | * @param sc - sparkContext
37 | * @param rddSeq - underlying partition RDDs
38 | * @param hiveSplitInfo - sequence of HiveSplitInfo,
39 | * derived from the list of HiveAcidRDDs passed here;
40 | * see HiveAcidRDD.getHiveSplitsInfo
41 | */
42 | private[hiveacid] class HiveAcidUnionRDD[T: ClassTag](
43 | sc: SparkContext,
44 | rddSeq: Seq[RDD[T]],
45 | //TODO: We should clean so that HiveSplitInfo need not have to be passed separately.
46 | hiveSplitInfo: Seq[HiveSplitInfo]) extends UnionRDD[T](sc, rddSeq) {
47 |
48 | private val ignoreMissingFiles =
49 | super.sparkContext.getConf.getBoolean("spark.files.ignoreMissingFiles", defaultValue = false)
50 |
51 | private val ignoreEmptySplits =
52 | super.sparkContext.getConf.getBoolean("spark.hadoopRDD.ignoreEmptySplits", defaultValue = false)
53 |
54 | private val parallelPartitionThreshold =
55 | super.sparkContext.getConf.getInt(SparkAcidConf.PARALLEL_PARTITION_THRESHOLD.configName, 10)
56 |
57 | override def getPartitions: Array[Partition] = {
58 | if (hiveSplitInfo.length > parallelPartitionThreshold) {
59 | val partitions = hiveSplitInfo.length/parallelPartitionThreshold
60 | val hiveSplitRDD = super.sparkContext.parallelize(hiveSplitInfo, partitions)
61 | val hiveAcidPartitionComputer = new HiveAcidPartitionComputer(ignoreEmptySplits, ignoreMissingFiles)
62 | // Spawns a Spark job to compute the partitions for every RDD and stores them in a cache.
63 | hiveAcidPartitionComputer.computeHiveSplitsAndCache(hiveSplitRDD)
64 | }
65 | super.getPartitions
66 | }
67 | }
68 |
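Editor's note: with the default threshold of 10, a table with, say, 250 partitions gets its splits computed by a Spark job of 250 / 10 = 25 tasks instead of serially on the driver. A hedged sketch of tuning the knob; because it is read via sparkContext.getConf, it must be set on the SparkConf when the session is built (or via --conf on spark-submit), not via spark.conf.set afterwards.

import org.apache.spark.sql.SparkSession

// Only trigger the parallel split computation for tables with more than 50 partitions.
val spark = SparkSession.builder()
  .config("spark.hiveAcid.parallel.partitioning.threshold", "50")
  .enableHiveSupport()
  .getOrCreate()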
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/reader/.gitignore:
--------------------------------------------------------------------------------
1 | *.scalae
2 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/reader/Reader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.reader
21 |
22 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
23 |
24 | import org.apache.spark.rdd.RDD
25 | import org.apache.spark.sql.catalyst.InternalRow
26 |
27 | private[reader] trait Reader {
28 | def makeRDDForTable(hiveAcidMetadata: HiveAcidMetadata): RDD[InternalRow]
29 | def makeRDDForPartitionedTable(hiveAcidMetadata: HiveAcidMetadata,
30 | partitions: Seq[ReaderPartition]): RDD[InternalRow]
31 | }
32 |
33 | private[reader] case class ReaderPartition(ptn: Any)
34 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/reader/ReaderOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.reader
21 |
22 | import com.qubole.spark.hiveacid.SparkAcidConf
23 | import org.apache.hadoop.conf.Configuration
24 |
25 | import org.apache.spark.sql.catalyst.expressions.Attribute
26 | import org.apache.spark.sql.sources.Filter
27 |
28 | /**
29 | * Reader options which will be serialized and sent to each executor
30 | */
31 | private[hiveacid] class ReaderOptions(val hadoopConf: Configuration,
32 | val partitionAttributes: Seq[Attribute],
33 | val requiredAttributes: Seq[Attribute],
34 | val dataFilters: Array[Filter],
35 | val requiredNonPartitionedColumns: Array[String],
36 | val sessionLocalTimeZone: String,
37 | val readConf: SparkAcidConf) extends Serializable
38 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/reader/TableReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.reader
21 |
22 | import com.qubole.spark.hiveacid.{HiveAcidOperation, SparkAcidConf}
23 | import com.qubole.spark.hiveacid.transaction._
24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
25 | import com.qubole.spark.hiveacid.reader.hive.{HiveAcidReader, HiveAcidReaderOptions}
26 |
27 | import org.apache.spark.internal.Logging
28 | import org.apache.spark.rdd.RDD
29 | import org.apache.spark.sql.{Row, SparkSession}
30 | import org.apache.spark.sql.catalyst.expressions._
31 | import org.apache.spark.sql.sources.Filter
32 |
33 | /**
34 | * Table reader object
35 | *
36 | * @param sparkSession - Spark session
37 | * @param curTxn - Transaction object to acquire locks.
38 | * @param hiveAcidMetadata - Hive acid table for which read is to be performed.
39 | */
40 | private[hiveacid] class TableReader(sparkSession: SparkSession,
41 | curTxn: HiveAcidTxn,
42 | hiveAcidMetadata: HiveAcidMetadata) extends Logging {
43 |
44 | def getRdd(requiredColumns: Array[String],
45 | filters: Array[Filter],
46 | readConf: SparkAcidConf): RDD[Row] = {
47 | val rowIdColumnSet = HiveAcidMetadata.rowIdSchema.fields.map(_.name).toSet
48 | val requiredColumnsWithoutRowId = requiredColumns.filterNot(rowIdColumnSet.contains)
49 | val partitionColumnNames = hiveAcidMetadata.partitionSchema.fields.map(_.name)
50 | val partitionedColumnSet = partitionColumnNames.toSet
51 |
52 | // Attributes
53 | val requiredNonPartitionedColumns = requiredColumnsWithoutRowId.filter(
54 | x => !partitionedColumnSet.contains(x))
55 |
56 | val requiredAttributes = if (!readConf.includeRowIds) {
57 | requiredColumnsWithoutRowId.map {
58 | x =>
59 | val field = hiveAcidMetadata.tableSchema.fields.find(_.name == x).get
60 | PrettyAttribute(field.name, field.dataType)
61 | }
62 | } else {
63 | requiredColumns.map {
64 | x =>
65 | val field = hiveAcidMetadata.tableSchemaWithRowId.fields.find(_.name == x).get
66 | PrettyAttribute(field.name, field.dataType)
67 | }
68 | }
69 | val partitionAttributes = hiveAcidMetadata.partitionSchema.fields.map { x =>
70 | PrettyAttribute(x.name, x.dataType)
71 | }
72 |
73 | // Filters
74 | val (partitionFilters, otherFilters) = filters.partition { predicate =>
75 | !predicate.references.isEmpty &&
76 | predicate.references.toSet.subsetOf(partitionedColumnSet)
77 | }
78 | val dataFilters = otherFilters.filter(_
79 | .references.intersect(partitionColumnNames).isEmpty
80 | )
81 |
82 | logDebug(s"total filters : ${filters.length}: " +
83 | s"dataFilters: ${dataFilters.length} " +
84 | s"partitionFilters: ${partitionFilters.length}")
85 |
86 | val hadoopConf = sparkSession.sessionState.newHadoopConf()
87 |
88 | logDebug(s"sarg.pushdown: ${hadoopConf.get("sarg.pushdown")}," +
89 | s"hive.io.file.readcolumn.names: ${hadoopConf.get("hive.io.file.readcolumn.names")}, " +
90 | s"hive.io.file.readcolumn.ids: ${hadoopConf.get("hive.io.file.readcolumn.ids")}")
91 |
92 | val readerOptions = new ReaderOptions(hadoopConf,
93 | partitionAttributes,
94 | requiredAttributes,
95 | dataFilters,
96 | requiredNonPartitionedColumns,
97 | sparkSession.sessionState.conf.sessionLocalTimeZone,
98 | readConf)
99 |
100 | val hiveAcidReaderOptions = HiveAcidReaderOptions.get(hiveAcidMetadata)
101 |
102 | val (partitions, partitionList) = HiveAcidReader.getPartitions(hiveAcidMetadata,
103 | readerOptions,
104 | partitionFilters)
105 |
106 | // Acquire locks on all the partitions and then create a snapshot. Every time getRdd is called
107 | // it creates a new snapshot.
108 | // NB: partitionList is Seq if partition pruning is not enabled
109 | curTxn.acquireLocks(hiveAcidMetadata, HiveAcidOperation.READ, partitionList, readConf)
110 |
111 | // Create Snapshot !!!
112 | //val curSnapshot = HiveAcidTxn.createSnapshot(curTxn, hiveAcidMetadata)
113 |
114 | val validWriteIds = HiveAcidTxn.getValidWriteIds(curTxn, hiveAcidMetadata)
115 |
116 | val reader = new HiveAcidReader(
117 | sparkSession,
118 | readerOptions,
119 | hiveAcidReaderOptions,
120 | validWriteIds)
121 |
122 | val rdd = if (hiveAcidMetadata.isPartitioned) {
123 | reader.makeRDDForPartitionedTable(hiveAcidMetadata, partitions)
124 | } else {
125 | reader.makeRDDForTable(hiveAcidMetadata)
126 | }
127 |
128 | rdd.asInstanceOf[RDD[Row]]
129 | }
130 | }
131 |
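Editor's note: a self-contained sketch of the filter split above. Filters that reference only partition columns become partitionFilters (pushed to the metastore), while filters that do not touch any partition column become dataFilters. Column names here are illustrative.

import org.apache.spark.sql.sources._

val partitionCols = Set("dt")
val filters: Array[Filter] = Array(EqualTo("dt", "2020-01-01"), GreaterThan("value", 5))

val (partitionFilters, otherFilters) = filters.partition { p =>
  p.references.nonEmpty && p.references.toSet.subsetOf(partitionCols)
}
val dataFilters = otherFilters.filter(_.references.intersect(partitionCols.toSeq).isEmpty)
// partitionFilters: Array(EqualTo(dt,2020-01-01)); dataFilters: Array(GreaterThan(value,5))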
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidPartitionComputer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.reader.hive
21 |
22 | import java.util.concurrent.{ConcurrentHashMap, TimeUnit}
23 |
24 | import com.qubole.shaded.hadoop.hive.common.{ValidReaderWriteIdList, ValidWriteIdList}
25 | import com.qubole.spark.hiveacid.rdd.{HiveAcidPartition, HiveAcidRDD, HiveSplitInfo}
26 | import com.qubole.spark.hiveacid.reader.hive.HiveAcidPartitionComputer.{addToPartitionCache, getInputFormat}
27 | import com.qubole.spark.hiveacid.util.Util
28 | import org.apache.hadoop.conf.Configurable
29 | import org.apache.hadoop.fs.Path
30 | import org.apache.hadoop.io.Writable
31 | import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, InvalidInputException, JobConf}
32 | import org.apache.hadoop.util.ReflectionUtils
33 | import org.apache.spark.deploy.SparkHadoopUtil
34 | import org.apache.spark.internal.Logging
35 | import org.apache.spark.rdd.RDD
36 |
37 | private[hiveacid] case class HiveAcidPartitionComputer(ignoreEmptySplits: Boolean,
38 | ignoreMissingFiles: Boolean) extends Logging {
39 | def getPartitions[K, V](id: Int, jobConf: JobConf,
40 | inputFormat: InputFormat[K, V],
41 | minPartitions: Int): Array[HiveAcidPartition] = {
42 | // add the credentials here as this can be called before SparkContext initialized
43 | SparkHadoopUtil.get.addCredentials(jobConf)
44 | try {
45 | val allInputSplits = inputFormat.getSplits(jobConf, minPartitions)
46 | val inputSplits = if (ignoreEmptySplits) {
47 | allInputSplits.filter(_.getLength > 0)
48 | } else {
49 | allInputSplits
50 | }
51 | val array = new Array[HiveAcidPartition](inputSplits.length)
52 | for (i <- inputSplits.indices) {
53 | array(i) = new HiveAcidPartition(id, i, inputSplits(i))
54 | }
55 | array
56 | } catch {
57 | case e: InvalidInputException if ignoreMissingFiles =>
58 | val inputDir = jobConf.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR)
59 | logWarning(s"$inputDir doesn't exist and no" +
60 | s" partitions returned from this path.", e)
61 | Array.empty[HiveAcidPartition]
62 | }
63 | }
64 |
65 | // Needs to be invoked just once as it is an expensive operation.
66 | def computeHiveSplitsAndCache(splitRDD: RDD[HiveSplitInfo]): Unit = {
67 | val start = System.nanoTime()
68 | logInfo("Spawning job to compute partitions for ACID table RDD")
69 | val splits = splitRDD.map {
70 | case HiveSplitInfo(id, broadcastedConf,
71 | validWriteIdList, minPartitions, ifcName, isFullAcidTable, shouldCloneJobConf, initLocalJobConfFuncOpt) =>
72 | val jobConf = HiveAcidRDD.setInputPathToJobConf(
73 | Some(HiveAcidRDD.getJobConf(broadcastedConf, shouldCloneJobConf, initLocalJobConfFuncOpt)),
74 | isFullAcidTable,
75 | new ValidReaderWriteIdList(validWriteIdList),
76 | broadcastedConf,
77 | shouldCloneJobConf,
78 | initLocalJobConfFuncOpt)
79 | val partitions = this.getPartitions[Writable, Writable](id, jobConf, getInputFormat(jobConf, ifcName), minPartitions)
80 | (partitions, FileInputFormat.getInputPaths(jobConf), validWriteIdList)
81 | }.collect()
82 |
83 | splits.foreach {
84 | case (partitions: Array[HiveAcidPartition],
85 | paths: Array[Path], validWriteIdList: String) =>
86 | addToPartitionCache(paths, validWriteIdList, partitions)
87 | }
88 | logInfo(s"Job to compute partitions took: " +
89 | s"${TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - start)} seconds")
90 | }
91 | }
92 |
93 | private[hiveacid] object HiveAcidPartitionComputer extends Logging {
94 | object Cache {
95 | val partitionCache = new ConcurrentHashMap[SplitCacheKey, Array[HiveAcidPartition]]()
96 | case class SplitCacheKey(paths: Set[Path], validWriteIdList: String)
97 | }
98 |
99 | def getFromSplitsCache(paths: Array[Path], validWriteIdList: ValidWriteIdList): Option[Array[HiveAcidPartition]] = {
100 | Option(Cache.partitionCache.get(Cache.SplitCacheKey(paths.toSet, validWriteIdList.writeToString())))
101 | }
102 |
103 | def removeFromSplitsCache(paths: Array[Path], validWriteIdList: ValidWriteIdList): Unit = {
104 | Cache.partitionCache.remove(Cache.SplitCacheKey(paths.toSet, validWriteIdList.writeToString()))
105 | }
106 |
107 | def addToPartitionCache(paths: Array[Path], validWriteIdList: String, inputSplits: Array[HiveAcidPartition]): Unit = {
108 | Cache.partitionCache.put(Cache.SplitCacheKey(paths.toSet, validWriteIdList), inputSplits)
109 | }
110 |
111 | private def getInputFormat(conf: JobConf, inputFormatClassName: String): InputFormat[Writable, Writable] = {
112 | val inputFormatClass = Util.classForName(inputFormatClassName, loadShaded = true)
113 | .asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]]
114 | val newInputFormat = ReflectionUtils.newInstance(inputFormatClass.asInstanceOf[Class[_]], conf)
115 | .asInstanceOf[InputFormat[Writable, Writable]]
116 | newInputFormat match {
117 | case c: Configurable => c.setConf(conf)
118 | case _ =>
119 | }
120 | newInputFormat
121 | }
122 |
123 | }
124 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidReaderOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.reader.hive
21 |
22 | import com.qubole.shaded.hadoop.hive.ql.plan.TableDesc
23 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
24 | import org.apache.spark.sql.types.StructType
25 |
26 | private[reader] class HiveAcidReaderOptions(val tableDesc: TableDesc,
27 | val isFullAcidTable: Boolean,
28 | val dataSchema: StructType)
29 |
30 | private[reader] object HiveAcidReaderOptions {
31 | def get(hiveAcidMetadata: HiveAcidMetadata): HiveAcidReaderOptions = {
32 | new HiveAcidReaderOptions(hiveAcidMetadata.tableDesc, hiveAcidMetadata.isFullAcidTable,
33 | hiveAcidMetadata.dataSchema)
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSink.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 |
21 | package com.qubole.spark.hiveacid.streaming
22 |
23 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable}
24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
25 | import org.apache.hadoop.fs.Path
26 | import org.apache.spark.internal.Logging
27 | import org.apache.spark.sql.{DataFrame, SparkSession}
28 | import org.apache.spark.sql.execution.streaming.Sink
29 |
30 |
31 | class HiveAcidSink(sparkSession: SparkSession,
32 | parameters: Map[String, String]) extends Sink with Logging {
33 |
34 | import HiveAcidSink._
35 |
36 | private val acidSinkOptions = new HiveAcidSinkOptions(parameters)
37 |
38 | private val fullyQualifiedTableName = acidSinkOptions.tableName
39 |
40 | private val hiveAcidTable: HiveAcidTable = HiveAcidTable.fromSparkSession(
41 | sparkSession,
42 | fullyQualifiedTableName,
43 | parameters)
44 |
45 | assertNonBucketedTable()
46 |
47 | private val logPath = getMetaDataPath()
48 | private val fileLog = new HiveAcidSinkLog(
49 | HiveAcidSinkLog.VERSION, sparkSession, logPath.toUri.toString, acidSinkOptions)
50 |
51 | private def assertNonBucketedTable(): Unit = {
52 | if(hiveAcidTable.isBucketed) {
53 | throw HiveAcidErrors.unsupportedOperationTypeBucketedTable("Streaming Write", fullyQualifiedTableName)
54 | }
55 | }
56 |
57 | private def getMetaDataPath(): Path = {
58 | acidSinkOptions.metadataDir match {
59 | case Some(dir) =>
60 | new Path(dir)
61 | case None =>
62 | logInfo(s"Metadata dir not specified. Using " +
63 | s"$metadataDirPrefix/_query_default as metadata dir")
64 | logWarning(s"Please make sure that multiple streaming writes to " +
65 | s"$fullyQualifiedTableName are not running")
66 | val tableLocation = HiveAcidMetadata.fromSparkSession(
67 | sparkSession, fullyQualifiedTableName).rootPath
68 | new Path(tableLocation, s"$metadataDirPrefix/_query_default")
69 | }
70 | }
71 |
72 | /**
73 | * Adds the batch to the sink. Each batch is transactional in itself.
74 | * @param batchId id of the batch to add
75 | * @param df dataframe to add as part of the batch
76 | */
77 | override def addBatch(batchId: Long, df: DataFrame): Unit = {
78 |
79 | if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) {
80 | logInfo(s"Skipping already committed batch $batchId")
81 | } else {
82 |
83 | val commitProtocol = new HiveAcidStreamingCommitProtocol(fileLog)
84 | val txnId = hiveAcidTable.addBatch(df)
85 | commitProtocol.commitJob(batchId, txnId)
86 | }
87 |
88 | }
89 |
90 | override def toString: String = s"HiveAcidSinkV1[$fullyQualifiedTableName]"
91 |
92 | }
93 |
94 | object HiveAcidSink {
95 |
96 | val metadataDirPrefix = "_acid_streaming"
97 | }
98 |
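Editor's note: a streaming write sketch for the sink above, assuming a SparkSession named spark, a streaming DataFrame df, and an existing non-bucketed ACID table default.acid_tbl. Per tableSinkAssertions in HiveAcidDataSource, only Append output mode is accepted and partitionBy must not be used; the checkpoint path and trigger interval are illustrative.

import org.apache.spark.sql.streaming.{OutputMode, Trigger}

val query = df.writeStream
  .format("HiveAcid")
  .option("table", "default.acid_tbl")
  .outputMode(OutputMode.Append())
  .option("checkpointLocation", "/tmp/hiveacid-checkpoint")
  .trigger(Trigger.ProcessingTime("1 minute"))
  .start()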
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkLog.scala:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * Copyright 2019 Qubole, Inc. All rights reserved.
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | */
20 |
21 | package com.qubole.spark.hiveacid.streaming
22 |
23 | import org.apache.spark.sql.SparkSession
24 | import org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog
25 |
26 | case class HiveAcidSinkStatus(txnId: Long, action: String)
27 |
28 | class HiveAcidSinkLog(version: Int,
29 | sparkSession: SparkSession,
30 | path: String,
31 | options: HiveAcidSinkOptions)
32 | extends CompactibleFileStreamLog[HiveAcidSinkStatus](version, sparkSession, path) {
33 |
34 | protected override val fileCleanupDelayMs = options.fileCleanupDelayMs
35 |
36 | protected override val isDeletingExpiredLog = options.isDeletingExpiredLog
37 |
38 | protected override val defaultCompactInterval = options.compactInterval
39 |
40 | protected override val minBatchesToRetain = options.minBatchesToRetain
41 |
42 | override def compactLogs(logs: Seq[HiveAcidSinkStatus]): Seq[HiveAcidSinkStatus] = {
43 | val deletedFiles = logs.filter(_.action == HiveAcidSinkLog.DELETE_ACTION).map(_.txnId).toSet
44 | if (deletedFiles.isEmpty) {
45 | logs
46 | } else {
47 | logs.filter(f => !deletedFiles.contains(f.txnId))
48 | }
49 | }
50 |
51 | }
52 |
53 | object HiveAcidSinkLog {
54 |
55 | val VERSION = 1
56 | val DELETE_ACTION = "delete"
57 | val ADD_ACTION = "add"
58 |
59 | }
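A small sketch of what the compactLogs rule above does to a batch of log entries; the txn ids are made up.

val entries = Seq(
  HiveAcidSinkStatus(100L, HiveAcidSinkLog.ADD_ACTION),
  HiveAcidSinkStatus(101L, HiveAcidSinkLog.ADD_ACTION),
  HiveAcidSinkStatus(100L, HiveAcidSinkLog.DELETE_ACTION)
)

// Same filtering as compactLogs: every entry whose txnId also has a DELETE action is dropped,
// including the DELETE entry itself.
val deletedTxnIds = entries.filter(_.action == HiveAcidSinkLog.DELETE_ACTION).map(_.txnId).toSet
val compacted = entries.filterNot(e => deletedTxnIds.contains(e.txnId))
// compacted == Seq(HiveAcidSinkStatus(101L, "add"))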
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.streaming
21 |
22 | import java.util.concurrent.TimeUnit
23 |
24 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
25 |
26 | import scala.util.Try
27 |
28 | class HiveAcidSinkOptions(parameters: CaseInsensitiveMap[String]) {
29 |
30 | import HiveAcidSinkOptions._
31 |
32 | def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters))
33 |
34 |   val tableName = parameters.get("table").getOrElse {
35 | throw new IllegalArgumentException("Table Name is not specified")
36 | }
37 |
38 | val fileCleanupDelayMs = withLongParameter(CLEANUP_DELAY_KEY, DEFAULT_CLEANUP_DELAY)
39 |
40 | val isDeletingExpiredLog = withBooleanParameter(LOG_DELETION_KEY, DEFAULT_LOG_DELETION)
41 |
42 | val compactInterval = withIntParameter(COMPACT_INTERVAL_KEY, DEFAULT_COMPACT_INTERVAL)
43 |
44 | val minBatchesToRetain = withIntParameter(MIN_BATCHES_TO_RETAIN_KEY, DEFAULT_MIN_BATCHES_TO_RETAIN)
45 |
46 | val metadataDir = parameters.get(METADATA_DIR_KEY)
47 |
48 | private def withIntParameter(name: String, default: Int): Int = {
49 | parameters.get(name).map { str =>
50 | Try(str.toInt).toOption.filter(_ > 0).getOrElse {
51 | throw new IllegalArgumentException(
52 | s"Invalid value '$str' for option '$name', must be a positive integer")
53 | }
54 | }.getOrElse(default)
55 | }
56 |
57 | private def withLongParameter(name: String, default: Long): Long = {
58 | parameters.get(name).map { str =>
59 | Try(str.toLong).toOption.filter(_ >= 0).getOrElse {
60 | throw new IllegalArgumentException(
61 |           s"Invalid value '$str' for option '$name', must be a non-negative integer")
62 | }
63 | }.getOrElse(default)
64 | }
65 |
66 | private def withBooleanParameter(name: String, default: Boolean): Boolean = {
67 | parameters.get(name).map { str =>
68 | try {
69 | str.toBoolean
70 | } catch {
71 | case _: IllegalArgumentException =>
72 | throw new IllegalArgumentException(
73 | s"Invalid value '$str' for option '$name', must be true or false")
74 | }
75 | }.getOrElse(default)
76 | }
77 |
78 | }
79 |
80 | object HiveAcidSinkOptions {
81 |
82 | val DEFAULT_CLEANUP_DELAY = TimeUnit.MINUTES.toMillis(10)
83 | val DEFAULT_LOG_DELETION = true
84 | val DEFAULT_COMPACT_INTERVAL = 10
85 | val DEFAULT_MIN_BATCHES_TO_RETAIN = 100
86 |
87 | val CLEANUP_DELAY_KEY = "spark.acid.streaming.log.cleanupDelayMs"
88 | val LOG_DELETION_KEY = "spark.acid.streaming.log.deletion"
89 | val COMPACT_INTERVAL_KEY = "spark.acid.streaming.log.compactInterval"
90 | val MIN_BATCHES_TO_RETAIN_KEY = "spark.acid.streaming.log.minBatchesToRetain"
91 | val METADATA_DIR_KEY = "spark.acid.streaming.log.metadataDir"
92 |
93 | }
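A sketch of constructing the options above directly with every supported key; the table name, values and metadata path are illustrative only, and any key left out falls back to the DEFAULT_* values in the companion object.

import com.qubole.spark.hiveacid.streaming.HiveAcidSinkOptions

val sinkOptions = new HiveAcidSinkOptions(Map(
  "table" -> "default.acid_events",
  HiveAcidSinkOptions.CLEANUP_DELAY_KEY -> "600000",        // 10 minutes, in milliseconds
  HiveAcidSinkOptions.LOG_DELETION_KEY -> "true",
  HiveAcidSinkOptions.COMPACT_INTERVAL_KEY -> "20",
  HiveAcidSinkOptions.MIN_BATCHES_TO_RETAIN_KEY -> "50",
  HiveAcidSinkOptions.METADATA_DIR_KEY -> "/warehouse/acid_events/_acid_streaming/_query_1"
))

assert(sinkOptions.tableName == "default.acid_events")
assert(sinkOptions.compactInterval == 20)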
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidStreamingCommitProtocol.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.qubole.spark.hiveacid.streaming
19 |
20 | import org.apache.spark.internal.Logging
21 |
22 | class HiveAcidStreamingCommitProtocol(fileLog: HiveAcidSinkLog) extends Serializable with Logging {
23 |
24 | import HiveAcidStreamingCommitProtocol._
25 |
26 | def commitJob(batchId: Long, txnId: Long): Unit = {
27 |
28 | def commitJobRetry(retryRemaining: Int, f: () => Unit): Boolean = {
29 | var retry = false
30 | try {
31 | f()
32 | }
33 | catch {
34 | case ie: IllegalStateException if ie.getMessage.contains("Race while writing batch") =>
35 | throw ie
36 | case e: Exception =>
37 | if (retryRemaining > 0) {
38 | logError(s"Unexpected error while writing commit file for batch $batchId ... " +
39 | s"Retrying", e)
40 | retry = true
41 | } else {
42 | logError(s"Unexpected error while writing commit file for batch $batchId ... " +
43 | s"Max retries reached", e)
44 | throw e
45 | }
46 | }
47 | retry
48 | }
49 |
50 | val array = Array(HiveAcidSinkStatus(txnId, HiveAcidSinkLog.ADD_ACTION))
51 |
52 | val commitJobAttempt = () => {
53 | if (fileLog.add(batchId, array)) {
54 | logInfo(s"Committed batch $batchId")
55 | } else {
56 | throw new IllegalStateException(s"Race while writing batch $batchId")
57 | }
58 | }
59 |
60 | var sleepSec = 1
61 | var retryRemaining = MAX_COMMIT_JOB_RETRIES - 1
62 | while (commitJobRetry(retryRemaining, commitJobAttempt)) {
63 | retryRemaining = retryRemaining - 1
64 | Thread.sleep(sleepSec * 1000)
65 | sleepSec = sleepSec * EXPONENTIAL_BACK_OFF_FACTOR
66 | }
67 |
68 | }
69 |
70 | }
71 |
72 | object HiveAcidStreamingCommitProtocol {
73 |
74 | val MAX_COMMIT_JOB_RETRIES = 3
75 | val EXPONENTIAL_BACK_OFF_FACTOR = 2
76 |
77 | }
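A sketch of how a caller (a hypothetical helper, mirroring the flow in HiveAcidSink.addBatch) drives the commit protocol above.

def commitBatch(fileLog: HiveAcidSinkLog, batchId: Long, txnId: Long): Unit = {
  val protocol = new HiveAcidStreamingCommitProtocol(fileLog)
  // commitJob makes up to MAX_COMMIT_JOB_RETRIES attempts, sleeping 1s, then 2s, ...
  // (EXPONENTIAL_BACK_OFF_FACTOR) between attempts, and fails fast on the
  // "Race while writing batch" conflict raised when another writer committed the batch first.
  protocol.commitJob(batchId, txnId)
}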
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/transaction/HiveAcidTxn.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.qubole.spark.hiveacid.transaction
19 |
20 | import java.util.concurrent.atomic.AtomicBoolean
21 |
22 | import com.qubole.shaded.hadoop.hive.common.{ValidTxnList, ValidWriteIdList}
23 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidOperation, SparkAcidConf}
24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
25 | import org.apache.spark.internal.Logging
26 | import org.apache.spark.sql.SparkSession
27 |
28 | /**
29 | * Hive Acid Transaction object.
30 |  * @param sparkSession Spark session
31 | */
32 | class HiveAcidTxn(sparkSession: SparkSession) extends Logging {
33 |
34 | HiveAcidTxn.setUpTxnManager(sparkSession)
35 |
36 | // txn ID
37 | protected var id: Long = -1
38 | protected var validTxnList: ValidTxnList = _
39 | private [hiveacid] val isClosed: AtomicBoolean = new AtomicBoolean(true)
40 |
41 | private def setTxn(id: Long, txns:ValidTxnList): Unit = {
42 | this.id = id
43 | this.validTxnList = txns
44 | isClosed.set(false)
45 | }
46 |
47 | private def unsetTxn(): Unit = {
48 | this.id = -1
49 | this.validTxnList = null
50 | isClosed.set(true)
51 | }
52 |
53 | override def toString: String = s"""{"id":"$id","validTxns":"$validTxnList"}"""
54 |
55 | /**
56 |    * Public API to begin a transaction.
57 | */
58 | def begin(): Unit = synchronized {
59 | if (!isClosed.get) {
60 | throw HiveAcidErrors.txnAlreadyOpen(id)
61 | }
62 | val newId = HiveAcidTxn.txnManager.beginTxn(this)
63 | val txnList = HiveAcidTxn.txnManager.getValidTxns(Some(newId))
64 | setTxn(newId, txnList)
65 | // Set it for thread for all future references.
66 | HiveAcidTxn.threadLocal.set(this)
67 | logDebug(s"Begin transaction $this")
68 | }
69 |
70 | /**
71 |    * Public API to end a transaction.
72 |    * @param abort true to abort the transaction instead of committing it
73 | */
74 | def end(abort: Boolean = false): Unit = synchronized {
75 | if (isClosed.get) {
76 | throw HiveAcidErrors.txnAlreadyClosed(id)
77 | }
78 |
79 | logDebug(s"End transaction $this abort = $abort")
80 |     // NB: Proactively unset the thread-local reference, regardless of
81 |     // whether the underlying call fails or succeeds.
82 | HiveAcidTxn.threadLocal.set(null)
83 | HiveAcidTxn.txnManager.endTxn(id, abort)
84 | unsetTxn()
85 | }
86 |
87 | private[hiveacid] def acquireLocks(hiveAcidMetadata: HiveAcidMetadata,
88 | operationType: HiveAcidOperation.OperationType,
89 | partitionNames: Seq[String],
90 | conf: SparkAcidConf): Unit = {
91 | if (isClosed.get()) {
92 | logError(s"Transaction already closed $this")
93 | throw HiveAcidErrors.txnAlreadyClosed(id)
94 | }
95 | HiveAcidTxn.txnManager.acquireLocks(id, hiveAcidMetadata.dbName,
96 | hiveAcidMetadata.tableName, operationType, partitionNames, hiveAcidMetadata.isPartitioned, conf)
97 | }
98 |
99 | private[hiveacid] def addDynamicPartitions(writeId: Long,
100 | dbName: String,
101 | tableName: String,
102 | operationType: HiveAcidOperation.OperationType,
103 | partitions: Set[String]) = {
104 | if (isClosed.get()) {
105 | logError(s"Transaction already closed $this")
106 | throw HiveAcidErrors.txnAlreadyClosed(id)
107 | }
108 | logDebug(s"Adding dynamic partition txnId: $id writeId: $writeId dbName: $dbName" +
109 | s" tableName: $tableName partitions: ${partitions.mkString(",")}")
110 | HiveAcidTxn.txnManager.addDynamicPartitions(id, writeId, dbName,
111 | tableName, partitions, operationType)
112 | }
113 | // Public Interface
114 | def txnId: Long = id
115 | }
116 |
117 | object HiveAcidTxn extends Logging {
118 |
119 | val threadLocal = new ThreadLocal[HiveAcidTxn]
120 |
121 | // Helper function to create snapshot.
122 | private[hiveacid] def createSnapshot(txn: HiveAcidTxn, hiveAcidMetadata: HiveAcidMetadata): HiveAcidTableSnapshot = {
123 | val currentWriteId = txnManager.getCurrentWriteId(txn.txnId,
124 | hiveAcidMetadata.dbName, hiveAcidMetadata.tableName)
125 | val validWriteIdList: ValidWriteIdList = getValidWriteIds(txn, hiveAcidMetadata)
126 | HiveAcidTableSnapshot(validWriteIdList, currentWriteId)
127 | }
128 |
129 | private[hiveacid] def getValidWriteIds(txn: HiveAcidTxn, hiveAcidMetadata: HiveAcidMetadata) = {
130 | val validWriteIdList = if (txn.txnId == -1) {
131 | throw HiveAcidErrors.tableWriteIdRequestedBeforeTxnStart(hiveAcidMetadata.fullyQualifiedName)
132 | } else {
133 | txnManager.getValidWriteIds(txn.txnId, txn.validTxnList, hiveAcidMetadata.fullyQualifiedName)
134 | }
135 | validWriteIdList
136 | }
137 |
138 |   // The txn manager is the connection to the Hive Metastore (HMS). Use a single instance of it.
139 | var txnManager: HiveAcidTxnManager = _
140 | private def setUpTxnManager(sparkSession: SparkSession): Unit = synchronized {
141 | if (txnManager == null) {
142 | txnManager = new HiveAcidTxnManager(sparkSession)
143 | }
144 | }
145 |
146 |   /**
147 |    * Creates a new Hive ACID transaction, used for both read and write operations.
148 |    *
149 |    * @param sparkSession Spark session
150 |    * @return a new [[HiveAcidTxn]]
151 |    */
152 | def createTransaction(sparkSession: SparkSession): HiveAcidTxn = {
153 | setUpTxnManager(sparkSession)
154 | new HiveAcidTxn(sparkSession)
155 | }
156 |
157 |   /**
158 |    * Returns the transaction associated with the current thread, or null if none is active.
159 |    * @return the current [[HiveAcidTxn]], if any
160 |    */
161 | def currentTxn(): HiveAcidTxn = {
162 | threadLocal.get()
163 | }
164 |
165 |   /**
166 |    * Checks whether the valid write ids for `fullyQualifiedTableName` observed when `txn`
167 |    * was opened are still the same. This should be invoked after `txn` acquires locks,
168 |    * to verify that the transaction is still valid before continuing.
169 |    */
170 | def IsTxnStillValid(txn: HiveAcidTxn, fullyQualifiedTableName: String): Boolean = {
171 |     if (txn.txnId == -1) {
172 |       logWarning(s"Transaction being validated even before it was opened")
173 | false
174 | } else {
175 | // Compare the earlier writeIds of fullyQualifiedTableName with the current one.
176 | val previousWriteIdList = txnManager.getValidWriteIds(txn.txnId, txn.validTxnList, fullyQualifiedTableName)
177 | val currentValidList = txnManager.getValidTxns(Some(txn.txnId))
178 | val currentWriteIdList = txnManager.getValidWriteIds(txn.txnId, currentValidList, fullyQualifiedTableName)
179 |       // Using the high watermark, check whether any new write transaction was started
180 |       // and committed after this transaction was opened and before locks were acquired
181 | if (previousWriteIdList.getHighWatermark == currentWriteIdList.getHighWatermark) {
182 |         // Check that all transactions that were open when the current transaction started
183 |         // are still invalid, i.e. still running/open or aborted.
184 | val prevOpenInvalidWriteIds = previousWriteIdList.getInvalidWriteIds
185 | .filter(!previousWriteIdList.isWriteIdAborted(_)).toSet
186 | val currentInvalidWriteIds = currentWriteIdList.getInvalidWriteIds.toSet
187 | // Previous open transactions should still be invalid
188 | if (prevOpenInvalidWriteIds.isEmpty ||
189 | prevOpenInvalidWriteIds.diff(currentInvalidWriteIds).isEmpty) {
190 | logDebug("All previous open transactions are still invalid! Transaction is valid!")
191 | true
192 | } else {
193 | logWarning("Prev Open transactions: " + prevOpenInvalidWriteIds.diff(currentInvalidWriteIds).mkString(", ")
194 | + " have been committed. Transaction " + txn.txnId + " is not valid !")
195 | false
196 | }
197 | } else {
198 | logWarning("HighWatermark moved from " +
199 | previousWriteIdList.getHighWatermark + " to " +
200 | currentWriteIdList.getHighWatermark +
201 | ". Transaction " + txn.txnId + " is not valid !")
202 | false
203 | }
204 | }
205 | }
206 | }
207 |
208 | private[hiveacid] case class HiveAcidTableSnapshot(validWriteIdList: ValidWriteIdList, currentWriteId: Long)
209 |
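A sketch of the public transaction lifecycle exposed above: a hypothetical withTransaction helper that begins a transaction, runs a body, and aborts on failure.

import org.apache.spark.sql.SparkSession

def withTransaction[T](spark: SparkSession)(body: HiveAcidTxn => T): T = {
  val txn = HiveAcidTxn.createTransaction(spark)
  txn.begin()
  try {
    val result = body(txn)
    txn.end()               // commit
    result
  } catch {
    case t: Throwable =>
      txn.end(abort = true) // abort on any failure
      throw t
  }
}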
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/util/.gitignore:
--------------------------------------------------------------------------------
1 | *.scalae
2 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/util/HiveAcidKyroRegistrator.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.util
21 |
22 | import com.esotericsoftware.kryo.Kryo
23 | import org.apache.spark.serializer.KryoRegistrator
24 | import com.esotericsoftware.kryo.serializers.JavaSerializer
25 |
26 | class HiveAcidKyroRegistrator extends KryoRegistrator {
27 | override def registerClasses(kryo: Kryo): Unit = {
28 | kryo.register(classOf[com.qubole.spark.hiveacid.util.SerializableConfiguration], new JavaSerializer)
29 | }
30 | }
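A sketch of wiring the registrator above into a Spark session through the standard spark.serializer and spark.kryo.registrator settings.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("acid-kryo-example")
  .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .config("spark.kryo.registrator", "com.qubole.spark.hiveacid.util.HiveAcidKyroRegistrator")
  .enableHiveSupport()
  .getOrCreate()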
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/util/SerializableConfiguration.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.util
21 |
22 | import java.io.{ObjectInputStream, ObjectOutputStream}
23 |
24 | import org.apache.hadoop.conf.Configuration
25 |
26 | /**
27 | * Utility class to make configuration object serializable
28 | */
29 | private[hiveacid] class SerializableConfiguration(@transient var value: Configuration)
30 | extends Serializable {
31 | private def writeObject(out: ObjectOutputStream): Unit = Util.tryOrIOException {
32 | out.defaultWriteObject()
33 | value.write(out)
34 | }
35 |
36 | private def readObject(in: ObjectInputStream): Unit = Util.tryOrIOException {
37 | value = new Configuration(false)
38 | value.readFields(in)
39 | }
40 | }
41 |
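A sketch of how code inside the hiveacid package (the class is private[hiveacid]) can ship the Hadoop configuration to executors by wrapping it first; the paths are made up.

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("serializable-conf-example").getOrCreate()
val serializableConf = new SerializableConfiguration(spark.sparkContext.hadoopConfiguration)

spark.sparkContext.parallelize(Seq("/tmp/a", "/tmp/b")).foreach { pathStr =>
  // On the executor, readObject has rebuilt a usable Configuration in `value`
  val path = new Path(pathStr)
  val fs = path.getFileSystem(serializableConf.value)
  fs.exists(path)
}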
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/util/SerializableWritable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.util
21 |
22 | import java.io._
23 |
24 | import org.apache.hadoop.conf.Configuration
25 | import org.apache.hadoop.io.ObjectWritable
26 | import org.apache.hadoop.io.Writable
27 |
28 | /**
29 | * Utility class to make a Writable serializable
30 | */
31 | private[hiveacid] class SerializableWritable[T <: Writable](@transient var t: T)
32 | extends Serializable {
33 |
34 | def value: T = t
35 |
36 | override def toString: String = t.toString
37 |
38 | private def writeObject(out: ObjectOutputStream): Unit = Util.tryOrIOException {
39 | out.defaultWriteObject()
40 | new ObjectWritable(t).write(out)
41 | }
42 |
43 | private def readObject(in: ObjectInputStream): Unit = Util.tryOrIOException {
44 | in.defaultReadObject()
45 | val ow = new ObjectWritable()
46 | ow.setConf(new Configuration(false))
47 | ow.readFields(in)
48 | t = ow.get().asInstanceOf[T]
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/util/Util.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.util
21 |
22 | import java.io.IOException
23 |
24 | import org.apache.spark.internal.Logging
25 |
26 | import scala.util.control.NonFatal
27 |
28 | private[hiveacid] object Util extends Logging {
29 |
30 | def classForName(className: String, loadShaded: Boolean = false): Class[_] = {
31 | val classToLoad = if (loadShaded) {
32 | className.replaceFirst("org.apache.hadoop.hive.", "com.qubole.shaded.hadoop.hive.")
33 | } else {
34 | className
35 | }
36 | Class.forName(classToLoad, true, Thread.currentThread().getContextClassLoader)
37 | }
38 |
39 | /**
40 | * Detect whether this thread might be executing a shutdown hook. Will always return true if
41 |    * the current thread is running a shutdown hook but may spuriously return true otherwise (e.g.
42 | * if System.exit was just called by a concurrent thread).
43 | *
44 | * Currently, this detects whether the JVM is shutting down by Runtime#addShutdownHook throwing
45 | * an IllegalStateException.
46 | */
47 | def inShutdown(): Boolean = {
48 | try {
49 | val hook: Thread = new Thread {
50 |         override def run(): Unit = {}
51 | }
52 | // scalastyle:off runtimeaddshutdownhook
53 | Runtime.getRuntime.addShutdownHook(hook)
54 | // scalastyle:on runtimeaddshutdownhook
55 | Runtime.getRuntime.removeShutdownHook(hook)
56 | } catch {
57 | case _: IllegalStateException => return true
58 | }
59 | false
60 | }
61 |
62 | def tryOrIOException[T](block: => T): T = {
63 | try {
64 | block
65 | } catch {
66 | case e: IOException =>
67 | logError("Exception encountered", e)
68 | throw e
69 | case NonFatal(e) =>
70 | logError("Exception encountered", e)
71 | throw new IOException(e)
72 | }
73 | }
74 | }
75 |
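Two small sketches of the helpers above (Util is private[hiveacid], so this reflects in-package use); the class name and file path are illustrative only.

import java.nio.file.{Files, Paths}

// classForName rewrites org.apache.hadoop.hive.* to the shaded namespace when asked to.
val shadedClass = Util.classForName(
  "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", loadShaded = true)
// shadedClass.getName == "com.qubole.shaded.hadoop.hive.ql.io.orc.OrcInputFormat"

// tryOrIOException logs and rethrows IOExceptions, and wraps other non-fatal errors in one.
val bytes: Array[Byte] = Util.tryOrIOException {
  Files.readAllBytes(Paths.get("/tmp/some-file"))
}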
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/writer/Writer.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.writer
21 |
22 | import org.apache.spark.sql.catalyst.InternalRow
23 | import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
24 |
25 | private[hiveacid] trait Writer {
26 | def process(row: InternalRow): Unit
27 | def close(): Unit
28 | def partitionsTouched(): Seq[TablePartitionSpec]
29 | }
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/writer/WriterOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.writer
21 |
22 | import com.qubole.spark.hiveacid.HiveAcidOperation
23 | import com.qubole.spark.hiveacid.util.SerializableConfiguration
24 | import org.apache.spark.sql.catalyst.expressions.Attribute
25 | import org.apache.spark.sql.types.StructType
26 |
27 | /**
28 | * Writer options which will be serialized and sent to each executor
29 | */
30 | private[hiveacid] class WriterOptions(val currentWriteId: Long,
31 | val operationType: HiveAcidOperation.OperationType,
32 | val serializableHadoopConf: SerializableConfiguration,
33 | val tableSchemaWithrowID: StructType,
34 | val dataColumns: Seq[Attribute],
35 | val partitionColumns: Seq[Attribute],
36 | val allColumns: Seq[Attribute],
37 | val timeZoneId: String,
38 | val statementId: Option[Int] = None) extends Serializable
39 |
--------------------------------------------------------------------------------
/src/main/scala/com/qubole/spark/hiveacid/writer/hive/HiveAcidWriterOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.writer.hive
21 |
22 | import com.qubole.shaded.hadoop.hive.ql.plan.FileSinkDesc
23 | import com.qubole.spark.hiveacid.HiveAcidOperation
24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
25 | import com.qubole.spark.hiveacid.writer.WriterOptions
26 | import org.apache.hadoop.fs.Path
27 |
28 | private[writer] class HiveAcidWriterOptions(val rootPath: String,
29 | fileSinkDesc: FileSinkDesc) extends Serializable {
30 | lazy val getFileSinkDesc: FileSinkDesc = {
31 | fileSinkDesc.setDirName(new Path(rootPath))
32 | fileSinkDesc
33 | }
34 | }
35 |
36 | private[writer] object HiveAcidWriterOptions {
37 | def get(hiveAcidMetadata: HiveAcidMetadata,
38 | options: WriterOptions): HiveAcidWriterOptions = {
39 | lazy val fileSinkDescriptor: FileSinkDesc = {
40 | val fileSinkDesc: FileSinkDesc = new FileSinkDesc()
41 | fileSinkDesc.setTableInfo(hiveAcidMetadata.tableDesc)
42 | fileSinkDesc.setTableWriteId(options.currentWriteId)
43 | if (options.operationType == HiveAcidOperation.INSERT_OVERWRITE) {
44 | fileSinkDesc.setInsertOverwrite(true)
45 | }
46 | if (options.statementId.isDefined) {
47 | fileSinkDesc.setStatementId(options.statementId.get)
48 | }
49 | fileSinkDesc
50 | }
51 | new HiveAcidWriterOptions(rootPath = hiveAcidMetadata.rootPath.toUri.toString,
52 | fileSinkDesc = fileSinkDescriptor)
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/SqlUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package org.apache.spark.sql
21 |
22 | import org.apache.spark.rdd.RDD
23 | import org.apache.spark.sql.catalyst.InternalRow
24 | import org.apache.spark.sql.catalyst.analysis._
25 | import org.apache.spark.sql.catalyst.catalog.HiveTableRelation
26 | import org.apache.spark.sql.catalyst.encoders.RowEncoder
27 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
28 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression}
29 | import org.apache.spark.sql.execution.LogicalRDD
30 | import org.apache.spark.sql.execution.datasources.LogicalRelation
31 | import org.apache.spark.sql.types.StructType
32 |
33 | object SqlUtils {
34 | def convertToDF(sparkSession: SparkSession, plan : LogicalPlan): DataFrame = {
35 | Dataset.ofRows(sparkSession, plan)
36 | }
37 |
38 | def resolveReferences(sparkSession: SparkSession,
39 | expr: Expression,
40 | planContaining: LogicalPlan, failIfUnresolved: Boolean,
41 | exprName: Option[String] = None): Expression = {
42 | resolveReferences(sparkSession, expr, Seq(planContaining), failIfUnresolved, exprName)
43 | }
44 |
45 | def resolveReferences(sparkSession: SparkSession,
46 | expr: Expression,
47 | planContaining: Seq[LogicalPlan],
48 | failIfUnresolved: Boolean,
49 | exprName: Option[String]): Expression = {
50 | val newPlan = FakeLogicalPlan(expr, planContaining)
51 | val resolvedExpr = sparkSession.sessionState.analyzer.execute(newPlan) match {
52 | case FakeLogicalPlan(resolvedExpr: Expression, _) =>
53 | // Return even if it did not successfully resolve
54 | resolvedExpr
55 |       case _ =>
56 |         // This is unexpected
57 |         expr
58 | }
59 | if (failIfUnresolved) {
60 | resolvedExpr.flatMap(_.references).filter(!_.resolved).foreach {
61 | attr => {
62 | val failedMsg = exprName match {
63 | case Some(name) => s"${attr.sql} resolution in $name given these columns: "+
64 | planContaining.flatMap(_.output).map(_.name).mkString(",")
65 | case _ => s"${attr.sql} resolution failed given these columns: "+
66 | planContaining.flatMap(_.output).map(_.name).mkString(",")
67 | }
68 | attr.failAnalysis(failedMsg)
69 | }
70 | }
71 | }
72 | resolvedExpr
73 | }
74 |
75 | def hasSparkStopped(sparkSession: SparkSession): Boolean = {
76 | sparkSession.sparkContext.stopped.get()
77 | }
78 |
79 |   /**
80 |    * Qualifies all the column names in the DataFrame:
81 |    * attributes in the DataFrame output will carry fully qualified names.
82 |    * @param sparkSession Spark session
83 |    * @param df DataFrame created by reading an ACID table
84 |    * @param fullyQualifiedTableName qualified name of the Hive ACID table
85 |    * @return the qualified logical plan and the corresponding DataFrame
86 |    */
87 | def getDFQualified(sparkSession: SparkSession,
88 | df: DataFrame,
89 | fullyQualifiedTableName: String) = {
90 | val plan = df.queryExecution.analyzed
91 | val qualifiedPlan = plan match {
92 | case p: LogicalRelation =>
93 | p.copy(output = p.output
94 | .map((x: AttributeReference) =>
95 | x.withQualifier(fullyQualifiedTableName.split('.').toSeq))
96 | )
97 |       case h: HiveTableRelation =>
98 |         // Qualify both data and partition columns in a single copy so that
99 |         // neither update is discarded
100 |         h.copy(
101 |           dataCols = h.dataCols.map((x: AttributeReference) =>
102 |             x.withQualifier(fullyQualifiedTableName.split('.').toSeq)),
103 |           partitionCols = h.partitionCols.map((x: AttributeReference) =>
104 |             x.withQualifier(fullyQualifiedTableName.split('.').toSeq))
105 |         )
106 | case _ => plan
107 | }
108 |
109 | val newDf = SqlUtils.convertToDF(sparkSession, qualifiedPlan)
110 | (qualifiedPlan, newDf)
111 | }
112 |
113 | def logicalPlanToDataFrame(sparkSession: SparkSession,
114 | logicalPlan: LogicalPlan): DataFrame = {
115 | Dataset.ofRows(sparkSession, logicalPlan)
116 | }
117 |
118 |   /**
119 |    * Converts an RDD into a DataFrame using the given attribute list.
120 |    * Based on the [[SparkSession.createDataFrame()]] implementation, except that
121 |    * the output attributes are provided by the caller.
122 |    * @param sparkSession Spark session
123 |    * @param rdd RDD of rows to convert
124 |    * @param schema schema of the rows
125 |    * @param attributes output attributes for the resulting logical plan
126 |    * @return the resulting DataFrame
127 |    */
128 | def createDataFrameUsingAttributes(sparkSession: SparkSession,
129 | rdd: RDD[Row],
130 | schema: StructType,
131 | attributes: Seq[Attribute]): DataFrame = {
132 | val encoder = RowEncoder(schema)
133 | val catalystRows = rdd.map(encoder.toRow)
134 | val logicalPlan = LogicalRDD(
135 | attributes,
136 | catalystRows,
137 | isStreaming = false)(sparkSession)
138 | Dataset.ofRows(sparkSession, logicalPlan)
139 | }
140 |
141 | def analysisException(cause: String): Throwable = {
142 | new AnalysisException(cause)
143 | }
144 |
145 | def removeTopSubqueryAlias(logicalPlan: LogicalPlan): LogicalPlan = {
146 | logicalPlan match {
147 | case SubqueryAlias(_, child: LogicalPlan) => child
148 | case _ => logicalPlan
149 | }
150 | }
151 | }
152 |
153 | case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan])
154 | extends LogicalPlan {
155 | override def output: Seq[Attribute] = children.foldLeft(Seq[Attribute]())((out, child) => out ++ child.output)
156 | }
157 |
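A sketch of getDFQualified above, assuming the ACID data source is registered under the short name "HiveAcid" and that `default.acid_events` is a hypothetical table.

import org.apache.spark.sql.{SparkSession, SqlUtils}

val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
val df = spark.read.format("HiveAcid").option("table", "default.acid_events").load()

// Returns the re-qualified logical plan plus a DataFrame over it; every output attribute
// is now qualified as default.acid_events.<column>.
val (qualifiedPlan, qualifiedDf) = SqlUtils.getDFQualified(spark, df, "default.acid_events")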
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/catalyst/parser/plans/logical/MergePlan.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.sql.catalyst.parser.plans.logical
19 |
20 | import com.qubole.spark.hiveacid.merge.MergeWhenClause
21 | import org.apache.spark.sql.{SparkSession, SqlUtils}
22 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
23 | import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
24 |
25 | case class MergePlan(sourcePlan: LogicalPlan,
26 | targetPlan: LogicalPlan,
27 | condition: Expression,
28 | matched: Seq[MergeWhenClause],
29 | notMatched: Option[MergeWhenClause]) extends Command {
30 | override def children: Seq[LogicalPlan] = Seq(sourcePlan, targetPlan)
31 | override def output: Seq[Attribute] = Seq.empty
32 | }
33 |
34 | object MergePlan {
35 | def resolve(sparkSession: SparkSession, mergePlan: MergePlan): MergePlan = {
36 | MergeWhenClause.validate(mergePlan.matched ++ mergePlan.notMatched)
37 | val resolvedCondition = SqlUtils.resolveReferences(sparkSession, mergePlan.condition,
38 | mergePlan.children, true, None)
39 | val resolvedMatched = MergeWhenClause.resolve(sparkSession, mergePlan, mergePlan.matched)
40 | val resolvedNotMatched = mergePlan.notMatched.map {
41 | x => x.resolve(sparkSession, mergePlan)
42 | }
43 |
44 | MergePlan(mergePlan.sourcePlan,
45 | mergePlan.targetPlan,
46 | resolvedCondition,
47 | resolvedMatched,
48 | resolvedNotMatched)
49 | }
50 | }
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/sql/hive/HiveAcidUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package org.apache.spark.sql.hive
21 |
22 | import scala.collection.JavaConverters._
23 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
24 | import org.apache.spark.sql.AnalysisException
25 | import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTablePartition, CatalogUtils}
26 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, Expression, InterpretedPredicate, PrettyAttribute}
27 |
28 | object HiveAcidUtils {
29 |
30 |   /**
31 |    * Adapted from [[org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.prunePartitionsByFilter]],
32 |    * except that it uses [[HiveAcidMetadata]] instead of [[org.apache.spark.sql.catalyst.catalog.CatalogTable]].
33 |    * @param hiveAcidMetadata metadata of the Hive ACID table
34 |    * @param inputPartitions partitions to prune
35 |    * @param predicates optional partition-pruning predicate
36 |    * @param defaultTimeZoneId default time zone used when evaluating partition values
37 |    * @return partitions that satisfy the predicate
38 |    */
39 | def prunePartitionsByFilter(
40 | hiveAcidMetadata: HiveAcidMetadata,
41 | inputPartitions: Seq[CatalogTablePartition],
42 | predicates: Option[Expression],
43 | defaultTimeZoneId: String): Seq[CatalogTablePartition] = {
44 | if (predicates.isEmpty) {
45 | inputPartitions
46 | } else {
47 | val partitionSchema = hiveAcidMetadata.partitionSchema
48 | val partitionColumnNames = hiveAcidMetadata.partitionSchema.fieldNames.toSet
49 |
50 | val nonPartitionPruningPredicates = predicates.filterNot {
51 | _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
52 | }
53 | if (nonPartitionPruningPredicates.nonEmpty) {
54 | throw new AnalysisException("Expected only partition pruning predicates: " +
55 | nonPartitionPruningPredicates)
56 | }
57 |
58 | val boundPredicate =
59 | InterpretedPredicate.create(predicates.get.transform {
60 | case att: Attribute =>
61 | val index = partitionSchema.indexWhere(_.name == att.name)
62 | BoundReference(index, partitionSchema(index).dataType, nullable = true)
63 | })
64 |
65 | inputPartitions.filter { p =>
66 | boundPredicate.eval(p.toRow(partitionSchema, defaultTimeZoneId))
67 | }
68 | }
69 | }
70 |
71 | def convertToCatalogTablePartition(hp: com.qubole.shaded.hadoop.hive.ql.metadata.Partition): CatalogTablePartition = {
72 | val apiPartition = hp.getTPartition
73 | val properties: Map[String, String] = if (hp.getParameters != null) {
74 | hp.getParameters.asScala.toMap
75 | } else {
76 | Map.empty
77 | }
78 | CatalogTablePartition(
79 | spec = Option(hp.getSpec).map(_.asScala.toMap).getOrElse(Map.empty),
80 | storage = CatalogStorageFormat(
81 | locationUri = Option(CatalogUtils.stringToURI(apiPartition.getSd.getLocation)),
82 | inputFormat = Option(apiPartition.getSd.getInputFormat),
83 | outputFormat = Option(apiPartition.getSd.getOutputFormat),
84 | serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib),
85 | compressed = apiPartition.getSd.isCompressed,
86 | properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
87 | .map(_.asScala.toMap).orNull),
88 | createTime = apiPartition.getCreateTime.toLong * 1000,
89 | lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
90 | parameters = properties,
91 | stats = None) // TODO: need to implement readHiveStats
92 | }
93 | }
94 |
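A sketch of prunePartitionsByFilter above with a single partition-column predicate; hiveAcidMetadata and the partition list are assumed to come from the metastore, and the partition column name `dt` is illustrative.

import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import org.apache.spark.sql.catalyst.catalog.CatalogTablePartition
import org.apache.spark.sql.functions

def prunedPartitions(hiveAcidMetadata: HiveAcidMetadata,
                     partitions: Seq[CatalogTablePartition]): Seq[CatalogTablePartition] = {
  // The predicate may only reference partition columns, otherwise an AnalysisException is thrown.
  val predicate = functions.expr("dt = '2019-01-01'").expr
  HiveAcidUtils.prunePartitionsByFilter(
    hiveAcidMetadata,
    partitions,
    Some(predicate),
    defaultTimeZoneId = "UTC")
}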
--------------------------------------------------------------------------------
/src/test/scala/com/qubole/spark/hiveacid/merge/MergeClauseSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Qubole, Inc. All rights reserved.
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | package com.qubole.spark.hiveacid.merge
21 |
22 | import org.apache.spark.SparkFunSuite
23 | import org.apache.spark.sql.{AnalysisException, functions}
24 |
25 | class MergeClauseSuite extends SparkFunSuite {
26 | def insertClause(addCondition : Boolean = true): MergeWhenNotInsert = {
27 | if (addCondition) {
28 | MergeWhenNotInsert(Some(functions.expr("x > 2").expr),
29 | Seq(functions.col("x").expr, functions.col("y").expr))
30 | }
31 | else {
32 | MergeWhenNotInsert(None,
33 | Seq(functions.col("x").expr, functions.col("y").expr))
34 | }
35 | }
36 |
37 | def updateClause(addCondition : Boolean = true): MergeWhenUpdateClause = {
38 | if (addCondition) {
39 | val updateCondition = Some(functions.expr("a > 2").expr)
40 | MergeWhenUpdateClause(updateCondition,
41 | Map("b" -> functions.lit(3).expr), isStar = false)
42 | } else {
43 | MergeWhenUpdateClause(None,
44 | Map("b" -> functions.lit(3).expr), isStar = false)
45 | }
46 | }
47 |
48 | def deleteClause(addCondition : Boolean = true): MergeWhenDelete = {
49 | if (addCondition) {
50 | MergeWhenDelete(Some(functions.expr("a < 1").expr))
51 | } else {
52 | MergeWhenDelete(None)
53 | }
54 | }
55 |
56 | test("Validate MergeClauses") {
57 | val clauses = Seq(insertClause(), updateClause(), deleteClause())
58 | MergeWhenClause.validate(clauses)
59 | }
60 |
61 | test("Invalid MergeClause cases") {
62 | val invalidMerge = "MERGE Validation Error: "
63 |
64 | //empty clauses
65 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.atleastOneClauseError, Seq())
66 |
67 | // multi update or insert clauses
68 | val multiUpdateClauses = Seq(updateClause(), updateClause(), insertClause())
69 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.justOneClausePerTypeError, multiUpdateClauses)
70 |
71 | // multi match clauses with first clause without condition
72 | val invalidMultiMatch = Seq(updateClause(false), deleteClause())
73 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.matchClauseConditionError, invalidMultiMatch)
74 |
75 | // invalid Update Clause
76 | val invalidUpdateClause = MergeWhenUpdateClause(None, Map(), isStar = false)
77 | val thrown = intercept[IllegalArgumentException] {
78 | MergeWhenClause.validate(Seq(invalidUpdateClause))
79 | }
80 | assert(thrown.getMessage === "UPDATE Clause in MERGE should have one or more SET Values")
81 | }
82 |
83 | private def checkInvalidMergeClause(invalidMessage: String, multiUpdateClauses: Seq[MergeWhenClause]) = {
84 | val thrown = intercept[AnalysisException] {
85 | MergeWhenClause.validate(multiUpdateClauses)
86 | }
87 | assert(thrown.message === invalidMessage)
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | version in ThisBuild := "0.6.0"
--------------------------------------------------------------------------------