├── .gitignore ├── LICENSE.txt ├── README.md ├── build.sbt ├── docker ├── README.md ├── beeline ├── build ├── files │ ├── Dockerfile │ ├── bootstrap.sh │ ├── core-site.xml │ ├── hadoop-env.sh │ ├── hdfs-site.xml │ ├── hive-site.xml │ ├── mapred-site.xml │ └── yarn-site.xml ├── inspect ├── login ├── spark-shell ├── start └── stop ├── project ├── build.properties └── plugins.sbt ├── shaded-dependencies ├── build.sbt └── project │ ├── build.properties │ └── plugins.sbt ├── src ├── it │ └── scala │ │ └── com │ │ └── qubole │ │ └── spark │ │ └── hiveacid │ │ ├── LockSuite.scala │ │ ├── MergeSuite.scala │ │ ├── ReadSuite.scala │ │ ├── Table.scala │ │ ├── TestHelper.scala │ │ ├── TestHiveClient.java │ │ ├── TestSparkSession.scala │ │ ├── UpdateDeleteSuite.scala │ │ ├── WriteSuite.scala │ │ └── streaming │ │ ├── HiveAcidSinkLogSuite.scala │ │ ├── HiveAcidSinkOptionsSuite.scala │ │ ├── HiveAcidSinkSuite.scala │ │ ├── HiveAcidStreamingFunSuite.scala │ │ └── StreamingTestHelper.scala ├── main │ ├── antlr4 │ │ └── com │ │ │ └── qubole │ │ │ └── spark │ │ │ └── datasources │ │ │ └── hiveacid │ │ │ └── sql │ │ │ └── catalyst │ │ │ └── parser │ │ │ └── SqlHive.g4 │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ │ ├── com │ │ └── qubole │ │ │ ├── shaded │ │ │ └── hadoop │ │ │ │ └── hive │ │ │ │ └── ql │ │ │ │ └── io │ │ │ │ └── orc │ │ │ │ └── OrcAcidUtil.scala │ │ │ └── spark │ │ │ ├── datasources │ │ │ └── hiveacid │ │ │ │ └── sql │ │ │ │ ├── HiveAnalysisException.scala │ │ │ │ ├── catalyst │ │ │ │ ├── parser │ │ │ │ │ ├── AstBuilder.scala │ │ │ │ │ └── ParseDriver.scala │ │ │ │ └── plans │ │ │ │ │ └── command │ │ │ │ │ ├── DeleteCommand.scala │ │ │ │ │ ├── MergeCommand.scala │ │ │ │ │ └── UpdateCommand.scala │ │ │ │ └── execution │ │ │ │ ├── SparkAcidSqlParser.scala │ │ │ │ └── SparkSqlAstBuilder.scala │ │ │ └── hiveacid │ │ │ ├── .gitignore │ │ │ ├── AcidOperationDelegate.scala │ │ │ ├── HiveAcidAutoConvert.scala │ │ │ ├── HiveAcidErrors.scala │ │ │ ├── HiveAcidOperation.scala │ │ │ ├── HiveAcidTable.scala │ │ │ ├── SparkAcidConf.scala │ │ │ ├── datasource │ │ │ ├── HiveAcidDataSource.scala │ │ │ └── HiveAcidRelation.scala │ │ │ ├── hive │ │ │ ├── .gitignore │ │ │ ├── HiveAcidMetadata.scala │ │ │ └── HiveConverter.scala │ │ │ ├── merge │ │ │ ├── MergeImpl.scala │ │ │ └── MergeWhenClause.scala │ │ │ ├── package.scala │ │ │ ├── rdd │ │ │ ├── EmptyRDD.scala │ │ │ ├── HiveAcidRDD.scala │ │ │ └── HiveAcidUnionRDD.scala │ │ │ ├── reader │ │ │ ├── .gitignore │ │ │ ├── Reader.scala │ │ │ ├── ReaderOptions.scala │ │ │ ├── TableReader.scala │ │ │ └── hive │ │ │ │ ├── HiveAcidPartitionComputer.scala │ │ │ │ ├── HiveAcidReader.scala │ │ │ │ ├── HiveAcidReaderOptions.scala │ │ │ │ └── HiveAcidSearchArgument.scala │ │ │ ├── streaming │ │ │ ├── HiveAcidSink.scala │ │ │ ├── HiveAcidSinkLog.scala │ │ │ ├── HiveAcidSinkOptions.scala │ │ │ └── HiveAcidStreamingCommitProtocol.scala │ │ │ ├── transaction │ │ │ ├── HiveAcidTxn.scala │ │ │ └── HiveAcidTxnManager.scala │ │ │ ├── util │ │ │ ├── .gitignore │ │ │ ├── HiveAcidKyroRegistrator.scala │ │ │ ├── SerializableConfiguration.scala │ │ │ ├── SerializableWritable.scala │ │ │ └── Util.scala │ │ │ └── writer │ │ │ ├── TableWriter.scala │ │ │ ├── Writer.scala │ │ │ ├── WriterOptions.scala │ │ │ └── hive │ │ │ ├── HiveAcidWriter.scala │ │ │ └── HiveAcidWriterOptions.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ ├── SqlUtils.scala │ │ ├── catalyst │ │ └── parser │ │ │ └── plans │ │ │ └── 
logical │ │ │ └── MergePlan.scala │ │ └── hive │ │ ├── Hive3Inspectors.scala │ │ └── HiveAcidUtils.scala └── test │ └── scala │ ├── com │ └── qubole │ │ └── spark │ │ └── hiveacid │ │ ├── merge │ │ └── MergeClauseSuite.scala │ │ └── sql │ │ └── catalyst │ │ └── parser │ │ └── MergeParserSuite.scala │ └── org │ └── apache │ └── spark │ └── sql │ └── catalyst │ └── parser │ └── plans │ └── logical │ └── MergePlanSuite.scala └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.iml 4 | *.ipr 5 | *.iws 6 | .idea 7 | out 8 | .cache/ 9 | .history/ 10 | .lib/ 11 | dist/* 12 | target/ 13 | bin/ 14 | libexec/ 15 | lib_managed/ 16 | src_managed/ 17 | project/boot/ 18 | project/plugins/project/ 19 | logs/ 20 | project/*-shim.sbt 21 | project/project/ 22 | project/target/ 23 | target/ 24 | .scala_dependencies 25 | .worksheet 26 | shaded_dependencies 27 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | name := "spark-acid" 21 | 22 | organization:= "com.qubole" 23 | 24 | /******************* 25 | * Scala settings 26 | */ 27 | 28 | crossScalaVersions := Seq("2.11.12") 29 | 30 | scalaVersion := crossScalaVersions.value.head 31 | 32 | scalacOptions ++= Seq( 33 | "-Xlint", 34 | "-Xfatal-warnings", 35 | "-deprecation", 36 | "-unchecked", 37 | "-optimise", 38 | "-Yinline-warnings" 39 | ) 40 | 41 | scalacOptions in (Compile, doc) ++= Seq( 42 | "-no-link-warnings" // Suppresses problems with Scaladoc @throws links 43 | ) 44 | 45 | /************************** 46 | * Spark package settings 47 | */ 48 | sparkVersion := sys.props.getOrElse("spark.version", "2.4.3") 49 | 50 | spIncludeMaven := true 51 | 52 | spIgnoreProvided := true 53 | 54 | 55 | /************************ 56 | * Library Dependencies 57 | */ 58 | 59 | libraryDependencies ++= Seq( 60 | // Adding test classifier seems to break transitive resolution of the core dependencies 61 | "org.apache.spark" %% "spark-hive" % sparkVersion.value % "provided" excludeAll( 62 | ExclusionRule("org.apache", "hadoop-common"), 63 | ExclusionRule("org.apache", "hadoop-hdfs")), 64 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided" excludeAll( 65 | ExclusionRule("org.apache", "hadoop-common"), 66 | ExclusionRule("org.apache", "hadoop-hdfs")), 67 | "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided" excludeAll( 68 | ExclusionRule("org.apache", "hadoop-common"), 69 | ExclusionRule("org.apache", "hadoop-hdfs")), 70 | "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided" excludeAll( 71 | ExclusionRule("org.apache", "hadoop-common"), 72 | ExclusionRule("org.apache", "hadoop-hdfs")), 73 | "org.apache.hadoop" % "hadoop-common" % "2.8.1" % "provided", 74 | "org.apache.hadoop" % "hadoop-hdfs" % "2.8.1" % "provided", 75 | "org.apache.commons" % "commons-lang3" % "3.3.5" % "provided", 76 | // antlr-runtime 77 | "org.antlr" % "antlr4-runtime" % "4.7.2" % "provided" 78 | ) 79 | 80 | lazy val scalatest = "org.scalatest" %% "scalatest" % "3.0.5" 81 | 82 | // Dependencies for Test 83 | libraryDependencies ++= Seq( 84 | "org.apache.hadoop" % "hadoop-common" % "2.8.1" % "provided", 85 | "org.apache.hadoop" % "hadoop-hdfs" % "2.8.1" % "provided", 86 | "org.apache.commons" % "commons-lang3" % "3.3.5" % "provided", 87 | // Dependencies for tests 88 | // 89 | "org.scalatest" %% "scalatest" % "3.0.5" % "test", 90 | "junit" % "junit" % "4.12" % "it,test", 91 | "com.novocode" % "junit-interface" % "0.11" % "it,test", 92 | "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "test" classifier "tests", 93 | "org.apache.spark" %% "spark-core" % sparkVersion.value % "test" classifier "tests", 94 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % "test" classifier "tests" 95 | ) 96 | 97 | // Shaded jar dependency 98 | libraryDependencies ++= Seq( 99 | // intransitive() because we don't want to include any transitive dependencies of shaded-dependencies jar in main jar 100 | // ideally all such dependencies should be shaded inside shaded-dependencies jar 101 | "com.qubole" %% "spark-acid-shaded-dependencies" % sys.props.getOrElse("package.version", "0.1") intransitive() 102 | ) 103 | 104 | /************************************** 105 | * Remove Shaded Depenedency from POM 106 | */ 107 | 108 | import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} 109 | import scala.xml.transform.{RewriteRule, RuleTransformer} 110 | 111 | pomPostProcess := { (node: XmlNode) => 112 | new RuleTransformer(new 
RewriteRule { 113 | override def transform(node: XmlNode): XmlNodeSeq = node match { 114 | case e: Elem if e.label == "dependency" && e.child.filter(_.label == "groupId").text.mkString == "com.qubole" => 115 | val organization = e.child.filter(_.label == "groupId").flatMap(_.text).mkString 116 | val artifact = e.child.filter(_.label == "artifactId").flatMap(_.text).mkString 117 | val version = e.child.filter(_.label == "version").flatMap(_.text).mkString 118 | Comment(s"dependency $organization#$artifact;$version has been omitted") 119 | case _ => node 120 | } 121 | }).transform(node).head 122 | } 123 | 124 | excludeDependencies ++= Seq ( 125 | // hive 126 | "org.apache.hive" % "hive-exec", 127 | "org.apache.hive" % "hive-metastore", 128 | "org.apache.hive" % "hive-jdbc", 129 | "org.apache.hive" % "hive-service", 130 | "org.apache.hive" % "hive-serde", 131 | "org.apache.hive" % "hive-common", 132 | 133 | // orc 134 | "org.apache.orc" % "orc-core", 135 | "org.apache.orc" % "orc-mapreduce", 136 | 137 | "org.slf4j" % "slf4j-api" 138 | ) 139 | 140 | // do not run test at assembly 141 | test in assembly := {} 142 | 143 | // Spark Package Section 144 | spName := "qubole/spark-acid" 145 | 146 | spShade := true 147 | 148 | spAppendScalaVersion := true 149 | 150 | credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials") 151 | 152 | licenses += "Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0") 153 | 154 | pomExtra := 155 | <url>https://github.com/qubole/spark-acid</url> 156 | <scm> 157 | <url>git@github.com:qubole/spark-acid.git</url> 158 | <connection>scm:git:git@github.com:qubole/spark-acid.git</connection> 159 | </scm> 160 | <developers> 161 | <developer> 162 | <id>amoghmargoor</id> 163 | <name>Amogh Margoor</name> 164 | <url>https://github.com/amoghmargoor</url> 165 | </developer> 166 | <developer> 167 | <id>citrusraj</id> 168 | <name>Rajkumar Iyer</name> 169 | <url>https://github.com/citrusraj</url> 170 | </developer> 171 | <developer> 172 | <id>somani</id> 173 | <name>Abhishek Somani</name> 174 | <url>https://github.com/somani</url> 175 | </developer> 176 | <developer> 177 | <id>prakharjain09</id> 178 | <name>Prakhar Jain</name> 179 | <url>https://github.com/prakharjain09</url> 180 | </developer> 181 | <developer> 182 | <id>sourabh912</id> 183 | <name>Sourabh Goyal</name> 184 | <url>https://github.com/sourabh912</url> 185 | </developer> 186 | </developers> 187 | 188 | 189 | publishMavenStyle := true 190 | 191 | bintrayReleaseOnPublish := false 192 | 193 | import ReleaseTransformations._ 194 | 195 | // Add publishing to spark packages as another step.
196 | releaseProcess := Seq[ReleaseStep]( 197 | checkSnapshotDependencies, 198 | inquireVersions, 199 | setReleaseVersion, 200 | commitReleaseVersion, 201 | tagRelease, 202 | pushChanges, 203 | releaseStepTask(spDist), 204 | releaseStepTask(spPublish) 205 | ) 206 | 207 | /** 208 | * Antlr settings 209 | */ 210 | antlr4Settings 211 | antlr4PackageName in Antlr4 := Some("com.qubole.spark.datasources.hiveacid.sql.catalyst.parser") 212 | antlr4GenListener in Antlr4 := true 213 | antlr4GenVisitor in Antlr4 := true 214 | antlr4Version := "4.7.2" 215 | 216 | 217 | /******************* 218 | * Test settings 219 | */ 220 | 221 | parallelExecution in IntegrationTest := false 222 | 223 | // do not run test at assembly 224 | test in assembly := {} 225 | 226 | // do not add scala in fat jar 227 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 228 | 229 | //Integration test 230 | lazy val root = (project in file(".")) 231 | .configs(IntegrationTest) 232 | .settings( 233 | Defaults.itSettings, 234 | libraryDependencies += scalatest % "it" 235 | ) 236 | 237 | // exclude antlr classes from assembly since those 238 | // are available in spark at runtime 239 | // any other classes to be excluded from assembly 240 | // should be added here 241 | assemblyExcludedJars in assembly := { 242 | val cp = (fullClasspath in assembly).value 243 | cp filter {_.data.getName.contains("antlr")} 244 | } 245 | 246 | /*********************** 247 | * Release settings 248 | */ 249 | 250 | publishMavenStyle := true 251 | 252 | bintrayReleaseOnPublish := false 253 | 254 | import ReleaseTransformations._ 255 | 256 | // Add publishing to spark packages as another step. 257 | releaseProcess := Seq[ReleaseStep]( 258 | checkSnapshotDependencies, 259 | inquireVersions, 260 | setReleaseVersion, 261 | commitReleaseVersion, 262 | tagRelease, 263 | pushChanges 264 | ) 265 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | A pseudo-distributed Hadoop image for testing the Spark ACID datasource, based on 2 | 1. CentOS 6 3 | 2. [Hadoop 3.1.1](https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz) 4 | 3. [Hive 3.1.1](http://mirrors.estointernet.in/apache/hive/hive-3.1.1/apache-hive-3.1.1-bin.tar.gz) 5 | 4. [MySQL 5.6.44](http://repo.mysql.com/mysql-community-release-el6-5.noarch.rpm) 6 | 7 | # Setup 8 | 9 | Refer to [Install Docker](https://docs.docker.com/v17.12/install/) for instructions on installing Docker. 10 | 11 | # Build 12 | 13 | To build the docker image 14 | ```bash 15 | ./build 16 | ``` 17 | 18 | # Start 19 | 20 | _NB: Configure Docker to run with at least 4 GB of memory. On macOS this can be configured in Docker Desktop._ 21 | 22 | To start the docker container 23 | ```bash 24 | ./start 25 | ``` 26 | 27 | # Stop 28 | 29 | To stop the docker container 30 | ```bash 31 | ./stop 32 | ``` 33 | 34 | -------------------------------------------------------------------------------- /docker/beeline: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | name="spark-hiveacid-test-container" 4 | 5 | docker exec -it $name bin/bash -c "\ 6 | .
~/.bashrc; \ 7 | export HADOOP_HOME=/hadoop; \ 8 | hive/bin/beeline -n root -p root -u jdbc:hive2://0.0.0.0:10001/default" 9 | -------------------------------------------------------------------------------- /docker/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build -t centos6/spark-hadoop3-hive3 files/. 3 | -------------------------------------------------------------------------------- /docker/files/Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from centos:6 14 | MAINTAINER rajkumar@qubole.com 15 | 16 | 17 | RUN yum -y update 18 | RUN yum -y install epel-release 19 | 20 | 21 | RUN yum -y install java-1.8.0-openjdk-devel java-1.8.0-openjdk 22 | RUN ln -s /usr/lib/jvm//java-1.8.0-openjdk-amd64/ /usr/lib/jvm/java-1.8.0 23 | RUN ln -s /usr/lib/jvm//java-1.7.0-openjdk-amd64/ /usr/lib/jvm/java-1.7.0 24 | 25 | #RUN yum -y install vim 26 | RUN yum -y install wget tar sudo rsync 27 | 28 | RUN yum -y install initscripts httpd 29 | 30 | RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz 31 | RUN tar -xvzf hadoop-3.1.1.tar.gz 32 | RUN ln -sf /hadoop-3.1.1 /hadoop 33 | 34 | RUN wget https://archive.apache.org/dist/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz 35 | RUN tar -xvzf apache-hive-3.1.2-bin.tar.gz 36 | RUN ln -sf /apache-hive-3.1.2-bin /hive 37 | 38 | RUN yum -y install \ 39 | mysql-server mysql-connector-java \ 40 | && yum -y clean all && rm -rf /tmp/* /var/tmp/* \ 41 | && ln -s /usr/share/java/mysql-connector-java.jar apache-hive-3.1.2-bin/lib/mysql-connector-java.jar 42 | 43 | # Setup sock proxy 44 | RUN yum install -y openssh openssh-clients openssh-server 45 | 46 | # passwordless ssh 47 | RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa 48 | RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys 49 | 50 | RUN chmod 755 /root && chmod 700 /root/.ssh 51 | RUN passwd --unlock root 52 | 53 | RUN yum install -y vim mlocate unzip 54 | 55 | RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.0/hadoop-2.7.0.tar.gz 56 | RUN tar -xvzf hadoop-2.7.0.tar.gz 57 | 58 | 59 | # Copy configuration files 60 | RUN mkdir /conf 61 | COPY core-site.xml /conf/core-site.xml 62 | COPY hdfs-site.xml /conf/hdfs-site.xml 63 | COPY hadoop-env.sh /conf/hadoop-env.sh 64 | COPY yarn-site.xml /conf/yarn-site.xml 65 | 66 | COPY mapred-site.xml /conf/mapred-site.xml 67 | COPY hive-site.xml /conf/hive-site.xml 68 | COPY bootstrap.sh /bootstrap.sh 69 | 70 | # HDFS ports 71 | EXPOSE 1004 1006 8020 9866 9867 9870 9864 50470 9000 72 | 73 | # YARN ports 74 | EXPOSE 8030 8031 8032 8033 8040 8041 8042 8088 10020 19888 75 | 76 | # HIVE ports 77 | EXPOSE 9083 10000 78 | 79 | # SOCKS port 80 | EXPOSE 1180 81 | 82 | # mysql expose 83 | EXPOSE 3306 84 | 85 | # HDFS datnode 86 | EXPOSE 9866 87 | -------------------------------------------------------------------------------- 
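For orientation, the scripts in this docker/ directory (build, start, inspect, login, beeline, spark-shell, stop, shown above and below) are typically chained as follows when testing locally. This is an illustrative sketch based only on those scripts, not a documented workflow; the sleep duration is an assumption, since bootstrap.sh brings up HDFS, YARN, the metastore and HiveServer2 asynchronously.

```bash
#!/bin/bash
# Illustrative local test loop for the dockerized Hadoop/Hive environment.
cd docker

./build      # build the centos6/spark-hadoop3-hive3 image
./start      # run spark-hiveacid-test-container, which executes /bootstrap.sh
sleep 60     # assumed wait: give bootstrap.sh time to start HDFS, HMS and HiveServer2
./inspect    # print container name, running state, IP and port mappings
./beeline    # connect to HiveServer2 (jdbc:hive2://0.0.0.0:10001) to create test tables
./stop       # kill and remove the container when done
```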
/docker/files/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eg='\033[0;32m' 4 | enc='\033[0m' 5 | echoe () { 6 | OIFS=${IFS} 7 | IFS='%' 8 | echo -e $@ 9 | IFS=${OIFS} 10 | } 11 | 12 | gprn() { 13 | echoe "${eg} >> ${1}${enc}" 14 | } 15 | 16 | 17 | ## Setup ENV variables 18 | 19 | export JAVA_HOME="/usr/lib/jvm/java-openjdk" 20 | 21 | export HDFS_NAMENODE_USER="root" 22 | export HDFS_SECONDARYNAMENODE_USER="root" 23 | export HDFS_DATANODE_USER="root" 24 | export YARN_RESOURCEMANAGER_USER="root" 25 | export YARN_NODEMANAGER_USER="root" 26 | 27 | export HADOOP_HOME="/hadoop" 28 | export HADOOP_ROOT_LOGGER=DEBUG 29 | export HADOOP_COMMON_LIB_NATIVE_DIR="/hadoop/lib/native" 30 | 31 | ## Add it to bashrc for starting hadoop 32 | echo 'export JAVA_HOME="/usr/lib/jvm/java-openjdk"' >> ~/.bashrc 33 | echo 'export HADOOP_HOME="/hadoop"' >> ~/.bashrc 34 | 35 | 36 | rm /hadoop 37 | ln -sf /hadoop-3.1.1 /hadoop 38 | 39 | cp /conf/core-site.xml /hadoop/etc/hadoop 40 | cp /conf/hdfs-site.xml /hadoop/etc/hadoop 41 | cp /conf/hadoop-env.sh /hadoop/etc/hadoop 42 | cp /conf/mapred-site.xml /hadoop/etc/hadoop 43 | cp /conf/yarn-site.xml /hadoop/etc/hadoop 44 | cp /conf/hive-site.xml /hive/conf/ 45 | 46 | 47 | gprn "set up mysql" 48 | service mysqld start 49 | 50 | # Set root password 51 | mysql -uroot -e "set password = PASSWORD('root');" 52 | mysql -uroot -e "grant all privileges on *.* to 'root'@'%' identified by 'root';" 53 | service sshd start 54 | 55 | gprn "start yarn" 56 | hadoop/sbin/start-yarn.sh & 57 | sleep 5 58 | 59 | gprn "Formatting name node" 60 | hadoop/bin/hdfs namenode -format 61 | 62 | gprn "Start hdfs" 63 | hadoop/sbin/start-dfs.sh 64 | 65 | jps 66 | 67 | mkdir -p /hive/warehouse 68 | 69 | 70 | gprn "Set up metastore DB" 71 | hive/bin/schematool -dbType mysql -initSchemaTo 3.1.0 72 | 73 | gprn "Start HMS server" 74 | hive/bin/hive --service metastore -p 10000 & 75 | 76 | gprn "Sleep and wait for HMS to be up and running" 77 | sleep 20 78 | 79 | gprn "Start HiveServer2" 80 | hive/bin/hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10001 --hiveconf hive.execution.engine=mr 81 | -------------------------------------------------------------------------------- /docker/files/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 35 | 36 | 37 | 38 | 39 | fs.defaultFS 40 | hdfs://0.0.0.0:9000 41 | 42 | 43 | 44 | hadoop.proxyuser.root.hosts 45 | * 46 | 47 | 48 | 49 | hadoop.proxyuser.root.groups 50 | * 51 | 52 | 53 | -------------------------------------------------------------------------------- /docker/files/hadoop-env.sh: -------------------------------------------------------------------------------- 1 | # The maximum amount of heap to use, in MB. Default is 1000. 2 | export HADOOP_HEAPSIZE=1024 3 | 4 | # Extra Java runtime options. Empty by default.
5 | export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Xmx512m" 6 | export YARN_OPTS="$YARN_OPTS -Xmx256m" 7 | -------------------------------------------------------------------------------- /docker/files/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 34 | 35 | 36 | 37 | 38 | dfs.replication 39 | 1 40 | 41 | 42 | dfs.permissions.enabled 43 | false 44 | 45 | 46 | dfs.datanode.address 47 | 0.0.0.0:9866 48 | 49 | 53 | 54 | dfs.client.datanode-restart.timeout 55 | 30 56 | 57 | 58 | -------------------------------------------------------------------------------- /docker/files/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | javax.jdo.option.ConnectionURL 26 | jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true 27 | 28 | 29 | javax.jdo.option.ConnectionDriverName 30 | com.mysql.jdbc.Driver 31 | 32 | 33 | javax.jdo.option.ConnectionUserName 34 | root 35 | 36 | 37 | javax.jdo.option.ConnectionPassword 38 | root 39 | 40 | 41 | 42 | 43 | hive.metastore.uris 44 | thrift://0.0.0.0:10000 45 | 46 | 47 | 48 | hive.metastore.event.db.notification.api.auth 49 | false 50 | 51 | 52 | 53 | 54 | hive.support.concurrency 55 | true 56 | 57 | 58 | 59 | hive.exec.dynamic.partition.mode 60 | nonstrict 61 | 62 | 63 | 64 | hive.compactor.initiator.on 65 | true 66 | 67 | 68 | 69 | hive.txn.manager 70 | org.apache.hadoop.hive.ql.lockmgr.DbTxnManager> 71 | 72 | 73 | 74 | 75 | hive.server2.thrift.http.port 76 | 10001 77 | 78 | 79 | 80 | hive.execution.engine 81 | mr 82 | 83 | 84 | 85 | hive.input.format 86 | org.apache.hadoop.hive.ql.io.HiveInputFormat 87 | 88 | 89 | 90 | 91 | hive.auto.convert.join 92 | false 93 | 94 | 95 | 96 | hive.stats.autogather 97 | false 98 | 99 | 100 | 101 | hive.metastore.client.capability.check 102 | false 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docker/files/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 34 | 35 | 36 | 37 | 38 | mapreduce.framework.name 39 | yarn 40 | 41 | 42 | yarn.app.mapreduce.am.env 43 | HADOOP_MAPRED_HOME=${HADOOP_HOME} 44 | 45 | 46 | mapreduce.map.env 47 | HADOOP_MAPRED_HOME=${HADOOP_HOME} 48 | 49 | 50 | mapreduce.reduce.env 51 | HADOOP_MAPRED_HOME=${HADOOP_HOME} 52 | 53 | 54 | mapreduce.map.memory.mb 55 | 2048 56 | 57 | 58 | mapreduce.reduce.memory.mb 59 | 2048 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /docker/files/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 21 | 34 | 35 | 36 | yarn.nodemanager.aux-services 37 | mapreduce_shuffle 38 | 39 | 40 | yarn.nodemanager.aux-services.mapreduce_shuffle.class 41 | org.apache.hadoop.mapred.ShuffleHandler 42 | 43 | 44 | -------------------------------------------------------------------------------- /docker/inspect: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | prn_row() { 3 | printf '%-32s | %-10s | %-16s | %-128s\n' "${1}" "${2}" "${3}" "${4}" 4 | } 5 | prn_row "DOCKER_NAME" "RUNNING" "IP" "PORT_MAPPING" 6 | id=spark-hiveacid-test-container 7 | NAME=`docker inspect --format='{{.Name}}' $id` 8 | RUNNING=`docker inspect --format='{{.State.Running}}' $id` 9 | IP=`docker inspect --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $id` 10 | 
PORT_MAPPING=`docker inspect --format='{{range $p, $conf := .NetworkSettings.Ports}} {{$p}} -> {{(index $conf 0).HostPort}} {{end}}' $id | sed -e 's/\/tcp//g'` 11 | prn_row "$NAME" "${RUNNING}" "${IP}" "${PORT_MAPPING}" 12 | -------------------------------------------------------------------------------- /docker/login: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker exec -it spark-hiveacid-test-container /bin/bash 3 | -------------------------------------------------------------------------------- /docker/spark-shell: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -z ${2} ] 3 | then 4 | echo "Specify the spark-acid jar location" 5 | echo "spark-shell ~/codeline/TOT ~/codeline/TOT/acid-ds/target/scala-2.11/spark-acid-qds-assembly-0.4.3.jar" 6 | exit 7 | fi 8 | if [ -z ${1} ] 9 | then 10 | echo "Specify the spark code base directory" 11 | echo "spark-shell ~/codeline/TOT ~/codeline/TOT/acid-ds/target/scala-2.11/spark-acid-qds-assembly-0.4.3.jar" 12 | exit 13 | fi 14 | 15 | shellenv() { 16 | export QENV_LOCAL_CODELINE="${1}" 17 | export QENV_LOCAL_CONF="${QENV_LOCAL}/conf" 18 | export HADOOP_SRC="${QENV_LOCAL_CODELINE}/hadoop2" 19 | export SPARK_SRC="${QENV_LOCAL_CODELINE}/spark" 20 | export HUSTLER_SRC="${QENV_LOCAL_CODELINE}/hustler" 21 | export HIVE_SRC="${QENV_LOCAL_CODELINE}/hive" 22 | export ZEPPELIN_SRC="${QENV_LOCAL_CODELINE}/zeppelin" 23 | } 24 | 25 | hsnapshot() { 26 | HADOOP_SNAPSHOT=`ls ${HADOOP_SRC}/hadoop-dist/target/hadoop* | grep SNAPSHOT: | cut -d':' -f1` 27 | } 28 | 29 | hivesnapshot() { 30 | loc=`ls ${HIVE_SRC}/packaging/target/apache-hive* |grep bin |grep -v ':'` 31 | HIVE_SNAPSHOT=${HIVE_SRC}/packaging/target/${loc}/${loc}/ 32 | } 33 | 34 | run_spark_shelllocal() { 35 | 36 | # Write setup into the spark-env file. Run spark-shell after it.
37 | echo "Update Spark Conf based on Hadoop Build Version --> ${SPARK_SRC}/conf/spark-env.sh" 38 | hsnapshot 39 | hivesnapshot 40 | 41 | str="export SPARK_YARN_USER_ENV=CLASSPATH=${QENV_LOCAL_CONF}/" 42 | echo ${str} > ${SPARK_SRC}/conf/spark-env.sh 43 | 44 | if [ -n "${HADOOP_SNAPSHOT}" ] 45 | then 46 | 47 | str="export SPARK_DIST_CLASSPATH=${QENV_LOCAL_CONF}/:${HADOOP_SNAPSHOT}/share/hadoop/common/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/common/*:${HADOOP_SNAPSHOT}/share/hadoop/hdfs:${HADOOP_SNAPSHOT}/share/hadoop/hdfs/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/hdfs/*:${HADOOP_SNAPSHOT}/share/hadoop/yarn/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/yarn/*:${HADOOP_SNAPSHOT}/share/hadoop/mapreduce/*:/share/hadoop/tools:${HADOOP_SNAPSHOT}/share/hadoop/tools/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/tools/*:/share/hadoop/qubole:${HADOOP_SNAPSHOT}/share/hadoop/qubole/*" 48 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh 49 | fi 50 | 51 | if [ -n "${HIVE_SNAPSHOT}" ] 52 | then 53 | str="export SPARK_DIST_CLASSPATH=\${SPARK_DIST_CLASSPATH}:${HIVE_SNAPSHOT}/lib/*" 54 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh 55 | fi 56 | 57 | str="export HADOOP_CONF_DIR=${QENV_LOCAL_CONF}/" 58 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh 59 | 60 | $SPARK_SRC/bin/spark-shell $@ 61 | } 62 | 63 | 64 | shellenv ${1} 65 | shift 66 | run_spark_shelllocal --jars $@ --conf spark.sql.extensions=com.qubole.spark.datasources.hiveacid.HiveAcidAutoConvertExtension --conf spark.hadoop.hive.metastore.uris=thrift://localhost:10000 --conf spark.sql.catalogImplementation=hive 67 | -------------------------------------------------------------------------------- /docker/start: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | name="spark-hiveacid-test-container" 4 | 5 | RUNNING=`docker inspect --format "{{ .State.Running}}" ${name} 2>/dev/null` 6 | if [[ $? -eq 0 ]] 7 | then 8 | if [[ "${RUNNING}" == "true" ]] 9 | then 10 | echo "$name already running" 11 | exit 12 | fi 13 | else 14 | docker run --name ${name} --hostname localhost -P -p9866:9866 -p10000:10000 -p10001:10001 -p9000:9000 -p3306:3306 -p50070:50070 -p50030:50030 -it -d centos6/spark-hadoop3-hive3 /bin/bash -c "/bootstrap.sh >/tmp/boostrap.log" 15 | fi 16 | 17 | -------------------------------------------------------------------------------- /docker/stop: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | name="spark-hiveacid-test-container" 4 | docker kill ${name} 5 | docker rm ${name} 6 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 Qubole, Inc. All rights reserved. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | sbt.version = 0.13.16 21 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | 2 | resolvers += "spark-packages" at sys.props.getOrElse("spark.repo", "https://repos.spark-packages.org/") 3 | 4 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6") 5 | 6 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11") 7 | 8 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") 9 | 10 | addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4") 11 | 12 | addSbtPlugin("com.simplytyped" % "sbt-antlr4" % "0.7.13") 13 | 14 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2") 15 | -------------------------------------------------------------------------------- /shaded-dependencies/build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-acid-shaded-dependencies" 2 | 3 | version := sys.props.getOrElse("package.version", "0.1") 4 | 5 | organization:= "com.qubole" 6 | 7 | scalaVersion := "2.11.12" 8 | 9 | scalacOptions ++= Seq( 10 | "-Xlint", 11 | "-Xfatal-warnings", 12 | "-deprecation", 13 | "-unchecked", 14 | "-optimise", 15 | "-Yinline-warnings" 16 | ) 17 | 18 | scalacOptions in (Compile, doc) ++= Seq( 19 | "-no-link-warnings" // Suppresses problems with Scaladoc @throws links 20 | ) 21 | 22 | // do not run test at assembly 23 | test in assembly := {} 24 | 25 | publishArtifact in (Compile, packageDoc) := false 26 | 27 | publishArtifact in (Compile, packageSrc) := false 28 | 29 | publishArtifact in (Compile, packageBin) := false 30 | 31 | val hive_version = sys.props.getOrElse("hive.version", "3.1.2") 32 | 33 | val orc_version = sys.props.getOrElse("orc.version", "1.5.6") 34 | 35 | resolvers += "Additional Maven Repository" at sys.props.getOrElse("hive.repo", "https://repo1.maven.org/maven2/") 36 | 37 | // Shaded dependency 38 | libraryDependencies ++= Seq( 39 | // Hive/Orc core dependencies packed. 40 | "org.apache.hive" % "hive-metastore" % hive_version intransitive(), 41 | "org.apache.hive" % "hive-exec" % hive_version intransitive(), 42 | "org.apache.orc" % "orc-core" % orc_version intransitive(), 43 | "org.apache.orc" % "orc-mapreduce" % orc_version intransitive(), 44 | 45 | // Only for hive3 client in tests.. but packing it in shaded jars. 46 | "org.apache.hive" % "hive-jdbc" % hive_version intransitive(), 47 | "org.apache.hive" % "hive-service" % hive_version intransitive(), 48 | "org.apache.hive" % "hive-serde" % hive_version intransitive(), 49 | "org.apache.hive" % "hive-common" % hive_version intransitive(), 50 | 51 | // To deal with hive3 metastore library 0.9.3 vs zeppelin thirft 52 | // library version 0.9.1 conflict when runing Notebooks. 
53 | "org.apache.thrift" % "libfb303" % "0.9.3", 54 | "org.apache.thrift" % "libthrift" % "0.9.3" 55 | ) 56 | 57 | 58 | assemblyShadeRules in assembly := Seq( 59 | ShadeRule.rename("org.apache.hadoop.hive.**" -> "com.qubole.shaded.hadoop.hive.@1").inAll, 60 | ShadeRule.rename("org.apache.hive.**" -> "com.qubole.shaded.hive.@1").inAll, 61 | ShadeRule.rename("org.apache.orc.**" -> "com.qubole.shaded.orc.@1").inAll, 62 | ShadeRule.rename("org.apache.commons.**" -> "com.qubole.shaded.commons.@1").inAll, 63 | ShadeRule.rename("org.apache.avro.**" -> "com.qubole.shaded.avro.@1").inAll, 64 | ShadeRule.rename("org.apache.parquet.**" -> "com.qubole.shaded.parquet.@1").inAll, 65 | ShadeRule.rename("org.apache.http.**" -> "com.qubole.shaded.http.@1").inAll, 66 | ShadeRule.rename("org.apache.tez.**" -> "com.qubole.shaded.tez.@1").inAll, 67 | 68 | ShadeRule.rename("com.google.**" -> "com.qubole.shaded.@1").inAll, 69 | ShadeRule.rename("com.facebook.fb303.**" -> "com.qubole.shaded.facebook.fb303.@1").inAll, 70 | ShadeRule.rename("org.apache.thrift.**" -> "com.qubole.shaded.thrift.@1").inAll, 71 | 72 | ShadeRule.rename("org.codehaus.jackson.**" -> "com.qubole.shaded.jackson.@1").inAll, 73 | ShadeRule.rename("org.joda.**" -> "com.qubole.shaded.joda.@1").inAll, 74 | ShadeRule.rename("org.json.**" -> "com.qubole.shaded.json.@1").inAll, 75 | 76 | ShadeRule.rename("jodd.**" -> "com.qubole.shaded.jodd.@1").inAll, 77 | ShadeRule.rename("javaewah.**" -> "com.qubole.shaded.javaewah.@1").inAll, 78 | ShadeRule.rename("io.airlift.**" -> "com.qubole.shaded.io.airlift.@1").inAll, 79 | 80 | ShadeRule.rename("org.openx.data.**" -> "com.qubole.shaded.openx.data.@1").inAll, 81 | ShadeRule.rename("au.com.bytecode.opencsv.**" -> "com.qubole.shaded.au.com.bytecode.opencsv.@1").inAll, 82 | ShadeRule.rename("com.readytalk.metrics.**" -> "com.qubole.shaded.readytalk.metrics.@1").inAll 83 | ) 84 | 85 | import sbtassembly.AssemblyPlugin.autoImport.{ ShadeRule} 86 | import sbtassembly.MergeStrategy 87 | val distinctAndReplace: sbtassembly.MergeStrategy = new sbtassembly.MergeStrategy { 88 | val name = "distinctAndReplace" 89 | def apply(tempDir: File, path: String, files: Seq[File]): Either[String, Seq[(File, String)]] = { 90 | val lines = files flatMap (IO.readLines(_, IO.utf8)) 91 | val unique = lines.distinct 92 | val replaced = unique.map { x => x.replace("org.apache.hadoop.hive", "com.qubole.shaded.hadoop.hive")} 93 | val file = sbtassembly.MergeStrategy.createMergeTarget(tempDir, path) 94 | IO.writeLines(file, replaced, IO.utf8) 95 | Right(Seq(file -> path)) 96 | } 97 | } 98 | 99 | 100 | assemblyMergeStrategy in assembly := { 101 | // all discarded classes first 102 | case PathList("javax", xs @ _*) => MergeStrategy.discard 103 | case PathList("javolution", xs @_*) => MergeStrategy.discard 104 | // discard non shaded classes in hadoop and qubole packages 105 | case PathList("org", "apache", "hadoop", xs @_*) => MergeStrategy.discard 106 | case PathList("org", "apache", "log4j", xs @ _*) => MergeStrategy.last 107 | case PathList("com", "google", xs @ _*) => MergeStrategy.last 108 | case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last 109 | case PathList("com", "codahale", xs @ _*) => MergeStrategy.last 110 | case PathList("com", "yammer", xs @ _*) => MergeStrategy.last 111 | case PathList("org","aopalliance", xs @ _*) => MergeStrategy.last 112 | case PathList("com","zaxxer", xs @ _*) => MergeStrategy.last 113 | case PathList("org","apache", "logging", "log4j", xs @ _*) => MergeStrategy.last 114 | case 
PathList("io","netty", xs @ _*) => MergeStrategy.last 115 | case PathList("org","datanucleus", xs @ _*) => MergeStrategy.last 116 | case PathList("org", "apache", "arrow", xs @ _*) => MergeStrategy.last 117 | case PathList("org", "apache", "commons", "lang3", xs @ _*) => MergeStrategy.last 118 | case PathList("org", "apache", "commons", "lang3", "builder", xs @ _*) => MergeStrategy.last 119 | case PathList("org", "apache", "commons", "lang3", "concurrent", xs @ _*) => MergeStrategy.last 120 | case PathList("org", "apache", "commons", "lang3", "event", xs @ _*) => MergeStrategy.last 121 | case PathList("org", "apache", "commons", "lang3", "exception", xs @ _*) => MergeStrategy.last 122 | case PathList("org", "apache", "commons", "lang3", "math", xs @ _*) => MergeStrategy.last 123 | case PathList("org", "apache", "commons", "lang3", "mutable", xs @ _*) => MergeStrategy.last 124 | case PathList("org", "apache", "commons", "lang3", "reflect", xs @ _*) => MergeStrategy.last 125 | case PathList("org", "apache", "commons", "lang3", "text", xs @ _*) => MergeStrategy.last 126 | case PathList("org", "apache", "commons", "lang3", "time", xs @ _*) => MergeStrategy.last 127 | case PathList("org", "apache", "commons", "lang3", "tuple", xs @ _*) => MergeStrategy.last 128 | case PathList("com", "qubole", "shaded", "orc", xs @ _*) => MergeStrategy.last 129 | case PathList("org", "slf4j", "impl", xs @ _*) => MergeStrategy.last 130 | case PathList("org", "slf4j", "helpers", xs @ _*) => MergeStrategy.last 131 | case PathList("org", "slf4j", xs @ _*) => MergeStrategy.last 132 | 133 | // discard package.jdo because objects defined inside it are not shaded. 134 | // So removing for now 135 | case "package.jdo" => MergeStrategy.discard 136 | 137 | case PathList("META-INF", "services", xs @ _*) => distinctAndReplace 138 | // case "about.html" => MergeStrategy.rename 139 | case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last 140 | case "META-INF/mailcap" => MergeStrategy.last 141 | case "META-INF/mimetypes.default" => MergeStrategy.last 142 | case "plugin.properties" => MergeStrategy.last 143 | case "log4j.properties" => MergeStrategy.last 144 | case "Log4j2Plugins.dat" => MergeStrategy.last 145 | case "git.properties" => MergeStrategy.last 146 | case "plugin.xml" => MergeStrategy.last 147 | case "META-INF/io.netty.versions.properties" => MergeStrategy.last 148 | case "META-INF/org/apache/logging/log4j/core/config/plugins/Log4j2Plugins.dat" => MergeStrategy.last 149 | case "codegen/config.fmpp" => MergeStrategy.first 150 | 151 | case x => 152 | val oldStrategy = (assemblyMergeStrategy in assembly).value 153 | oldStrategy(x) 154 | } 155 | 156 | // do not add scala in fat jar 157 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 158 | 159 | // For publishing assembly locally 160 | publishMavenStyle := false 161 | 162 | artifact in (Compile, assembly) := { 163 | val art = (artifact in (Compile, assembly)).value 164 | art.withClassifier(None) 165 | } 166 | 167 | addArtifact(artifact in (Compile, assembly), assembly) 168 | 169 | -------------------------------------------------------------------------------- /shaded-dependencies/project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 Qubole, Inc. All rights reserved. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. 
See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | sbt.version = 1.2.8 21 | -------------------------------------------------------------------------------- /shaded-dependencies/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") 2 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2") 3 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/LockSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.qubole.spark.hiveacid 19 | 20 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 21 | import com.qubole.spark.hiveacid.transaction.HiveAcidTxn 22 | import org.apache.log4j.{Level, LogManager, Logger} 23 | import org.apache.spark.sql.SparkSession 24 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} 25 | 26 | import scala.util.control.NonFatal 27 | 28 | class LockSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll { 29 | val log: Logger = LogManager.getLogger(this.getClass) 30 | log.setLevel(Level.INFO) 31 | 32 | var helper: TestHelper = _ 33 | val isDebug = true 34 | 35 | val DEFAULT_DBNAME = "HiveTestLockDB" 36 | val cols: Map[String, String] = Map( 37 | ("intCol","int"), 38 | ("doubleCol","double"), 39 | ("floatCol","float"), 40 | ("booleanCol","boolean") 41 | ) 42 | val partitionedTable = new Table(DEFAULT_DBNAME, "partitioned", 43 | cols, Table.orcPartitionedFullACIDTable, true) 44 | val normalTable = new Table(DEFAULT_DBNAME, "nonPartitioned", 45 | cols, Table.orcFullACIDTable, false) 46 | 47 | override def beforeAll() { 48 | try { 49 | helper = new TestLockHelper 50 | if (isDebug) { 51 | log.setLevel(Level.DEBUG) 52 | } 53 | helper.init(isDebug) 54 | 55 | // DB 56 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 57 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME) 58 | helper.recreate(partitionedTable) 59 | helper.recreate(normalTable) 60 | helper.hiveExecute(partitionedTable.insertIntoHiveTableKeyRange(11, 25)) 61 | } catch { 62 | case NonFatal(e) => log.info("failed " + e) 63 | } 64 | } 65 | 66 | override protected def afterAll(): Unit = { 67 | helper.hiveExecute(s"DROP TABLE IF EXISTS ${normalTable.hiveTname}") 68 | helper.hiveExecute(s"DROP TABLE IF EXISTS ${partitionedTable.hiveTname}") 69 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 70 | helper.destroy() 71 | } 72 | 73 | case class TestLockOperation(whichTransaction: Int, 74 | operationType: HiveAcidOperation.OperationType, 75 | partition: Seq[String], 76 | willFail: Boolean = false) 77 | 78 | test("test lock wait timeout exception") { 79 | val lockOps = Seq( 80 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass 81 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()), // similar operation on first trans will pass 82 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq(), true)) // second transaction will wait and fail in 100ms 83 | testLockOps(lockOps) 84 | } 85 | 86 | test("test locks within same transaction is allowed") { 87 | val lockOps = Seq( 88 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass 89 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()), // similar operation on first trans will pass 90 | TestLockOperation(1, HiveAcidOperation.READ, Seq()), // READ on same transaction will pass 91 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, Seq())) 92 | testLockOps(lockOps) 93 | } 94 | 95 | test("test READ after UPDATE/DELETE is allowed") { 96 | val lockOps = Seq( 97 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass 98 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()), 99 | TestLockOperation(2, HiveAcidOperation.READ, Seq())) // second transaction READ need not wait 100 | testLockOps(lockOps) 101 | } 102 | 103 | test("test DELETE/READ after INSERT OVERWRITE is not allowed") { 104 | val lockOps = Seq( 105 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, 
Seq()), 106 | TestLockOperation(2, HiveAcidOperation.UPDATE, Seq(), true), 107 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq(), true), 108 | TestLockOperation(2, HiveAcidOperation.READ, Seq(), true)) 109 | testLockOps(lockOps) 110 | } 111 | 112 | test("test INSERT_OVERWRITE and DELETE/UPDATE/READ on different partition is allowed") { 113 | val lockOps = Seq( 114 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, Seq("ptnCol=0")), 115 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq("ptnCol=1")), 116 | TestLockOperation(2, HiveAcidOperation.UPDATE, Seq("ptnCol=1")), 117 | TestLockOperation(2, HiveAcidOperation.READ, Seq("ptnCol=1"))) 118 | testLockOps(lockOps) 119 | } 120 | 121 | def testLockOps(lockOps: Seq[TestLockOperation]): Unit = { 122 | val tableName = DEFAULT_DBNAME + "." + "nonPartitioned" 123 | val hiveAcidMetadata = HiveAcidMetadata.fromSparkSession(helper.spark, 124 | tableName) 125 | 126 | // Just try 2 attempts for lock acquisition and fail if it cannot. 127 | helper.spark.sessionState.conf.setConfString("spark.hiveAcid.lock.max.retries", "2") 128 | val sparkConf = SparkAcidConf(helper.spark, Map()) 129 | val hTxn1 = new HiveAcidTxn(helper.spark) 130 | val hTxn2 = new HiveAcidTxn(helper.spark) 131 | 132 | def executeOp(lockOp: TestLockOperation) { 133 | val txn = lockOp.whichTransaction match { 134 | case 1 => hTxn1 135 | case 2 => hTxn2 136 | case _ => throw new IllegalArgumentException("Only 1 or 2 are supported for whichTransaction field") 137 | } 138 | if (lockOp.willFail) { 139 | val thrown = intercept[RuntimeException] { 140 | txn.acquireLocks(hiveAcidMetadata, lockOp.operationType, lockOp.partition, sparkConf) 141 | } 142 | assert(thrown.getMessage.contains("Could not acquire lock. Lock State: WAITING")) 143 | } else { 144 | txn.acquireLocks(hiveAcidMetadata, lockOp.operationType, lockOp.partition, sparkConf) 145 | } 146 | 147 | } 148 | 149 | try { 150 | hTxn1.begin() 151 | hTxn2.begin() 152 | lockOps.foreach(executeOp(_)) 153 | } finally { 154 | helper.spark.sessionState.conf.unsetConf("spark.hiveAcid.lock.max.retries") 155 | hTxn1.end(true) 156 | hTxn2.end(true) 157 | } 158 | } 159 | 160 | test("test HeartBeatRunner is running") { 161 | val hTxn1 = new HiveAcidTxn(helper.spark) 162 | hTxn1.begin() 163 | // Sleep for 4 seconds 164 | Thread.sleep(4 * 1000) 165 | val txn = HiveAcidTxn.txnManager.showOpenTrans().find(ti => ti.getId == hTxn1.txnId) 166 | assert(txn.isDefined, "Transaction is expected to be open") 167 | val seconds = (txn.get.getLastHeartbeatTime() - txn.get.getStartedTime()) / 1000 168 | assert(seconds >= 2, "getLastHeartBeatTime should " + 169 | "be at least 2 seconds after transaction was opened") 170 | hTxn1.end(true) 171 | } 172 | } 173 | 174 | class TestLockHelper extends TestHelper { 175 | // Create spark session with txn timeout config as that needs to be set 176 | // before the start of spark session 177 | override def getSparkSession(): SparkSession = { 178 | SparkSession.builder().appName("Hive-acid-test") 179 | .master("local[*]") 180 | .config("spark.hadoop.hive.metastore.uris", "thrift://0.0.0.0:10000") 181 | .config("spark.sql.warehouse.dir", "/tmp") 182 | .config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension") 183 | .config("spark.hadoop.hive.txn.timeout", "6") 184 | //.config("spark.ui.enabled", "true") 185 | //.config("spark.ui.port", "4041") 186 | .enableHiveSupport() 187 | .getOrCreate() 188 | } 189 | } -------------------------------------------------------------------------------- 
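The suites above drive transactions and locks directly through HiveAcidTxn. For context, a minimal sketch of how a SparkSession configured like TestLockHelper.getSparkSession above can read an ACID table through this module's datasource is shown below. The table name default.acidtbl and the "table" option key are illustrative assumptions; the fully qualified format name mirrors datasource/HiveAcidDataSource.scala from the source tree.

```scala
// Sketch only (not part of the test suites): reading a Hive ACID table through
// the datasource shipped in this module, given a SparkSession created with
// com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension as in TestLockHelper.
import org.apache.spark.sql.{DataFrame, SparkSession}

object DatasourceReadSketch {
  // Explicit datasource path; the "table" option key and table name are assumptions.
  def readViaDatasource(spark: SparkSession): DataFrame =
    spark.read
      .format("com.qubole.spark.hiveacid.datasource.HiveAcidDataSource")
      .option("table", "default.acidtbl")
      .load()

  // SQL path; relies on the auto-convert extension registered in the session.
  def readViaSql(spark: SparkSession): DataFrame =
    spark.sql("SELECT * FROM default.acidtbl")
}
```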
/src/it/scala/com/qubole/spark/hiveacid/TestHiveClient.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid; 21 | 22 | 23 | import java.sql.Connection; 24 | import java.sql.DriverManager; 25 | import java.sql.ResultSet; 26 | import java.sql.ResultSetMetaData; 27 | import java.sql.SQLException; 28 | import java.sql.Statement; 29 | 30 | import java.io.StringWriter; 31 | 32 | public class TestHiveClient { 33 | private static Connection con = null; 34 | private static Statement stmt = null; 35 | 36 | TestHiveClient() { 37 | try { 38 | // Before running this docker container with HS2 / HMS / Hadoop running 39 | String driverName = "com.qubole.shaded.hive.jdbc.HiveDriver"; 40 | Class.forName(driverName); 41 | } catch (ClassNotFoundException e) { 42 | e.printStackTrace(); 43 | System.exit(1); 44 | } 45 | try { 46 | con = DriverManager.getConnection("jdbc:hive2://0.0.0.0:10001?allowMultiQueries=true", "root", "root"); 47 | stmt = con.createStatement(); 48 | } 49 | catch (Exception e) { 50 | System.out.println("Failed to create statement "+ e); 51 | } 52 | } 53 | 54 | public String executeQuery(String cmd) throws Exception { 55 | // Start Hive txn 56 | ResultSet rs = null; 57 | String resStr = null; 58 | try { 59 | rs = stmt.executeQuery(cmd); 60 | resStr = resultStr(rs); 61 | // close hive txn 62 | rs.close(); 63 | rs = null; 64 | 65 | } catch (Exception e) { 66 | System.out.println("Failed execute query statement \""+ cmd +"\" Error:"+ e); 67 | if (rs != null ) { 68 | rs.close(); 69 | } 70 | } 71 | return resStr; 72 | } 73 | 74 | public void execute(String cmd) throws SQLException { 75 | try { 76 | stmt.execute(cmd); 77 | } catch (Exception e) { 78 | System.out.println("Failed execute statement \""+ cmd +"\" Error:"+ e); 79 | } 80 | } 81 | 82 | private String resultStr(ResultSet rs) throws SQLException { 83 | StringWriter outputWriter = new StringWriter(); 84 | ResultSetMetaData rsmd = rs.getMetaData(); 85 | int columnsNumber = rsmd.getColumnCount(); 86 | int rowNumber = 0; 87 | while (rs.next()) { 88 | if (rowNumber != 0) { 89 | outputWriter.append("\n"); 90 | } 91 | rowNumber++; 92 | for (int i = 1; i <= columnsNumber; i++) { 93 | if (i > 1) outputWriter.append(","); 94 | String columnValue = rs.getString(i); 95 | // outputWriter.append(rsmd.getColumnName(i)+ "=" + columnValue); 96 | outputWriter.append(columnValue); 97 | } 98 | } 99 | return outputWriter.toString(); 100 | } 101 | 102 | public void teardown() throws SQLException { 103 | if (stmt != null) { 104 | stmt.close(); 105 | stmt = null; 106 | } 107 | if (con != 
null) { 108 | con.close(); 109 | con = null; 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/TestSparkSession.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package com.qubole.spark.hiveacid 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | private[hiveacid] object TestSparkSession { 24 | 25 | def getSession: SparkSession = { 26 | val spark: SparkSession = SparkSession.builder().appName("Hive-acid-test") 27 | .master("local[*]") 28 | .config("spark.hadoop.hive.metastore.uris", "thrift://0.0.0.0:10000") 29 | .config("spark.sql.warehouse.dir", "/tmp") 30 | .config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension") 31 | //.config("spark.ui.enabled", "true") 32 | //.config("spark.ui.port", "4041") 33 | .enableHiveSupport() 34 | .getOrCreate() 35 | spark.sparkContext.setLogLevel("WARN") 36 | spark 37 | } 38 | 39 | def close(spark: SparkSession): Unit = { 40 | spark.close() 41 | SparkSession.clearActiveSession() 42 | SparkSession.clearDefaultSession() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/UpdateDeleteSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | 23 | import org.apache.log4j.{Level, LogManager, Logger} 24 | import org.scalatest._ 25 | 26 | import scala.util.control.NonFatal 27 | 28 | class UpdateDeleteSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll { 29 | 30 | val log: Logger = LogManager.getLogger(this.getClass) 31 | log.setLevel(Level.INFO) 32 | 33 | var helper: TestHelper = _ 34 | val isDebug = true 35 | 36 | val DEFAULT_DBNAME = "HiveTestUpdateDeleteDB" 37 | val cols: Map[String, String] = Map( 38 | ("intCol","int"), 39 | ("doubleCol","double"), 40 | ("floatCol","float"), 41 | ("booleanCol","boolean") 42 | ) 43 | 44 | override def beforeAll() { 45 | try { 46 | helper = new TestHelper 47 | if (isDebug) { 48 | log.setLevel(Level.DEBUG) 49 | } 50 | helper.init(isDebug) 51 | 52 | // DB 53 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 54 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME) 55 | } catch { 56 | case NonFatal(e) => log.info("failed " + e) 57 | } 58 | } 59 | 60 | override protected def afterAll(): Unit = { 61 | helper.destroy() 62 | } 63 | 64 | val testTables = List( 65 | // Positive Test 66 | (Table.orcFullACIDTable, false, true), 67 | (Table.orcPartitionedFullACIDTable, true, true), 68 | // Negative Test 69 | (Table.orcTable, false, false), 70 | (Table.orcPartitionedTable, true, false), 71 | (Table.orcBucketedTable, false, false), (Table.orcBucketedPartitionedTable, true, false)) 72 | // Test Run 73 | updateTestForFullAcidTables(testTables) 74 | deleteTestForFullAcidTables(testTables) 75 | 76 | // Update test for full acid tables 77 | def updateTestForFullAcidTables(tTypes: List[(String, Boolean, Boolean)]): Unit = { 78 | tTypes.foreach { case (tType, isPartitioned, positiveTest) => 79 | val tableNameSpark = "tSparkUpdate" 80 | val testName = s"Update Test for $tableNameSpark type $tType" 81 | test(testName) { 82 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned) 83 | def code(): Unit = { 84 | 85 | if (positiveTest) { 86 | helper.recreate(tableSpark) 87 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(11, 20)) 88 | val expectedRows = 10 89 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 90 | val expectedUpdateValue = helper.sparkCollect(tableSpark.selectExpectedUpdateCol(11)) 91 | helper.sparkSQL(tableSpark.updateInHiveTableKey(11)) 92 | val updatedVal = helper.sparkCollect(tableSpark.selectUpdateCol(11)) 93 | helper.compareResult(expectedUpdateValue, updatedVal) 94 | } else { 95 | intercept[RuntimeException] { 96 | helper.recreate(tableSpark) 97 | helper.sparkSQL(tableSpark.updateInHiveTableKey(11)) 98 | } 99 | } 100 | } 101 | helper.myRun(testName, code) 102 | } 103 | } 104 | } 105 | 106 | // Delete test for full acid tables 107 | def deleteTestForFullAcidTables(tTypes: List[(String, Boolean, Boolean)]): Unit = { 108 | tTypes.foreach { case (tType, isPartitioned, positiveTest) => 109 | val tableNameSpark = "tSparkDelete" 110 | val testName = s"Delete Test for $tableNameSpark type $tType" 111 | test(testName) { 112 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned) 113 | def code(): Unit = { 114 | if (positiveTest) { 115 | helper.recreate(tableSpark) 116 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(11, 20)) 117 | var expectedRows = 10 118 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 119 | 120 | // delete 1 row 121 | 
helper.sparkSQL(tableSpark.deleteFromHiveTableKey(11)) 122 | expectedRows = 9 123 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 124 | 125 | // Delete all but 1 using predicates 126 | helper.sparkSQL(tableSpark.deleteFromHiveTableGreaterThanKey(15)) 127 | helper.sparkSQL(tableSpark.deleteFromHiveTableLesserThanKey(15)) 128 | expectedRows = 1 129 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 130 | 131 | // No OP Delete 132 | helper.sparkCollect(tableSpark.deleteFromHiveTableGreaterThanKey(20)) 133 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 134 | } else { 135 | intercept[RuntimeException] { 136 | helper.recreate(tableSpark) 137 | // delete 1 row 138 | helper.sparkSQL(tableSpark.deleteFromHiveTableKey(11)) 139 | } 140 | } 141 | } 142 | helper.myRun(testName, code) 143 | } 144 | } 145 | } 146 | 147 | test("Test Update on Partition Columns is not allowed") { 148 | val tableNameSpark = "tUpdateNeg" 149 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, 150 | Table.orcPartitionedFullACIDTable, true) 151 | helper.recreate(tableSpark,false) 152 | val thrown = intercept[AnalysisException] { 153 | helper.sparkSQL(s"UPDATE ${DEFAULT_DBNAME}.${tableNameSpark} set ptnCol = 2 where intCol > 10") 154 | } 155 | assert(thrown.getMessage.contains("UPDATE on the partition columns are not allowed")) 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/WriteSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | 23 | import org.apache.log4j.{Level, LogManager, Logger} 24 | import org.scalatest._ 25 | 26 | import scala.util.control.NonFatal 27 | 28 | @Ignore 29 | class WriteSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll { 30 | 31 | val log: Logger = LogManager.getLogger(this.getClass) 32 | log.setLevel(Level.INFO) 33 | 34 | var helper: TestHelper = _ 35 | val isDebug = true 36 | 37 | val DEFAULT_DBNAME = "HiveTestDB" 38 | val defaultPred = " intCol < 5 " 39 | val cols: Map[String, String] = Map( 40 | ("intCol","int"), 41 | ("doubleCol","double"), 42 | ("floatCol","float"), 43 | ("booleanCol","boolean") 44 | // TODO: Requires spark.sql.hive.convertMetastoreOrc=false to run 45 | // ("dateCol","date") 46 | ) 47 | 48 | override def beforeAll() { 49 | try { 50 | 51 | helper = new TestHelper 52 | if (isDebug) { 53 | log.setLevel(Level.DEBUG) 54 | } 55 | helper.init(isDebug) 56 | 57 | // DB 58 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 59 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME) 60 | } catch { 61 | case NonFatal(e) => log.info("failed " + e) 62 | } 63 | } 64 | 65 | override protected def afterAll(): Unit = { 66 | helper.destroy() 67 | } 68 | 69 | 70 | // Test Run 71 | insertIntoOverwriteTestForFullAcidTables(Table.allFullAcidTypes()) 72 | 73 | // TODO: Currently requires compatibility check to be disabled in HMS to run clean 74 | // hive.metastore.client.capability.check=false 75 | // insertIntoOverwriteTestForInsertOnlyTables(Table.allInsertOnlyTypes()) 76 | 77 | // Insert Into/Overwrite test for full acid tables 78 | def insertIntoOverwriteTestForFullAcidTables(tTypes: List[(String,Boolean)]): Unit = { 79 | tTypes.foreach { case (tType, isPartitioned) => 80 | val tableNameHive = "tHive" 81 | val tableNameSpark = "tSpark" 82 | val testName = s"Simple InsertInto Test for $tableNameHive/$tableNameSpark type $tType" 83 | test(testName) { 84 | val tableHive = new Table(DEFAULT_DBNAME, tableNameHive, cols, tType, isPartitioned) 85 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned) 86 | def code(): Unit = { 87 | helper.recreate(tableHive) 88 | helper.recreate(tableSpark) 89 | 90 | // Insert into rows in both tables from Hive and Spark 91 | helper.hiveExecute(tableHive.insertIntoHiveTableKeyRange(11, 20)) 92 | helper.sparkSQL(tableSpark.insertIntoSparkTableKeyRange(11, 20)) 93 | var expectedRows = 10 94 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Into", expectedRows) 95 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Into", expectedRows) 96 | 97 | // Insert overwrite rows in both tables from Hive and Spark 98 | helper.hiveExecute(tableHive.insertOverwriteHiveTableKeyRange(16, 25)) 99 | helper.sparkSQL(tableSpark.insertOverwriteSparkTableKeyRange(16, 25)) 100 | expectedRows = if (tableHive.isPartitioned) 15 else 10 101 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Overwrite", expectedRows) 102 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Overwrite", expectedRows) 103 | 104 | // Insert overwrite rows in both tables - add rows in hive table from spark and vice versa 105 | helper.hiveExecute(tableSpark.insertOverwriteHiveTableKeyRange(24, 27)) 106 | helper.sparkSQL(tableHive.insertOverwriteSparkTableKeyRange(24, 27)) 107 | expectedRows = if (tableHive.isPartitioned) expectedRows + 2 else 4 108 | helper.compareTwoTablesViaHive(tableHive, tableSpark, 
"After Insert Overwrite", expectedRows) 109 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Overwrite", expectedRows) 110 | 111 | // Insert into rows in both tables - add rows in hive table from spark and vice versa 112 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(24, 27)) 113 | helper.sparkSQL(tableHive.insertIntoSparkTableKeyRange(24, 27)) 114 | expectedRows = expectedRows + 4 115 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Into", expectedRows) 116 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Into", expectedRows) 117 | 118 | } 119 | helper.myRun(testName, code) 120 | } 121 | } 122 | } 123 | 124 | def insertIntoOverwriteTestForInsertOnlyTables(tTypes: List[(String,Boolean)]): Unit = { 125 | tTypes.foreach { case (tType, isPartitioned) => 126 | val tableNameSpark = "tSpark" 127 | val testName = s"Simple InsertInto Test for $tableNameSpark type $tType" 128 | test(testName) { 129 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned) 130 | def code() = { 131 | helper.recreate(tableSpark) 132 | } 133 | helper.myRun(testName, code) 134 | } 135 | } 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkOptionsSuite.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.hiveacid.streaming 2 | 3 | import java.util.Locale 4 | 5 | import com.qubole.spark.hiveacid.Table 6 | import org.apache.spark.sql.streaming.OutputMode 7 | 8 | 9 | class HiveAcidSinkOptionsSuite extends HiveAcidStreamingFunSuite { 10 | 11 | import HiveAcidSinkOptions._ 12 | 13 | test("bad sink options") { 14 | 15 | def testBadOptions(options: List[(String, String)])(expectedMsg: String): Unit = { 16 | 17 | val tableName = "tempTable" 18 | val tType = Table.orcFullACIDTable 19 | val cols = Map( 20 | ("value1","int"), 21 | ("value2", "int") 22 | ) 23 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false) 24 | 25 | // creating table 26 | helper.recreate(tableHive) 27 | val errorMessage = intercept[IllegalArgumentException] { 28 | helper.runStreaming( 29 | tableHive.hiveTname, OutputMode.Append(), tableHive.getColMap.keys.toSeq, Range(1, 4), options) 30 | }.getMessage 31 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 32 | 33 | } 34 | 35 | testBadOptions(List(CLEANUP_DELAY_KEY -> "-2"))("Invalid value '-2' " + 36 | s"for option '$CLEANUP_DELAY_KEY', must be a positive integer") 37 | testBadOptions(List(COMPACT_INTERVAL_KEY -> "-5"))("Invalid value '-5' " + 38 | s"for option '$COMPACT_INTERVAL_KEY', must be a positive integer") 39 | testBadOptions(List(MIN_BATCHES_TO_RETAIN_KEY -> "-5"))("Invalid value '-5' " + 40 | s"for option '$MIN_BATCHES_TO_RETAIN_KEY', must be a positive integer") 41 | testBadOptions(List(LOG_DELETION_KEY -> "x"))("Invalid value 'x' " + 42 | s"for option '$LOG_DELETION_KEY', must be true or false") 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. 
See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.streaming 21 | 22 | import java.util.Locale 23 | 24 | import com.qubole.shaded.hadoop.hive.ql.metadata.InvalidTableException 25 | import com.qubole.spark.hiveacid.{AnalysisException, Table} 26 | import com.qubole.spark.hiveacid.datasource.HiveAcidDataSource 27 | import org.apache.spark.sql.Row 28 | import org.apache.spark.sql.streaming.OutputMode 29 | 30 | 31 | class HiveAcidSinkSuite extends HiveAcidStreamingFunSuite { 32 | 33 | override protected def afterAll(): Unit = { 34 | helper.destroy() 35 | } 36 | 37 | test("table not created") { 38 | val ds = new HiveAcidDataSource() 39 | val tableName = "tempTable" 40 | val options = Map("table" -> s"$tableName") 41 | 42 | val errorMessage = intercept[InvalidTableException] { 43 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append()) 44 | }.getMessage() 45 | val expectedMsg = s"""table not found $tableName""" 46 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 47 | 48 | } 49 | 50 | test("table not acid table") { 51 | val ds = new HiveAcidDataSource() 52 | val tableName = s"tempTable" 53 | val options = Map("table" -> s"$DEFAULT_DBNAME.$tableName") 54 | 55 | val tType = Table.orcTable 56 | val cols = Map( 57 | ("value1","int"), 58 | ("value2", "int") 59 | ) 60 | 61 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false) 62 | 63 | helper.recreate(tableHive, false) 64 | 65 | val errorMessage = intercept[IllegalArgumentException] { 66 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append()) 67 | }.getMessage() 68 | val expectedMsg = s"""table ${tableHive.hiveTname} is not an acid table""" 69 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 70 | 71 | } 72 | 73 | test("table is bucketed") { 74 | val ds = new HiveAcidDataSource() 75 | val tableName = s"tempTable" 76 | val options = Map("table" -> s"$DEFAULT_DBNAME.$tableName") 77 | 78 | val tType = Table.orcBucketedFullACIDTable 79 | val cols = Map( 80 | ("value1","int"), 81 | ("value2", "int") 82 | ) 83 | 84 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false) 85 | 86 | helper.recreate(tableHive, false) 87 | 88 | val errorMessage = intercept[RuntimeException] { 89 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append()) 90 | }.getMessage() 91 | val expectedMsg = s"""Unsupported operation type - Streaming Write for Bucketed table """ 92 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 93 | 94 | } 95 | 96 | test("partitionBy is specified with Acid Streaming") { 97 | val ds = new HiveAcidDataSource() 98 | val options = Map("table" -> "dummyTable") 99 | val errorMessage = 
intercept[UnsupportedOperationException] { 100 | ds.createSink(helper.spark.sqlContext, options, Seq("col1", "col2"), OutputMode.Append()) 101 | }.getMessage() 102 | 103 | val expectedMsg = "Unsupported Function - partitionBy with HiveAcidSink" 104 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 105 | 106 | } 107 | 108 | test("incorrect output mode is used with Acid Streaming") { 109 | val ds = new HiveAcidDataSource() 110 | val options = Map("table" -> "dummyTable") 111 | val errorMessage = intercept[AnalysisException] { 112 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Update()) 113 | }.getMessage() 114 | val expectedMsg = "mode is Update: Hive Acid Sink supports only Append as OutputMode" 115 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 116 | 117 | } 118 | 119 | test("table not specified") { 120 | val ds = new HiveAcidDataSource() 121 | val options = Map.empty[String, String] 122 | val errorMessage = intercept[IllegalArgumentException] { 123 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append()) 124 | }.getMessage() 125 | val expectedMsg = """Table Name is not specified""" 126 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 127 | 128 | } 129 | 130 | // Test Run 131 | streamingTestForAcidTables(Table.allNonBucketedFullAcidTypes()) 132 | streamingTestForAcidTables(Table.allNonBucketedInsertOnlyTypes()) 133 | 134 | def streamingTestForAcidTables(tTypes: List[(String,Boolean)]): Unit = { 135 | tTypes.foreach { case (tType, isPartitioned) => 136 | val tableNameHive = "tHive" 137 | val testName = s"Simple Streaming Query Append for $tableNameHive type $tType" 138 | test(testName) { 139 | val cols: Map[String, String] = { 140 | if(!isPartitioned) { 141 | Map( 142 | ("value1","int"), 143 | ("value2","int") 144 | ) 145 | } else { 146 | Map( 147 | ("value","int") 148 | ) 149 | } 150 | } 151 | 152 | val tableHive = new Table(DEFAULT_DBNAME, tableNameHive, cols, tType, isPartitioned) 153 | def code(): Unit = { 154 | helper.recreate(tableHive) 155 | 156 | helper.runStreaming(tableHive.hiveTname, OutputMode.Append(), tableHive.getColMap.keys.toSeq, Range(1, 4)) 157 | val resDf = helper.sparkGetDF(tableHive) 158 | val resultRow = (Row(100, 10, 1) :: Row(200, 20, 2) :: Row(300, 30, 3) :: Nil).toArray 159 | helper.compareResult(resDf._1.collect(), resultRow) 160 | helper.compareResult(resDf._2.collect(), resultRow) 161 | helper.compare(tableHive, "compare via hive") 162 | } 163 | helper.myRun(testName, code) 164 | } 165 | } 166 | } 167 | 168 | } 169 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidStreamingFunSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | package com.qubole.spark.hiveacid.streaming 22 | 23 | import org.apache.log4j.{Level, LogManager, Logger} 24 | import org.scalatest.{BeforeAndAfterAll, FunSuite} 25 | 26 | import scala.util.control.NonFatal 27 | 28 | 29 | abstract class HiveAcidStreamingFunSuite extends FunSuite with BeforeAndAfterAll { 30 | 31 | protected val log: Logger = LogManager.getLogger(this.getClass) 32 | log.setLevel(Level.INFO) 33 | 34 | protected var helper: StreamingTestHelper = _ 35 | protected val isDebug = true 36 | 37 | protected val DEFAULT_DBNAME = "HiveTestDB" 38 | 39 | override protected def beforeAll() { 40 | try { 41 | 42 | helper = new StreamingTestHelper 43 | if (isDebug) { 44 | log.setLevel(Level.DEBUG) 45 | } 46 | helper.init(isDebug) 47 | 48 | // DB 49 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 50 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME) 51 | } catch { 52 | case NonFatal(e) => log.info("failed " + e) 53 | } 54 | } 55 | 56 | override protected def afterAll(): Unit = { 57 | helper.destroy() 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/streaming/StreamingTestHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | 21 | package com.qubole.spark.hiveacid.streaming 22 | 23 | import java.io.{File, IOException} 24 | import java.util.UUID 25 | 26 | import com.qubole.spark.hiveacid.TestHelper 27 | 28 | import org.apache.spark.network.util.JavaUtils 29 | import org.apache.spark.sql.execution.streaming.MemoryStream 30 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} 31 | import org.scalatest.concurrent.TimeLimits 32 | import org.scalatest.time.SpanSugar 33 | 34 | class StreamingTestHelper extends TestHelper with TimeLimits { 35 | 36 | import StreamingTestHelper._ 37 | 38 | 39 | def runStreaming(tableName: String, 40 | outputMode: OutputMode, 41 | cols: Seq[String], 42 | inputRange: Range, 43 | options: List[(String, String)] = List.empty): Unit = { 44 | 45 | val inputData = MemoryStream[Int] 46 | val ds = inputData.toDS() 47 | 48 | val checkpointDir = createCheckpointDir(namePrefix = "stream.checkpoint").getCanonicalPath 49 | 50 | var query: StreamingQuery = null 51 | 52 | try { 53 | // Starting streaming query 54 | val writerDf = 55 | ds.map(i => (i*100, i*10, i)) 56 | .toDF(cols:_*) 57 | .writeStream 58 | .format("HiveAcid") 59 | .option("table", tableName) 60 | .outputMode(outputMode) 61 | .option("checkpointLocation", checkpointDir) 62 | //.start() 63 | 64 | query = options.map { option => 65 | writerDf.option(option._1, option._2) 66 | }.lastOption.getOrElse(writerDf).start() 67 | 68 | // Adding data for streaming query 69 | inputData.addData(inputRange) 70 | failAfter(STREAMING_TIMEOUT) { 71 | query.processAllAvailable() 72 | } 73 | } finally { 74 | if (query != null) { 75 | // Terminating streaming query 76 | query.stop() 77 | deleteCheckpointDir(checkpointDir) 78 | } 79 | } 80 | } 81 | 82 | def deleteCheckpointDir(fileStr: String): Unit = { 83 | val file = new File(fileStr) 84 | if (file != null) { 85 | JavaUtils.deleteRecursively(file) 86 | } 87 | } 88 | 89 | def createCheckpointDir(root: String = System.getProperty("java.io.tmpdir"), 90 | namePrefix: String = "spark"): File = { 91 | 92 | var attempts = 0 93 | val maxAttempts = MAX_DIR_CREATION_ATTEMPTS 94 | var dir: File = null 95 | while (dir == null) { 96 | attempts += 1 97 | if (attempts > maxAttempts) { 98 | throw new IOException("Failed to create a temp directory (under " + root + ") after " + 99 | maxAttempts + " attempts!") 100 | } 101 | try { 102 | dir = new File(root, namePrefix + "-" + UUID.randomUUID.toString) 103 | if (dir.exists() || !dir.mkdirs()) { 104 | dir = null 105 | } 106 | } catch { case e: SecurityException => dir = null; } 107 | } 108 | dir.getCanonicalFile 109 | } 110 | 111 | } 112 | 113 | object StreamingTestHelper extends TestHelper with SpanSugar { 114 | 115 | val MAX_DIR_CREATION_ATTEMPTS = 10 116 | val STREAMING_TIMEOUT = 60.seconds 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 Qubole, Inc. All rights reserved. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | com.qubole.spark.hiveacid.datasource.HiveAcidDataSource 21 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/shaded/hadoop/hive/ql/io/orc/OrcAcidUtil.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.shaded.hadoop.hive.ql.io.orc 2 | 3 | import java.util.regex.Pattern 4 | 5 | import com.qubole.shaded.hadoop.hive.ql.io.AcidUtils 6 | import org.apache.hadoop.fs.Path 7 | 8 | object OrcAcidUtil { 9 | val BUCKET_PATTERN = Pattern.compile("bucket_[0-9]{5}$") 10 | 11 | def getDeleteDeltaPaths(orcSplit: OrcSplit): Array[Path] = { 12 | assert(BUCKET_PATTERN.matcher(orcSplit.getPath.getName).matches()) 13 | val bucket = AcidUtils.parseBucketId(orcSplit.getPath) 14 | assert(bucket != -1) 15 | val deleteDeltaDirPaths = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(orcSplit); 16 | deleteDeltaDirPaths.map(deleteDir => AcidUtils.createBucketFile(deleteDir, bucket)) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/HiveAnalysisException.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql 2 | 3 | import org.apache.spark.sql.AnalysisException 4 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 5 | 6 | class HiveAnalysisException( 7 | override val message: String, 8 | override val line: Option[Int] = None, 9 | override val startPosition: Option[Int] = None, 10 | // Some plans fail to serialize due to bugs in scala collections. 
11 | @transient override val plan: Option[LogicalPlan] = None, 12 | override val cause: Option[Throwable] = None) extends AnalysisException(message, line, startPosition, plan, cause) { 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/parser/ParseDriver.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.parser 2 | 3 | import com.qubole.spark.datasources.hiveacid.sql.catalyst.parser.{SqlHiveParser => SqlBaseParser} 4 | import org.antlr.v4.runtime._ 5 | import org.antlr.v4.runtime.misc.Interval 6 | import org.antlr.v4.runtime.tree.TerminalNodeImpl 7 | import org.apache.spark.sql.catalyst.expressions.AttributeReference 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * A copy of [[org.apache.spark.sql.catalyst.parser.UpperCaseCharStream]] 12 | */ 13 | class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream { 14 | override def consume(): Unit = wrapped.consume 15 | override def getSourceName(): String = wrapped.getSourceName 16 | override def index(): Int = wrapped.index 17 | override def mark(): Int = wrapped.mark 18 | override def release(marker: Int): Unit = wrapped.release(marker) 19 | override def seek(where: Int): Unit = wrapped.seek(where) 20 | override def size(): Int = wrapped.size 21 | 22 | override def getText(interval: Interval): String = { 23 | // ANTLR 4.7's CodePointCharStream implementations have bugs when 24 | // getText() is called with an empty stream, or intervals where 25 | // the start > end. See 26 | // https://github.com/antlr/antlr4/commit/ac9f7530 for one fix 27 | // that is not yet in a released ANTLR artifact. 28 | if (size() > 0 && (interval.b - interval.a >= 0)) wrapped.getText(interval) else "" 29 | } 30 | 31 | override def LA(i: Int): Int = { 32 | val la = wrapped.LA(i) 33 | if (la == 0 || la == IntStream.EOF) la 34 | else Character.toUpperCase(la) 35 | } 36 | } 37 | 38 | /** 39 | * An adaptation of [[org.apache.spark.sql.catalyst.parser.PostProcessor]] 40 | */ 41 | case object PostProcessor extends SqlHiveBaseListener { 42 | 43 | /** Remove the back ticks from an Identifier. */ 44 | override def exitQuotedIdentifier(ctx: SqlBaseParser.QuotedIdentifierContext): Unit = { 45 | replaceTokenByIdentifier(ctx, 1) { token => 46 | // Remove the double back ticks in the string. 47 | token.setText(token.getText.replace("``", "`")) 48 | token 49 | } 50 | } 51 | 52 | /** Treat non-reserved keywords as Identifiers. 
*/ 53 | override def exitNonReserved(ctx: SqlBaseParser.NonReservedContext): Unit = { 54 | replaceTokenByIdentifier(ctx, 0)(identity) 55 | } 56 | 57 | private def replaceTokenByIdentifier( 58 | ctx: ParserRuleContext, 59 | stripMargins: Int)( 60 | f: CommonToken => CommonToken = identity): Unit = { 61 | val parent = ctx.getParent 62 | parent.removeLastChild() 63 | val token = ctx.getChild(0).getPayload.asInstanceOf[Token] 64 | val newToken = new CommonToken( 65 | new org.antlr.v4.runtime.misc.Pair(token.getTokenSource, token.getInputStream), 66 | SqlBaseParser.IDENTIFIER, 67 | token.getChannel, 68 | token.getStartIndex + stripMargins, 69 | token.getStopIndex - stripMargins) 70 | parent.addChild(new TerminalNodeImpl(f(newToken))) 71 | } 72 | } 73 | 74 | /** 75 | * An adaptation of [[org.apache.spark.util.random.RandomSampler]] 76 | */ 77 | object RandomSampler { 78 | /** 79 | * Sampling fraction arguments may be results of computation, and subject to floating 80 | * point jitter. I check the arguments with this epsilon slop factor to prevent spurious 81 | * warnings for cases such as summing some numbers to get a sampling fraction of 1.000000001 82 | */ 83 | val roundingEpsilon = 1e-6 84 | } 85 | 86 | object SparkAdaptation { 87 | /** 88 | * An adaptation of [[org.apache.spark.sql.types.StructType#toAttributes]] 89 | */ 90 | def toAttributes(structType: StructType): Seq[AttributeReference] = 91 | structType.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) 92 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/DeleteCommand.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command 2 | 3 | import com.qubole.spark.hiveacid.HiveAcidErrors 4 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation 5 | import org.apache.spark.sql.{Column, Row, SparkSession} 6 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 7 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 8 | import org.apache.spark.sql.execution.command.RunnableCommand 9 | import org.apache.spark.sql.execution.datasources.LogicalRelation 10 | 11 | case class DeleteCommand( 12 | table: LogicalPlan, 13 | condition: Expression) 14 | extends RunnableCommand { 15 | 16 | // We don't want `table` in children as sometimes we don't want to transform it. 
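// For orientation, a sketch of the flow that reaches this command (table name illustrative only):
//   spark.sql("DELETE FROM acid_db.acid_tbl WHERE id < 100")
// SparkAcidSqlParser recognises the DELETE token and produces a DeleteCommand; run() below then
// forwards the predicate to HiveAcidRelation.delete once `table` resolves to a LogicalRelation
// backed by HiveAcidRelation.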
17 | override def children: Seq[LogicalPlan] = Seq(table) 18 | override def output: Seq[Attribute] = Seq.empty 19 | override lazy val resolved: Boolean = childrenResolved 20 | override def run(sparkSession: SparkSession): Seq[Row] = { 21 | if (children.size != 1) { 22 | throw new IllegalArgumentException("DELETE command should specify exactly one table, whereas this has: " 23 | + children.size) 24 | } 25 | children(0) match { 26 | case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => { 27 | relation.delete(new Column(condition)) 28 | } 29 | case _ => throw HiveAcidErrors.tableNotAcidException(table.toString()) 30 | } 31 | Seq.empty[Row] 32 | } 33 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/MergeCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command 19 | 20 | import com.qubole.spark.hiveacid.HiveAcidErrors 21 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation 22 | import com.qubole.spark.hiveacid.merge.{MergeCondition, MergeWhenClause, MergeWhenNotInsert} 23 | import org.apache.spark.sql.catalyst.AliasIdentifier 24 | import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases 25 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation} 26 | import org.apache.spark.sql.{Row, SparkSession, SqlUtils} 27 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 28 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} 29 | import org.apache.spark.sql.execution.command.RunnableCommand 30 | import org.apache.spark.sql.execution.datasources.LogicalRelation 31 | 32 | case class MergeCommand(targetTable: LogicalPlan, 33 | sourceTable: LogicalPlan, 34 | matched: Seq[MergeWhenClause], 35 | notMatched: Option[MergeWhenClause], 36 | mergeCondition: MergeCondition, 37 | sourceAlias: Option[AliasIdentifier], 38 | targetAlias: Option[AliasIdentifier]) 39 | extends RunnableCommand { 40 | 41 | override def children: Seq[LogicalPlan] = Seq(targetTable, sourceTable) 42 | override def output: Seq[Attribute] = Seq.empty 43 | override lazy val resolved: Boolean = childrenResolved 44 | override def run(sparkSession: SparkSession): Seq[Row] = { 45 | val insertClause: Option[MergeWhenNotInsert] = notMatched match { 46 | case Some(i: MergeWhenNotInsert) => Some(i) 47 | case None => None 48 | case _ => throw HiveAcidErrors.mergeValidationError("WHEN NOT Clause has to be INSERT CLAUSE") 49 | } 50 | 51 | val targetRelation = children.head 52 | val 
sourceRelation = children.last 53 | 54 | val sourceTableFullyQualifiedName = SqlUtils.removeTopSubqueryAlias(sourceRelation) match { 55 | case hiveTable: HiveTableRelation => 56 | Some(hiveTable.tableMeta.qualifiedName) 57 | case LogicalRelation(acidRelation: HiveAcidRelation, _, _, _) => 58 | Some(acidRelation.fullyQualifiedTableName) 59 | case LogicalRelation(_, _, catalogTable: Option[CatalogTable], _) if catalogTable.isDefined => 60 | Some(catalogTable.get.qualifiedName) 61 | case _ => None 62 | } 63 | 64 | val (_, sourceDf) = SqlUtils.getDFQualified(sparkSession, 65 | SqlUtils.logicalPlanToDataFrame(sparkSession, sourceTable), 66 | sourceTableFullyQualifiedName.getOrElse("")) 67 | 68 | SqlUtils.removeTopSubqueryAlias(targetRelation) match { 69 | case LogicalRelation(relation: HiveAcidRelation, _, _, _) => 70 | relation.merge(sourceDf, 71 | mergeCondition.expression, matched, insertClause, sourceAlias, targetAlias) 72 | case _ => throw HiveAcidErrors.tableNotAcidException(targetTable.toString()) 73 | } 74 | 75 | Seq.empty 76 | } 77 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/UpdateCommand.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command 2 | 3 | import com.qubole.spark.hiveacid.HiveAcidErrors 4 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation 5 | import org.apache.spark.sql.{Column, Row, SparkSession} 6 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 7 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 8 | import org.apache.spark.sql.execution.command.RunnableCommand 9 | import org.apache.spark.sql.execution.datasources.LogicalRelation 10 | 11 | case class UpdateCommand( 12 | table: LogicalPlan, 13 | setExpressions: Map[String, Expression], 14 | condition: Option[Expression]) 15 | extends RunnableCommand { 16 | 17 | override def children: Seq[LogicalPlan] = Seq(table) 18 | override def output: Seq[Attribute] = Seq.empty 19 | override lazy val resolved: Boolean = childrenResolved 20 | 21 | override def run(sparkSession: SparkSession): Seq[Row] = { 22 | if (children.size != 1) { 23 | throw new IllegalArgumentException("UPDATE command should have one table to update, whereas this has: " 24 | + children.size) 25 | } 26 | children(0) match { 27 | case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => { 28 | val setColumns = setExpressions.mapValues(expr => new Column(expr)) 29 | val updateFilterColumn = condition.map(new Column(_)) 30 | relation.update(updateFilterColumn, setColumns) 31 | } 32 | case LogicalRelation(_, _, Some(catalogTable), _) => 33 | throw HiveAcidErrors.tableNotAcidException(catalogTable.qualifiedName) 34 | case _ => throw HiveAcidErrors.tableNotAcidException(table.toString()) 35 | } 36 | Seq.empty[Row] 37 | } 38 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/execution/SparkAcidSqlParser.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql.execution 2 | 3 | import com.qubole.spark.datasources.hiveacid.sql.catalyst.parser._ 4 | import org.antlr.v4.runtime._ 5 | import org.antlr.v4.runtime.atn.PredictionMode 6 | import org.apache.spark.internal.Logging 7 | import 
org.apache.spark.sql.catalyst.expressions.Expression 8 | import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} 9 | import org.apache.spark.sql.{AnalysisException, SparkSession} 10 | import org.apache.spark.sql.catalyst.parser.{ParseErrorListener, ParseException, ParserInterface} 11 | import org.apache.spark.sql.catalyst.plans.logical._ 12 | import org.apache.spark.sql.catalyst.trees.Origin 13 | import org.apache.spark.sql.execution.SparkSqlParser 14 | import org.apache.spark.sql.internal.{SQLConf, VariableSubstitution} 15 | import org.apache.spark.sql.types.{DataType, StructType} 16 | 17 | /** 18 | * Concrete parser for Hive SQL statements. 19 | */ 20 | case class SparkAcidSqlParser(sparkParser: ParserInterface) extends ParserInterface with Logging { 21 | 22 | override def parseExpression(sqlText: String): Expression = sparkParser.parseExpression(sqlText) 23 | 24 | override def parseTableIdentifier(sqlText: String): TableIdentifier = sparkParser.parseTableIdentifier(sqlText) 25 | 26 | override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = sparkParser.parseFunctionIdentifier(sqlText) 27 | 28 | override def parseTableSchema(sqlText: String): StructType = sparkParser.parseTableSchema(sqlText) 29 | 30 | override def parseDataType(sqlText: String): DataType = sparkParser.parseDataType(sqlText) 31 | 32 | private val substitutor: VariableSubstitution = { 33 | val field = classOf[SparkSqlParser].getDeclaredField("substitutor") 34 | field.setAccessible(true) 35 | field.get(sparkParser).asInstanceOf[VariableSubstitution] 36 | } 37 | 38 | // FIXME scala reflection would be better 39 | private val conf: SQLConf = { 40 | val field = classOf[VariableSubstitution].getDeclaredField("org$apache$spark$sql$internal$VariableSubstitution$$conf") 41 | field.setAccessible(true) 42 | field.get(substitutor).asInstanceOf[SQLConf] 43 | } 44 | 45 | private val sparkAcidAstBuilder = new SparkSqlAstBuilder(conf) 46 | 47 | override def parsePlan(sqlText: String): LogicalPlan = { 48 | try { 49 | parse(sqlText) { parser => 50 | sparkAcidAstBuilder.visitSingleStatement(parser.singleStatement()) match { 51 | case plan: LogicalPlan => plan 52 | case _ => sparkParser.parsePlan(sqlText) 53 | } 54 | } 55 | } catch { 56 | case e: AcidParseException => throw e.parseException 57 | case _: ParseException => sparkParser.parsePlan(sqlText) 58 | } 59 | } 60 | 61 | /** 62 | * An adaptation of [[org.apache.spark.sql.execution.SparkSqlParser#parse]] 63 | * and [[org.apache.spark.sql.catalyst.parser.AbstractSqlParser#parse]] 64 | */ 65 | protected def parse[T](sqlText: String)(toResult: SqlHiveParser => T): T = { 66 | val command = substitutor.substitute(sqlText) 67 | logDebug(s"Parsing command: $command") 68 | 69 | 70 | val lexer = new SqlHiveLexer(new UpperCaseCharStream(CharStreams.fromString(command))) 71 | lexer.removeErrorListeners() 72 | lexer.addErrorListener(ParseErrorListener) 73 | lexer.legacy_setops_precedence_enbled = SQLConf.get.setOpsPrecedenceEnforced 74 | 75 | val tokenStream = new CommonTokenStream(lexer) 76 | val acidSpecific = checkIfAcidSpecific(tokenStream) 77 | tokenStream.seek(0) //reset stream to first token 78 | val parser = new SqlHiveParser(tokenStream) 79 | parser.addParseListener(PostProcessor) 80 | parser.removeErrorListeners() 81 | parser.addErrorListener(ParseErrorListener) 82 | parser.legacy_setops_precedence_enbled = SQLConf.get.setOpsPrecedenceEnforced 83 | try { 84 | parser.getInterpreter.setPredictionMode(PredictionMode.LL) 85 | toResult(parser) 86 | } catch { 
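// Note on the error handling below: checkIfAcidSpecific (see further down) flags DELETE/UPDATE/MERGE
// statements, and wrapParseException only wraps failures for those into AcidParseException, so
// parsePlan rethrows parse errors for ACID statements but falls back to the wrapped Spark parser
// for everything else.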
87 | case e: ParseException if e.command.isDefined => 88 | throw wrapParseException(e, acidSpecific) 89 | case e: ParseException => 90 | throw wrapParseException(e.withCommand(command), acidSpecific) 91 | case e: AnalysisException => 92 | val position = Origin(e.line, e.startPosition) 93 | val pe = new ParseException(Option(command), e.message, position, position) 94 | throw wrapParseException(pe, acidSpecific) 95 | } 96 | } 97 | 98 | /** 99 | * Denotes ACID Specific ParseException 100 | * @param parseException 101 | */ 102 | class AcidParseException(val parseException: ParseException) extends Exception 103 | 104 | def wrapParseException(e: ParseException, acidSpecific: Boolean): Throwable = { 105 | if (acidSpecific) { 106 | new AcidParseException(e) 107 | } else { 108 | e 109 | } 110 | } 111 | def checkIfAcidSpecific(tokStream: TokenStream): Boolean = { 112 | tokStream.LA(1) match { 113 | case SqlHiveParser.DELETE | SqlHiveParser.MERGE | SqlHiveParser.UPDATE => true 114 | case _ => false 115 | } 116 | } 117 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/.gitignore: -------------------------------------------------------------------------------- 1 | *.scalae 2 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/HiveAcidAutoConvert.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | import java.util.Locale 23 | 24 | import com.qubole.spark.datasources.hiveacid.sql.execution.SparkAcidSqlParser 25 | import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} 26 | import org.apache.spark.sql.catalyst.catalog.HiveTableRelation 27 | import org.apache.spark.sql.catalyst.plans.logical.{Filter, InsertIntoTable, LogicalPlan} 28 | import org.apache.spark.sql.catalyst.rules.Rule 29 | import org.apache.spark.sql.execution.command.DDLUtils 30 | import org.apache.spark.sql.execution.datasources.LogicalRelation 31 | import com.qubole.spark.hiveacid.datasource.HiveAcidDataSource 32 | 33 | 34 | /** 35 | * Analyzer rule to convert a transactional HiveRelation 36 | * into LogicalRelation backed by HiveAcidRelation 37 | * @param spark - spark session 38 | */ 39 | case class HiveAcidAutoConvert(spark: SparkSession) extends Rule[LogicalPlan] { 40 | 41 | private def isConvertible(relation: HiveTableRelation): Boolean = { 42 | val serde = relation.tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT) 43 | relation.tableMeta.properties.getOrElse("transactional", "false").toBoolean 44 | } 45 | 46 | private def convert(relation: HiveTableRelation): LogicalRelation = { 47 | val options = relation.tableMeta.properties ++ 48 | relation.tableMeta.storage.properties ++ Map("table" -> relation.tableMeta.qualifiedName) 49 | 50 | val newRelation = new HiveAcidDataSource().createRelation(spark.sqlContext, options) 51 | LogicalRelation(newRelation, isStreaming = false) 52 | } 53 | 54 | override def apply(plan: LogicalPlan): LogicalPlan = { 55 | plan resolveOperators { 56 | // Write path 57 | case InsertIntoTable(r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists) 58 | if query.resolved && DDLUtils.isHiveTable(r.tableMeta) && isConvertible(r) => 59 | InsertIntoTable(convert(r), partition, query, overwrite, ifPartitionNotExists) 60 | 61 | // Read path 62 | case relation: HiveTableRelation 63 | if DDLUtils.isHiveTable(relation.tableMeta) && isConvertible(relation) => 64 | convert(relation) 65 | } 66 | } 67 | } 68 | 69 | class HiveAcidAutoConvertExtension extends (SparkSessionExtensions => Unit) { 70 | def apply(extension: SparkSessionExtensions): Unit = { 71 | extension.injectResolutionRule(HiveAcidAutoConvert.apply) 72 | extension.injectParser { (session, parser) => 73 | SparkAcidSqlParser(parser) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/HiveAcidErrors.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | import org.apache.spark.sql.{SaveMode, SqlUtils} 23 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 24 | 25 | object HiveAcidErrors { 26 | 27 | def formatColumn(colName: String): String = s"`$colName`" 28 | 29 | def formatColumnList(colNames: Seq[String]): String = 30 | colNames.map(formatColumn).mkString("[", ", ", "]") 31 | 32 | def tableNotSpecifiedException(): Throwable = { 33 | new IllegalArgumentException("'table' is not specified in parameters") 34 | } 35 | 36 | def unsupportedFunction(function: String, caller: String): Throwable = { 37 | new java.lang.UnsupportedOperationException(s"Unsupported Function - $function with $caller") 38 | } 39 | 40 | def invalidOperationType(operation: String): Throwable = { 41 | new RuntimeException(s"Invalid operation type - $operation") 42 | } 43 | 44 | def unsupportedSaveMode(saveMode: SaveMode): Throwable = { 45 | new RuntimeException(s"Unsupported save mode - $saveMode") 46 | } 47 | 48 | def unsupportedOperationTypeInsertOnlyTable(operation: String, tableName: String): Throwable = { 49 | new RuntimeException(s"Unsupported operation type - $operation for InsertOnly table " + tableName) 50 | } 51 | 52 | def unsupportedOperationTypeBucketedTable(operation: String, tableName: String): Throwable = { 53 | new RuntimeException(s"Unsupported operation type - $operation for Bucketed table " + tableName) 54 | } 55 | 56 | def tableNotAcidException(tableName: String): Throwable = { 57 | new IllegalArgumentException(s"table $tableName is not an ACID table") 58 | } 59 | 60 | def couldNotAcquireLockException(exception: Exception = null): Throwable = { 61 | new RuntimeException(s"Could not acquire lock.", exception) 62 | } 63 | 64 | def couldNotAcquireLockException(state: String): Throwable = { 65 | new RuntimeException(s"Could not acquire lock. Lock State: $state") 66 | } 67 | 68 | def txnAlreadyClosed(txnId: Long): Throwable = { 69 | new RuntimeException(s"Transaction $txnId is already closed") 70 | } 71 | 72 | def txnAlreadyOpen(txnId: Long): Throwable = { 73 | new RuntimeException(s"Transaction already opened. 
Existing txnId: $txnId") 74 | } 75 | 76 | def txnNotStarted(table: String): Throwable = { 77 | new RuntimeException(s"Transaction on $table not started") 78 | } 79 | 80 | def txnNoTransaction(): Throwable = { 81 | new RuntimeException(s"No transaction found") 82 | } 83 | 84 | def tableSnapshotNonExistent(snapshotId: Long): Throwable = { 85 | new RuntimeException(s"Table snapshot $snapshotId does not exist") 86 | } 87 | 88 | def tableWriteIdRequestedBeforeTxnStart(table: String): Throwable = { 89 | new RuntimeException(s"Write id requested for table $table before txn was started") 90 | } 91 | 92 | def repeatedTxnId(txnId: Long, activeTxns: Seq[Long]): Throwable = { 93 | new RuntimeException( 94 | s"Repeated transaction id $txnId, active transactions are [${activeTxns.mkString(",")}]") 95 | } 96 | 97 | def unsupportedStreamingOutputMode(mode: String): Throwable = { 98 | new AnalysisException( 99 | s"mode is $mode: Hive Acid Sink supports only Append as OutputMode") 100 | } 101 | 102 | def updateSetColumnNotFound(col: String, colList: Seq[String]): Throwable = { 103 | new AnalysisException( 104 | s"SET column ${formatColumn(col)} not found among columns: ${formatColumnList(colList)}.") 105 | } 106 | 107 | def updateOnPartition(cols: Seq[String], table: String): Throwable = { 108 | val message = if (cols.length == 1) { 109 | s"SET column: ${cols.head} is partition column in table: ${table}" 110 | } else { 111 | s"SET columns: ${cols.mkString(",")} are partition columns in table: ${table}" 112 | } 113 | new AnalysisException( 114 | s"UPDATE on the partition columns are not allowed. $message" 115 | ) 116 | } 117 | 118 | def txnOutdated(txnId: Long, tableName: String): Throwable = { 119 | new TransactionInvalidException( 120 | s"Transaction $txnId is no longer valid for table $tableName", txnId, tableName) 121 | } 122 | 123 | def unexpectedReadError(cause: String): Throwable = { 124 | throw new RuntimeException( 125 | s"Unexpected error while reading the Hive Acid Data: $cause") 126 | } 127 | 128 | def mergeValidationError(cause: String): Throwable = { 129 | SqlUtils.analysisException(s"MERGE Validation Error: $cause") 130 | } 131 | 132 | def mergeResolutionError(cause: String): Throwable = { 133 | SqlUtils.analysisException(cause) 134 | } 135 | 136 | def mergeUnsupportedError(cause: String): Throwable = { 137 | throw new RuntimeException(cause) 138 | } 139 | } 140 | 141 | class TransactionInvalidException(val message: String, 142 | val txnId: Long, 143 | val tableName: String) 144 | extends Exception(message) { 145 | override def getMessage: String = { 146 | message 147 | } 148 | } 149 | 150 | class AnalysisException( 151 | val message: String, 152 | val line: Option[Int] = None, 153 | val startPosition: Option[Int] = None, 154 | // Some plans fail to serialize due to bugs in scala collections. 155 | @transient val plan: Option[LogicalPlan] = None, 156 | val cause: Option[Throwable] = None) 157 | extends Exception(message, cause.orNull) with Serializable { 158 | 159 | def withPosition(line: Option[Int], startPosition: Option[Int]): AnalysisException = { 160 | val newException = new AnalysisException(message, line, startPosition) 161 | newException.setStackTrace(getStackTrace) 162 | newException 163 | } 164 | 165 | override def getMessage: String = { 166 | val planAnnotation = Option(plan).flatten.map(p => s";\n$p").getOrElse("") 167 | getSimpleMessage + planAnnotation 168 | } 169 | 170 | // Outputs an exception without the logical plan.
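// (getSimpleMessage renders only "<message>; line <l> pos <p>"; getMessage above appends the plan when one is attached.)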
171 | // For testing only 172 | def getSimpleMessage: String = { 173 | val lineAnnotation = line.map(l => s" line $l").getOrElse("") 174 | val positionAnnotation = startPosition.map(p => s" pos $p").getOrElse("") 175 | s"$message;$lineAnnotation$positionAnnotation" 176 | } 177 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/HiveAcidOperation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | private[hiveacid] object HiveAcidOperation extends Enumeration { 23 | type OperationType = Value 24 | val READ, INSERT_INTO, INSERT_OVERWRITE, DELETE, UPDATE, MERGE = Value 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/SparkAcidConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | import org.apache.spark.sql.SparkSession 23 | 24 | /** 25 | * Spark specific configuration container to be used by Hive Acid module 26 | */ 27 | case class SparkAcidConfigEntry[T](configName: String /* Name of the config */ , 28 | defaultValue: String /* Default value of config in String*/ , 29 | description: String /* Description of the config*/ , 30 | converter: Option[(String, String) => T] /* function to convert from String to Config's Type T*/) 31 | 32 | 33 | case class SparkAcidConfigBuilder[T](configName: String) { 34 | private var defaultValue: Option[String] = None 35 | def defaultValue(value: String): SparkAcidConfigBuilder[T] = { 36 | defaultValue = Some(value) 37 | this 38 | } 39 | 40 | private var description = "" 41 | def description(desc : String): SparkAcidConfigBuilder[T] = { 42 | description = desc 43 | this 44 | } 45 | 46 | private var converter: Option[(String, String) => T] = None 47 | def converter(func: (String, String) => T): SparkAcidConfigBuilder[T] = { 48 | converter = Some(func) 49 | this 50 | } 51 | 52 | def create(): SparkAcidConfigEntry[T] = { 53 | require(!defaultValue.isEmpty, "Default Value for the Spark Acid Config needs to be specified") 54 | new SparkAcidConfigEntry[T](configName, defaultValue.get, description, converter) 55 | } 56 | } 57 | 58 | case class SparkAcidConf(@transient sparkSession: SparkSession, @transient parameters: Map[String, String]) { 59 | @transient val configMap = sparkSession.sessionState.conf.getAllConfs 60 | 61 | val predicatePushdownEnabled = getConf(SparkAcidConf.PREDICATE_PUSHDOWN_CONF) 62 | val maxSleepBetweenLockRetries = getConf(SparkAcidConf.MAX_SLEEP_BETWEEN_LOCK_RETRIES) 63 | val lockNumRetries = getConf(SparkAcidConf.LOCK_NUM_RETRIES) 64 | val metastorePartitionPruningEnabled = sparkSession.sessionState.conf.metastorePartitionPruning 65 | val includeRowIds = parameters.getOrElse("includeRowIds", "false").toBoolean 66 | val parallelPartitionComputationThreshold = getConf(SparkAcidConf.PARALLEL_PARTITION_THRESHOLD) 67 | 68 | def getConf[T](configEntry: SparkAcidConfigEntry[T]): T = { 69 | val value = configMap.getOrElse(configEntry.configName, configEntry.defaultValue) 70 | configEntry.converter match { 71 | case Some(f) => f(value, configEntry.configName) 72 | case None => value.asInstanceOf[T] 73 | } 74 | } 75 | } 76 | 77 | object SparkAcidConf { 78 | val PREDICATE_PUSHDOWN_CONF = SparkAcidConfigBuilder[Boolean]("spark.sql.hiveAcid.enablePredicatePushdown") 79 | .defaultValue("true") 80 | .converter(toBoolean) 81 | .description("Configuration to enable Predicate PushDown for Hive Acid Reader") 82 | .create() 83 | 84 | val SPARK_READER = SparkAcidConfigBuilder[Boolean]("spark.sql.hiveAcid.enableSparkReader") 85 | .defaultValue("false") 86 | .converter(toBoolean) 87 | .description("Configuration to enable the Spark readers." + 88 | " When disabled, Hive Acid Readers in this DataSource are used." 
+ 89 | " On enabling Spark readers will be used to read the Hive Table readers") 90 | .create() 91 | 92 | val MAX_SLEEP_BETWEEN_LOCK_RETRIES = SparkAcidConfigBuilder[Long]("spark.hiveAcid.lock.max.sleep.between.retries") 93 | .defaultValue("60000") 94 | .converter(toLong) 95 | .description("Maximum sleep time between lock retries in milliseconds; " + 96 | "Lock retries are based on exponential backoff" + 97 | " and start with 50 milliseconds and increases to the maximum time defined by this configuration") 98 | .create() 99 | 100 | // Retry exponential backoff that starts with 50 millisec 101 | // Default 13 is set to make total wait around 5 minutes with max sleep being 60 seconds 102 | val LOCK_NUM_RETRIES = SparkAcidConfigBuilder[Int]("spark.hiveAcid.lock.max.retries") 103 | .defaultValue("13") 104 | .converter(toInt) 105 | .description("Maximum retries to acquire a lock; Lock retries are based on exponential backoff " + 106 | "that start with 50 milliseconds") 107 | .create() 108 | 109 | val PARALLEL_PARTITION_THRESHOLD = SparkAcidConfigBuilder[Long]("spark.hiveAcid.parallel.partitioning.threshold") 110 | .defaultValue("10") 111 | .converter(toInt) 112 | .description("Threshold for number of RDDs for a partitioned table," + 113 | " after which Spark Job will be spawn to compute RDD splits(i.e., partitions) in parallel" + 114 | " Note that every partition in a table becomes one RDD ") 115 | .create() 116 | 117 | def toBoolean(s: String, key: String): Boolean = { 118 | try { 119 | s.trim.toBoolean 120 | } catch { 121 | case _: IllegalArgumentException => 122 | throw new IllegalArgumentException(s"$key should be boolean, but was $s") 123 | } 124 | } 125 | 126 | def toLong(s: String, key: String): Long = { 127 | try { 128 | s.trim.toLong 129 | } catch { 130 | case _: IllegalArgumentException => 131 | throw new IllegalArgumentException(s"$key should be Long, but was $s") 132 | } 133 | } 134 | 135 | def toInt(s: String, key: String): Int = { 136 | try { 137 | s.trim.toInt 138 | } catch { 139 | case _: IllegalArgumentException => 140 | throw new IllegalArgumentException(s"$key should be Int, but was $s") 141 | } 142 | } 143 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/datasource/HiveAcidDataSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
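// Illustrative usage sketch, assuming an active SparkSession named `spark`: the entries above are
// resolved per-query through SparkAcidConf, falling back to their declared defaults when the key
// is not present in the session conf.
spark.conf.set("spark.sql.hiveAcid.enablePredicatePushdown", "false")
spark.conf.set("spark.hiveAcid.lock.max.retries", "20")
val acidConf = SparkAcidConf(spark, Map("includeRowIds" -> "true"))
acidConf.predicatePushdownEnabled   // false, overridden above
acidConf.lockNumRetries             // 20
acidConf.includeRowIds              // true, read from the parameters map rather than the SQL conf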
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.datasource 21 | 22 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable} 23 | import com.qubole.spark.hiveacid.streaming.HiveAcidSink 24 | 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.sql._ 27 | import org.apache.spark.sql.execution.streaming.Sink 28 | import org.apache.spark.sql.sources._ 29 | import org.apache.spark.sql.streaming.OutputMode 30 | 31 | /** 32 | * HiveAcid Data source implementation. 33 | */ 34 | class HiveAcidDataSource 35 | extends RelationProvider // USING HiveAcid 36 | with CreatableRelationProvider // Insert into/overwrite 37 | with DataSourceRegister // FORMAT("HiveAcid") 38 | with StreamSinkProvider 39 | with Logging { 40 | 41 | // returns relation for passed in table name 42 | override def createRelation(sqlContext: SQLContext, 43 | parameters: Map[String, String]): BaseRelation = { 44 | HiveAcidRelation(sqlContext.sparkSession, getFullyQualifiedTableName(parameters), parameters) 45 | } 46 | 47 | // returns relation after writing passed in data frame. Table name is part of parameter 48 | override def createRelation(sqlContext: SQLContext, 49 | mode: SaveMode, 50 | parameters: Map[String, String], 51 | df: DataFrame): BaseRelation = { 52 | 53 | val hiveAcidTable: HiveAcidTable = HiveAcidTable.fromSparkSession( 54 | sqlContext.sparkSession, 55 | getFullyQualifiedTableName(parameters), 56 | parameters) 57 | 58 | mode match { 59 | case SaveMode.Overwrite => 60 | hiveAcidTable.insertOverwrite(df) 61 | case SaveMode.Append => 62 | hiveAcidTable.insertInto(df) 63 | // TODO: Add support for these 64 | case SaveMode.ErrorIfExists | SaveMode.Ignore => 65 | HiveAcidErrors.unsupportedSaveMode(mode) 66 | } 67 | createRelation(sqlContext, parameters) 68 | } 69 | 70 | override def shortName(): String = { 71 | HiveAcidDataSource.NAME 72 | } 73 | 74 | override def createSink(sqlContext: SQLContext, 75 | parameters: Map[String, String], 76 | partitionColumns: Seq[String], 77 | outputMode: OutputMode): Sink = { 78 | 79 | tableSinkAssertions(partitionColumns, outputMode) 80 | 81 | new HiveAcidSink(sqlContext.sparkSession, parameters) 82 | } 83 | 84 | private def tableSinkAssertions(partitionColumns: Seq[String], outputMode: OutputMode): Unit = { 85 | 86 | if (partitionColumns.nonEmpty) { 87 | throw HiveAcidErrors.unsupportedFunction("partitionBy", "HiveAcidSink") 88 | } 89 | if (outputMode != OutputMode.Append) { 90 | throw HiveAcidErrors.unsupportedStreamingOutputMode(s"$outputMode") 91 | } 92 | 93 | } 94 | 95 | private def getFullyQualifiedTableName(parameters: Map[String, String]): String = { 96 | parameters.getOrElse("table", { 97 | throw HiveAcidErrors.tableNotSpecifiedException() 98 | }) 99 | } 100 | } 101 | 102 | object HiveAcidDataSource { 103 | val NAME = "HiveAcid" 104 | } 105 | 106 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/datasource/HiveAcidRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
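// Illustrative batch-write sketch (the DataFrame `df` and the table name are assumptions): the
// data source is addressed by its shortName "HiveAcid" and requires the fully qualified table
// name in the "table" option. Per createRelation above, only SaveMode.Append and
// SaveMode.Overwrite are supported; ErrorIfExists and Ignore raise unsupportedSaveMode.
df.write
  .format("HiveAcid")
  .option("table", "default.acid_tbl")
  .mode("append")                    // "overwrite" maps to insertOverwrite
  .save()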
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.datasource 21 | 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.rdd.RDD 24 | import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession} 25 | import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan} 26 | import org.apache.spark.sql.types._ 27 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf} 28 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 29 | import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert} 30 | import org.apache.spark.sql.catalyst.AliasIdentifier 31 | import org.apache.spark.sql.catalyst.expressions.Expression 32 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 33 | 34 | import collection.JavaConversions._ 35 | 36 | /** 37 | * Container for all metadata, configuration and schema to perform operations on 38 | * Hive ACID datasource. This provides for plumbing most of the heavy lifting is 39 | * performed inside HiveAcidtTable. 40 | * 41 | * @param sparkSession Spark Session object 42 | * @param fullyQualifiedTableName Table name for the data source. 43 | * @param parameters user provided parameters required for reading and writing, 44 | * including configuration 45 | */ 46 | case class HiveAcidRelation(sparkSession: SparkSession, 47 | fullyQualifiedTableName: String, 48 | parameters: Map[String, String]) 49 | extends BaseRelation 50 | with InsertableRelation 51 | with PrunedFilteredScan 52 | with Logging { 53 | 54 | private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession( 55 | sparkSession, 56 | fullyQualifiedTableName 57 | ) 58 | private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession, 59 | hiveAcidMetadata, parameters) 60 | 61 | private val readOptions = SparkAcidConf(sparkSession, parameters) 62 | 63 | override def sqlContext: SQLContext = sparkSession.sqlContext 64 | 65 | override val schema: StructType = if (readOptions.includeRowIds) { 66 | hiveAcidMetadata.tableSchemaWithRowId 67 | } else { 68 | hiveAcidMetadata.tableSchema 69 | } 70 | 71 | override def insert(data: DataFrame, overwrite: Boolean): Unit = { 72 | // sql insert into and overwrite 73 | if (overwrite) { 74 | hiveAcidTable.insertOverwrite(data) 75 | } else { 76 | hiveAcidTable.insertInto(data) 77 | } 78 | } 79 | 80 | def update(condition: Option[Column], newValues: Map[String, Column]): Unit = { 81 | hiveAcidTable.update(condition, newValues) 82 | } 83 | 84 | def delete(condition: Column): Unit = { 85 | hiveAcidTable.delete(condition) 86 | } 87 | override def sizeInBytes: Long = { 88 | val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor 89 | (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong 90 | } 91 | 92 | def merge(sourceDf: DataFrame, 93 | mergeExpression: Expression, 94 | matchedClause: Seq[MergeWhenClause], 95 | notMatched: Option[MergeWhenNotInsert], 96 | sourceAlias: Option[AliasIdentifier], 97 | targetAlias: Option[AliasIdentifier]): Unit = { 98 | 
hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause, 99 | notMatched, sourceAlias, targetAlias) 100 | } 101 | 102 | def getHiveAcidTable(): HiveAcidTable = { 103 | hiveAcidTable 104 | } 105 | 106 | // FIXME: should it be true / false. Recommendation seems to 107 | // be to leave it as true 108 | override val needConversion: Boolean = false 109 | 110 | override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { 111 | val readOptions = SparkAcidConf(sparkSession, parameters) 112 | // sql "select *" 113 | hiveAcidTable.getRdd(requiredColumns, filters, readOptions) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/hive/.gitignore: -------------------------------------------------------------------------------- 1 | *.scalae 2 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/hive/HiveAcidMetadata.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
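// Illustrative read sketch (table name is hypothetical): this relation backs
// spark.read.format("HiveAcid"). Passing includeRowIds=true switches the exposed schema to
// tableSchemaWithRowId, making the ACID rowId struct selectable alongside the table columns.
val acidDf = spark.read
  .format("HiveAcid")
  .option("table", "default.acid_tbl")
  .option("includeRowIds", "true")
  .load()
acidDf.printSchema()                 // includes the extra rowId struct column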
16 | */ 17 | 18 | package com.qubole.spark.hiveacid.hive 19 | 20 | import java.lang.reflect.InvocationTargetException 21 | import java.util.Locale 22 | 23 | import scala.collection.JavaConversions._ 24 | import scala.collection.mutable 25 | import com.qubole.shaded.hadoop.hive.conf.HiveConf 26 | import com.qubole.shaded.hadoop.hive.ql.io.RecordIdentifier 27 | import com.qubole.shaded.hadoop.hive.ql.metadata 28 | import com.qubole.shaded.hadoop.hive.ql.metadata.Hive 29 | import com.qubole.shaded.hadoop.hive.ql.plan.TableDesc 30 | import com.qubole.spark.hiveacid.util.Util 31 | import com.qubole.spark.hiveacid.HiveAcidErrors 32 | import org.apache.hadoop.fs.Path 33 | import org.apache.hadoop.hive.metastore.api.MetaException 34 | import org.apache.hadoop.io.Writable 35 | import org.apache.hadoop.mapred.{InputFormat, OutputFormat} 36 | import org.apache.spark.internal.Logging 37 | import org.apache.spark.sql._ 38 | import org.apache.spark.sql.types._ 39 | 40 | /** 41 | * Represents metadata for hive acid table and exposes API to perform operations on top of it 42 | * @param sparkSession - spark session object 43 | * @param fullyQualifiedTableName - the fully qualified hive acid table name 44 | */ 45 | class HiveAcidMetadata(sparkSession: SparkSession, 46 | fullyQualifiedTableName: String) extends Logging { 47 | 48 | // hive conf 49 | private val hiveConf: HiveConf = HiveConverter.getHiveConf(sparkSession.sparkContext) 50 | 51 | // a hive representation of the table 52 | val hTable: metadata.Table = { 53 | val hive: Hive = Hive.get(hiveConf) 54 | val table = sparkSession.sessionState.sqlParser.parseTableIdentifier(fullyQualifiedTableName) 55 | val hTable = hive.getTable( 56 | table.database match { 57 | case Some(database) => database 58 | case None => HiveAcidMetadata.DEFAULT_DATABASE 59 | }, table.identifier) 60 | Hive.closeCurrent() 61 | hTable 62 | } 63 | 64 | if (hTable.getParameters.get("transactional") != "true") { 65 | throw HiveAcidErrors.tableNotAcidException(hTable.getFullyQualifiedName) 66 | } 67 | 68 | val isFullAcidTable: Boolean = hTable.getParameters.containsKey("transactional_properties") && 69 | !hTable.getParameters.get("transactional_properties").equals("insert_only") 70 | val isInsertOnlyTable: Boolean = !isFullAcidTable 71 | val isBucketed: Boolean = hTable.getBucketCols() != null && hTable.getBucketCols.size() > 0 72 | 73 | // Table properties 74 | val isPartitioned: Boolean = hTable.isPartitioned 75 | val rootPath: Path = hTable.getDataLocation 76 | val dbName: String = hTable.getDbName 77 | val tableName: String = hTable.getTableName 78 | val fullyQualifiedName: String = hTable.getFullyQualifiedName 79 | 80 | // Schema properties 81 | val dataSchema = StructType(hTable.getSd.getCols.toList.map( 82 | HiveConverter.getCatalystStructField).toArray) 83 | 84 | val partitionSchema = StructType(hTable.getPartitionKeys.toList.map( 85 | HiveConverter.getCatalystStructField).toArray) 86 | 87 | val tableSchema: StructType = { 88 | val overlappedPartCols = mutable.Map.empty[String, StructField] 89 | partitionSchema.foreach { partitionField => 90 | if (dataSchema.exists(getColName(_) == getColName(partitionField))) { 91 | overlappedPartCols += getColName(partitionField) -> partitionField 92 | } 93 | } 94 | StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++ 95 | partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f)))) 96 | } 97 | 98 | val tableSchemaWithRowId: StructType = { 99 | StructType( 100 | Seq( 101 | 
StructField(HiveAcidMetadata.rowIdCol, HiveAcidMetadata.rowIdSchema) 102 | ) ++ tableSchema.fields) 103 | } 104 | 105 | lazy val tableDesc: TableDesc = { 106 | val inputFormatClass: Class[InputFormat[Writable, Writable]] = 107 | Util.classForName(hTable.getInputFormatClass.getName, 108 | loadShaded = true).asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]] 109 | val outputFormatClass: Class[OutputFormat[Writable, Writable]] = 110 | Util.classForName(hTable.getOutputFormatClass.getName, 111 | loadShaded = true).asInstanceOf[java.lang.Class[OutputFormat[Writable, Writable]]] 112 | new TableDesc( 113 | inputFormatClass, 114 | outputFormatClass, 115 | hTable.getMetadata) 116 | } 117 | 118 | /** 119 | * Returns list of partitions satisfying partition predicates 120 | * @param partitionFilters - filters to apply 121 | */ 122 | def getRawPartitions(partitionFilters: Option[String] = None): Seq[metadata.Partition] = { 123 | val hive: Hive = Hive.get(hiveConf) 124 | val prunedPartitions = try { 125 | partitionFilters match { 126 | case Some(filter) => hive.getPartitionsByFilter(hTable, filter) 127 | case None => hive.getPartitions(hTable) 128 | } 129 | } finally { 130 | Hive.closeCurrent() 131 | } 132 | logDebug(s"partition count = ${prunedPartitions.size()}") 133 | prunedPartitions.toSeq 134 | } 135 | 136 | private def getColName(field: StructField): String = { 137 | HiveAcidMetadata.getColName(sparkSession, field) 138 | } 139 | } 140 | 141 | object HiveAcidMetadata { 142 | val DEFAULT_DATABASE = "default" 143 | 144 | val rowIdCol = "rowId" 145 | val rowIdSchema: StructType = { 146 | StructType( 147 | RecordIdentifier.Field.values().map { 148 | field => 149 | StructField( 150 | name = field.name(), 151 | dataType = HiveConverter.getCatalystType(field.fieldType.getTypeName), 152 | nullable = true) 153 | } 154 | ) 155 | } 156 | 157 | def fromSparkSession(sparkSession: SparkSession, 158 | fullyQualifiedTableName: String): HiveAcidMetadata = { 159 | new HiveAcidMetadata( 160 | sparkSession, 161 | fullyQualifiedTableName) 162 | } 163 | 164 | def getColName(sparkSession: SparkSession, field: StructField): String = { 165 | if (sparkSession.sessionState.conf.caseSensitiveAnalysis) { 166 | field.name 167 | } else { 168 | field.name.toLowerCase(Locale.ROOT) 169 | } 170 | } 171 | 172 | def getColNames(sparkSession: SparkSession, schema: StructType): Seq[String] = { 173 | schema.map(getColName(sparkSession, _)) 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/hive/HiveConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
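// Illustrative sketch (table and partition-column names are hypothetical): HiveAcidMetadata is
// the entry point for the table-level information consumed by the reader and writer paths.
val meta = HiveAcidMetadata.fromSparkSession(spark, "default.acid_tbl")
val fullAcid = meta.isFullAcidTable          // false for insert-only (MM) tables
val schema = meta.tableSchema                // data columns followed by partition columns
val location = meta.rootPath                 // table root on the filesystem
// Partition pruning pushes a Hive filter string down to the metastore:
val prunedPartitions = meta.getRawPartitions(Some("ptn = '2019'"))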
16 | */ 17 | 18 | package com.qubole.spark.hiveacid.hive 19 | 20 | import java.sql.{Date, Timestamp} 21 | import java.util.Locale 22 | 23 | import com.qubole.shaded.hadoop.hive.conf.HiveConf 24 | import com.qubole.shaded.hadoop.hive.metastore.api.FieldSchema 25 | import org.apache.commons.lang3.StringUtils 26 | import org.apache.spark.internal.Logging 27 | import org.apache.spark.{SparkContext, SparkException} 28 | import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} 29 | import org.apache.spark.sql.sources._ 30 | import org.apache.spark.sql.types._ 31 | 32 | import scala.collection.JavaConversions._ 33 | 34 | /** 35 | * Encapsulates everything (extensions, workarounds, quirks) to handle the 36 | * SQL dialect conversion between catalyst and hive. 37 | */ 38 | private[hiveacid] object HiveConverter extends Logging { 39 | 40 | def getCatalystStructField(hc: FieldSchema): StructField = { 41 | val columnType = getCatalystType(hc.getType) 42 | val metadata = if (hc.getType != columnType.catalogString) { 43 | new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build() 44 | } else { 45 | Metadata.empty 46 | } 47 | 48 | val field = StructField( 49 | name = hc.getName, 50 | dataType = columnType, 51 | nullable = true, 52 | metadata = metadata) 53 | Option(hc.getComment).map(field.withComment).getOrElse(field) 54 | } 55 | 56 | def getCatalystType(dataType: String): DataType = { 57 | try { 58 | CatalystSqlParser.parseDataType(dataType) 59 | } catch { 60 | case e: ParseException => 61 | throw new SparkException("Cannot recognize hive type string: " + dataType, e) 62 | } 63 | } 64 | 65 | def getHiveConf(sparkContext: SparkContext): HiveConf = { 66 | val hiveConf = new HiveConf() 67 | (sparkContext.hadoopConfiguration.iterator().map(kv => kv.getKey -> kv.getValue) 68 | ++ sparkContext.getConf.getAll.toMap).foreach { case (k, v) => 69 | logDebug( 70 | s""" 71 | |Applying Hadoop/Hive/Spark and extra properties to Hive Conf: 72 | |$k=${if (k.toLowerCase(Locale.ROOT).contains("password")) "xxx" else v} 73 | """.stripMargin) 74 | hiveConf.set(k, v) 75 | } 76 | hiveConf 77 | } 78 | 79 | /** 80 | * Escape special characters in SQL string literals. 81 | * 82 | * @param value The string to be escaped. 83 | * @return Escaped string. 84 | */ 85 | private def escapeSql(value: String): String = { 86 | // TODO: how to handle null 87 | StringUtils.replace(value, "'", "''") 88 | } 89 | 90 | /** 91 | * Converts value to SQL expression. 92 | * @param value The value to be converted. 93 | * @return Converted value. 94 | */ 95 | private def compileValue(value: Any): Any = value match { 96 | case stringValue: String => s"'${escapeSql(stringValue)}'" 97 | case timestampValue: Timestamp => "'" + timestampValue + "'" 98 | case dateValue: Date => "'" + dateValue + "'" 99 | case arrayValue: Array[Any] => arrayValue.map(compileValue).mkString(", ") 100 | case _ => value 101 | } 102 | 103 | /** 104 | * Turns a single Filter into a String representing a SQL expression. 105 | * Returns None for an unhandled filter. 
106 | */ 107 | def compileFilter(f: Filter): Option[String] = Option(x = f match { 108 | case EqualTo(attr, value) => s"$attr = ${compileValue(value)}" 109 | case EqualNullSafe(attr, value) => 110 | val col = attr 111 | s"(NOT ($col != ${compileValue(value)} OR $col = 'NULL' OR " + 112 | s"${compileValue(value)} = 'NULL') OR " + 113 | s"($col = 'NULL' AND ${compileValue(value)} = 'NULL'))" 114 | case LessThan(attr, value) => s"$attr < ${compileValue(value)}" 115 | case GreaterThan(attr, value) => s"$attr > ${compileValue(value)}" 116 | case LessThanOrEqual(attr, value) => s"$attr <= ${compileValue(value)}" 117 | case GreaterThanOrEqual(attr, value) => s"$attr >= ${compileValue(value)}" 118 | // These clauses throw in Hive MS when filtering the partitions 119 | //case IsNull(attr) => s"$attr = 'NULL'" 120 | //case IsNotNull(attr) => s"$attr != 'NULL'" 121 | case StringStartsWith(attr, value) => s"$attr LIKE '$value%'" 122 | case StringEndsWith(attr, value) => s"$attr LIKE '%$value'" 123 | case StringContains(attr, value) => s"$attr LIKE '%$value%'" 124 | case In(attr, value) => s"$attr IN (${compileValue(value)})" 125 | case Not(`f`) => compileFilter(f).map(p => s"(NOT ($p))").orNull 126 | case Or(f1, f2) => 127 | // We can't compile Or filter unless both sub-filters are compiled successfully. 128 | // It applies too for the following And filter. 129 | // If we can make sure compileFilter supports all filters, we can remove this check. 130 | val or = Seq(f1, f2) flatMap compileFilter 131 | if (or.size == 2) { 132 | or.map(p => s"($p)").mkString(" OR ") 133 | } else null 134 | case And(f1, f2) => 135 | val and = Seq(f1, f2).flatMap(compileFilter) 136 | if (and.size == 2) { 137 | and.map(p => s"($p)").mkString(" AND ") 138 | } else null 139 | case _ => null 140 | }) 141 | 142 | 143 | def compileFilters(filters: Seq[Filter]): String = { 144 | val str = filters.flatMap(compileFilter).mkString(" and ") 145 | logDebug(str) 146 | str 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
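// Illustrative sketch, callable only from code inside com.qubole.spark.hiveacid since the object
// is package-private: Spark data source Filters are rendered into a metastore filter string that
// getRawPartitions can push down (column names are assumptions).
import org.apache.spark.sql.sources.{EqualTo, GreaterThan}
val filterString = HiveConverter.compileFilters(
  Seq(EqualTo("country", "IN"), GreaterThan("year", 2018)))
// filterString == "country = 'IN' and year > 2018"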
18 | */ 19 | package com.qubole.spark 20 | 21 | import org.apache.spark.sql._ 22 | 23 | package object hiveacid { 24 | implicit class HiveAcidDataFrameReader(reader: DataFrameReader) { 25 | def hiveacid(table: String, options: Map[String, String] = Map.empty): DataFrame = { 26 | reader.format("HiveAcid").option("table", table) 27 | .options(options).load() 28 | } 29 | } 30 | 31 | implicit class HiveAcidDataFrameWriter[T](writer: DataFrameWriter[T]) { 32 | def hiveacid(table: String, saveMode: String, options: Map[String, String] = Map.empty): Unit = { 33 | writer.format("HiveAcid").option("table", table) 34 | .options(options).mode(saveMode).save() 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/rdd/EmptyRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.rdd 21 | 22 | import scala.reflect.ClassTag 23 | 24 | import org.apache.spark.{Partition, SparkContext, TaskContext} 25 | import org.apache.spark.rdd.RDD 26 | 27 | private[hiveacid] class EmptyRDD[T: ClassTag](sc: SparkContext) extends RDD[T](sc, Nil) { 28 | 29 | override def getPartitions: Array[Partition] = Array.empty 30 | 31 | override def compute(split: Partition, context: TaskContext): Iterator[T] = { 32 | throw new UnsupportedOperationException("empty RDD") 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/rdd/HiveAcidUnionRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
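// Usage sketch for the implicit syntax defined in the package object above (table and column
// names are assumptions; `spark` is an active SparkSession):
import com.qubole.spark.hiveacid._
val df = spark.read.hiveacid("default.acid_tbl", Map("includeRowIds" -> "false"))
df.filter("value > 10").write.hiveacid("default.acid_tbl_copy", "append")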
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.rdd 21 | 22 | import com.qubole.spark.hiveacid.SparkAcidConf 23 | import com.qubole.spark.hiveacid.reader.hive.HiveAcidPartitionComputer 24 | 25 | import scala.reflect.ClassTag 26 | import org.apache.spark._ 27 | import org.apache.spark.rdd.{RDD, UnionRDD} 28 | 29 | /** 30 | * A Hive3RDD is created for each of the hive partition of the table. But at the end the buildScan 31 | * is supposed to return only 1 RDD for entire table. So we have to create UnionRDD for it. 32 | * 33 | * This class extends UnionRDD and makes sure that we acquire read lock once for all the 34 | * partitions of the table 35 | 36 | * @param sc - sparkContext 37 | * @param rddSeq - underlying partition RDDs 38 | * @param hiveSplitInfo - It is sequence of HiveSplitInfo. 39 | * It would be derived from the list of HiveAcidRDD passed here. 40 | * check HiveAcidRDD.getHiveSplitsInfo 41 | */ 42 | private[hiveacid] class HiveAcidUnionRDD[T: ClassTag]( 43 | sc: SparkContext, 44 | rddSeq: Seq[RDD[T]], 45 | //TODO: We should clean so that HiveSplitInfo need not have to be passed separately. 46 | hiveSplitInfo: Seq[HiveSplitInfo]) extends UnionRDD[T](sc, rddSeq) { 47 | 48 | private val ignoreMissingFiles = 49 | super.sparkContext.getConf.getBoolean("spark.files.ignoreMissingFiles", defaultValue = false) 50 | 51 | private val ignoreEmptySplits = 52 | super.sparkContext.getConf.getBoolean("spark.hadoopRDD.ignoreEmptySplits", defaultValue = false) 53 | 54 | private val parallelPartitionThreshold = 55 | super.sparkContext.getConf.getInt(SparkAcidConf.PARALLEL_PARTITION_THRESHOLD.configName, 10) 56 | 57 | override def getPartitions: Array[Partition] = { 58 | if (hiveSplitInfo.length > parallelPartitionThreshold) { 59 | val partitions = hiveSplitInfo.length/parallelPartitionThreshold 60 | val hiveSplitRDD = super.sparkContext.parallelize(hiveSplitInfo, partitions) 61 | val hiveAcidPartitionComputer = new HiveAcidPartitionComputer(ignoreEmptySplits, ignoreMissingFiles) 62 | // It spawns a spark job to compute Partitions for every RDD and stores it in cache. 63 | hiveAcidPartitionComputer.computeHiveSplitsAndCache(hiveSplitRDD) 64 | } 65 | super.getPartitions 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/.gitignore: -------------------------------------------------------------------------------- 1 | *.scalae 2 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
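// Illustrative sketch: HiveAcidUnionRDD reads the threshold from the SparkContext's SparkConf,
// so it has to be supplied when the session is built (the value 50 is an assumption; the
// declared default in SparkAcidConf is 10).
val session = org.apache.spark.sql.SparkSession.builder()
  .config("spark.hiveAcid.parallel.partitioning.threshold", "50")
  .enableHiveSupport()
  .getOrCreate()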
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader 21 | 22 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 23 | 24 | import org.apache.spark.rdd.RDD 25 | import org.apache.spark.sql.catalyst.InternalRow 26 | 27 | private[reader] trait Reader { 28 | def makeRDDForTable(hiveAcidMetadata: HiveAcidMetadata): RDD[InternalRow] 29 | def makeRDDForPartitionedTable(hiveAcidMetadata: HiveAcidMetadata, 30 | partitions: Seq[ReaderPartition]): RDD[InternalRow] 31 | } 32 | 33 | private[reader] case class ReaderPartition(ptn: Any) 34 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/ReaderOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader 21 | 22 | import com.qubole.spark.hiveacid.SparkAcidConf 23 | import org.apache.hadoop.conf.Configuration 24 | 25 | import org.apache.spark.sql.catalyst.expressions.Attribute 26 | import org.apache.spark.sql.sources.Filter 27 | 28 | /** 29 | * Reader options which will be serialized and sent to each executor 30 | */ 31 | private[hiveacid] class ReaderOptions(val hadoopConf: Configuration, 32 | val partitionAttributes: Seq[Attribute], 33 | val requiredAttributes: Seq[Attribute], 34 | val dataFilters: Array[Filter], 35 | val requiredNonPartitionedColumns: Array[String], 36 | val sessionLocalTimeZone: String, 37 | val readConf: SparkAcidConf) extends Serializable 38 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/TableReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
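// Hypothetical sketch, only to show the shape of the Reader trait above (the shipped
// implementation is HiveAcidReader; this no-op variant is not part of the source tree):
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.rdd.EmptyRDD

private[reader] class NoopReader(sc: SparkContext) extends Reader {
  override def makeRDDForTable(hiveAcidMetadata: HiveAcidMetadata): RDD[InternalRow] =
    new EmptyRDD[InternalRow](sc)
  override def makeRDDForPartitionedTable(hiveAcidMetadata: HiveAcidMetadata,
                                          partitions: Seq[ReaderPartition]): RDD[InternalRow] =
    new EmptyRDD[InternalRow](sc)
}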
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader 21 | 22 | import com.qubole.spark.hiveacid.{HiveAcidOperation, SparkAcidConf} 23 | import com.qubole.spark.hiveacid.transaction._ 24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 25 | import com.qubole.spark.hiveacid.reader.hive.{HiveAcidReader, HiveAcidReaderOptions} 26 | 27 | import org.apache.spark.internal.Logging 28 | import org.apache.spark.rdd.RDD 29 | import org.apache.spark.sql.{Row, SparkSession} 30 | import org.apache.spark.sql.catalyst.expressions._ 31 | import org.apache.spark.sql.sources.Filter 32 | 33 | /** 34 | * Table reader object 35 | * 36 | * @param sparkSession - Spark session 37 | * @param curTxn - Transaction object to acquire locks. 38 | * @param hiveAcidMetadata - Hive acid table for which read is to be performed. 39 | */ 40 | private[hiveacid] class TableReader(sparkSession: SparkSession, 41 | curTxn: HiveAcidTxn, 42 | hiveAcidMetadata: HiveAcidMetadata) extends Logging { 43 | 44 | def getRdd(requiredColumns: Array[String], 45 | filters: Array[Filter], 46 | readConf: SparkAcidConf): RDD[Row] = { 47 | val rowIdColumnSet = HiveAcidMetadata.rowIdSchema.fields.map(_.name).toSet 48 | val requiredColumnsWithoutRowId = requiredColumns.filterNot(rowIdColumnSet.contains) 49 | val partitionColumnNames = hiveAcidMetadata.partitionSchema.fields.map(_.name) 50 | val partitionedColumnSet = partitionColumnNames.toSet 51 | 52 | // Attributes 53 | val requiredNonPartitionedColumns = requiredColumnsWithoutRowId.filter( 54 | x => !partitionedColumnSet.contains(x)) 55 | 56 | val requiredAttributes = if (!readConf.includeRowIds) { 57 | requiredColumnsWithoutRowId.map { 58 | x => 59 | val field = hiveAcidMetadata.tableSchema.fields.find(_.name == x).get 60 | PrettyAttribute(field.name, field.dataType) 61 | } 62 | } else { 63 | requiredColumns.map { 64 | x => 65 | val field = hiveAcidMetadata.tableSchemaWithRowId.fields.find(_.name == x).get 66 | PrettyAttribute(field.name, field.dataType) 67 | } 68 | } 69 | val partitionAttributes = hiveAcidMetadata.partitionSchema.fields.map { x => 70 | PrettyAttribute(x.name, x.dataType) 71 | } 72 | 73 | // Filters 74 | val (partitionFilters, otherFilters) = filters.partition { predicate => 75 | !predicate.references.isEmpty && 76 | predicate.references.toSet.subsetOf(partitionedColumnSet) 77 | } 78 | val dataFilters = otherFilters.filter(_ 79 | .references.intersect(partitionColumnNames).isEmpty 80 | ) 81 | 82 | logDebug(s"total filters : ${filters.length}: " + 83 | s"dataFilters: ${dataFilters.length} " + 84 | s"partitionFilters: ${partitionFilters.length}") 85 | 86 | val hadoopConf = sparkSession.sessionState.newHadoopConf() 87 | 88 | logDebug(s"sarg.pushdown: ${hadoopConf.get("sarg.pushdown")}," + 89 | s"hive.io.file.readcolumn.names: ${hadoopConf.get("hive.io.file.readcolumn.names")}, " + 90 | s"hive.io.file.readcolumn.ids: ${hadoopConf.get("hive.io.file.readcolumn.ids")}") 91 | 92 | val readerOptions = new ReaderOptions(hadoopConf, 93 | partitionAttributes, 94 | requiredAttributes, 95 | dataFilters, 96 | requiredNonPartitionedColumns, 97 | sparkSession.sessionState.conf.sessionLocalTimeZone, 98 | readConf) 99 | 100 | val hiveAcidReaderOptions= HiveAcidReaderOptions.get(hiveAcidMetadata) 101 | 102 | val (partitions, partitionList) = HiveAcidReader.getPartitions(hiveAcidMetadata, 103 | readerOptions, 104 | partitionFilters) 105 | 106 | // Acquire lock on all the 
partition and then create snapshot. Every time getRDD is called 107 | // it creates a new snapshot. 108 | // NB: partitionList is Seq if partition pruning is not enabled 109 | curTxn.acquireLocks(hiveAcidMetadata, HiveAcidOperation.READ, partitionList, readConf) 110 | 111 | // Create Snapshot !!! 112 | //val curSnapshot = HiveAcidTxn.createSnapshot(curTxn, hiveAcidMetadata) 113 | 114 | val validWriteIds = HiveAcidTxn.getValidWriteIds(curTxn, hiveAcidMetadata) 115 | 116 | val reader = new HiveAcidReader( 117 | sparkSession, 118 | readerOptions, 119 | hiveAcidReaderOptions, 120 | validWriteIds) 121 | 122 | val rdd = if (hiveAcidMetadata.isPartitioned) { 123 | reader.makeRDDForPartitionedTable(hiveAcidMetadata, partitions) 124 | } else { 125 | reader.makeRDDForTable(hiveAcidMetadata) 126 | } 127 | 128 | rdd.asInstanceOf[RDD[Row]] 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidPartitionComputer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
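// Illustrative sketch of the filter split performed in getRdd above: predicates referencing only
// partition columns are pushed to the metastore for pruning, the rest stay as data filters
// (column names are assumptions).
import org.apache.spark.sql.sources.{EqualTo, Filter, GreaterThan}
val partitionedColumnSet = Set("ptn")
val filters: Array[Filter] = Array(EqualTo("ptn", "2019"), GreaterThan("value", 10))
val (partitionFilters, otherFilters) = filters.partition { predicate =>
  predicate.references.nonEmpty && predicate.references.toSet.subsetOf(partitionedColumnSet)
}
// partitionFilters -> Array(EqualTo(ptn,2019)); otherFilters -> Array(GreaterThan(value,10))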
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader.hive 21 | 22 | import java.util.concurrent.{ConcurrentHashMap, TimeUnit} 23 | 24 | import com.qubole.shaded.hadoop.hive.common.{ValidReaderWriteIdList, ValidWriteIdList} 25 | import com.qubole.spark.hiveacid.rdd.{HiveAcidPartition, HiveAcidRDD, HiveSplitInfo} 26 | import com.qubole.spark.hiveacid.reader.hive.HiveAcidPartitionComputer.{addToPartitionCache, getInputFormat} 27 | import com.qubole.spark.hiveacid.util.Util 28 | import org.apache.hadoop.conf.Configurable 29 | import org.apache.hadoop.fs.Path 30 | import org.apache.hadoop.io.Writable 31 | import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, InvalidInputException, JobConf} 32 | import org.apache.hadoop.util.ReflectionUtils 33 | import org.apache.spark.deploy.SparkHadoopUtil 34 | import org.apache.spark.internal.Logging 35 | import org.apache.spark.rdd.RDD 36 | 37 | private[hiveacid] case class HiveAcidPartitionComputer(ignoreEmptySplits: Boolean, 38 | ignoreMissingFiles: Boolean) extends Logging { 39 | def getPartitions[K, V](id: Int, jobConf: JobConf, 40 | inputFormat: InputFormat[K, V], 41 | minPartitions: Int): Array[HiveAcidPartition] = { 42 | // add the credentials here as this can be called before SparkContext initialized 43 | SparkHadoopUtil.get.addCredentials(jobConf) 44 | try { 45 | val allInputSplits = inputFormat.getSplits(jobConf, minPartitions) 46 | val inputSplits = if (ignoreEmptySplits) { 47 | allInputSplits.filter(_.getLength > 0) 48 | } else { 49 | allInputSplits 50 | } 51 | val array = new Array[HiveAcidPartition](inputSplits.length) 52 | for (i <- inputSplits.indices) { 53 | array(i) = new HiveAcidPartition(id, i, inputSplits(i)) 54 | } 55 | array 56 | } catch { 57 | case e: InvalidInputException if ignoreMissingFiles => 58 | val inputDir = jobConf.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR) 59 | logWarning(s"$inputDir doesn't exist and no" + 60 | s" partitions returned from this path.", e) 61 | Array.empty[HiveAcidPartition] 62 | } 63 | } 64 | 65 | // needs to be invoked just once as its an expensive operation. 
66 | def computeHiveSplitsAndCache(splitRDD: RDD[HiveSplitInfo]): Unit = { 67 | val start = System.nanoTime() 68 | logInfo("Spawning job to compute partitions for ACID table RDD") 69 | val splits = splitRDD.map { 70 | case HiveSplitInfo(id, broadcastedConf, 71 | validWriteIdList, minPartitions, ifcName, isFullAcidTable, shouldCloneJobConf, initLocalJobConfFuncOpt) => 72 | val jobConf = HiveAcidRDD.setInputPathToJobConf( 73 | Some(HiveAcidRDD.getJobConf(broadcastedConf, shouldCloneJobConf, initLocalJobConfFuncOpt)), 74 | isFullAcidTable, 75 | new ValidReaderWriteIdList(validWriteIdList), 76 | broadcastedConf, 77 | shouldCloneJobConf, 78 | initLocalJobConfFuncOpt) 79 | val partitions = this.getPartitions[Writable, Writable](id, jobConf, getInputFormat(jobConf, ifcName), minPartitions) 80 | (partitions, FileInputFormat.getInputPaths(jobConf), validWriteIdList) 81 | }.collect() 82 | 83 | splits.foreach { 84 | case (partitions: Array[HiveAcidPartition], 85 | paths: Array[Path], validWriteIdList: String) => 86 | addToPartitionCache(paths, validWriteIdList, partitions) 87 | } 88 | logInfo(s"Job to compute partitions took: " + 89 | s"${TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - start)} seconds") 90 | } 91 | } 92 | 93 | private[hiveacid] object HiveAcidPartitionComputer extends Logging { 94 | object Cache { 95 | val partitionCache = new ConcurrentHashMap[SplitCacheKey, Array[HiveAcidPartition]]() 96 | case class SplitCacheKey(paths: Set[Path], validWriteIdList: String) 97 | } 98 | 99 | def getFromSplitsCache(paths: Array[Path], validWriteIdList: ValidWriteIdList): Option[Array[HiveAcidPartition]] = { 100 | Option(Cache.partitionCache.get(Cache.SplitCacheKey(paths.toSet, validWriteIdList.writeToString()))) 101 | } 102 | 103 | def removeFromSplitsCache(paths: Array[Path], validWriteIdList: ValidWriteIdList): Unit = { 104 | Cache.partitionCache.remove(Cache.SplitCacheKey(paths.toSet, validWriteIdList.writeToString())) 105 | } 106 | 107 | def addToPartitionCache(paths: Array[Path], validWriteIdList: String, inputSplits: Array[HiveAcidPartition]): Unit = { 108 | Cache.partitionCache.put(Cache.SplitCacheKey(paths.toSet, validWriteIdList), inputSplits) 109 | } 110 | 111 | private def getInputFormat(conf: JobConf, inputFormatClassName: String): InputFormat[Writable, Writable] = { 112 | val inputFormatClass = Util.classForName(inputFormatClassName, loadShaded = true) 113 | .asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]] 114 | val newInputFormat = ReflectionUtils.newInstance(inputFormatClass.asInstanceOf[Class[_]], conf) 115 | .asInstanceOf[InputFormat[Writable, Writable]] 116 | newInputFormat match { 117 | case c: Configurable => c.setConf(conf) 118 | case _ => 119 | } 120 | newInputFormat 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidReaderOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader.hive 21 | 22 | import com.qubole.shaded.hadoop.hive.ql.plan.TableDesc 23 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 24 | import org.apache.spark.sql.types.StructType 25 | 26 | private[reader] class HiveAcidReaderOptions(val tableDesc: TableDesc, 27 | val isFullAcidTable: Boolean, 28 | val dataSchema: StructType) 29 | 30 | private[reader] object HiveAcidReaderOptions { 31 | def get(hiveAcidMetadata: HiveAcidMetadata): HiveAcidReaderOptions = { 32 | new HiveAcidReaderOptions(hiveAcidMetadata.tableDesc, hiveAcidMetadata.isFullAcidTable, 33 | hiveAcidMetadata.dataSchema) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | 21 | package com.qubole.spark.hiveacid.streaming 22 | 23 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable} 24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 25 | import org.apache.hadoop.fs.Path 26 | import org.apache.spark.internal.Logging 27 | import org.apache.spark.sql.{DataFrame, SparkSession} 28 | import org.apache.spark.sql.execution.streaming.Sink 29 | 30 | 31 | class HiveAcidSink(sparkSession: SparkSession, 32 | parameters: Map[String, String]) extends Sink with Logging { 33 | 34 | import HiveAcidSink._ 35 | 36 | private val acidSinkOptions = new HiveAcidSinkOptions(parameters) 37 | 38 | private val fullyQualifiedTableName = acidSinkOptions.tableName 39 | 40 | private val hiveAcidTable: HiveAcidTable = HiveAcidTable.fromSparkSession( 41 | sparkSession, 42 | fullyQualifiedTableName, 43 | parameters) 44 | 45 | assertNonBucketedTable() 46 | 47 | private val logPath = getMetaDataPath() 48 | private val fileLog = new HiveAcidSinkLog( 49 | HiveAcidSinkLog.VERSION, sparkSession, logPath.toUri.toString, acidSinkOptions) 50 | 51 | private def assertNonBucketedTable(): Unit = { 52 | if(hiveAcidTable.isBucketed) { 53 | throw HiveAcidErrors.unsupportedOperationTypeBucketedTable("Streaming Write", fullyQualifiedTableName) 54 | } 55 | } 56 | 57 | private def getMetaDataPath(): Path = { 58 | acidSinkOptions.metadataDir match { 59 | case Some(dir) => 60 | new Path(dir) 61 | case None => 62 | logInfo(s"Metadata dir not specified. Using " + 63 | s"$metadataDirPrefix/_query_default as metadata dir") 64 | logWarning(s"Please make sure that multiple streaming writes to " + 65 | s"$fullyQualifiedTableName are not running") 66 | val tableLocation = HiveAcidMetadata.fromSparkSession( 67 | sparkSession, fullyQualifiedTableName).rootPath 68 | new Path(tableLocation, s"$metadataDirPrefix/_query_default") 69 | } 70 | } 71 | 72 | /** 73 | * Adds the batch to the sink. Each batch is transactional in itself 74 | * @param batchId batch to add 75 | * @param df dataframe to add as part of batch 76 | */ 77 | override def addBatch(batchId: Long, df: DataFrame): Unit = { 78 | 79 | if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) { 80 | logInfo(s"Skipping already committed batch $batchId") 81 | } else { 82 | 83 | val commitProtocol = new HiveAcidStreamingCommitProtocol(fileLog) 84 | val txnId = hiveAcidTable.addBatch(df) 85 | commitProtocol.commitJob(batchId, txnId) 86 | } 87 | 88 | } 89 | 90 | override def toString: String = s"HiveAcidSinkV1[$fullyQualifiedTableName]" 91 | 92 | } 93 | 94 | object HiveAcidSink { 95 | 96 | val metadataDirPrefix = "_acid_streaming" 97 | } 98 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkLog.scala: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Copyright 2019 Qubole, Inc. All rights reserved. 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
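// Illustrative structured-streaming sketch (the streaming DataFrame, table name and paths are
// assumptions): the sink above supports only OutputMode.Append and rejects partitionBy.
val query = streamingDf.writeStream
  .format("HiveAcid")
  .option("table", "default.acid_events")
  .option("spark.acid.streaming.log.metadataDir", "/tmp/acid_sink_metadata")
  .option("checkpointLocation", "/tmp/acid_sink_checkpoint")
  .outputMode("append")
  .start()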
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | package com.qubole.spark.hiveacid.streaming 22 | 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog 25 | 26 | case class HiveAcidSinkStatus(txnId: Long, action: String) 27 | 28 | class HiveAcidSinkLog(version: Int, 29 | sparkSession: SparkSession, 30 | path: String, 31 | options: HiveAcidSinkOptions) 32 | extends CompactibleFileStreamLog[HiveAcidSinkStatus](version, sparkSession, path) { 33 | 34 | protected override val fileCleanupDelayMs = options.fileCleanupDelayMs 35 | 36 | protected override val isDeletingExpiredLog = options.isDeletingExpiredLog 37 | 38 | protected override val defaultCompactInterval = options.compactInterval 39 | 40 | protected override val minBatchesToRetain = options.minBatchesToRetain 41 | 42 | override def compactLogs(logs: Seq[HiveAcidSinkStatus]): Seq[HiveAcidSinkStatus] = { 43 | val deletedFiles = logs.filter(_.action == HiveAcidSinkLog.DELETE_ACTION).map(_.txnId).toSet 44 | if (deletedFiles.isEmpty) { 45 | logs 46 | } else { 47 | logs.filter(f => !deletedFiles.contains(f.txnId)) 48 | } 49 | } 50 | 51 | } 52 | 53 | object HiveAcidSinkLog { 54 | 55 | val VERSION = 1 56 | val DELETE_ACTION = "delete" 57 | val ADD_ACTION = "add" 58 | 59 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
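// Illustrative sketch of the compaction rule above: any txnId that appears with a "delete"
// action is dropped from the compacted log together with its original "add" entry.
val entries = Seq(
  HiveAcidSinkStatus(1L, HiveAcidSinkLog.ADD_ACTION),
  HiveAcidSinkStatus(2L, HiveAcidSinkLog.ADD_ACTION),
  HiveAcidSinkStatus(1L, HiveAcidSinkLog.DELETE_ACTION))
// compactLogs(entries) on a HiveAcidSinkLog instance would retain only the entry for txnId 2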
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.streaming 21 | 22 | import java.util.concurrent.TimeUnit 23 | 24 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 25 | 26 | import scala.util.Try 27 | 28 | class HiveAcidSinkOptions(parameters: CaseInsensitiveMap[String]) { 29 | 30 | import HiveAcidSinkOptions._ 31 | 32 | def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters)) 33 | 34 | val tableName = parameters.get("table").getOrElse{ 35 | throw new IllegalArgumentException("Table Name is not specified") 36 | } 37 | 38 | val fileCleanupDelayMs = withLongParameter(CLEANUP_DELAY_KEY, DEFAULT_CLEANUP_DELAY) 39 | 40 | val isDeletingExpiredLog = withBooleanParameter(LOG_DELETION_KEY, DEFAULT_LOG_DELETION) 41 | 42 | val compactInterval = withIntParameter(COMPACT_INTERVAL_KEY, DEFAULT_COMPACT_INTERVAL) 43 | 44 | val minBatchesToRetain = withIntParameter(MIN_BATCHES_TO_RETAIN_KEY, DEFAULT_MIN_BATCHES_TO_RETAIN) 45 | 46 | val metadataDir = parameters.get(METADATA_DIR_KEY) 47 | 48 | private def withIntParameter(name: String, default: Int): Int = { 49 | parameters.get(name).map { str => 50 | Try(str.toInt).toOption.filter(_ > 0).getOrElse { 51 | throw new IllegalArgumentException( 52 | s"Invalid value '$str' for option '$name', must be a positive integer") 53 | } 54 | }.getOrElse(default) 55 | } 56 | 57 | private def withLongParameter(name: String, default: Long): Long = { 58 | parameters.get(name).map { str => 59 | Try(str.toLong).toOption.filter(_ >= 0).getOrElse { 60 | throw new IllegalArgumentException( 61 | s"Invalid value '$str' for option '$name', must be a positive integer") 62 | } 63 | }.getOrElse(default) 64 | } 65 | 66 | private def withBooleanParameter(name: String, default: Boolean): Boolean = { 67 | parameters.get(name).map { str => 68 | try { 69 | str.toBoolean 70 | } catch { 71 | case _: IllegalArgumentException => 72 | throw new IllegalArgumentException( 73 | s"Invalid value '$str' for option '$name', must be true or false") 74 | } 75 | }.getOrElse(default) 76 | } 77 | 78 | } 79 | 80 | object HiveAcidSinkOptions { 81 | 82 | val DEFAULT_CLEANUP_DELAY = TimeUnit.MINUTES.toMillis(10) 83 | val DEFAULT_LOG_DELETION = true 84 | val DEFAULT_COMPACT_INTERVAL = 10 85 | val DEFAULT_MIN_BATCHES_TO_RETAIN = 100 86 | 87 | val CLEANUP_DELAY_KEY = "spark.acid.streaming.log.cleanupDelayMs" 88 | val LOG_DELETION_KEY = "spark.acid.streaming.log.deletion" 89 | val COMPACT_INTERVAL_KEY = "spark.acid.streaming.log.compactInterval" 90 | val MIN_BATCHES_TO_RETAIN_KEY = "spark.acid.streaming.log.minBatchesToRetain" 91 | val METADATA_DIR_KEY = "spark.acid.streaming.log.metadataDir" 92 | 93 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidStreamingCommitProtocol.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
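// Illustrative sketch (option values are assumptions): parameters are parsed case-insensitively
// and fall back to the defaults declared in the companion object above.
val sinkOptions = new HiveAcidSinkOptions(Map(
  "table" -> "default.acid_events",
  HiveAcidSinkOptions.COMPACT_INTERVAL_KEY -> "20",
  HiveAcidSinkOptions.LOG_DELETION_KEY -> "false"))
sinkOptions.compactInterval        // 20
sinkOptions.minBatchesToRetain     // 100, the default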
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.qubole.spark.hiveacid.streaming 19 | 20 | import org.apache.spark.internal.Logging 21 | 22 | class HiveAcidStreamingCommitProtocol(fileLog: HiveAcidSinkLog) extends Serializable with Logging { 23 | 24 | import HiveAcidStreamingCommitProtocol._ 25 | 26 | def commitJob(batchId: Long, txnId: Long): Unit = { 27 | 28 | def commitJobRetry(retryRemaining: Int, f: () => Unit): Boolean = { 29 | var retry = false 30 | try { 31 | f() 32 | } 33 | catch { 34 | case ie: IllegalStateException if ie.getMessage.contains("Race while writing batch") => 35 | throw ie 36 | case e: Exception => 37 | if (retryRemaining > 0) { 38 | logError(s"Unexpected error while writing commit file for batch $batchId ... " + 39 | s"Retrying", e) 40 | retry = true 41 | } else { 42 | logError(s"Unexpected error while writing commit file for batch $batchId ... " + 43 | s"Max retries reached", e) 44 | throw e 45 | } 46 | } 47 | retry 48 | } 49 | 50 | val array = Array(HiveAcidSinkStatus(txnId, HiveAcidSinkLog.ADD_ACTION)) 51 | 52 | val commitJobAttempt = () => { 53 | if (fileLog.add(batchId, array)) { 54 | logInfo(s"Committed batch $batchId") 55 | } else { 56 | throw new IllegalStateException(s"Race while writing batch $batchId") 57 | } 58 | } 59 | 60 | var sleepSec = 1 61 | var retryRemaining = MAX_COMMIT_JOB_RETRIES - 1 62 | while (commitJobRetry(retryRemaining, commitJobAttempt)) { 63 | retryRemaining = retryRemaining - 1 64 | Thread.sleep(sleepSec * 1000) 65 | sleepSec = sleepSec * EXPONENTIAL_BACK_OFF_FACTOR 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HiveAcidStreamingCommitProtocol { 73 | 74 | val MAX_COMMIT_JOB_RETRIES = 3 75 | val EXPONENTIAL_BACK_OFF_FACTOR = 2 76 | 77 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/transaction/HiveAcidTxn.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.qubole.spark.hiveacid.transaction 19 | 20 | import java.util.concurrent.atomic.AtomicBoolean 21 | 22 | import com.qubole.shaded.hadoop.hive.common.{ValidTxnList, ValidWriteIdList} 23 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidOperation, SparkAcidConf} 24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.sql.SparkSession 27 | 28 | /** 29 | * Hive Acid Transaction object. 30 | * @param sparkSession: Spark Session 31 | */ 32 | class HiveAcidTxn(sparkSession: SparkSession) extends Logging { 33 | 34 | HiveAcidTxn.setUpTxnManager(sparkSession) 35 | 36 | // txn ID 37 | protected var id: Long = -1 38 | protected var validTxnList: ValidTxnList = _ 39 | private [hiveacid] val isClosed: AtomicBoolean = new AtomicBoolean(true) 40 | 41 | private def setTxn(id: Long, txns:ValidTxnList): Unit = { 42 | this.id = id 43 | this.validTxnList = txns 44 | isClosed.set(false) 45 | } 46 | 47 | private def unsetTxn(): Unit = { 48 | this.id = -1 49 | this.validTxnList = null 50 | isClosed.set(true) 51 | } 52 | 53 | override def toString: String = s"""{"id":"$id","validTxns":"$validTxnList"}""" 54 | 55 | /** 56 | * Public API to begin a transaction. 57 | */ 58 | def begin(): Unit = synchronized { 59 | if (!isClosed.get) { 60 | throw HiveAcidErrors.txnAlreadyOpen(id) 61 | } 62 | val newId = HiveAcidTxn.txnManager.beginTxn(this) 63 | val txnList = HiveAcidTxn.txnManager.getValidTxns(Some(newId)) 64 | setTxn(newId, txnList) 65 | // Set it for the thread for all future references. 66 | HiveAcidTxn.threadLocal.set(this) 67 | logDebug(s"Begin transaction $this") 68 | } 69 | 70 | /** 71 | * Public API to end a transaction. 72 | * @param abort true if transaction is aborted 73 | */ 74 | def end(abort: Boolean = false): Unit = synchronized { 75 | if (isClosed.get) { 76 | throw HiveAcidErrors.txnAlreadyClosed(id) 77 | } 78 | 79 | logDebug(s"End transaction $this abort = $abort") 80 | // NB: Proactively unset it for the thread, irrespective of whether 81 | // the underlying call fails or succeeds.
82 | HiveAcidTxn.threadLocal.set(null) 83 | HiveAcidTxn.txnManager.endTxn(id, abort) 84 | unsetTxn() 85 | } 86 | 87 | private[hiveacid] def acquireLocks(hiveAcidMetadata: HiveAcidMetadata, 88 | operationType: HiveAcidOperation.OperationType, 89 | partitionNames: Seq[String], 90 | conf: SparkAcidConf): Unit = { 91 | if (isClosed.get()) { 92 | logError(s"Transaction already closed $this") 93 | throw HiveAcidErrors.txnAlreadyClosed(id) 94 | } 95 | HiveAcidTxn.txnManager.acquireLocks(id, hiveAcidMetadata.dbName, 96 | hiveAcidMetadata.tableName, operationType, partitionNames, hiveAcidMetadata.isPartitioned, conf) 97 | } 98 | 99 | private[hiveacid] def addDynamicPartitions(writeId: Long, 100 | dbName: String, 101 | tableName: String, 102 | operationType: HiveAcidOperation.OperationType, 103 | partitions: Set[String]) = { 104 | if (isClosed.get()) { 105 | logError(s"Transaction already closed $this") 106 | throw HiveAcidErrors.txnAlreadyClosed(id) 107 | } 108 | logDebug(s"Adding dynamic partition txnId: $id writeId: $writeId dbName: $dbName" + 109 | s" tableName: $tableName partitions: ${partitions.mkString(",")}") 110 | HiveAcidTxn.txnManager.addDynamicPartitions(id, writeId, dbName, 111 | tableName, partitions, operationType) 112 | } 113 | // Public Interface 114 | def txnId: Long = id 115 | } 116 | 117 | object HiveAcidTxn extends Logging { 118 | 119 | val threadLocal = new ThreadLocal[HiveAcidTxn] 120 | 121 | // Helper function to create snapshot. 122 | private[hiveacid] def createSnapshot(txn: HiveAcidTxn, hiveAcidMetadata: HiveAcidMetadata): HiveAcidTableSnapshot = { 123 | val currentWriteId = txnManager.getCurrentWriteId(txn.txnId, 124 | hiveAcidMetadata.dbName, hiveAcidMetadata.tableName) 125 | val validWriteIdList: ValidWriteIdList = getValidWriteIds(txn, hiveAcidMetadata) 126 | HiveAcidTableSnapshot(validWriteIdList, currentWriteId) 127 | } 128 | 129 | private[hiveacid] def getValidWriteIds(txn: HiveAcidTxn, hiveAcidMetadata: HiveAcidMetadata) = { 130 | val validWriteIdList = if (txn.txnId == -1) { 131 | throw HiveAcidErrors.tableWriteIdRequestedBeforeTxnStart(hiveAcidMetadata.fullyQualifiedName) 132 | } else { 133 | txnManager.getValidWriteIds(txn.txnId, txn.validTxnList, hiveAcidMetadata.fullyQualifiedName) 134 | } 135 | validWriteIdList 136 | } 137 | 138 | // The txn manager is the connection to the HMS. Use a single instance of it. 139 | var txnManager: HiveAcidTxnManager = _ 140 | private def setUpTxnManager(sparkSession: SparkSession): Unit = synchronized { 141 | if (txnManager == null) { 142 | txnManager = new HiveAcidTxnManager(sparkSession) 143 | } 144 | } 145 | 146 | /** 147 | * Creates a read or write transaction based on the user request. 148 | * 149 | * @param sparkSession Spark session used to create the new Hive ACID transaction 150 | * @return 151 | */ 152 | def createTransaction(sparkSession: SparkSession): HiveAcidTxn = { 153 | setUpTxnManager(sparkSession) 154 | new HiveAcidTxn(sparkSession) 155 | } 156 | 157 | /** 158 | * Returns the HiveAcidTxn associated with the current thread, if any (null otherwise). 159 | * @return 160 | */ 161 | def currentTxn(): HiveAcidTxn = { 162 | threadLocal.get() 163 | } 164 | 165 | /** 166 | * Check whether the valid write IDs for `fullyQualifiedTableName` observed when `txn` was opened 167 | * are still the same now. This should be invoked after `txn` acquires locks, to see 168 | * if the transaction is still valid before continuing.
169 | */ 170 | def IsTxnStillValid(txn: HiveAcidTxn, fullyQualifiedTableName: String): Boolean = { 171 | if (txn.txnId == - 1) { 172 | logWarning(s"Transaction being validated even before it was open") 173 | false 174 | } else { 175 | // Compare the earlier writeIds of fullyQualifiedTableName with the current one. 176 | val previousWriteIdList = txnManager.getValidWriteIds(txn.txnId, txn.validTxnList, fullyQualifiedTableName) 177 | val currentValidList = txnManager.getValidTxns(Some(txn.txnId)) 178 | val currentWriteIdList = txnManager.getValidWriteIds(txn.txnId, currentValidList, fullyQualifiedTableName) 179 | // Checks if any new write transaction was started and committed 180 | // after opening transaction and before acquiring locks using HighWaterMark 181 | if (previousWriteIdList.getHighWatermark == currentWriteIdList.getHighWatermark) { 182 | // Check all the open transactions when current transaction was opened, 183 | // are still invalid i.e., either running/open or aborted. 184 | val prevOpenInvalidWriteIds = previousWriteIdList.getInvalidWriteIds 185 | .filter(!previousWriteIdList.isWriteIdAborted(_)).toSet 186 | val currentInvalidWriteIds = currentWriteIdList.getInvalidWriteIds.toSet 187 | // Previous open transactions should still be invalid 188 | if (prevOpenInvalidWriteIds.isEmpty || 189 | prevOpenInvalidWriteIds.diff(currentInvalidWriteIds).isEmpty) { 190 | logDebug("All previous open transactions are still invalid! Transaction is valid!") 191 | true 192 | } else { 193 | logWarning("Prev Open transactions: " + prevOpenInvalidWriteIds.diff(currentInvalidWriteIds).mkString(", ") 194 | + " have been committed. Transaction " + txn.txnId + " is not valid !") 195 | false 196 | } 197 | } else { 198 | logWarning("HighWatermark moved from " + 199 | previousWriteIdList.getHighWatermark + " to " + 200 | currentWriteIdList.getHighWatermark + 201 | ". Transaction " + txn.txnId + " is not valid !") 202 | false 203 | } 204 | } 205 | } 206 | } 207 | 208 | private[hiveacid] case class HiveAcidTableSnapshot(validWriteIdList: ValidWriteIdList, currentWriteId: Long) 209 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/.gitignore: -------------------------------------------------------------------------------- 1 | *.scalae 2 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/HiveAcidKyroRegistrator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.util 21 | 22 | import com.esotericsoftware.kryo.Kryo 23 | import org.apache.spark.serializer.KryoRegistrator 24 | import com.esotericsoftware.kryo.serializers.JavaSerializer 25 | 26 | class HiveAcidKyroRegistrator extends KryoRegistrator { 27 | override def registerClasses(kryo: Kryo): Unit = { 28 | kryo.register(classOf[com.qubole.spark.hiveacid.util.SerializableConfiguration], new JavaSerializer) 29 | } 30 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/SerializableConfiguration.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.util 21 | 22 | import java.io.{ObjectInputStream, ObjectOutputStream} 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | 26 | /** 27 | * Utility class to make configuration object serializable 28 | */ 29 | private[hiveacid] class SerializableConfiguration(@transient var value: Configuration) 30 | extends Serializable { 31 | private def writeObject(out: ObjectOutputStream): Unit = Util.tryOrIOException { 32 | out.defaultWriteObject() 33 | value.write(out) 34 | } 35 | 36 | private def readObject(in: ObjectInputStream): Unit = Util.tryOrIOException { 37 | value = new Configuration(false) 38 | value.readFields(in) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/SerializableWritable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.util 21 | 22 | import java.io._ 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.io.ObjectWritable 26 | import org.apache.hadoop.io.Writable 27 | 28 | /** 29 | * Utility class to make a Writable serializable 30 | */ 31 | private[hiveacid] class SerializableWritable[T <: Writable](@transient var t: T) 32 | extends Serializable { 33 | 34 | def value: T = t 35 | 36 | override def toString: String = t.toString 37 | 38 | private def writeObject(out: ObjectOutputStream): Unit = Util.tryOrIOException { 39 | out.defaultWriteObject() 40 | new ObjectWritable(t).write(out) 41 | } 42 | 43 | private def readObject(in: ObjectInputStream): Unit = Util.tryOrIOException { 44 | in.defaultReadObject() 45 | val ow = new ObjectWritable() 46 | ow.setConf(new Configuration(false)) 47 | ow.readFields(in) 48 | t = ow.get().asInstanceOf[T] 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/Util.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.util 21 | 22 | import java.io.IOException 23 | 24 | import org.apache.spark.internal.Logging 25 | 26 | import scala.util.control.NonFatal 27 | 28 | private[hiveacid] object Util extends Logging { 29 | 30 | def classForName(className: String, loadShaded: Boolean = false): Class[_] = { 31 | val classToLoad = if (loadShaded) { 32 | className.replaceFirst("org.apache.hadoop.hive.", "com.qubole.shaded.hadoop.hive.") 33 | } else { 34 | className 35 | } 36 | Class.forName(classToLoad, true, Thread.currentThread().getContextClassLoader) 37 | } 38 | 39 | /** 40 | * Detect whether this thread might be executing a shutdown hook. Will always return true if 41 | * the current thread is a running a shutdown hook but may spuriously return true otherwise (e.g. 42 | * if System.exit was just called by a concurrent thread). 43 | * 44 | * Currently, this detects whether the JVM is shutting down by Runtime#addShutdownHook throwing 45 | * an IllegalStateException. 
46 | */ 47 | def inShutdown(): Boolean = { 48 | try { 49 | val hook: Thread = new Thread { 50 | override def run() {} 51 | } 52 | // scalastyle:off runtimeaddshutdownhook 53 | Runtime.getRuntime.addShutdownHook(hook) 54 | // scalastyle:on runtimeaddshutdownhook 55 | Runtime.getRuntime.removeShutdownHook(hook) 56 | } catch { 57 | case _: IllegalStateException => return true 58 | } 59 | false 60 | } 61 | 62 | def tryOrIOException[T](block: => T): T = { 63 | try { 64 | block 65 | } catch { 66 | case e: IOException => 67 | logError("Exception encountered", e) 68 | throw e 69 | case NonFatal(e) => 70 | logError("Exception encountered", e) 71 | throw new IOException(e) 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/writer/Writer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.writer 21 | 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec 24 | 25 | private[hiveacid] trait Writer { 26 | def process(row: InternalRow): Unit 27 | def close(): Unit 28 | def partitionsTouched(): Seq[TablePartitionSpec] 29 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/writer/WriterOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.writer 21 | 22 | import com.qubole.spark.hiveacid.HiveAcidOperation 23 | import com.qubole.spark.hiveacid.util.SerializableConfiguration 24 | import org.apache.spark.sql.catalyst.expressions.Attribute 25 | import org.apache.spark.sql.types.StructType 26 | 27 | /** 28 | * Writer options which will be serialized and sent to each executor 29 | */ 30 | private[hiveacid] class WriterOptions(val currentWriteId: Long, 31 | val operationType: HiveAcidOperation.OperationType, 32 | val serializableHadoopConf: SerializableConfiguration, 33 | val tableSchemaWithrowID: StructType, 34 | val dataColumns: Seq[Attribute], 35 | val partitionColumns: Seq[Attribute], 36 | val allColumns: Seq[Attribute], 37 | val timeZoneId: String, 38 | val statementId: Option[Int] = None) extends Serializable 39 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/writer/hive/HiveAcidWriterOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.writer.hive 21 | 22 | import com.qubole.shaded.hadoop.hive.ql.plan.FileSinkDesc 23 | import com.qubole.spark.hiveacid.HiveAcidOperation 24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 25 | import com.qubole.spark.hiveacid.writer.WriterOptions 26 | import org.apache.hadoop.fs.Path 27 | 28 | private[writer] class HiveAcidWriterOptions(val rootPath: String, 29 | fileSinkDesc: FileSinkDesc) extends Serializable { 30 | lazy val getFileSinkDesc: FileSinkDesc = { 31 | fileSinkDesc.setDirName(new Path(rootPath)) 32 | fileSinkDesc 33 | } 34 | } 35 | 36 | private[writer] object HiveAcidWriterOptions { 37 | def get(hiveAcidMetadata: HiveAcidMetadata, 38 | options: WriterOptions): HiveAcidWriterOptions = { 39 | lazy val fileSinkDescriptor: FileSinkDesc = { 40 | val fileSinkDesc: FileSinkDesc = new FileSinkDesc() 41 | fileSinkDesc.setTableInfo(hiveAcidMetadata.tableDesc) 42 | fileSinkDesc.setTableWriteId(options.currentWriteId) 43 | if (options.operationType == HiveAcidOperation.INSERT_OVERWRITE) { 44 | fileSinkDesc.setInsertOverwrite(true) 45 | } 46 | if (options.statementId.isDefined) { 47 | fileSinkDesc.setStatementId(options.statementId.get) 48 | } 49 | fileSinkDesc 50 | } 51 | new HiveAcidWriterOptions(rootPath = hiveAcidMetadata.rootPath.toUri.toString, 52 | fileSinkDesc = fileSinkDescriptor) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/SqlUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package org.apache.spark.sql 21 | 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.catalyst.InternalRow 24 | import org.apache.spark.sql.catalyst.analysis._ 25 | import org.apache.spark.sql.catalyst.catalog.HiveTableRelation 26 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 27 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} 28 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression} 29 | import org.apache.spark.sql.execution.LogicalRDD 30 | import org.apache.spark.sql.execution.datasources.LogicalRelation 31 | import org.apache.spark.sql.types.StructType 32 | 33 | object SqlUtils { 34 | def convertToDF(sparkSession: SparkSession, plan : LogicalPlan): DataFrame = { 35 | Dataset.ofRows(sparkSession, plan) 36 | } 37 | 38 | def resolveReferences(sparkSession: SparkSession, 39 | expr: Expression, 40 | planContaining: LogicalPlan, failIfUnresolved: Boolean, 41 | exprName: Option[String] = None): Expression = { 42 | resolveReferences(sparkSession, expr, Seq(planContaining), failIfUnresolved, exprName) 43 | } 44 | 45 | def resolveReferences(sparkSession: SparkSession, 46 | expr: Expression, 47 | planContaining: Seq[LogicalPlan], 48 | failIfUnresolved: Boolean, 49 | exprName: Option[String]): Expression = { 50 | val newPlan = FakeLogicalPlan(expr, planContaining) 51 | val resolvedExpr = sparkSession.sessionState.analyzer.execute(newPlan) match { 52 | case FakeLogicalPlan(resolvedExpr: Expression, _) => 53 | // Return even if it did not successfully resolve 54 | resolvedExpr 55 | case _ => 56 | expr 57 | // This is unexpected 58 | } 59 | if (failIfUnresolved) { 60 | resolvedExpr.flatMap(_.references).filter(!_.resolved).foreach { 61 | attr => { 62 | val failedMsg = exprName match { 63 | case Some(name) => s"${attr.sql} resolution in $name given these columns: "+ 64 | planContaining.flatMap(_.output).map(_.name).mkString(",") 65 | case _ => s"${attr.sql} resolution failed given these columns: "+ 66 | planContaining.flatMap(_.output).map(_.name).mkString(",") 67 | } 68 | attr.failAnalysis(failedMsg) 69 | } 70 | } 71 | } 72 | resolvedExpr 73 | } 74 | 75 | def hasSparkStopped(sparkSession: SparkSession): Boolean = { 76 | sparkSession.sparkContext.stopped.get() 77 | } 78 | 79 | /** 80 | * Qualify all the column names in the DF. 
81 | * Attributes used in DF output will have fully qualified names 82 | * @param sparkSession 83 | * @param df DataFrame created by reading ACID table 84 | * @param fullyQualifiedTableName Qualified name of the Hive ACID Table 85 | * @return 86 | */ 87 | def getDFQualified(sparkSession: SparkSession, 88 | df: DataFrame, 89 | fullyQualifiedTableName: String) = { 90 | val plan = df.queryExecution.analyzed 91 | val qualifiedPlan = plan match { 92 | case p: LogicalRelation => 93 | p.copy(output = p.output 94 | .map((x: AttributeReference) => 95 | x.withQualifier(fullyQualifiedTableName.split('.').toSeq)) 96 | ) 97 | case h: HiveTableRelation => 98 | // Qualify data and partition columns in a single copy(); two chained copy() 99 | // calls would silently discard the dataCols qualification. 100 | h.copy(dataCols = h.dataCols 101 | .map((x: AttributeReference) => 102 | x.withQualifier(fullyQualifiedTableName.split('.').toSeq)), 103 | partitionCols = h.partitionCols 104 | .map((x: AttributeReference) => 105 | x.withQualifier(fullyQualifiedTableName.split('.').toSeq))) 106 | case _ => plan 107 | } 108 | 109 | val newDf = SqlUtils.convertToDF(sparkSession, qualifiedPlan) 110 | (qualifiedPlan, newDf) 111 | } 112 | 113 | def logicalPlanToDataFrame(sparkSession: SparkSession, 114 | logicalPlan: LogicalPlan): DataFrame = { 115 | Dataset.ofRows(sparkSession, logicalPlan) 116 | } 117 | 118 | /** 119 | * Convert RDD into DataFrame using the attributeList. 120 | * Based on [[SparkSession.createDataFrame()]] implementation but here, 121 | * attributes are provided. 122 | * @param sparkSession 123 | * @param rdd 124 | * @param schema 125 | * @param attributes 126 | * @return 127 | */ 128 | def createDataFrameUsingAttributes(sparkSession: SparkSession, 129 | rdd: RDD[Row], 130 | schema: StructType, 131 | attributes: Seq[Attribute]): DataFrame = { 132 | val encoder = RowEncoder(schema) 133 | val catalystRows = rdd.map(encoder.toRow) 134 | val logicalPlan = LogicalRDD( 135 | attributes, 136 | catalystRows, 137 | isStreaming = false)(sparkSession) 138 | Dataset.ofRows(sparkSession, logicalPlan) 139 | } 140 | 141 | def analysisException(cause: String): Throwable = { 142 | new AnalysisException(cause) 143 | } 144 | 145 | def removeTopSubqueryAlias(logicalPlan: LogicalPlan): LogicalPlan = { 146 | logicalPlan match { 147 | case SubqueryAlias(_, child: LogicalPlan) => child 148 | case _ => logicalPlan 149 | } 150 | } 151 | } 152 | 153 | case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan]) 154 | extends LogicalPlan { 155 | override def output: Seq[Attribute] = children.foldLeft(Seq[Attribute]())((out, child) => out ++ child.output) 156 | } 157 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/catalyst/parser/plans/logical/MergePlan.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.parser.plans.logical 19 | 20 | import com.qubole.spark.hiveacid.merge.{MergeWhenClause} 21 | import org.apache.spark.sql.{SparkSession, SqlUtils} 22 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 23 | import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} 24 | 25 | case class MergePlan(sourcePlan: LogicalPlan, 26 | targetPlan: LogicalPlan, 27 | condition: Expression, 28 | matched: Seq[MergeWhenClause], 29 | notMatched: Option[MergeWhenClause]) extends Command { 30 | override def children: Seq[LogicalPlan] = Seq(sourcePlan, targetPlan) 31 | override def output: Seq[Attribute] = Seq.empty 32 | } 33 | 34 | object MergePlan { 35 | def resolve(sparkSession: SparkSession, mergePlan: MergePlan): MergePlan = { 36 | MergeWhenClause.validate(mergePlan.matched ++ mergePlan.notMatched) 37 | val resolvedCondition = SqlUtils.resolveReferences(sparkSession, mergePlan.condition, 38 | mergePlan.children, true, None) 39 | val resolvedMatched = MergeWhenClause.resolve(sparkSession, mergePlan, mergePlan.matched) 40 | val resolvedNotMatched = mergePlan.notMatched.map { 41 | x => x.resolve(sparkSession, mergePlan) 42 | } 43 | 44 | MergePlan(mergePlan.sourcePlan, 45 | mergePlan.targetPlan, 46 | resolvedCondition, 47 | resolvedMatched, 48 | resolvedNotMatched) 49 | } 50 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hive/HiveAcidUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package org.apache.spark.sql.hive 21 | 22 | import scala.collection.JavaConverters._ 23 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 24 | import org.apache.spark.sql.AnalysisException 25 | import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTablePartition, CatalogUtils} 26 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, Expression, InterpretedPredicate, PrettyAttribute} 27 | 28 | object HiveAcidUtils { 29 | 30 | /** 31 | * This is adapted from [[org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.prunePartitionsByFilter]] 32 | * Instead of [[org.apache.spark.sql.catalyst.catalog.CatalogTable]] this function will be using [[HiveAcidMetadata]] 33 | * @param hiveAcidMetadata 34 | * @param inputPartitions 35 | * @param predicates 36 | * @param defaultTimeZoneId 37 | * @return 38 | */ 39 | def prunePartitionsByFilter( 40 | hiveAcidMetadata: HiveAcidMetadata, 41 | inputPartitions: Seq[CatalogTablePartition], 42 | predicates: Option[Expression], 43 | defaultTimeZoneId: String): Seq[CatalogTablePartition] = { 44 | if (predicates.isEmpty) { 45 | inputPartitions 46 | } else { 47 | val partitionSchema = hiveAcidMetadata.partitionSchema 48 | val partitionColumnNames = hiveAcidMetadata.partitionSchema.fieldNames.toSet 49 | 50 | val nonPartitionPruningPredicates = predicates.filterNot { 51 | _.references.map(_.name).toSet.subsetOf(partitionColumnNames) 52 | } 53 | if (nonPartitionPruningPredicates.nonEmpty) { 54 | throw new AnalysisException("Expected only partition pruning predicates: " + 55 | nonPartitionPruningPredicates) 56 | } 57 | 58 | val boundPredicate = 59 | InterpretedPredicate.create(predicates.get.transform { 60 | case att: Attribute => 61 | val index = partitionSchema.indexWhere(_.name == att.name) 62 | BoundReference(index, partitionSchema(index).dataType, nullable = true) 63 | }) 64 | 65 | inputPartitions.filter { p => 66 | boundPredicate.eval(p.toRow(partitionSchema, defaultTimeZoneId)) 67 | } 68 | } 69 | } 70 | 71 | def convertToCatalogTablePartition(hp: com.qubole.shaded.hadoop.hive.ql.metadata.Partition): CatalogTablePartition = { 72 | val apiPartition = hp.getTPartition 73 | val properties: Map[String, String] = if (hp.getParameters != null) { 74 | hp.getParameters.asScala.toMap 75 | } else { 76 | Map.empty 77 | } 78 | CatalogTablePartition( 79 | spec = Option(hp.getSpec).map(_.asScala.toMap).getOrElse(Map.empty), 80 | storage = CatalogStorageFormat( 81 | locationUri = Option(CatalogUtils.stringToURI(apiPartition.getSd.getLocation)), 82 | inputFormat = Option(apiPartition.getSd.getInputFormat), 83 | outputFormat = Option(apiPartition.getSd.getOutputFormat), 84 | serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib), 85 | compressed = apiPartition.getSd.isCompressed, 86 | properties = Option(apiPartition.getSd.getSerdeInfo.getParameters) 87 | .map(_.asScala.toMap).orNull), 88 | createTime = apiPartition.getCreateTime.toLong * 1000, 89 | lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000, 90 | parameters = properties, 91 | stats = None) // TODO: need to implement readHiveStats 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/scala/com/qubole/spark/hiveacid/merge/MergeClauseSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 
3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.merge 21 | 22 | import org.apache.spark.SparkFunSuite 23 | import org.apache.spark.sql.{AnalysisException, functions} 24 | 25 | class MergeClauseSuite extends SparkFunSuite { 26 | def insertClause(addCondition : Boolean = true): MergeWhenNotInsert = { 27 | if (addCondition) { 28 | MergeWhenNotInsert(Some(functions.expr("x > 2").expr), 29 | Seq(functions.col("x").expr, functions.col("y").expr)) 30 | } 31 | else { 32 | MergeWhenNotInsert(None, 33 | Seq(functions.col("x").expr, functions.col("y").expr)) 34 | } 35 | } 36 | 37 | def updateClause(addCondition : Boolean = true): MergeWhenUpdateClause = { 38 | if (addCondition) { 39 | val updateCondition = Some(functions.expr("a > 2").expr) 40 | MergeWhenUpdateClause(updateCondition, 41 | Map("b" -> functions.lit(3).expr), isStar = false) 42 | } else { 43 | MergeWhenUpdateClause(None, 44 | Map("b" -> functions.lit(3).expr), isStar = false) 45 | } 46 | } 47 | 48 | def deleteClause(addCondition : Boolean = true): MergeWhenDelete = { 49 | if (addCondition) { 50 | MergeWhenDelete(Some(functions.expr("a < 1").expr)) 51 | } else { 52 | MergeWhenDelete(None) 53 | } 54 | } 55 | 56 | test("Validate MergeClauses") { 57 | val clauses = Seq(insertClause(), updateClause(), deleteClause()) 58 | MergeWhenClause.validate(clauses) 59 | } 60 | 61 | test("Invalid MergeClause cases") { 62 | val invalidMerge = "MERGE Validation Error: " 63 | 64 | //empty clauses 65 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.atleastOneClauseError, Seq()) 66 | 67 | // multi update or insert clauses 68 | val multiUpdateClauses = Seq(updateClause(), updateClause(), insertClause()) 69 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.justOneClausePerTypeError, multiUpdateClauses) 70 | 71 | // multi match clauses with first clause without condition 72 | val invalidMultiMatch = Seq(updateClause(false), deleteClause()) 73 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.matchClauseConditionError, invalidMultiMatch) 74 | 75 | // invalid Update Clause 76 | val invalidUpdateClause = MergeWhenUpdateClause(None, Map(), isStar = false) 77 | val thrown = intercept[IllegalArgumentException] { 78 | MergeWhenClause.validate(Seq(invalidUpdateClause)) 79 | } 80 | assert(thrown.getMessage === "UPDATE Clause in MERGE should have one or more SET Values") 81 | } 82 | 83 | private def checkInvalidMergeClause(invalidMessage: String, multiUpdateClauses: Seq[MergeWhenClause]) = { 84 | val thrown = intercept[AnalysisException] { 85 | MergeWhenClause.validate(multiUpdateClauses) 86 | } 87 | assert(thrown.message === invalidMessage) 88 | } 89 | } 90 | 
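Editor's note: the suite above drives the merge clause model directly. For orientation, a minimal sketch (not part of the repository; column names and literal values are illustrative) of how these clause classes are assembled and validated before a MERGE is planned:

import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenDelete, MergeWhenNotInsert, MergeWhenUpdateClause}
import org.apache.spark.sql.functions

// One clause of each kind, mirroring the helpers in MergeClauseSuite.
val update = MergeWhenUpdateClause(Some(functions.expr("a > 2").expr),
  Map("b" -> functions.lit(3).expr), isStar = false)
val delete = MergeWhenDelete(Some(functions.expr("a < 1").expr))
val insert = MergeWhenNotInsert(Some(functions.expr("x > 2").expr),
  Seq(functions.col("x").expr, functions.col("y").expr))

// Rejects invalid combinations: no clauses at all, more than one clause per type,
// or a first MATCHED clause without a condition (AnalysisException), and an UPDATE
// clause with an empty SET list (IllegalArgumentException).
MergeWhenClause.validate(Seq(update, delete, insert))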
-------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "0.6.0" --------------------------------------------------------------------------------
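Closing note: the streaming sink options declared in HiveAcidSinkOptions are ordinary data source options passed through Structured Streaming. Below is a hedged usage sketch; the data source short name "HiveAcid", the table name, and the checkpoint path are assumptions, while the option keys, the mandatory "table" option, and the quoted defaults come from HiveAcidSinkOptions itself.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery

def startAcidSink(events: DataFrame): StreamingQuery = {
  events.writeStream
    .format("HiveAcid")                                           // assumed sink short name
    .option("table", "default.acid_events")                       // required; if missing, an IllegalArgumentException is thrown
    .option("spark.acid.streaming.log.compactInterval", "20")     // default 10
    .option("spark.acid.streaming.log.minBatchesToRetain", "50")  // default 100
    .option("spark.acid.streaming.log.cleanupDelayMs", "300000")  // default 10 minutes
    .option("checkpointLocation", "/tmp/acid-sink-checkpoint")    // standard Spark checkpoint option
    .start()
}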