├── .gitignore ├── LICENSE.txt ├── README.md ├── build.sbt ├── docker ├── README.md ├── beeline ├── build ├── files │ ├── Dockerfile │ ├── bootstrap.sh │ ├── core-site.xml │ ├── hadoop-env.sh │ ├── hdfs-site.xml │ ├── hive-site.xml │ ├── mapred-site.xml │ └── yarn-site.xml ├── inspect ├── login ├── spark-shell ├── start └── stop ├── project ├── build.properties └── plugins.sbt ├── shaded-dependencies ├── build.sbt └── project │ ├── build.properties │ └── plugins.sbt ├── src ├── it │ └── scala │ │ └── com │ │ └── qubole │ │ └── spark │ │ └── hiveacid │ │ ├── LockSuite.scala │ │ ├── MergeSuite.scala │ │ ├── ReadSuite.scala │ │ ├── Table.scala │ │ ├── TestHelper.scala │ │ ├── TestHiveClient.java │ │ ├── TestSparkSession.scala │ │ ├── UpdateDeleteSuite.scala │ │ ├── WriteSuite.scala │ │ └── streaming │ │ ├── HiveAcidSinkLogSuite.scala │ │ ├── HiveAcidSinkOptionsSuite.scala │ │ ├── HiveAcidSinkSuite.scala │ │ ├── HiveAcidStreamingFunSuite.scala │ │ └── StreamingTestHelper.scala ├── main │ ├── antlr4 │ │ └── com │ │ │ └── qubole │ │ │ └── spark │ │ │ └── datasources │ │ │ └── hiveacid │ │ │ └── sql │ │ │ └── catalyst │ │ │ └── parser │ │ │ └── SqlHive.g4 │ ├── resources │ │ └── META-INF │ │ │ └── services │ │ │ └── org.apache.spark.sql.sources.DataSourceRegister │ └── scala │ │ ├── com │ │ └── qubole │ │ │ ├── shaded │ │ │ └── hadoop │ │ │ │ └── hive │ │ │ │ └── ql │ │ │ │ └── io │ │ │ │ └── orc │ │ │ │ └── OrcAcidUtil.scala │ │ │ └── spark │ │ │ ├── datasources │ │ │ └── hiveacid │ │ │ │ └── sql │ │ │ │ ├── HiveAnalysisException.scala │ │ │ │ ├── catalyst │ │ │ │ ├── parser │ │ │ │ │ ├── AstBuilder.scala │ │ │ │ │ └── ParseDriver.scala │ │ │ │ └── plans │ │ │ │ │ └── command │ │ │ │ │ ├── DeleteCommand.scala │ │ │ │ │ ├── MergeCommand.scala │ │ │ │ │ └── UpdateCommand.scala │ │ │ │ └── execution │ │ │ │ ├── SparkAcidSqlParser.scala │ │ │ │ └── SparkSqlAstBuilder.scala │ │ │ └── hiveacid │ │ │ ├── .gitignore │ │ │ ├── AcidOperationDelegate.scala │ │ │ ├── HiveAcidAutoConvert.scala │ │ │ ├── HiveAcidErrors.scala │ │ │ ├── HiveAcidOperation.scala │ │ │ ├── HiveAcidTable.scala │ │ │ ├── SparkAcidConf.scala │ │ │ ├── datasource │ │ │ ├── HiveAcidDataSource.scala │ │ │ └── HiveAcidRelation.scala │ │ │ ├── hive │ │ │ ├── .gitignore │ │ │ ├── HiveAcidMetadata.scala │ │ │ └── HiveConverter.scala │ │ │ ├── merge │ │ │ ├── MergeImpl.scala │ │ │ └── MergeWhenClause.scala │ │ │ ├── package.scala │ │ │ ├── rdd │ │ │ ├── EmptyRDD.scala │ │ │ ├── HiveAcidRDD.scala │ │ │ └── HiveAcidUnionRDD.scala │ │ │ ├── reader │ │ │ ├── .gitignore │ │ │ ├── Reader.scala │ │ │ ├── ReaderOptions.scala │ │ │ ├── TableReader.scala │ │ │ └── hive │ │ │ │ ├── HiveAcidPartitionComputer.scala │ │ │ │ ├── HiveAcidReader.scala │ │ │ │ ├── HiveAcidReaderOptions.scala │ │ │ │ └── HiveAcidSearchArgument.scala │ │ │ ├── streaming │ │ │ ├── HiveAcidSink.scala │ │ │ ├── HiveAcidSinkLog.scala │ │ │ ├── HiveAcidSinkOptions.scala │ │ │ └── HiveAcidStreamingCommitProtocol.scala │ │ │ ├── transaction │ │ │ ├── HiveAcidTxn.scala │ │ │ └── HiveAcidTxnManager.scala │ │ │ ├── util │ │ │ ├── .gitignore │ │ │ ├── HiveAcidKyroRegistrator.scala │ │ │ ├── SerializableConfiguration.scala │ │ │ ├── SerializableWritable.scala │ │ │ └── Util.scala │ │ │ └── writer │ │ │ ├── TableWriter.scala │ │ │ ├── Writer.scala │ │ │ ├── WriterOptions.scala │ │ │ └── hive │ │ │ ├── HiveAcidWriter.scala │ │ │ └── HiveAcidWriterOptions.scala │ │ └── org │ │ └── apache │ │ └── spark │ │ └── sql │ │ ├── SqlUtils.scala │ │ ├── catalyst │ │ └── parser │ │ │ └── plans │ │ │ └── 
logical │ │ │ └── MergePlan.scala │ │ └── hive │ │ ├── Hive3Inspectors.scala │ │ └── HiveAcidUtils.scala └── test │ └── scala │ ├── com │ └── qubole │ │ └── spark │ │ └── hiveacid │ │ ├── merge │ │ └── MergeClauseSuite.scala │ │ └── sql │ │ └── catalyst │ │ └── parser │ │ └── MergeParserSuite.scala │ └── org │ └── apache │ └── spark │ └── sql │ └── catalyst │ └── parser │ └── plans │ └── logical │ └── MergePlanSuite.scala └── version.sbt /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | *.iml 4 | *.ipr 5 | *.iws 6 | .idea 7 | out 8 | .cache/ 9 | .history/ 10 | .lib/ 11 | dist/* 12 | target/ 13 | bin/ 14 | libexec/ 15 | lib_managed/ 16 | src_managed/ 17 | project/boot/ 18 | project/plugins/project/ 19 | logs/ 20 | project/*-shim.sbt 21 | project/project/ 22 | project/target/ 23 | target/ 24 | .scala_dependencies 25 | .worksheet 26 | shaded_dependencies 27 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | name := "spark-acid" 21 | 22 | organization:= "com.qubole" 23 | 24 | /******************* 25 | * Scala settings 26 | */ 27 | 28 | crossScalaVersions := Seq("2.11.12") 29 | 30 | scalaVersion := crossScalaVersions.value.head 31 | 32 | scalacOptions ++= Seq( 33 | "-Xlint", 34 | "-Xfatal-warnings", 35 | "-deprecation", 36 | "-unchecked", 37 | "-optimise", 38 | "-Yinline-warnings" 39 | ) 40 | 41 | scalacOptions in (Compile, doc) ++= Seq( 42 | "-no-link-warnings" // Suppresses problems with Scaladoc @throws links 43 | ) 44 | 45 | /************************** 46 | * Spark package settings 47 | */ 48 | sparkVersion := sys.props.getOrElse("spark.version", "2.4.3") 49 | 50 | spIncludeMaven := true 51 | 52 | spIgnoreProvided := true 53 | 54 | 55 | /************************ 56 | * Library Dependencies 57 | */ 58 | 59 | libraryDependencies ++= Seq( 60 | // Adding test classifier seems to break transitive resolution of the core dependencies 61 | "org.apache.spark" %% "spark-hive" % sparkVersion.value % "provided" excludeAll( 62 | ExclusionRule("org.apache", "hadoop-common"), 63 | ExclusionRule("org.apache", "hadoop-hdfs")), 64 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % "provided" excludeAll( 65 | ExclusionRule("org.apache", "hadoop-common"), 66 | ExclusionRule("org.apache", "hadoop-hdfs")), 67 | "org.apache.spark" %% "spark-core" % sparkVersion.value % "provided" excludeAll( 68 | ExclusionRule("org.apache", "hadoop-common"), 69 | ExclusionRule("org.apache", "hadoop-hdfs")), 70 | "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "provided" excludeAll( 71 | ExclusionRule("org.apache", "hadoop-common"), 72 | ExclusionRule("org.apache", "hadoop-hdfs")), 73 | "org.apache.hadoop" % "hadoop-common" % "2.8.1" % "provided", 74 | "org.apache.hadoop" % "hadoop-hdfs" % "2.8.1" % "provided", 75 | "org.apache.commons" % "commons-lang3" % "3.3.5" % "provided", 76 | // antlr-runtime 77 | "org.antlr" % "antlr4-runtime" % "4.7.2" % "provided" 78 | ) 79 | 80 | lazy val scalatest = "org.scalatest" %% "scalatest" % "3.0.5" 81 | 82 | // Dependencies for Test 83 | libraryDependencies ++= Seq( 84 | "org.apache.hadoop" % "hadoop-common" % "2.8.1" % "provided", 85 | "org.apache.hadoop" % "hadoop-hdfs" % "2.8.1" % "provided", 86 | "org.apache.commons" % "commons-lang3" % "3.3.5" % "provided", 87 | // Dependencies for tests 88 | // 89 | "org.scalatest" %% "scalatest" % "3.0.5" % "test", 90 | "junit" % "junit" % "4.12" % "it,test", 91 | "com.novocode" % "junit-interface" % "0.11" % "it,test", 92 | "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "test" classifier "tests", 93 | "org.apache.spark" %% "spark-core" % sparkVersion.value % "test" classifier "tests", 94 | "org.apache.spark" %% "spark-sql" % sparkVersion.value % "test" classifier "tests" 95 | ) 96 | 97 | // Shaded jar dependency 98 | libraryDependencies ++= Seq( 99 | // intransitive() because we don't want to include any transitive dependencies of shaded-dependencies jar in main jar 100 | // ideally all such dependencies should be shaded inside shaded-dependencies jar 101 | "com.qubole" %% "spark-acid-shaded-dependencies" % sys.props.getOrElse("package.version", "0.1") intransitive() 102 | ) 103 | 104 | /************************************** 105 | * Remove Shaded Depenedency from POM 106 | */ 107 | 108 | import scala.xml.{Node => XmlNode, NodeSeq => XmlNodeSeq, _} 109 | import scala.xml.transform.{RewriteRule, RuleTransformer} 110 | 111 | pomPostProcess := { (node: XmlNode) => 112 | new RuleTransformer(new 
RewriteRule { 113 | override def transform(node: XmlNode): XmlNodeSeq = node match { 114 | case e: Elem if e.label == "dependency" && e.child.filter(_.label == "groupId").text.mkString == "com.qubole" => 115 | val organization = e.child.filter(_.label == "groupId").flatMap(_.text).mkString 116 | val artifact = e.child.filter(_.label == "artifactId").flatMap(_.text).mkString 117 | val version = e.child.filter(_.label == "version").flatMap(_.text).mkString 118 | Comment(s"dependency $organization#$artifact;$version has been omitted") 119 | case _ => node 120 | } 121 | }).transform(node).head 122 | } 123 | 124 | excludeDependencies ++= Seq ( 125 | // hive 126 | "org.apache.hive" % "hive-exec", 127 | "org.apache.hive" % "hive-metastore", 128 | "org.apache.hive" % "hive-jdbc", 129 | "org.apache.hive" % "hive-service", 130 | "org.apache.hive" % "hive-serde", 131 | "org.apache.hive" % "hive-common", 132 | 133 | // orc 134 | "org.apache.orc" % "orc-core", 135 | "org.apache.orc" % "orc-mapreduce", 136 | 137 | "org.slf4j" % "slf4j-api" 138 | ) 139 | 140 | // do not run test at assembly 141 | test in assembly := {} 142 | 143 | // Spark Package Section 144 | spName := "qubole/spark-acid" 145 | 146 | spShade := true 147 | 148 | spAppendScalaVersion := true 149 | 150 | credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials") 151 | 152 | licenses += "Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0") 153 | 154 | pomExtra := 155 | <url>https://github.com/qubole/spark-acid</url> 156 | <scm> 157 | <url>git@github.com:qubole/spark-acid.git</url> 158 | <connection>scm:git:git@github.com:qubole/spark-acid.git</connection> 159 | </scm> 160 | <developers> 161 | <developer> 162 | <id>amoghmargoor</id> 163 | <name>Amogh Margoor</name> 164 | <url>https://github.com/amoghmargoor</url> 165 | </developer> 166 | <developer> 167 | <id>citrusraj</id> 168 | <name>Rajkumar Iyer</name> 169 | <url>https://github.com/citrusraj</url> 170 | </developer> 171 | <developer> 172 | <id>somani</id> 173 | <name>Abhishek Somani</name> 174 | <url>https://github.com/somani</url> 175 | </developer> 176 | <developer> 177 | <id>prakharjain09</id> 178 | <name>Prakhar Jain</name> 179 | <url>https://github.com/prakharjain09</url> 180 | </developer> 181 | <developer> 182 | <id>sourabh912</id> 183 | <name>Sourabh Goyal</name> 184 | <url>https://github.com/sourabh912</url> 185 | </developer> 186 | </developers> 187 | 188 | 189 | publishMavenStyle := true 190 | 191 | bintrayReleaseOnPublish := false 192 | 193 | import ReleaseTransformations._ 194 | 195 | // Add publishing to spark packages as another step.
196 | releaseProcess := Seq[ReleaseStep]( 197 | checkSnapshotDependencies, 198 | inquireVersions, 199 | setReleaseVersion, 200 | commitReleaseVersion, 201 | tagRelease, 202 | pushChanges, 203 | releaseStepTask(spDist), 204 | releaseStepTask(spPublish) 205 | ) 206 | 207 | /** 208 | * Antlr settings 209 | */ 210 | antlr4Settings 211 | antlr4PackageName in Antlr4 := Some("com.qubole.spark.datasources.hiveacid.sql.catalyst.parser") 212 | antlr4GenListener in Antlr4 := true 213 | antlr4GenVisitor in Antlr4 := true 214 | antlr4Version := "4.7.2" 215 | 216 | 217 | /******************* 218 | * Test settings 219 | */ 220 | 221 | parallelExecution in IntegrationTest := false 222 | 223 | // do not run test at assembly 224 | test in assembly := {} 225 | 226 | // do not add scala in fat jar 227 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 228 | 229 | //Integration test 230 | lazy val root = (project in file(".")) 231 | .configs(IntegrationTest) 232 | .settings( 233 | Defaults.itSettings, 234 | libraryDependencies += scalatest % "it" 235 | ) 236 | 237 | // exclude antlr classes from assembly since those 238 | // are available in spark at runtime 239 | // any other classes to be excluded from assembly 240 | // should be added here 241 | assemblyExcludedJars in assembly := { 242 | val cp = (fullClasspath in assembly).value 243 | cp filter {_.data.getName.contains("antlr")} 244 | } 245 | 246 | /*********************** 247 | * Release settings 248 | */ 249 | 250 | publishMavenStyle := true 251 | 252 | bintrayReleaseOnPublish := false 253 | 254 | import ReleaseTransformations._ 255 | 256 | // Add publishing to spark packages as another step. 257 | releaseProcess := Seq[ReleaseStep]( 258 | checkSnapshotDependencies, 259 | inquireVersions, 260 | setReleaseVersion, 261 | commitReleaseVersion, 262 | tagRelease, 263 | pushChanges 264 | ) 265 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | A pseudo-distributed Hadoop image for testing the Spark ACID datasource, based on 2 | 1. CentOS 6 3 | 2. [Hadoop 3.1.1](https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz) 4 | 3. [Hive 3.1.1](http://mirrors.estointernet.in/apache/hive/hive-3.1.1/apache-hive-3.1.1-bin.tar.gz) 5 | 4. [MySQL 5.6.44](http://repo.mysql.com/mysql-community-release-el6-5.noarch.rpm) 6 | 7 | # Setup 8 | 9 | Refer to [Install Docker](https://docs.docker.com/v17.12/install/) for instructions on installing Docker. 10 | 11 | # Build 12 | 13 | To build the docker image 14 | ```bash 15 | ./build 16 | ``` 17 | 18 | # Start 19 | 20 | _NB: Configure Docker to run with at least 4 GB of memory. On macOS this can be configured in Docker Desktop._ 21 | 22 | To start the docker container 23 | ```bash 24 | ./start 25 | ``` 26 | 27 | # Stop 28 | 29 | To stop the docker container 30 | ```bash 31 | ./stop 32 | ``` 33 | 34 | -------------------------------------------------------------------------------- /docker/beeline: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | name="spark-hiveacid-test-container" 4 | 5 | docker exec -it $name bin/bash -c "\ 6 | .
~/.bashrc; \ 7 | export HADOOP_HOME=/hadoop; \ 8 | hive/bin/beeline -n root -p root -u jdbc:hive2://0.0.0.0:10001/default" 9 | -------------------------------------------------------------------------------- /docker/build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker build -t centos6/spark-hadoop3-hive3 files/. 3 | -------------------------------------------------------------------------------- /docker/files/Dockerfile: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | from centos:6 14 | MAINTAINER rajkumar@qubole.com 15 | 16 | 17 | RUN yum -y update 18 | RUN yum -y install epel-release 19 | 20 | 21 | RUN yum -y install java-1.8.0-openjdk-devel java-1.8.0-openjdk 22 | RUN ln -s /usr/lib/jvm//java-1.8.0-openjdk-amd64/ /usr/lib/jvm/java-1.8.0 23 | RUN ln -s /usr/lib/jvm//java-1.7.0-openjdk-amd64/ /usr/lib/jvm/java-1.7.0 24 | 25 | #RUN yum -y install vim 26 | RUN yum -y install wget tar sudo rsync 27 | 28 | RUN yum -y install initscripts httpd 29 | 30 | RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz 31 | RUN tar -xvzf hadoop-3.1.1.tar.gz 32 | RUN ln -sf /hadoop-3.1.1 /hadoop 33 | 34 | RUN wget https://archive.apache.org/dist/hive/hive-3.1.2/apache-hive-3.1.2-bin.tar.gz 35 | RUN tar -xvzf apache-hive-3.1.2-bin.tar.gz 36 | RUN ln -sf /apache-hive-3.1.2-bin /hive 37 | 38 | RUN yum -y install \ 39 | mysql-server mysql-connector-java \ 40 | && yum -y clean all && rm -rf /tmp/* /var/tmp/* \ 41 | && ln -s /usr/share/java/mysql-connector-java.jar apache-hive-3.1.2-bin/lib/mysql-connector-java.jar 42 | 43 | # Setup sock proxy 44 | RUN yum install -y openssh openssh-clients openssh-server 45 | 46 | # passwordless ssh 47 | RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa 48 | RUN cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys 49 | 50 | RUN chmod 755 /root && chmod 700 /root/.ssh 51 | RUN passwd --unlock root 52 | 53 | RUN yum install -y vim mlocate unzip 54 | 55 | RUN wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.0/hadoop-2.7.0.tar.gz 56 | RUN tar -xvzf hadoop-2.7.0.tar.gz 57 | 58 | 59 | # Copy configuration files 60 | RUN mkdir /conf 61 | COPY core-site.xml /conf/core-site.xml 62 | COPY hdfs-site.xml /conf/hdfs-site.xml 63 | COPY hadoop-env.sh /conf/hadoop-env.sh 64 | COPY yarn-site.xml /conf/yarn-site.xml 65 | 66 | COPY mapred-site.xml /conf/mapred-site.xml 67 | COPY hive-site.xml /conf/hive-site.xml 68 | COPY bootstrap.sh /bootstrap.sh 69 | 70 | # HDFS ports 71 | EXPOSE 1004 1006 8020 9866 9867 9870 9864 50470 9000 72 | 73 | # YARN ports 74 | EXPOSE 8030 8031 8032 8033 8040 8041 8042 8088 10020 19888 75 | 76 | # HIVE ports 77 | EXPOSE 9083 10000 78 | 79 | # SOCKS port 80 | EXPOSE 1180 81 | 82 | # mysql expose 83 | EXPOSE 3306 84 | 85 | # HDFS datnode 86 | EXPOSE 9866 87 | -------------------------------------------------------------------------------- 
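For orientation, the scripts in this docker/ directory (build, start, inspect, login, beeline, spark-shell, stop, shown above and below) are typically chained as follows when testing locally. This is an illustrative sketch based only on those scripts, not a documented workflow; the sleep duration is an assumption, since bootstrap.sh brings up HDFS, YARN, the metastore and HiveServer2 asynchronously.

```bash
#!/bin/bash
# Illustrative local test loop for the dockerized Hadoop/Hive environment.
cd docker

./build      # build the centos6/spark-hadoop3-hive3 image
./start      # run spark-hiveacid-test-container, which executes /bootstrap.sh
sleep 60     # assumed wait: give bootstrap.sh time to start HDFS, HMS and HiveServer2
./inspect    # print container name, running state, IP and port mappings
./beeline    # connect to HiveServer2 (jdbc:hive2://0.0.0.0:10001) to create test tables
./stop       # kill and remove the container when done
```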
/docker/files/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eg='\033[0;32m' 4 | enc='\033[0m' 5 | echoe () { 6 | OIFS=${IFS} 7 | IFS='%' 8 | echo -e $@ 9 | IFS=${OIFS} 10 | } 11 | 12 | gprn() { 13 | echoe "${eg} >> ${1}${enc}" 14 | } 15 | 16 | 17 | ## Setup ENV variables 18 | 19 | export JAVA_HOME="/usr/lib/jvm/java-openjdk" 20 | 21 | export HDFS_NAMENODE_USER="root" 22 | export HDFS_SECONDARYNAMENODE_USER="root" 23 | export HDFS_DATANODE_USER="root" 24 | export YARN_RESOURCEMANAGER_USER="root" 25 | export YARN_NODEMANAGER_USER="root" 26 | 27 | export HADOOP_HOME="/hadoop" 28 | export HADOOP_ROOT_LOGGER=DEBUG 29 | export HADOOP_COMMON_LIB_NATIVE_DIR="/hadoop/lib/native" 30 | 31 | ## Add it to bashrc for starting hadoop 32 | echo 'export JAVA_HOME="/usr/lib/jvm/java-openjdk"' >> ~/.bashrc 33 | echo 'export HADOOP_HOME="/hadoop"' >> ~/.bashrc 34 | 35 | 36 | rm /hadoop 37 | ln -sf /hadoop-3.1.1 /hadoop 38 | 39 | cp /conf/core-site.xml /hadoop/etc/hadoop 40 | cp /conf/hdfs-site.xml /hadoop/etc/hadoop 41 | cp /conf/hadoop-env.sh /hadoop/etc/hadoop 42 | cp /conf/mapred-site.xml /hadoop/etc/hadoop 43 | cp /conf/yarn-site.xml /hadoop/etc/hadoop 44 | cp /conf/hive-site.xml /hive/conf/ 45 | 46 | 47 | gprn "set up mysql" 48 | service mysqld start 49 | 50 | # Set root password 51 | mysql -uroot -e "set password = PASSWORD('root');" 52 | mysql -uroot -e "grant all privileges on *.* to 'root'@'%' identified by 'root';" 53 | service sshd start 54 | 55 | gprn "start yarn" 56 | hadoop/sbin/start-yarn.sh & 57 | sleep 5 58 | 59 | gprn "Formatting name node" 60 | hadoop/bin/hdfs namenode -format 61 | 62 | gprn "Start hdfs" 63 | hadoop/sbin/start-dfs.sh 64 | 65 | jps 66 | 67 | mkdir -p /hive/warehouse 68 | 69 | 70 | gprn "Set up metastore DB" 71 | hive/bin/schematool -dbType mysql -initSchemaTo 3.1.0 72 | 73 | gprn "Start HMS server" 74 | hive/bin/hive --service metastore -p 10000 & 75 | 76 | gprn "Sleep and wait for HMS to be up and running" 77 | sleep 20 78 | 79 | gprn "Start HiveServer2" 80 | hive/bin/hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10001 --hiveconf hive.execution.engine=mr 81 | -------------------------------------------------------------------------------- /docker/files/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 35 | 36 | 37 | 38 | 39 | fs.defaultFS 40 | hdfs://0.0.0.0:9000 41 | 42 | 43 | 44 | hadoop.proxyuser.root.hosts 45 | * 46 | 47 | 48 | 49 | hadoop.proxyuser.root.groups 50 | * 51 | 52 | 53 | -------------------------------------------------------------------------------- /docker/files/hadoop-env.sh: -------------------------------------------------------------------------------- 1 | # The maximum amount of heap to use, in MB. Default is 1000. 2 | export HADOOP_HEAPSIZE=1024 3 | 4 | # Extra Java runtime options. Empty by default.
5 | export HADOOP_NAMENODE_OPTS="$HADOOP_NAMENODE_OPTS -Xmx512m" 6 | export YARN_OPTS="$YARN_OPTS -Xmx256m" 7 | -------------------------------------------------------------------------------- /docker/files/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 34 | 35 | 36 | 37 | 38 | dfs.replication 39 | 1 40 | 41 | 42 | dfs.permissions.enabled 43 | false 44 | 45 | 46 | dfs.datanode.address 47 | 0.0.0.0:9866 48 | 49 | 53 | 54 | dfs.client.datanode-restart.timeout 55 | 30 56 | 57 | 58 | -------------------------------------------------------------------------------- /docker/files/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 23 | 24 | 25 | javax.jdo.option.ConnectionURL 26 | jdbc:mysql://localhost/metastore?createDatabaseIfNotExist=true 27 | 28 | 29 | javax.jdo.option.ConnectionDriverName 30 | com.mysql.jdbc.Driver 31 | 32 | 33 | javax.jdo.option.ConnectionUserName 34 | root 35 | 36 | 37 | javax.jdo.option.ConnectionPassword 38 | root 39 | 40 | 41 | 42 | 43 | hive.metastore.uris 44 | thrift://0.0.0.0:10000 45 | 46 | 47 | 48 | hive.metastore.event.db.notification.api.auth 49 | false 50 | 51 | 52 | 53 | 54 | hive.support.concurrency 55 | true 56 | 57 | 58 | 59 | hive.exec.dynamic.partition.mode 60 | nonstrict 61 | 62 | 63 | 64 | hive.compactor.initiator.on 65 | true 66 | 67 | 68 | 69 | hive.txn.manager 70 | org.apache.hadoop.hive.ql.lockmgr.DbTxnManager> 71 | 72 | 73 | 74 | 75 | hive.server2.thrift.http.port 76 | 10001 77 | 78 | 79 | 80 | hive.execution.engine 81 | mr 82 | 83 | 84 | 85 | hive.input.format 86 | org.apache.hadoop.hive.ql.io.HiveInputFormat 87 | 88 | 89 | 90 | 91 | hive.auto.convert.join 92 | false 93 | 94 | 95 | 96 | hive.stats.autogather 97 | false 98 | 99 | 100 | 101 | hive.metastore.client.capability.check 102 | false 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docker/files/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 21 | 22 | 34 | 35 | 36 | 37 | 38 | mapreduce.framework.name 39 | yarn 40 | 41 | 42 | yarn.app.mapreduce.am.env 43 | HADOOP_MAPRED_HOME=${HADOOP_HOME} 44 | 45 | 46 | mapreduce.map.env 47 | HADOOP_MAPRED_HOME=${HADOOP_HOME} 48 | 49 | 50 | mapreduce.reduce.env 51 | HADOOP_MAPRED_HOME=${HADOOP_HOME} 52 | 53 | 54 | mapreduce.map.memory.mb 55 | 2048 56 | 57 | 58 | mapreduce.reduce.memory.mb 59 | 2048 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /docker/files/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 20 | 21 | 34 | 35 | 36 | yarn.nodemanager.aux-services 37 | mapreduce_shuffle 38 | 39 | 40 | yarn.nodemanager.aux-services.mapreduce_shuffle.class 41 | org.apache.hadoop.mapred.ShuffleHandler 42 | 43 | 44 | -------------------------------------------------------------------------------- /docker/inspect: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | prn_row() { 3 | printf '%-32s | %-10s | %-16s | %-128s\n' "${1}" "${2}" "${3}" "${4}" 4 | } 5 | prn_row "DOCKER_NAME" "RUNNING" "IP" "PORT_MAPPING" 6 | id=spark-hiveacid-test-container 7 | NAME=`docker inspect --format='{{.Name}}' $id` 8 | RUNNING=`docker inspect --format='{{.State.Running}}' $id` 9 | IP=`docker inspect --format='{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' $id` 10 | 
PORT_MAPPING=`docker inspect --format='{{range $p, $conf := .NetworkSettings.Ports}} {{$p}} -> {{(index $conf 0).HostPort}} {{end}}' $id | sed -e 's/\/tcp//g'` 11 | prn_row "$NAME" "${RUNNING}" "${IP}" "${PORT_MAPPING}" 12 | -------------------------------------------------------------------------------- /docker/login: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker exec -it spark-hiveacid-test-container /bin/bash 3 | -------------------------------------------------------------------------------- /docker/spark-shell: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -z ${2} ] 3 | then 4 | echo "Specify the spark-acid jar location" 5 | echo "spark-shell ~/codeline/TOT ~/codeline/TOT/acid-ds/target/scala-2.11/spark-acid-qds-assembly-0.4.3.jar" 6 | exit 7 | fi 8 | if [ -z ${1} ] 9 | then 10 | echo "Specify the spark code base directory" 11 | echo "spark-shell ~/codeline/TOT ~/codeline/TOT/acid-ds/target/scala-2.11/spark-acid-qds-assembly-0.4.3.jar" 12 | exit 13 | fi 14 | 15 | shellenv() { 16 | export QENV_LOCAL_CODELINE="${1}" 17 | export QENV_LOCAL_CONF="${QENV_LOCAL}/conf" 18 | export HADOOP_SRC="${QENV_LOCAL_CODELINE}/hadoop2" 19 | export SPARK_SRC="${QENV_LOCAL_CODELINE}/spark" 20 | export HUSTLER_SRC="${QENV_LOCAL_CODELINE}/hustler" 21 | export HIVE_SRC="${QENV_LOCAL_CODELINE}/hive" 22 | export ZEPPELIN_SRC="${QENV_LOCAL_CODELINE}/zeppelin" 23 | } 24 | 25 | hsnapshot() { 26 | HADOOP_SNAPSHOT=`ls ${HADOOP_SRC}/hadoop-dist/target/hadoop* | grep SNAPSHOT: | cut -d':' -f1` 27 | } 28 | 29 | hivesnapshot() { 30 | loc=`ls ${HIVE_SRC}/packaging/target/apache-hive* |grep bin |grep -v ':'` 31 | HIVE_SNAPSHOT=${HIVE_SRC}/packaging/target/${loc}/${loc}/ 32 | } 33 | 34 | run_spark_shelllocal() { 35 | 36 | # Write setup into the spark-env file. Run spark-shell after it.
37 | echo "Update Spark Conf based on Hadoop Build Version --> ${SPARK_SRC}/conf/spark-env.sh" 38 | hsnapshot 39 | hivesnapshot 40 | 41 | str="export SPARK_YARN_USER_ENV=CLASSPATH=${QENV_LOCAL_CONF}/" 42 | echo ${str} > ${SPARK_SRC}/conf/spark-env.sh 43 | 44 | if [ -n "${HADOOP_SNAPSHOT}" ] 45 | then 46 | 47 | str="export SPARK_DIST_CLASSPATH=${QENV_LOCAL_CONF}/:${HADOOP_SNAPSHOT}/share/hadoop/common/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/common/*:${HADOOP_SNAPSHOT}/share/hadoop/hdfs:${HADOOP_SNAPSHOT}/share/hadoop/hdfs/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/hdfs/*:${HADOOP_SNAPSHOT}/share/hadoop/yarn/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/yarn/*:${HADOOP_SNAPSHOT}/share/hadoop/mapreduce/*:/share/hadoop/tools:${HADOOP_SNAPSHOT}/share/hadoop/tools/lib/*:${HADOOP_SNAPSHOT}/share/hadoop/tools/*:/share/hadoop/qubole:${HADOOP_SNAPSHOT}/share/hadoop/qubole/*" 48 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh 49 | fi 50 | 51 | if [ -n "${HIVE_SNAPSHOT}" ] 52 | then 53 | str="export SPARK_DIST_CLASSPATH=\${SPARK_DIST_CLASSPATH}:${HIVE_SNAPSHOT}/lib/*" 54 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh 55 | fi 56 | 57 | str="export HADOOP_CONF_DIR=${QENV_LOCAL_CONF}/" 58 | echo ${str} >> ${SPARK_SRC}/conf/spark-env.sh 59 | 60 | $SPARK_SRC/bin/spark-shell $@ 61 | } 62 | 63 | 64 | shellenv ${1} 65 | shift 66 | run_spark_shelllocal --jars $@ --conf spark.sql.extensions=com.qubole.spark.datasources.hiveacid.HiveAcidAutoConvertExtension --conf spark.hadoop.hive.metastore.uris=thrift://localhost:10000 --conf spark.sql.catalogImplementation=hive 67 | -------------------------------------------------------------------------------- /docker/start: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | name="spark-hiveacid-test-container" 4 | 5 | RUNNING=`docker inspect --format "{{ .State.Running}}" ${name} 2>/dev/null` 6 | if [[ $? -eq 0 ]] 7 | then 8 | if [[ "${RUNNING}" == "true" ]] 9 | then 10 | echo "$name already running" 11 | exit 12 | fi 13 | else 14 | docker run --name ${name} --hostname localhost -P -p9866:9866 -p10000:10000 -p10001:10001 -p9000:9000 -p3306:3306 -p50070:50070 -p50030:50030 -it -d centos6/spark-hadoop3-hive3 /bin/bash -c "/bootstrap.sh >/tmp/boostrap.log" 15 | fi 16 | 17 | -------------------------------------------------------------------------------- /docker/stop: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | name="spark-hiveacid-test-container" 4 | docker kill ${name} 5 | docker rm ${name} 6 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 Qubole, Inc. All rights reserved. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | sbt.version = 0.13.16 21 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | 2 | resolvers += "spark-packages" at sys.props.getOrElse("spark.repo", "https://repos.spark-packages.org/") 3 | 4 | addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6") 5 | 6 | addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.11") 7 | 8 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") 9 | 10 | addSbtPlugin("org.foundweekends" % "sbt-bintray" % "0.5.4") 11 | 12 | addSbtPlugin("com.simplytyped" % "sbt-antlr4" % "0.7.13") 13 | 14 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2") 15 | -------------------------------------------------------------------------------- /shaded-dependencies/build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-acid-shaded-dependencies" 2 | 3 | version := sys.props.getOrElse("package.version", "0.1") 4 | 5 | organization:= "com.qubole" 6 | 7 | scalaVersion := "2.11.12" 8 | 9 | scalacOptions ++= Seq( 10 | "-Xlint", 11 | "-Xfatal-warnings", 12 | "-deprecation", 13 | "-unchecked", 14 | "-optimise", 15 | "-Yinline-warnings" 16 | ) 17 | 18 | scalacOptions in (Compile, doc) ++= Seq( 19 | "-no-link-warnings" // Suppresses problems with Scaladoc @throws links 20 | ) 21 | 22 | // do not run test at assembly 23 | test in assembly := {} 24 | 25 | publishArtifact in (Compile, packageDoc) := false 26 | 27 | publishArtifact in (Compile, packageSrc) := false 28 | 29 | publishArtifact in (Compile, packageBin) := false 30 | 31 | val hive_version = sys.props.getOrElse("hive.version", "3.1.2") 32 | 33 | val orc_version = sys.props.getOrElse("orc.version", "1.5.6") 34 | 35 | resolvers += "Additional Maven Repository" at sys.props.getOrElse("hive.repo", "https://repo1.maven.org/maven2/") 36 | 37 | // Shaded dependency 38 | libraryDependencies ++= Seq( 39 | // Hive/Orc core dependencies packed. 40 | "org.apache.hive" % "hive-metastore" % hive_version intransitive(), 41 | "org.apache.hive" % "hive-exec" % hive_version intransitive(), 42 | "org.apache.orc" % "orc-core" % orc_version intransitive(), 43 | "org.apache.orc" % "orc-mapreduce" % orc_version intransitive(), 44 | 45 | // Only for hive3 client in tests.. but packing it in shaded jars. 46 | "org.apache.hive" % "hive-jdbc" % hive_version intransitive(), 47 | "org.apache.hive" % "hive-service" % hive_version intransitive(), 48 | "org.apache.hive" % "hive-serde" % hive_version intransitive(), 49 | "org.apache.hive" % "hive-common" % hive_version intransitive(), 50 | 51 | // To deal with hive3 metastore library 0.9.3 vs zeppelin thirft 52 | // library version 0.9.1 conflict when runing Notebooks. 
53 | "org.apache.thrift" % "libfb303" % "0.9.3", 54 | "org.apache.thrift" % "libthrift" % "0.9.3" 55 | ) 56 | 57 | 58 | assemblyShadeRules in assembly := Seq( 59 | ShadeRule.rename("org.apache.hadoop.hive.**" -> "com.qubole.shaded.hadoop.hive.@1").inAll, 60 | ShadeRule.rename("org.apache.hive.**" -> "com.qubole.shaded.hive.@1").inAll, 61 | ShadeRule.rename("org.apache.orc.**" -> "com.qubole.shaded.orc.@1").inAll, 62 | ShadeRule.rename("org.apache.commons.**" -> "com.qubole.shaded.commons.@1").inAll, 63 | ShadeRule.rename("org.apache.avro.**" -> "com.qubole.shaded.avro.@1").inAll, 64 | ShadeRule.rename("org.apache.parquet.**" -> "com.qubole.shaded.parquet.@1").inAll, 65 | ShadeRule.rename("org.apache.http.**" -> "com.qubole.shaded.http.@1").inAll, 66 | ShadeRule.rename("org.apache.tez.**" -> "com.qubole.shaded.tez.@1").inAll, 67 | 68 | ShadeRule.rename("com.google.**" -> "com.qubole.shaded.@1").inAll, 69 | ShadeRule.rename("com.facebook.fb303.**" -> "com.qubole.shaded.facebook.fb303.@1").inAll, 70 | ShadeRule.rename("org.apache.thrift.**" -> "com.qubole.shaded.thrift.@1").inAll, 71 | 72 | ShadeRule.rename("org.codehaus.jackson.**" -> "com.qubole.shaded.jackson.@1").inAll, 73 | ShadeRule.rename("org.joda.**" -> "com.qubole.shaded.joda.@1").inAll, 74 | ShadeRule.rename("org.json.**" -> "com.qubole.shaded.json.@1").inAll, 75 | 76 | ShadeRule.rename("jodd.**" -> "com.qubole.shaded.jodd.@1").inAll, 77 | ShadeRule.rename("javaewah.**" -> "com.qubole.shaded.javaewah.@1").inAll, 78 | ShadeRule.rename("io.airlift.**" -> "com.qubole.shaded.io.airlift.@1").inAll, 79 | 80 | ShadeRule.rename("org.openx.data.**" -> "com.qubole.shaded.openx.data.@1").inAll, 81 | ShadeRule.rename("au.com.bytecode.opencsv.**" -> "com.qubole.shaded.au.com.bytecode.opencsv.@1").inAll, 82 | ShadeRule.rename("com.readytalk.metrics.**" -> "com.qubole.shaded.readytalk.metrics.@1").inAll 83 | ) 84 | 85 | import sbtassembly.AssemblyPlugin.autoImport.{ ShadeRule} 86 | import sbtassembly.MergeStrategy 87 | val distinctAndReplace: sbtassembly.MergeStrategy = new sbtassembly.MergeStrategy { 88 | val name = "distinctAndReplace" 89 | def apply(tempDir: File, path: String, files: Seq[File]): Either[String, Seq[(File, String)]] = { 90 | val lines = files flatMap (IO.readLines(_, IO.utf8)) 91 | val unique = lines.distinct 92 | val replaced = unique.map { x => x.replace("org.apache.hadoop.hive", "com.qubole.shaded.hadoop.hive")} 93 | val file = sbtassembly.MergeStrategy.createMergeTarget(tempDir, path) 94 | IO.writeLines(file, replaced, IO.utf8) 95 | Right(Seq(file -> path)) 96 | } 97 | } 98 | 99 | 100 | assemblyMergeStrategy in assembly := { 101 | // all discarded classes first 102 | case PathList("javax", xs @ _*) => MergeStrategy.discard 103 | case PathList("javolution", xs @_*) => MergeStrategy.discard 104 | // discard non shaded classes in hadoop and qubole packages 105 | case PathList("org", "apache", "hadoop", xs @_*) => MergeStrategy.discard 106 | case PathList("org", "apache", "log4j", xs @ _*) => MergeStrategy.last 107 | case PathList("com", "google", xs @ _*) => MergeStrategy.last 108 | case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last 109 | case PathList("com", "codahale", xs @ _*) => MergeStrategy.last 110 | case PathList("com", "yammer", xs @ _*) => MergeStrategy.last 111 | case PathList("org","aopalliance", xs @ _*) => MergeStrategy.last 112 | case PathList("com","zaxxer", xs @ _*) => MergeStrategy.last 113 | case PathList("org","apache", "logging", "log4j", xs @ _*) => MergeStrategy.last 114 | case 
PathList("io","netty", xs @ _*) => MergeStrategy.last 115 | case PathList("org","datanucleus", xs @ _*) => MergeStrategy.last 116 | case PathList("org", "apache", "arrow", xs @ _*) => MergeStrategy.last 117 | case PathList("org", "apache", "commons", "lang3", xs @ _*) => MergeStrategy.last 118 | case PathList("org", "apache", "commons", "lang3", "builder", xs @ _*) => MergeStrategy.last 119 | case PathList("org", "apache", "commons", "lang3", "concurrent", xs @ _*) => MergeStrategy.last 120 | case PathList("org", "apache", "commons", "lang3", "event", xs @ _*) => MergeStrategy.last 121 | case PathList("org", "apache", "commons", "lang3", "exception", xs @ _*) => MergeStrategy.last 122 | case PathList("org", "apache", "commons", "lang3", "math", xs @ _*) => MergeStrategy.last 123 | case PathList("org", "apache", "commons", "lang3", "mutable", xs @ _*) => MergeStrategy.last 124 | case PathList("org", "apache", "commons", "lang3", "reflect", xs @ _*) => MergeStrategy.last 125 | case PathList("org", "apache", "commons", "lang3", "text", xs @ _*) => MergeStrategy.last 126 | case PathList("org", "apache", "commons", "lang3", "time", xs @ _*) => MergeStrategy.last 127 | case PathList("org", "apache", "commons", "lang3", "tuple", xs @ _*) => MergeStrategy.last 128 | case PathList("com", "qubole", "shaded", "orc", xs @ _*) => MergeStrategy.last 129 | case PathList("org", "slf4j", "impl", xs @ _*) => MergeStrategy.last 130 | case PathList("org", "slf4j", "helpers", xs @ _*) => MergeStrategy.last 131 | case PathList("org", "slf4j", xs @ _*) => MergeStrategy.last 132 | 133 | // discard package.jdo because objects defined inside it are not shaded. 134 | // So removing for now 135 | case "package.jdo" => MergeStrategy.discard 136 | 137 | case PathList("META-INF", "services", xs @ _*) => distinctAndReplace 138 | // case "about.html" => MergeStrategy.rename 139 | case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last 140 | case "META-INF/mailcap" => MergeStrategy.last 141 | case "META-INF/mimetypes.default" => MergeStrategy.last 142 | case "plugin.properties" => MergeStrategy.last 143 | case "log4j.properties" => MergeStrategy.last 144 | case "Log4j2Plugins.dat" => MergeStrategy.last 145 | case "git.properties" => MergeStrategy.last 146 | case "plugin.xml" => MergeStrategy.last 147 | case "META-INF/io.netty.versions.properties" => MergeStrategy.last 148 | case "META-INF/org/apache/logging/log4j/core/config/plugins/Log4j2Plugins.dat" => MergeStrategy.last 149 | case "codegen/config.fmpp" => MergeStrategy.first 150 | 151 | case x => 152 | val oldStrategy = (assemblyMergeStrategy in assembly).value 153 | oldStrategy(x) 154 | } 155 | 156 | // do not add scala in fat jar 157 | assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) 158 | 159 | // For publishing assembly locally 160 | publishMavenStyle := false 161 | 162 | artifact in (Compile, assembly) := { 163 | val art = (artifact in (Compile, assembly)).value 164 | art.withClassifier(None) 165 | } 166 | 167 | addArtifact(artifact in (Compile, assembly), assembly) 168 | 169 | -------------------------------------------------------------------------------- /shaded-dependencies/project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 Qubole, Inc. All rights reserved. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. 
See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | sbt.version = 1.2.8 21 | -------------------------------------------------------------------------------- /shaded-dependencies/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.9") 2 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2") 3 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/LockSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.qubole.spark.hiveacid 19 | 20 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 21 | import com.qubole.spark.hiveacid.transaction.HiveAcidTxn 22 | import org.apache.log4j.{Level, LogManager, Logger} 23 | import org.apache.spark.sql.SparkSession 24 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} 25 | 26 | import scala.util.control.NonFatal 27 | 28 | class LockSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll { 29 | val log: Logger = LogManager.getLogger(this.getClass) 30 | log.setLevel(Level.INFO) 31 | 32 | var helper: TestHelper = _ 33 | val isDebug = true 34 | 35 | val DEFAULT_DBNAME = "HiveTestLockDB" 36 | val cols: Map[String, String] = Map( 37 | ("intCol","int"), 38 | ("doubleCol","double"), 39 | ("floatCol","float"), 40 | ("booleanCol","boolean") 41 | ) 42 | val partitionedTable = new Table(DEFAULT_DBNAME, "partitioned", 43 | cols, Table.orcPartitionedFullACIDTable, true) 44 | val normalTable = new Table(DEFAULT_DBNAME, "nonPartitioned", 45 | cols, Table.orcFullACIDTable, false) 46 | 47 | override def beforeAll() { 48 | try { 49 | helper = new TestLockHelper 50 | if (isDebug) { 51 | log.setLevel(Level.DEBUG) 52 | } 53 | helper.init(isDebug) 54 | 55 | // DB 56 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 57 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME) 58 | helper.recreate(partitionedTable) 59 | helper.recreate(normalTable) 60 | helper.hiveExecute(partitionedTable.insertIntoHiveTableKeyRange(11, 25)) 61 | } catch { 62 | case NonFatal(e) => log.info("failed " + e) 63 | } 64 | } 65 | 66 | override protected def afterAll(): Unit = { 67 | helper.hiveExecute(s"DROP TABLE IF EXISTS ${normalTable.hiveTname}") 68 | helper.hiveExecute(s"DROP TABLE IF EXISTS ${partitionedTable.hiveTname}") 69 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 70 | helper.destroy() 71 | } 72 | 73 | case class TestLockOperation(whichTransaction: Int, 74 | operationType: HiveAcidOperation.OperationType, 75 | partition: Seq[String], 76 | willFail: Boolean = false) 77 | 78 | test("test lock wait timeout exception") { 79 | val lockOps = Seq( 80 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass 81 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()), // similar operation on first trans will pass 82 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq(), true)) // second transaction will wait and fail in 100ms 83 | testLockOps(lockOps) 84 | } 85 | 86 | test("test locks within same transaction is allowed") { 87 | val lockOps = Seq( 88 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass 89 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()), // similar operation on first trans will pass 90 | TestLockOperation(1, HiveAcidOperation.READ, Seq()), // READ on same transaction will pass 91 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, Seq())) 92 | testLockOps(lockOps) 93 | } 94 | 95 | test("test READ after UPDATE/DELETE is allowed") { 96 | val lockOps = Seq( 97 | TestLockOperation(1, HiveAcidOperation.UPDATE, Seq()), // first trans will pass 98 | TestLockOperation(1, HiveAcidOperation.DELETE, Seq()), 99 | TestLockOperation(2, HiveAcidOperation.READ, Seq())) // second transaction READ need not wait 100 | testLockOps(lockOps) 101 | } 102 | 103 | test("test DELETE/READ after INSERT OVERWRITE is not allowed") { 104 | val lockOps = Seq( 105 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, 
Seq()), 106 | TestLockOperation(2, HiveAcidOperation.UPDATE, Seq(), true), 107 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq(), true), 108 | TestLockOperation(2, HiveAcidOperation.READ, Seq(), true)) 109 | testLockOps(lockOps) 110 | } 111 | 112 | test("test INSERT_OVERWRITE and DELETE/UPDATE/READ on different partition is allowed") { 113 | val lockOps = Seq( 114 | TestLockOperation(1, HiveAcidOperation.INSERT_OVERWRITE, Seq("ptnCol=0")), 115 | TestLockOperation(2, HiveAcidOperation.DELETE, Seq("ptnCol=1")), 116 | TestLockOperation(2, HiveAcidOperation.UPDATE, Seq("ptnCol=1")), 117 | TestLockOperation(2, HiveAcidOperation.READ, Seq("ptnCol=1"))) 118 | testLockOps(lockOps) 119 | } 120 | 121 | def testLockOps(lockOps: Seq[TestLockOperation]): Unit = { 122 | val tableName = DEFAULT_DBNAME + "." + "nonPartitioned" 123 | val hiveAcidMetadata = HiveAcidMetadata.fromSparkSession(helper.spark, 124 | tableName) 125 | 126 | // Just try 2 attempts for lock acquisition and fail if it cannot. 127 | helper.spark.sessionState.conf.setConfString("spark.hiveAcid.lock.max.retries", "2") 128 | val sparkConf = SparkAcidConf(helper.spark, Map()) 129 | val hTxn1 = new HiveAcidTxn(helper.spark) 130 | val hTxn2 = new HiveAcidTxn(helper.spark) 131 | 132 | def executeOp(lockOp: TestLockOperation) { 133 | val txn = lockOp.whichTransaction match { 134 | case 1 => hTxn1 135 | case 2 => hTxn2 136 | case _ => throw new IllegalArgumentException("Only 1 or 2 are supported for whichTransaction field") 137 | } 138 | if (lockOp.willFail) { 139 | val thrown = intercept[RuntimeException] { 140 | txn.acquireLocks(hiveAcidMetadata, lockOp.operationType, lockOp.partition, sparkConf) 141 | } 142 | assert(thrown.getMessage.contains("Could not acquire lock. Lock State: WAITING")) 143 | } else { 144 | txn.acquireLocks(hiveAcidMetadata, lockOp.operationType, lockOp.partition, sparkConf) 145 | } 146 | 147 | } 148 | 149 | try { 150 | hTxn1.begin() 151 | hTxn2.begin() 152 | lockOps.foreach(executeOp(_)) 153 | } finally { 154 | helper.spark.sessionState.conf.unsetConf("spark.hiveAcid.lock.max.retries") 155 | hTxn1.end(true) 156 | hTxn2.end(true) 157 | } 158 | } 159 | 160 | test("test HeartBeatRunner is running") { 161 | val hTxn1 = new HiveAcidTxn(helper.spark) 162 | hTxn1.begin() 163 | // Sleep for 4 seconds 164 | Thread.sleep(4 * 1000) 165 | val txn = HiveAcidTxn.txnManager.showOpenTrans().find(ti => ti.getId == hTxn1.txnId) 166 | assert(txn.isDefined, "Transaction is expected to be open") 167 | val seconds = (txn.get.getLastHeartbeatTime() - txn.get.getStartedTime()) / 1000 168 | assert(seconds >= 2, "getLastHeartBeatTime should " + 169 | "be at least 2 seconds after transaction was opened") 170 | hTxn1.end(true) 171 | } 172 | } 173 | 174 | class TestLockHelper extends TestHelper { 175 | // Create spark session with txn timeout config as that needs to be set 176 | // before the start of spark session 177 | override def getSparkSession(): SparkSession = { 178 | SparkSession.builder().appName("Hive-acid-test") 179 | .master("local[*]") 180 | .config("spark.hadoop.hive.metastore.uris", "thrift://0.0.0.0:10000") 181 | .config("spark.sql.warehouse.dir", "/tmp") 182 | .config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension") 183 | .config("spark.hadoop.hive.txn.timeout", "6") 184 | //.config("spark.ui.enabled", "true") 185 | //.config("spark.ui.port", "4041") 186 | .enableHiveSupport() 187 | .getOrCreate() 188 | } 189 | } -------------------------------------------------------------------------------- 
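The suites above drive transactions and locks directly through HiveAcidTxn. For context, a minimal sketch of how a SparkSession configured like TestLockHelper.getSparkSession above can read an ACID table through this module's datasource is shown below. The table name default.acidtbl and the "table" option key are illustrative assumptions; the fully qualified format name mirrors datasource/HiveAcidDataSource.scala from the source tree.

```scala
// Sketch only (not part of the test suites): reading a Hive ACID table through
// the datasource shipped in this module, given a SparkSession created with
// com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension as in TestLockHelper.
import org.apache.spark.sql.{DataFrame, SparkSession}

object DatasourceReadSketch {
  // Explicit datasource path; the "table" option key and table name are assumptions.
  def readViaDatasource(spark: SparkSession): DataFrame =
    spark.read
      .format("com.qubole.spark.hiveacid.datasource.HiveAcidDataSource")
      .option("table", "default.acidtbl")
      .load()

  // SQL path; relies on the auto-convert extension registered in the session.
  def readViaSql(spark: SparkSession): DataFrame =
    spark.sql("SELECT * FROM default.acidtbl")
}
```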
/src/it/scala/com/qubole/spark/hiveacid/TestHiveClient.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid; 21 | 22 | 23 | import java.sql.Connection; 24 | import java.sql.DriverManager; 25 | import java.sql.ResultSet; 26 | import java.sql.ResultSetMetaData; 27 | import java.sql.SQLException; 28 | import java.sql.Statement; 29 | 30 | import java.io.StringWriter; 31 | 32 | public class TestHiveClient { 33 | private static Connection con = null; 34 | private static Statement stmt = null; 35 | 36 | TestHiveClient() { 37 | try { 38 | // Before running this docker container with HS2 / HMS / Hadoop running 39 | String driverName = "com.qubole.shaded.hive.jdbc.HiveDriver"; 40 | Class.forName(driverName); 41 | } catch (ClassNotFoundException e) { 42 | e.printStackTrace(); 43 | System.exit(1); 44 | } 45 | try { 46 | con = DriverManager.getConnection("jdbc:hive2://0.0.0.0:10001?allowMultiQueries=true", "root", "root"); 47 | stmt = con.createStatement(); 48 | } 49 | catch (Exception e) { 50 | System.out.println("Failed to create statement "+ e); 51 | } 52 | } 53 | 54 | public String executeQuery(String cmd) throws Exception { 55 | // Start Hive txn 56 | ResultSet rs = null; 57 | String resStr = null; 58 | try { 59 | rs = stmt.executeQuery(cmd); 60 | resStr = resultStr(rs); 61 | // close hive txn 62 | rs.close(); 63 | rs = null; 64 | 65 | } catch (Exception e) { 66 | System.out.println("Failed execute query statement \""+ cmd +"\" Error:"+ e); 67 | if (rs != null ) { 68 | rs.close(); 69 | } 70 | } 71 | return resStr; 72 | } 73 | 74 | public void execute(String cmd) throws SQLException { 75 | try { 76 | stmt.execute(cmd); 77 | } catch (Exception e) { 78 | System.out.println("Failed execute statement \""+ cmd +"\" Error:"+ e); 79 | } 80 | } 81 | 82 | private String resultStr(ResultSet rs) throws SQLException { 83 | StringWriter outputWriter = new StringWriter(); 84 | ResultSetMetaData rsmd = rs.getMetaData(); 85 | int columnsNumber = rsmd.getColumnCount(); 86 | int rowNumber = 0; 87 | while (rs.next()) { 88 | if (rowNumber != 0) { 89 | outputWriter.append("\n"); 90 | } 91 | rowNumber++; 92 | for (int i = 1; i <= columnsNumber; i++) { 93 | if (i > 1) outputWriter.append(","); 94 | String columnValue = rs.getString(i); 95 | // outputWriter.append(rsmd.getColumnName(i)+ "=" + columnValue); 96 | outputWriter.append(columnValue); 97 | } 98 | } 99 | return outputWriter.toString(); 100 | } 101 | 102 | public void teardown() throws SQLException { 103 | if (stmt != null) { 104 | stmt.close(); 105 | stmt = null; 106 | } 107 | if (con != 
null) { 108 | con.close(); 109 | con = null; 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/TestSparkSession.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | package com.qubole.spark.hiveacid 20 | 21 | import org.apache.spark.sql.SparkSession 22 | 23 | private[hiveacid] object TestSparkSession { 24 | 25 | def getSession: SparkSession = { 26 | val spark: SparkSession = SparkSession.builder().appName("Hive-acid-test") 27 | .master("local[*]") 28 | .config("spark.hadoop.hive.metastore.uris", "thrift://0.0.0.0:10000") 29 | .config("spark.sql.warehouse.dir", "/tmp") 30 | .config("spark.sql.extensions", "com.qubole.spark.hiveacid.HiveAcidAutoConvertExtension") 31 | //.config("spark.ui.enabled", "true") 32 | //.config("spark.ui.port", "4041") 33 | .enableHiveSupport() 34 | .getOrCreate() 35 | spark.sparkContext.setLogLevel("WARN") 36 | spark 37 | } 38 | 39 | def close(spark: SparkSession): Unit = { 40 | spark.close() 41 | SparkSession.clearActiveSession() 42 | SparkSession.clearDefaultSession() 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/UpdateDeleteSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | 23 | import org.apache.log4j.{Level, LogManager, Logger} 24 | import org.scalatest._ 25 | 26 | import scala.util.control.NonFatal 27 | 28 | class UpdateDeleteSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll { 29 | 30 | val log: Logger = LogManager.getLogger(this.getClass) 31 | log.setLevel(Level.INFO) 32 | 33 | var helper: TestHelper = _ 34 | val isDebug = true 35 | 36 | val DEFAULT_DBNAME = "HiveTestUpdateDeleteDB" 37 | val cols: Map[String, String] = Map( 38 | ("intCol","int"), 39 | ("doubleCol","double"), 40 | ("floatCol","float"), 41 | ("booleanCol","boolean") 42 | ) 43 | 44 | override def beforeAll() { 45 | try { 46 | helper = new TestHelper 47 | if (isDebug) { 48 | log.setLevel(Level.DEBUG) 49 | } 50 | helper.init(isDebug) 51 | 52 | // DB 53 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 54 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME) 55 | } catch { 56 | case NonFatal(e) => log.info("failed " + e) 57 | } 58 | } 59 | 60 | override protected def afterAll(): Unit = { 61 | helper.destroy() 62 | } 63 | 64 | val testTables = List( 65 | // Positive Test 66 | (Table.orcFullACIDTable, false, true), 67 | (Table.orcPartitionedFullACIDTable, true, true), 68 | // Negative Test 69 | (Table.orcTable, false, false), 70 | (Table.orcPartitionedTable, true, false), 71 | (Table.orcBucketedTable, false, false), (Table.orcBucketedPartitionedTable, true, false)) 72 | // Test Run 73 | updateTestForFullAcidTables(testTables) 74 | deleteTestForFullAcidTables(testTables) 75 | 76 | // Update test for full acid tables 77 | def updateTestForFullAcidTables(tTypes: List[(String, Boolean, Boolean)]): Unit = { 78 | tTypes.foreach { case (tType, isPartitioned, positiveTest) => 79 | val tableNameSpark = "tSparkUpdate" 80 | val testName = s"Update Test for $tableNameSpark type $tType" 81 | test(testName) { 82 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned) 83 | def code(): Unit = { 84 | 85 | if (positiveTest) { 86 | helper.recreate(tableSpark) 87 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(11, 20)) 88 | val expectedRows = 10 89 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 90 | val expectedUpdateValue = helper.sparkCollect(tableSpark.selectExpectedUpdateCol(11)) 91 | helper.sparkSQL(tableSpark.updateInHiveTableKey(11)) 92 | val updatedVal = helper.sparkCollect(tableSpark.selectUpdateCol(11)) 93 | helper.compareResult(expectedUpdateValue, updatedVal) 94 | } else { 95 | intercept[RuntimeException] { 96 | helper.recreate(tableSpark) 97 | helper.sparkSQL(tableSpark.updateInHiveTableKey(11)) 98 | } 99 | } 100 | } 101 | helper.myRun(testName, code) 102 | } 103 | } 104 | } 105 | 106 | // Delete test for full acid tables 107 | def deleteTestForFullAcidTables(tTypes: List[(String, Boolean, Boolean)]): Unit = { 108 | tTypes.foreach { case (tType, isPartitioned, positiveTest) => 109 | val tableNameSpark = "tSparkDelete" 110 | val testName = s"Delete Test for $tableNameSpark type $tType" 111 | test(testName) { 112 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned) 113 | def code(): Unit = { 114 | if (positiveTest) { 115 | helper.recreate(tableSpark) 116 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(11, 20)) 117 | var expectedRows = 10 118 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 119 | 120 | // delete 1 row 121 | 
helper.sparkSQL(tableSpark.deleteFromHiveTableKey(11)) 122 | expectedRows = 9 123 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 124 | 125 | // Delete all but 1 using predicates 126 | helper.sparkSQL(tableSpark.deleteFromHiveTableGreaterThanKey(15)) 127 | helper.sparkSQL(tableSpark.deleteFromHiveTableLesserThanKey(15)) 128 | expectedRows = 1 129 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 130 | 131 | // No OP Delete 132 | helper.sparkCollect(tableSpark.deleteFromHiveTableGreaterThanKey(20)) 133 | helper.compareResult(expectedRows.toString, helper.sparkCollect(tableSpark.count)) 134 | } else { 135 | intercept[RuntimeException] { 136 | helper.recreate(tableSpark) 137 | // delete 1 row 138 | helper.sparkSQL(tableSpark.deleteFromHiveTableKey(11)) 139 | } 140 | } 141 | } 142 | helper.myRun(testName, code) 143 | } 144 | } 145 | } 146 | 147 | test("Test Update on Partition Columns is not allowed") { 148 | val tableNameSpark = "tUpdateNeg" 149 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, 150 | Table.orcPartitionedFullACIDTable, true) 151 | helper.recreate(tableSpark,false) 152 | val thrown = intercept[AnalysisException] { 153 | helper.sparkSQL(s"UPDATE ${DEFAULT_DBNAME}.${tableNameSpark} set ptnCol = 2 where intCol > 10") 154 | } 155 | assert(thrown.getMessage.contains("UPDATE on the partition columns are not allowed")) 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/WriteSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | 23 | import org.apache.log4j.{Level, LogManager, Logger} 24 | import org.scalatest._ 25 | 26 | import scala.util.control.NonFatal 27 | 28 | @Ignore 29 | class WriteSuite extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll { 30 | 31 | val log: Logger = LogManager.getLogger(this.getClass) 32 | log.setLevel(Level.INFO) 33 | 34 | var helper: TestHelper = _ 35 | val isDebug = true 36 | 37 | val DEFAULT_DBNAME = "HiveTestDB" 38 | val defaultPred = " intCol < 5 " 39 | val cols: Map[String, String] = Map( 40 | ("intCol","int"), 41 | ("doubleCol","double"), 42 | ("floatCol","float"), 43 | ("booleanCol","boolean") 44 | // TODO: Requires spark.sql.hive.convertMetastoreOrc=false to run 45 | // ("dateCol","date") 46 | ) 47 | 48 | override def beforeAll() { 49 | try { 50 | 51 | helper = new TestHelper 52 | if (isDebug) { 53 | log.setLevel(Level.DEBUG) 54 | } 55 | helper.init(isDebug) 56 | 57 | // DB 58 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 59 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME) 60 | } catch { 61 | case NonFatal(e) => log.info("failed " + e) 62 | } 63 | } 64 | 65 | override protected def afterAll(): Unit = { 66 | helper.destroy() 67 | } 68 | 69 | 70 | // Test Run 71 | insertIntoOverwriteTestForFullAcidTables(Table.allFullAcidTypes()) 72 | 73 | // TODO: Currently requires compatibility check to be disabled in HMS to run clean 74 | // hive.metastore.client.capability.check=false 75 | // insertIntoOverwriteTestForInsertOnlyTables(Table.allInsertOnlyTypes()) 76 | 77 | // Insert Into/Overwrite test for full acid tables 78 | def insertIntoOverwriteTestForFullAcidTables(tTypes: List[(String,Boolean)]): Unit = { 79 | tTypes.foreach { case (tType, isPartitioned) => 80 | val tableNameHive = "tHive" 81 | val tableNameSpark = "tSpark" 82 | val testName = s"Simple InsertInto Test for $tableNameHive/$tableNameSpark type $tType" 83 | test(testName) { 84 | val tableHive = new Table(DEFAULT_DBNAME, tableNameHive, cols, tType, isPartitioned) 85 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned) 86 | def code(): Unit = { 87 | helper.recreate(tableHive) 88 | helper.recreate(tableSpark) 89 | 90 | // Insert into rows in both tables from Hive and Spark 91 | helper.hiveExecute(tableHive.insertIntoHiveTableKeyRange(11, 20)) 92 | helper.sparkSQL(tableSpark.insertIntoSparkTableKeyRange(11, 20)) 93 | var expectedRows = 10 94 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Into", expectedRows) 95 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Into", expectedRows) 96 | 97 | // Insert overwrite rows in both tables from Hive and Spark 98 | helper.hiveExecute(tableHive.insertOverwriteHiveTableKeyRange(16, 25)) 99 | helper.sparkSQL(tableSpark.insertOverwriteSparkTableKeyRange(16, 25)) 100 | expectedRows = if (tableHive.isPartitioned) 15 else 10 101 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Overwrite", expectedRows) 102 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Overwrite", expectedRows) 103 | 104 | // Insert overwrite rows in both tables - add rows in hive table from spark and vice versa 105 | helper.hiveExecute(tableSpark.insertOverwriteHiveTableKeyRange(24, 27)) 106 | helper.sparkSQL(tableHive.insertOverwriteSparkTableKeyRange(24, 27)) 107 | expectedRows = if (tableHive.isPartitioned) expectedRows + 2 else 4 108 | helper.compareTwoTablesViaHive(tableHive, tableSpark, 
"After Insert Overwrite", expectedRows) 109 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Overwrite", expectedRows) 110 | 111 | // Insert into rows in both tables - add rows in hive table from spark and vice versa 112 | helper.hiveExecute(tableSpark.insertIntoHiveTableKeyRange(24, 27)) 113 | helper.sparkSQL(tableHive.insertIntoSparkTableKeyRange(24, 27)) 114 | expectedRows = expectedRows + 4 115 | helper.compareTwoTablesViaHive(tableHive, tableSpark, "After Insert Into", expectedRows) 116 | helper.compareTwoTablesViaSpark(tableHive, tableSpark, "After Insert Into", expectedRows) 117 | 118 | } 119 | helper.myRun(testName, code) 120 | } 121 | } 122 | } 123 | 124 | def insertIntoOverwriteTestForInsertOnlyTables(tTypes: List[(String,Boolean)]): Unit = { 125 | tTypes.foreach { case (tType, isPartitioned) => 126 | val tableNameSpark = "tSpark" 127 | val testName = s"Simple InsertInto Test for $tableNameSpark type $tType" 128 | test(testName) { 129 | val tableSpark = new Table(DEFAULT_DBNAME, tableNameSpark, cols, tType, isPartitioned) 130 | def code() = { 131 | helper.recreate(tableSpark) 132 | } 133 | helper.myRun(testName, code) 134 | } 135 | } 136 | } 137 | 138 | } 139 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkOptionsSuite.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.hiveacid.streaming 2 | 3 | import java.util.Locale 4 | 5 | import com.qubole.spark.hiveacid.Table 6 | import org.apache.spark.sql.streaming.OutputMode 7 | 8 | 9 | class HiveAcidSinkOptionsSuite extends HiveAcidStreamingFunSuite { 10 | 11 | import HiveAcidSinkOptions._ 12 | 13 | test("bad sink options") { 14 | 15 | def testBadOptions(options: List[(String, String)])(expectedMsg: String): Unit = { 16 | 17 | val tableName = "tempTable" 18 | val tType = Table.orcFullACIDTable 19 | val cols = Map( 20 | ("value1","int"), 21 | ("value2", "int") 22 | ) 23 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false) 24 | 25 | // creating table 26 | helper.recreate(tableHive) 27 | val errorMessage = intercept[IllegalArgumentException] { 28 | helper.runStreaming( 29 | tableHive.hiveTname, OutputMode.Append(), tableHive.getColMap.keys.toSeq, Range(1, 4), options) 30 | }.getMessage 31 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 32 | 33 | } 34 | 35 | testBadOptions(List(CLEANUP_DELAY_KEY -> "-2"))("Invalid value '-2' " + 36 | s"for option '$CLEANUP_DELAY_KEY', must be a positive integer") 37 | testBadOptions(List(COMPACT_INTERVAL_KEY -> "-5"))("Invalid value '-5' " + 38 | s"for option '$COMPACT_INTERVAL_KEY', must be a positive integer") 39 | testBadOptions(List(MIN_BATCHES_TO_RETAIN_KEY -> "-5"))("Invalid value '-5' " + 40 | s"for option '$MIN_BATCHES_TO_RETAIN_KEY', must be a positive integer") 41 | testBadOptions(List(LOG_DELETION_KEY -> "x"))("Invalid value 'x' " + 42 | s"for option '$LOG_DELETION_KEY', must be true or false") 43 | 44 | } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. 
See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.streaming 21 | 22 | import java.util.Locale 23 | 24 | import com.qubole.shaded.hadoop.hive.ql.metadata.InvalidTableException 25 | import com.qubole.spark.hiveacid.{AnalysisException, Table} 26 | import com.qubole.spark.hiveacid.datasource.HiveAcidDataSource 27 | import org.apache.spark.sql.Row 28 | import org.apache.spark.sql.streaming.OutputMode 29 | 30 | 31 | class HiveAcidSinkSuite extends HiveAcidStreamingFunSuite { 32 | 33 | override protected def afterAll(): Unit = { 34 | helper.destroy() 35 | } 36 | 37 | test("table not created") { 38 | val ds = new HiveAcidDataSource() 39 | val tableName = "tempTable" 40 | val options = Map("table" -> s"$tableName") 41 | 42 | val errorMessage = intercept[InvalidTableException] { 43 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append()) 44 | }.getMessage() 45 | val expectedMsg = s"""table not found $tableName""" 46 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 47 | 48 | } 49 | 50 | test("table not acid table") { 51 | val ds = new HiveAcidDataSource() 52 | val tableName = s"tempTable" 53 | val options = Map("table" -> s"$DEFAULT_DBNAME.$tableName") 54 | 55 | val tType = Table.orcTable 56 | val cols = Map( 57 | ("value1","int"), 58 | ("value2", "int") 59 | ) 60 | 61 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false) 62 | 63 | helper.recreate(tableHive, false) 64 | 65 | val errorMessage = intercept[IllegalArgumentException] { 66 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append()) 67 | }.getMessage() 68 | val expectedMsg = s"""table ${tableHive.hiveTname} is not an acid table""" 69 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 70 | 71 | } 72 | 73 | test("table is bucketed") { 74 | val ds = new HiveAcidDataSource() 75 | val tableName = s"tempTable" 76 | val options = Map("table" -> s"$DEFAULT_DBNAME.$tableName") 77 | 78 | val tType = Table.orcBucketedFullACIDTable 79 | val cols = Map( 80 | ("value1","int"), 81 | ("value2", "int") 82 | ) 83 | 84 | val tableHive = new Table(DEFAULT_DBNAME, tableName, cols, tType, false) 85 | 86 | helper.recreate(tableHive, false) 87 | 88 | val errorMessage = intercept[RuntimeException] { 89 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append()) 90 | }.getMessage() 91 | val expectedMsg = s"""Unsupported operation type - Streaming Write for Bucketed table """ 92 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 93 | 94 | } 95 | 96 | test("partitionBy is specified with Acid Streaming") { 97 | val ds = new HiveAcidDataSource() 98 | val options = Map("table" -> "dummyTable") 99 | val errorMessage = 
intercept[UnsupportedOperationException] { 100 | ds.createSink(helper.spark.sqlContext, options, Seq("col1", "col2"), OutputMode.Append()) 101 | }.getMessage() 102 | 103 | val expectedMsg = "Unsupported Function - partitionBy with HiveAcidSink" 104 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 105 | 106 | } 107 | 108 | test("incorrect output mode is used with Acid Streaming") { 109 | val ds = new HiveAcidDataSource() 110 | val options = Map("table" -> "dummyTable") 111 | val errorMessage = intercept[AnalysisException] { 112 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Update()) 113 | }.getMessage() 114 | val expectedMsg = "mode is Update: Hive Acid Sink supports only Append as OutputMode" 115 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 116 | 117 | } 118 | 119 | test("table not specified") { 120 | val ds = new HiveAcidDataSource() 121 | val options = Map.empty[String, String] 122 | val errorMessage = intercept[IllegalArgumentException] { 123 | ds.createSink(helper.spark.sqlContext, options, Seq.empty, OutputMode.Append()) 124 | }.getMessage() 125 | val expectedMsg = """Table Name is not specified""" 126 | assert(errorMessage.toLowerCase(Locale.ROOT).contains(expectedMsg.toLowerCase(Locale.ROOT))) 127 | 128 | } 129 | 130 | // Test Run 131 | streamingTestForAcidTables(Table.allNonBucketedFullAcidTypes()) 132 | streamingTestForAcidTables(Table.allNonBucketedInsertOnlyTypes()) 133 | 134 | def streamingTestForAcidTables(tTypes: List[(String,Boolean)]): Unit = { 135 | tTypes.foreach { case (tType, isPartitioned) => 136 | val tableNameHive = "tHive" 137 | val testName = s"Simple Streaming Query Append for $tableNameHive type $tType" 138 | test(testName) { 139 | val cols: Map[String, String] = { 140 | if(!isPartitioned) { 141 | Map( 142 | ("value1","int"), 143 | ("value2","int") 144 | ) 145 | } else { 146 | Map( 147 | ("value","int") 148 | ) 149 | } 150 | } 151 | 152 | val tableHive = new Table(DEFAULT_DBNAME, tableNameHive, cols, tType, isPartitioned) 153 | def code(): Unit = { 154 | helper.recreate(tableHive) 155 | 156 | helper.runStreaming(tableHive.hiveTname, OutputMode.Append(), tableHive.getColMap.keys.toSeq, Range(1, 4)) 157 | val resDf = helper.sparkGetDF(tableHive) 158 | val resultRow = (Row(100, 10, 1) :: Row(200, 20, 2) :: Row(300, 30, 3) :: Nil).toArray 159 | helper.compareResult(resDf._1.collect(), resultRow) 160 | helper.compareResult(resDf._2.collect(), resultRow) 161 | helper.compare(tableHive, "compare via hive") 162 | } 163 | helper.myRun(testName, code) 164 | } 165 | } 166 | } 167 | 168 | } 169 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/streaming/HiveAcidStreamingFunSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | package com.qubole.spark.hiveacid.streaming 22 | 23 | import org.apache.log4j.{Level, LogManager, Logger} 24 | import org.scalatest.{BeforeAndAfterAll, FunSuite} 25 | 26 | import scala.util.control.NonFatal 27 | 28 | 29 | abstract class HiveAcidStreamingFunSuite extends FunSuite with BeforeAndAfterAll { 30 | 31 | protected val log: Logger = LogManager.getLogger(this.getClass) 32 | log.setLevel(Level.INFO) 33 | 34 | protected var helper: StreamingTestHelper = _ 35 | protected val isDebug = true 36 | 37 | protected val DEFAULT_DBNAME = "HiveTestDB" 38 | 39 | override protected def beforeAll() { 40 | try { 41 | 42 | helper = new StreamingTestHelper 43 | if (isDebug) { 44 | log.setLevel(Level.DEBUG) 45 | } 46 | helper.init(isDebug) 47 | 48 | // DB 49 | helper.hiveExecute("DROP DATABASE IF EXISTS "+ DEFAULT_DBNAME +" CASCADE") 50 | helper.hiveExecute("CREATE DATABASE "+ DEFAULT_DBNAME) 51 | } catch { 52 | case NonFatal(e) => log.info("failed " + e) 53 | } 54 | } 55 | 56 | override protected def afterAll(): Unit = { 57 | helper.destroy() 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/it/scala/com/qubole/spark/hiveacid/streaming/StreamingTestHelper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | 21 | package com.qubole.spark.hiveacid.streaming 22 | 23 | import java.io.{File, IOException} 24 | import java.util.UUID 25 | 26 | import com.qubole.spark.hiveacid.TestHelper 27 | 28 | import org.apache.spark.network.util.JavaUtils 29 | import org.apache.spark.sql.execution.streaming.MemoryStream 30 | import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} 31 | import org.scalatest.concurrent.TimeLimits 32 | import org.scalatest.time.SpanSugar 33 | 34 | class StreamingTestHelper extends TestHelper with TimeLimits { 35 | 36 | import StreamingTestHelper._ 37 | 38 | 39 | def runStreaming(tableName: String, 40 | outputMode: OutputMode, 41 | cols: Seq[String], 42 | inputRange: Range, 43 | options: List[(String, String)] = List.empty): Unit = { 44 | 45 | val inputData = MemoryStream[Int] 46 | val ds = inputData.toDS() 47 | 48 | val checkpointDir = createCheckpointDir(namePrefix = "stream.checkpoint").getCanonicalPath 49 | 50 | var query: StreamingQuery = null 51 | 52 | try { 53 | // Starting streaming query 54 | val writerDf = 55 | ds.map(i => (i*100, i*10, i)) 56 | .toDF(cols:_*) 57 | .writeStream 58 | .format("HiveAcid") 59 | .option("table", tableName) 60 | .outputMode(outputMode) 61 | .option("checkpointLocation", checkpointDir) 62 | //.start() 63 | 64 | query = options.map { option => 65 | writerDf.option(option._1, option._2) 66 | }.lastOption.getOrElse(writerDf).start() 67 | 68 | // Adding data for streaming query 69 | inputData.addData(inputRange) 70 | failAfter(STREAMING_TIMEOUT) { 71 | query.processAllAvailable() 72 | } 73 | } finally { 74 | if (query != null) { 75 | // Terminating streaming query 76 | query.stop() 77 | deleteCheckpointDir(checkpointDir) 78 | } 79 | } 80 | } 81 | 82 | def deleteCheckpointDir(fileStr: String): Unit = { 83 | val file = new File(fileStr) 84 | if (file != null) { 85 | JavaUtils.deleteRecursively(file) 86 | } 87 | } 88 | 89 | def createCheckpointDir(root: String = System.getProperty("java.io.tmpdir"), 90 | namePrefix: String = "spark"): File = { 91 | 92 | var attempts = 0 93 | val maxAttempts = MAX_DIR_CREATION_ATTEMPTS 94 | var dir: File = null 95 | while (dir == null) { 96 | attempts += 1 97 | if (attempts > maxAttempts) { 98 | throw new IOException("Failed to create a temp directory (under " + root + ") after " + 99 | maxAttempts + " attempts!") 100 | } 101 | try { 102 | dir = new File(root, namePrefix + "-" + UUID.randomUUID.toString) 103 | if (dir.exists() || !dir.mkdirs()) { 104 | dir = null 105 | } 106 | } catch { case e: SecurityException => dir = null; } 107 | } 108 | dir.getCanonicalFile 109 | } 110 | 111 | } 112 | 113 | object StreamingTestHelper extends TestHelper with SpanSugar { 114 | 115 | val MAX_DIR_CREATION_ATTEMPTS = 10 116 | val STREAMING_TIMEOUT = 60.seconds 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 Qubole, Inc. All rights reserved. 3 | # 4 | # Licensed to the Apache Software Foundation (ASF) under one or more 5 | # contributor license agreements. See the NOTICE file distributed with 6 | # this work for additional information regarding copyright ownership. 7 | # The ASF licenses this file to You under the Apache License, Version 2.0 8 | # (the "License"); you may not use this file except in compliance with 9 | # the License. 
You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | # 19 | 20 | com.qubole.spark.hiveacid.datasource.HiveAcidDataSource 21 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/shaded/hadoop/hive/ql/io/orc/OrcAcidUtil.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.shaded.hadoop.hive.ql.io.orc 2 | 3 | import java.util.regex.Pattern 4 | 5 | import com.qubole.shaded.hadoop.hive.ql.io.AcidUtils 6 | import org.apache.hadoop.fs.Path 7 | 8 | object OrcAcidUtil { 9 | val BUCKET_PATTERN = Pattern.compile("bucket_[0-9]{5}$") 10 | 11 | def getDeleteDeltaPaths(orcSplit: OrcSplit): Array[Path] = { 12 | assert(BUCKET_PATTERN.matcher(orcSplit.getPath.getName).matches()) 13 | val bucket = AcidUtils.parseBucketId(orcSplit.getPath) 14 | assert(bucket != -1) 15 | val deleteDeltaDirPaths = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(orcSplit); 16 | deleteDeltaDirPaths.map(deleteDir => AcidUtils.createBucketFile(deleteDir, bucket)) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/HiveAnalysisException.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql 2 | 3 | import org.apache.spark.sql.AnalysisException 4 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 5 | 6 | class HiveAnalysisException( 7 | override val message: String, 8 | override val line: Option[Int] = None, 9 | override val startPosition: Option[Int] = None, 10 | // Some plans fail to serialize due to bugs in scala collections. 
11 | @transient override val plan: Option[LogicalPlan] = None, 12 | override val cause: Option[Throwable] = None) extends AnalysisException(message, line, startPosition, plan, cause) { 13 | } 14 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/parser/ParseDriver.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.parser 2 | 3 | import com.qubole.spark.datasources.hiveacid.sql.catalyst.parser.{SqlHiveParser => SqlBaseParser} 4 | import org.antlr.v4.runtime._ 5 | import org.antlr.v4.runtime.misc.Interval 6 | import org.antlr.v4.runtime.tree.TerminalNodeImpl 7 | import org.apache.spark.sql.catalyst.expressions.AttributeReference 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * A copy of [[org.apache.spark.sql.catalyst.parser.UpperCaseCharStream]] 12 | */ 13 | class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream { 14 | override def consume(): Unit = wrapped.consume 15 | override def getSourceName(): String = wrapped.getSourceName 16 | override def index(): Int = wrapped.index 17 | override def mark(): Int = wrapped.mark 18 | override def release(marker: Int): Unit = wrapped.release(marker) 19 | override def seek(where: Int): Unit = wrapped.seek(where) 20 | override def size(): Int = wrapped.size 21 | 22 | override def getText(interval: Interval): String = { 23 | // ANTLR 4.7's CodePointCharStream implementations have bugs when 24 | // getText() is called with an empty stream, or intervals where 25 | // the start > end. See 26 | // https://github.com/antlr/antlr4/commit/ac9f7530 for one fix 27 | // that is not yet in a released ANTLR artifact. 28 | if (size() > 0 && (interval.b - interval.a >= 0)) wrapped.getText(interval) else "" 29 | } 30 | 31 | override def LA(i: Int): Int = { 32 | val la = wrapped.LA(i) 33 | if (la == 0 || la == IntStream.EOF) la 34 | else Character.toUpperCase(la) 35 | } 36 | } 37 | 38 | /** 39 | * An adaptation of [[org.apache.spark.sql.catalyst.parser.PostProcessor]] 40 | */ 41 | case object PostProcessor extends SqlHiveBaseListener { 42 | 43 | /** Remove the back ticks from an Identifier. */ 44 | override def exitQuotedIdentifier(ctx: SqlBaseParser.QuotedIdentifierContext): Unit = { 45 | replaceTokenByIdentifier(ctx, 1) { token => 46 | // Remove the double back ticks in the string. 47 | token.setText(token.getText.replace("``", "`")) 48 | token 49 | } 50 | } 51 | 52 | /** Treat non-reserved keywords as Identifiers. 
*/ 53 | override def exitNonReserved(ctx: SqlBaseParser.NonReservedContext): Unit = { 54 | replaceTokenByIdentifier(ctx, 0)(identity) 55 | } 56 | 57 | private def replaceTokenByIdentifier( 58 | ctx: ParserRuleContext, 59 | stripMargins: Int)( 60 | f: CommonToken => CommonToken = identity): Unit = { 61 | val parent = ctx.getParent 62 | parent.removeLastChild() 63 | val token = ctx.getChild(0).getPayload.asInstanceOf[Token] 64 | val newToken = new CommonToken( 65 | new org.antlr.v4.runtime.misc.Pair(token.getTokenSource, token.getInputStream), 66 | SqlBaseParser.IDENTIFIER, 67 | token.getChannel, 68 | token.getStartIndex + stripMargins, 69 | token.getStopIndex - stripMargins) 70 | parent.addChild(new TerminalNodeImpl(f(newToken))) 71 | } 72 | } 73 | 74 | /** 75 | * An adaptation of [[org.apache.spark.util.random.RandomSampler]] 76 | */ 77 | object RandomSampler { 78 | /** 79 | * Sampling fraction arguments may be results of computation, and subject to floating 80 | * point jitter. I check the arguments with this epsilon slop factor to prevent spurious 81 | * warnings for cases such as summing some numbers to get a sampling fraction of 1.000000001 82 | */ 83 | val roundingEpsilon = 1e-6 84 | } 85 | 86 | object SparkAdaptation { 87 | /** 88 | * An adaptation of [[org.apache.spark.sql.types.StructType#toAttributes]] 89 | */ 90 | def toAttributes(structType: StructType): Seq[AttributeReference] = 91 | structType.map(f => AttributeReference(f.name, f.dataType, f.nullable, f.metadata)()) 92 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/DeleteCommand.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command 2 | 3 | import com.qubole.spark.hiveacid.HiveAcidErrors 4 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation 5 | import org.apache.spark.sql.{Column, Row, SparkSession} 6 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 7 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 8 | import org.apache.spark.sql.execution.command.RunnableCommand 9 | import org.apache.spark.sql.execution.datasources.LogicalRelation 10 | 11 | case class DeleteCommand( 12 | table: LogicalPlan, 13 | condition: Expression) 14 | extends RunnableCommand { 15 | 16 | // We don't want `table` in children as sometimes we don't want to transform it. 
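// For orientation, a sketch of the flow that reaches this command (table name illustrative only):
//   spark.sql("DELETE FROM acid_db.acid_tbl WHERE id < 100")
// SparkAcidSqlParser recognises the DELETE token and produces a DeleteCommand; run() below then
// forwards the predicate to HiveAcidRelation.delete once `table` resolves to a LogicalRelation
// backed by HiveAcidRelation.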
17 | override def children: Seq[LogicalPlan] = Seq(table) 18 | override def output: Seq[Attribute] = Seq.empty 19 | override lazy val resolved: Boolean = childrenResolved 20 | override def run(sparkSession: SparkSession): Seq[Row] = { 21 | if (children.size != 1) { 22 | throw new IllegalArgumentException("DELETE command should specify exactly one table, whereas this has: " 23 | + children.size) 24 | } 25 | children(0) match { 26 | case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => { 27 | relation.delete(new Column(condition)) 28 | } 29 | case _ => throw HiveAcidErrors.tableNotAcidException(table.toString()) 30 | } 31 | Seq.empty[Row] 32 | } 33 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/MergeCommand.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command 19 | 20 | import com.qubole.spark.hiveacid.HiveAcidErrors 21 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation 22 | import com.qubole.spark.hiveacid.merge.{MergeCondition, MergeWhenClause, MergeWhenNotInsert} 23 | import org.apache.spark.sql.catalyst.AliasIdentifier 24 | import org.apache.spark.sql.catalyst.analysis.EliminateSubqueryAliases 25 | import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation} 26 | import org.apache.spark.sql.{Row, SparkSession, SqlUtils} 27 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 28 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} 29 | import org.apache.spark.sql.execution.command.RunnableCommand 30 | import org.apache.spark.sql.execution.datasources.LogicalRelation 31 | 32 | case class MergeCommand(targetTable: LogicalPlan, 33 | sourceTable: LogicalPlan, 34 | matched: Seq[MergeWhenClause], 35 | notMatched: Option[MergeWhenClause], 36 | mergeCondition: MergeCondition, 37 | sourceAlias: Option[AliasIdentifier], 38 | targetAlias: Option[AliasIdentifier]) 39 | extends RunnableCommand { 40 | 41 | override def children: Seq[LogicalPlan] = Seq(targetTable, sourceTable) 42 | override def output: Seq[Attribute] = Seq.empty 43 | override lazy val resolved: Boolean = childrenResolved 44 | override def run(sparkSession: SparkSession): Seq[Row] = { 45 | val insertClause: Option[MergeWhenNotInsert] = notMatched match { 46 | case Some(i: MergeWhenNotInsert) => Some(i) 47 | case None => None 48 | case _ => throw HiveAcidErrors.mergeValidationError("WHEN NOT Clause has to be INSERT CLAUSE") 49 | } 50 | 51 | val targetRelation = children.head 52 | val 
sourceRelation = children.last 53 | 54 | val sourceTableFullyQualifiedName = SqlUtils.removeTopSubqueryAlias(sourceRelation) match { 55 | case hiveTable: HiveTableRelation => 56 | Some(hiveTable.tableMeta.qualifiedName) 57 | case LogicalRelation(acidRelation: HiveAcidRelation, _, _, _) => 58 | Some(acidRelation.fullyQualifiedTableName) 59 | case LogicalRelation(_, _, catalogTable: Option[CatalogTable], _) if catalogTable.isDefined => 60 | Some(catalogTable.get.qualifiedName) 61 | case _ => None 62 | } 63 | 64 | val (_, sourceDf) = SqlUtils.getDFQualified(sparkSession, 65 | SqlUtils.logicalPlanToDataFrame(sparkSession, sourceTable), 66 | sourceTableFullyQualifiedName.getOrElse("")) 67 | 68 | SqlUtils.removeTopSubqueryAlias(targetRelation) match { 69 | case LogicalRelation(relation: HiveAcidRelation, _, _, _) => 70 | relation.merge(sourceDf, 71 | mergeCondition.expression, matched, insertClause, sourceAlias, targetAlias) 72 | case _ => throw HiveAcidErrors.tableNotAcidException(targetTable.toString()) 73 | } 74 | 75 | Seq.empty 76 | } 77 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/catalyst/plans/command/UpdateCommand.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql.catalyst.plans.command 2 | 3 | import com.qubole.spark.hiveacid.HiveAcidErrors 4 | import com.qubole.spark.hiveacid.datasource.HiveAcidRelation 5 | import org.apache.spark.sql.{Column, Row, SparkSession} 6 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 7 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 8 | import org.apache.spark.sql.execution.command.RunnableCommand 9 | import org.apache.spark.sql.execution.datasources.LogicalRelation 10 | 11 | case class UpdateCommand( 12 | table: LogicalPlan, 13 | setExpressions: Map[String, Expression], 14 | condition: Option[Expression]) 15 | extends RunnableCommand { 16 | 17 | override def children: Seq[LogicalPlan] = Seq(table) 18 | override def output: Seq[Attribute] = Seq.empty 19 | override lazy val resolved: Boolean = childrenResolved 20 | 21 | override def run(sparkSession: SparkSession): Seq[Row] = { 22 | if (children.size != 1) { 23 | throw new IllegalArgumentException("UPDATE command should have one table to update, whereas this has: " 24 | + children.size) 25 | } 26 | children(0) match { 27 | case LogicalRelation(relation: HiveAcidRelation, _, _ , _) => { 28 | val setColumns = setExpressions.mapValues(expr => new Column(expr)) 29 | val updateFilterColumn = condition.map(new Column(_)) 30 | relation.update(updateFilterColumn, setColumns) 31 | } 32 | case LogicalRelation(_, _, Some(catalogTable), _) => 33 | throw HiveAcidErrors.tableNotAcidException(catalogTable.qualifiedName) 34 | case _ => throw HiveAcidErrors.tableNotAcidException(table.toString()) 35 | } 36 | Seq.empty[Row] 37 | } 38 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/datasources/hiveacid/sql/execution/SparkAcidSqlParser.scala: -------------------------------------------------------------------------------- 1 | package com.qubole.spark.datasources.hiveacid.sql.execution 2 | 3 | import com.qubole.spark.datasources.hiveacid.sql.catalyst.parser._ 4 | import org.antlr.v4.runtime._ 5 | import org.antlr.v4.runtime.atn.PredictionMode 6 | import org.apache.spark.internal.Logging 7 | import 
org.apache.spark.sql.catalyst.expressions.Expression 8 | import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} 9 | import org.apache.spark.sql.{AnalysisException, SparkSession} 10 | import org.apache.spark.sql.catalyst.parser.{ParseErrorListener, ParseException, ParserInterface} 11 | import org.apache.spark.sql.catalyst.plans.logical._ 12 | import org.apache.spark.sql.catalyst.trees.Origin 13 | import org.apache.spark.sql.execution.SparkSqlParser 14 | import org.apache.spark.sql.internal.{SQLConf, VariableSubstitution} 15 | import org.apache.spark.sql.types.{DataType, StructType} 16 | 17 | /** 18 | * Concrete parser for Hive SQL statements. 19 | */ 20 | case class SparkAcidSqlParser(sparkParser: ParserInterface) extends ParserInterface with Logging { 21 | 22 | override def parseExpression(sqlText: String): Expression = sparkParser.parseExpression(sqlText) 23 | 24 | override def parseTableIdentifier(sqlText: String): TableIdentifier = sparkParser.parseTableIdentifier(sqlText) 25 | 26 | override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = sparkParser.parseFunctionIdentifier(sqlText) 27 | 28 | override def parseTableSchema(sqlText: String): StructType = sparkParser.parseTableSchema(sqlText) 29 | 30 | override def parseDataType(sqlText: String): DataType = sparkParser.parseDataType(sqlText) 31 | 32 | private val substitutor: VariableSubstitution = { 33 | val field = classOf[SparkSqlParser].getDeclaredField("substitutor") 34 | field.setAccessible(true) 35 | field.get(sparkParser).asInstanceOf[VariableSubstitution] 36 | } 37 | 38 | // FIXME scala reflection would be better 39 | private val conf: SQLConf = { 40 | val field = classOf[VariableSubstitution].getDeclaredField("org$apache$spark$sql$internal$VariableSubstitution$$conf") 41 | field.setAccessible(true) 42 | field.get(substitutor).asInstanceOf[SQLConf] 43 | } 44 | 45 | private val sparkAcidAstBuilder = new SparkSqlAstBuilder(conf) 46 | 47 | override def parsePlan(sqlText: String): LogicalPlan = { 48 | try { 49 | parse(sqlText) { parser => 50 | sparkAcidAstBuilder.visitSingleStatement(parser.singleStatement()) match { 51 | case plan: LogicalPlan => plan 52 | case _ => sparkParser.parsePlan(sqlText) 53 | } 54 | } 55 | } catch { 56 | case e: AcidParseException => throw e.parseException 57 | case _: ParseException => sparkParser.parsePlan(sqlText) 58 | } 59 | } 60 | 61 | /** 62 | * An adaptation of [[org.apache.spark.sql.execution.SparkSqlParser#parse]] 63 | * and [[org.apache.spark.sql.catalyst.parser.AbstractSqlParser#parse]] 64 | */ 65 | protected def parse[T](sqlText: String)(toResult: SqlHiveParser => T): T = { 66 | val command = substitutor.substitute(sqlText) 67 | logDebug(s"Parsing command: $command") 68 | 69 | 70 | val lexer = new SqlHiveLexer(new UpperCaseCharStream(CharStreams.fromString(command))) 71 | lexer.removeErrorListeners() 72 | lexer.addErrorListener(ParseErrorListener) 73 | lexer.legacy_setops_precedence_enbled = SQLConf.get.setOpsPrecedenceEnforced 74 | 75 | val tokenStream = new CommonTokenStream(lexer) 76 | val acidSpecific = checkIfAcidSpecific(tokenStream) 77 | tokenStream.seek(0) //reset stream to first token 78 | val parser = new SqlHiveParser(tokenStream) 79 | parser.addParseListener(PostProcessor) 80 | parser.removeErrorListeners() 81 | parser.addErrorListener(ParseErrorListener) 82 | parser.legacy_setops_precedence_enbled = SQLConf.get.setOpsPrecedenceEnforced 83 | try { 84 | parser.getInterpreter.setPredictionMode(PredictionMode.LL) 85 | toResult(parser) 86 | } catch { 
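// Note on the error handling below: checkIfAcidSpecific (see further down) flags DELETE/UPDATE/MERGE
// statements, and wrapParseException only wraps failures for those into AcidParseException, so
// parsePlan rethrows parse errors for ACID statements but falls back to the wrapped Spark parser
// for everything else.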
87 | case e: ParseException if e.command.isDefined => 88 | throw wrapParseException(e, acidSpecific) 89 | case e: ParseException => 90 | throw wrapParseException(e.withCommand(command), acidSpecific) 91 | case e: AnalysisException => 92 | val position = Origin(e.line, e.startPosition) 93 | val pe = new ParseException(Option(command), e.message, position, position) 94 | throw wrapParseException(pe, acidSpecific) 95 | } 96 | } 97 | 98 | /** 99 | * Denotes ACID Specific ParseException 100 | * @param parseException 101 | */ 102 | class AcidParseException(val parseException: ParseException) extends Exception 103 | 104 | def wrapParseException(e: ParseException, acidSpecific: Boolean): Throwable = { 105 | if (acidSpecific) { 106 | new AcidParseException(e) 107 | } else { 108 | e 109 | } 110 | } 111 | def checkIfAcidSpecific(tokStream: TokenStream): Boolean = { 112 | tokStream.LA(1) match { 113 | case SqlHiveParser.DELETE | SqlHiveParser.MERGE | SqlHiveParser.UPDATE => true 114 | case _ => false 115 | } 116 | } 117 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/.gitignore: -------------------------------------------------------------------------------- 1 | *.scalae 2 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/HiveAcidAutoConvert.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | import java.util.Locale 23 | 24 | import com.qubole.spark.datasources.hiveacid.sql.execution.SparkAcidSqlParser 25 | import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} 26 | import org.apache.spark.sql.catalyst.catalog.HiveTableRelation 27 | import org.apache.spark.sql.catalyst.plans.logical.{Filter, InsertIntoTable, LogicalPlan} 28 | import org.apache.spark.sql.catalyst.rules.Rule 29 | import org.apache.spark.sql.execution.command.DDLUtils 30 | import org.apache.spark.sql.execution.datasources.LogicalRelation 31 | import com.qubole.spark.hiveacid.datasource.HiveAcidDataSource 32 | 33 | 34 | /** 35 | * Analyzer rule to convert a transactional HiveRelation 36 | * into LogicalRelation backed by HiveAcidRelation 37 | * @param spark - spark session 38 | */ 39 | case class HiveAcidAutoConvert(spark: SparkSession) extends Rule[LogicalPlan] { 40 | 41 | private def isConvertible(relation: HiveTableRelation): Boolean = { 42 | val serde = relation.tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT) 43 | relation.tableMeta.properties.getOrElse("transactional", "false").toBoolean 44 | } 45 | 46 | private def convert(relation: HiveTableRelation): LogicalRelation = { 47 | val options = relation.tableMeta.properties ++ 48 | relation.tableMeta.storage.properties ++ Map("table" -> relation.tableMeta.qualifiedName) 49 | 50 | val newRelation = new HiveAcidDataSource().createRelation(spark.sqlContext, options) 51 | LogicalRelation(newRelation, isStreaming = false) 52 | } 53 | 54 | override def apply(plan: LogicalPlan): LogicalPlan = { 55 | plan resolveOperators { 56 | // Write path 57 | case InsertIntoTable(r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists) 58 | if query.resolved && DDLUtils.isHiveTable(r.tableMeta) && isConvertible(r) => 59 | InsertIntoTable(convert(r), partition, query, overwrite, ifPartitionNotExists) 60 | 61 | // Read path 62 | case relation: HiveTableRelation 63 | if DDLUtils.isHiveTable(relation.tableMeta) && isConvertible(relation) => 64 | convert(relation) 65 | } 66 | } 67 | } 68 | 69 | class HiveAcidAutoConvertExtension extends (SparkSessionExtensions => Unit) { 70 | def apply(extension: SparkSessionExtensions): Unit = { 71 | extension.injectResolutionRule(HiveAcidAutoConvert.apply) 72 | extension.injectParser { (session, parser) => 73 | SparkAcidSqlParser(parser) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/HiveAcidErrors.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | import org.apache.spark.sql.{SaveMode, SqlUtils} 23 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 24 | 25 | object HiveAcidErrors { 26 | 27 | def formatColumn(colName: String): String = s"`$colName`" 28 | 29 | def formatColumnList(colNames: Seq[String]): String = 30 | colNames.map(formatColumn).mkString("[", ", ", "]") 31 | 32 | def tableNotSpecifiedException(): Throwable = { 33 | new IllegalArgumentException("'table' is not specified in parameters") 34 | } 35 | 36 | def unsupportedFunction(function: String, caller: String): Throwable = { 37 | new java.lang.UnsupportedOperationException(s"Unsupported Function - $function with $caller") 38 | } 39 | 40 | def invalidOperationType(operation: String): Throwable = { 41 | new RuntimeException(s"Invalid operation type - $operation") 42 | } 43 | 44 | def unsupportedSaveMode(saveMode: SaveMode): Throwable = { 45 | new RuntimeException(s"Unsupported save mode - $saveMode") 46 | } 47 | 48 | def unsupportedOperationTypeInsertOnlyTable(operation: String, tableName: String): Throwable = { 49 | new RuntimeException(s"Unsupported operation type - $operation for InsertOnly table " + tableName) 50 | } 51 | 52 | def unsupportedOperationTypeBucketedTable(operation: String, tableName: String): Throwable = { 53 | new RuntimeException(s"Unsupported operation type - $operation for Bucketed table " + tableName) 54 | } 55 | 56 | def tableNotAcidException(tableName: String): Throwable = { 57 | new IllegalArgumentException(s"table $tableName is not an ACID table") 58 | } 59 | 60 | def couldNotAcquireLockException(exception: Exception = null): Throwable = { 61 | new RuntimeException(s"Could not acquire lock.", exception) 62 | } 63 | 64 | def couldNotAcquireLockException(state: String): Throwable = { 65 | new RuntimeException(s"Could not acquire lock. Lock State: $state") 66 | } 67 | 68 | def txnAlreadyClosed(txnId: Long): Throwable = { 69 | new RuntimeException(s"Transaction $txnId is already closed") 70 | } 71 | 72 | def txnAlreadyOpen(txnId: Long): Throwable = { 73 | new RuntimeException(s"Transaction already opened. 
Existing txnId: $txnId") 74 | } 75 | 76 | def txnNotStarted(table: String): Throwable = { 77 | new RuntimeException(s"Transaction on $table not started") 78 | } 79 | 80 | def txnNoTransaction(): Throwable = { 81 | new RuntimeException(s"No transaction found") 82 | } 83 | 84 | def tableSnapshotNonExistent(snapshotId: Long): Throwable = { 85 | new RuntimeException(s"Table snapshot $snapshotId does not exist") 86 | } 87 | 88 | def tableWriteIdRequestedBeforeTxnStart(table: String): Throwable = { 89 | new RuntimeException(s"Write id requested for table $table before txn was started") 90 | } 91 | 92 | def repeatedTxnId(txnId: Long, activeTxns: Seq[Long]): Throwable = { 93 | new RuntimeException( 94 | s"Repeated transaction id $txnId, active transactions are [${activeTxns.mkString(",")}]") 95 | } 96 | 97 | def unsupportedStreamingOutputMode(mode: String): Throwable = { 98 | new AnalysisException( 99 | s"mode is $mode: Hive Acid Sink supports only Append as OutputMode") 100 | } 101 | 102 | def updateSetColumnNotFound(col: String, colList: Seq[String]): Throwable = { 103 | new AnalysisException( 104 | s"SET column ${formatColumn(col)} not found among columns: ${formatColumnList(colList)}.") 105 | } 106 | 107 | def updateOnPartition(cols: Seq[String], table: String): Throwable = { 108 | val message = if (cols.length == 1) { 109 | s"SET column: ${cols.head} is partition column in table: ${table}" 110 | } else { 111 | s"SET columns: ${cols.mkString(",")} are partition columns in table: ${table}" 112 | } 113 | new AnalysisException( 114 | s"UPDATE on the partition columns are not allowed. $message" 115 | ) 116 | } 117 | 118 | def txnOutdated(txnId: Long, tableName: String): Throwable = { 119 | new TransactionInvalidException( 120 | s"Transaction $txnId is no longer valid for table $tableName", txnId, tableName) 121 | } 122 | 123 | def unexpectedReadError(cause: String): Throwable = { 124 | throw new RuntimeException( 125 | s"Unexpected error while reading the Hive Acid Data: $cause") 126 | } 127 | 128 | def mergeValidationError(cause: String): Throwable = { 129 | SqlUtils.analysisException(s"MERGE Validation Error: $cause") 130 | } 131 | 132 | def mergeResolutionError(cause: String): Throwable = { 133 | SqlUtils.analysisException(cause) 134 | } 135 | 136 | def mergeUnsupportedError(cause: String): Throwable = { 137 | throw new RuntimeException(cause) 138 | } 139 | } 140 | 141 | class TransactionInvalidException(val message: String, 142 | val txnId: Long, 143 | val tableName: String) 144 | extends Exception(message) { 145 | override def getMessage: String = { 146 | message 147 | } 148 | } 149 | 150 | class AnalysisException( 151 | val message: String, 152 | val line: Option[Int] = None, 153 | val startPosition: Option[Int] = None, 154 | // Some plans fail to serialize due to bugs in scala collections. 155 | @transient val plan: Option[LogicalPlan] = None, 156 | val cause: Option[Throwable] = None) 157 | extends Exception(message, cause.orNull) with Serializable { 158 | 159 | def withPosition(line: Option[Int], startPosition: Option[Int]): AnalysisException = { 160 | val newException = new AnalysisException(message, line, startPosition) 161 | newException.setStackTrace(getStackTrace) 162 | newException 163 | } 164 | 165 | override def getMessage: String = { 166 | val planAnnotation = Option(plan).flatten.map(p => s";\n$p").getOrElse("") 167 | getSimpleMessage + planAnnotation 168 | } 169 | 170 | // Outputs an exception without the logical plan.
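// (getSimpleMessage renders only "<message>; line <l> pos <p>"; getMessage above appends the plan when one is attached.)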
171 | // For testing only 172 | def getSimpleMessage: String = { 173 | val lineAnnotation = line.map(l => s" line $l").getOrElse("") 174 | val positionAnnotation = startPosition.map(p => s" pos $p").getOrElse("") 175 | s"$message;$lineAnnotation$positionAnnotation" 176 | } 177 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/HiveAcidOperation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | private[hiveacid] object HiveAcidOperation extends Enumeration { 23 | type OperationType = Value 24 | val READ, INSERT_INTO, INSERT_OVERWRITE, DELETE, UPDATE, MERGE = Value 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/SparkAcidConf.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid 21 | 22 | import org.apache.spark.sql.SparkSession 23 | 24 | /** 25 | * Spark specific configuration container to be used by Hive Acid module 26 | */ 27 | case class SparkAcidConfigEntry[T](configName: String /* Name of the config */ , 28 | defaultValue: String /* Default value of config in String*/ , 29 | description: String /* Description of the config*/ , 30 | converter: Option[(String, String) => T] /* function to convert from String to Config's Type T*/) 31 | 32 | 33 | case class SparkAcidConfigBuilder[T](configName: String) { 34 | private var defaultValue: Option[String] = None 35 | def defaultValue(value: String): SparkAcidConfigBuilder[T] = { 36 | defaultValue = Some(value) 37 | this 38 | } 39 | 40 | private var description = "" 41 | def description(desc : String): SparkAcidConfigBuilder[T] = { 42 | description = desc 43 | this 44 | } 45 | 46 | private var converter: Option[(String, String) => T] = None 47 | def converter(func: (String, String) => T): SparkAcidConfigBuilder[T] = { 48 | converter = Some(func) 49 | this 50 | } 51 | 52 | def create(): SparkAcidConfigEntry[T] = { 53 | require(!defaultValue.isEmpty, "Default Value for the Spark Acid Config needs to be specified") 54 | new SparkAcidConfigEntry[T](configName, defaultValue.get, description, converter) 55 | } 56 | } 57 | 58 | case class SparkAcidConf(@transient sparkSession: SparkSession, @transient parameters: Map[String, String]) { 59 | @transient val configMap = sparkSession.sessionState.conf.getAllConfs 60 | 61 | val predicatePushdownEnabled = getConf(SparkAcidConf.PREDICATE_PUSHDOWN_CONF) 62 | val maxSleepBetweenLockRetries = getConf(SparkAcidConf.MAX_SLEEP_BETWEEN_LOCK_RETRIES) 63 | val lockNumRetries = getConf(SparkAcidConf.LOCK_NUM_RETRIES) 64 | val metastorePartitionPruningEnabled = sparkSession.sessionState.conf.metastorePartitionPruning 65 | val includeRowIds = parameters.getOrElse("includeRowIds", "false").toBoolean 66 | val parallelPartitionComputationThreshold = getConf(SparkAcidConf.PARALLEL_PARTITION_THRESHOLD) 67 | 68 | def getConf[T](configEntry: SparkAcidConfigEntry[T]): T = { 69 | val value = configMap.getOrElse(configEntry.configName, configEntry.defaultValue) 70 | configEntry.converter match { 71 | case Some(f) => f(value, configEntry.configName) 72 | case None => value.asInstanceOf[T] 73 | } 74 | } 75 | } 76 | 77 | object SparkAcidConf { 78 | val PREDICATE_PUSHDOWN_CONF = SparkAcidConfigBuilder[Boolean]("spark.sql.hiveAcid.enablePredicatePushdown") 79 | .defaultValue("true") 80 | .converter(toBoolean) 81 | .description("Configuration to enable Predicate PushDown for Hive Acid Reader") 82 | .create() 83 | 84 | val SPARK_READER = SparkAcidConfigBuilder[Boolean]("spark.sql.hiveAcid.enableSparkReader") 85 | .defaultValue("false") 86 | .converter(toBoolean) 87 | .description("Configuration to enable the Spark readers." + 88 | " When disabled, Hive Acid Readers in this DataSource are used." 
+ 89 | " On enabling Spark readers will be used to read the Hive Table readers") 90 | .create() 91 | 92 | val MAX_SLEEP_BETWEEN_LOCK_RETRIES = SparkAcidConfigBuilder[Long]("spark.hiveAcid.lock.max.sleep.between.retries") 93 | .defaultValue("60000") 94 | .converter(toLong) 95 | .description("Maximum sleep time between lock retries in milliseconds; " + 96 | "Lock retries are based on exponential backoff" + 97 | " and start with 50 milliseconds and increases to the maximum time defined by this configuration") 98 | .create() 99 | 100 | // Retry exponential backoff that starts with 50 millisec 101 | // Default 13 is set to make total wait around 5 minutes with max sleep being 60 seconds 102 | val LOCK_NUM_RETRIES = SparkAcidConfigBuilder[Int]("spark.hiveAcid.lock.max.retries") 103 | .defaultValue("13") 104 | .converter(toInt) 105 | .description("Maximum retries to acquire a lock; Lock retries are based on exponential backoff " + 106 | "that start with 50 milliseconds") 107 | .create() 108 | 109 | val PARALLEL_PARTITION_THRESHOLD = SparkAcidConfigBuilder[Long]("spark.hiveAcid.parallel.partitioning.threshold") 110 | .defaultValue("10") 111 | .converter(toInt) 112 | .description("Threshold for number of RDDs for a partitioned table," + 113 | " after which Spark Job will be spawn to compute RDD splits(i.e., partitions) in parallel" + 114 | " Note that every partition in a table becomes one RDD ") 115 | .create() 116 | 117 | def toBoolean(s: String, key: String): Boolean = { 118 | try { 119 | s.trim.toBoolean 120 | } catch { 121 | case _: IllegalArgumentException => 122 | throw new IllegalArgumentException(s"$key should be boolean, but was $s") 123 | } 124 | } 125 | 126 | def toLong(s: String, key: String): Long = { 127 | try { 128 | s.trim.toLong 129 | } catch { 130 | case _: IllegalArgumentException => 131 | throw new IllegalArgumentException(s"$key should be Long, but was $s") 132 | } 133 | } 134 | 135 | def toInt(s: String, key: String): Int = { 136 | try { 137 | s.trim.toInt 138 | } catch { 139 | case _: IllegalArgumentException => 140 | throw new IllegalArgumentException(s"$key should be Int, but was $s") 141 | } 142 | } 143 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/datasource/HiveAcidDataSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
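// Illustrative usage sketch, assuming an active SparkSession named `spark`: the entries above are
// resolved per-query through SparkAcidConf, falling back to their declared defaults when the key
// is not present in the session conf.
spark.conf.set("spark.sql.hiveAcid.enablePredicatePushdown", "false")
spark.conf.set("spark.hiveAcid.lock.max.retries", "20")
val acidConf = SparkAcidConf(spark, Map("includeRowIds" -> "true"))
acidConf.predicatePushdownEnabled   // false, overridden above
acidConf.lockNumRetries             // 20
acidConf.includeRowIds              // true, read from the parameters map rather than the SQL conf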
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.datasource 21 | 22 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable} 23 | import com.qubole.spark.hiveacid.streaming.HiveAcidSink 24 | 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.sql._ 27 | import org.apache.spark.sql.execution.streaming.Sink 28 | import org.apache.spark.sql.sources._ 29 | import org.apache.spark.sql.streaming.OutputMode 30 | 31 | /** 32 | * HiveAcid Data source implementation. 33 | */ 34 | class HiveAcidDataSource 35 | extends RelationProvider // USING HiveAcid 36 | with CreatableRelationProvider // Insert into/overwrite 37 | with DataSourceRegister // FORMAT("HiveAcid") 38 | with StreamSinkProvider 39 | with Logging { 40 | 41 | // returns relation for passed in table name 42 | override def createRelation(sqlContext: SQLContext, 43 | parameters: Map[String, String]): BaseRelation = { 44 | HiveAcidRelation(sqlContext.sparkSession, getFullyQualifiedTableName(parameters), parameters) 45 | } 46 | 47 | // returns relation after writing passed in data frame. Table name is part of parameter 48 | override def createRelation(sqlContext: SQLContext, 49 | mode: SaveMode, 50 | parameters: Map[String, String], 51 | df: DataFrame): BaseRelation = { 52 | 53 | val hiveAcidTable: HiveAcidTable = HiveAcidTable.fromSparkSession( 54 | sqlContext.sparkSession, 55 | getFullyQualifiedTableName(parameters), 56 | parameters) 57 | 58 | mode match { 59 | case SaveMode.Overwrite => 60 | hiveAcidTable.insertOverwrite(df) 61 | case SaveMode.Append => 62 | hiveAcidTable.insertInto(df) 63 | // TODO: Add support for these 64 | case SaveMode.ErrorIfExists | SaveMode.Ignore => 65 | HiveAcidErrors.unsupportedSaveMode(mode) 66 | } 67 | createRelation(sqlContext, parameters) 68 | } 69 | 70 | override def shortName(): String = { 71 | HiveAcidDataSource.NAME 72 | } 73 | 74 | override def createSink(sqlContext: SQLContext, 75 | parameters: Map[String, String], 76 | partitionColumns: Seq[String], 77 | outputMode: OutputMode): Sink = { 78 | 79 | tableSinkAssertions(partitionColumns, outputMode) 80 | 81 | new HiveAcidSink(sqlContext.sparkSession, parameters) 82 | } 83 | 84 | private def tableSinkAssertions(partitionColumns: Seq[String], outputMode: OutputMode): Unit = { 85 | 86 | if (partitionColumns.nonEmpty) { 87 | throw HiveAcidErrors.unsupportedFunction("partitionBy", "HiveAcidSink") 88 | } 89 | if (outputMode != OutputMode.Append) { 90 | throw HiveAcidErrors.unsupportedStreamingOutputMode(s"$outputMode") 91 | } 92 | 93 | } 94 | 95 | private def getFullyQualifiedTableName(parameters: Map[String, String]): String = { 96 | parameters.getOrElse("table", { 97 | throw HiveAcidErrors.tableNotSpecifiedException() 98 | }) 99 | } 100 | } 101 | 102 | object HiveAcidDataSource { 103 | val NAME = "HiveAcid" 104 | } 105 | 106 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/datasource/HiveAcidRelation.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
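// Illustrative batch-write sketch (the DataFrame `df` and the table name are assumptions): the
// data source is addressed by its shortName "HiveAcid" and requires the fully qualified table
// name in the "table" option. Per createRelation above, only SaveMode.Append and
// SaveMode.Overwrite are supported; ErrorIfExists and Ignore raise unsupportedSaveMode.
df.write
  .format("HiveAcid")
  .option("table", "default.acid_tbl")
  .mode("append")                    // "overwrite" maps to insertOverwrite
  .save()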
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.datasource 21 | 22 | import org.apache.spark.internal.Logging 23 | import org.apache.spark.rdd.RDD 24 | import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession} 25 | import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan} 26 | import org.apache.spark.sql.types._ 27 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf} 28 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 29 | import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert} 30 | import org.apache.spark.sql.catalyst.AliasIdentifier 31 | import org.apache.spark.sql.catalyst.expressions.Expression 32 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 33 | 34 | import collection.JavaConversions._ 35 | 36 | /** 37 | * Container for all metadata, configuration and schema to perform operations on 38 | * Hive ACID datasource. This provides for plumbing most of the heavy lifting is 39 | * performed inside HiveAcidtTable. 40 | * 41 | * @param sparkSession Spark Session object 42 | * @param fullyQualifiedTableName Table name for the data source. 43 | * @param parameters user provided parameters required for reading and writing, 44 | * including configuration 45 | */ 46 | case class HiveAcidRelation(sparkSession: SparkSession, 47 | fullyQualifiedTableName: String, 48 | parameters: Map[String, String]) 49 | extends BaseRelation 50 | with InsertableRelation 51 | with PrunedFilteredScan 52 | with Logging { 53 | 54 | private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession( 55 | sparkSession, 56 | fullyQualifiedTableName 57 | ) 58 | private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession, 59 | hiveAcidMetadata, parameters) 60 | 61 | private val readOptions = SparkAcidConf(sparkSession, parameters) 62 | 63 | override def sqlContext: SQLContext = sparkSession.sqlContext 64 | 65 | override val schema: StructType = if (readOptions.includeRowIds) { 66 | hiveAcidMetadata.tableSchemaWithRowId 67 | } else { 68 | hiveAcidMetadata.tableSchema 69 | } 70 | 71 | override def insert(data: DataFrame, overwrite: Boolean): Unit = { 72 | // sql insert into and overwrite 73 | if (overwrite) { 74 | hiveAcidTable.insertOverwrite(data) 75 | } else { 76 | hiveAcidTable.insertInto(data) 77 | } 78 | } 79 | 80 | def update(condition: Option[Column], newValues: Map[String, Column]): Unit = { 81 | hiveAcidTable.update(condition, newValues) 82 | } 83 | 84 | def delete(condition: Column): Unit = { 85 | hiveAcidTable.delete(condition) 86 | } 87 | override def sizeInBytes: Long = { 88 | val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor 89 | (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong 90 | } 91 | 92 | def merge(sourceDf: DataFrame, 93 | mergeExpression: Expression, 94 | matchedClause: Seq[MergeWhenClause], 95 | notMatched: Option[MergeWhenNotInsert], 96 | sourceAlias: Option[AliasIdentifier], 97 | targetAlias: Option[AliasIdentifier]): Unit = { 98 | 
hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause, 99 | notMatched, sourceAlias, targetAlias) 100 | } 101 | 102 | def getHiveAcidTable(): HiveAcidTable = { 103 | hiveAcidTable 104 | } 105 | 106 | // FIXME: should it be true / false. Recommendation seems to 107 | // be to leave it as true 108 | override val needConversion: Boolean = false 109 | 110 | override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { 111 | val readOptions = SparkAcidConf(sparkSession, parameters) 112 | // sql "select *" 113 | hiveAcidTable.getRdd(requiredColumns, filters, readOptions) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/hive/.gitignore: -------------------------------------------------------------------------------- 1 | *.scalae 2 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/hive/HiveAcidMetadata.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
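// Illustrative read sketch (table name is hypothetical): this relation backs
// spark.read.format("HiveAcid"). Passing includeRowIds=true switches the exposed schema to
// tableSchemaWithRowId, making the ACID rowId struct selectable alongside the table columns.
val acidDf = spark.read
  .format("HiveAcid")
  .option("table", "default.acid_tbl")
  .option("includeRowIds", "true")
  .load()
acidDf.printSchema()                 // includes the extra rowId struct column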
16 | */ 17 | 18 | package com.qubole.spark.hiveacid.hive 19 | 20 | import java.lang.reflect.InvocationTargetException 21 | import java.util.Locale 22 | 23 | import scala.collection.JavaConversions._ 24 | import scala.collection.mutable 25 | import com.qubole.shaded.hadoop.hive.conf.HiveConf 26 | import com.qubole.shaded.hadoop.hive.ql.io.RecordIdentifier 27 | import com.qubole.shaded.hadoop.hive.ql.metadata 28 | import com.qubole.shaded.hadoop.hive.ql.metadata.Hive 29 | import com.qubole.shaded.hadoop.hive.ql.plan.TableDesc 30 | import com.qubole.spark.hiveacid.util.Util 31 | import com.qubole.spark.hiveacid.HiveAcidErrors 32 | import org.apache.hadoop.fs.Path 33 | import org.apache.hadoop.hive.metastore.api.MetaException 34 | import org.apache.hadoop.io.Writable 35 | import org.apache.hadoop.mapred.{InputFormat, OutputFormat} 36 | import org.apache.spark.internal.Logging 37 | import org.apache.spark.sql._ 38 | import org.apache.spark.sql.types._ 39 | 40 | /** 41 | * Represents metadata for hive acid table and exposes API to perform operations on top of it 42 | * @param sparkSession - spark session object 43 | * @param fullyQualifiedTableName - the fully qualified hive acid table name 44 | */ 45 | class HiveAcidMetadata(sparkSession: SparkSession, 46 | fullyQualifiedTableName: String) extends Logging { 47 | 48 | // hive conf 49 | private val hiveConf: HiveConf = HiveConverter.getHiveConf(sparkSession.sparkContext) 50 | 51 | // a hive representation of the table 52 | val hTable: metadata.Table = { 53 | val hive: Hive = Hive.get(hiveConf) 54 | val table = sparkSession.sessionState.sqlParser.parseTableIdentifier(fullyQualifiedTableName) 55 | val hTable = hive.getTable( 56 | table.database match { 57 | case Some(database) => database 58 | case None => HiveAcidMetadata.DEFAULT_DATABASE 59 | }, table.identifier) 60 | Hive.closeCurrent() 61 | hTable 62 | } 63 | 64 | if (hTable.getParameters.get("transactional") != "true") { 65 | throw HiveAcidErrors.tableNotAcidException(hTable.getFullyQualifiedName) 66 | } 67 | 68 | val isFullAcidTable: Boolean = hTable.getParameters.containsKey("transactional_properties") && 69 | !hTable.getParameters.get("transactional_properties").equals("insert_only") 70 | val isInsertOnlyTable: Boolean = !isFullAcidTable 71 | val isBucketed: Boolean = hTable.getBucketCols() != null && hTable.getBucketCols.size() > 0 72 | 73 | // Table properties 74 | val isPartitioned: Boolean = hTable.isPartitioned 75 | val rootPath: Path = hTable.getDataLocation 76 | val dbName: String = hTable.getDbName 77 | val tableName: String = hTable.getTableName 78 | val fullyQualifiedName: String = hTable.getFullyQualifiedName 79 | 80 | // Schema properties 81 | val dataSchema = StructType(hTable.getSd.getCols.toList.map( 82 | HiveConverter.getCatalystStructField).toArray) 83 | 84 | val partitionSchema = StructType(hTable.getPartitionKeys.toList.map( 85 | HiveConverter.getCatalystStructField).toArray) 86 | 87 | val tableSchema: StructType = { 88 | val overlappedPartCols = mutable.Map.empty[String, StructField] 89 | partitionSchema.foreach { partitionField => 90 | if (dataSchema.exists(getColName(_) == getColName(partitionField))) { 91 | overlappedPartCols += getColName(partitionField) -> partitionField 92 | } 93 | } 94 | StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++ 95 | partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f)))) 96 | } 97 | 98 | val tableSchemaWithRowId: StructType = { 99 | StructType( 100 | Seq( 101 | 
StructField(HiveAcidMetadata.rowIdCol, HiveAcidMetadata.rowIdSchema) 102 | ) ++ tableSchema.fields) 103 | } 104 | 105 | lazy val tableDesc: TableDesc = { 106 | val inputFormatClass: Class[InputFormat[Writable, Writable]] = 107 | Util.classForName(hTable.getInputFormatClass.getName, 108 | loadShaded = true).asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]] 109 | val outputFormatClass: Class[OutputFormat[Writable, Writable]] = 110 | Util.classForName(hTable.getOutputFormatClass.getName, 111 | loadShaded = true).asInstanceOf[java.lang.Class[OutputFormat[Writable, Writable]]] 112 | new TableDesc( 113 | inputFormatClass, 114 | outputFormatClass, 115 | hTable.getMetadata) 116 | } 117 | 118 | /** 119 | * Returns list of partitions satisfying partition predicates 120 | * @param partitionFilters - filters to apply 121 | */ 122 | def getRawPartitions(partitionFilters: Option[String] = None): Seq[metadata.Partition] = { 123 | val hive: Hive = Hive.get(hiveConf) 124 | val prunedPartitions = try { 125 | partitionFilters match { 126 | case Some(filter) => hive.getPartitionsByFilter(hTable, filter) 127 | case None => hive.getPartitions(hTable) 128 | } 129 | } finally { 130 | Hive.closeCurrent() 131 | } 132 | logDebug(s"partition count = ${prunedPartitions.size()}") 133 | prunedPartitions.toSeq 134 | } 135 | 136 | private def getColName(field: StructField): String = { 137 | HiveAcidMetadata.getColName(sparkSession, field) 138 | } 139 | } 140 | 141 | object HiveAcidMetadata { 142 | val DEFAULT_DATABASE = "default" 143 | 144 | val rowIdCol = "rowId" 145 | val rowIdSchema: StructType = { 146 | StructType( 147 | RecordIdentifier.Field.values().map { 148 | field => 149 | StructField( 150 | name = field.name(), 151 | dataType = HiveConverter.getCatalystType(field.fieldType.getTypeName), 152 | nullable = true) 153 | } 154 | ) 155 | } 156 | 157 | def fromSparkSession(sparkSession: SparkSession, 158 | fullyQualifiedTableName: String): HiveAcidMetadata = { 159 | new HiveAcidMetadata( 160 | sparkSession, 161 | fullyQualifiedTableName) 162 | } 163 | 164 | def getColName(sparkSession: SparkSession, field: StructField): String = { 165 | if (sparkSession.sessionState.conf.caseSensitiveAnalysis) { 166 | field.name 167 | } else { 168 | field.name.toLowerCase(Locale.ROOT) 169 | } 170 | } 171 | 172 | def getColNames(sparkSession: SparkSession, schema: StructType): Seq[String] = { 173 | schema.map(getColName(sparkSession, _)) 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/hive/HiveConverter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
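// Illustrative sketch (table and partition-column names are hypothetical): HiveAcidMetadata is
// the entry point for the table-level information consumed by the reader and writer paths.
val meta = HiveAcidMetadata.fromSparkSession(spark, "default.acid_tbl")
val fullAcid = meta.isFullAcidTable          // false for insert-only (MM) tables
val schema = meta.tableSchema                // data columns followed by partition columns
val location = meta.rootPath                 // table root on the filesystem
// Partition pruning pushes a Hive filter string down to the metastore:
val prunedPartitions = meta.getRawPartitions(Some("ptn = '2019'"))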
16 | */ 17 | 18 | package com.qubole.spark.hiveacid.hive 19 | 20 | import java.sql.{Date, Timestamp} 21 | import java.util.Locale 22 | 23 | import com.qubole.shaded.hadoop.hive.conf.HiveConf 24 | import com.qubole.shaded.hadoop.hive.metastore.api.FieldSchema 25 | import org.apache.commons.lang3.StringUtils 26 | import org.apache.spark.internal.Logging 27 | import org.apache.spark.{SparkContext, SparkException} 28 | import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException} 29 | import org.apache.spark.sql.sources._ 30 | import org.apache.spark.sql.types._ 31 | 32 | import scala.collection.JavaConversions._ 33 | 34 | /** 35 | * Encapsulates everything (extensions, workarounds, quirks) to handle the 36 | * SQL dialect conversion between catalyst and hive. 37 | */ 38 | private[hiveacid] object HiveConverter extends Logging { 39 | 40 | def getCatalystStructField(hc: FieldSchema): StructField = { 41 | val columnType = getCatalystType(hc.getType) 42 | val metadata = if (hc.getType != columnType.catalogString) { 43 | new MetadataBuilder().putString(HIVE_TYPE_STRING, hc.getType).build() 44 | } else { 45 | Metadata.empty 46 | } 47 | 48 | val field = StructField( 49 | name = hc.getName, 50 | dataType = columnType, 51 | nullable = true, 52 | metadata = metadata) 53 | Option(hc.getComment).map(field.withComment).getOrElse(field) 54 | } 55 | 56 | def getCatalystType(dataType: String): DataType = { 57 | try { 58 | CatalystSqlParser.parseDataType(dataType) 59 | } catch { 60 | case e: ParseException => 61 | throw new SparkException("Cannot recognize hive type string: " + dataType, e) 62 | } 63 | } 64 | 65 | def getHiveConf(sparkContext: SparkContext): HiveConf = { 66 | val hiveConf = new HiveConf() 67 | (sparkContext.hadoopConfiguration.iterator().map(kv => kv.getKey -> kv.getValue) 68 | ++ sparkContext.getConf.getAll.toMap).foreach { case (k, v) => 69 | logDebug( 70 | s""" 71 | |Applying Hadoop/Hive/Spark and extra properties to Hive Conf: 72 | |$k=${if (k.toLowerCase(Locale.ROOT).contains("password")) "xxx" else v} 73 | """.stripMargin) 74 | hiveConf.set(k, v) 75 | } 76 | hiveConf 77 | } 78 | 79 | /** 80 | * Escape special characters in SQL string literals. 81 | * 82 | * @param value The string to be escaped. 83 | * @return Escaped string. 84 | */ 85 | private def escapeSql(value: String): String = { 86 | // TODO: how to handle null 87 | StringUtils.replace(value, "'", "''") 88 | } 89 | 90 | /** 91 | * Converts value to SQL expression. 92 | * @param value The value to be converted. 93 | * @return Converted value. 94 | */ 95 | private def compileValue(value: Any): Any = value match { 96 | case stringValue: String => s"'${escapeSql(stringValue)}'" 97 | case timestampValue: Timestamp => "'" + timestampValue + "'" 98 | case dateValue: Date => "'" + dateValue + "'" 99 | case arrayValue: Array[Any] => arrayValue.map(compileValue).mkString(", ") 100 | case _ => value 101 | } 102 | 103 | /** 104 | * Turns a single Filter into a String representing a SQL expression. 105 | * Returns None for an unhandled filter. 
106 | */ 107 | def compileFilter(f: Filter): Option[String] = Option(x = f match { 108 | case EqualTo(attr, value) => s"$attr = ${compileValue(value)}" 109 | case EqualNullSafe(attr, value) => 110 | val col = attr 111 | s"(NOT ($col != ${compileValue(value)} OR $col = 'NULL' OR " + 112 | s"${compileValue(value)} = 'NULL') OR " + 113 | s"($col = 'NULL' AND ${compileValue(value)} = 'NULL'))" 114 | case LessThan(attr, value) => s"$attr < ${compileValue(value)}" 115 | case GreaterThan(attr, value) => s"$attr > ${compileValue(value)}" 116 | case LessThanOrEqual(attr, value) => s"$attr <= ${compileValue(value)}" 117 | case GreaterThanOrEqual(attr, value) => s"$attr >= ${compileValue(value)}" 118 | // These clauses throw in Hive MS when filtering the partitions 119 | //case IsNull(attr) => s"$attr = 'NULL'" 120 | //case IsNotNull(attr) => s"$attr != 'NULL'" 121 | case StringStartsWith(attr, value) => s"$attr LIKE '$value%'" 122 | case StringEndsWith(attr, value) => s"$attr LIKE '%$value'" 123 | case StringContains(attr, value) => s"$attr LIKE '%$value%'" 124 | case In(attr, value) => s"$attr IN (${compileValue(value)})" 125 | case Not(`f`) => compileFilter(f).map(p => s"(NOT ($p))").orNull 126 | case Or(f1, f2) => 127 | // We can't compile Or filter unless both sub-filters are compiled successfully. 128 | // It applies too for the following And filter. 129 | // If we can make sure compileFilter supports all filters, we can remove this check. 130 | val or = Seq(f1, f2) flatMap compileFilter 131 | if (or.size == 2) { 132 | or.map(p => s"($p)").mkString(" OR ") 133 | } else null 134 | case And(f1, f2) => 135 | val and = Seq(f1, f2).flatMap(compileFilter) 136 | if (and.size == 2) { 137 | and.map(p => s"($p)").mkString(" AND ") 138 | } else null 139 | case _ => null 140 | }) 141 | 142 | 143 | def compileFilters(filters: Seq[Filter]): String = { 144 | val str = filters.flatMap(compileFilter).mkString(" and ") 145 | logDebug(str) 146 | str 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
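// Illustrative sketch, callable only from code inside com.qubole.spark.hiveacid since the object
// is package-private: Spark data source Filters are rendered into a metastore filter string that
// getRawPartitions can push down (column names are assumptions).
import org.apache.spark.sql.sources.{EqualTo, GreaterThan}
val filterString = HiveConverter.compileFilters(
  Seq(EqualTo("country", "IN"), GreaterThan("year", 2018)))
// filterString == "country = 'IN' and year > 2018"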
18 | */ 19 | package com.qubole.spark 20 | 21 | import org.apache.spark.sql._ 22 | 23 | package object hiveacid { 24 | implicit class HiveAcidDataFrameReader(reader: DataFrameReader) { 25 | def hiveacid(table: String, options: Map[String, String] = Map.empty): DataFrame = { 26 | reader.format("HiveAcid").option("table", table) 27 | .options(options).load() 28 | } 29 | } 30 | 31 | implicit class HiveAcidDataFrameWriter[T](writer: DataFrameWriter[T]) { 32 | def hiveacid(table: String, saveMode: String, options: Map[String, String] = Map.empty): Unit = { 33 | writer.format("HiveAcid").option("table", table) 34 | .options(options).mode(saveMode).save() 35 | } 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/rdd/EmptyRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.rdd 21 | 22 | import scala.reflect.ClassTag 23 | 24 | import org.apache.spark.{Partition, SparkContext, TaskContext} 25 | import org.apache.spark.rdd.RDD 26 | 27 | private[hiveacid] class EmptyRDD[T: ClassTag](sc: SparkContext) extends RDD[T](sc, Nil) { 28 | 29 | override def getPartitions: Array[Partition] = Array.empty 30 | 31 | override def compute(split: Partition, context: TaskContext): Iterator[T] = { 32 | throw new UnsupportedOperationException("empty RDD") 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/rdd/HiveAcidUnionRDD.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
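// Usage sketch for the implicit syntax defined in the package object above (table and column
// names are assumptions; `spark` is an active SparkSession):
import com.qubole.spark.hiveacid._
val df = spark.read.hiveacid("default.acid_tbl", Map("includeRowIds" -> "false"))
df.filter("value > 10").write.hiveacid("default.acid_tbl_copy", "append")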
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.rdd 21 | 22 | import com.qubole.spark.hiveacid.SparkAcidConf 23 | import com.qubole.spark.hiveacid.reader.hive.HiveAcidPartitionComputer 24 | 25 | import scala.reflect.ClassTag 26 | import org.apache.spark._ 27 | import org.apache.spark.rdd.{RDD, UnionRDD} 28 | 29 | /** 30 | * A Hive3RDD is created for each of the hive partition of the table. But at the end the buildScan 31 | * is supposed to return only 1 RDD for entire table. So we have to create UnionRDD for it. 32 | * 33 | * This class extends UnionRDD and makes sure that we acquire read lock once for all the 34 | * partitions of the table 35 | 36 | * @param sc - sparkContext 37 | * @param rddSeq - underlying partition RDDs 38 | * @param hiveSplitInfo - It is sequence of HiveSplitInfo. 39 | * It would be derived from the list of HiveAcidRDD passed here. 40 | * check HiveAcidRDD.getHiveSplitsInfo 41 | */ 42 | private[hiveacid] class HiveAcidUnionRDD[T: ClassTag]( 43 | sc: SparkContext, 44 | rddSeq: Seq[RDD[T]], 45 | //TODO: We should clean so that HiveSplitInfo need not have to be passed separately. 46 | hiveSplitInfo: Seq[HiveSplitInfo]) extends UnionRDD[T](sc, rddSeq) { 47 | 48 | private val ignoreMissingFiles = 49 | super.sparkContext.getConf.getBoolean("spark.files.ignoreMissingFiles", defaultValue = false) 50 | 51 | private val ignoreEmptySplits = 52 | super.sparkContext.getConf.getBoolean("spark.hadoopRDD.ignoreEmptySplits", defaultValue = false) 53 | 54 | private val parallelPartitionThreshold = 55 | super.sparkContext.getConf.getInt(SparkAcidConf.PARALLEL_PARTITION_THRESHOLD.configName, 10) 56 | 57 | override def getPartitions: Array[Partition] = { 58 | if (hiveSplitInfo.length > parallelPartitionThreshold) { 59 | val partitions = hiveSplitInfo.length/parallelPartitionThreshold 60 | val hiveSplitRDD = super.sparkContext.parallelize(hiveSplitInfo, partitions) 61 | val hiveAcidPartitionComputer = new HiveAcidPartitionComputer(ignoreEmptySplits, ignoreMissingFiles) 62 | // It spawns a spark job to compute Partitions for every RDD and stores it in cache. 63 | hiveAcidPartitionComputer.computeHiveSplitsAndCache(hiveSplitRDD) 64 | } 65 | super.getPartitions 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/.gitignore: -------------------------------------------------------------------------------- 1 | *.scalae 2 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/Reader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
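// Illustrative sketch: HiveAcidUnionRDD reads the threshold from the SparkContext's SparkConf,
// so it has to be supplied when the session is built (the value 50 is an assumption; the
// declared default in SparkAcidConf is 10).
val session = org.apache.spark.sql.SparkSession.builder()
  .config("spark.hiveAcid.parallel.partitioning.threshold", "50")
  .enableHiveSupport()
  .getOrCreate()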
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader 21 | 22 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 23 | 24 | import org.apache.spark.rdd.RDD 25 | import org.apache.spark.sql.catalyst.InternalRow 26 | 27 | private[reader] trait Reader { 28 | def makeRDDForTable(hiveAcidMetadata: HiveAcidMetadata): RDD[InternalRow] 29 | def makeRDDForPartitionedTable(hiveAcidMetadata: HiveAcidMetadata, 30 | partitions: Seq[ReaderPartition]): RDD[InternalRow] 31 | } 32 | 33 | private[reader] case class ReaderPartition(ptn: Any) 34 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/ReaderOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader 21 | 22 | import com.qubole.spark.hiveacid.SparkAcidConf 23 | import org.apache.hadoop.conf.Configuration 24 | 25 | import org.apache.spark.sql.catalyst.expressions.Attribute 26 | import org.apache.spark.sql.sources.Filter 27 | 28 | /** 29 | * Reader options which will be serialized and sent to each executor 30 | */ 31 | private[hiveacid] class ReaderOptions(val hadoopConf: Configuration, 32 | val partitionAttributes: Seq[Attribute], 33 | val requiredAttributes: Seq[Attribute], 34 | val dataFilters: Array[Filter], 35 | val requiredNonPartitionedColumns: Array[String], 36 | val sessionLocalTimeZone: String, 37 | val readConf: SparkAcidConf) extends Serializable 38 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/TableReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
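// Hypothetical sketch, only to show the shape of the Reader trait above (the shipped
// implementation is HiveAcidReader; this no-op variant is not part of the source tree):
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.rdd.EmptyRDD

private[reader] class NoopReader(sc: SparkContext) extends Reader {
  override def makeRDDForTable(hiveAcidMetadata: HiveAcidMetadata): RDD[InternalRow] =
    new EmptyRDD[InternalRow](sc)
  override def makeRDDForPartitionedTable(hiveAcidMetadata: HiveAcidMetadata,
                                          partitions: Seq[ReaderPartition]): RDD[InternalRow] =
    new EmptyRDD[InternalRow](sc)
}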
16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader 21 | 22 | import com.qubole.spark.hiveacid.{HiveAcidOperation, SparkAcidConf} 23 | import com.qubole.spark.hiveacid.transaction._ 24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 25 | import com.qubole.spark.hiveacid.reader.hive.{HiveAcidReader, HiveAcidReaderOptions} 26 | 27 | import org.apache.spark.internal.Logging 28 | import org.apache.spark.rdd.RDD 29 | import org.apache.spark.sql.{Row, SparkSession} 30 | import org.apache.spark.sql.catalyst.expressions._ 31 | import org.apache.spark.sql.sources.Filter 32 | 33 | /** 34 | * Table reader object 35 | * 36 | * @param sparkSession - Spark session 37 | * @param curTxn - Transaction object to acquire locks. 38 | * @param hiveAcidMetadata - Hive acid table for which read is to be performed. 39 | */ 40 | private[hiveacid] class TableReader(sparkSession: SparkSession, 41 | curTxn: HiveAcidTxn, 42 | hiveAcidMetadata: HiveAcidMetadata) extends Logging { 43 | 44 | def getRdd(requiredColumns: Array[String], 45 | filters: Array[Filter], 46 | readConf: SparkAcidConf): RDD[Row] = { 47 | val rowIdColumnSet = HiveAcidMetadata.rowIdSchema.fields.map(_.name).toSet 48 | val requiredColumnsWithoutRowId = requiredColumns.filterNot(rowIdColumnSet.contains) 49 | val partitionColumnNames = hiveAcidMetadata.partitionSchema.fields.map(_.name) 50 | val partitionedColumnSet = partitionColumnNames.toSet 51 | 52 | // Attributes 53 | val requiredNonPartitionedColumns = requiredColumnsWithoutRowId.filter( 54 | x => !partitionedColumnSet.contains(x)) 55 | 56 | val requiredAttributes = if (!readConf.includeRowIds) { 57 | requiredColumnsWithoutRowId.map { 58 | x => 59 | val field = hiveAcidMetadata.tableSchema.fields.find(_.name == x).get 60 | PrettyAttribute(field.name, field.dataType) 61 | } 62 | } else { 63 | requiredColumns.map { 64 | x => 65 | val field = hiveAcidMetadata.tableSchemaWithRowId.fields.find(_.name == x).get 66 | PrettyAttribute(field.name, field.dataType) 67 | } 68 | } 69 | val partitionAttributes = hiveAcidMetadata.partitionSchema.fields.map { x => 70 | PrettyAttribute(x.name, x.dataType) 71 | } 72 | 73 | // Filters 74 | val (partitionFilters, otherFilters) = filters.partition { predicate => 75 | !predicate.references.isEmpty && 76 | predicate.references.toSet.subsetOf(partitionedColumnSet) 77 | } 78 | val dataFilters = otherFilters.filter(_ 79 | .references.intersect(partitionColumnNames).isEmpty 80 | ) 81 | 82 | logDebug(s"total filters : ${filters.length}: " + 83 | s"dataFilters: ${dataFilters.length} " + 84 | s"partitionFilters: ${partitionFilters.length}") 85 | 86 | val hadoopConf = sparkSession.sessionState.newHadoopConf() 87 | 88 | logDebug(s"sarg.pushdown: ${hadoopConf.get("sarg.pushdown")}," + 89 | s"hive.io.file.readcolumn.names: ${hadoopConf.get("hive.io.file.readcolumn.names")}, " + 90 | s"hive.io.file.readcolumn.ids: ${hadoopConf.get("hive.io.file.readcolumn.ids")}") 91 | 92 | val readerOptions = new ReaderOptions(hadoopConf, 93 | partitionAttributes, 94 | requiredAttributes, 95 | dataFilters, 96 | requiredNonPartitionedColumns, 97 | sparkSession.sessionState.conf.sessionLocalTimeZone, 98 | readConf) 99 | 100 | val hiveAcidReaderOptions= HiveAcidReaderOptions.get(hiveAcidMetadata) 101 | 102 | val (partitions, partitionList) = HiveAcidReader.getPartitions(hiveAcidMetadata, 103 | readerOptions, 104 | partitionFilters) 105 | 106 | // Acquire lock on all the 
partition and then create snapshot. Every time getRDD is called 107 | // it creates a new snapshot. 108 | // NB: partitionList is Seq if partition pruning is not enabled 109 | curTxn.acquireLocks(hiveAcidMetadata, HiveAcidOperation.READ, partitionList, readConf) 110 | 111 | // Create Snapshot !!! 112 | //val curSnapshot = HiveAcidTxn.createSnapshot(curTxn, hiveAcidMetadata) 113 | 114 | val validWriteIds = HiveAcidTxn.getValidWriteIds(curTxn, hiveAcidMetadata) 115 | 116 | val reader = new HiveAcidReader( 117 | sparkSession, 118 | readerOptions, 119 | hiveAcidReaderOptions, 120 | validWriteIds) 121 | 122 | val rdd = if (hiveAcidMetadata.isPartitioned) { 123 | reader.makeRDDForPartitionedTable(hiveAcidMetadata, partitions) 124 | } else { 125 | reader.makeRDDForTable(hiveAcidMetadata) 126 | } 127 | 128 | rdd.asInstanceOf[RDD[Row]] 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidPartitionComputer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
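// Illustrative sketch of the filter split performed in getRdd above: predicates referencing only
// partition columns are pushed to the metastore for pruning, the rest stay as data filters
// (column names are assumptions).
import org.apache.spark.sql.sources.{EqualTo, Filter, GreaterThan}
val partitionedColumnSet = Set("ptn")
val filters: Array[Filter] = Array(EqualTo("ptn", "2019"), GreaterThan("value", 10))
val (partitionFilters, otherFilters) = filters.partition { predicate =>
  predicate.references.nonEmpty && predicate.references.toSet.subsetOf(partitionedColumnSet)
}
// partitionFilters -> Array(EqualTo(ptn,2019)); otherFilters -> Array(GreaterThan(value,10))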
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader.hive 21 | 22 | import java.util.concurrent.{ConcurrentHashMap, TimeUnit} 23 | 24 | import com.qubole.shaded.hadoop.hive.common.{ValidReaderWriteIdList, ValidWriteIdList} 25 | import com.qubole.spark.hiveacid.rdd.{HiveAcidPartition, HiveAcidRDD, HiveSplitInfo} 26 | import com.qubole.spark.hiveacid.reader.hive.HiveAcidPartitionComputer.{addToPartitionCache, getInputFormat} 27 | import com.qubole.spark.hiveacid.util.Util 28 | import org.apache.hadoop.conf.Configurable 29 | import org.apache.hadoop.fs.Path 30 | import org.apache.hadoop.io.Writable 31 | import org.apache.hadoop.mapred.{FileInputFormat, InputFormat, InvalidInputException, JobConf} 32 | import org.apache.hadoop.util.ReflectionUtils 33 | import org.apache.spark.deploy.SparkHadoopUtil 34 | import org.apache.spark.internal.Logging 35 | import org.apache.spark.rdd.RDD 36 | 37 | private[hiveacid] case class HiveAcidPartitionComputer(ignoreEmptySplits: Boolean, 38 | ignoreMissingFiles: Boolean) extends Logging { 39 | def getPartitions[K, V](id: Int, jobConf: JobConf, 40 | inputFormat: InputFormat[K, V], 41 | minPartitions: Int): Array[HiveAcidPartition] = { 42 | // add the credentials here as this can be called before SparkContext initialized 43 | SparkHadoopUtil.get.addCredentials(jobConf) 44 | try { 45 | val allInputSplits = inputFormat.getSplits(jobConf, minPartitions) 46 | val inputSplits = if (ignoreEmptySplits) { 47 | allInputSplits.filter(_.getLength > 0) 48 | } else { 49 | allInputSplits 50 | } 51 | val array = new Array[HiveAcidPartition](inputSplits.length) 52 | for (i <- inputSplits.indices) { 53 | array(i) = new HiveAcidPartition(id, i, inputSplits(i)) 54 | } 55 | array 56 | } catch { 57 | case e: InvalidInputException if ignoreMissingFiles => 58 | val inputDir = jobConf.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR) 59 | logWarning(s"$inputDir doesn't exist and no" + 60 | s" partitions returned from this path.", e) 61 | Array.empty[HiveAcidPartition] 62 | } 63 | } 64 | 65 | // needs to be invoked just once as its an expensive operation. 
66 | def computeHiveSplitsAndCache(splitRDD: RDD[HiveSplitInfo]): Unit = { 67 | val start = System.nanoTime() 68 | logInfo("Spawning job to compute partitions for ACID table RDD") 69 | val splits = splitRDD.map { 70 | case HiveSplitInfo(id, broadcastedConf, 71 | validWriteIdList, minPartitions, ifcName, isFullAcidTable, shouldCloneJobConf, initLocalJobConfFuncOpt) => 72 | val jobConf = HiveAcidRDD.setInputPathToJobConf( 73 | Some(HiveAcidRDD.getJobConf(broadcastedConf, shouldCloneJobConf, initLocalJobConfFuncOpt)), 74 | isFullAcidTable, 75 | new ValidReaderWriteIdList(validWriteIdList), 76 | broadcastedConf, 77 | shouldCloneJobConf, 78 | initLocalJobConfFuncOpt) 79 | val partitions = this.getPartitions[Writable, Writable](id, jobConf, getInputFormat(jobConf, ifcName), minPartitions) 80 | (partitions, FileInputFormat.getInputPaths(jobConf), validWriteIdList) 81 | }.collect() 82 | 83 | splits.foreach { 84 | case (partitions: Array[HiveAcidPartition], 85 | paths: Array[Path], validWriteIdList: String) => 86 | addToPartitionCache(paths, validWriteIdList, partitions) 87 | } 88 | logInfo(s"Job to compute partitions took: " + 89 | s"${TimeUnit.NANOSECONDS.toSeconds(System.nanoTime() - start)} seconds") 90 | } 91 | } 92 | 93 | private[hiveacid] object HiveAcidPartitionComputer extends Logging { 94 | object Cache { 95 | val partitionCache = new ConcurrentHashMap[SplitCacheKey, Array[HiveAcidPartition]]() 96 | case class SplitCacheKey(paths: Set[Path], validWriteIdList: String) 97 | } 98 | 99 | def getFromSplitsCache(paths: Array[Path], validWriteIdList: ValidWriteIdList): Option[Array[HiveAcidPartition]] = { 100 | Option(Cache.partitionCache.get(Cache.SplitCacheKey(paths.toSet, validWriteIdList.writeToString()))) 101 | } 102 | 103 | def removeFromSplitsCache(paths: Array[Path], validWriteIdList: ValidWriteIdList): Unit = { 104 | Cache.partitionCache.remove(Cache.SplitCacheKey(paths.toSet, validWriteIdList.writeToString())) 105 | } 106 | 107 | def addToPartitionCache(paths: Array[Path], validWriteIdList: String, inputSplits: Array[HiveAcidPartition]): Unit = { 108 | Cache.partitionCache.put(Cache.SplitCacheKey(paths.toSet, validWriteIdList), inputSplits) 109 | } 110 | 111 | private def getInputFormat(conf: JobConf, inputFormatClassName: String): InputFormat[Writable, Writable] = { 112 | val inputFormatClass = Util.classForName(inputFormatClassName, loadShaded = true) 113 | .asInstanceOf[java.lang.Class[InputFormat[Writable, Writable]]] 114 | val newInputFormat = ReflectionUtils.newInstance(inputFormatClass.asInstanceOf[Class[_]], conf) 115 | .asInstanceOf[InputFormat[Writable, Writable]] 116 | newInputFormat match { 117 | case c: Configurable => c.setConf(conf) 118 | case _ => 119 | } 120 | newInputFormat 121 | } 122 | 123 | } 124 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/reader/hive/HiveAcidReaderOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.reader.hive 21 | 22 | import com.qubole.shaded.hadoop.hive.ql.plan.TableDesc 23 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 24 | import org.apache.spark.sql.types.StructType 25 | 26 | private[reader] class HiveAcidReaderOptions(val tableDesc: TableDesc, 27 | val isFullAcidTable: Boolean, 28 | val dataSchema: StructType) 29 | 30 | private[reader] object HiveAcidReaderOptions { 31 | def get(hiveAcidMetadata: HiveAcidMetadata): HiveAcidReaderOptions = { 32 | new HiveAcidReaderOptions(hiveAcidMetadata.tableDesc, hiveAcidMetadata.isFullAcidTable, 33 | hiveAcidMetadata.dataSchema) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSink.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | 21 | package com.qubole.spark.hiveacid.streaming 22 | 23 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable} 24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 25 | import org.apache.hadoop.fs.Path 26 | import org.apache.spark.internal.Logging 27 | import org.apache.spark.sql.{DataFrame, SparkSession} 28 | import org.apache.spark.sql.execution.streaming.Sink 29 | 30 | 31 | class HiveAcidSink(sparkSession: SparkSession, 32 | parameters: Map[String, String]) extends Sink with Logging { 33 | 34 | import HiveAcidSink._ 35 | 36 | private val acidSinkOptions = new HiveAcidSinkOptions(parameters) 37 | 38 | private val fullyQualifiedTableName = acidSinkOptions.tableName 39 | 40 | private val hiveAcidTable: HiveAcidTable = HiveAcidTable.fromSparkSession( 41 | sparkSession, 42 | fullyQualifiedTableName, 43 | parameters) 44 | 45 | assertNonBucketedTable() 46 | 47 | private val logPath = getMetaDataPath() 48 | private val fileLog = new HiveAcidSinkLog( 49 | HiveAcidSinkLog.VERSION, sparkSession, logPath.toUri.toString, acidSinkOptions) 50 | 51 | private def assertNonBucketedTable(): Unit = { 52 | if(hiveAcidTable.isBucketed) { 53 | throw HiveAcidErrors.unsupportedOperationTypeBucketedTable("Streaming Write", fullyQualifiedTableName) 54 | } 55 | } 56 | 57 | private def getMetaDataPath(): Path = { 58 | acidSinkOptions.metadataDir match { 59 | case Some(dir) => 60 | new Path(dir) 61 | case None => 62 | logInfo(s"Metadata dir not specified. Using " + 63 | s"$metadataDirPrefix/_query_default as metadata dir") 64 | logWarning(s"Please make sure that multiple streaming writes to " + 65 | s"$fullyQualifiedTableName are not running") 66 | val tableLocation = HiveAcidMetadata.fromSparkSession( 67 | sparkSession, fullyQualifiedTableName).rootPath 68 | new Path(tableLocation, s"$metadataDirPrefix/_query_default") 69 | } 70 | } 71 | 72 | /** 73 | * Adds the batch to the sink. Each batch is transactional in itself 74 | * @param batchId batch to add 75 | * @param df dataframe to add as part of batch 76 | */ 77 | override def addBatch(batchId: Long, df: DataFrame): Unit = { 78 | 79 | if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) { 80 | logInfo(s"Skipping already committed batch $batchId") 81 | } else { 82 | 83 | val commitProtocol = new HiveAcidStreamingCommitProtocol(fileLog) 84 | val txnId = hiveAcidTable.addBatch(df) 85 | commitProtocol.commitJob(batchId, txnId) 86 | } 87 | 88 | } 89 | 90 | override def toString: String = s"HiveAcidSinkV1[$fullyQualifiedTableName]" 91 | 92 | } 93 | 94 | object HiveAcidSink { 95 | 96 | val metadataDirPrefix = "_acid_streaming" 97 | } 98 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkLog.scala: -------------------------------------------------------------------------------- 1 | 2 | /* 3 | * Copyright 2019 Qubole, Inc. All rights reserved. 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
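// Illustrative structured-streaming sketch (the streaming DataFrame, table name and paths are
// assumptions): the sink above supports only OutputMode.Append and rejects partitionBy.
val query = streamingDf.writeStream
  .format("HiveAcid")
  .option("table", "default.acid_events")
  .option("spark.acid.streaming.log.metadataDir", "/tmp/acid_sink_metadata")
  .option("checkpointLocation", "/tmp/acid_sink_checkpoint")
  .outputMode("append")
  .start()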
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | package com.qubole.spark.hiveacid.streaming 22 | 23 | import org.apache.spark.sql.SparkSession 24 | import org.apache.spark.sql.execution.streaming.CompactibleFileStreamLog 25 | 26 | case class HiveAcidSinkStatus(txnId: Long, action: String) 27 | 28 | class HiveAcidSinkLog(version: Int, 29 | sparkSession: SparkSession, 30 | path: String, 31 | options: HiveAcidSinkOptions) 32 | extends CompactibleFileStreamLog[HiveAcidSinkStatus](version, sparkSession, path) { 33 | 34 | protected override val fileCleanupDelayMs = options.fileCleanupDelayMs 35 | 36 | protected override val isDeletingExpiredLog = options.isDeletingExpiredLog 37 | 38 | protected override val defaultCompactInterval = options.compactInterval 39 | 40 | protected override val minBatchesToRetain = options.minBatchesToRetain 41 | 42 | override def compactLogs(logs: Seq[HiveAcidSinkStatus]): Seq[HiveAcidSinkStatus] = { 43 | val deletedFiles = logs.filter(_.action == HiveAcidSinkLog.DELETE_ACTION).map(_.txnId).toSet 44 | if (deletedFiles.isEmpty) { 45 | logs 46 | } else { 47 | logs.filter(f => !deletedFiles.contains(f.txnId)) 48 | } 49 | } 50 | 51 | } 52 | 53 | object HiveAcidSinkLog { 54 | 55 | val VERSION = 1 56 | val DELETE_ACTION = "delete" 57 | val ADD_ACTION = "add" 58 | 59 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidSinkOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
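// Illustrative sketch of the compaction rule above: any txnId that appears with a "delete"
// action is dropped from the compacted log together with its original "add" entry.
val entries = Seq(
  HiveAcidSinkStatus(1L, HiveAcidSinkLog.ADD_ACTION),
  HiveAcidSinkStatus(2L, HiveAcidSinkLog.ADD_ACTION),
  HiveAcidSinkStatus(1L, HiveAcidSinkLog.DELETE_ACTION))
// compactLogs(entries) on a HiveAcidSinkLog instance would retain only the entry for txnId 2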
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.streaming 21 | 22 | import java.util.concurrent.TimeUnit 23 | 24 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 25 | 26 | import scala.util.Try 27 | 28 | class HiveAcidSinkOptions(parameters: CaseInsensitiveMap[String]) { 29 | 30 | import HiveAcidSinkOptions._ 31 | 32 | def this(parameters: Map[String, String]) = this(CaseInsensitiveMap(parameters)) 33 | 34 | val tableName = parameters.get("table").getOrElse{ 35 | throw new IllegalArgumentException("Table Name is not specified") 36 | } 37 | 38 | val fileCleanupDelayMs = withLongParameter(CLEANUP_DELAY_KEY, DEFAULT_CLEANUP_DELAY) 39 | 40 | val isDeletingExpiredLog = withBooleanParameter(LOG_DELETION_KEY, DEFAULT_LOG_DELETION) 41 | 42 | val compactInterval = withIntParameter(COMPACT_INTERVAL_KEY, DEFAULT_COMPACT_INTERVAL) 43 | 44 | val minBatchesToRetain = withIntParameter(MIN_BATCHES_TO_RETAIN_KEY, DEFAULT_MIN_BATCHES_TO_RETAIN) 45 | 46 | val metadataDir = parameters.get(METADATA_DIR_KEY) 47 | 48 | private def withIntParameter(name: String, default: Int): Int = { 49 | parameters.get(name).map { str => 50 | Try(str.toInt).toOption.filter(_ > 0).getOrElse { 51 | throw new IllegalArgumentException( 52 | s"Invalid value '$str' for option '$name', must be a positive integer") 53 | } 54 | }.getOrElse(default) 55 | } 56 | 57 | private def withLongParameter(name: String, default: Long): Long = { 58 | parameters.get(name).map { str => 59 | Try(str.toLong).toOption.filter(_ >= 0).getOrElse { 60 | throw new IllegalArgumentException( 61 | s"Invalid value '$str' for option '$name', must be a positive integer") 62 | } 63 | }.getOrElse(default) 64 | } 65 | 66 | private def withBooleanParameter(name: String, default: Boolean): Boolean = { 67 | parameters.get(name).map { str => 68 | try { 69 | str.toBoolean 70 | } catch { 71 | case _: IllegalArgumentException => 72 | throw new IllegalArgumentException( 73 | s"Invalid value '$str' for option '$name', must be true or false") 74 | } 75 | }.getOrElse(default) 76 | } 77 | 78 | } 79 | 80 | object HiveAcidSinkOptions { 81 | 82 | val DEFAULT_CLEANUP_DELAY = TimeUnit.MINUTES.toMillis(10) 83 | val DEFAULT_LOG_DELETION = true 84 | val DEFAULT_COMPACT_INTERVAL = 10 85 | val DEFAULT_MIN_BATCHES_TO_RETAIN = 100 86 | 87 | val CLEANUP_DELAY_KEY = "spark.acid.streaming.log.cleanupDelayMs" 88 | val LOG_DELETION_KEY = "spark.acid.streaming.log.deletion" 89 | val COMPACT_INTERVAL_KEY = "spark.acid.streaming.log.compactInterval" 90 | val MIN_BATCHES_TO_RETAIN_KEY = "spark.acid.streaming.log.minBatchesToRetain" 91 | val METADATA_DIR_KEY = "spark.acid.streaming.log.metadataDir" 92 | 93 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/streaming/HiveAcidStreamingCommitProtocol.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
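// Illustrative sketch (option values are assumptions): parameters are parsed case-insensitively
// and fall back to the defaults declared in the companion object above.
val sinkOptions = new HiveAcidSinkOptions(Map(
  "table" -> "default.acid_events",
  HiveAcidSinkOptions.COMPACT_INTERVAL_KEY -> "20",
  HiveAcidSinkOptions.LOG_DELETION_KEY -> "false"))
sinkOptions.compactInterval        // 20
sinkOptions.minBatchesToRetain     // 100, the default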
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.qubole.spark.hiveacid.streaming 19 | 20 | import org.apache.spark.internal.Logging 21 | 22 | class HiveAcidStreamingCommitProtocol(fileLog: HiveAcidSinkLog) extends Serializable with Logging { 23 | 24 | import HiveAcidStreamingCommitProtocol._ 25 | 26 | def commitJob(batchId: Long, txnId: Long): Unit = { 27 | 28 | def commitJobRetry(retryRemaining: Int, f: () => Unit): Boolean = { 29 | var retry = false 30 | try { 31 | f() 32 | } 33 | catch { 34 | case ie: IllegalStateException if ie.getMessage.contains("Race while writing batch") => 35 | throw ie 36 | case e: Exception => 37 | if (retryRemaining > 0) { 38 | logError(s"Unexpected error while writing commit file for batch $batchId ... " + 39 | s"Retrying", e) 40 | retry = true 41 | } else { 42 | logError(s"Unexpected error while writing commit file for batch $batchId ... " + 43 | s"Max retries reached", e) 44 | throw e 45 | } 46 | } 47 | retry 48 | } 49 | 50 | val array = Array(HiveAcidSinkStatus(txnId, HiveAcidSinkLog.ADD_ACTION)) 51 | 52 | val commitJobAttempt = () => { 53 | if (fileLog.add(batchId, array)) { 54 | logInfo(s"Committed batch $batchId") 55 | } else { 56 | throw new IllegalStateException(s"Race while writing batch $batchId") 57 | } 58 | } 59 | 60 | var sleepSec = 1 61 | var retryRemaining = MAX_COMMIT_JOB_RETRIES - 1 62 | while (commitJobRetry(retryRemaining, commitJobAttempt)) { 63 | retryRemaining = retryRemaining - 1 64 | Thread.sleep(sleepSec * 1000) 65 | sleepSec = sleepSec * EXPONENTIAL_BACK_OFF_FACTOR 66 | } 67 | 68 | } 69 | 70 | } 71 | 72 | object HiveAcidStreamingCommitProtocol { 73 | 74 | val MAX_COMMIT_JOB_RETRIES = 3 75 | val EXPONENTIAL_BACK_OFF_FACTOR = 2 76 | 77 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/transaction/HiveAcidTxn.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package com.qubole.spark.hiveacid.transaction 19 | 20 | import java.util.concurrent.atomic.AtomicBoolean 21 | 22 | import com.qubole.shaded.hadoop.hive.common.{ValidTxnList, ValidWriteIdList} 23 | import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidOperation, SparkAcidConf} 24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 25 | import org.apache.spark.internal.Logging 26 | import org.apache.spark.sql.SparkSession 27 | 28 | /** 29 | * Hive Acid Transaction object. 30 | * @param sparkSession: Spark Session 31 | */ 32 | class HiveAcidTxn(sparkSession: SparkSession) extends Logging { 33 | 34 | HiveAcidTxn.setUpTxnManager(sparkSession) 35 | 36 | // txn ID 37 | protected var id: Long = -1 38 | protected var validTxnList: ValidTxnList = _ 39 | private [hiveacid] val isClosed: AtomicBoolean = new AtomicBoolean(true) 40 | 41 | private def setTxn(id: Long, txns:ValidTxnList): Unit = { 42 | this.id = id 43 | this.validTxnList = txns 44 | isClosed.set(false) 45 | } 46 | 47 | private def unsetTxn(): Unit = { 48 | this.id = -1 49 | this.validTxnList = null 50 | isClosed.set(true) 51 | } 52 | 53 | override def toString: String = s"""{"id":"$id","validTxns":"$validTxnList"}""" 54 | 55 | /** 56 | * Public API to begin a transaction. 57 | */ 58 | def begin(): Unit = synchronized { 59 | if (!isClosed.get) { 60 | throw HiveAcidErrors.txnAlreadyOpen(id) 61 | } 62 | val newId = HiveAcidTxn.txnManager.beginTxn(this) 63 | val txnList = HiveAcidTxn.txnManager.getValidTxns(Some(newId)) 64 | setTxn(newId, txnList) 65 | // Set it for the thread for all future references. 66 | HiveAcidTxn.threadLocal.set(this) 67 | logDebug(s"Begin transaction $this") 68 | } 69 | 70 | /** 71 | * Public API to end a transaction. 72 | * @param abort true if transaction is aborted 73 | */ 74 | def end(abort: Boolean = false): Unit = synchronized { 75 | if (isClosed.get) { 76 | throw HiveAcidErrors.txnAlreadyClosed(id) 77 | } 78 | 79 | logDebug(s"End transaction $this abort = $abort") 80 | // NB: Proactively unset it for the thread, irrespective of whether 81 | // the underlying call fails or succeeds.
82 | HiveAcidTxn.threadLocal.set(null) 83 | HiveAcidTxn.txnManager.endTxn(id, abort) 84 | unsetTxn() 85 | } 86 | 87 | private[hiveacid] def acquireLocks(hiveAcidMetadata: HiveAcidMetadata, 88 | operationType: HiveAcidOperation.OperationType, 89 | partitionNames: Seq[String], 90 | conf: SparkAcidConf): Unit = { 91 | if (isClosed.get()) { 92 | logError(s"Transaction already closed $this") 93 | throw HiveAcidErrors.txnAlreadyClosed(id) 94 | } 95 | HiveAcidTxn.txnManager.acquireLocks(id, hiveAcidMetadata.dbName, 96 | hiveAcidMetadata.tableName, operationType, partitionNames, hiveAcidMetadata.isPartitioned, conf) 97 | } 98 | 99 | private[hiveacid] def addDynamicPartitions(writeId: Long, 100 | dbName: String, 101 | tableName: String, 102 | operationType: HiveAcidOperation.OperationType, 103 | partitions: Set[String]) = { 104 | if (isClosed.get()) { 105 | logError(s"Transaction already closed $this") 106 | throw HiveAcidErrors.txnAlreadyClosed(id) 107 | } 108 | logDebug(s"Adding dynamic partition txnId: $id writeId: $writeId dbName: $dbName" + 109 | s" tableName: $tableName partitions: ${partitions.mkString(",")}") 110 | HiveAcidTxn.txnManager.addDynamicPartitions(id, writeId, dbName, 111 | tableName, partitions, operationType) 112 | } 113 | // Public Interface 114 | def txnId: Long = id 115 | } 116 | 117 | object HiveAcidTxn extends Logging { 118 | 119 | val threadLocal = new ThreadLocal[HiveAcidTxn] 120 | 121 | // Helper function to create snapshot. 122 | private[hiveacid] def createSnapshot(txn: HiveAcidTxn, hiveAcidMetadata: HiveAcidMetadata): HiveAcidTableSnapshot = { 123 | val currentWriteId = txnManager.getCurrentWriteId(txn.txnId, 124 | hiveAcidMetadata.dbName, hiveAcidMetadata.tableName) 125 | val validWriteIdList: ValidWriteIdList = getValidWriteIds(txn, hiveAcidMetadata) 126 | HiveAcidTableSnapshot(validWriteIdList, currentWriteId) 127 | } 128 | 129 | private[hiveacid] def getValidWriteIds(txn: HiveAcidTxn, hiveAcidMetadata: HiveAcidMetadata) = { 130 | val validWriteIdList = if (txn.txnId == -1) { 131 | throw HiveAcidErrors.tableWriteIdRequestedBeforeTxnStart(hiveAcidMetadata.fullyQualifiedName) 132 | } else { 133 | txnManager.getValidWriteIds(txn.txnId, txn.validTxnList, hiveAcidMetadata.fullyQualifiedName) 134 | } 135 | validWriteIdList 136 | } 137 | 138 | // The txn manager is the connection to the HMS. Use a single instance of it. 139 | var txnManager: HiveAcidTxnManager = _ 140 | private def setUpTxnManager(sparkSession: SparkSession): Unit = synchronized { 141 | if (txnManager == null) { 142 | txnManager = new HiveAcidTxnManager(sparkSession) 143 | } 144 | } 145 | 146 | /** 147 | * Creates a read or write transaction based on the user request. 148 | * 149 | * @param sparkSession Spark session used to create the new Hive ACID transaction 150 | * @return 151 | */ 152 | def createTransaction(sparkSession: SparkSession): HiveAcidTxn = { 153 | setUpTxnManager(sparkSession) 154 | new HiveAcidTxn(sparkSession) 155 | } 156 | 157 | /** 158 | * Returns the HiveAcidTxn associated with the current thread, if any (null otherwise). 159 | * @return 160 | */ 161 | def currentTxn(): HiveAcidTxn = { 162 | threadLocal.get() 163 | } 164 | 165 | /** 166 | * Check whether the valid write IDs for `fullyQualifiedTableName` observed when `txn` was opened 167 | * are still the same now. This should be invoked after `txn` acquires locks, to see 168 | * if the transaction is still valid before continuing.
169 | */ 170 | def IsTxnStillValid(txn: HiveAcidTxn, fullyQualifiedTableName: String): Boolean = { 171 | if (txn.txnId == - 1) { 172 | logWarning(s"Transaction being validated even before it was open") 173 | false 174 | } else { 175 | // Compare the earlier writeIds of fullyQualifiedTableName with the current one. 176 | val previousWriteIdList = txnManager.getValidWriteIds(txn.txnId, txn.validTxnList, fullyQualifiedTableName) 177 | val currentValidList = txnManager.getValidTxns(Some(txn.txnId)) 178 | val currentWriteIdList = txnManager.getValidWriteIds(txn.txnId, currentValidList, fullyQualifiedTableName) 179 | // Checks if any new write transaction was started and committed 180 | // after opening transaction and before acquiring locks using HighWaterMark 181 | if (previousWriteIdList.getHighWatermark == currentWriteIdList.getHighWatermark) { 182 | // Check all the open transactions when current transaction was opened, 183 | // are still invalid i.e., either running/open or aborted. 184 | val prevOpenInvalidWriteIds = previousWriteIdList.getInvalidWriteIds 185 | .filter(!previousWriteIdList.isWriteIdAborted(_)).toSet 186 | val currentInvalidWriteIds = currentWriteIdList.getInvalidWriteIds.toSet 187 | // Previous open transactions should still be invalid 188 | if (prevOpenInvalidWriteIds.isEmpty || 189 | prevOpenInvalidWriteIds.diff(currentInvalidWriteIds).isEmpty) { 190 | logDebug("All previous open transactions are still invalid! Transaction is valid!") 191 | true 192 | } else { 193 | logWarning("Prev Open transactions: " + prevOpenInvalidWriteIds.diff(currentInvalidWriteIds).mkString(", ") 194 | + " have been committed. Transaction " + txn.txnId + " is not valid !") 195 | false 196 | } 197 | } else { 198 | logWarning("HighWatermark moved from " + 199 | previousWriteIdList.getHighWatermark + " to " + 200 | currentWriteIdList.getHighWatermark + 201 | ". Transaction " + txn.txnId + " is not valid !") 202 | false 203 | } 204 | } 205 | } 206 | } 207 | 208 | private[hiveacid] case class HiveAcidTableSnapshot(validWriteIdList: ValidWriteIdList, currentWriteId: Long) 209 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/.gitignore: -------------------------------------------------------------------------------- 1 | *.scalae 2 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/HiveAcidKyroRegistrator.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.util 21 | 22 | import com.esotericsoftware.kryo.Kryo 23 | import org.apache.spark.serializer.KryoRegistrator 24 | import com.esotericsoftware.kryo.serializers.JavaSerializer 25 | 26 | class HiveAcidKyroRegistrator extends KryoRegistrator { 27 | override def registerClasses(kryo: Kryo): Unit = { 28 | kryo.register(classOf[com.qubole.spark.hiveacid.util.SerializableConfiguration], new JavaSerializer) 29 | } 30 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/SerializableConfiguration.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.util 21 | 22 | import java.io.{ObjectInputStream, ObjectOutputStream} 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | 26 | /** 27 | * Utility class to make configuration object serializable 28 | */ 29 | private[hiveacid] class SerializableConfiguration(@transient var value: Configuration) 30 | extends Serializable { 31 | private def writeObject(out: ObjectOutputStream): Unit = Util.tryOrIOException { 32 | out.defaultWriteObject() 33 | value.write(out) 34 | } 35 | 36 | private def readObject(in: ObjectInputStream): Unit = Util.tryOrIOException { 37 | value = new Configuration(false) 38 | value.readFields(in) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/SerializableWritable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.util 21 | 22 | import java.io._ 23 | 24 | import org.apache.hadoop.conf.Configuration 25 | import org.apache.hadoop.io.ObjectWritable 26 | import org.apache.hadoop.io.Writable 27 | 28 | /** 29 | * Utility class to make a Writable serializable 30 | */ 31 | private[hiveacid] class SerializableWritable[T <: Writable](@transient var t: T) 32 | extends Serializable { 33 | 34 | def value: T = t 35 | 36 | override def toString: String = t.toString 37 | 38 | private def writeObject(out: ObjectOutputStream): Unit = Util.tryOrIOException { 39 | out.defaultWriteObject() 40 | new ObjectWritable(t).write(out) 41 | } 42 | 43 | private def readObject(in: ObjectInputStream): Unit = Util.tryOrIOException { 44 | in.defaultReadObject() 45 | val ow = new ObjectWritable() 46 | ow.setConf(new Configuration(false)) 47 | ow.readFields(in) 48 | t = ow.get().asInstanceOf[T] 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/util/Util.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.util 21 | 22 | import java.io.IOException 23 | 24 | import org.apache.spark.internal.Logging 25 | 26 | import scala.util.control.NonFatal 27 | 28 | private[hiveacid] object Util extends Logging { 29 | 30 | def classForName(className: String, loadShaded: Boolean = false): Class[_] = { 31 | val classToLoad = if (loadShaded) { 32 | className.replaceFirst("org.apache.hadoop.hive.", "com.qubole.shaded.hadoop.hive.") 33 | } else { 34 | className 35 | } 36 | Class.forName(classToLoad, true, Thread.currentThread().getContextClassLoader) 37 | } 38 | 39 | /** 40 | * Detect whether this thread might be executing a shutdown hook. Will always return true if 41 | * the current thread is a running a shutdown hook but may spuriously return true otherwise (e.g. 42 | * if System.exit was just called by a concurrent thread). 43 | * 44 | * Currently, this detects whether the JVM is shutting down by Runtime#addShutdownHook throwing 45 | * an IllegalStateException. 
46 | */ 47 | def inShutdown(): Boolean = { 48 | try { 49 | val hook: Thread = new Thread { 50 | override def run() {} 51 | } 52 | // scalastyle:off runtimeaddshutdownhook 53 | Runtime.getRuntime.addShutdownHook(hook) 54 | // scalastyle:on runtimeaddshutdownhook 55 | Runtime.getRuntime.removeShutdownHook(hook) 56 | } catch { 57 | case _: IllegalStateException => return true 58 | } 59 | false 60 | } 61 | 62 | def tryOrIOException[T](block: => T): T = { 63 | try { 64 | block 65 | } catch { 66 | case e: IOException => 67 | logError("Exception encountered", e) 68 | throw e 69 | case NonFatal(e) => 70 | logError("Exception encountered", e) 71 | throw new IOException(e) 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/writer/Writer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.writer 21 | 22 | import org.apache.spark.sql.catalyst.InternalRow 23 | import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec 24 | 25 | private[hiveacid] trait Writer { 26 | def process(row: InternalRow): Unit 27 | def close(): Unit 28 | def partitionsTouched(): Seq[TablePartitionSpec] 29 | } -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/writer/WriterOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.writer 21 | 22 | import com.qubole.spark.hiveacid.HiveAcidOperation 23 | import com.qubole.spark.hiveacid.util.SerializableConfiguration 24 | import org.apache.spark.sql.catalyst.expressions.Attribute 25 | import org.apache.spark.sql.types.StructType 26 | 27 | /** 28 | * Writer options which will be serialized and sent to each executor 29 | */ 30 | private[hiveacid] class WriterOptions(val currentWriteId: Long, 31 | val operationType: HiveAcidOperation.OperationType, 32 | val serializableHadoopConf: SerializableConfiguration, 33 | val tableSchemaWithrowID: StructType, 34 | val dataColumns: Seq[Attribute], 35 | val partitionColumns: Seq[Attribute], 36 | val allColumns: Seq[Attribute], 37 | val timeZoneId: String, 38 | val statementId: Option[Int] = None) extends Serializable 39 | -------------------------------------------------------------------------------- /src/main/scala/com/qubole/spark/hiveacid/writer/hive/HiveAcidWriterOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package com.qubole.spark.hiveacid.writer.hive 21 | 22 | import com.qubole.shaded.hadoop.hive.ql.plan.FileSinkDesc 23 | import com.qubole.spark.hiveacid.HiveAcidOperation 24 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 25 | import com.qubole.spark.hiveacid.writer.WriterOptions 26 | import org.apache.hadoop.fs.Path 27 | 28 | private[writer] class HiveAcidWriterOptions(val rootPath: String, 29 | fileSinkDesc: FileSinkDesc) extends Serializable { 30 | lazy val getFileSinkDesc: FileSinkDesc = { 31 | fileSinkDesc.setDirName(new Path(rootPath)) 32 | fileSinkDesc 33 | } 34 | } 35 | 36 | private[writer] object HiveAcidWriterOptions { 37 | def get(hiveAcidMetadata: HiveAcidMetadata, 38 | options: WriterOptions): HiveAcidWriterOptions = { 39 | lazy val fileSinkDescriptor: FileSinkDesc = { 40 | val fileSinkDesc: FileSinkDesc = new FileSinkDesc() 41 | fileSinkDesc.setTableInfo(hiveAcidMetadata.tableDesc) 42 | fileSinkDesc.setTableWriteId(options.currentWriteId) 43 | if (options.operationType == HiveAcidOperation.INSERT_OVERWRITE) { 44 | fileSinkDesc.setInsertOverwrite(true) 45 | } 46 | if (options.statementId.isDefined) { 47 | fileSinkDesc.setStatementId(options.statementId.get) 48 | } 49 | fileSinkDesc 50 | } 51 | new HiveAcidWriterOptions(rootPath = hiveAcidMetadata.rootPath.toUri.toString, 52 | fileSinkDesc = fileSinkDescriptor) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/SqlUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package org.apache.spark.sql 21 | 22 | import org.apache.spark.rdd.RDD 23 | import org.apache.spark.sql.catalyst.InternalRow 24 | import org.apache.spark.sql.catalyst.analysis._ 25 | import org.apache.spark.sql.catalyst.catalog.HiveTableRelation 26 | import org.apache.spark.sql.catalyst.encoders.RowEncoder 27 | import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} 28 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression} 29 | import org.apache.spark.sql.execution.LogicalRDD 30 | import org.apache.spark.sql.execution.datasources.LogicalRelation 31 | import org.apache.spark.sql.types.StructType 32 | 33 | object SqlUtils { 34 | def convertToDF(sparkSession: SparkSession, plan : LogicalPlan): DataFrame = { 35 | Dataset.ofRows(sparkSession, plan) 36 | } 37 | 38 | def resolveReferences(sparkSession: SparkSession, 39 | expr: Expression, 40 | planContaining: LogicalPlan, failIfUnresolved: Boolean, 41 | exprName: Option[String] = None): Expression = { 42 | resolveReferences(sparkSession, expr, Seq(planContaining), failIfUnresolved, exprName) 43 | } 44 | 45 | def resolveReferences(sparkSession: SparkSession, 46 | expr: Expression, 47 | planContaining: Seq[LogicalPlan], 48 | failIfUnresolved: Boolean, 49 | exprName: Option[String]): Expression = { 50 | val newPlan = FakeLogicalPlan(expr, planContaining) 51 | val resolvedExpr = sparkSession.sessionState.analyzer.execute(newPlan) match { 52 | case FakeLogicalPlan(resolvedExpr: Expression, _) => 53 | // Return even if it did not successfully resolve 54 | resolvedExpr 55 | case _ => 56 | expr 57 | // This is unexpected 58 | } 59 | if (failIfUnresolved) { 60 | resolvedExpr.flatMap(_.references).filter(!_.resolved).foreach { 61 | attr => { 62 | val failedMsg = exprName match { 63 | case Some(name) => s"${attr.sql} resolution in $name given these columns: "+ 64 | planContaining.flatMap(_.output).map(_.name).mkString(",") 65 | case _ => s"${attr.sql} resolution failed given these columns: "+ 66 | planContaining.flatMap(_.output).map(_.name).mkString(",") 67 | } 68 | attr.failAnalysis(failedMsg) 69 | } 70 | } 71 | } 72 | resolvedExpr 73 | } 74 | 75 | def hasSparkStopped(sparkSession: SparkSession): Boolean = { 76 | sparkSession.sparkContext.stopped.get() 77 | } 78 | 79 | /** 80 | * Qualify all the column names in the DF. 
81 | * Attributes used in DF output will have fully qualified names 82 | * @param sparkSession 83 | * @param df DataFrame created by reading ACID table 84 | * @param fullyQualifiedTableName Qualified name of the Hive ACID Table 85 | * @return 86 | */ 87 | def getDFQualified(sparkSession: SparkSession, 88 | df: DataFrame, 89 | fullyQualifiedTableName: String) = { 90 | val plan = df.queryExecution.analyzed 91 | val qualifiedPlan = plan match { 92 | case p: LogicalRelation => 93 | p.copy(output = p.output 94 | .map((x: AttributeReference) => 95 | x.withQualifier(fullyQualifiedTableName.split('.').toSeq)) 96 | ) 97 | case h: HiveTableRelation => 98 | // Qualify data and partition columns in a single copy(); two chained copy() 99 | // calls would silently discard the dataCols qualification. 100 | h.copy(dataCols = h.dataCols 101 | .map((x: AttributeReference) => 102 | x.withQualifier(fullyQualifiedTableName.split('.').toSeq)), 103 | partitionCols = h.partitionCols 104 | .map((x: AttributeReference) => 105 | x.withQualifier(fullyQualifiedTableName.split('.').toSeq))) 106 | case _ => plan 107 | } 108 | 109 | val newDf = SqlUtils.convertToDF(sparkSession, qualifiedPlan) 110 | (qualifiedPlan, newDf) 111 | } 112 | 113 | def logicalPlanToDataFrame(sparkSession: SparkSession, 114 | logicalPlan: LogicalPlan): DataFrame = { 115 | Dataset.ofRows(sparkSession, logicalPlan) 116 | } 117 | 118 | /** 119 | * Convert RDD into DataFrame using the attributeList. 120 | * Based on [[SparkSession.createDataFrame()]] implementation but here, 121 | * attributes are provided. 122 | * @param sparkSession 123 | * @param rdd 124 | * @param schema 125 | * @param attributes 126 | * @return 127 | */ 128 | def createDataFrameUsingAttributes(sparkSession: SparkSession, 129 | rdd: RDD[Row], 130 | schema: StructType, 131 | attributes: Seq[Attribute]): DataFrame = { 132 | val encoder = RowEncoder(schema) 133 | val catalystRows = rdd.map(encoder.toRow) 134 | val logicalPlan = LogicalRDD( 135 | attributes, 136 | catalystRows, 137 | isStreaming = false)(sparkSession) 138 | Dataset.ofRows(sparkSession, logicalPlan) 139 | } 140 | 141 | def analysisException(cause: String): Throwable = { 142 | new AnalysisException(cause) 143 | } 144 | 145 | def removeTopSubqueryAlias(logicalPlan: LogicalPlan): LogicalPlan = { 146 | logicalPlan match { 147 | case SubqueryAlias(_, child: LogicalPlan) => child 148 | case _ => logicalPlan 149 | } 150 | } 151 | } 152 | 153 | case class FakeLogicalPlan(expr: Expression, children: Seq[LogicalPlan]) 154 | extends LogicalPlan { 155 | override def output: Seq[Attribute] = children.foldLeft(Seq[Attribute]())((out, child) => out ++ child.output) 156 | } 157 | -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/catalyst/parser/plans/logical/MergePlan.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.sql.catalyst.parser.plans.logical 19 | 20 | import com.qubole.spark.hiveacid.merge.{MergeWhenClause} 21 | import org.apache.spark.sql.{SparkSession, SqlUtils} 22 | import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} 23 | import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan} 24 | 25 | case class MergePlan(sourcePlan: LogicalPlan, 26 | targetPlan: LogicalPlan, 27 | condition: Expression, 28 | matched: Seq[MergeWhenClause], 29 | notMatched: Option[MergeWhenClause]) extends Command { 30 | override def children: Seq[LogicalPlan] = Seq(sourcePlan, targetPlan) 31 | override def output: Seq[Attribute] = Seq.empty 32 | } 33 | 34 | object MergePlan { 35 | def resolve(sparkSession: SparkSession, mergePlan: MergePlan): MergePlan = { 36 | MergeWhenClause.validate(mergePlan.matched ++ mergePlan.notMatched) 37 | val resolvedCondition = SqlUtils.resolveReferences(sparkSession, mergePlan.condition, 38 | mergePlan.children, true, None) 39 | val resolvedMatched = MergeWhenClause.resolve(sparkSession, mergePlan, mergePlan.matched) 40 | val resolvedNotMatched = mergePlan.notMatched.map { 41 | x => x.resolve(sparkSession, mergePlan) 42 | } 43 | 44 | MergePlan(mergePlan.sourcePlan, 45 | mergePlan.targetPlan, 46 | resolvedCondition, 47 | resolvedMatched, 48 | resolvedNotMatched) 49 | } 50 | } -------------------------------------------------------------------------------- /src/main/scala/org/apache/spark/sql/hive/HiveAcidUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | package org.apache.spark.sql.hive 21 | 22 | import scala.collection.JavaConverters._ 23 | import com.qubole.spark.hiveacid.hive.HiveAcidMetadata 24 | import org.apache.spark.sql.AnalysisException 25 | import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTablePartition, CatalogUtils} 26 | import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, Expression, InterpretedPredicate, PrettyAttribute} 27 | 28 | object HiveAcidUtils { 29 | 30 | /** 31 | * This is adapted from [[org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.prunePartitionsByFilter]] 32 | * Instead of [[org.apache.spark.sql.catalyst.catalog.CatalogTable]] this function will be using [[HiveAcidMetadata]] 33 | * @param hiveAcidMetadata 34 | * @param inputPartitions 35 | * @param predicates 36 | * @param defaultTimeZoneId 37 | * @return 38 | */ 39 | def prunePartitionsByFilter( 40 | hiveAcidMetadata: HiveAcidMetadata, 41 | inputPartitions: Seq[CatalogTablePartition], 42 | predicates: Option[Expression], 43 | defaultTimeZoneId: String): Seq[CatalogTablePartition] = { 44 | if (predicates.isEmpty) { 45 | inputPartitions 46 | } else { 47 | val partitionSchema = hiveAcidMetadata.partitionSchema 48 | val partitionColumnNames = hiveAcidMetadata.partitionSchema.fieldNames.toSet 49 | 50 | val nonPartitionPruningPredicates = predicates.filterNot { 51 | _.references.map(_.name).toSet.subsetOf(partitionColumnNames) 52 | } 53 | if (nonPartitionPruningPredicates.nonEmpty) { 54 | throw new AnalysisException("Expected only partition pruning predicates: " + 55 | nonPartitionPruningPredicates) 56 | } 57 | 58 | val boundPredicate = 59 | InterpretedPredicate.create(predicates.get.transform { 60 | case att: Attribute => 61 | val index = partitionSchema.indexWhere(_.name == att.name) 62 | BoundReference(index, partitionSchema(index).dataType, nullable = true) 63 | }) 64 | 65 | inputPartitions.filter { p => 66 | boundPredicate.eval(p.toRow(partitionSchema, defaultTimeZoneId)) 67 | } 68 | } 69 | } 70 | 71 | def convertToCatalogTablePartition(hp: com.qubole.shaded.hadoop.hive.ql.metadata.Partition): CatalogTablePartition = { 72 | val apiPartition = hp.getTPartition 73 | val properties: Map[String, String] = if (hp.getParameters != null) { 74 | hp.getParameters.asScala.toMap 75 | } else { 76 | Map.empty 77 | } 78 | CatalogTablePartition( 79 | spec = Option(hp.getSpec).map(_.asScala.toMap).getOrElse(Map.empty), 80 | storage = CatalogStorageFormat( 81 | locationUri = Option(CatalogUtils.stringToURI(apiPartition.getSd.getLocation)), 82 | inputFormat = Option(apiPartition.getSd.getInputFormat), 83 | outputFormat = Option(apiPartition.getSd.getOutputFormat), 84 | serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib), 85 | compressed = apiPartition.getSd.isCompressed, 86 | properties = Option(apiPartition.getSd.getSerdeInfo.getParameters) 87 | .map(_.asScala.toMap).orNull), 88 | createTime = apiPartition.getCreateTime.toLong * 1000, 89 | lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000, 90 | parameters = properties, 91 | stats = None) // TODO: need to implement readHiveStats 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/scala/com/qubole/spark/hiveacid/merge/MergeClauseSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Qubole, Inc. All rights reserved. 
3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | package com.qubole.spark.hiveacid.merge 21 | 22 | import org.apache.spark.SparkFunSuite 23 | import org.apache.spark.sql.{AnalysisException, functions} 24 | 25 | class MergeClauseSuite extends SparkFunSuite { 26 | def insertClause(addCondition : Boolean = true): MergeWhenNotInsert = { 27 | if (addCondition) { 28 | MergeWhenNotInsert(Some(functions.expr("x > 2").expr), 29 | Seq(functions.col("x").expr, functions.col("y").expr)) 30 | } 31 | else { 32 | MergeWhenNotInsert(None, 33 | Seq(functions.col("x").expr, functions.col("y").expr)) 34 | } 35 | } 36 | 37 | def updateClause(addCondition : Boolean = true): MergeWhenUpdateClause = { 38 | if (addCondition) { 39 | val updateCondition = Some(functions.expr("a > 2").expr) 40 | MergeWhenUpdateClause(updateCondition, 41 | Map("b" -> functions.lit(3).expr), isStar = false) 42 | } else { 43 | MergeWhenUpdateClause(None, 44 | Map("b" -> functions.lit(3).expr), isStar = false) 45 | } 46 | } 47 | 48 | def deleteClause(addCondition : Boolean = true): MergeWhenDelete = { 49 | if (addCondition) { 50 | MergeWhenDelete(Some(functions.expr("a < 1").expr)) 51 | } else { 52 | MergeWhenDelete(None) 53 | } 54 | } 55 | 56 | test("Validate MergeClauses") { 57 | val clauses = Seq(insertClause(), updateClause(), deleteClause()) 58 | MergeWhenClause.validate(clauses) 59 | } 60 | 61 | test("Invalid MergeClause cases") { 62 | val invalidMerge = "MERGE Validation Error: " 63 | 64 | //empty clauses 65 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.atleastOneClauseError, Seq()) 66 | 67 | // multi update or insert clauses 68 | val multiUpdateClauses = Seq(updateClause(), updateClause(), insertClause()) 69 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.justOneClausePerTypeError, multiUpdateClauses) 70 | 71 | // multi match clauses with first clause without condition 72 | val invalidMultiMatch = Seq(updateClause(false), deleteClause()) 73 | checkInvalidMergeClause(invalidMerge + MergeWhenClause.matchClauseConditionError, invalidMultiMatch) 74 | 75 | // invalid Update Clause 76 | val invalidUpdateClause = MergeWhenUpdateClause(None, Map(), isStar = false) 77 | val thrown = intercept[IllegalArgumentException] { 78 | MergeWhenClause.validate(Seq(invalidUpdateClause)) 79 | } 80 | assert(thrown.getMessage === "UPDATE Clause in MERGE should have one or more SET Values") 81 | } 82 | 83 | private def checkInvalidMergeClause(invalidMessage: String, multiUpdateClauses: Seq[MergeWhenClause]) = { 84 | val thrown = intercept[AnalysisException] { 85 | MergeWhenClause.validate(multiUpdateClauses) 86 | } 87 | assert(thrown.message === invalidMessage) 88 | } 89 | } 90 | 
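Editor's note: the suite above drives the merge clause model directly. For orientation, a minimal sketch (not part of the repository; column names and literal values are illustrative) of how these clause classes are assembled and validated before a MERGE is planned:

import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenDelete, MergeWhenNotInsert, MergeWhenUpdateClause}
import org.apache.spark.sql.functions

// One clause of each kind, mirroring the helpers in MergeClauseSuite.
val update = MergeWhenUpdateClause(Some(functions.expr("a > 2").expr),
  Map("b" -> functions.lit(3).expr), isStar = false)
val delete = MergeWhenDelete(Some(functions.expr("a < 1").expr))
val insert = MergeWhenNotInsert(Some(functions.expr("x > 2").expr),
  Seq(functions.col("x").expr, functions.col("y").expr))

// Rejects invalid combinations: no clauses at all, more than one clause per type,
// or a first MATCHED clause without a condition (AnalysisException), and an UPDATE
// clause with an empty SET list (IllegalArgumentException).
MergeWhenClause.validate(Seq(update, delete, insert))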
-------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | version in ThisBuild := "0.6.0" --------------------------------------------------------------------------------
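Closing note: the streaming sink options declared in HiveAcidSinkOptions are ordinary data source options passed through Structured Streaming. Below is a hedged usage sketch; the data source short name "HiveAcid", the table name, and the checkpoint path are assumptions, while the option keys, the mandatory "table" option, and the quoted defaults come from HiveAcidSinkOptions itself.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery

def startAcidSink(events: DataFrame): StreamingQuery = {
  events.writeStream
    .format("HiveAcid")                                           // assumed sink short name
    .option("table", "default.acid_events")                       // required; if missing, an IllegalArgumentException is thrown
    .option("spark.acid.streaming.log.compactInterval", "20")     // default 10
    .option("spark.acid.streaming.log.minBatchesToRetain", "50")  // default 100
    .option("spark.acid.streaming.log.cleanupDelayMs", "300000")  // default 10 minutes
    .option("checkpointLocation", "/tmp/acid-sink-checkpoint")    // standard Spark checkpoint option
    .start()
}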