├── .gitignore
├── README.md
├── core
├── pom.xml
└── src
│ └── main
│ ├── resources
│ └── META-INF
│ │ └── services
│ │ └── org.apache.spark.sql.sources.DataSourceRegister
│ └── scala
│ └── com
│ └── hortonworks
│ └── spark
│ └── hive
│ ├── HiveSourceProvider.scala
│ ├── HiveStreamDataWriter.scala
│ ├── HiveStreamWriter.scala
│ ├── common
│ ├── CachedHiveWriters.scala
│ ├── HiveOptions.scala
│ └── HiveWriter.scala
│ └── utils
│ ├── HiveIsolatedClassLoader.scala
│ └── Logging.scala
├── example
├── pom.xml
└── src
│ └── main
│ └── scala
│ └── com
│ └── hortonworks
│ └── spark
│ └── hive
│ └── example
│ ├── HiveStreamingExample.scala
│ └── RateStreamingExample.scala
├── pom.xml
└── scalastyle-config.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.log
3 | .idea
4 | *.iml
5 | target
6 | metastore_db
7 | tmp
8 | dependency-reduced-pom.xml
9 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Spark Hive Streaming Sink
2 | ===
3 |
4 | A sink that saves a Spark Structured Streaming DataFrame into a Hive table.
5 |
6 |
7 | This sink:
8 |
9 | 1. Saves a Structured Streaming micro-batch or continuous-processing (Spark 2.3+) DataFrame into a Hive table.
10 | 2. Uses an isolated classloader for Hive-related dependencies, so it can work with Hive versions other than the one built into Spark.
11 | 3. Uses the new DataSource V2 API, so it only works with Spark 2.3+.
12 |
13 | Details of Hive Streaming Data Ingest can be found [here](https://cwiki.apache.org/confluence/display/Hive/Streaming+Data+Ingest).
14 |
15 | How To Build
16 | ==========
17 |
18 | To use this connector, you need Spark 2.3 or later.
19 |
20 | To build this project, please execute:
21 |
22 | ```shell
23 | mvn package -DskipTests
24 | ```
25 |
26 | `mvn package` generates two jars, including an uber jar; the uber jar is usually the most convenient one to use.
27 |
28 | How To Use
29 | ==========
30 |
31 | 1. Load the Spark Hive streaming sink jar into Spark's environment with `--jars`.
32 | 2. Create the destination Hive table before ingesting data into it. The table requirements are described [here](https://cwiki.apache.org/confluence/display/Hive/Streaming+Data+Ingest#StreamingDataIngest-StreamingRequirements).
33 | 3. Put a `hive-site.xml` with the required configuration on the Spark classpath so that it can be accessed by the classloader.
34 | 4. If you're running in a secured environment, provide a Kerberos principal and keytab.
35 |
36 | Be aware that a valid `hive-site.xml` and the keytab must also be accessible on the executor side, which means they should be distributed via `--files`, for example with a `spark-submit` invocation like the sketch below.
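
A minimal sketch of such an invocation, assuming a YARN deployment and the jar names produced by this build (the sink's uber jar and the example module jar); the master, paths, and arguments are placeholders to adjust for your environment:

```shell
# Sketch only: ship the sink's uber jar plus hive-site.xml and the keytab,
# then run the bundled socket example. All file paths here are placeholders.
spark-submit \
  --master yarn \
  --jars spark-hive-streaming-sink_2.11-assembly-0.1.0-SNAPSHOT.jar \
  --files hive-site.xml,user.keytab \
  --class com.hortonworks.spark.hive.example.HiveStreamingExample \
  spark-hive-streaming-sink-example_2.11-0.1.0-SNAPSHOT.jar \
  <host> <port> <metastore-uri> [principal] [keytab]
```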
37 |
38 | Using this library is similar to using other data source libraries, for example:
39 |
40 | ```scala
41 | val socket = sparkSession.readStream
42 | .format("socket")
43 | .options(Map("host" -> host, "port" -> port))
44 | .load()
45 | .as[String]
46 |
47 | val query = socket.map { s =>
48 | val records = s.split(",")
49 | assert(records.length >= 4)
50 | (records(0).toInt, records(1), records(2), records(3))
51 | }
52 | .selectExpr("_1 as id", "_2 as msg", "_3 as continent", "_4 as country")
53 | .writeStream
54 | .format("hive-streaming")
55 | .option("metastore", metastoreUri)
56 | .option("db", "default")
57 | .option("table", "alerts")
58 | .queryName("socket-hive-streaming")
59 | .start()
60 | ```
61 |
62 | Convert the data source schema to match the destination table's schema, as in `.selectExpr("_1 as id", "_2 as msg", "_3 as continent", "_4 as country")` above.
63 |
64 | Specify the data source format `hive-streaming` and the required options:
65 |
66 | 1. `metastore`: the Hive metastore URI(s) to connect to.
67 | 2. `db`: the database to write to.
68 | 3. `table`: the table to write to.
69 |
70 | The three options above are required to run a Hive streaming application; the remaining options are listed below, with an illustrative example after the table:
71 |
72 | option | default value | meaning
73 | ------ | ------------- | -------
74 | txn.per.batch | 100 | Hive grants a batch of transactions, rather than single transactions, to streaming clients. This setting configures the desired number of transactions per transaction batch. Data from all transactions in a single batch ends up in a single file. The sink writes at most `batch.size` records in each transaction of the batch, so this setting together with `batch.size` controls the size of each file. Note that Hive eventually compacts these files into larger files transparently.
75 | auto.create.partitions | true | Automatically create the necessary Hive partitions to stream to.
76 | principal | none | Kerberos user principal for accessing secure Hive.
77 | keytab | none | Kerberos keytab for accessing secure Hive.
78 | batch.size | 10000 | Max number of events written to Hive in a single Hive transaction.
79 |
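
For illustration, the optional settings are passed the same way as the required ones. The following is a sketch only: `df` stands for any streaming DataFrame whose schema already matches the destination table, and `metastoreUri`, `principal` and `keytabPath` are placeholders, not values from this project.

```scala
// Illustrative values only; tune txn.per.batch and batch.size for your load.
val query = df.writeStream
  .format("hive-streaming")
  .option("metastore", metastoreUri)
  .option("db", "default")
  .option("table", "alerts")
  .option("txn.per.batch", "100")            // transactions per transaction batch
  .option("batch.size", "10000")             // records per Hive transaction
  .option("auto.create.partitions", "true")
  .option("principal", principal)            // only needed on secured clusters
  .option("keytab", keytabPath)              // only needed on secured clusters
  .queryName("hive-streaming-example")
  .start()
```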
80 | License
81 | =======
82 |
83 | Apache License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.
84 |
--------------------------------------------------------------------------------
/core/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
21 | 4.0.0
22 |
23 |
24 | com.hortonworks.spark
25 | spark-hive-streaming-sink-main_2.11
26 | 0.1.0-SNAPSHOT
27 | ../pom.xml
28 |
29 |
30 | spark-hive-streaming-sink_2.11
31 | 0.1.0-SNAPSHOT
32 | jar
33 |
34 |
35 | target/scala-${scala.binary.version}/classes
36 | target/scala-${scala.binary.version}/test-classes
37 |
38 |
39 | org.apache.maven.plugins
40 | maven-shade-plugin
41 |
42 | false
43 |
44 |
45 | *:*
46 |
47 |
48 |
49 |
50 | *:*
51 |
52 | META-INF/*.SF
53 | META-INF/*.DSA
54 | META-INF/*.RSA
55 |
56 |
57 |
58 |
59 |
60 |
61 | package
62 |
63 | shade
64 |
65 |
66 |
67 |
68 |
69 | reference.conf
70 |
71 |
72 | log4j.properties
73 |
74 |
75 |
76 |
77 | ${project.build.directory}/${project.artifactId}-assembly-${project.version}.jar
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
--------------------------------------------------------------------------------
/core/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | com.hortonworks.spark.hive.HiveSourceProvider
--------------------------------------------------------------------------------
/core/src/main/scala/com/hortonworks/spark/hive/HiveSourceProvider.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive
19 |
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.sql.sources.DataSourceRegister
22 | import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport}
23 | import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
24 | import org.apache.spark.sql.streaming.OutputMode
25 | import org.apache.spark.sql.types.StructType
26 |
27 | import com.hortonworks.spark.hive.common.HiveOptions
28 |
29 | class HiveSourceProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister {
30 |
31 | override def shortName(): String = "hive-streaming"
32 |
33 | override def createStreamWriter(
34 | queryId: String,
35 | schema: StructType,
36 | outputMode: OutputMode,
37 | dataSourceOptions: DataSourceOptions): StreamWriter = {
38 | val localHiveOptions = HiveOptions.fromDataSourceOptions(dataSourceOptions)
39 | val session = SparkSession.getActiveSession.orElse(SparkSession.getDefaultSession)
40 | require(session.isDefined)
41 |
42 | if (outputMode != OutputMode.Append()) {
43 | throw new IllegalStateException("Hive Streaming only supports output with Append mode")
44 | }
45 |
46 | val destTable = try {
47 | session.get.sharedState.externalCatalog.getTable(
48 | localHiveOptions.dbName, localHiveOptions.tableName)
49 | } catch {
50 | case e: Exception => throw new IllegalStateException("Cannot find destination table in " +
51 | s"metastore, please create table ${localHiveOptions.tableName} at first", e)
52 | }
53 | val destSchema = destTable.schema
54 |
55 | if (schema.map(_.name).toSet != destSchema.map(_.name).toSet) {
56 | throw new IllegalStateException(s"Schema $schema transformed from input source is different" +
57 | s" from schema $destSchema for the destination table")
58 | }
59 |
60 | new HiveStreamWriter(schema.map(_.name), destTable.partitionColumnNames, dataSourceOptions)
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/hortonworks/spark/hive/HiveStreamDataWriter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive
19 |
20 | import java.util.{List => JList, Map => JMap}
21 | import java.util.concurrent.{Executors, TimeUnit}
22 |
23 | import scala.collection.JavaConverters._
24 | import scala.collection.mutable
25 |
26 | import org.apache.spark.sql.Row
27 | import org.apache.spark.sql.sources.v2.DataSourceOptions
28 | import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage}
29 | import org.json4s.{DefaultFormats, Extraction}
30 | import org.json4s.jackson.JsonMethods._
31 |
32 | import com.hortonworks.spark.hive.common.{CachedHiveWriters, CachedKey, HiveOptions, HiveWriter}
33 | import com.hortonworks.spark.hive.utils.Logging
34 |
35 | case object HiveStreamWriterCommitMessage extends WriterCommitMessage
36 |
37 | class HiveStreamDataWriter(
38 | partitionId: Int,
39 | attemptNumber: Int,
40 | columnName: Seq[String],
41 | partitionCols: Seq[String],
42 | dataSourceOptionsMap: JMap[String, String],
43 | initClassLoader: ClassLoader,
44 | isolatedClassLoader: ClassLoader) extends DataWriter[Row] with Logging {
45 |
46 | private implicit def formats = DefaultFormats
47 |
48 | private val hiveOptions =
49 | HiveOptions.fromDataSourceOptions(new DataSourceOptions(dataSourceOptionsMap))
50 | private val ugi = hiveOptions.getUGI()
51 |
52 | private val inUseWriters = new mutable.HashMap[CachedKey, HiveWriter]()
53 |
54 | private val executorService = Executors.newSingleThreadScheduledExecutor()
55 |   executorService.scheduleAtFixedRate(new Runnable {
56 |     override def run(): Unit = {
57 |       // The scheduler thread also needs the isolated classloader for Hive classes.
58 |       Thread.currentThread().setContextClassLoader(isolatedClassLoader)
59 |       inUseWriters.foreach(_._2.heartbeat())
60 |     }
61 |   }, 10L, 10L, TimeUnit.SECONDS)
62 |
63 | private def withClassLoader[T](func: => T): T = {
64 | try {
65 | Thread.currentThread().setContextClassLoader(isolatedClassLoader)
66 | func
67 | } finally {
68 | Thread.currentThread().setContextClassLoader(initClassLoader)
69 | }
70 | }
71 |
72 | override def write(row: Row): Unit = withClassLoader {
73 | val partitionValues = partitionCols.map { col => stringfyField(row.get(row.fieldIndex(col))) }
74 | val hiveEndPoint =
75 | Class.forName("org.apache.hive.hcatalog.streaming.HiveEndPoint", true, isolatedClassLoader)
76 | .getConstructor(classOf[String], classOf[String], classOf[String], classOf[JList[String]])
77 | .newInstance(
78 | hiveOptions.metastoreUri, hiveOptions.dbName, hiveOptions.tableName, partitionValues.asJava)
79 | .asInstanceOf[Object]
80 |
81 | val key = CachedKey(
82 | hiveOptions.metastoreUri, hiveOptions.dbName, hiveOptions.tableName, partitionValues)
83 |
84 | def getNewWriter(): HiveWriter = {
85 | val writer = CachedHiveWriters.getOrCreate(
86 | key, hiveEndPoint, hiveOptions, ugi, isolatedClassLoader)
87 | writer.beginTransaction()
88 | writer
89 | }
90 | val writer = inUseWriters.getOrElseUpdate(key, {
91 | logDebug(s"writer for $key not found in local cache")
92 | getNewWriter()
93 | })
94 |
95 | val jRow = Extraction.decompose(rowToMap(columnName, row))
96 | val jString = compact(render(jRow))
97 |
98 | logDebug(s"Write JSON row ${pretty(render(jRow))} into Hive Streaming")
99 | writer.write(jString.getBytes("UTF-8"))
100 |
101 | if (writer.totalRecords() >= hiveOptions.batchSize) {
102 | writer.commitTransaction()
103 | writer.beginTransaction()
104 | }
105 | }
106 |
107 | override def abort(): Unit = withClassLoader {
108 | inUseWriters.foreach { case (key, writer) =>
109 | writer.abortTransaction()
110 | CachedHiveWriters.recycle(writer)
111 | logDebug(s"Recycle writer $writer for $key to global cache")
112 | }
113 | inUseWriters.clear()
114 | executorService.shutdown()
115 | }
116 |
117 | override def commit(): WriterCommitMessage = withClassLoader {
118 | inUseWriters.foreach { case (key, writer) =>
119 | writer.commitTransaction()
120 | CachedHiveWriters.recycle(writer)
121 | logDebug(s"Recycle writer $writer for $key to global cache")
122 | }
123 | inUseWriters.clear()
124 | executorService.shutdown()
125 |
126 | HiveStreamWriterCommitMessage
127 | }
128 |
129 | private def stringfyField(col: Any): String = {
130 | col match {
131 | case _: Array[Byte] =>
132 | throw new UnsupportedOperationException("Cannot convert partition column with BinaryType " +
133 | "to String")
134 | case _: Seq[_] =>
135 | throw new UnsupportedOperationException("Cannot convert partition column with ArrayType " +
136 | "to String")
137 | case _: Map[_, _] =>
138 | throw new UnsupportedOperationException("Cannot convert partition column with MapType " +
139 | "to String")
140 | case _: Row =>
141 | throw new UnsupportedOperationException("Cannot convert partition column with StructType " +
142 | "to String")
143 | case i => i.toString
144 | }
145 | }
146 |
147 | private def rowToMap(columnName: Seq[String], row: Row): Map[String, Any] = {
148 | columnName.map { col =>
149 | val field = row.get(row.fieldIndex(col)) match {
150 | case b: java.math.BigDecimal => new BigDecimal(b)
151 | case r: Row => rowToMap(r.schema.map(_.name), r)
152 | case e => e
153 | }
154 | col -> field
155 | }.toMap
156 | }
157 | }
--------------------------------------------------------------------------------
/core/src/main/scala/com/hortonworks/spark/hive/HiveStreamWriter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive
19 |
20 | import java.util.{Map => JMap}
21 |
22 | import org.apache.spark.sql.Row
23 | import org.apache.spark.sql.sources.v2.DataSourceOptions
24 | import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage}
25 | import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
26 |
27 | import com.hortonworks.spark.hive.utils.HiveIsolatedClassLoader
28 |
29 | class HiveStreamWriter(
30 | columnNames: Seq[String],
31 | partitionCols: Seq[String],
32 | dataSourceOptions: DataSourceOptions)
33 | extends StreamWriter {
34 |
35 | override def createWriterFactory(): DataWriterFactory[Row] = {
36 | new HiveStreamDataWriterFactory(columnNames, partitionCols, dataSourceOptions.asMap())
37 | }
38 |
39 | override def commit(epochId: Long, writerCommitMessages: Array[WriterCommitMessage]): Unit = {}
40 |
41 | override def abort(epochId: Long, writerCommitMessages: Array[WriterCommitMessage]): Unit = {}
42 | }
43 |
44 | class HiveStreamDataWriterFactory(
45 | columnName: Seq[String],
46 | partitionCols: Seq[String],
47 | dataSourceOptionsMap: JMap[String, String]) extends DataWriterFactory[Row] {
48 |
49 | override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = {
50 | val restoredClassLoader = Thread.currentThread().getContextClassLoader
51 | val currentClassLoader = HiveIsolatedClassLoader.isolatedClassLoader()
52 | try {
53 | Thread.currentThread().setContextClassLoader(currentClassLoader)
54 |
55 | currentClassLoader.loadClass(classOf[HiveStreamDataWriter].getName)
56 | .getConstructors.head
57 | .newInstance(partitionId: java.lang.Integer, attemptNumber: java.lang.Integer,
58 | columnName, partitionCols, dataSourceOptionsMap, restoredClassLoader, currentClassLoader)
59 | .asInstanceOf[DataWriter[Row]]
60 | } finally {
61 | Thread.currentThread().setContextClassLoader(restoredClassLoader)
62 | }
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/hortonworks/spark/hive/common/CachedHiveWriters.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive.common
19 |
20 | import java.util.concurrent.{Executors, TimeUnit}
21 | import javax.annotation.Nullable
22 |
23 | import scala.collection.mutable
24 | import scala.util.Try
25 | import scala.util.control.NonFatal
26 |
27 | import org.apache.hadoop.security.UserGroupInformation
28 |
29 | import com.hortonworks.spark.hive.utils.Logging
30 |
31 | case class CachedKey(metastoreUri: String, db: String, table: String, partitionCols: Seq[String])
32 |
33 | object CachedHiveWriters extends Logging {
34 |
35 | private val cacheExpireTimeout: Long = TimeUnit.MINUTES.toMillis(10)
36 |
37 | private val cache = new mutable.HashMap[CachedKey, mutable.Queue[HiveWriter]]()
38 |
39 | private val executorService = Executors.newSingleThreadScheduledExecutor()
40 | executorService.scheduleAtFixedRate(new Runnable {
41 | override def run(): Unit = {
42 | expireOldestWriters()
43 | }
44 | }, 10L, 10L, TimeUnit.MINUTES)
45 |
46 | Runtime.getRuntime.addShutdownHook(new Thread {
47 | override def run(): Unit = {
48 | try {
49 | clear()
50 | executorService.shutdown()
51 | } catch {
52 | case NonFatal(_) => // swallow exceptions
53 | }
54 | }
55 | })
56 |
57 | def getOrCreate(
58 | key: CachedKey,
59 | hiveEndPoint: Object,
60 | hiveOptions: HiveOptions,
61 | @Nullable ugi: UserGroupInformation,
62 | isolatedClassLoader: ClassLoader): HiveWriter = {
63 | val writer = CachedHiveWriters.synchronized {
64 | val queue = cache.getOrElseUpdate(key, new mutable.Queue[HiveWriter]())
65 | if (queue.isEmpty) {
66 | None
67 | } else {
68 | logDebug(s"Found writer for $key in global cache")
69 | Some(queue.dequeue())
70 | }
71 | }
72 |
73 | writer.getOrElse(new HiveWriter(key, hiveEndPoint, hiveOptions, ugi, isolatedClassLoader))
74 | }
75 |
76 | def recycle(hiveWriter: HiveWriter): Unit = {
77 | CachedHiveWriters.synchronized {
78 | cache.getOrElseUpdate(hiveWriter.key, new mutable.Queue[HiveWriter]())
79 | .enqueue(hiveWriter)
80 | }
81 | }
82 |
83 | private def expireOldestWriters(): Unit = {
84 | val currentTime = System.currentTimeMillis()
85 | val expiredWriters = new mutable.ArrayBuffer[HiveWriter]()
86 |
87 | CachedHiveWriters.synchronized {
88 | val emptyKeys = cache.filter { case (_, queue) =>
89 |       // Writers are queued in recycle order, so stop at the first unexpired one.
90 |       while (queue.nonEmpty &&
91 |           queue.head.lastUsed() + cacheExpireTimeout < currentTime) {
92 |         expiredWriters.append(queue.dequeue())
93 |       }
94 | queue.isEmpty
95 | }.keySet
96 |
97 | emptyKeys.foreach { k => cache.remove(k) }
98 | }
99 |
100 | expiredWriters.foreach { w =>
101 | if (Try { w.close() }.isFailure) {
102 | logWarn("Failed to close writer")
103 | } else {
104 | logInfo(s"Closed expired writer $w")
105 | }
106 | }
107 | }
108 |
109 | private def clear(): Unit = {
110 | val unusedWriters = new mutable.ArrayBuffer[HiveWriter]()
111 |
112 | CachedHiveWriters.synchronized {
113 | cache.foreach { case (_, queue) =>
114 | queue.foreach(unusedWriters.append(_))
115 | }
116 | cache.clear()
117 | }
118 |
119 | unusedWriters.foreach { w =>
120 | if (Try { w.close() }.isFailure) {
121 | logWarn("Failed to close writer")
122 | } else {
123 | logInfo(s"Closed writer $w")
124 | }
125 | }
126 | }
127 | }
--------------------------------------------------------------------------------
/core/src/main/scala/com/hortonworks/spark/hive/common/HiveOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive.common
19 |
20 | import java.io.File
21 | import javax.annotation.Nullable
22 |
23 | import org.apache.hadoop.security.UserGroupInformation
24 | import org.apache.spark.sql.sources.v2.DataSourceOptions
25 |
26 | import com.hortonworks.spark.hive.utils.Logging
27 |
28 | class HiveOptions private (
29 | val metastoreUri: String,
30 | val dbName: String,
31 | val tableName: String) extends Logging {
32 |
33 | var txnPerBatch = 100
34 | var batchSize = 10000
35 | var autoCreatePartitions = true
36 |
37 | private var principal: String = null
38 | private var keytab: String = null
39 |
40 | def withTxnPerBatch(txnPerBatch: Int): HiveOptions = {
41 | this.txnPerBatch = txnPerBatch
42 | this
43 | }
44 |
45 | def withAutoCreatePartitions(autoCreatePartitions: Boolean): HiveOptions = {
46 | this.autoCreatePartitions = autoCreatePartitions
47 | this
48 | }
49 |
50 | def withPrincipalAndKeytab(principal: String, keytab: String): HiveOptions = {
51 | this.principal = principal
52 | this.keytab = keytab
53 | this
54 | }
55 |
56 | def withBatchSize(batchSize: Int): HiveOptions = {
57 | this.batchSize = batchSize
58 | this
59 | }
60 |
61 | @Nullable
62 | def getUGI(): UserGroupInformation = {
63 | if (principal == null || keytab == null) {
64 | null.asInstanceOf[UserGroupInformation]
65 | } else {
66 | val kfile = new File(keytab)
67 | if (!(kfile.exists && kfile.canRead)) {
68 |         throw new IllegalArgumentException(s"keytab file $keytab does not exist or is unreadable")
69 | }
70 |
71 | val ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keytab)
72 | logInfo(s"UGI $ugi with principal $principal and keytab $keytab")
73 | ugi
74 | }
75 | }
76 | }
77 |
78 | object HiveOptions {
79 | // Key of hive options (case insensitive).
80 | val METASTORE_URI = "metastore"
81 | val DB_NAME = "db"
82 | val TABLE_NAME = "table"
83 | val TXN_PER_BATCH = "txn.per.batch"
84 | val AUTO_CREATE_PARTITIONS = "auto.create.partitions"
85 | val PRINCIPAL = "principal"
86 | val KEYTAB = "keytab"
87 | val BATCH_SIZE = "batch.size"
88 |
89 | def fromDataSourceOptions(dataSourceOptions: DataSourceOptions): HiveOptions = {
90 | val metastoreUri = dataSourceOptions.get(METASTORE_URI)
91 | if (!metastoreUri.isPresent) {
92 | throw new IllegalArgumentException("metastore URI must be specified")
93 | }
94 |
95 | val dbName = dataSourceOptions.get(DB_NAME)
96 | if (!dbName.isPresent) {
97 | throw new IllegalArgumentException("db name must be specified")
98 | }
99 |
100 | val tblName = dataSourceOptions.get(TABLE_NAME)
101 | if (!tblName.isPresent) {
102 | throw new IllegalArgumentException("table name must be specified")
103 | }
104 |
105 | val option = new HiveOptions(metastoreUri.get(), dbName.get(), tblName.get())
106 |
107 | option.withTxnPerBatch(dataSourceOptions.getInt(TXN_PER_BATCH, option.txnPerBatch))
108 | .withAutoCreatePartitions(
109 | dataSourceOptions.getBoolean(AUTO_CREATE_PARTITIONS, option.autoCreatePartitions))
110 | .withPrincipalAndKeytab(
111 | dataSourceOptions.get(PRINCIPAL).orElse(option.principal),
112 | dataSourceOptions.get(KEYTAB).orElse(option.keytab))
113 | .withBatchSize(dataSourceOptions.getInt(BATCH_SIZE, option.batchSize))
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/hortonworks/spark/hive/common/HiveWriter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive.common
19 |
20 | import java.security.PrivilegedExceptionAction
21 |
22 | import org.apache.hadoop.security.UserGroupInformation
23 |
24 | import com.hortonworks.spark.hive.utils.Logging
25 |
26 | class HiveWriter(
27 | val key: CachedKey,
28 | hiveEndPoint: Object,
29 | hiveOptions: HiveOptions,
30 | ugi: UserGroupInformation,
31 | isolatedClassLoader: ClassLoader) extends Logging {
32 |
33 | private val hiveConf =
34 | Class.forName("org.apache.hadoop.hive.conf.HiveConf", true, isolatedClassLoader)
35 | .newInstance()
36 | .asInstanceOf[Object]
37 | private val txnTimeout = 300 * 1000L
38 |
39 | private val connection = hiveEndPoint.getClass.getMethod(
40 | "newConnection",
41 | classOf[Boolean],
42 | Class.forName("org.apache.hadoop.hive.conf.HiveConf", true, isolatedClassLoader),
43 | classOf[UserGroupInformation])
44 | .invoke(hiveEndPoint, hiveOptions.autoCreatePartitions: java.lang.Boolean, hiveConf, ugi)
45 |
46 | private val writer = if (ugi == null) {
47 | createWriter()
48 | } else {
49 | ugi.doAs(new PrivilegedExceptionAction[Object] {
50 | override def run(): Object = {
51 | createWriter()
52 | }
53 | })
54 | }
55 |
56 | private var txnBatch: Object = null
57 |
58 | // Timestamp to track the activity of this HiveWriter
59 | private var _lastUsed: Long = System.currentTimeMillis()
60 |
61 | // Timestamp to track the last creation time of transaction batch
62 | private var _lastCreated = System.currentTimeMillis()
63 |
64 | // Track the number of records written in this batch
65 | private var _totalRecords = 0
66 |
67 | private var isTransactionBegin = false
68 |
69 | // TODO. for now we only support to write JSON String to Hive Streaming.
70 | private def createWriter(): Object = {
71 | Class.forName("org.apache.hive.hcatalog.streaming.StrictJsonWriter", true, isolatedClassLoader)
72 | .getConstructor(
73 | Class.forName("org.apache.hive.hcatalog.streaming.HiveEndPoint", true, isolatedClassLoader),
74 | Class.forName("org.apache.hadoop.hive.conf.HiveConf", true, isolatedClassLoader))
75 | .newInstance(hiveEndPoint, hiveConf)
76 | .asInstanceOf[Object]
77 | }
78 |
79 | def beginTransaction(): Unit = {
80 | if (txnBatch != null && call[Int](txnBatch, "remainingTransactions") == 0) {
81 | call[Unit](txnBatch, "close")
82 | txnBatch = null
83 | }
84 |
85 | if (txnBatch == null) {
86 | txnBatch = call[Object](connection, "fetchTransactionBatch",
87 | Seq(classOf[Int], Class.forName(
88 | "org.apache.hive.hcatalog.streaming.RecordWriter", true, isolatedClassLoader)),
89 | Seq(hiveOptions.txnPerBatch: java.lang.Integer, writer))
90 | _lastCreated = System.currentTimeMillis()
91 | }
92 |
93 | call[Unit](txnBatch, "beginNextTransaction")
94 | isTransactionBegin = true
95 | _totalRecords = 0
96 |
97 | logDebug(s"Switch to next transaction for $hiveEndPoint")
98 | }
99 |
100 | def write(record: Array[Byte]): Unit = {
101 | require(txnBatch != null, "current transaction is not initialized before writing")
102 |     require(isTransactionBegin, "transaction has not begun before writing")
103 |
104 | call[Unit](txnBatch, "write", Seq(classOf[Array[Byte]]), Seq(record))
105 | _totalRecords += 1
106 | }
107 |
108 | def commitTransaction(): Unit = {
109 | require(txnBatch != null, "current transaction is not initialized before committing")
110 |     require(isTransactionBegin, "transaction has not begun before committing")
111 |
112 | call[Unit](txnBatch, "commit")
113 |
114 | _lastUsed = System.currentTimeMillis()
115 | isTransactionBegin = false
116 | _totalRecords = 0
117 | }
118 |
119 | def abortTransaction(): Unit = {
120 | isTransactionBegin = false
121 | _totalRecords = 0
122 |
123 | if (txnBatch != null) {
124 | call[Unit](txnBatch, "abort")
125 | }
126 | }
127 |
128 | def close(): Unit = {
129 | isTransactionBegin = false
130 | _totalRecords = 0
131 |
132 | if (txnBatch != null) {
133 | call[Unit](txnBatch, "commit")
134 | call[Unit](txnBatch, "close")
135 | }
136 |
137 | call[Unit](connection, "close")
138 | }
139 |
140 | def lastUsed(): Long = _lastUsed
141 |
142 | def totalRecords(): Int = _totalRecords
143 |
144 | def heartbeat(): Unit = {
145 | if (System.currentTimeMillis() - _lastCreated > txnTimeout / 2) {
146 | if (txnBatch != null) {
147 | call[Unit](txnBatch, "heartbeat")
148 | }
149 | }
150 | }
151 |
152 | private def call[T](
153 | obj: Object,
154 | method: String,
155 | types: Seq[Class[_]] = Seq.empty,
156 | params: Seq[Object] = Seq.empty): T = {
157 | val mtd = obj.getClass.getMethod(method, types: _*)
158 | mtd.setAccessible(true)
159 | mtd.invoke(obj, params: _*).asInstanceOf[T]
160 | }
161 | }
162 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/hortonworks/spark/hive/utils/HiveIsolatedClassLoader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive.utils
19 |
20 | import java.net.{URL, URLClassLoader}
21 | import java.util
22 |
23 | object HiveIsolatedClassLoader extends Logging {
24 |
25 | def isolatedClassLoader(): ClassLoader = {
26 | val parentClassLoader = Option(Thread.currentThread().getContextClassLoader)
27 | .getOrElse(getClass.getClassLoader)
28 |
29 | // Assume the task parent classloader is either Spark MutableURLClassLoader or
30 | // ExecutorClassLoader
31 | def getAddedURLs(classloader: ClassLoader): Array[URL] = {
32 | classloader match {
33 | case e if e.getClass.getName == "org.apache.spark.repl.ExecutorClassLoader" =>
34 | val method = e.getClass.getMethod("parentLoader")
35 | method.setAccessible(true)
36 | val parent = method.invoke(e).asInstanceOf[ClassLoader].getParent
37 | getAddedURLs(parent)
38 |
39 | case e if e.getClass.getName == "org.apache.spark.util.ChildFirstURLClassLoader" =>
40 | val method = e.getClass.getMethod("parentClassLoader")
41 | method.setAccessible(true)
42 | val parent = method.invoke(e).asInstanceOf[ClassLoader].getParent
43 | getAddedURLs(parent)
44 |
45 | case e if e.getClass.getName == "org.apache.spark.util.MutableURLClassLoader" =>
46 | val method = e.getClass.getMethod("getURLs")
47 | method.setAccessible(true)
48 | method.invoke(e).asInstanceOf[Array[URL]]
49 |
50 | case e: ClassLoader =>
51 | Option(e.getParent).map { getAddedURLs(_) }.getOrElse(
52 | throw new IllegalStateException("Get unexpected classloader"))
53 |
54 | case u =>
55 | throw new IllegalStateException(s"Get unexpected object, $u")
56 | }
57 | }
58 |
59 | val urls = getAddedURLs(parentClassLoader)
60 | new HiveIsolatedClassLoader(urls, parentClassLoader)
61 | }
62 | }
63 |
64 | class HiveIsolatedClassLoader(urls: Array[URL], baseClassLoader: ClassLoader)
65 | extends URLClassLoader(urls, ClassLoader.getSystemClassLoader.getParent.getParent)
66 | with Logging {
67 |
68 | override def loadClass(name: String, resolve: Boolean): Class[_] = {
69 | val loaded = findLoadedClass(name)
70 | if (loaded == null) doLoadClass(name, resolve) else loaded
71 | }
72 |
73 | override def getResource(name: String): URL = {
74 | baseClassLoader.getResource(name)
75 | }
76 |
77 | override def getResources(name: String): util.Enumeration[URL] = {
78 | baseClassLoader.getResources(name)
79 | }
80 |
81 | def doLoadClass(name: String, resolve: Boolean): Class[_] = {
82 | if (isHiveClass(name)) {
83 | logTrace(s"hive class: $name - ${super.getResource(classToPath(name))}")
84 | super.loadClass(name, resolve)
85 | } else {
86 | try {
87 | baseClassLoader.loadClass(name)
88 | } catch {
89 | case _: ClassNotFoundException =>
90 | super.loadClass(name, resolve)
91 | }
92 | }
93 | }
94 |
95 | private def isHiveClass(name: String): Boolean = {
96 | name.startsWith("org.apache.hadoop.hive.") ||
97 | name.startsWith("org.apache.hive.") ||
98 | name.startsWith("org.apache.orc.")
99 | }
100 |
101 | private def classToPath(name: String): String =
102 | name.replaceAll("\\.", "/") + ".class"
103 | }
104 |
--------------------------------------------------------------------------------
/core/src/main/scala/com/hortonworks/spark/hive/utils/Logging.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive.utils
19 |
20 | import org.slf4j.LoggerFactory
21 |
22 | trait Logging {
23 | lazy val logger = LoggerFactory.getLogger(this.getClass)
24 |
25 | def logTrace(message: => Any): Unit = {
26 | if (logger.isTraceEnabled) {
27 | logger.trace(message.toString)
28 | }
29 | }
30 |
31 | def logDebug(message: => Any): Unit = {
32 | if (logger.isDebugEnabled) {
33 | logger.debug(message.toString)
34 | }
35 | }
36 |
37 | def logInfo(message: => Any): Unit = {
38 | if (logger.isInfoEnabled) {
39 | logger.info(message.toString)
40 | }
41 | }
42 |
43 | def logWarn(message: => Any): Unit = {
44 | logger.warn(message.toString)
45 | }
46 |
47 | def logWarn(message: => Any, t: Throwable): Unit = {
48 | logger.warn(message.toString, t)
49 | }
50 |
51 | def logError(message: => Any, t: Throwable): Unit = {
52 | logger.error(message.toString, t)
53 | }
54 |
55 | def logError(message: => Any): Unit = {
56 | logger.error(message.toString)
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/example/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
21 | 4.0.0
22 |
23 |
24 | com.hortonworks.spark
25 | spark-hive-streaming-sink-main_2.11
26 | 0.1.0-SNAPSHOT
27 | ../pom.xml
28 |
29 |
30 | spark-hive-streaming-sink-example_2.11
31 | 0.1.0-SNAPSHOT
32 | jar
33 |
34 |
35 |
36 | com.hortonworks.spark
37 | spark-hive-streaming-sink_2.11
38 | 0.1.0-SNAPSHOT
39 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/example/src/main/scala/com/hortonworks/spark/hive/example/HiveStreamingExample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive.example
19 |
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.sql.SparkSession
22 |
23 | /**
24 | * A Hive Streaming example to ingest data from socket and push into hive table.
25 | *
26 | * Assumed HIVE table Schema:
27 | * create table alerts ( id int , msg string )
28 | * partitioned by (continent string, country string)
29 | * clustered by (id) into 5 buckets
30 | * stored as orc tblproperties("transactional"="true");
31 | */
32 | object HiveStreamingExample {
33 |
34 | def main(args: Array[String]): Unit = {
35 | if (args.length < 3 || args.length > 5) {
36 | // scalastyle:off println
37 |       System.err.println(s"Usage: HiveStreamingExample <host> <port> " +
38 |         s"<metastore uri> [principal] [keytab]")
39 | // scalastyle:on println
40 | System.exit(1)
41 | }
42 |
43 | val host = args(0)
44 | val port = args(1)
45 | val metastoreUri = args(2)
46 |
47 | val (principal, keytab) = if (args.length == 5) {
48 | (args(3), args(4))
49 | } else {
50 | (null, null)
51 | }
52 |
53 | val sparkConf = new SparkConf()
54 | .set("spark.sql.streaming.checkpointLocation", "./checkpoint")
55 | val sparkSession = SparkSession.builder()
56 | .appName("HiveStreamingExample")
57 | .config(sparkConf)
58 | .enableHiveSupport()
59 | .getOrCreate()
60 |
61 | import sparkSession.implicits._
62 |
63 | val socket = sparkSession.readStream
64 | .format("socket")
65 | .options(Map("host" -> host, "port" -> port))
66 | .load()
67 | .as[String]
68 |
69 | val writer = socket.map { s =>
70 | val records = s.split(",")
71 | assert(records.length >= 4)
72 | (records(0).toInt, records(1), records(2), records(3))
73 | }
74 | .selectExpr("_1 as id", "_2 as msg", "_3 as continent", "_4 as country")
75 | .writeStream
76 | .format("hive-streaming")
77 | .option("metastore", metastoreUri)
78 | .option("db", "default")
79 | .option("table", "alerts")
80 |
81 | if (principal != null && keytab != null) {
82 | writer.option("principal", principal)
83 | .option("keytab", keytab)
84 | }
85 |
86 | val query = writer.start()
87 | query.awaitTermination()
88 |
89 | query.stop()
90 | sparkSession.stop()
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/example/src/main/scala/com/hortonworks/spark/hive/example/RateStreamingExample.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package com.hortonworks.spark.hive.example
19 |
20 | import org.apache.spark.SparkConf
21 | import org.apache.spark.sql.SparkSession
22 | import org.apache.spark.sql.streaming.Trigger
23 |
24 | /**
25 | * A Hive Streaming example to ingest data from rate stream and push into hive table.
26 | *
27 | * Assumed HIVE table Schema:
28 | * create table rate (value bigint)
29 | * clustered by (value) into 5 buckets
30 | * stored as orc tblproperties("transactional"="true");
31 | */
32 | object RateStreamingExample {
33 |
34 | def main(args: Array[String]): Unit = {
35 | if (args.length < 1 || args.length > 4) {
36 | // scalastyle:off println
37 |     System.err.println(s"Usage: RateStreamingExample <metastore uri> " +
38 |       s"[principal] [keytab] [continuous?]")
39 | // scalastyle:on println
40 | System.exit(1)
41 | }
42 |
43 | val metastoreUri = args(0)
44 | val continuous = if (args.length == 2) {
45 | args(1) == "continuous"
46 | } else if (args.length == 4) {
47 | args(3) == "continuous"
48 | } else {
49 | false
50 | }
51 |
52 | val principal = if (args.length >= 3) args(1) else null
53 | val keytab = if (args.length >= 3) args(2) else null
54 |
55 | val sparkConf = new SparkConf()
56 | .set("spark.sql.streaming.checkpointLocation", "./checkpoint")
57 | val sparkSession = SparkSession.builder()
58 | .appName("RateStreamingExample")
59 | .config(sparkConf)
60 | .enableHiveSupport()
61 | .getOrCreate()
62 |
63 | val rate = sparkSession.readStream
64 | .format("rate")
65 | .option("rowsPerSecond", "1")
66 | .load()
67 |
68 | val writer = rate.select("value")
69 | .writeStream
70 | .format("hive-streaming")
71 | .option("metastore", metastoreUri)
72 | .option("db", "default")
73 | .option("table", "rate")
74 |
75 | if (principal != null && keytab != null) {
76 | writer.option("principal", principal)
77 | writer.option("keytab", keytab)
78 | }
79 |
80 | val query = writer
81 | .trigger(if (continuous) Trigger.Continuous(3000L) else Trigger.ProcessingTime(3000L))
82 | .start()
83 |
84 | query.awaitTermination()
85 |
86 | query.stop()
87 | sparkSession.stop()
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
18 |
21 | 4.0.0
22 |
23 | com.hortonworks.spark
24 | spark-hive-streaming-sink-main_2.11
25 | 0.1.0-SNAPSHOT
26 | pom
27 |
28 |
29 | 2.3.0
30 | 1.2.1
31 | 2.11.8
32 | 2.11
33 | 512m
34 | 512m
35 | 1.8
36 | 1.8
37 | 1.8
38 | true
39 | 2.2.4
40 | 1.9.5
41 |
42 |
43 |
44 | core
45 | example
46 |
47 |
48 |
49 |
50 |
51 | hortonworks-repo
52 | Hortonworks Repository
53 | http://repo.hortonworks.com/content/repositories/releases/
54 |
55 | true
56 |
57 |
58 | false
59 |
60 |
61 |
62 |
63 | hortonworks-snapshot-repo
64 | Hortonworks Snapshot Repository
65 | http://nexus-private.hortonworks.com/nexus/content/groups/public/
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 | org.apache.spark
74 | spark-sql_${scala.binary.version}
75 | ${spark.version}
76 | provided
77 |
78 |
79 |
80 | org.apache.spark
81 | spark-catalyst_${scala.binary.version}
82 | ${spark.version}
83 | provided
84 |
85 |
86 |
87 | org.scalacheck
88 | scalacheck_${scala.binary.version}
89 | 1.12.5
90 |
91 |
92 |
93 | org.apache.hive.hcatalog
94 | hive-hcatalog-streaming
95 | ${hive.version}
96 |
97 |
98 | org.slf4j
99 | slf4j-log4j12
100 |
101 |
102 | org.apache.hadoop
103 | *
104 |
105 |
106 | org.apache.hbase
107 | *
108 |
109 |
110 | org.apache.hive
111 | hive-llap-tez
112 |
113 |
114 | org.apache.hive
115 | hive-llap-server
116 |
117 |
118 |
119 |
120 |
121 | org.apache.hive.hcatalog
122 | hive-hcatalog-core
123 | ${hive.version}
124 |
125 |
126 | org.slf4j
127 | slf4j-log4j12
128 |
129 |
130 | org.apache.hadoop
131 | *
132 |
133 |
134 |
135 |
136 |
137 | log4j
138 | log4j
139 | 1.2.16
140 | provided
141 |
142 |
143 |
144 | org.scalatest
145 | scalatest_${scala.binary.version}
146 | ${scalatest.version}
147 | test
148 |
149 |
150 |
151 | org.mockito
152 | mockito-all
153 | ${mockito.version}
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 | org.apache.maven.plugins
164 | maven-enforcer-plugin
165 | 1.4.1
166 |
167 |
168 | enforce-versions
169 |
170 | enforce
171 |
172 |
173 |
174 |
175 | 1.8
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 | org.apache.maven.plugins
185 | maven-compiler-plugin
186 | 3.3
187 |
188 |
189 |
190 | org.apache.maven.plugins
191 | maven-antrun-plugin
192 | 1.8
193 |
194 |
195 |
196 | org.apache.maven.plugins
197 | maven-deploy-plugin
198 | 2.8.2
199 |
200 |
201 |
202 | org.codehaus.mojo
203 | build-helper-maven-plugin
204 | 1.10
205 |
206 |
207 |
208 | net.alchim31.maven
209 | scala-maven-plugin
210 | 3.2.2
211 |
212 |
213 |
214 | compile
215 | testCompile
216 |
217 |
218 |
219 |
220 | ${scala.version}
221 | incremental
222 | true
223 | false
224 |
225 | -unchecked
226 | -deprecation
227 | -feature
228 |
229 |
230 | -source
231 | ${java.version}
232 | -target
233 | -Xlint:unchecked
234 |
235 |
236 | -Xms1024m
237 | -Xmx1024m
238 | -XX:MaxPermSize=${MaxPermGen}
239 | -XX:ReservedCodeCacheSize=${CodeCacheSize}
240 |
241 |
242 |
243 |
244 |
245 | org.apache.maven.plugins
246 | maven-install-plugin
247 | 2.5.2
248 |
249 |
250 |
251 | org.apache.maven.plugins
252 | maven-surefire-plugin
253 | 2.19
254 |
255 |
256 | true
257 | ${project.build.directory}/tmp
258 | ${project.version}
259 |
260 | ${test.redirectToFile}
261 | ${test.redirectToFile}
262 | -Xmx2g -XX:MaxPermSize=512m
263 | false
264 |
265 |
266 |
267 |
268 | org.scalatest
269 | scalatest-maven-plugin
270 | 1.0
271 |
272 |
273 | true
274 | ${project.build.directory}/tmp
275 |
276 | D
277 | ${project.build.directory}/surefire-reports
278 | .
279 | WDF TestSuite.txt
280 | -Xmx2g -XX:MaxPermSize=512m
281 |
282 |
283 |
284 | test
285 |
286 | test
287 |
288 |
289 |
290 |
291 |
292 |
293 | org.apache.maven.plugins
294 | maven-shade-plugin
295 | 3.1.0
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 | org.apache.maven.plugins
304 | maven-compiler-plugin
305 |
306 |
307 | default-compile
308 | none
309 |
310 |
311 | default-testCompile
312 | none
313 |
314 |
315 |
316 | ${java.version}
317 | ${java.version}
318 |
319 |
320 |
321 |
322 | org.apache.maven.plugins
323 | maven-enforcer-plugin
324 | false
325 |
326 |
327 |
328 | [3.0.0,)
329 |
330 |
331 | [${minJavaVersion}.0,${maxJavaVersion}.1000}]
332 |
333 |
334 | unix
335 |
336 |
337 |
338 |
339 |
340 | clean
341 |
342 | enforce
343 |
344 | pre-clean
345 |
346 |
347 | default
348 |
349 | enforce
350 |
351 | validate
352 |
353 |
354 | site
355 |
356 | enforce
357 |
358 | pre-site
359 |
360 |
361 |
362 |
363 |
364 | net.alchim31.maven
365 | scala-maven-plugin
366 |
367 |
368 |
369 | org.apache.maven.plugins
370 | maven-antrun-plugin
371 |
372 |
373 |
374 | pre-test-clean
375 | generate-test-resources
376 |
377 | run
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 | create-tmp-dir
389 | generate-test-resources
390 |
391 | run
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 | org.apache.maven.plugins
404 | maven-surefire-plugin
405 |
406 |
407 |
408 | org.scalatest
409 | scalatest-maven-plugin
410 |
411 |
412 |
413 | org.scalastyle
414 | scalastyle-maven-plugin
415 | 1.0.0
416 |
417 | false
418 | true
419 | false
420 | false
421 | ${basedir}/src/main/scala
422 | ${basedir}/src/test/scala
423 | scalastyle-config.xml
424 | ${basedir}/target/scalastyle-output.xml
425 | ${project.build.sourceEncoding}
426 | ${project.reporting.outputEncoding}
427 |
428 |
429 |
430 |
431 | check
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 | hortonworks
443 |
444 |
445 | hortonworks
446 |
447 |
448 |
449 | 3.0.0.3.0.0.2-SNAPSHOT
450 | 2.3.0.3.0.0.2-SNAPSHOT
451 |
452 |
453 |
454 |
455 |
456 |
457 |
--------------------------------------------------------------------------------
/scalastyle-config.xml:
--------------------------------------------------------------------------------
1 |
17 |
39 |
40 |
41 | Scalastyle standard configuration
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 | true
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 | ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW
126 |
127 |
128 |
129 |
130 |
131 | ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 | ^println$
141 |
145 |
146 |
147 |
148 | @VisibleForTesting
149 |
152 |
153 |
154 |
155 | mutable\.SynchronizedBuffer
156 |
164 |
165 |
166 |
167 | Class\.forName
168 |
175 |
176 |
177 |
178 | Await\.result
179 |
186 |
187 |
188 |
189 | Await\.ready
190 |
197 |
198 |
199 |
200 |
201 | JavaConversions
202 | Instead of importing implicits in scala.collection.JavaConversions._, import
203 | scala.collection.JavaConverters._ and use .asScala / .asJava methods
204 |
205 |
206 |
207 | org\.apache\.commons\.lang\.
208 | Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
209 | of Commons Lang 2 (package org.apache.commons.lang.*)
210 |
211 |
212 |
213 | extractOpt
214 | Use Utils.jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter
215 | is slower.
216 |
217 |
218 |
219 |
220 | java,scala,3rdParty,spark,hortonworks
221 | javax?\..*
222 | scala\..*
223 | (?!org\.apache\.spark\.).*
224 | org\.apache\.spark\..*
225 | com\.hortonworks\..*
226 |
227 |
228 |
229 |
230 |
231 | COMMA
232 |
233 |
234 |
235 |
236 |
237 | \)\{
238 |
241 |
242 |
243 |
244 | (?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]
245 | Use Javadoc style indentation for multiline comments
246 |
247 |
248 |
249 | case[^\n>]*=>\s*\{
250 | Omit braces in case clauses.
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 | 800>
304 |
305 |
306 |
307 |
308 | 30
309 |
310 |
311 |
312 |
313 | 10
314 |
315 |
316 |
317 |
318 | 50
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 | -1,0,1,2,3
330 |
331 |
332 |
333 |
--------------------------------------------------------------------------------