├── .gitignore
├── LICENSE
├── README.md
├── bin
    └── herringbone
├── herringbone-impala
    ├── pom.xml
    └── src
    │   └── main
    │       ├── scala
    │           └── com
    │           │   └── stripe
    │           │       └── herringbone
    │           │           └── impala
    │           │               ├── Connection.scala
    │           │               ├── Cursor.scala
    │           │               ├── Exceptions.scala
    │           │               ├── ImpalaClient.scala
    │           │               └── ImpalaValue.scala
    │       └── thrift
    │           ├── ImpalaService.thrift
    │           ├── Status.thrift
    │           ├── beeswax.thrift
    │           ├── cli_service.thrift
    │           ├── fb303.thrift
    │           └── hive_metastore.thrift
├── herringbone-main
    ├── pom.xml
    └── src
    │   ├── main
    │       ├── scala
    │       │   └── com
    │       │   │   └── stripe
    │       │   │       └── herringbone
    │       │   │           ├── CompactInputFormat.scala
    │       │   │           ├── CompactJob.scala
    │       │   │           ├── FlattenJob.scala
    │       │   │           ├── ParquetLoad.scala
    │       │   │           ├── TsvJob.scala
    │       │   │           ├── flatten
    │       │   │               ├── FlatConsumer.scala
    │       │   │               ├── FlatConverter.scala
    │       │   │               ├── ParquetFlatConf.scala
    │       │   │               ├── ParquetFlatMapper.scala
    │       │   │               └── TypeFlattener.scala
    │       │   │           ├── load
    │       │   │               ├── FieldUtils.scala
    │       │   │               ├── HadoopFs.scala
    │       │   │               ├── HiveLoader.scala
    │       │   │               ├── HiveServer2Connection.scala
    │       │   │               ├── ImpalaLoader.scala
    │       │   │               ├── ParquetLoadConf.scala
    │       │   │               └── ParquetLoader.scala
    │       │   │           └── util
    │       │   │               └── ParquetUtils.scala
    │       └── thrift
    │       │   ├── ImpalaService.thrift
    │       │   ├── Status.thrift
    │       │   ├── beeswax.thrift
    │       │   ├── cli_service.thrift
    │       │   ├── fb303.thrift
    │       │   └── hive_metastore.thrift
    │   └── test
    │       ├── resources
    │           └── test.parquet
    │       └── scala
    │           └── com
    │               └── stripe
    │                   └── herringbone
    │                       ├── FlattenJobTest.scala
    │                       ├── flatten
    │                           ├── FlatConverterTest.scala
    │                           └── TypeFlattenerTest.scala
    │                       └── load
    │                           └── FieldUtilsTest.scala
└── pom.xml


/.gitignore:
--------------------------------------------------------------------------------
 1 | target/
 2 | data/
 3 | .idea/
 4 | *.pyc
 5 | *.iml
 6 | # ignore ROC plots
 7 | *.pdf
 8 | .tddium*
 9 | 
10 | .DS_Store
11 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014- Stripe, Inc. (https://stripe.com)
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Herringbone
 2 | ===========
 3 | 
 4 | > _**Herringbone is deprecated and is no longer being actively maintained.**_
 5 | 
 6 | Herringbone is a suite of tools for working with parquet files on hdfs, and with impala and hive.
 7 | 
 8 | The available commands are:
 9 | 
10 | `flatten`: transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive (neither of which support nested schemas). Default output directory is `/path/to/input/directory-flat`.
11 | 
12 |     $ herringbone flatten -i /path/to/input/directory [-o /path/to/non/default/output/directory]
13 | 
14 | `load`: load a directory of parquet files (which must have a flat schema) into impala or hive (defaulting to impala). Use the --nocompute-stats option for faster loading into impala (but probably slower querying later on!)
15 | 
16 |     $ herringbone load [--hive] [-u] [--nocompute-stats] -d db_name -t table -p /path/to/parquet/directory
17 | 
18 | `tsv`: transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`). Default output directory is `/path/to/input/directory-tsv`.
19 | 
20 |     $ herringbone tsv -i /path/to/input/directory [-o /path/to/non/default/output/directory]
21 | 
22 | `compact`: transform a directory of parquet files into a directory of fewer larger parquet files. Default output directory is `/path/to/input/directory-compact`.
23 | 
24 |     $ herringbone compact -i /path/to/input/directory [-o /path/to/non/default/output/directory]
25 | 
26 | See `herringbone COMMAND --help` for more information on a specific command.
27 | 
28 | Building
29 | --------
30 | 
31 | You'll need thrift 0.9.1 on your path.
32 | 
33 |     $ git clone github.com/stripe/herringbone
34 |     $ cd herringbone
35 |     $ mvn package
36 | 
37 | Authors
38 | -------
39 | 
40 |  - [Avi Bryant](http://twitter.com/avibryant)
41 |  - [Danielle Sucher](http://twitter.com/daniellesucher)
42 |  - [Jeff Balogh](http://twitter.com/jbalogh)
43 | 


--------------------------------------------------------------------------------
/bin/herringbone:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env ruby
 2 | 
 3 | usage = <<-USAGE
 4 | Herringbone is a suite of tools for working with parquet files on hdfs.
 5 | 
 6 | The available commands are:
 7 | 
 8 | flatten: Transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive
 9 | 
10 | load: Load a directory of parquet files (which must have a flat schema) into impala or hive (defaults to impala). Use the --nocompute-stats option for faster loading into impala (but probably slower querying later on!)
11 | 
12 | tsv: Transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`)
13 | 
14 | compact: Transform a directory of parquet files into a directory of fewer larger parquet files
15 | 
16 | 
17 | Example usage:
18 | 
19 | `herringbone flatten -i /path/to/input/directory -o /path/to/output/directory`
20 | 
21 | `herringbone load [--hive] [-u] [--nocompute-stats] -d db_name -t table -p /path/to/parquet/directory`
22 | 
23 | `herringbone tsv -i /path/to/input/directory -o /path/to/output/directory`
24 | 
25 | `herringbone compact -i /path/to/input/directory -o /path/to/output/directory`
26 | 
27 | 
28 | See 'herringbone COMMAND --help' for more information on a specific command.
29 | 
30 | 
31 |   USAGE
32 | 
33 | command_jobs = {
34 |   'compact' => 'CompactJob',
35 |   'load' => 'ParquetLoad',
36 |   'flatten' => 'FlattenJob',
37 |   'tsv' => 'TsvJob',
38 | }
39 | 
40 | # Validate the given command and print usage if needed.
41 | command = ARGV.shift
42 | JOB = command_jobs[command]
43 | 
44 | if ['-h', '--help'].include?(command)
45 |   puts usage
46 |   exit 0
47 | elsif !JOB
48 |   STDERR.puts "\nError: #{command} is not an available command\n\n"
49 |   puts "#{'=' * 30}\n\n"
50 |   puts usage
51 |   exit 1
52 | end
53 | 
54 | jar_path = File.join(
55 |   File.dirname(__FILE__),
56 |   '../',
57 |   'herringbone-main',
58 |   'target',
59 |   'herringbone-0.0.1-jar-with-dependencies.jar'
60 | )
61 | JAR = File.expand_path(jar_path)
62 | 
63 | ENV["HADOOP_CLASSPATH"] = JAR
64 | ENV["HADOOP_USER_CLASSPATH_FIRST"] = "true"
65 | 
66 | exec(
67 |   "hadoop",
68 |   "jar",
69 |   JAR,
70 |   "com.stripe.herringbone.#{JOB}",
71 |   *ARGV
72 | )
73 | 


--------------------------------------------------------------------------------
/herringbone-impala/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 | 
  5 |   <groupId>com.stripe</groupId>
  6 |   <artifactId>herringbone-impala</artifactId>
  7 |   <version>0.0.2</version>
  8 |   <packaging>jar</packaging>
  9 | 
 10 |   <name>Herringbone Impala</name>
 11 | 
 12 |   <pluginRepositories>
 13 |     <pluginRepository>
 14 |       <id>dtrott</id>
 15 |       <url>https://maven.davidtrott.com/repository</url>
 16 |     </pluginRepository>
 17 |   </pluginRepositories>
 18 | 
 19 |   <build>
 20 |     <plugins>
 21 |       <plugin>
 22 |         <groupId>org.apache.maven.plugins</groupId>
 23 |         <artifactId>maven-compiler-plugin</artifactId>
 24 |         <version>3.1</version>
 25 |         <configuration>
 26 |           <source>1.6</source>
 27 |           <target>1.6</target>
 28 |         </configuration>
 29 |       </plugin>
 30 | 
 31 |       <plugin>
 32 |         <artifactId>maven-jar-plugin</artifactId>
 33 |         <version>2.3.1</version>
 34 |       </plugin>
 35 | 
 36 |       <plugin>
 37 |         <artifactId>maven-resources-plugin</artifactId>
 38 |         <version>2.4.3</version>
 39 |       </plugin>
 40 | 
 41 |       <plugin>
 42 |         <groupId>net.alchim31.maven</groupId>
 43 |         <artifactId>scala-maven-plugin</artifactId>
 44 |         <version>3.1.6</version>
 45 |         <executions>
 46 |           <execution>
 47 |             <goals>
 48 |               <goal>compile</goal>
 49 |               <goal>testCompile</goal>
 50 |             </goals>
 51 |           </execution>
 52 |         </executions>
 53 |       </plugin>
 54 | 
 55 |       <plugin>
 56 |         <groupId>org.apache.thrift.tools</groupId>
 57 |         <artifactId>maven-thrift-plugin</artifactId>
 58 |         <version>0.1.11</version>
 59 |         <configuration>
 60 |           <checkStaleness>true</checkStaleness>
 61 |           <thriftExecutable>thrift</thriftExecutable>
 62 |         </configuration>
 63 |         <executions>
 64 |           <execution>
 65 |             <id>thrift-sources</id>
 66 |             <phase>generate-sources</phase>
 67 |             <goals>
 68 |               <goal>compile</goal>
 69 |             </goals>
 70 |           </execution>
 71 |           <execution>
 72 |             <id>thrift-test-sources</id>
 73 |             <phase>generate-test-sources</phase>
 74 |             <goals>
 75 |               <goal>testCompile</goal>
 76 |             </goals>
 77 |           </execution>
 78 |         </executions>
 79 |       </plugin>
 80 | 
 81 |     </plugins>
 82 |   </build>
 83 | 
 84 |   <properties>
 85 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 86 |     <scala.version>2.10.4</scala.version>
 87 |     <maven.compiler.source>1.6</maven.compiler.source>
 88 |     <maven.compiler.target>1.6</maven.compiler.target>
 89 |   </properties>
 90 | 
 91 |   <repositories>
 92 |     <repository>
 93 |       <id>cloudera-releases</id>
 94 |       <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
 95 |       <releases>
 96 |         <enabled>true</enabled>
 97 |       </releases>
 98 |       <snapshots>
 99 |         <enabled>false</enabled>
100 |       </snapshots>
101 |     </repository>
102 |   </repositories>
103 | 
104 |   <dependencies>
105 |     <dependency>
106 |       <groupId>org.apache.thrift</groupId>
107 |       <artifactId>libthrift</artifactId>
108 |       <version>0.12.0</version>
109 |     </dependency>
110 |     <dependency>
111 |       <groupId>org.slf4j</groupId>
112 |       <artifactId>slf4j-log4j12</artifactId>
113 |       <version>1.5.2</version>
114 |     </dependency>
115 |   </dependencies>
116 | </project>
117 | 
118 | 
119 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Connection.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.impala
 2 | 
 3 | import org.apache.thrift.transport.TSocket
 4 | import org.apache.thrift.protocol.TBinaryProtocol
 5 | 
 6 | import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient}
 7 | import com.cloudera.beeswax.api._
 8 | 
 9 | import scala.annotation.tailrec
10 | import scala.collection.JavaConversions._
11 | 
12 | case class Connection(host: String, port: Int) {
13 |   var isOpen = false
14 |   val logContext = "herringbone-impala"
15 |   lazy val socket = new TSocket(host, port)
16 |   lazy val client = new ClouderaImpalaClient(new TBinaryProtocol(socket))
17 | 
18 |   open
19 | 
20 |   def open = {
21 |     if (!isOpen) {
22 |       socket.open
23 |       client.ResetCatalog
24 |       isOpen = true
25 |     }
26 |   }
27 | 
28 |   def close = {
29 |     if (isOpen) {
30 |       socket.close
31 |       isOpen = false
32 |     }
33 |   }
34 | 
35 |   // Refresh the metadata store.
36 |   def refresh = {
37 |     if (!isOpen) throw ConnectionException("Connection closed")
38 |     client.ResetCatalog
39 |   }
40 | 
41 |   // Perform a query, and pass in a function that will be called with each
42 |   // row of the results
43 |   def query(raw: String)(fn: Seq[ImpalaValue] => Unit) {
44 |     val cursor = execute(raw)
45 |     cursor.foreach { row => fn(row) }
46 |     cursor.close
47 |   }
48 | 
49 |   // Perform a query and return a cursor for iterating over the results.
50 |   // You probably want to call cursor.close when you're done with it.
51 |   def execute(raw: String): Cursor = {
52 |     if (!isOpen) throw ConnectionException("Connection closed")
53 |     validateQuery(raw)
54 | 
55 |     val query = new Query
56 |     query.query = raw
57 | 
58 |     val handle = client.executeAndWait(query, logContext)
59 |     Cursor(handle, client)
60 |   }
61 | 
62 |   private def validateQuery(raw: String) = {
63 |     val words = raw.split("\\s+")
64 |     if (words.isEmpty) throw InvalidQueryException("Empty query")
65 |   }
66 | }
67 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Cursor.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.impala
 2 | 
 3 | import org.apache.hadoop.hive.metastore.api.FieldSchema
 4 | 
 5 | import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient}
 6 | import com.cloudera.beeswax.api._
 7 | 
 8 | import scala.collection.mutable.ArrayBuffer
 9 | import scala.collection.JavaConversions._
10 | 
11 | case class Cursor(handle: QueryHandle, client: ClouderaImpalaClient) {
12 |   var done = false
13 |   var isOpen = true
14 |   var rowBuffer = ArrayBuffer.empty[Seq[ImpalaValue]]
15 |   val bufferSize = 1024
16 |   private lazy val metadata: ResultsMetadata = client.get_results_metadata(handle)
17 | 
18 |   def foreach(fn: Seq[ImpalaValue] => Unit) = {
19 |     var row = fetchRow
20 |     while (row.isDefined) {
21 |       fn(row.get)
22 |       row = fetchRow
23 |     }
24 |   }
25 | 
26 |   def fetchRow: Option[Seq[ImpalaValue]] = {
27 |     if (rowBuffer.isEmpty) {
28 |       if (done) {
29 |         None
30 |       } else {
31 |         fetchMore
32 |         fetchRow
33 |       }
34 |     } else {
35 |       val row = rowBuffer.head
36 |       rowBuffer = rowBuffer.tail
37 |       Some(row)
38 |     }
39 |   }
40 | 
41 |   // Close the cursor on the remote server. Once a cursor is closed, you
42 |   // can no longer fetch any rows from it.
43 |   def close = {
44 |     if (!isOpen) {
45 |       isOpen = false
46 |       client.close(handle)
47 |     }
48 |   }
49 | 
50 |   // Returns true if there are any more rows to fetch.
51 |   def hasMore = !done || !rowBuffer.isEmpty
52 | 
53 |   def runtime_profile = client.GetRuntimeProfile(handle)
54 | 
55 |   private def fetchMore = {
56 |     while (!done && rowBuffer.size < bufferSize) {
57 |       fetchBatch
58 |     }
59 |   }
60 | 
61 |   private def fetchBatch = {
62 |     if (!isOpen) throw CursorException("Cursor has expired or been closed")
63 | 
64 |     try {
65 |       val response = client.fetch(handle, false, bufferSize)
66 |       validateQueryState(client.get_state(handle))
67 | 
68 |       val rows  = response.data.map { row => parseRow(row) }
69 |       rowBuffer ++= rows
70 | 
71 |       if (!response.has_more) {
72 |         done = true
73 |         close
74 |       }
75 |     } catch {
76 |       case e: BeeswaxException => {
77 |         isOpen = false
78 |         throw e
79 |       }
80 |       case e: Exception => throw e
81 |     }
82 |   }
83 | 
84 |   private def parseRow(row: String) = {
85 |     val fields = row.split(metadata.delim)
86 | 
87 |     metadata.schema.getFieldSchemas.zip(fields).map { case(schema, rawValue) =>
88 |       ImpalaValue(rawValue, schema.getName, schema.getType)
89 |     }
90 |   }
91 | 
92 |   private def validateQueryState(state: QueryState) = {
93 |     if (state == QueryState.EXCEPTION) {
94 |       close
95 |       throw CursorException("The query was aborted")
96 |     }
97 |   }
98 | }
99 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Exceptions.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.impala
2 | 
3 | case class ConnectionException(message: String) extends Exception
4 | case class CursorException(message: String) extends Exception
5 | case class InvalidQueryException(message: String) extends Exception
6 | case class ParsingException(message: String) extends Exception
7 | 
8 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaClient.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.impala
 2 | 
 3 | case class ImpalaClient(host: String, port: Int) {
 4 |   lazy val connection = Connection(host, port)
 5 | 
 6 |   def execute(raw: String) {
 7 |     query(raw){ row =>
 8 |       println(row.map { _.raw }.mkString(" "))
 9 |     }
10 |   }
11 | 
12 |   def query(raw: String)(fn: Seq[ImpalaValue] => Unit) {
13 |     println(raw)
14 |     connection.query(raw){ row => fn(row) }
15 |   }
16 | }
17 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaValue.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.impala
 2 | 
 3 | import java.text.SimpleDateFormat
 4 | 
 5 | case class ImpalaValue(raw: String, fieldName: String, fieldType: String) {
 6 |   lazy val convertedValue = convertRawValue(raw)
 7 | 
 8 |   private def convertRawValue(raw: String): Option[Any] = {
 9 |     if (raw == "NULL") {
10 |       None
11 |     } else {
12 |       val converted = fieldType match {
13 |         case "string" => raw
14 |         case "boolean" => convertBoolean(raw)
15 |         case "tinyint" | "smallint" | "int" | "bigint" => raw.toInt
16 |         case "double" | "float" | "decimal" => raw.toDouble
17 |         case "timestamp" => convertTimestamp(raw)
18 |         case _ => throw ParsingException("Unknown type: " + fieldType)
19 |       }
20 |       Some(converted)
21 |     }
22 |   }
23 | 
24 |   private def convertBoolean(raw: String) = {
25 |     try {
26 |       raw.toBoolean
27 |     } catch {
28 |       case e: java.lang.IllegalArgumentException =>
29 |         throw ParsingException("Invalid value for boolean: " + raw)
30 |     }
31 |   }
32 | 
33 |   private def convertTimestamp(raw: String) = {
34 |     val formatStr = if (raw.indexOf(".") == -1) {
35 |       "YYYY-MM-DD HH:MM:SS"
36 |     } else {
37 |       "YYYY-MM-DD HH:MM:SS.sssssssss"
38 |     }
39 | 
40 |     val dateFormat = new SimpleDateFormat(formatStr)
41 |     dateFormat.parse(raw)
42 |   }
43 | }
44 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/ImpalaService.thrift:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 Cloudera Inc.
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | // http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | namespace cpp impala
 16 | namespace java com.cloudera.impala.thrift
 17 | namespace rb impala.protocol
 18 | 
 19 | include "Status.thrift"
 20 | include "beeswax.thrift"
 21 | include "cli_service.thrift"
 22 | 
 23 | // ImpalaService accepts query execution options through beeswax.Query.configuration in
 24 | // key:value form. For example, the list of strings could be:
 25 | //     "num_nodes:1", "abort_on_error:false"
 26 | // The valid keys are listed in this enum. They map to TQueryOptions.
 27 | // Note: If you add an option or change the default, you also need to update:
 28 | // - ImpalaInternalService.thrift: TQueryOptions
 29 | // - ImpaladClientExecutor.getBeeswaxQueryConfigurations()
 30 | // - ImpalaServer::SetQueryOptions()
 31 | // - ImpalaServer::TQueryOptionsToMap()
 32 | enum TImpalaQueryOptions {
 33 |   // if true, abort execution on the first error
 34 |   ABORT_ON_ERROR,
 35 | 
 36 |   // maximum # of errors to be reported; Unspecified or 0 indicates backend default
 37 |   MAX_ERRORS,
 38 | 
 39 |   // if true, disable llvm codegen
 40 |   DISABLE_CODEGEN,
 41 | 
 42 |   // batch size to be used by backend; Unspecified or a size of 0 indicates backend
 43 |   // default
 44 |   BATCH_SIZE,
 45 | 
 46 |   // a per-machine approximate limit on the memory consumption of this query;
 47 |   // unspecified or a limit of 0 means no limit;
 48 |   // otherwise specified either as:
 49 |   // a) an int (= number of bytes);
 50 |   // b) a float followed by "M" (MB) or "G" (GB)
 51 |   MEM_LIMIT,
 52 | 
 53 |   // specifies the degree of parallelism with which to execute the query;
 54 |   // 1: single-node execution
 55 |   // NUM_NODES_ALL: executes on all nodes that contain relevant data
 56 |   // NUM_NODES_ALL_RACKS: executes on one node per rack that holds relevant data
 57 |   // > 1: executes on at most that many nodes at any point in time (ie, there can be
 58 |   //      more nodes than numNodes with plan fragments for this query, but at most
 59 |   //      numNodes would be active at any point in time)
 60 |   // Constants (NUM_NODES_ALL, NUM_NODES_ALL_RACKS) are defined in JavaConstants.thrift.
 61 |   NUM_NODES,
 62 | 
 63 |   // maximum length of the scan range; only applicable to HDFS scan range; Unspecified or
 64 |   // a length of 0 indicates backend default;
 65 |   MAX_SCAN_RANGE_LENGTH,
 66 | 
 67 |   // Maximum number of io buffers (per disk)
 68 |   MAX_IO_BUFFERS,
 69 | 
 70 |   // Number of scanner threads.
 71 |   NUM_SCANNER_THREADS,
 72 | 
 73 |   // If true, Impala will try to execute on file formats that are not fully supported yet
 74 |   ALLOW_UNSUPPORTED_FORMATS,
 75 | 
 76 |   // if set and > -1, specifies the default limit applied to a top-level SELECT statement
 77 |   // with an ORDER BY but without a LIMIT clause (ie, if the SELECT statement also has
 78 |   // a LIMIT clause, this default is ignored)
 79 |   DEFAULT_ORDER_BY_LIMIT,
 80 | 
 81 |   // DEBUG ONLY:
 82 |   // If set to
 83 |   //   "[<backend number>:]<node id>:<TExecNodePhase>:<TDebugAction>",
 84 |   // the exec node with the given id will perform the specified action in the given
 85 |   // phase. If the optional backend number (starting from 0) is specified, only that
 86 |   // backend instance will perform the debug action, otherwise all backends will behave
 87 |   // in that way.
 88 |   // If the string doesn't have the required format or if any of its components is
 89 |   // invalid, the option is ignored.
 90 |   DEBUG_ACTION,
 91 | 
 92 |   // If true, raise an error when the DEFAULT_ORDER_BY_LIMIT has been reached.
 93 |   ABORT_ON_DEFAULT_LIMIT_EXCEEDED,
 94 | 
 95 |   // Compression codec for parquet when inserting into parquet tables.
 96 |   // Valid values are "snappy", "gzip" and "none"
 97 |   // Leave blank to use default.
 98 |   PARQUET_COMPRESSION_CODEC,
 99 | 
100 |   // HBase scan query option. If set and > 0, HBASE_CACHING is the value for
101 |   // "hbase.client.Scan.setCaching()" when querying HBase table. Otherwise, use backend
102 |   // default.
103 |   // If the value is too high, then the hbase region server will have a hard time (GC
104 |   // pressure and long response times). If the value is too small, then there will be
105 |   // extra trips to the hbase region server.
106 |   HBASE_CACHING,
107 | 
108 |   // HBase scan query option. If set, HBase scan will always set
109 |   // "hbase.client.setCacheBlocks" to CACHE_BLOCKS. Default is false.
110 |   // If the table is large and the query is doing big scan, set it to false to
111 |   // avoid polluting the cache in the hbase region server.
112 |   // If the table is small and the table is used several time, set it to true to improve
113 |   // performance.
114 |   HBASE_CACHE_BLOCKS,
115 | }
116 | 
117 | // The summary of an insert.
118 | struct TInsertResult {
119 |   // Number of appended rows per modified partition. Only applies to HDFS tables.
120 |   // The keys represent partitions to create, coded as k1=v1/k2=v2/k3=v3..., with the
121 |   // root in an unpartitioned table being the empty string.
122 |   1: required map<string, i64> rows_appended
123 | }
124 | 
125 | // Response from a call to PingImpalaService
126 | struct TPingImpalaServiceResp {
127 |   // The Impala service's version string.
128 |   1: string version
129 | }
130 | 
131 | // Parameters for a ResetTable request which will invalidate a table's metadata.
132 | // DEPRECATED.
133 | struct TResetTableReq {
134 |   // Name of the table's parent database.
135 |   1: required string db_name
136 | 
137 |   // Name of the table.
138 |   2: required string table_name
139 | }
140 | 
141 | // For all rpc that return a TStatus as part of their result type,
142 | // if the status_code field is set to anything other than OK, the contents
143 | // of the remainder of the result type is undefined (typically not set)
144 | service ImpalaService extends beeswax.BeeswaxService {
145 |   // Cancel execution of query. Returns RUNTIME_ERROR if query_id
146 |   // unknown.
147 |   // This terminates all threads running on behalf of this query at
148 |   // all nodes that were involved in the execution.
149 |   // Throws BeeswaxException if the query handle is invalid (this doesn't
150 |   // necessarily indicate an error: the query might have finished).
151 |   Status.TStatus Cancel(1:beeswax.QueryHandle query_id)
152 |       throws(1:beeswax.BeeswaxException error);
153 | 
154 |   // Invalidates all catalog metadata, forcing a reload
155 |   // DEPRECATED; execute query "invalidate metadata" to refresh metadata
156 |   Status.TStatus ResetCatalog();
157 | 
158 |   // Invalidates a specific table's catalog metadata, forcing a reload on the next access
159 |   // DEPRECATED; execute query "refresh <table>" to refresh metadata
160 |   Status.TStatus ResetTable(1:TResetTableReq request)
161 | 
162 |   // Returns the runtime profile string for the given query handle.
163 |   string GetRuntimeProfile(1:beeswax.QueryHandle query_id)
164 |       throws(1:beeswax.BeeswaxException error);
165 | 
166 |   // Closes the query handle and return the result summary of the insert.
167 |   TInsertResult CloseInsert(1:beeswax.QueryHandle handle)
168 |       throws(1:beeswax.QueryNotFoundException error, 2:beeswax.BeeswaxException error2);
169 | 
170 |   // Client calls this RPC to verify that the server is an ImpalaService. Returns the
171 |   // server version.
172 |   TPingImpalaServiceResp PingImpalaService();
173 | }
174 | 
175 | // Impala HiveServer2 service
176 | service ImpalaHiveServer2Service extends cli_service.TCLIService {
177 | }
178 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/Status.thrift:
--------------------------------------------------------------------------------
 1 | // Copyright 2012 Cloudera Inc.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | // http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | namespace cpp impala
16 | namespace java com.cloudera.impala.thrift
17 | namespace rb impala.protocol
18 | 
19 | enum TStatusCode {
20 |   OK,
21 |   CANCELLED,
22 |   ANALYSIS_ERROR,
23 |   NOT_IMPLEMENTED_ERROR,
24 |   RUNTIME_ERROR,
25 |   MEM_LIMIT_EXCEEDED,
26 |   INTERNAL_ERROR
27 | }
28 | 
29 | struct TStatus {
30 |   1: required TStatusCode status_code
31 |   2: list<string> error_msgs
32 | }
33 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/beeswax.thrift:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to Cloudera, Inc. under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Cloudera, Inc. licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  *
 18 |  * Interface for interacting with Beeswax Server
 19 |  */
 20 | 
 21 | namespace java com.cloudera.beeswax.api
 22 | namespace py beeswaxd
 23 | namespace cpp beeswax
 24 | namespace rb impala.protocol.beeswax
 25 | 
 26 | include "hive_metastore.thrift"
 27 | 
 28 | // A Query
 29 | struct Query {
 30 |   1: string query;
 31 |   // A list of HQL commands to execute before the query.
 32 |   // This is typically defining UDFs, setting settings, and loading resources.
 33 |   3: list<string> configuration;
 34 | 
 35 |   // User and groups to "act as" for purposes of Hadoop.
 36 |   4: string hadoop_user;
 37 | }
 38 | 
 39 | typedef string LogContextId
 40 | 
 41 | enum QueryState {
 42 |   CREATED,
 43 |   INITIALIZED,
 44 |   COMPILED,
 45 |   RUNNING,
 46 |   FINISHED,
 47 |   EXCEPTION
 48 | }
 49 | 
 50 | struct QueryHandle {
 51 |   1: string id;
 52 |   2: LogContextId log_context;
 53 | }
 54 | 
 55 | struct QueryExplanation {
 56 |   1: string textual
 57 | }
 58 | 
 59 | struct Results {
 60 |   // If set, data is valid.  Otherwise, results aren't ready yet.
 61 |   1: bool ready,
 62 |   // Columns for the results
 63 |   2: list<string> columns,
 64 |   // A set of results
 65 |   3: list<string> data,
 66 |   // The starting row of the results
 67 |   4: i64 start_row,
 68 |   // Whether there are more results to fetch
 69 |   5: bool has_more
 70 | }
 71 | 
 72 | /**
 73 |  * Metadata information about the results.
 74 |  * Applicable only for SELECT.
 75 |  */
 76 | struct ResultsMetadata {
 77 |   /** The schema of the results */
 78 |   1: hive_metastore.Schema schema,
 79 |   /** The directory containing the results. Not applicable for partition table. */
 80 |   2: string table_dir,
 81 |   /** If the results are straight from an existing table, the table name. */
 82 |   3: string in_tablename,
 83 |   /** Field delimiter */
 84 |   4: string delim,
 85 | }
 86 | 
 87 | exception BeeswaxException {
 88 |   1: string message,
 89 |   // Use get_log(log_context) to retrieve any log related to this exception
 90 |   2: LogContextId log_context,
 91 |   // (Optional) The QueryHandle that caused this exception
 92 |   3: QueryHandle handle,
 93 |   4: optional i32 errorCode = 0,
 94 |   5: optional string SQLState = "     "
 95 | }
 96 | 
 97 | exception QueryNotFoundException {
 98 | }
 99 | 
100 | /** Represents a Hadoop-style configuration variable. */
101 | struct ConfigVariable {
102 |   1: string key,
103 |   2: string value,
104 |   3: string description
105 | }
106 | 
107 | service BeeswaxService {
108 |   /**
109 |    * Submit a query and return a handle (QueryHandle). The query runs asynchronously.
110 |    */
111 |   QueryHandle query(1:Query query) throws(1:BeeswaxException error),
112 | 
113 |   /**
114 |    * run a query synchronously and return a handle (QueryHandle).
115 |    */
116 |   QueryHandle executeAndWait(1:Query query, 2:LogContextId clientCtx)
117 |                         throws(1:BeeswaxException error),
118 | 
119 |   /**
120 |    * Get the query plan for a query.
121 |    */
122 |   QueryExplanation explain(1:Query query)
123 |                         throws(1:BeeswaxException error),
124 | 
125 |   /**
126 |    * Get the results of a query. This is non-blocking. Caller should check
127 |    * Results.ready to determine if the results are in yet. The call requests
128 |    * the batch size of fetch.
129 |    */
130 |   Results fetch(1:QueryHandle query_id, 2:bool start_over, 3:i32 fetch_size=-1)
131 |               throws(1:QueryNotFoundException error, 2:BeeswaxException error2),
132 | 
133 |   /**
134 |    * Get the state of the query
135 |    */
136 |   QueryState get_state(1:QueryHandle handle) throws(1:QueryNotFoundException error),
137 | 
138 |   /**
139 |    * Get the result metadata
140 |    */
141 |   ResultsMetadata get_results_metadata(1:QueryHandle handle)
142 |                                     throws(1:QueryNotFoundException error),
143 | 
144 |   /**
145 |    * Used to test connection to server.  A "noop" command.
146 |    */
147 |   string echo(1:string s)
148 | 
149 |   /**
150 |    * Returns a string representation of the configuration object being used.
151 |    * Handy for debugging.
152 |    */
153 |   string dump_config()
154 | 
155 |   /**
156 |    * Get the log messages related to the given context.
157 |    */
158 |   string get_log(1:LogContextId context) throws(1:QueryNotFoundException error)
159 | 
160 |   /*
161 |    * Returns "default" configuration.
162 |    */
163 |   list<ConfigVariable> get_default_configuration(1:bool include_hadoop)
164 | 
165 |   /*
166 |    * closes the query with given handle
167 |    */
168 |   void close(1:QueryHandle handle) throws(1:QueryNotFoundException error,
169 |                             2:BeeswaxException error2)
170 | 
171 |   /*
172 |    * clean the log context for given id
173 |    */
174 |   void clean(1:LogContextId log_context)
175 | }
176 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/cli_service.thrift:
--------------------------------------------------------------------------------
   1 | // Licensed to the Apache Software Foundation (ASF) under one
   2 | // or more contributor license agreements.  See the NOTICE file
   3 | // distributed with this work for additional information
   4 | // regarding copyright ownership.  The ASF licenses this file
   5 | // to you under the Apache License, Version 2.0 (the
   6 | // "License"); you may not use this file except in compliance
   7 | // with the License.  You may obtain a copy of the License at
   8 | //
   9 | //     http://www.apache.org/licenses/LICENSE-2.0
  10 | //
  11 | // Unless required by applicable law or agreed to in writing, software
  12 | // distributed under the License is distributed on an "AS IS" BASIS,
  13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 | // See the License for the specific language governing permissions and
  15 | // limitations under the License.
  16 | 
  17 | // Coding Conventions for this file:
  18 | //
  19 | // Structs/Enums/Unions
  20 | // * Struct, Enum, and Union names begin with a "T",
  21 | //   and use a capital letter for each new word, with no underscores.
  22 | // * All fields should be declared as either optional or required.
  23 | //
  24 | // Functions
  25 | // * Function names start with a capital letter and have a capital letter for
  26 | //   each new word, with no underscores.
  27 | // * Each function should take exactly one parameter, named TFunctionNameReq,
  28 | //   and should return either void or TFunctionNameResp. This convention allows
  29 | //   incremental updates.
  30 | //
  31 | // Services
  32 | // * Service names begin with the letter "T", use a capital letter for each
  33 | //   new word (with no underscores), and end with the word "Service".
  34 | 
  35 | namespace java org.apache.hive.service.cli.thrift
  36 | namespace cpp apache.hive.service.cli.thrift
  37 | namespace rb impala.protocol.hive
  38 | 
  39 | // List of protocol versions. A new token should be
  40 | // added to the end of this list every time a change is made.
  41 | enum TProtocolVersion {
  42 |   HIVE_CLI_SERVICE_PROTOCOL_V1
  43 | }
  44 | 
  45 | enum TTypeId {
  46 |   BOOLEAN_TYPE,
  47 |   TINYINT_TYPE,
  48 |   SMALLINT_TYPE,
  49 |   INT_TYPE,
  50 |   BIGINT_TYPE,
  51 |   FLOAT_TYPE,
  52 |   DOUBLE_TYPE,
  53 |   STRING_TYPE,
  54 |   TIMESTAMP_TYPE,
  55 |   BINARY_TYPE,
  56 |   ARRAY_TYPE,
  57 |   MAP_TYPE,
  58 |   STRUCT_TYPE,
  59 |   UNION_TYPE,
  60 |   USER_DEFINED_TYPE,
  61 |   DECIMAL_TYPE
  62 | }
  63 | 
  64 | const set<TTypeId> PRIMITIVE_TYPES = [
  65 |   TTypeId.BOOLEAN_TYPE
  66 |   TTypeId.TINYINT_TYPE
  67 |   TTypeId.SMALLINT_TYPE
  68 |   TTypeId.INT_TYPE
  69 |   TTypeId.BIGINT_TYPE
  70 |   TTypeId.FLOAT_TYPE
  71 |   TTypeId.DOUBLE_TYPE
  72 |   TTypeId.STRING_TYPE
  73 |   TTypeId.TIMESTAMP_TYPE
  74 |   TTypeId.BINARY_TYPE,
  75 |   TTypeId.DECIMAL_TYPE
  76 | ]
  77 | 
  78 | const set<TTypeId> COMPLEX_TYPES = [
  79 |   TTypeId.ARRAY_TYPE
  80 |   TTypeId.MAP_TYPE
  81 |   TTypeId.STRUCT_TYPE
  82 |   TTypeId.UNION_TYPE
  83 |   TTypeId.USER_DEFINED_TYPE
  84 | ]
  85 | 
  86 | const set<TTypeId> COLLECTION_TYPES = [
  87 |   TTypeId.ARRAY_TYPE
  88 |   TTypeId.MAP_TYPE
  89 | ]
  90 | 
  91 | const map<TTypeId,string> TYPE_NAMES = {
  92 |   TTypeId.BOOLEAN_TYPE: "BOOLEAN",
  93 |   TTypeId.TINYINT_TYPE: "TINYINT",
  94 |   TTypeId.SMALLINT_TYPE: "SMALLINT",
  95 |   TTypeId.INT_TYPE: "INT",
  96 |   TTypeId.BIGINT_TYPE: "BIGINT",
  97 |   TTypeId.FLOAT_TYPE: "FLOAT",
  98 |   TTypeId.DOUBLE_TYPE: "DOUBLE",
  99 |   TTypeId.STRING_TYPE: "STRING",
 100 |   TTypeId.TIMESTAMP_TYPE: "TIMESTAMP",
 101 |   TTypeId.BINARY_TYPE: "BINARY",
 102 |   TTypeId.ARRAY_TYPE: "ARRAY",
 103 |   TTypeId.MAP_TYPE: "MAP",
 104 |   TTypeId.STRUCT_TYPE: "STRUCT",
 105 |   TTypeId.UNION_TYPE: "UNIONTYPE"
 106 |   TTypeId.DECIMAL_TYPE: "DECIMAL"
 107 | }
 108 | 
 109 | // Thrift does not support recursively defined types or forward declarations,
 110 | // which makes it difficult to represent Hive's nested types.
 111 | // To get around these limitations TTypeDesc employs a type list that maps
 112 | // integer "pointers" to TTypeEntry objects. The following examples show
 113 | // how different types are represented using this scheme:
 114 | //
 115 | // "INT":
 116 | // TTypeDesc {
 117 | //   types = [
 118 | //     TTypeEntry.primitive_entry {
 119 | //       type = INT_TYPE
 120 | //     }
 121 | //   ]
 122 | // }
 123 | //
 124 | // "ARRAY<INT>":
 125 | // TTypeDesc {
 126 | //   types = [
 127 | //     TTypeEntry.array_entry {
 128 | //       object_type_ptr = 1
 129 | //     },
 130 | //     TTypeEntry.primitive_entry {
 131 | //       type = INT_TYPE
 132 | //     }
 133 | //   ]
 134 | // }
 135 | //
 136 | // "MAP<INT,STRING>":
 137 | // TTypeDesc {
 138 | //   types = [
 139 | //     TTypeEntry.map_entry {
 140 | //       key_type_ptr = 1
 141 | //       value_type_ptr = 2
 142 | //     },
 143 | //     TTypeEntry.primitive_entry {
 144 | //       type = INT_TYPE
 145 | //     },
 146 | //     TTypeEntry.primitive_entry {
 147 | //       type = STRING_TYPE
 148 | //     }
 149 | //   ]
 150 | // }
 151 | 
 152 | typedef i32 TTypeEntryPtr
 153 | 
 154 | // Type entry for a primitive type.
 155 | struct TPrimitiveTypeEntry {
 156 |   // The primitive type token. This must satisfy the condition
 157 |   // that type is in the PRIMITIVE_TYPES set.
 158 |   1: required TTypeId type
 159 | }
 160 | 
 161 | // Type entry for an ARRAY type.
 162 | struct TArrayTypeEntry {
 163 |   1: required TTypeEntryPtr objectTypePtr
 164 | }
 165 | 
 166 | // Type entry for a MAP type.
 167 | struct TMapTypeEntry {
 168 |   1: required TTypeEntryPtr keyTypePtr
 169 |   2: required TTypeEntryPtr valueTypePtr
 170 | }
 171 | 
 172 | // Type entry for a STRUCT type.
 173 | struct TStructTypeEntry {
 174 |   1: required map<string, TTypeEntryPtr> nameToTypePtr
 175 | }
 176 | 
 177 | // Type entry for a UNIONTYPE type.
 178 | struct TUnionTypeEntry {
 179 |   1: required map<string, TTypeEntryPtr> nameToTypePtr
 180 | }
 181 | 
 182 | struct TUserDefinedTypeEntry {
 183 |   // The fully qualified name of the class implementing this type.
 184 |   1: required string typeClassName
 185 | }
 186 | 
 187 | // We use a union here since Thrift does not support inheritance.
 188 | union TTypeEntry {
 189 |   1: TPrimitiveTypeEntry primitiveEntry
 190 |   2: TArrayTypeEntry arrayEntry
 191 |   3: TMapTypeEntry mapEntry
 192 |   4: TStructTypeEntry structEntry
 193 |   5: TUnionTypeEntry unionEntry
 194 |   6: TUserDefinedTypeEntry userDefinedTypeEntry
 195 | }
 196 | 
 197 | // Type descriptor for columns.
 198 | struct TTypeDesc {
 199 |   // The "top" type is always the first element of the list.
 200 |   // If the top type is an ARRAY, MAP, STRUCT, or UNIONTYPE
 201 |   // type, then subsequent elements represent nested types.
 202 |   1: required list<TTypeEntry> types
 203 | }
 204 | 
 205 | // A result set column descriptor.
 206 | struct TColumnDesc {
 207 |   // The name of the column
 208 |   1: required string columnName
 209 | 
 210 |   // The type descriptor for this column
 211 |   2: required TTypeDesc typeDesc
 212 | 
 213 |   // The ordinal position of this column in the schema
 214 |   3: required i32 position
 215 | 
 216 |   4: optional string comment
 217 | }
 218 | 
 219 | // Metadata used to describe the schema (column names, types, comments)
 220 | // of result sets.
 221 | struct TTableSchema {
 222 |   1: required list<TColumnDesc> columns
 223 | }
 224 | 
 225 | // A Boolean column value.
 226 | struct TBoolValue {
 227 |   // NULL if value is unset.
 228 |   1: optional bool value
 229 | }
 230 | 
 231 | // A Byte column value.
 232 | struct TByteValue {
 233 |   // NULL if value is unset.
 234 |   1: optional byte value
 235 | }
 236 | 
 237 | // A signed, 16 bit column value.
 238 | struct TI16Value {
 239 |   // NULL if value is unset
 240 |   1: optional i16 value
 241 | }
 242 | 
 243 | // A signed, 32 bit column value
 244 | struct TI32Value {
 245 |   // NULL if value is unset
 246 |   1: optional i32 value
 247 | }
 248 | 
 249 | // A signed 64 bit column value
 250 | struct TI64Value {
 251 |   // NULL if value is unset
 252 |   1: optional i64 value
 253 | }
 254 | 
 255 | // A floating point 64 bit column value
 256 | struct TDoubleValue {
 257 |   // NULL if value is unset
 258 |   1: optional double value
 259 | }
 260 | 
 261 | struct TStringValue {
 262 |   // NULL if value is unset
 263 |   1: optional string value
 264 | }
 265 | 
 266 | union TColumn {
 267 |   1: list<TBoolValue> boolColumn
 268 |   2: list<TByteValue> byteColumn
 269 |   3: list<TI16Value> i16Column
 270 |   4: list<TI32Value> i32Column
 271 |   5: list<TI64Value> i64Column
 272 |   6: list<TDoubleValue> doubleColumn
 273 |   7: list<TStringValue> stringColumn
 274 | }
 275 | 
 276 | // A single column value in a result set.
 277 | // Note that Hive's type system is richer than Thrift's,
 278 | // so in some cases we have to map multiple Hive types
 279 | // to the same Thrift type. On the client-side this is
 280 | // disambiguated by looking at the Schema of the
 281 | // result set.
 282 | union TColumnValue {
 283 |   1: TBoolValue   boolVal      // BOOLEAN
 284 |   2: TByteValue   byteVal      // TINYINT
 285 |   3: TI16Value    i16Val       // SMALLINT
 286 |   4: TI32Value    i32Val       // INT
 287 |   5: TI64Value    i64Val       // BIGINT, TIMESTAMP
 288 |   6: TDoubleValue doubleVal    // FLOAT, DOUBLE
 289 |   7: TStringValue stringVal    // STRING, LIST, MAP, STRUCT, UNIONTYPE, BINARY, DECIMAL
 290 | }
 291 | 
 292 | // Represents a row in a rowset.
 293 | struct TRow {
 294 |   1: required list<TColumnValue> colVals
 295 | }
 296 | 
 297 | // Represents a rowset
 298 | struct TRowSet {
 299 |   // The starting row offset of this rowset.
 300 |   1: required i64 startRowOffset
 301 |   2: required list<TRow> rows
 302 |   3: optional list<TColumn> columns
 303 | }
 304 | 
 305 | // The return status code contained in each response.
 306 | enum TStatusCode {
 307 |   SUCCESS_STATUS,
 308 |   SUCCESS_WITH_INFO_STATUS,
 309 |   STILL_EXECUTING_STATUS,
 310 |   ERROR_STATUS,
 311 |   INVALID_HANDLE_STATUS
 312 | }
 313 | 
 314 | // The return status of a remote request
 315 | struct TStatus {
 316 |   1: required TStatusCode statusCode
 317 | 
 318 |   // If status is SUCCESS_WITH_INFO, info_msgs may be populated with
 319 |   // additional diagnostic information.
 320 |   2: optional list<string> infoMessages
 321 | 
 322 |   // If status is ERROR, then the following fields may be set
 323 |   3: optional string sqlState  // as defined in the ISO/IEF CLI specification
 324 |   4: optional i32 errorCode    // internal error code
 325 |   5: optional string errorMessage
 326 | }
 327 | 
 328 | // The state of an operation (i.e. a query or other
 329 | // asynchronous operation that generates a result set)
 330 | // on the server.
 331 | enum TOperationState {
 332 |   // The operation has been initialized
 333 |   INITIALIZED_STATE,
 334 | 
 335 |   // The operation is running. In this state the result
 336 |   // set is not available.
 337 |   RUNNING_STATE,
 338 | 
 339 |   // The operation has completed. When an operation is in
 340 |   // this state its result set may be fetched.
 341 |   FINISHED_STATE,
 342 | 
 343 |   // The operation was canceled by a client
 344 |   CANCELED_STATE,
 345 | 
 346 |   // The operation was closed by a client
 347 |   CLOSED_STATE,
 348 | 
 349 |   // The operation failed due to an error
 350 |   ERROR_STATE,
 351 | 
 352 |   // The operation is in an unrecognized state
 353 |   UKNOWN_STATE,
 354 | }
 355 | 
 356 | 
 357 | // A string identifier. This is interpreted literally.
 358 | typedef string TIdentifier
 359 | 
 360 | // A search pattern.
 361 | //
 362 | // Valid search pattern characters:
 363 | // '_': Any single character.
 364 | // '%': Any sequence of zero or more characters.
 365 | // '\': Escape character used to include special characters,
 366 | //      e.g. '_', '%', '\'. If a '\' precedes a non-special
 367 | //      character it has no special meaning and is interpreted
 368 | //      literally.
 369 | typedef string TPattern
 370 | 
 371 | 
 372 | // A search pattern or identifier. Used as input
 373 | // parameter for many of the catalog functions.
 374 | typedef string TPatternOrIdentifier
 375 | 
 376 | struct THandleIdentifier {
 377 |   // 16 byte globally unique identifier
 378 |   // This is the public ID of the handle and
 379 |   // can be used for reporting.
 380 |   1: required binary guid,
 381 | 
 382 |   // 16 byte secret generated by the server
 383 |   // and used to verify that the handle is not
 384 |   // being hijacked by another user.
 385 |   2: required binary secret,
 386 | }
 387 | 
 388 | // Client-side handle to persistent
 389 | // session information on the server-side.
 390 | struct TSessionHandle {
 391 |   1: required THandleIdentifier sessionId
 392 | }
 393 | 
 394 | // The subtype of an OperationHandle.
 395 | enum TOperationType {
 396 |   EXECUTE_STATEMENT,
 397 |   GET_TYPE_INFO,
 398 |   GET_CATALOGS,
 399 |   GET_SCHEMAS,
 400 |   GET_TABLES,
 401 |   GET_TABLE_TYPES,
 402 |   GET_COLUMNS,
 403 |   GET_FUNCTIONS,
 404 |   UNKNOWN,
 405 | }
 406 | 
 407 | // Client-side reference to a task running
 408 | // asynchronously on the server.
 409 | struct TOperationHandle {
 410 |   1: required THandleIdentifier operationId
 411 |   2: required TOperationType operationType
 412 | 
 413 |   // If hasResultSet = TRUE, then this operation
 414 |   // generates a result set that can be fetched.
 415 |   // Note that the result set may be empty.
 416 |   //
 417 |   // If hasResultSet = FALSE, then this operation
 418 |   // does not generate a result set, and calling
 419 |   // GetResultSetMetadata or FetchResults against
 420 |   // this OperationHandle will generate an error.
 421 |   3: required bool hasResultSet
 422 | 
 423 |   // For operations that don't generate result sets,
 424 |   // modifiedRowCount is either:
 425 |   //
 426 |   // 1) The number of rows that were modified by
 427 |   //    the DML operation (e.g. number of rows inserted,
 428 |   //    number of rows deleted, etc).
 429 |   //
 430 |   // 2) 0 for operations that don't modify or add rows.
 431 |   //
 432 |   // 3) < 0 if the operation is capable of modifiying rows,
 433 |   //    but Hive is unable to determine how many rows were
 434 |   //    modified. For example, Hive's LOAD DATA command
 435 |   //    doesn't generate row count information because
 436 |   //    Hive doesn't inspect the data as it is loaded.
 437 |   //
 438 |   // modifiedRowCount is unset if the operation generates
 439 |   // a result set.
 440 |   4: optional double modifiedRowCount
 441 | }
 442 | 
 443 | 
 444 | // OpenSession()
 445 | //
 446 | // Open a session (connection) on the server against
 447 | // which operations may be executed.
 448 | struct TOpenSessionReq {
 449 |   // The version of the HiveServer2 protocol that the client is using.
 450 |   1: required TProtocolVersion client_protocol = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1
 451 | 
 452 |   // Username and password for authentication.
 453 |   // Depending on the authentication scheme being used,
 454 |   // this information may instead be provided by a lower
 455 |   // protocol layer, in which case these fields may be
 456 |   // left unset.
 457 |   2: optional string username
 458 |   3: optional string password
 459 | 
 460 |   // Configuration overlay which is applied when the session is
 461 |   // first created.
 462 |   4: optional map<string, string> configuration
 463 | }
 464 | 
 465 | struct TOpenSessionResp {
 466 |   1: required TStatus status
 467 | 
 468 |   // The protocol version that the server is using.
 469 |   2: required TProtocolVersion serverProtocolVersion = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1
 470 | 
 471 |   // Session Handle
 472 |   3: optional TSessionHandle sessionHandle
 473 | 
 474 |   // The configuration settings for this session.
 475 |   4: optional map<string, string> configuration
 476 | }
 477 | 
 478 | 
 479 | // CloseSession()
 480 | //
 481 | // Closes the specified session and frees any resources
 482 | // currently allocated to that session. Any open
 483 | // operations in that session will be canceled.
 484 | struct TCloseSessionReq {
 485 |   1: required TSessionHandle sessionHandle
 486 | }
 487 | 
 488 | struct TCloseSessionResp {
 489 |   1: required TStatus status
 490 | }
 491 | 
 492 | 
 493 | 
 494 | enum TGetInfoType {
 495 |   CLI_MAX_DRIVER_CONNECTIONS =           0,
 496 |   CLI_MAX_CONCURRENT_ACTIVITIES =        1,
 497 |   CLI_DATA_SOURCE_NAME =                 2,
 498 |   CLI_FETCH_DIRECTION =                  8,
 499 |   CLI_SERVER_NAME =                      13,
 500 |   CLI_SEARCH_PATTERN_ESCAPE =            14,
 501 |   CLI_DBMS_NAME =                        17,
 502 |   CLI_DBMS_VER =                         18,
 503 |   CLI_ACCESSIBLE_TABLES =                19,
 504 |   CLI_ACCESSIBLE_PROCEDURES =            20,
 505 |   CLI_CURSOR_COMMIT_BEHAVIOR =           23,
 506 |   CLI_DATA_SOURCE_READ_ONLY =            25,
 507 |   CLI_DEFAULT_TXN_ISOLATION =            26,
 508 |   CLI_IDENTIFIER_CASE =                  28,
 509 |   CLI_IDENTIFIER_QUOTE_CHAR =            29,
 510 |   CLI_MAX_COLUMN_NAME_LEN =              30,
 511 |   CLI_MAX_CURSOR_NAME_LEN =              31,
 512 |   CLI_MAX_SCHEMA_NAME_LEN =              32,
 513 |   CLI_MAX_CATALOG_NAME_LEN =             34,
 514 |   CLI_MAX_TABLE_NAME_LEN =               35,
 515 |   CLI_SCROLL_CONCURRENCY =               43,
 516 |   CLI_TXN_CAPABLE =                      46,
 517 |   CLI_USER_NAME =                        47,
 518 |   CLI_TXN_ISOLATION_OPTION =             72,
 519 |   CLI_INTEGRITY =                        73,
 520 |   CLI_GETDATA_EXTENSIONS =               81,
 521 |   CLI_NULL_COLLATION =                   85,
 522 |   CLI_ALTER_TABLE =                      86,
 523 |   CLI_ORDER_BY_COLUMNS_IN_SELECT =       90,
 524 |   CLI_SPECIAL_CHARACTERS =               94,
 525 |   CLI_MAX_COLUMNS_IN_GROUP_BY =          97,
 526 |   CLI_MAX_COLUMNS_IN_INDEX =             98,
 527 |   CLI_MAX_COLUMNS_IN_ORDER_BY =          99,
 528 |   CLI_MAX_COLUMNS_IN_SELECT =            100,
 529 |   CLI_MAX_COLUMNS_IN_TABLE =             101,
 530 |   CLI_MAX_INDEX_SIZE =                   102,
 531 |   CLI_MAX_ROW_SIZE =                     104,
 532 |   CLI_MAX_STATEMENT_LEN =                105,
 533 |   CLI_MAX_TABLES_IN_SELECT =             106,
 534 |   CLI_MAX_USER_NAME_LEN =                107,
 535 |   CLI_OJ_CAPABILITIES =                  115,
 536 | 
 537 |   CLI_XOPEN_CLI_YEAR =                   10000,
 538 |   CLI_CURSOR_SENSITIVITY =               10001,
 539 |   CLI_DESCRIBE_PARAMETER =               10002,
 540 |   CLI_CATALOG_NAME =                     10003,
 541 |   CLI_COLLATION_SEQ =                    10004,
 542 |   CLI_MAX_IDENTIFIER_LEN =               10005,
 543 | }
 544 | 
 545 | union TGetInfoValue {
 546 |   1: string stringValue
 547 |   2: i16 smallIntValue
 548 |   3: i32 integerBitmask
 549 |   4: i32 integerFlag
 550 |   5: i32 binaryValue
 551 |   6: i64 lenValue
 552 | }
 553 | 
 554 | // GetInfo()
 555 | //
 556 | // This function is based on ODBC's CLIGetInfo() function.
 557 | // The function returns general information about the data source
 558 | // using the same keys as ODBC.
 559 | struct TGetInfoReq {
 560 |   // The sesssion to run this request against
 561 |   1: required TSessionHandle sessionHandle
 562 | 
 563 |   2: required TGetInfoType infoType
 564 | }
 565 | 
 566 | struct TGetInfoResp {
 567 |   1: required TStatus status
 568 | 
 569 |   2: required TGetInfoValue infoValue
 570 | }
 571 | 
 572 | 
 573 | // ExecuteStatement()
 574 | //
 575 | // Execute a statement.
 576 | // The returned OperationHandle can be used to check on the
 577 | // status of the statement, and to fetch results once the
 578 | // statement has finished executing.
 579 | struct TExecuteStatementReq {
 580 |   // The session to exexcute the statement against
 581 |   1: required TSessionHandle sessionHandle
 582 | 
 583 |   // The statement to be executed (DML, DDL, SET, etc)
 584 |   2: required string statement
 585 | 
 586 |   // Configuration properties that are overlayed on top of the
 587 |   // the existing session configuration before this statement
 588 |   // is executed. These properties apply to this statement
 589 |   // only and will not affect the subsequent state of the Session.
 590 |   3: optional map<string, string> confOverlay
 591 | }
 592 | 
 593 | struct TExecuteStatementResp {
 594 |   1: required TStatus status
 595 |   2: optional TOperationHandle operationHandle
 596 | }
 597 | 
 598 | 
 599 | // GetTypeInfo()
 600 | //
 601 | // Get information about types supported by the HiveServer instance.
 602 | // The information is returned as a result set which can be fetched
 603 | // using the OperationHandle provided in the response.
 604 | //
 605 | // Refer to the documentation for ODBC's CLIGetTypeInfo function for
 606 | // the format of the result set.
 607 | struct TGetTypeInfoReq {
 608 |   // The session to run this request against.
 609 |   1: required TSessionHandle sessionHandle
 610 | }
 611 | 
 612 | struct TGetTypeInfoResp {
 613 |   1: required TStatus status
 614 |   2: optional TOperationHandle operationHandle
 615 | }
 616 | 
 617 | 
 618 | // GetCatalogs()
 619 | //
 620 | // Returns the list of catalogs (databases)
 621 | // Results are ordered by TABLE_CATALOG
 622 | //
 623 | // Resultset columns :
 624 | // col1
 625 | // name: TABLE_CAT
 626 | // type: STRING
 627 | // desc: Catalog name. NULL if not applicable.
 628 | //
 629 | struct TGetCatalogsReq {
 630 |   // Session to run this request against
 631 |   1: required TSessionHandle sessionHandle
 632 | }
 633 | 
 634 | struct TGetCatalogsResp {
 635 |   1: required TStatus status
 636 |   2: optional TOperationHandle operationHandle
 637 | }
 638 | 
 639 | 
 640 | // GetSchemas()
 641 | //
 642 | // Retrieves the schema names available in this database.
 643 | // The results are ordered by TABLE_CATALOG and TABLE_SCHEM.
 644 | // col1
 645 | // name: TABLE_SCHEM
 646 | // type: STRING
 647 | // desc: schema name
 648 | // col2
 649 | // name: TABLE_CATALOG
 650 | // type: STRING
 651 | // desc: catalog name
 652 | struct TGetSchemasReq {
 653 |   // Session to run this request against
 654 |   1: required TSessionHandle sessionHandle
 655 | 
 656 |   // Name of the catalog. Must not contain a search pattern.
 657 |   2: optional TIdentifier catalogName
 658 | 
 659 |   // schema name or pattern
 660 |   3: optional TPatternOrIdentifier schemaName
 661 | }
 662 | 
 663 | struct TGetSchemasResp {
 664 |   1: required TStatus status
 665 |   2: optional TOperationHandle operationHandle
 666 | }
 667 | 
 668 | 
 669 | // GetTables()
 670 | //
 671 | // Returns a list of tables with catalog, schema, and table
 672 | // type information. The information is returned as a result
 673 | // set which can be fetched using the OperationHandle
 674 | // provided in the response.
 675 | // Results are ordered by TABLE_TYPE, TABLE_CAT, TABLE_SCHEM, and TABLE_NAME
 676 | //
 677 | // Result Set Columns:
 678 | //
 679 | // col1
 680 | // name: TABLE_CAT
 681 | // type: STRING
 682 | // desc: Catalog name. NULL if not applicable.
 683 | //
 684 | // col2
 685 | // name: TABLE_SCHEM
 686 | // type: STRING
 687 | // desc: Schema name.
 688 | //
 689 | // col3
 690 | // name: TABLE_NAME
 691 | // type: STRING
 692 | // desc: Table name.
 693 | //
 694 | // col4
 695 | // name: TABLE_TYPE
 696 | // type: STRING
 697 | // desc: The table type, e.g. "TABLE", "VIEW", etc.
 698 | //
 699 | // col5
 700 | // name: REMARKS
 701 | // type: STRING
 702 | // desc: Comments about the table
 703 | //
 704 | struct TGetTablesReq {
 705 |   // Session to run this request against
 706 |   1: required TSessionHandle sessionHandle
 707 | 
 708 |   // Name of the catalog or a search pattern.
 709 |   2: optional TPatternOrIdentifier catalogName
 710 | 
 711 |   // Name of the schema or a search pattern.
 712 |   3: optional TPatternOrIdentifier schemaName
 713 | 
 714 |   // Name of the table or a search pattern.
 715 |   4: optional TPatternOrIdentifier tableName
 716 | 
 717 |   // List of table types to match
 718 |   // e.g. "TABLE", "VIEW", "SYSTEM TABLE", "GLOBAL TEMPORARY",
 719 |   // "LOCAL TEMPORARY", "ALIAS", "SYNONYM", etc.
 720 |   5: optional list<string> tableTypes
 721 | }
 722 | 
 723 | struct TGetTablesResp {
 724 |   1: required TStatus status
 725 |   2: optional TOperationHandle operationHandle
 726 | }
 727 | 
 728 | 
 729 | // GetTableTypes()
 730 | //
 731 | // Returns the table types available in this database.
 732 | // The results are ordered by table type.
 733 | //
 734 | // col1
 735 | // name: TABLE_TYPE
 736 | // type: STRING
 737 | // desc: Table type name.
 738 | struct TGetTableTypesReq {
 739 |   // Session to run this request against
 740 |   1: required TSessionHandle sessionHandle
 741 | }
 742 | 
 743 | struct TGetTableTypesResp {
 744 |   1: required TStatus status
 745 |   2: optional TOperationHandle operationHandle
 746 | }
 747 | 
 748 | 
 749 | // GetColumns()
 750 | //
 751 | // Returns a list of columns in the specified tables.
 752 | // The information is returned as a result set which can be fetched
 753 | // using the OperationHandle provided in the response.
 754 | // Results are ordered by TABLE_CAT, TABLE_SCHEM, TABLE_NAME,
 755 | // and ORDINAL_POSITION.
 756 | //
 757 | // Result Set Columns are the same as those for the ODBC CLIColumns
 758 | // function.
 759 | //
 760 | struct TGetColumnsReq {
 761 |   // Session to run this request against
 762 |   1: required TSessionHandle sessionHandle
 763 | 
 764 |   // Name of the catalog. Must not contain a search pattern.
 765 |   2: optional TIdentifier catalogName
 766 | 
 767 |   // Schema name or search pattern
 768 |   3: optional TPatternOrIdentifier schemaName
 769 | 
 770 |   // Table name or search pattern
 771 |   4: optional TPatternOrIdentifier tableName
 772 | 
 773 |   // Column name or search pattern
 774 |   5: optional TPatternOrIdentifier columnName
 775 | }
 776 | 
 777 | struct TGetColumnsResp {
 778 |   1: required TStatus status
 779 |   2: optional TOperationHandle operationHandle
 780 | }
 781 | 
 782 | 
 783 | // GetFunctions()
 784 | //
 785 | // Returns a list of functions supported by the data source. The
 786 | // behavior of this function matches
 787 | // java.sql.DatabaseMetaData.getFunctions() both in terms of
 788 | // inputs and outputs.
 789 | //
 790 | // Result Set Columns:
 791 | //
 792 | // col1
 793 | // name: FUNCTION_CAT
 794 | // type: STRING
 795 | // desc: Function catalog (may be null)
 796 | //
 797 | // col2
 798 | // name: FUNCTION_SCHEM
 799 | // type: STRING
 800 | // desc: Function schema (may be null)
 801 | //
 802 | // col3
 803 | // name: FUNCTION_NAME
 804 | // type: STRING
 805 | // desc: Function name. This is the name used to invoke the function.
 806 | //
 807 | // col4
 808 | // name: REMARKS
 809 | // type: STRING
 810 | // desc: Explanatory comment on the function.
 811 | //
 812 | // col5
 813 | // name: FUNCTION_TYPE
 814 | // type: SMALLINT
 815 | // desc: Kind of function. One of:
 816 | //       * functionResultUnknown - Cannot determine if a return value or a table
 817 | //                                 will be returned.
 818 | //       * functionNoTable       - Does not a return a table.
 819 | //       * functionReturnsTable  - Returns a table.
 820 | //
 821 | // col6
 822 | // name: SPECIFIC_NAME
 823 | // type: STRING
 824 | // desc: The name which uniquely identifies this function within its schema.
 825 | //       In this case this is the fully qualified class name of the class
 826 | //       that implements this function.
 827 | //
 828 | struct TGetFunctionsReq {
 829 |   // Session to run this request against
 830 |   1: required TSessionHandle sessionHandle
 831 | 
 832 |   // A catalog name; must match the catalog name as it is stored in the
 833 |   // database; "" retrieves those without a catalog; null means
 834 |   // that the catalog name should not be used to narrow the search.
 835 |   2: optional TIdentifier catalogName
 836 | 
 837 |   // A schema name pattern; must match the schema name as it is stored
 838 |   // in the database; "" retrieves those without a schema; null means
 839 |   // that the schema name should not be used to narrow the search.
 840 |   3: optional TPatternOrIdentifier schemaName
 841 | 
 842 |   // A function name pattern; must match the function name as it is stored
 843 |   // in the database.
 844 |   4: required TPatternOrIdentifier functionName
 845 | }
 846 | 
 847 | struct TGetFunctionsResp {
 848 |   1: required TStatus status
 849 |   2: optional TOperationHandle operationHandle
 850 | }
 851 | 
 852 | 
 853 | // GetOperationStatus()
 854 | //
 855 | // Get the status of an operation running on the server.
 856 | struct TGetOperationStatusReq {
 857 |   // Session to run this request against
 858 |   1: required TOperationHandle operationHandle
 859 | }
 860 | 
 861 | struct TGetOperationStatusResp {
 862 |   1: required TStatus status
 863 |   2: optional TOperationState operationState
 864 | }
 865 | 
 866 | 
 867 | // CancelOperation()
 868 | //
 869 | // Cancels processing on the specified operation handle and
 870 | // frees any resources which were allocated.
 871 | struct TCancelOperationReq {
 872 |   // Operation to cancel
 873 |   1: required TOperationHandle operationHandle
 874 | }
 875 | 
 876 | struct TCancelOperationResp {
 877 |   1: required TStatus status
 878 | }
 879 | 
 880 | 
 881 | // CloseOperation()
 882 | //
 883 | // Given an operation in the FINISHED, CANCELED,
 884 | // or ERROR states, CloseOperation() will free
 885 | // all of the resources which were allocated on
 886 | // the server to service the operation.
 887 | struct TCloseOperationReq {
 888 |   1: required TOperationHandle operationHandle
 889 | }
 890 | 
 891 | struct TCloseOperationResp {
 892 |   1: required TStatus status
 893 | }
 894 | 
 895 | 
 896 | // GetResultSetMetadata()
 897 | //
 898 | // Retrieves schema information for the specified operation
 899 | struct TGetResultSetMetadataReq {
 900 |   // Operation for which to fetch result set schema information
 901 |   1: required TOperationHandle operationHandle
 902 | }
 903 | 
 904 | struct TGetResultSetMetadataResp {
 905 |   1: required TStatus status
 906 |   2: optional TTableSchema schema
 907 | }
 908 | 
 909 | 
 910 | enum TFetchOrientation {
 911 |   // Get the next rowset. The fetch offset is ignored.
 912 |   FETCH_NEXT,
 913 | 
 914 |   // Get the previous rowset. The fetch offset is ignored.
 915 |   // NOT SUPPORTED
 916 |   FETCH_PRIOR,
 917 | 
 918 |   // Return the rowset at the given fetch offset relative
 919 |   // to the curren rowset.
 920 |   // NOT SUPPORTED
 921 |   FETCH_RELATIVE,
 922 | 
 923 |   // Return the rowset at the specified fetch offset.
 924 |   // NOT SUPPORTED
 925 |   FETCH_ABSOLUTE,
 926 | 
 927 |   // Get the first rowset in the result set.
 928 |   FETCH_FIRST,
 929 | 
 930 |   // Get the last rowset in the result set.
 931 |   // NOT SUPPORTED
 932 |   FETCH_LAST
 933 | }
 934 | 
 935 | // FetchResults()
 936 | //
 937 | // Fetch rows from the server corresponding to
 938 | // a particular OperationHandle.
 939 | struct TFetchResultsReq {
 940 |   // Operation from which to fetch results.
 941 |   1: required TOperationHandle operationHandle
 942 | 
 943 |   // The fetch orientation. For V1 this must be either
 944 |   // FETCH_NEXT or FETCH_FIRST. Defaults to FETCH_NEXT.
 945 |   2: required TFetchOrientation orientation = TFetchOrientation.FETCH_NEXT
 946 | 
 947 |   // Max number of rows that should be returned in
 948 |   // the rowset.
 949 |   3: required i64 maxRows
 950 | }
 951 | 
 952 | struct TFetchResultsResp {
 953 |   1: required TStatus status
 954 | 
 955 |   // TRUE if there are more rows left to fetch from the server.
 956 |   2: optional bool hasMoreRows
 957 | 
 958 |   // The rowset. This is optional so that we have the
 959 |   // option in the future of adding alternate formats for
 960 |   // representing result set data, e.g. delimited strings,
 961 |   // binary encoded, etc.
 962 |   3: optional TRowSet results
 963 | }
 964 | 
 965 | // GetLog()
 966 | //
 967 | // Fetch operation log from the server corresponding to
 968 | // a particular OperationHandle.
 969 | struct TGetLogReq {
 970 |   // Operation whose log is requested
 971 |   1: required TOperationHandle operationHandle
 972 | }
 973 | 
 974 | struct TGetLogResp {
 975 |   1: required TStatus status
 976 | 
 977 |   2: required string log
 978 | }
 979 | 
 980 | service TCLIService {
 981 | 
 982 |   TOpenSessionResp OpenSession(1:TOpenSessionReq req);
 983 | 
 984 |   TCloseSessionResp CloseSession(1:TCloseSessionReq req);
 985 | 
 986 |   TGetInfoResp GetInfo(1:TGetInfoReq req);
 987 | 
 988 |   TExecuteStatementResp ExecuteStatement(1:TExecuteStatementReq req);
 989 | 
 990 |   TGetTypeInfoResp GetTypeInfo(1:TGetTypeInfoReq req);
 991 | 
 992 |   TGetCatalogsResp GetCatalogs(1:TGetCatalogsReq req);
 993 | 
 994 |   TGetSchemasResp GetSchemas(1:TGetSchemasReq req);
 995 | 
 996 |   TGetTablesResp GetTables(1:TGetTablesReq req);
 997 | 
 998 |   TGetTableTypesResp GetTableTypes(1:TGetTableTypesReq req);
 999 | 
1000 |   TGetColumnsResp GetColumns(1:TGetColumnsReq req);
1001 | 
1002 |   TGetFunctionsResp GetFunctions(1:TGetFunctionsReq req);
1003 | 
1004 |   TGetOperationStatusResp GetOperationStatus(1:TGetOperationStatusReq req);
1005 | 
1006 |   TCancelOperationResp CancelOperation(1:TCancelOperationReq req);
1007 | 
1008 |   TCloseOperationResp CloseOperation(1:TCloseOperationReq req);
1009 | 
1010 |   TGetResultSetMetadataResp GetResultSetMetadata(1:TGetResultSetMetadataReq req);
1011 | 
1012 |   TFetchResultsResp FetchResults(1:TFetchResultsReq req);
1013 | 
1014 |   TGetLogResp GetLog(1:TGetLogReq req);
1015 | }
1016 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/fb303.thrift:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one
  3 |  * or more contributor license agreements. See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership. The ASF licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License. You may obtain a copy of the License at
  9 |  *
 10 |  *   http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing,
 13 |  * software distributed under the License is distributed on an
 14 |  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 15 |  * KIND, either express or implied. See the License for the
 16 |  * specific language governing permissions and limitations
 17 |  * under the License.
 18 |  */
 19 | 
 20 | /**
 21 |  * fb303.thrift
 22 |  */
 23 | 
 24 | namespace java com.facebook.fb303
 25 | namespace cpp facebook.fb303
 26 | namespace rb Impala.Protocol.fb303
 27 | 
 28 | /**
 29 |  * Common status reporting mechanism across all services
 30 |  */
 31 | enum fb_status {
 32 |   DEAD = 0,
 33 |   STARTING = 1,
 34 |   ALIVE = 2,
 35 |   STOPPING = 3,
 36 |   STOPPED = 4,
 37 |   WARNING = 5,
 38 | }
 39 | 
 40 | /**
 41 |  * Standard base service
 42 |  */
 43 | service FacebookService {
 44 | 
 45 |   /**
 46 |    * Returns a descriptive name of the service
 47 |    */
 48 |   string getName(),
 49 | 
 50 |   /**
 51 |    * Returns the version of the service
 52 |    */
 53 |   string getVersion(),
 54 | 
 55 |   /**
 56 |    * Gets the status of this service
 57 |    */
 58 |   fb_status getStatus(),
 59 | 
 60 |   /**
 61 |    * User friendly description of status, such as why the service is in
 62 |    * the dead or warning state, or what is being started or stopped.
 63 |    */
 64 |   string getStatusDetails(),
 65 | 
 66 |   /**
 67 |    * Gets the counters for this service
 68 |    */
 69 |   map<string, i64> getCounters(),
 70 | 
 71 |   /**
 72 |    * Gets the value of a single counter
 73 |    */
 74 |   i64 getCounter(1: string key),
 75 | 
 76 |   /**
 77 |    * Sets an option
 78 |    */
 79 |   void setOption(1: string key, 2: string value),
 80 | 
 81 |   /**
 82 |    * Gets an option
 83 |    */
 84 |   string getOption(1: string key),
 85 | 
 86 |   /**
 87 |    * Gets all options
 88 |    */
 89 |   map<string, string> getOptions(),
 90 | 
 91 |   /**
 92 |    * Returns a CPU profile over the given time interval (client and server
 93 |    * must agree on the profile format).
 94 |    */
 95 |   string getCpuProfile(1: i32 profileDurationInSec),
 96 | 
 97 |   /**
 98 |    * Returns the unix time that the server has been running since
 99 |    */
100 |   i64 aliveSince(),
101 | 
102 |   /**
103 |    * Tell the server to reload its configuration, reopen log files, etc
104 |    */
105 |   oneway void reinitialize(),
106 | 
107 |   /**
108 |    * Suggest a shutdown to the server
109 |    */
110 |   oneway void shutdown(),
111 | 
112 | }
113 | 


--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/hive_metastore.thrift:
--------------------------------------------------------------------------------
  1 | #!/usr/local/bin/thrift -java
  2 | 
  3 | /**
  4 |  * Licensed to the Apache Software Foundation (ASF) under one
  5 |  * or more contributor license agreements.  See the NOTICE file
  6 |  * distributed with this work for additional information
  7 |  * regarding copyright ownership.  The ASF licenses this file
  8 |  * to you under the Apache License, Version 2.0 (the
  9 |  * "License"); you may not use this file except in compliance
 10 |  * with the License.  You may obtain a copy of the License at
 11 |  *
 12 |  *     http://www.apache.org/licenses/LICENSE-2.0
 13 |  *
 14 |  * Unless required by applicable law or agreed to in writing, software
 15 |  * distributed under the License is distributed on an "AS IS" BASIS,
 16 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 |  * See the License for the specific language governing permissions and
 18 |  * limitations under the License.
 19 |  */
 20 | 
 21 | #
 22 | # Thrift Service that the MetaStore is built on
 23 | #
 24 | 
 25 | include "fb303.thrift"
 26 | 
 27 | namespace java org.apache.hadoop.hive.metastore.api
 28 | namespace php metastore
 29 | namespace cpp Apache.Hadoop.Hive
 30 | namespace rb Impala.Protocol.HiveMetastore
 31 | 
 32 | const string DDL_TIME = "transient_lastDdlTime"
 33 | 
 34 | struct Version {
 35 |   1: string version,
 36 |   2: string comments
 37 | }
 38 | 
 39 | struct FieldSchema {
 40 |   1: string name, // name of the field
 41 |   2: string type, // type of the field. primitive types defined above, specify list<TYPE_NAME>, map<TYPE_NAME, TYPE_NAME> for lists & maps
 42 |   3: string comment
 43 | }
 44 | 
 45 | struct Type {
 46 |   1: string          name,             // one of the types in PrimitiveTypes or CollectionTypes or User defined types
 47 |   2: optional string type1,            // object type if the name is 'list' (LIST_TYPE), key type if the name is 'map' (MAP_TYPE)
 48 |   3: optional string type2,            // val type if the name is 'map' (MAP_TYPE)
 49 |   //4: optional list<FieldSchema> fields // if the name is one of the user defined types
 50 | }
 51 | 
 52 | enum HiveObjectType {
 53 |   GLOBAL = 1,
 54 |   DATABASE = 2,
 55 |   TABLE = 3,
 56 |   PARTITION = 4,
 57 |   COLUMN = 5,
 58 | }
 59 | 
 60 | enum PrincipalType {
 61 |   USER = 1,
 62 |   ROLE = 2,
 63 |   GROUP = 3,
 64 | }
 65 | 
 66 | const string HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__"
 67 | const string HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__"
 68 | const string HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__"
 69 | 
 70 | enum PartitionEventType {
 71 |   LOAD_DONE = 1,
 72 | }
 73 | 
 74 | struct HiveObjectRef{
 75 |   1: HiveObjectType objectType,
 76 |   2: string dbName,
 77 |   3: string objectName,
 78 |   4: list<string> partValues,
 79 |   5: string columnName,
 80 | }
 81 | 
 82 | struct PrivilegeGrantInfo {
 83 |   1: string privilege,
 84 |   2: i32 createTime,
 85 |   3: string grantor,
 86 |   4: PrincipalType grantorType,
 87 |   5: bool grantOption,
 88 | }
 89 | 
 90 | struct HiveObjectPrivilege {
 91 |   1: HiveObjectRef  hiveObject,
 92 |   2: string principalName,
 93 |   3: PrincipalType principalType,
 94 |   4: PrivilegeGrantInfo grantInfo,
 95 | }
 96 | 
 97 | struct PrivilegeBag {
 98 |   1: list<HiveObjectPrivilege> privileges,
 99 | }
100 | 
101 | struct PrincipalPrivilegeSet {
102 |   1: map<string, list<PrivilegeGrantInfo>> userPrivileges, // user name -> privilege grant info
103 |   2: map<string, list<PrivilegeGrantInfo>> groupPrivileges, // group name -> privilege grant info
104 |   3: map<string, list<PrivilegeGrantInfo>> rolePrivileges, //role name -> privilege grant info
105 | }
106 | 
107 | struct Role {
108 |   1: string roleName,
109 |   2: i32 createTime,
110 |   3: string ownerName,
111 | }
112 | 
113 | // namespace for tables
114 | struct Database {
115 |   1: string name,
116 |   2: string description,
117 |   3: string locationUri,
118 |   4: map<string, string> parameters, // properties associated with the database
119 |   5: optional PrincipalPrivilegeSet privileges
120 | }
121 | 
122 | // This object holds the information needed by SerDes
123 | struct SerDeInfo {
124 |   1: string name,                   // name of the serde, table name by default
125 |   2: string serializationLib,       // usually the class that implements the extractor & loader
126 |   3: map<string, string> parameters // initialization parameters
127 | }
128 | 
129 | // sort order of a column (column name along with asc(1)/desc(0))
130 | struct Order {
131 |   1: string col,  // sort column name
132 |   2: i32    order // asc(1) or desc(0)
133 | }
134 | 
135 | // this object holds all the information about physical storage of the data belonging to a table
136 | struct StorageDescriptor {
137 |   1: list<FieldSchema> cols,  // required (refer to types defined above)
138 |   2: string location,         // defaults to <warehouse loc>/<db loc>/tablename
139 |   3: string inputFormat,      // SequenceFileInputFormat (binary) or TextInputFormat`  or custom format
140 |   4: string outputFormat,     // SequenceFileOutputFormat (binary) or IgnoreKeyTextOutputFormat or custom format
141 |   5: bool   compressed,       // compressed or not
142 |   6: i32    numBuckets,       // this must be specified if there are any dimension columns
143 |   7: SerDeInfo    serdeInfo,  // serialization and deserialization information
144 |   8: list<string> bucketCols, // reducer grouping columns and clustering columns and bucketing columns`
145 |   9: list<Order>  sortCols,   // sort order of the data in each bucket
146 |   10: map<string, string> parameters // any user supplied key value hash
147 | }
148 | 
149 | // table information
150 | struct Table {
151 |   1: string tableName,                // name of the table
152 |   2: string dbName,                   // database name ('default')
153 |   3: string owner,                    // owner of this table
154 |   4: i32    createTime,               // creation time of the table
155 |   5: i32    lastAccessTime,           // last access time (usually this will be filled from HDFS and shouldn't be relied on)
156 |   6: i32    retention,                // retention time
157 |   7: StorageDescriptor sd,            // storage descriptor of the table
158 |   8: list<FieldSchema> partitionKeys, // partition keys of the table. only primitive types are supported
159 |   9: map<string, string> parameters,   // to store comments or any other user level parameters
160 |   10: string viewOriginalText,         // original view text, null for non-view
161 |   11: string viewExpandedText,         // expanded view text, null for non-view
162 |   12: string tableType,                 // table type enum, e.g. EXTERNAL_TABLE
163 |   13: optional PrincipalPrivilegeSet privileges,
164 | }
165 | 
166 | struct Partition {
167 |   1: list<string> values // string value is converted to appropriate partition key type
168 |   2: string       dbName,
169 |   3: string       tableName,
170 |   4: i32          createTime,
171 |   5: i32          lastAccessTime,
172 |   6: StorageDescriptor   sd,
173 |   7: map<string, string> parameters,
174 |   8: optional PrincipalPrivilegeSet privileges
175 | }
176 | 
177 | struct Index {
178 |   1: string       indexName, // unique with in the whole database namespace
179 |   2: string       indexHandlerClass, // reserved
180 |   3: string       dbName,
181 |   4: string       origTableName,
182 |   5: i32          createTime,
183 |   6: i32          lastAccessTime,
184 |   7: string       indexTableName,
185 |   8: StorageDescriptor   sd,
186 |   9: map<string, string> parameters,
187 |   10: bool         deferredRebuild
188 | }
189 | 
190 | // schema of the table/query results etc.
191 | struct Schema {
192 |  // column names, types, comments
193 |  1: list<FieldSchema> fieldSchemas,  // delimiters etc
194 |  2: map<string, string> properties
195 | }
196 | 
197 | // Key-value store to be used with selected
198 | // Metastore APIs (create, alter methods).
199 | // The client can pass environment properties / configs that can be
200 | // accessed in hooks.
201 | struct EnvironmentContext {
202 |   1: map<string, string> properties
203 | }
204 | 
205 | exception MetaException {
206 |   1: string message
207 | }
208 | 
209 | exception UnknownTableException {
210 |   1: string message
211 | }
212 | 
213 | exception UnknownDBException {
214 |   1: string message
215 | }
216 | 
217 | exception AlreadyExistsException {
218 |   1: string message
219 | }
220 | 
221 | exception InvalidPartitionException {
222 |   1: string message
223 | }
224 | 
225 | exception UnknownPartitionException {
226 |   1: string message
227 | }
228 | 
229 | exception InvalidObjectException {
230 |   1: string message
231 | }
232 | 
233 | exception NoSuchObjectException {
234 |   1: string message
235 | }
236 | 
237 | exception IndexAlreadyExistsException {
238 |   1: string message
239 | }
240 | 
241 | exception InvalidOperationException {
242 |   1: string message
243 | }
244 | 
245 | exception ConfigValSecurityException {
246 |   1: string message
247 | }
248 | 
249 | /**
250 | * This interface is live.
251 | */
252 | service ThriftHiveMetastore extends fb303.FacebookService
253 | {
254 |   void create_database(1:Database database) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3)
255 |   Database get_database(1:string name) throws(1:NoSuchObjectException o1, 2:MetaException o2)
256 |   void drop_database(1:string name, 2:bool deleteData, 3:bool cascade) throws(1:NoSuchObjectException o1, 2:InvalidOperationException o2, 3:MetaException o3)
257 |   list<string> get_databases(1:string pattern) throws(1:MetaException o1)
258 |   list<string> get_all_databases() throws(1:MetaException o1)
259 |   void alter_database(1:string dbname, 2:Database db) throws(1:MetaException o1, 2:NoSuchObjectException o2)
260 | 
261 |   // returns the type with given name (make seperate calls for the dependent types if needed)
262 |   Type get_type(1:string name)  throws(1:MetaException o1, 2:NoSuchObjectException o2)
263 |   bool create_type(1:Type type) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3)
264 |   bool drop_type(1:string type) throws(1:MetaException o1, 2:NoSuchObjectException o2)
265 |   map<string, Type> get_type_all(1:string name)
266 |                                 throws(1:MetaException o2)
267 | 
268 |   // Gets a list of FieldSchemas describing the columns of a particular table
269 |   list<FieldSchema> get_fields(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3),
270 | 
271 |   // Gets a list of FieldSchemas describing both the columns and the partition keys of a particular table
272 |   list<FieldSchema> get_schema(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3)
273 | 
274 |   // create a Hive table. Following fields must be set
275 |   // tableName
276 |   // database        (only 'default' for now until Hive QL supports databases)
277 |   // owner           (not needed, but good to have for tracking purposes)
278 |   // sd.cols         (list of field schemas)
279 |   // sd.inputFormat  (SequenceFileInputFormat (binary like falcon tables or u_full) or TextInputFormat)
280 |   // sd.outputFormat (SequenceFileInputFormat (binary) or TextInputFormat)
281 |   // sd.serdeInfo.serializationLib (SerDe class name eg org.apache.hadoop.hive.serde.simple_meta.MetadataTypedColumnsetSerDe
282 |   // * See notes on DDL_TIME
283 |   void create_table(1:Table tbl) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3, 4:NoSuchObjectException o4)
284 |   void create_table_with_environment_context(1:Table tbl,
285 |       2:EnvironmentContext environment_context)
286 |       throws (1:AlreadyExistsException o1,
287 |               2:InvalidObjectException o2, 3:MetaException o3,
288 |               4:NoSuchObjectException o4)
289 |   // drops the table and all the partitions associated with it if the table has partitions
290 |   // delete data (including partitions) if deleteData is set to true
291 |   void drop_table(1:string dbname, 2:string name, 3:bool deleteData)
292 |                        throws(1:NoSuchObjectException o1, 2:MetaException o3)
293 |   list<string> get_tables(1: string db_name, 2: string pattern) throws (1: MetaException o1)
294 |   list<string> get_all_tables(1: string db_name) throws (1: MetaException o1)
295 | 
296 |   Table get_table(1:string dbname, 2:string tbl_name)
297 |                        throws (1:MetaException o1, 2:NoSuchObjectException o2)
298 |   list<Table> get_table_objects_by_name(1:string dbname, 2:list<string> tbl_names)
299 | 				   throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3)
300 | 
301 |   // Get a list of table names that match a filter.
302 |   // The filter operators are LIKE, <, <=, >, >=, =, <>
303 |   //
304 |   // In the filter statement, values interpreted as strings must be enclosed in quotes,
305 |   // while values interpreted as integers should not be.  Strings and integers are the only
306 |   // supported value types.
307 |   //
308 |   // The currently supported key names in the filter are:
309 |   // Constants.HIVE_FILTER_FIELD_OWNER, which filters on the tables' owner's name
310 |   //   and supports all filter operators
311 |   // Constants.HIVE_FILTER_FIELD_LAST_ACCESS, which filters on the last access times
312 |   //   and supports all filter operators except LIKE
313 |   // Constants.HIVE_FILTER_FIELD_PARAMS, which filters on the tables' parameter keys and values
314 |   //   and only supports the filter operators = and <>.
315 |   //   Append the parameter key name to HIVE_FILTER_FIELD_PARAMS in the filter statement.
316 |   //   For example, to filter on parameter keys called "retention", the key name in the filter
317 |   //   statement should be Constants.HIVE_FILTER_FIELD_PARAMS + "retention"
318 |   //   Also, = and <> only work for keys that exist
319 |   //   in the tables. E.g., if you are looking for tables where key1 <> value, it will only
320 |   //   look at tables that have a value for the parameter key1.
321 |   // Some example filter statements include:
322 |   // filter = Constants.HIVE_FILTER_FIELD_OWNER + " like \".*test.*\" and " +
323 |   //   Constants.HIVE_FILTER_FIELD_LAST_ACCESS + " = 0";
324 |   // filter = Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"30\" or " +
325 |   //   Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"90\""
326 |   // @param dbName
327 |   //          The name of the database from which you will retrieve the table names
328 |   // @param filterType
329 |   //          The type of filter
330 |   // @param filter
331 |   //          The filter string
332 |   // @param max_tables
333 |   //          The maximum number of tables returned
334 |   // @return  A list of table names that match the desired filter
335 |   list<string> get_table_names_by_filter(1:string dbname, 2:string filter, 3:i16 max_tables=-1)
336 |                        throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3)
337 | 
338 |   // alter table applies to only future partitions not for existing partitions
339 |   // * See notes on DDL_TIME
340 |   void alter_table(1:string dbname, 2:string tbl_name, 3:Table new_tbl)
341 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
342 |   void alter_table_with_environment_context(1:string dbname, 2:string tbl_name,
343 |       3:Table new_tbl, 4:EnvironmentContext environment_context)
344 |       throws (1:InvalidOperationException o1, 2:MetaException o2)
345 |   // the following applies to only tables that have partitions
346 |   // * See notes on DDL_TIME
347 |   Partition add_partition(1:Partition new_part)
348 |                        throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
349 |   Partition add_partition_with_environment_context(1:Partition new_part,
350 |       2:EnvironmentContext environment_context)
351 |       throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2,
352 |       3:MetaException o3)
353 |   i32 add_partitions(1:list<Partition> new_parts)
354 |                        throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
355 |   Partition append_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
356 |                        throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
357 |   Partition append_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name)
358 |                        throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
359 |   bool drop_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:bool deleteData)
360 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
361 |   bool drop_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name, 4:bool deleteData)
362 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
363 |   Partition get_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
364 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
365 | 
366 |   Partition get_partition_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals,
367 |       4: string user_name, 5: list<string> group_names) throws(1:MetaException o1, 2:NoSuchObjectException o2)
368 | 
369 |   Partition get_partition_by_name(1:string db_name 2:string tbl_name, 3:string part_name)
370 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
371 | 
372 |   // returns all the partitions for this table in reverse chronological order.
373 |   // If max parts is given then it will return only that many.
374 |   list<Partition> get_partitions(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
375 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
376 |   list<Partition> get_partitions_with_auth(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1,
377 |      4: string user_name, 5: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
378 | 
379 |   list<string> get_partition_names(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
380 |                        throws(1:MetaException o2)
381 | 
382 |   // get_partition*_ps methods allow filtering by a partial partition specification,
383 |   // as needed for dynamic partitions. The values that are not restricted should
384 |   // be empty strings. Nulls were considered (instead of "") but caused errors in
385 |   // generated Python code. The size of part_vals may be smaller than the
386 |   // number of partition columns - the unspecified values are considered the same
387 |   // as "".
388 |   list<Partition> get_partitions_ps(1:string db_name 2:string tbl_name
389 |   	3:list<string> part_vals, 4:i16 max_parts=-1)
390 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
391 |   list<Partition> get_partitions_ps_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1,
392 |      5: string user_name, 6: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
393 | 
394 |   list<string> get_partition_names_ps(1:string db_name,
395 |   	2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1)
396 |   	                   throws(1:MetaException o1, 2:NoSuchObjectException o2)
397 | 
398 |   // get the partitions matching the given partition filter
399 |   list<Partition> get_partitions_by_filter(1:string db_name 2:string tbl_name
400 |     3:string filter, 4:i16 max_parts=-1)
401 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
402 | 
403 |   // get partitions give a list of partition names
404 |   list<Partition> get_partitions_by_names(1:string db_name 2:string tbl_name 3:list<string> names)
405 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
406 | 
407 |   // changes the partition to the new partition object. partition is identified from the part values
408 |   // in the new_part
409 |   // * See notes on DDL_TIME
410 |   void alter_partition(1:string db_name, 2:string tbl_name, 3:Partition new_part)
411 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
412 | 
413 |   void alter_partition_with_environment_context(1:string db_name,
414 |       2:string tbl_name, 3:Partition new_part,
415 |       4:EnvironmentContext environment_context)
416 |       throws (1:InvalidOperationException o1, 2:MetaException o2)
417 | 
418 |   // rename the old partition to the new partition object by changing old part values to the part values
419 |   // in the new_part. old partition is identified from part_vals.
420 |   // partition keys in new_part should be the same as those in old partition.
421 |   void rename_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:Partition new_part)
422 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
423 | 
424 |   // gets the value of the configuration key in the metastore server. returns
425 |   // defaultValue if the key does not exist. if the configuration key does not
426 |   // begin with "hive", "mapred", or "hdfs", a ConfigValSecurityException is
427 |   // thrown.
428 |   string get_config_value(1:string name, 2:string defaultValue)
429 |                           throws(1:ConfigValSecurityException o1)
430 | 
431 |   // converts a partition name into a partition values array
432 |   list<string> partition_name_to_vals(1: string part_name)
433 |                           throws(1: MetaException o1)
434 |   // converts a partition name into a partition specification (a mapping from
435 |   // the partition cols to the values)
436 |   map<string, string> partition_name_to_spec(1: string part_name)
437 |                           throws(1: MetaException o1)
438 | 
439 |   void markPartitionForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
440 |                   4:PartitionEventType eventType) throws (1: MetaException o1, 2: NoSuchObjectException o2,
441 |                   3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
442 |                   6: InvalidPartitionException o6)
443 |   bool isPartitionMarkedForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
444 |                   4: PartitionEventType eventType) throws (1: MetaException o1, 2:NoSuchObjectException o2,
445 |                   3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
446 |                   6: InvalidPartitionException o6)
447 | 
448 |   //index
449 |   Index add_index(1:Index new_index, 2: Table index_table)
450 |                        throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
451 |   void alter_index(1:string dbname, 2:string base_tbl_name, 3:string idx_name, 4:Index new_idx)
452 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
453 |   bool drop_index_by_name(1:string db_name, 2:string tbl_name, 3:string index_name, 4:bool deleteData)
454 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
455 |   Index get_index_by_name(1:string db_name 2:string tbl_name, 3:string index_name)
456 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
457 | 
458 |   list<Index> get_indexes(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
459 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
460 |   list<string> get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
461 |                        throws(1:MetaException o2)
462 | 
463 |   //authorization privileges
464 | 
465 |   bool create_role(1:Role role) throws(1:MetaException o1)
466 |   bool drop_role(1:string role_name) throws(1:MetaException o1)
467 |   list<string> get_role_names() throws(1:MetaException o1)
468 |   bool grant_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type,
469 |     4:string grantor, 5:PrincipalType grantorType, 6:bool grant_option) throws(1:MetaException o1)
470 |   bool revoke_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type)
471 |                         throws(1:MetaException o1)
472 |   list<Role> list_roles(1:string principal_name, 2:PrincipalType principal_type) throws(1:MetaException o1)
473 | 
474 |   PrincipalPrivilegeSet get_privilege_set(1:HiveObjectRef hiveObject, 2:string user_name,
475 |     3: list<string> group_names) throws(1:MetaException o1)
476 |   list<HiveObjectPrivilege> list_privileges(1:string principal_name, 2:PrincipalType principal_type,
477 |     3: HiveObjectRef hiveObject) throws(1:MetaException o1)
478 | 
479 |   bool grant_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
480 |   bool revoke_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
481 | 
482 |   // this is used by metastore client to send UGI information to metastore server immediately
483 |   // after setting up a connection.
484 |   list<string> set_ugi(1:string user_name, 2:list<string> group_names) throws (1:MetaException o1)
485 | 
486 |   //Authentication (delegation token) interfaces
487 | 
488 |   // get metastore server delegation token for use from the map/reduce tasks to authenticate
489 |   // to metastore server
490 |   string get_delegation_token(1:string token_owner, 2:string renewer_kerberos_principal_name)
491 |     throws (1:MetaException o1)
492 | 
493 |   // method to renew delegation token obtained from metastore server
494 |   i64 renew_delegation_token(1:string token_str_form) throws (1:MetaException o1)
495 | 
496 |   // method to cancel delegation token obtained from metastore server
497 |   void cancel_delegation_token(1:string token_str_form) throws (1:MetaException o1)
498 | }
499 | 
500 | // * Note about the DDL_TIME: When creating or altering a table or a partition,
501 | // if the DDL_TIME is not set, the current time will be used.
502 | 
503 | // For storing info about archived partitions in parameters
504 | 
505 | // Whether the partition is archived
506 | const string IS_ARCHIVED = "is_archived",
507 | // The original location of the partition, before archiving. After archiving,
508 | // this directory will contain the archive. When the partition
509 | // is dropped, this directory will be deleted
510 | const string ORIGINAL_LOCATION = "original_location",
511 | 
512 | // these should be needed only for backward compatibility with filestore
513 | const string META_TABLE_COLUMNS   = "columns",
514 | const string META_TABLE_COLUMN_TYPES   = "columns.types",
515 | const string BUCKET_FIELD_NAME    = "bucket_field_name",
516 | const string BUCKET_COUNT         = "bucket_count",
517 | const string FIELD_TO_DIMENSION   = "field_to_dimension",
518 | const string META_TABLE_NAME      = "name",
519 | const string META_TABLE_DB        = "db",
520 | const string META_TABLE_LOCATION  = "location",
521 | const string META_TABLE_SERDE     = "serde",
522 | const string META_TABLE_PARTITION_COLUMNS = "partition_columns",
523 | const string FILE_INPUT_FORMAT    = "file.inputformat",
524 | const string FILE_OUTPUT_FORMAT   = "file.outputformat",
525 | const string META_TABLE_STORAGE   = "storage_handler",
526 | 
527 | 
528 | 
529 | 


--------------------------------------------------------------------------------
/herringbone-main/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 | 
  5 |   <groupId>com.stripe</groupId>
  6 |   <artifactId>herringbone-main</artifactId>
  7 |   <version>0.0.1</version>
  8 |   <packaging>jar</packaging>
  9 | 
 10 |   <name>Herringbone Main</name>
 11 | 
 12 |   <pluginRepositories>
 13 |     <pluginRepository>
 14 |       <id>dtrott</id>
 15 |       <url>https://maven.davidtrott.com/repository</url>
 16 |     </pluginRepository>
 17 |   </pluginRepositories>
 18 | 
 19 |   <build>
 20 |     <plugins>
 21 |       <plugin>
 22 |         <groupId>org.scalatest</groupId>
 23 |         <artifactId>scalatest-maven-plugin</artifactId>
 24 |         <version>1.0-M2</version>
 25 |         <configuration>
 26 |           <reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
 27 |           <junitxml>.</junitxml>
 28 |           <filereports>WDF TestSuite.txt</filereports>
 29 |           <htmlreporters>${project.build.directory}/html/scalatest</htmlreporters>
 30 |           <testFailureIgnore>false</testFailureIgnore>
 31 |         </configuration>
 32 |         <executions>
 33 |           <execution>
 34 |             <id>test</id>
 35 |             <goals>
 36 |               <goal>test</goal>
 37 |             </goals>
 38 |           </execution>
 39 |         </executions>
 40 |       </plugin>
 41 | 
 42 |       <plugin>
 43 |         <groupId>org.apache.maven.plugins</groupId>
 44 |         <artifactId>maven-compiler-plugin</artifactId>
 45 |         <version>3.1</version>
 46 |         <configuration>
 47 |           <source>1.6</source>
 48 |           <target>1.6</target>
 49 |         </configuration>
 50 |       </plugin>
 51 |       <plugin>
 52 |         <artifactId>maven-jar-plugin</artifactId>
 53 |         <version>2.3.1</version>
 54 |       </plugin>
 55 | 
 56 |       <plugin>
 57 |         <artifactId>maven-resources-plugin</artifactId>
 58 |         <version>2.4.3</version>
 59 |       </plugin>
 60 | 
 61 |       <plugin>
 62 |         <groupId>net.alchim31.maven</groupId>
 63 |         <artifactId>scala-maven-plugin</artifactId>
 64 |         <version>3.1.6</version>
 65 |         <configuration>
 66 |           <recompileMode>incremental</recompileMode>
 67 |           <useZincServer>true</useZincServer>
 68 |         </configuration>
 69 |         <executions>
 70 |           <execution>
 71 |             <goals>
 72 |               <goal>compile</goal>
 73 |               <goal>testCompile</goal>
 74 |             </goals>
 75 |           </execution>
 76 |         </executions>
 77 |       </plugin>
 78 | 
 79 |       <plugin>
 80 |         <groupId>org.apache.maven.plugins</groupId>
 81 |         <artifactId>maven-shade-plugin</artifactId>
 82 |         <version>2.3</version>
 83 |         <configuration>
 84 |           <shadedArtifactAttached>false</shadedArtifactAttached>
 85 |           <outputFile>target/herringbone-${project.version}-jar-with-dependencies.jar</outputFile>
 86 |         </configuration>
 87 |         <executions>
 88 |           <execution>
 89 |             <phase>package</phase>
 90 |             <goals>
 91 |               <goal>shade</goal>
 92 |             </goals>
 93 |           </execution>
 94 |         </executions>
 95 |       </plugin>
 96 |     </plugins>
 97 |   </build>
 98 | 
 99 |   <properties>
100 |     <parquet.version>1.6.0rc7</parquet.version>
101 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
102 |     <scala.version>2.10.4</scala.version>
103 |     <maven.compiler.source>1.7</maven.compiler.source>
104 |     <maven.compiler.target>1.7</maven.compiler.target>
105 |   </properties>
106 | 
107 |   <dependencies>
108 |     <dependency>
109 |       <groupId>com.twitter</groupId>
110 |       <artifactId>parquet-common</artifactId>
111 |       <version>${parquet.version}</version>
112 |     </dependency>
113 |     <dependency>
114 |       <groupId>com.twitter</groupId>
115 |       <artifactId>parquet-encoding</artifactId>
116 |       <version>${parquet.version}</version>
117 |     </dependency>
118 |     <dependency>
119 |       <groupId>com.twitter</groupId>
120 |       <artifactId>parquet-column</artifactId>
121 |       <version>${parquet.version}</version>
122 |     </dependency>
123 |     <dependency>
124 |       <groupId>com.twitter</groupId>
125 |       <artifactId>parquet-hadoop</artifactId>
126 |       <version>${parquet.version}</version>
127 |     </dependency>
128 |     <dependency>
129 |       <groupId>org.apache.hadoop</groupId>
130 |       <artifactId>hadoop-client</artifactId>
131 |       <version>2.5.2</version>
132 |       <scope>provided</scope>
133 |     </dependency>
134 |     <dependency>
135 |       <groupId>org.apache.hive</groupId>
136 |       <artifactId>hive-jdbc</artifactId>
137 |       <version>0.14.0</version>
138 |       <exclusions>
139 |         <exclusion>
140 |           <groupId>com.twitter</groupId>
141 |           <artifactId>parquet-hadoop-bundle</artifactId>
142 |         </exclusion>
143 |       </exclusions>
144 |     </dependency>
145 |     <dependency>
146 |       <groupId>org.rogach</groupId>
147 |       <artifactId>scallop_2.10</artifactId>
148 |       <version>0.9.5</version>
149 |     </dependency>
150 |     <dependency>
151 |       <groupId>org.scala-lang</groupId>
152 |       <artifactId>jline</artifactId>
153 |       <version>2.9.0-1</version>
154 |     </dependency>
155 |     <dependency>
156 |       <groupId>org.scalatest</groupId>
157 |       <artifactId>scalatest_2.10</artifactId>
158 |       <version>2.0</version>
159 |       <scope>test</scope>
160 |     </dependency>
161 |     <dependency>
162 |       <groupId>org.scalamock</groupId>
163 |       <artifactId>scalamock-scalatest-support_2.10</artifactId>
164 |       <version>3.1.RC1</version>
165 |       <scope>test</scope>
166 |     </dependency>
167 |     <dependency>
168 |       <groupId>com.stripe</groupId>
169 |       <artifactId>herringbone-impala</artifactId>
170 |       <version>0.0.2</version>
171 |       <exclusions>
172 |         <exclusion>
173 |           <groupId>org.apache.thrift</groupId>
174 |           <artifactId>libthrift</artifactId>
175 |         </exclusion>
176 |       </exclusions>
177 |     </dependency>
178 |   </dependencies>
179 | </project>
180 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/CompactInputFormat.scala:
--------------------------------------------------------------------------------
  1 | package com.stripe.herringbone
  2 | 
  3 | import java.util.{List => JavaList}
  4 | import java.io.DataOutput
  5 | import java.io.DataInput
  6 | 
  7 | import scala.collection.mutable.MutableList
  8 | import scala.collection.JavaConverters._
  9 | import scala.collection.JavaConversions._
 10 | 
 11 | import org.apache.hadoop.io.Writable
 12 | import org.apache.hadoop.mapreduce.{InputSplit,Job,JobContext,Mapper,TaskAttemptContext}
 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
 15 | import parquet.hadoop.api.ReadSupport
 16 | import parquet.hadoop.{ParquetInputFormat,ParquetInputSplit,ParquetOutputFormat,ParquetRecordReader}
 17 | import parquet.hadoop.example.{ExampleOutputFormat,GroupReadSupport}
 18 | import parquet.hadoop.util.ContextUtil
 19 | import parquet.example.data.{Group,GroupWriter}
 20 | import parquet.example.data.simple.SimpleGroup
 21 | 
 22 | 
 23 | class CompactInputFormat[T](readSupportClass: Class[_ <: ReadSupport[T]]) extends ParquetInputFormat[T](readSupportClass) {
 24 | 
 25 |   // Our HDFS block size is 1024MB so we'll get pretty close.
 26 |   val TARGET = 1024 * 1024 * 1024 // 1024MB.
 27 | 
 28 |   override def getSplits(context: JobContext): JavaList[InputSplit] = {
 29 |     // Limit the splits to 100MB so it's easy to assemble them into 1024MB
 30 |     // chunks.  This is not actually reliable. Chunks can come back bigger than
 31 |     // 100MB, but it does limit the size of most chunks.
 32 |     val conf = ContextUtil.getConfiguration(context)
 33 |     conf.set("mapred.max.split.size", (100 * 1024 * 1024).toString)
 34 | 
 35 |     val splits = super.getSplits(conf, getFooters(context)).asScala.toList
 36 |     val m = if (splits.isEmpty) splits else mergeSplits(splits)
 37 |     m.asInstanceOf[List[InputSplit]].asJava
 38 |   }
 39 | 
 40 |   def mergeSplits(splits: List[ParquetInputSplit]): List[MergedInputSplit] = {
 41 |     val sizes = splits.map { _.getLength }
 42 |     println(s"""${splits.length} initial splits were generated.
 43 |                 |  Max: ${mb(sizes.max)}
 44 |                 |  Min: ${mb(sizes.min)}
 45 |                 |  Avg: ${mb(sizes.sum.toDouble / sizes.length)}""".stripMargin)
 46 | 
 47 |     // TODO: get a CS undergrad to give us better bin packing.
 48 |     var buckets = MutableList[MutableList[ParquetInputSplit]](MutableList(splits.head))
 49 |     splits.tail.foreach { split =>
 50 |       val bucket = buckets.minBy { b => b.map { _.getLength }.sum }
 51 |       if ((split.getLength + bucket.map { _.getLength }.sum) < TARGET) {
 52 |         bucket += split
 53 |       } else {
 54 |         buckets += MutableList(split)
 55 |       }
 56 |     }
 57 | 
 58 |     val newSizes = buckets.map { _.map { _.getLength }.sum }.toList
 59 |     println(s"""${buckets.length} merged splits were generated.
 60 |                 |  Max: ${mb(newSizes.max)}
 61 |                 |  Min: ${mb(newSizes.min)}
 62 |                 |  Avg: ${mb(newSizes.sum.toDouble / newSizes.length)}""".stripMargin)
 63 | 
 64 |     buckets.map { b => new MergedInputSplit(b.toList) }.toList
 65 |   }
 66 | 
 67 |   override def createRecordReader(split: InputSplit, context: TaskAttemptContext): MergedRecordReader[T] = {
 68 |     val readSupport = ParquetInputFormat.getReadSupportInstance[T](ContextUtil.getConfiguration(context))
 69 |     split match {
 70 |       case s: MergedInputSplit => new MergedRecordReader[T](s, context, readSupport)
 71 |       case _ => throw new Exception(s"Expected a MergedInputSplit. Found a $split.")
 72 |     }
 73 |   }
 74 | 
 75 |   // Helper for pretty-printing byte values.
 76 |   def mb(n: Double): String = {
 77 |     val K = 1024
 78 |     val M = K * K
 79 |     val G = K * M
 80 |     if (n < K) f"$n%.2fB"
 81 |     else if (n < M) f"${n / K}%.2fK"
 82 |     else if (n < G) f"${n / M}%.2fM"
 83 |     else f"${n / G}%.2fG"
 84 |   }
 85 | }
 86 | 
 87 | class MergedInputSplit(var splits: List[ParquetInputSplit]) extends InputSplit with Writable {
 88 |   def this() = this(List())
 89 | 
 90 |   var splitNumber = 0
 91 | 
 92 |   def currentSplit: ParquetInputSplit = splits(splitNumber)
 93 |   def nextSplit: Option[ParquetInputSplit] = {
 94 |     if (splitNumber < splits.length - 1) {
 95 |       splitNumber += 1
 96 |       Some(currentSplit)
 97 |     } else {
 98 |       None
 99 |     }
100 |   }
101 | 
102 |   // write and readFields are paired for serialization/deserialization.
103 |   override def write(out: DataOutput) = {
104 |     out.writeInt(splits.length)
105 |     splits.foreach { s => s.write(out) }
106 |   }
107 | 
108 |   override def readFields(in: DataInput) = {
109 |     val count = in.readInt
110 |     splits = for (i <- List.range(0, count)) yield {
111 |       val s = new ParquetInputSplit
112 |       s.readFields(in)
113 |       s
114 |     }
115 |   }
116 | 
117 |   override def getLength: Long = splits.map { _.getLength }.sum
118 |   override def getLocations: Array[String] = splits.flatMap { _.getLocations }.toArray
119 |   override def toString = "<MergedInputSplit splits:" + this.splits.length + ">"
120 | }
121 | 
122 | class MergedRecordReader[T](split: MergedInputSplit,
123 |                             taskContext: TaskAttemptContext,
124 |                             readSupport: ReadSupport[T]) extends ParquetRecordReader[T](readSupport) {
125 |   val totalLength = split.getLength
126 |   var progress = 0L
127 | 
128 |   override def initialize(split: InputSplit, context: TaskAttemptContext) {
129 |     super.initialize(split.asInstanceOf[MergedInputSplit].currentSplit, context)
130 |   }
131 | 
132 |   def startNextSplit(split: MergedInputSplit, context: TaskAttemptContext): Boolean = {
133 |     split.nextSplit match {
134 |       case Some(s) => {
135 |         super.initialize(s, context)
136 |         true
137 |       }
138 |       case None => false
139 |     }
140 |   }
141 | 
142 |   // nextKeyValue is used to ask for the next tuple and returns false when the
143 |   // recordReader has no more tuples. Since we're wrapping multiple splits, and
144 |   // therefore multiple record readers, we detect when the current inernal
145 |   // reader is done and move to the next reader.
146 |   override def nextKeyValue: Boolean = {
147 |     val next = super.nextKeyValue
148 |     if (next) {
149 |       next
150 |     } else {
151 |       super.close
152 |       progress += split.currentSplit.getLength
153 | 
154 |       if (startNextSplit(split, taskContext)) {
155 |         nextKeyValue
156 |       } else {
157 |         false
158 |       }
159 |     }
160 |   }
161 | 
162 |   override def toString = "<MergedRecordReader>"
163 |   override def getProgress: Float = progress / totalLength
164 | }
165 | 
166 | 
167 | class CompactGroupInputFormat extends CompactInputFormat[Group](classOf[GroupReadSupport]) { }
168 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/CompactJob.scala:
--------------------------------------------------------------------------------
  1 | package com.stripe.herringbone
  2 | 
  3 | import com.stripe.herringbone.util.ParquetUtils
  4 | 
  5 | import java.util.{List => JavaList}
  6 | import java.io.DataOutput
  7 | import java.io.DataInput
  8 | 
  9 | import scala.collection.mutable.MutableList
 10 | import scala.collection.JavaConverters._
 11 | 
 12 | import org.apache.hadoop.conf.{Configuration,Configured}
 13 | import org.apache.hadoop.fs.{FileSystem,Path}
 14 | import org.apache.hadoop.mapreduce.{Job,Mapper}
 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
 17 | import org.apache.hadoop.util.{Tool,ToolRunner}
 18 | 
 19 | import org.codehaus.jackson.map.ObjectMapper
 20 | import org.codehaus.jackson.`type`.TypeReference
 21 | 
 22 | import org.rogach.scallop.ScallopConf
 23 | 
 24 | import parquet.example.data.{Group,GroupWriter}
 25 | import parquet.hadoop.{BadConfigurationException,ParquetInputFormat,ParquetOutputFormat}
 26 | import parquet.hadoop.api.{DelegatingWriteSupport,WriteSupport}
 27 | import parquet.hadoop.api.WriteSupport.FinalizedWriteContext
 28 | import parquet.hadoop.example.{GroupReadSupport,GroupWriteSupport}
 29 | 
 30 | class ParquetCompactConf(arguments: Seq[String]) extends ScallopConf(arguments) {
 31 |   val inputPath = opt[String](required = true)
 32 |   val outputPath = opt[String](descr = "Default is input path with `-compact` appended")
 33 | }
 34 | 
 35 | class ParquetCompactWriteSupport extends DelegatingWriteSupport[Group](new GroupWriteSupport) {
 36 |   var extraMetadata: java.util.Map[String, String] = _
 37 | 
 38 |   override def init(configuration: Configuration): WriteSupport.WriteContext = {
 39 |     extractMetadata(configuration)
 40 |     super.init(configuration)
 41 |   }
 42 | 
 43 |   override def finalizeWrite(): FinalizedWriteContext = {
 44 |     new FinalizedWriteContext(extraMetadata)
 45 |   }
 46 | 
 47 |   def extractMetadata(configuration: Configuration) = {
 48 |     val metadataJson = configuration.get(ParquetCompactWriteSupport.ExtraMetadataKey)
 49 |     try {
 50 |       extraMetadata = new ObjectMapper().readValue(metadataJson, new TypeReference[java.util.Map[String,String]](){})
 51 |     } catch { case e: java.io.IOException =>
 52 |       throw new BadConfigurationException("Unable to deserialize extra extra metadata: " + metadataJson, e)
 53 |     }
 54 |   }
 55 | }
 56 | 
 57 | object ParquetCompactWriteSupport {
 58 |   val ExtraMetadataKey = "herringbone.compact.extrametadata"
 59 | }
 60 | 
 61 | class CompactJob extends Configured with Tool {
 62 |   override def run(arguments: Array[String]) = {
 63 |     val conf = new ParquetCompactConf(arguments)
 64 |     val inputPath = new Path(conf.inputPath())
 65 |     val fs = inputPath.getFileSystem(getConf)
 66 |     val outputPathString = conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-compact"))
 67 |     val outputPath = new Path(outputPathString)
 68 | 
 69 |     // Pass along metadata (which includes the thrift schema) to the results.
 70 |     val metadata = ParquetUtils.readKeyValueMetaData(inputPath)
 71 |     val metadataJson = new ObjectMapper().writeValueAsString(metadata)
 72 |     getConf.set(ParquetCompactWriteSupport.ExtraMetadataKey, metadataJson)
 73 | 
 74 |     if (fs.exists(outputPath)) {
 75 |       println(s"Deleting existing $outputPath")
 76 |       fs.delete(outputPath, true)
 77 |     }
 78 | 
 79 |     val job = new Job(getConf)
 80 | 
 81 |     FileInputFormat.setInputPaths(job, inputPath)
 82 |     FileOutputFormat.setOutputPath(job, outputPath)
 83 |     ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport])
 84 |     ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetCompactWriteSupport])
 85 |     GroupWriteSupport.setSchema(ParquetUtils.readSchema(inputPath), job.getConfiguration)
 86 | 
 87 |     job.setJobName("compact " + conf.inputPath() + " → " + outputPathString)
 88 |     job.setInputFormatClass(classOf[CompactGroupInputFormat]);
 89 |     job.setOutputFormatClass(classOf[ParquetOutputFormat[Group]])
 90 |     job.setMapperClass(classOf[Mapper[Void,Group,Void,Group]])
 91 |     job.setJarByClass(classOf[CompactJob])
 92 |     job.getConfiguration.setBoolean("mapreduce.job.user.classpath.first", true)
 93 |     job.getConfiguration.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false)
 94 |     job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false);
 95 |     job.setNumReduceTasks(0)
 96 | 
 97 |     if(job.waitForCompletion(true)) 0 else 1
 98 |   }
 99 | }
100 | 
101 | object CompactJob {
102 | 
103 |   def main(args: Array[String]) = {
104 |     val result = ToolRunner.run(new Configuration, new CompactJob, args)
105 |     System.exit(result)
106 |   }
107 | }
108 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/FlattenJob.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone
 2 | 
 3 | import com.stripe.herringbone.flatten.{ParquetFlatConf,ParquetFlatMapper,TypeFlattener}
 4 | import com.stripe.herringbone.flatten.FlatConverter
 5 | import com.stripe.herringbone.util.ParquetUtils
 6 | 
 7 | import org.apache.hadoop.mapreduce._
 8 | import org.apache.hadoop.mapreduce.lib.input._
 9 | import org.apache.hadoop.mapreduce.lib.output._
10 | import org.apache.hadoop.util._
11 | import org.apache.hadoop.fs._
12 | import org.apache.hadoop.conf._
13 | 
14 | import parquet.example.data._
15 | import parquet.example.data.simple._
16 | import parquet.hadoop._
17 | import parquet.hadoop.example._
18 | import parquet.io.api._
19 | import parquet.schema._
20 | 
21 | import org.rogach.scallop._
22 | 
23 | class FlattenMapper extends ParquetFlatMapper[Group] {
24 |   def valueOut(value: Group) = {
25 |     FlatConverter.flattenGroup(value, flattenedSchema, separator, renameId)
26 |   }
27 | }
28 | 
29 | class FlattenJob extends Configured with Tool {
30 |   override def run(args: Array[String]) = {
31 |     val conf = new ParquetFlatConf(args)
32 |     val fs = FileSystem.get(getConf)
33 |     val inputPath = new Path(conf.inputPath())
34 |     val outputPathString = conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-flat"))
35 |     val outputPath = new Path(outputPathString)
36 |     val previousPath = conf.previousPath.get.map{new Path(_)}
37 | 
38 |     val separator = conf.separator()
39 |     getConf.set(ParquetFlatMapper.SeparatorKey, separator)
40 | 
41 |     val renameId = conf.renameId()
42 |     getConf.set(ParquetFlatMapper.RenameIdKey, renameId.toString)
43 | 
44 |     if (fs.exists(outputPath)) {
45 |       println(s"Deleting existing $outputPath")
46 |       fs.delete(outputPath, true)
47 |     }
48 | 
49 |     val flattenedSchema = TypeFlattener.flatten(
50 |       ParquetUtils.readSchema(inputPath),
51 |       previousPath.map { ParquetUtils.readSchema(_) },
52 |       separator,
53 |       renameId
54 |     )
55 | 
56 |     val jobName = "flatten " + conf.inputPath() + " -> " + outputPathString
57 |     val job = new Job(getConf, jobName)
58 | 
59 |     FileInputFormat.setInputPaths(job, inputPath)
60 |     FileOutputFormat.setOutputPath(job, outputPath)
61 |     ExampleOutputFormat.setSchema(job, flattenedSchema)
62 |     ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport])
63 | 
64 |     job.setInputFormatClass(classOf[CompactGroupInputFormat]);
65 |     job.setOutputFormatClass(classOf[ExampleOutputFormat])
66 |     job.setMapperClass(classOf[FlattenMapper])
67 |     job.setJarByClass(classOf[FlattenJob])
68 |     job.getConfiguration.setBoolean("mapreduce.job.user.classpath.first", true)
69 |     job.getConfiguration.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false)
70 |     job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false);
71 |     job.setNumReduceTasks(0)
72 | 
73 |     if (job.waitForCompletion(true)) 0 else 1
74 |   }
75 | }
76 | 
77 | object FlattenJob {
78 |   def main(args: Array[String]) = {
79 |     val result = ToolRunner.run(new Configuration, new FlattenJob, args)
80 |     System.exit(result)
81 |   }
82 | }
83 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/ParquetLoad.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone
 2 | 
 3 | import com.stripe.herringbone.load._
 4 | 
 5 | import org.apache.hadoop.conf._
 6 | import org.apache.hadoop.util._
 7 | 
 8 | class ParquetLoad extends Configured with Tool {
 9 |   override def run(args: Array[String]): Int = {
10 |     val conf = new ParquetLoadConf(args)
11 |     val hadoopFs = new HadoopFs()
12 |     val fieldUtils = FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper)
13 | 
14 |     val loader: ParquetLoader = if (conf.hive()) {
15 |       HiveLoader(conf, hadoopFs, fieldUtils)
16 |     } else {
17 |       ImpalaLoader(conf, hadoopFs, fieldUtils)
18 |     }
19 | 
20 |     if (conf.updatePartitions()) {
21 |       val tableExists = loader.checkTableExists(conf.table(), conf.database())
22 | 
23 |       (conf.path.get, tableExists) match {
24 |         case (_, true) => loader.updateTable(conf.table(), conf.database())
25 |         case (Some(path), false) => loader.createTable(path, conf.table(), conf.database())
26 |         case (None, false) => {
27 |           println("ERROR - path not specified and table not yet created. Specify path from which to create the table")
28 |           return 1
29 |         }
30 |       }
31 |     } else {
32 |       loader.createTable(conf.path(), conf.table(), conf.database())
33 |     }
34 |     loader.closeConnection
35 | 
36 |     0
37 |   }
38 | }
39 | 
40 | object ParquetLoad {
41 |   def main(args: Array[String]) = {
42 |     val result = ToolRunner.run(new Configuration, new ParquetLoad, args)
43 |     System.exit(result)
44 |   }
45 | }
46 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/TsvJob.scala:
--------------------------------------------------------------------------------
  1 | package com.stripe.herringbone
  2 | 
  3 | import com.stripe.herringbone.flatten.{ParquetFlatConf,ParquetFlatMapper,TypeFlattener}
  4 | import com.stripe.herringbone.flatten.FlatConverter
  5 | import com.stripe.herringbone.util.ParquetUtils
  6 | 
  7 | import java.io.{BufferedWriter, OutputStreamWriter}
  8 | 
  9 | import org.apache.hadoop.mapreduce._
 10 | import org.apache.hadoop.mapreduce.lib.input._
 11 | import org.apache.hadoop.mapreduce.lib.output._
 12 | import org.apache.hadoop.util._
 13 | import org.apache.hadoop.fs._
 14 | import org.apache.hadoop.conf._
 15 | import org.apache.hadoop.io.Text
 16 | 
 17 | import org.rogach.scallop._
 18 | 
 19 | import parquet.example.data._
 20 | import parquet.example.data.simple._
 21 | import parquet.hadoop._
 22 | import parquet.hadoop.example._
 23 | import parquet.io.api._
 24 | import parquet.schema._
 25 | 
 26 | import scala.collection.JavaConversions._
 27 | 
 28 | class TsvMapper extends ParquetFlatMapper[Text] {
 29 |   def valueOut(value: Group) = {
 30 |     val tsvLine = FlatConverter.groupToTSV(value, flattenedSchema, separator, renameId)
 31 |     new Text(tsvLine)
 32 |   }
 33 | }
 34 | 
 35 | class TsvJob extends Configured with Tool {
 36 |   override def run(args: Array[String]) = {
 37 |     val conf = new ParquetFlatConf(args)
 38 |     val fs = FileSystem.get(getConf)
 39 |     val inputPath = new Path(conf.inputPath())
 40 |     val outputPathString = conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-tsv"))
 41 |     val outputPath = new Path(outputPathString)
 42 |     val previousPath = conf.previousPath.get.map{new Path(_)}
 43 | 
 44 |     val separator = conf.separator()
 45 |     getConf.set(ParquetFlatMapper.SeparatorKey, separator)
 46 | 
 47 |     val renameId = conf.renameId()
 48 |     getConf.set(ParquetFlatMapper.RenameIdKey, renameId.toString)
 49 | 
 50 |     if (fs.exists(outputPath)) {
 51 |       println(s"Deleting existing $outputPath")
 52 |       fs.delete(outputPath, true)
 53 |     }
 54 | 
 55 |     val flattenedSchema = TypeFlattener.flatten(
 56 |       ParquetUtils.readSchema(inputPath),
 57 |       previousPath.map { ParquetUtils.readSchema(_) },
 58 |       separator,
 59 |       renameId
 60 |     )
 61 | 
 62 |     val jobName = "tsv " + conf.inputPath() + " -> " + outputPathString
 63 |     val job = new Job(getConf, jobName)
 64 | 
 65 |     FileInputFormat.setInputPaths(job, inputPath)
 66 |     FileOutputFormat.setOutputPath(job, outputPath)
 67 |     ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport])
 68 |     ExampleOutputFormat.setSchema(job, flattenedSchema)
 69 | 
 70 |     job.setInputFormatClass(classOf[CompactGroupInputFormat])
 71 |     job.setOutputFormatClass(classOf[TextOutputFormat[Text, Text]].asInstanceOf[Class[Nothing]])
 72 |     job.setMapperClass(classOf[TsvMapper])
 73 |     job.setJarByClass(classOf[TsvJob])
 74 |     job.getConfiguration.set("mapreduce.job.user.classpath.first", "true")
 75 |     job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false)
 76 |     job.setNumReduceTasks(0)
 77 | 
 78 |     if (job.waitForCompletion(true)) {
 79 |       val headerPath = new Path(outputPathString + "/_header.tsv")
 80 |       writeHeader(fs, headerPath, flattenedSchema)
 81 |       0
 82 |     } else {
 83 |       1
 84 |     }
 85 |   }
 86 | 
 87 |   def writeHeader(fs: FileSystem, outputPath: Path, schema: MessageType) {
 88 |     val header = FlatConverter.constructHeader(schema)
 89 |     val writer = new BufferedWriter(new OutputStreamWriter(fs.create(outputPath, true)))
 90 |     writer.write(header)
 91 |     writer.write("\n")
 92 |     writer.close()
 93 |   }
 94 | }
 95 | 
 96 | object TsvJob {
 97 |   def main(args: Array[String]) = {
 98 |     val result = ToolRunner.run(new Configuration, new TsvJob, args)
 99 |     System.exit(result)
100 |   }
101 | }
102 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConsumer.scala:
--------------------------------------------------------------------------------
  1 | package com.stripe.herringbone.flatten
  2 | 
  3 | import org.apache.hadoop.mapreduce._
  4 | import org.apache.hadoop.mapreduce.lib.input._
  5 | import org.apache.hadoop.mapreduce.lib.output._
  6 | import org.apache.hadoop.util._
  7 | import org.apache.hadoop.fs._
  8 | import org.apache.hadoop.conf._
  9 | 
 10 | import parquet.example.data._
 11 | import parquet.example.data.simple._
 12 | import parquet.hadoop._
 13 | import parquet.hadoop.example._
 14 | import parquet.io.api._
 15 | import parquet.schema._
 16 | 
 17 | class FlatConsumer(output: Group, separator: String, renameId: Boolean) extends RecordConsumer {
 18 | 
 19 |   case class StackFrame(field: String, var values: List[Binary])
 20 |   var stack = List[StackFrame]()
 21 |   // Impala stops working after a field becomes too long. The docs
 22 |   // indicate that we should have 32k. However, a binary search on a
 23 |   // too-long field yielded 6776 as the maximum working value.
 24 |   val MaxStringBytes = 6776
 25 | 
 26 |   def startMessage {}
 27 |   def endMessage {}
 28 |   def startGroup {}
 29 |   def endGroup {}
 30 | 
 31 |   def startField(field: String, index: Int) {
 32 |     stack ::= StackFrame(field, Nil)
 33 |   }
 34 | 
 35 |   def endField(field: String, index: Int) {
 36 |     if (stack.head.values.size == 1) {
 37 |       withField{name => output.add(name, stack.head.values.head)}
 38 |     } else if (stack.head.values.size > 1) {
 39 |       withField {name =>
 40 |         val joined = Binary.fromString(
 41 |           stack
 42 |             .head
 43 |             .values
 44 |             .reverse
 45 |             .map{_.toStringUsingUTF8}
 46 |             .mkString(",")
 47 |             .replace("\t", " ")
 48 |         )
 49 |         val truncated = truncate(joined, MaxStringBytes)
 50 |         output.add(name, truncated)
 51 |       }
 52 |     }
 53 |     stack = stack.tail
 54 |   }
 55 | 
 56 |   def addInteger(value: Int) {
 57 |     writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
 58 |   }
 59 | 
 60 |   def addLong(value: Long) {
 61 |     writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
 62 |   }
 63 | 
 64 |   def addBoolean(value: Boolean) {
 65 |     writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
 66 |   }
 67 | 
 68 |   def truncate(value: Binary, length: Integer): Binary = {
 69 |     if (value.length <= length) {
 70 |       value
 71 |     } else {
 72 |       val bytesTruncated = new Array[Byte](length)
 73 |       value.toByteBuffer.get(bytesTruncated, 0, length)
 74 |       Binary.fromByteArray(bytesTruncated)
 75 |     }
 76 |   }
 77 | 
 78 |   def addBinary(value: Binary) {
 79 |     // Truncate strings so Impala doesn't break
 80 |     val truncated = truncate(value, MaxStringBytes)
 81 |     writeField(truncated){name => output.add(name, truncated)}
 82 |   }
 83 | 
 84 |   def addFloat(value: Float) {
 85 |     writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
 86 |   }
 87 | 
 88 |   def addDouble(value: Double) {
 89 |     writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
 90 |   }
 91 | 
 92 |   def withField(fn: String=>Unit) {
 93 |     val path = if (TypeFlattener.omitIdField(stack.head.field, stack.size, renameId))
 94 |       stack.tail
 95 |     else
 96 |       stack
 97 | 
 98 |     val name = path.reverse.map{_.field}.mkString(separator)
 99 |     if(output.getType.containsField(name))
100 |       fn(name)
101 |   }
102 | 
103 |   def writeField(binRep: =>Binary)(fn: String => Unit) {
104 |     withField{name =>
105 |       val fieldType = output.getType.getType(name)
106 |       if(fieldType.asInstanceOf[PrimitiveType].getPrimitiveTypeName == PrimitiveType.PrimitiveTypeName.BINARY)
107 |         stack.head.values ::= binRep
108 |       else
109 |         fn(name)
110 |     }
111 |   }
112 | }
113 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConverter.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.flatten
 2 | 
 3 | import org.apache.hadoop.fs.Path
 4 | import org.apache.hadoop.conf.Configuration
 5 | 
 6 | import parquet.example.data.Group
 7 | import parquet.example.data.GroupWriter
 8 | import parquet.example.data.simple.SimpleGroup
 9 | import parquet.schema.MessageType
10 | 
11 | import scala.collection.JavaConversions._
12 | 
13 | object FlatConverter {
14 |   def groupToTSV(group: Group, flatSchema: MessageType, separator: String, renameId: Boolean): String = {
15 |     val flatGroup = flattenGroup(group, flatSchema, separator, renameId)
16 |     val fieldValues = (0 until flatSchema.getFieldCount).map{ field =>
17 |       val valueCount = flatGroup.getFieldRepetitionCount(field)
18 |       if (valueCount == 0) {
19 |         ""
20 |       } else if (valueCount == 1) {
21 |         escapeString(flatGroup.getValueToString(field, 0))
22 |       } else {
23 |         escapeString(flatGroup.getValueToString(field, 0))
24 |         System.err.println("Warning: Field contains multiple values, extracting only the first")
25 |         System.err.println(flatGroup.toString)
26 |       }
27 |     }
28 |     fieldValues.mkString("\t")
29 |   }
30 | 
31 |   def constructHeader(schema: MessageType) = {
32 |     schema
33 |       .getPaths()
34 |       .toList
35 |       .map{_(0)}
36 |       .mkString("\t")
37 |   }
38 | 
39 |   def flattenGroup(group: Group, flatSchema: MessageType, separator: String, renameId: Boolean) = {
40 |     var flatGroup = new SimpleGroup(flatSchema)
41 |     val writer = new GroupWriter(new FlatConsumer(flatGroup, separator, renameId), group.getType)
42 |     writer.write(group)
43 |     flatGroup
44 |   }
45 | 
46 |   private def escapeString(s: String) = {
47 |     val quote = "\""
48 |     if (s.contains("\t"))
49 |       // This is how pandas escapes tabs and quotes
50 |       quote + s.replace(quote, "\"\"") + quote
51 |     else
52 |       s
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatConf.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.flatten
 2 | 
 3 | import org.rogach.scallop._
 4 | 
 5 | class ParquetFlatConf(arguments: Seq[String]) extends ScallopConf(arguments) {
 6 |   val inputPath = opt[String](required = true)
 7 |   val outputPath = opt[String](descr = "Default is input path with `-flat` or `-tsv` appended as appropriate")
 8 |   val previousPath = opt[String](descr = "Path of previously generated flat output, so field ordering can be maintained (optional)")
 9 |   val separator = opt[String](default = Some("__"))
10 |   val renameId = opt[Boolean](descr = "Flatten a.b.id as a__b instead of a__b__id")
11 | }
12 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatMapper.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.flatten
 2 | 
 3 | import org.apache.hadoop.mapreduce.Mapper
 4 | import parquet.example.data.Group
 5 | import parquet.schema.{MessageType,MessageTypeParser}
 6 | 
 7 | abstract class ParquetFlatMapper[ValueOut] extends Mapper[Void,Group,Void,ValueOut] {
 8 |   var flattenedSchema: MessageType = _
 9 |   var separator: String = _
10 |   var renameId: Boolean = _
11 | 
12 |   override def setup(context: Mapper[Void,Group,Void,ValueOut]#Context) {
13 |     // the schema is stored in the job context when we call ExampleOutputFormat.setSchema
14 |     flattenedSchema = MessageTypeParser.parseMessageType(context.getConfiguration.get("parquet.example.schema"))
15 |     separator = context.getConfiguration.get(ParquetFlatMapper.SeparatorKey)
16 |     renameId = context.getConfiguration.get(ParquetFlatMapper.RenameIdKey) == "true"
17 |   }
18 | 
19 |   override def map(key: Void, value: Group, context: Mapper[Void,Group,Void,ValueOut]#Context) {
20 |     context.write(key, valueOut(value))
21 |   }
22 | 
23 |   def valueOut(value: Group): ValueOut
24 | }
25 | 
26 | object ParquetFlatMapper {
27 |   val SeparatorKey = "herringbone.flatten.separator"
28 |   val RenameIdKey = "herringbone.flatten.rename.id"
29 | }
30 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/TypeFlattener.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.flatten
 2 | 
 3 | import parquet.schema._
 4 | import java.util.{List=>JList}
 5 | import scala.collection.JavaConverters._
 6 | 
 7 | class TypeFlattener(separator: String, renameId: Boolean) extends TypeConverter[List[Type]] {
 8 |     def convertPrimitiveType(path: JList[GroupType], primitiveType: PrimitiveType) = {
 9 |       val typeName =
10 |         if(TypeFlattener.isRepeated(primitiveType))
11 |           PrimitiveType.PrimitiveTypeName.BINARY
12 |         else
13 |           primitiveType.getPrimitiveTypeName
14 | 
15 |       val types = if (TypeFlattener.omitIdField(primitiveType.getName, path.size, renameId))
16 |         path.asScala.tail
17 |       else
18 |         (path.asScala.tail :+ primitiveType)
19 | 
20 |       val name = types.map{_.getName}.mkString(separator)
21 |       List(new PrimitiveType(Type.Repetition.OPTIONAL, typeName, primitiveType.getTypeLength, name))
22 |     }
23 | 
24 |     def convertGroupType(path: JList[GroupType], groupType: GroupType, children: JList[List[Type]]) = {
25 |       if(TypeFlattener.isRepeated(groupType))
26 |         Nil
27 |       else
28 |         flatten(children)
29 |     }
30 | 
31 |     def convertMessageType(messageType: MessageType, children: JList[List[Type]]) = flatten(children)
32 | 
33 |     def flatten(children: JList[List[Type]]) = children.asScala.flatten.toList
34 | }
35 | 
36 | object TypeFlattener {
37 |   def flatten(messageType: MessageType,
38 |     previousMessageType: Option[MessageType],
39 |     separator: String,
40 |     renameId: Boolean) = {
41 |     val flattened = messageType.convertWith(new TypeFlattener(separator, renameId))
42 |     val fieldsToUse = previousMessageType match {
43 |       case Some(prevMessageType) => {
44 |         // if passed a previous flattened schema, preserve that field ordering,
45 |         // and append any new fields
46 |         val prevFields = prevMessageType.getFields.asScala.toList
47 |         prevFields ::: flattened.filterNot{prevFields.contains(_)}
48 |       }
49 |       case None => flattened
50 |     }
51 |     new MessageType(messageType.getName, fieldsToUse.asJava)
52 |   }
53 | 
54 |   def isRepeated(t: Type) = t.isRepetition(Type.Repetition.REPEATED)
55 | 
56 |   def omitIdField(fieldName: String, numberOfFields: Integer, renameId: Boolean) = {
57 |     renameId && numberOfFields > 1 && (fieldName == "id" || fieldName == "_id")
58 |   }
59 | }
60 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/FieldUtils.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.load
 2 | 
 3 | import com.stripe.herringbone.util.ParquetUtils
 4 | 
 5 | import org.apache.hadoop.fs._
 6 | 
 7 | import parquet.schema.{ PrimitiveType, Type }
 8 | import parquet.schema.PrimitiveType.PrimitiveTypeName
 9 | import parquet.schema.PrimitiveType.PrimitiveTypeName._
10 | 
11 | import scala.collection.JavaConversions._
12 | 
13 | case class FieldUtils(hadoopFs: HadoopFs, schemaTypeMapper: SchemaTypeMapper) {
14 |   def findPartitionFields(path: Path) = {
15 |     hadoopFs.findPartitions(path).map {
16 |       case (name, example) if (example.forall{_.isDigit}) =>
17 |         "`%s` int".format(name)
18 |       case (name, _) =>
19 |         "`%s` string".format(name)
20 |     }
21 |   }
22 | 
23 |   def findTableFields(path: Path) = {
24 |     val schema = ParquetUtils.readSchema(path)
25 |     tableFieldsFromSchemaFields(schema.getFields)
26 |   }
27 | 
28 |   def tableFieldsFromSchemaFields(fields: Seq[Type]) = {
29 |     fields
30 |       .filter { f => f.isPrimitive }
31 |       .map { f =>
32 |         "`%s` %s".format(f.getName, schemaTypeMapper.getSchemaType(f.asInstanceOf[PrimitiveType].getPrimitiveTypeName))
33 |       }.toList
34 |   }
35 | }
36 | 
37 | trait SchemaTypeMapper {
38 |   def getSchemaType(pt: PrimitiveTypeName): String
39 | }
40 | 
41 | object ImpalaHiveSchemaTypeMapper extends SchemaTypeMapper {
42 |   def getSchemaType(pt: PrimitiveTypeName) = {
43 |     pt match {
44 |       case BINARY => "STRING"
45 |       case INT32 => "INT"
46 |       case INT64 | INT96 => "BIGINT"
47 |       case DOUBLE => "DOUBLE"
48 |       case BOOLEAN => "BOOLEAN"
49 |       case FLOAT => "FLOAT"
50 |       case FIXED_LEN_BYTE_ARRAY => "BINARY"
51 |     }
52 |   }
53 | }
54 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/HadoopFs.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.load
 2 | 
 3 | import com.stripe.herringbone.util.ParquetUtils
 4 | 
 5 | import org.apache.hadoop.conf._
 6 | import org.apache.hadoop.fs._
 7 | import org.apache.hadoop.util._
 8 | 
 9 | class HadoopFs {
10 |   def findAbsolutePath(path: Path) = {
11 |     path.getFileSystem(new Configuration).getFileStatus(path).getPath.toUri.toString
12 |   }
13 | 
14 |   def findSortedLeafPaths(path: Path): List[Path] =
15 |     findLeafPaths(path).sortBy{case (path,time) => time}.map{_._1}
16 | 
17 |   def findLeafPaths(path: Path): List[(Path,Long)] = {
18 |     val fs = path.getFileSystem(new Configuration)
19 |     val parquetFileStatuses = fs.listStatus(path, ParquetUtils.parquetFilter)
20 |     if (parquetFileStatuses.size > 0)
21 |       List((path, parquetFileStatuses.head.getModificationTime))
22 |     else {
23 |       fs.listStatus(path, ParquetUtils.partitionFilter)
24 |         .toList
25 |         .map{_.getPath}
26 |         .flatMap{findLeafPaths(_)}
27 |     }
28 |   }
29 | 
30 |   def findPartitions(path: Path) = {
31 |     path.toUri.getPath.split("/")
32 |       .filter{_.contains("=")}
33 |       .map{segment =>
34 |         val parts = segment.split("=")
35 |         (parts(0), parts(1))
36 |       }.toList
37 |   }
38 | }
39 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveLoader.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone
 2 | 
 3 | import com.stripe.herringbone.load._
 4 | 
 5 | import java.sql.ResultSet
 6 | 
 7 | import org.apache.hadoop.conf._
 8 | import org.apache.hadoop.fs._
 9 | import org.apache.hadoop.util._
10 | 
11 | case class HiveLoader(conf: ParquetLoadConf,
12 |   hadoopFs: HadoopFs,
13 |   fieldUtils: FieldUtils) extends ParquetLoader {
14 | 
15 |   val connection = HiveServer2Connection(conf.connectionUrl() + ":" + conf.connectionPort())
16 | 
17 |   def checkTableExists(table: String, database: String): Boolean = {
18 |     useDatabase(database)
19 |     var exists: Boolean = false
20 |     connection.executeQuery("SHOW TABLES") { resultSet =>
21 |       val existingTable = resultSet.getString(1).trim
22 |       if (existingTable == table)
23 |         exists = true
24 |     }
25 |     exists
26 |   }
27 | 
28 |   def createTable(pathString: String, table: String, database: String = "default") {
29 |     val path = new Path(pathString)
30 |     val location = hadoopFs.findAbsolutePath(path)
31 |     val leafPaths = hadoopFs.findSortedLeafPaths(path)
32 | 
33 |     if (leafPaths.isEmpty)
34 |       error("Could not find parquet files under " + path)
35 | 
36 |     val tableFields = fieldUtils.findTableFields(leafPaths.last)
37 |     val partitionFields = fieldUtils.findPartitionFields(leafPaths.last)
38 |     val tableWhileImporting = table + "__import"
39 | 
40 |     useDatabase(database)
41 | 
42 |     createTableWithPartitionFields(location, tableWhileImporting, tableFields, partitionFields)
43 | 
44 |     connection.execute("DROP TABLE IF EXISTS %s".format(table))
45 |     connection.execute("ALTER TABLE %s RENAME TO %s".format(tableWhileImporting, table))
46 | 
47 |     if (!partitionFields.isEmpty)
48 |       updateTable(table, database)
49 |   }
50 | 
51 |   def createTableWithPartitionFields(location: String, table: String, tableFields: List[String],
52 |     partitionFields: List[String]) {
53 | 
54 |     connection.execute("DROP TABLE IF EXISTS `%s`".format (table))
55 | 
56 |     val tableClause = "CREATE EXTERNAL TABLE IF NOT EXISTS `%s` (%s)".format(
57 |       table, tableFields.mkString(", "))
58 | 
59 |     val partitionClause =
60 |       if (partitionFields.isEmpty)
61 |         ""
62 |       else
63 |         " PARTITIONED BY (%s)".format(partitionFields.mkString(" ,"))
64 | 
65 |     val storedClause = " STORED AS PARQUET LOCATION \"%s\"".format(location)
66 | 
67 |     connection.execute(tableClause + partitionClause + storedClause)
68 |   }
69 | 
70 |   def updateTable(table: String, database: String) = {
71 |     connection.execute("MSCK REPAIR TABLE %s".format(table))
72 |   }
73 | 
74 |   def closeConnection() = connection.close
75 | 
76 |   private def useDatabase(database: String) = {
77 |     connection.execute("CREATE DATABASE IF NOT EXISTS %s".format(database))
78 |     connection.execute("USE %s".format(database))
79 |   }
80 | }
81 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveServer2Connection.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.load
 2 | 
 3 | import java.sql.{ Connection, DriverManager, ResultSet }
 4 | 
 5 | case class HiveServer2Connection(connectionUrl: String) {
 6 |   lazy val connection: Connection = {
 7 |     Class.forName("org.apache.hive.jdbc.HiveDriver")
 8 |     DriverManager.getConnection(connectionUrl)
 9 |   }
10 | 
11 |   def execute(query: String) {
12 |     try {
13 |       println(query)
14 |       val statement = connection.createStatement
15 |       statement.execute(query)
16 |     } catch {
17 |       case e: Throwable => e.printStackTrace
18 |     }
19 |   }
20 | 
21 |   def executeQuery(query: String)(fn: ResultSet => Unit) {
22 |     try {
23 |       println(query)
24 |       val statement = connection.createStatement
25 |       val resultSet = statement.executeQuery(query)
26 |       while (resultSet.next) {
27 |         fn(resultSet)
28 |       }
29 |     } catch {
30 |       case e: Throwable => e.printStackTrace
31 |     }
32 |   }
33 | 
34 |   def close = connection.close
35 | }
36 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/ImpalaLoader.scala:
--------------------------------------------------------------------------------
  1 | package com.stripe.herringbone.load
  2 | 
  3 | import com.stripe.herringbone.impala.{ImpalaClient,ImpalaValue}
  4 | 
  5 | import org.apache.hadoop.conf._
  6 | import org.apache.hadoop.util._
  7 | import org.apache.hadoop.fs._
  8 | 
  9 | case class ImpalaLoader(conf: ParquetLoadConf,
 10 |   hadoopFs: HadoopFs,
 11 |   fieldUtils: FieldUtils) extends ParquetLoader {
 12 | 
 13 |   lazy val impalaClient = ImpalaClient(conf.connectionUrl(),
 14 |     conf.connectionPort().toInt)
 15 | 
 16 |   def checkTableExists(table: String, database: String): Boolean = {
 17 |     useDatabase(database)
 18 |     var exists: Boolean = false
 19 |     query("SHOW TABLES"){row =>
 20 |       row.foreach { value =>
 21 |         if (value.raw == table) exists = true
 22 |       }
 23 |     }
 24 |     exists
 25 |   }
 26 | 
 27 |   def createTable(pathString: String, table: String, database: String = "default") {
 28 |     val path = new Path(pathString)
 29 |     val location = hadoopFs.findAbsolutePath(path)
 30 |     val leafPaths = hadoopFs.findSortedLeafPaths(path)
 31 | 
 32 |     if(leafPaths.isEmpty)
 33 |       error("Could not find parquet files under " + path)
 34 | 
 35 |     val tableFields = fieldUtils.findTableFields(leafPaths.last)
 36 |     val partitionFields = fieldUtils.findPartitionFields(leafPaths.last)
 37 | 
 38 |     useDatabase("importing")
 39 | 
 40 |     createTableWithPartitionFields(location, table, tableFields, partitionFields)
 41 | 
 42 |     if(partitionFields.size > 0)
 43 |       addPartitions(table, leafPaths.map{hadoopFs.findPartitions(_)})
 44 | 
 45 |     useDatabase(database)
 46 |     execute("DROP TABLE IF EXISTS %s.%s".format(database, table))
 47 |     execute("ALTER TABLE importing.%s RENAME TO %s.%s".format(table, database, table))
 48 |     if (partitionFields.isEmpty && conf.computeStats()) execute("COMPUTE STATS %s.%s".format(database, table))
 49 |   }
 50 | 
 51 |   def updateTable(table: String, database: String) {
 52 |     useDatabase(database)
 53 | 
 54 |     val basePath = findBasePath(table)
 55 |     val tablePartitions = findTablePartitions(table)
 56 |     val leafPaths = hadoopFs.findSortedLeafPaths(new Path(basePath))
 57 |     leafPaths.reverse.foreach{path =>
 58 |       val partitions = hadoopFs.findPartitions(path)
 59 |       if(!tablePartitions.contains(partitions.map{_._2}))
 60 |         addPartition(table, partitions)
 61 |     }
 62 |   }
 63 | 
 64 |   def findBasePath(table: String) = {
 65 |     var location: String = null
 66 |     query("DESCRIBE FORMATTED %s".format(table)){row =>
 67 |       if(row(0).raw.startsWith("Location:"))
 68 |         location = row(1).raw
 69 |     }
 70 |     location
 71 |   }
 72 | 
 73 |   def findTablePartitions(table: String) = {
 74 |     var partitions: List[List[String]] = Nil
 75 |     query("SHOW TABLE STATS %s".format(table)){row =>
 76 |       if(row.size > 4)
 77 |         partitions ::= List(row(0).raw)
 78 |     }
 79 |     partitions
 80 |   }
 81 | 
 82 |   def createTableWithPartitionFields(location: String, table: String, tableFields: List[String], partitionFields: List[String]) {
 83 |     execute("DROP TABLE IF EXISTS `%s`".format (table))
 84 | 
 85 |     val tableClause = "CREATE EXTERNAL TABLE IF NOT EXISTS `%s` (%s)".format(table, tableFields.mkString(", "))
 86 |     val partitionClause =
 87 |       if(partitionFields.isEmpty)
 88 |         ""
 89 |       else
 90 |         " PARTITIONED BY (%s)".format(partitionFields.mkString(" ,"))
 91 |     val storedClause = " STORED AS PARQUETFILE LOCATION \"%s\"".format(location)
 92 | 
 93 |     execute(tableClause + partitionClause + storedClause)
 94 |   }
 95 | 
 96 |   def addPartitions(table: String, partitions: List[List[(String, String)]]) {
 97 |     partitions.foreach{addPartition(table, _)}
 98 |   }
 99 | 
100 |   def addPartition(table: String, partitions: List[(String,String)]) {
101 |     val partitionClause =
102 |       partitions.map {
103 |         case (name, value) if(value.forall{_.isDigit}) =>
104 |           "`%s`=%s".format(name, value)
105 |          case (name, value) =>
106 |           "`%s`='%s'".format(name, value)
107 |         }.mkString(", ")
108 | 
109 |     execute("ALTER TABLE %s ADD IF NOT EXISTS PARTITION (%s)".format(table, partitionClause))
110 |   }
111 | 
112 |   def closeConnection() = {}
113 | 
114 |   private def useDatabase(database: String) = {
115 |     execute("CREATE DATABASE IF NOT EXISTS %s".format(database))
116 |     execute("USE %s".format(database))
117 |   }
118 | 
119 |   private def execute(stmt: String) {
120 |     impalaClient.execute(stmt)
121 |   }
122 | 
123 |   private def query(stmt: String)(fn: Seq[ImpalaValue] => Unit) {
124 |     impalaClient.query(stmt){ r =>  fn(r) }
125 |   }
126 | }
127 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoadConf.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.load
 2 | 
 3 | import org.rogach.scallop._
 4 | 
 5 | class ParquetLoadConf(arguments: Seq[String]) extends ScallopConf(arguments) {
 6 |   val database = opt[String](default = Some("default"))
 7 |   val table = opt[String](required = true)
 8 |   val path = opt[String]()
 9 |   val hive = opt[Boolean]("hive")
10 |   val connectionUrl = opt[String](required = true)
11 |   val connectionPort = opt[String](required = true)
12 |   val computeStats = toggle(descrYes = "Compute table stats after loading files into impala. Turn this off for faster loading into impala (but probably slower querying later on!)", default = Some(true))
13 |   val updatePartitions = toggle(descrYes = "Create table if not present, otherwise update with new partitions. If a schema change is being made to an existing table, turn this off.", default = Some(false))
14 |   validateOpt (path, updatePartitions) {
15 |     case (None, None) => Left("You must specify at least one of path or update-partitions")
16 |     case _ => Right(Unit)
17 |   }
18 | }
19 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoader.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.load
 2 | 
 3 | trait ParquetLoader {
 4 |   def checkTableExists(table: String, db: String): Boolean
 5 |   def updateTable(table: String, db: String): Unit
 6 |   def createTable(path: String, table: String, db: String): Unit
 7 |   def closeConnection(): Unit
 8 | }
 9 | 
10 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/util/ParquetUtils.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.util
 2 | 
 3 | import org.apache.hadoop.conf._
 4 | import org.apache.hadoop.util._
 5 | import org.apache.hadoop.fs._
 6 | 
 7 | import parquet.hadoop.ParquetFileReader
 8 | 
 9 | object ParquetUtils {
10 |   def getParquetMetadata(path: Path) = {
11 |     // Just use the first parquet file to figure out the impala fields
12 |     // This also dodges the problem of any non-parquet files stashed
13 |     // in the path.
14 |     val fs = path.getFileSystem(new Configuration)
15 |     val parquetFileStatuses = fs.listStatus(path, parquetFilter)
16 |     val representativeParquetPath = parquetFileStatuses.head.getPath
17 | 
18 |     val footers = ParquetFileReader.readFooters(new Configuration, representativeParquetPath)
19 |     footers.get(0).getParquetMetadata
20 |   }
21 | 
22 |   def readSchema(path: Path) = {
23 |     getParquetMetadata(path).getFileMetaData.getSchema
24 |   }
25 | 
26 |   def readKeyValueMetaData(path: Path) = {
27 |     getParquetMetadata(path).getFileMetaData.getKeyValueMetaData
28 |   }
29 | 
30 |   val parquetFilter = new PathFilter {
31 |     def accept(path: Path) = path.getName.endsWith(".parquet")
32 |   }
33 | 
34 |   val partitionFilter = new PathFilter {
35 |     def accept(path: Path) = path.getName.contains("=")
36 |   }
37 | }
38 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/ImpalaService.thrift:
--------------------------------------------------------------------------------
  1 | // Copyright 2012 Cloudera Inc.
  2 | //
  3 | // Licensed under the Apache License, Version 2.0 (the "License");
  4 | // you may not use this file except in compliance with the License.
  5 | // You may obtain a copy of the License at
  6 | //
  7 | // http://www.apache.org/licenses/LICENSE-2.0
  8 | //
  9 | // Unless required by applicable law or agreed to in writing, software
 10 | // distributed under the License is distributed on an "AS IS" BASIS,
 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | // See the License for the specific language governing permissions and
 13 | // limitations under the License.
 14 | 
 15 | namespace cpp impala
 16 | namespace java com.cloudera.impala.thrift
 17 | namespace rb impala.protocol
 18 | 
 19 | include "Status.thrift"
 20 | include "beeswax.thrift"
 21 | include "cli_service.thrift"
 22 | 
 23 | // ImpalaService accepts query execution options through beeswax.Query.configuration in
 24 | // key:value form. For example, the list of strings could be:
 25 | //     "num_nodes:1", "abort_on_error:false"
 26 | // The valid keys are listed in this enum. They map to TQueryOptions.
 27 | // Note: If you add an option or change the default, you also need to update:
 28 | // - ImpalaInternalService.thrift: TQueryOptions
 29 | // - ImpaladClientExecutor.getBeeswaxQueryConfigurations()
 30 | // - ImpalaServer::SetQueryOptions()
 31 | // - ImpalaServer::TQueryOptionsToMap()
 32 | enum TImpalaQueryOptions {
 33 |   // if true, abort execution on the first error
 34 |   ABORT_ON_ERROR,
 35 | 
 36 |   // maximum # of errors to be reported; Unspecified or 0 indicates backend default
 37 |   MAX_ERRORS,
 38 | 
 39 |   // if true, disable llvm codegen
 40 |   DISABLE_CODEGEN,
 41 | 
 42 |   // batch size to be used by backend; Unspecified or a size of 0 indicates backend
 43 |   // default
 44 |   BATCH_SIZE,
 45 | 
 46 |   // a per-machine approximate limit on the memory consumption of this query;
 47 |   // unspecified or a limit of 0 means no limit;
 48 |   // otherwise specified either as:
 49 |   // a) an int (= number of bytes);
 50 |   // b) a float followed by "M" (MB) or "G" (GB)
 51 |   MEM_LIMIT,
 52 | 
 53 |   // specifies the degree of parallelism with which to execute the query;
 54 |   // 1: single-node execution
 55 |   // NUM_NODES_ALL: executes on all nodes that contain relevant data
 56 |   // NUM_NODES_ALL_RACKS: executes on one node per rack that holds relevant data
 57 |   // > 1: executes on at most that many nodes at any point in time (ie, there can be
 58 |   //      more nodes than numNodes with plan fragments for this query, but at most
 59 |   //      numNodes would be active at any point in time)
 60 |   // Constants (NUM_NODES_ALL, NUM_NODES_ALL_RACKS) are defined in JavaConstants.thrift.
 61 |   NUM_NODES,
 62 | 
 63 |   // maximum length of the scan range; only applicable to HDFS scan range; Unspecified or
 64 |   // a length of 0 indicates backend default;
 65 |   MAX_SCAN_RANGE_LENGTH,
 66 | 
 67 |   // Maximum number of io buffers (per disk)
 68 |   MAX_IO_BUFFERS,
 69 | 
 70 |   // Number of scanner threads.
 71 |   NUM_SCANNER_THREADS,
 72 | 
 73 |   // If true, Impala will try to execute on file formats that are not fully supported yet
 74 |   ALLOW_UNSUPPORTED_FORMATS,
 75 | 
 76 |   // if set and > -1, specifies the default limit applied to a top-level SELECT statement
 77 |   // with an ORDER BY but without a LIMIT clause (ie, if the SELECT statement also has
 78 |   // a LIMIT clause, this default is ignored)
 79 |   DEFAULT_ORDER_BY_LIMIT,
 80 | 
 81 |   // DEBUG ONLY:
 82 |   // If set to
 83 |   //   "[<backend number>:]<node id>:<TExecNodePhase>:<TDebugAction>",
 84 |   // the exec node with the given id will perform the specified action in the given
 85 |   // phase. If the optional backend number (starting from 0) is specified, only that
 86 |   // backend instance will perform the debug action, otherwise all backends will behave
 87 |   // in that way.
 88 |   // If the string doesn't have the required format or if any of its components is
 89 |   // invalid, the option is ignored.
 90 |   DEBUG_ACTION,
 91 | 
 92 |   // If true, raise an error when the DEFAULT_ORDER_BY_LIMIT has been reached.
 93 |   ABORT_ON_DEFAULT_LIMIT_EXCEEDED,
 94 | 
 95 |   // Compression codec for parquet when inserting into parquet tables.
 96 |   // Valid values are "snappy", "gzip" and "none"
 97 |   // Leave blank to use default.
 98 |   PARQUET_COMPRESSION_CODEC,
 99 | 
100 |   // HBase scan query option. If set and > 0, HBASE_CACHING is the value for
101 |   // "hbase.client.Scan.setCaching()" when querying HBase table. Otherwise, use backend
102 |   // default.
103 |   // If the value is too high, then the hbase region server will have a hard time (GC
104 |   // pressure and long response times). If the value is too small, then there will be
105 |   // extra trips to the hbase region server.
106 |   HBASE_CACHING,
107 | 
108 |   // HBase scan query option. If set, HBase scan will always set
109 |   // "hbase.client.setCacheBlocks" to CACHE_BLOCKS. Default is false.
110 |   // If the table is large and the query is doing big scan, set it to false to
111 |   // avoid polluting the cache in the hbase region server.
112 |   // If the table is small and the table is used several time, set it to true to improve
113 |   // performance.
114 |   HBASE_CACHE_BLOCKS,
115 | }
116 | 
117 | // The summary of an insert.
118 | struct TInsertResult {
119 |   // Number of appended rows per modified partition. Only applies to HDFS tables.
120 |   // The keys represent partitions to create, coded as k1=v1/k2=v2/k3=v3..., with the
121 |   // root in an unpartitioned table being the empty string.
122 |   1: required map<string, i64> rows_appended
123 | }
124 | 
125 | // Response from a call to PingImpalaService
126 | struct TPingImpalaServiceResp {
127 |   // The Impala service's version string.
128 |   1: string version
129 | }
130 | 
131 | // Parameters for a ResetTable request which will invalidate a table's metadata.
132 | // DEPRECATED.
133 | struct TResetTableReq {
134 |   // Name of the table's parent database.
135 |   1: required string db_name
136 | 
137 |   // Name of the table.
138 |   2: required string table_name
139 | }
140 | 
141 | // For all rpc that return a TStatus as part of their result type,
142 | // if the status_code field is set to anything other than OK, the contents
143 | // of the remainder of the result type is undefined (typically not set)
144 | service ImpalaService extends beeswax.BeeswaxService {
145 |   // Cancel execution of query. Returns RUNTIME_ERROR if query_id
146 |   // unknown.
147 |   // This terminates all threads running on behalf of this query at
148 |   // all nodes that were involved in the execution.
149 |   // Throws BeeswaxException if the query handle is invalid (this doesn't
150 |   // necessarily indicate an error: the query might have finished).
151 |   Status.TStatus Cancel(1:beeswax.QueryHandle query_id)
152 |       throws(1:beeswax.BeeswaxException error);
153 | 
154 |   // Invalidates all catalog metadata, forcing a reload
155 |   // DEPRECATED; execute query "invalidate metadata" to refresh metadata
156 |   Status.TStatus ResetCatalog();
157 | 
158 |   // Invalidates a specific table's catalog metadata, forcing a reload on the next access
159 |   // DEPRECATED; execute query "refresh <table>" to refresh metadata
160 |   Status.TStatus ResetTable(1:TResetTableReq request)
161 | 
162 |   // Returns the runtime profile string for the given query handle.
163 |   string GetRuntimeProfile(1:beeswax.QueryHandle query_id)
164 |       throws(1:beeswax.BeeswaxException error);
165 | 
166 |   // Closes the query handle and return the result summary of the insert.
167 |   TInsertResult CloseInsert(1:beeswax.QueryHandle handle)
168 |       throws(1:beeswax.QueryNotFoundException error, 2:beeswax.BeeswaxException error2);
169 | 
170 |   // Client calls this RPC to verify that the server is an ImpalaService. Returns the
171 |   // server version.
172 |   TPingImpalaServiceResp PingImpalaService();
173 | }
174 | 
175 | // Impala HiveServer2 service
176 | service ImpalaHiveServer2Service extends cli_service.TCLIService {
177 | }
178 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/Status.thrift:
--------------------------------------------------------------------------------
 1 | // Copyright 2012 Cloudera Inc.
 2 | //
 3 | // Licensed under the Apache License, Version 2.0 (the "License");
 4 | // you may not use this file except in compliance with the License.
 5 | // You may obtain a copy of the License at
 6 | //
 7 | // http://www.apache.org/licenses/LICENSE-2.0
 8 | //
 9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | namespace cpp impala
16 | namespace java com.cloudera.impala.thrift
17 | namespace rb impala.protocol
18 | 
19 | enum TStatusCode {
20 |   OK,
21 |   CANCELLED,
22 |   ANALYSIS_ERROR,
23 |   NOT_IMPLEMENTED_ERROR,
24 |   RUNTIME_ERROR,
25 |   MEM_LIMIT_EXCEEDED,
26 |   INTERNAL_ERROR
27 | }
28 | 
29 | struct TStatus {
30 |   1: required TStatusCode status_code
31 |   2: list<string> error_msgs
32 | }
33 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/beeswax.thrift:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to Cloudera, Inc. under one
  3 |  * or more contributor license agreements.  See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership.  Cloudera, Inc. licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License.  You may obtain a copy of the License at
  9 |  *
 10 |  *     http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing, software
 13 |  * distributed under the License is distributed on an "AS IS" BASIS,
 14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 15 |  * See the License for the specific language governing permissions and
 16 |  * limitations under the License.
 17 |  *
 18 |  * Interface for interacting with Beeswax Server
 19 |  */
 20 | 
 21 | namespace java com.cloudera.beeswax.api
 22 | namespace py beeswaxd
 23 | namespace cpp beeswax
 24 | namespace rb impala.protocol.beeswax
 25 | 
 26 | include "hive_metastore.thrift"
 27 | 
 28 | // A Query
 29 | struct Query {
 30 |   1: string query;
 31 |   // A list of HQL commands to execute before the query.
 32 |   // This is typically defining UDFs, setting settings, and loading resources.
 33 |   3: list<string> configuration;
 34 | 
 35 |   // User and groups to "act as" for purposes of Hadoop.
 36 |   4: string hadoop_user;
 37 | }
 38 | 
 39 | typedef string LogContextId
 40 | 
 41 | enum QueryState {
 42 |   CREATED,
 43 |   INITIALIZED,
 44 |   COMPILED,
 45 |   RUNNING,
 46 |   FINISHED,
 47 |   EXCEPTION
 48 | }
 49 | 
 50 | struct QueryHandle {
 51 |   1: string id;
 52 |   2: LogContextId log_context;
 53 | }
 54 | 
 55 | struct QueryExplanation {
 56 |   1: string textual
 57 | }
 58 | 
 59 | struct Results {
 60 |   // If set, data is valid.  Otherwise, results aren't ready yet.
 61 |   1: bool ready,
 62 |   // Columns for the results
 63 |   2: list<string> columns,
 64 |   // A set of results
 65 |   3: list<string> data,
 66 |   // The starting row of the results
 67 |   4: i64 start_row,
 68 |   // Whether there are more results to fetch
 69 |   5: bool has_more
 70 | }
 71 | 
 72 | /**
 73 |  * Metadata information about the results.
 74 |  * Applicable only for SELECT.
 75 |  */
 76 | struct ResultsMetadata {
 77 |   /** The schema of the results */
 78 |   1: hive_metastore.Schema schema,
 79 |   /** The directory containing the results. Not applicable for partition table. */
 80 |   2: string table_dir,
 81 |   /** If the results are straight from an existing table, the table name. */
 82 |   3: string in_tablename,
 83 |   /** Field delimiter */
 84 |   4: string delim,
 85 | }
 86 | 
 87 | exception BeeswaxException {
 88 |   1: string message,
 89 |   // Use get_log(log_context) to retrieve any log related to this exception
 90 |   2: LogContextId log_context,
 91 |   // (Optional) The QueryHandle that caused this exception
 92 |   3: QueryHandle handle,
 93 |   4: optional i32 errorCode = 0,
 94 |   5: optional string SQLState = "     "
 95 | }
 96 | 
 97 | exception QueryNotFoundException {
 98 | }
 99 | 
100 | /** Represents a Hadoop-style configuration variable. */
101 | struct ConfigVariable {
102 |   1: string key,
103 |   2: string value,
104 |   3: string description
105 | }
106 | 
107 | service BeeswaxService {
108 |   /**
109 |    * Submit a query and return a handle (QueryHandle). The query runs asynchronously.
110 |    */
111 |   QueryHandle query(1:Query query) throws(1:BeeswaxException error),
112 | 
113 |   /**
114 |    * run a query synchronously and return a handle (QueryHandle).
115 |    */
116 |   QueryHandle executeAndWait(1:Query query, 2:LogContextId clientCtx)
117 |                         throws(1:BeeswaxException error),
118 | 
119 |   /**
120 |    * Get the query plan for a query.
121 |    */
122 |   QueryExplanation explain(1:Query query)
123 |                         throws(1:BeeswaxException error),
124 | 
125 |   /**
126 |    * Get the results of a query. This is non-blocking. Caller should check
127 |    * Results.ready to determine if the results are in yet. The call requests
128 |    * the batch size of fetch.
129 |    */
130 |   Results fetch(1:QueryHandle query_id, 2:bool start_over, 3:i32 fetch_size=-1)
131 |               throws(1:QueryNotFoundException error, 2:BeeswaxException error2),
132 | 
133 |   /**
134 |    * Get the state of the query
135 |    */
136 |   QueryState get_state(1:QueryHandle handle) throws(1:QueryNotFoundException error),
137 | 
138 |   /**
139 |    * Get the result metadata
140 |    */
141 |   ResultsMetadata get_results_metadata(1:QueryHandle handle)
142 |                                     throws(1:QueryNotFoundException error),
143 | 
144 |   /**
145 |    * Used to test connection to server.  A "noop" command.
146 |    */
147 |   string echo(1:string s)
148 | 
149 |   /**
150 |    * Returns a string representation of the configuration object being used.
151 |    * Handy for debugging.
152 |    */
153 |   string dump_config()
154 | 
155 |   /**
156 |    * Get the log messages related to the given context.
157 |    */
158 |   string get_log(1:LogContextId context) throws(1:QueryNotFoundException error)
159 | 
160 |   /*
161 |    * Returns "default" configuration.
162 |    */
163 |   list<ConfigVariable> get_default_configuration(1:bool include_hadoop)
164 | 
165 |   /*
166 |    * closes the query with given handle
167 |    */
168 |   void close(1:QueryHandle handle) throws(1:QueryNotFoundException error,
169 |                             2:BeeswaxException error2)
170 | 
171 |   /*
172 |    * clean the log context for given id
173 |    */
174 |   void clean(1:LogContextId log_context)
175 | }
176 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/fb303.thrift:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Licensed to the Apache Software Foundation (ASF) under one
  3 |  * or more contributor license agreements. See the NOTICE file
  4 |  * distributed with this work for additional information
  5 |  * regarding copyright ownership. The ASF licenses this file
  6 |  * to you under the Apache License, Version 2.0 (the
  7 |  * "License"); you may not use this file except in compliance
  8 |  * with the License. You may obtain a copy of the License at
  9 |  *
 10 |  *   http://www.apache.org/licenses/LICENSE-2.0
 11 |  *
 12 |  * Unless required by applicable law or agreed to in writing,
 13 |  * software distributed under the License is distributed on an
 14 |  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 15 |  * KIND, either express or implied. See the License for the
 16 |  * specific language governing permissions and limitations
 17 |  * under the License.
 18 |  */
 19 | 
 20 | /**
 21 |  * fb303.thrift
 22 |  */
 23 | 
 24 | namespace java com.facebook.fb303
 25 | namespace cpp facebook.fb303
 26 | namespace rb Impala.Protocol.fb303
 27 | 
 28 | /**
 29 |  * Common status reporting mechanism across all services
 30 |  */
 31 | enum fb_status {
 32 |   DEAD = 0,
 33 |   STARTING = 1,
 34 |   ALIVE = 2,
 35 |   STOPPING = 3,
 36 |   STOPPED = 4,
 37 |   WARNING = 5,
 38 | }
 39 | 
 40 | /**
 41 |  * Standard base service
 42 |  */
 43 | service FacebookService {
 44 | 
 45 |   /**
 46 |    * Returns a descriptive name of the service
 47 |    */
 48 |   string getName(),
 49 | 
 50 |   /**
 51 |    * Returns the version of the service
 52 |    */
 53 |   string getVersion(),
 54 | 
 55 |   /**
 56 |    * Gets the status of this service
 57 |    */
 58 |   fb_status getStatus(),
 59 | 
 60 |   /**
 61 |    * User friendly description of status, such as why the service is in
 62 |    * the dead or warning state, or what is being started or stopped.
 63 |    */
 64 |   string getStatusDetails(),
 65 | 
 66 |   /**
 67 |    * Gets the counters for this service
 68 |    */
 69 |   map<string, i64> getCounters(),
 70 | 
 71 |   /**
 72 |    * Gets the value of a single counter
 73 |    */
 74 |   i64 getCounter(1: string key),
 75 | 
 76 |   /**
 77 |    * Sets an option
 78 |    */
 79 |   void setOption(1: string key, 2: string value),
 80 | 
 81 |   /**
 82 |    * Gets an option
 83 |    */
 84 |   string getOption(1: string key),
 85 | 
 86 |   /**
 87 |    * Gets all options
 88 |    */
 89 |   map<string, string> getOptions(),
 90 | 
 91 |   /**
 92 |    * Returns a CPU profile over the given time interval (client and server
 93 |    * must agree on the profile format).
 94 |    */
 95 |   string getCpuProfile(1: i32 profileDurationInSec),
 96 | 
 97 |   /**
 98 |    * Returns the unix time that the server has been running since
 99 |    */
100 |   i64 aliveSince(),
101 | 
102 |   /**
103 |    * Tell the server to reload its configuration, reopen log files, etc
104 |    */
105 |   oneway void reinitialize(),
106 | 
107 |   /**
108 |    * Suggest a shutdown to the server
109 |    */
110 |   oneway void shutdown(),
111 | 
112 | }
113 | 


--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/hive_metastore.thrift:
--------------------------------------------------------------------------------
  1 | #!/usr/local/bin/thrift -java
  2 | 
  3 | /**
  4 |  * Licensed to the Apache Software Foundation (ASF) under one
  5 |  * or more contributor license agreements.  See the NOTICE file
  6 |  * distributed with this work for additional information
  7 |  * regarding copyright ownership.  The ASF licenses this file
  8 |  * to you under the Apache License, Version 2.0 (the
  9 |  * "License"); you may not use this file except in compliance
 10 |  * with the License.  You may obtain a copy of the License at
 11 |  *
 12 |  *     http://www.apache.org/licenses/LICENSE-2.0
 13 |  *
 14 |  * Unless required by applicable law or agreed to in writing, software
 15 |  * distributed under the License is distributed on an "AS IS" BASIS,
 16 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 |  * See the License for the specific language governing permissions and
 18 |  * limitations under the License.
 19 |  */
 20 | 
 21 | #
 22 | # Thrift Service that the MetaStore is built on
 23 | #
 24 | 
 25 | include "fb303.thrift"
 26 | 
 27 | namespace java org.apache.hadoop.hive.metastore.api
 28 | namespace php metastore
 29 | namespace cpp Apache.Hadoop.Hive
 30 | namespace rb Impala.Protocol.HiveMetastore
 31 | 
 32 | const string DDL_TIME = "transient_lastDdlTime"
 33 | 
 34 | struct Version {
 35 |   1: string version,
 36 |   2: string comments
 37 | }
 38 | 
 39 | struct FieldSchema {
 40 |   1: string name, // name of the field
 41 |   2: string type, // type of the field. primitive types defined above, specify list<TYPE_NAME>, map<TYPE_NAME, TYPE_NAME> for lists & maps
 42 |   3: string comment
 43 | }
 44 | 
 45 | struct Type {
 46 |   1: string          name,             // one of the types in PrimitiveTypes or CollectionTypes or User defined types
 47 |   2: optional string type1,            // object type if the name is 'list' (LIST_TYPE), key type if the name is 'map' (MAP_TYPE)
 48 |   3: optional string type2,            // val type if the name is 'map' (MAP_TYPE)
 49 |   //4: optional list<FieldSchema> fields // if the name is one of the user defined types
 50 | }
 51 | 
 52 | enum HiveObjectType {
 53 |   GLOBAL = 1,
 54 |   DATABASE = 2,
 55 |   TABLE = 3,
 56 |   PARTITION = 4,
 57 |   COLUMN = 5,
 58 | }
 59 | 
 60 | enum PrincipalType {
 61 |   USER = 1,
 62 |   ROLE = 2,
 63 |   GROUP = 3,
 64 | }
 65 | 
 66 | const string HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__"
 67 | const string HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__"
 68 | const string HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__"
 69 | 
 70 | enum PartitionEventType {
 71 |   LOAD_DONE = 1,
 72 | }
 73 | 
 74 | struct HiveObjectRef{
 75 |   1: HiveObjectType objectType,
 76 |   2: string dbName,
 77 |   3: string objectName,
 78 |   4: list<string> partValues,
 79 |   5: string columnName,
 80 | }
 81 | 
 82 | struct PrivilegeGrantInfo {
 83 |   1: string privilege,
 84 |   2: i32 createTime,
 85 |   3: string grantor,
 86 |   4: PrincipalType grantorType,
 87 |   5: bool grantOption,
 88 | }
 89 | 
 90 | struct HiveObjectPrivilege {
 91 |   1: HiveObjectRef  hiveObject,
 92 |   2: string principalName,
 93 |   3: PrincipalType principalType,
 94 |   4: PrivilegeGrantInfo grantInfo,
 95 | }
 96 | 
 97 | struct PrivilegeBag {
 98 |   1: list<HiveObjectPrivilege> privileges,
 99 | }
100 | 
101 | struct PrincipalPrivilegeSet {
102 |   1: map<string, list<PrivilegeGrantInfo>> userPrivileges, // user name -> privilege grant info
103 |   2: map<string, list<PrivilegeGrantInfo>> groupPrivileges, // group name -> privilege grant info
104 |   3: map<string, list<PrivilegeGrantInfo>> rolePrivileges, //role name -> privilege grant info
105 | }
106 | 
107 | struct Role {
108 |   1: string roleName,
109 |   2: i32 createTime,
110 |   3: string ownerName,
111 | }
112 | 
113 | // namespace for tables
114 | struct Database {
115 |   1: string name,
116 |   2: string description,
117 |   3: string locationUri,
118 |   4: map<string, string> parameters, // properties associated with the database
119 |   5: optional PrincipalPrivilegeSet privileges
120 | }
121 | 
122 | // This object holds the information needed by SerDes
123 | struct SerDeInfo {
124 |   1: string name,                   // name of the serde, table name by default
125 |   2: string serializationLib,       // usually the class that implements the extractor & loader
126 |   3: map<string, string> parameters // initialization parameters
127 | }
128 | 
129 | // sort order of a column (column name along with asc(1)/desc(0))
130 | struct Order {
131 |   1: string col,  // sort column name
132 |   2: i32    order // asc(1) or desc(0)
133 | }
134 | 
135 | // this object holds all the information about physical storage of the data belonging to a table
136 | struct StorageDescriptor {
137 |   1: list<FieldSchema> cols,  // required (refer to types defined above)
138 |   2: string location,         // defaults to <warehouse loc>/<db loc>/tablename
139 |   3: string inputFormat,      // SequenceFileInputFormat (binary) or TextInputFormat`  or custom format
140 |   4: string outputFormat,     // SequenceFileOutputFormat (binary) or IgnoreKeyTextOutputFormat or custom format
141 |   5: bool   compressed,       // compressed or not
142 |   6: i32    numBuckets,       // this must be specified if there are any dimension columns
143 |   7: SerDeInfo    serdeInfo,  // serialization and deserialization information
144 |   8: list<string> bucketCols, // reducer grouping columns and clustering columns and bucketing columns`
145 |   9: list<Order>  sortCols,   // sort order of the data in each bucket
146 |   10: map<string, string> parameters // any user supplied key value hash
147 | }
148 | 
149 | // table information
150 | struct Table {
151 |   1: string tableName,                // name of the table
152 |   2: string dbName,                   // database name ('default')
153 |   3: string owner,                    // owner of this table
154 |   4: i32    createTime,               // creation time of the table
155 |   5: i32    lastAccessTime,           // last access time (usually this will be filled from HDFS and shouldn't be relied on)
156 |   6: i32    retention,                // retention time
157 |   7: StorageDescriptor sd,            // storage descriptor of the table
158 |   8: list<FieldSchema> partitionKeys, // partition keys of the table. only primitive types are supported
159 |   9: map<string, string> parameters,   // to store comments or any other user level parameters
160 |   10: string viewOriginalText,         // original view text, null for non-view
161 |   11: string viewExpandedText,         // expanded view text, null for non-view
162 |   12: string tableType,                 // table type enum, e.g. EXTERNAL_TABLE
163 |   13: optional PrincipalPrivilegeSet privileges,
164 | }
165 | 
166 | struct Partition {
167 |   1: list<string> values // string value is converted to appropriate partition key type
168 |   2: string       dbName,
169 |   3: string       tableName,
170 |   4: i32          createTime,
171 |   5: i32          lastAccessTime,
172 |   6: StorageDescriptor   sd,
173 |   7: map<string, string> parameters,
174 |   8: optional PrincipalPrivilegeSet privileges
175 | }
176 | 
177 | struct Index {
178 |   1: string       indexName, // unique with in the whole database namespace
179 |   2: string       indexHandlerClass, // reserved
180 |   3: string       dbName,
181 |   4: string       origTableName,
182 |   5: i32          createTime,
183 |   6: i32          lastAccessTime,
184 |   7: string       indexTableName,
185 |   8: StorageDescriptor   sd,
186 |   9: map<string, string> parameters,
187 |   10: bool         deferredRebuild
188 | }
189 | 
190 | // schema of the table/query results etc.
191 | struct Schema {
192 |  // column names, types, comments
193 |  1: list<FieldSchema> fieldSchemas,  // delimiters etc
194 |  2: map<string, string> properties
195 | }
196 | 
197 | // Key-value store to be used with selected
198 | // Metastore APIs (create, alter methods).
199 | // The client can pass environment properties / configs that can be
200 | // accessed in hooks.
201 | struct EnvironmentContext {
202 |   1: map<string, string> properties
203 | }
204 | 
205 | exception MetaException {
206 |   1: string message
207 | }
208 | 
209 | exception UnknownTableException {
210 |   1: string message
211 | }
212 | 
213 | exception UnknownDBException {
214 |   1: string message
215 | }
216 | 
217 | exception AlreadyExistsException {
218 |   1: string message
219 | }
220 | 
221 | exception InvalidPartitionException {
222 |   1: string message
223 | }
224 | 
225 | exception UnknownPartitionException {
226 |   1: string message
227 | }
228 | 
229 | exception InvalidObjectException {
230 |   1: string message
231 | }
232 | 
233 | exception NoSuchObjectException {
234 |   1: string message
235 | }
236 | 
237 | exception IndexAlreadyExistsException {
238 |   1: string message
239 | }
240 | 
241 | exception InvalidOperationException {
242 |   1: string message
243 | }
244 | 
245 | exception ConfigValSecurityException {
246 |   1: string message
247 | }
248 | 
249 | /**
250 | * This interface is live.
251 | */
252 | service ThriftHiveMetastore extends fb303.FacebookService
253 | {
254 |   void create_database(1:Database database) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3)
255 |   Database get_database(1:string name) throws(1:NoSuchObjectException o1, 2:MetaException o2)
256 |   void drop_database(1:string name, 2:bool deleteData, 3:bool cascade) throws(1:NoSuchObjectException o1, 2:InvalidOperationException o2, 3:MetaException o3)
257 |   list<string> get_databases(1:string pattern) throws(1:MetaException o1)
258 |   list<string> get_all_databases() throws(1:MetaException o1)
259 |   void alter_database(1:string dbname, 2:Database db) throws(1:MetaException o1, 2:NoSuchObjectException o2)
260 | 
261 |   // returns the type with given name (make seperate calls for the dependent types if needed)
262 |   Type get_type(1:string name)  throws(1:MetaException o1, 2:NoSuchObjectException o2)
263 |   bool create_type(1:Type type) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3)
264 |   bool drop_type(1:string type) throws(1:MetaException o1, 2:NoSuchObjectException o2)
265 |   map<string, Type> get_type_all(1:string name)
266 |                                 throws(1:MetaException o2)
267 | 
268 |   // Gets a list of FieldSchemas describing the columns of a particular table
269 |   list<FieldSchema> get_fields(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3),
270 | 
271 |   // Gets a list of FieldSchemas describing both the columns and the partition keys of a particular table
272 |   list<FieldSchema> get_schema(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3)
273 | 
274 |   // create a Hive table. Following fields must be set
275 |   // tableName
276 |   // database        (only 'default' for now until Hive QL supports databases)
277 |   // owner           (not needed, but good to have for tracking purposes)
278 |   // sd.cols         (list of field schemas)
279 |   // sd.inputFormat  (SequenceFileInputFormat (binary like falcon tables or u_full) or TextInputFormat)
280 |   // sd.outputFormat (SequenceFileInputFormat (binary) or TextInputFormat)
281 |   // sd.serdeInfo.serializationLib (SerDe class name eg org.apache.hadoop.hive.serde.simple_meta.MetadataTypedColumnsetSerDe
282 |   // * See notes on DDL_TIME
283 |   void create_table(1:Table tbl) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3, 4:NoSuchObjectException o4)
284 |   void create_table_with_environment_context(1:Table tbl,
285 |       2:EnvironmentContext environment_context)
286 |       throws (1:AlreadyExistsException o1,
287 |               2:InvalidObjectException o2, 3:MetaException o3,
288 |               4:NoSuchObjectException o4)
289 |   // drops the table and all the partitions associated with it if the table has partitions
290 |   // delete data (including partitions) if deleteData is set to true
291 |   void drop_table(1:string dbname, 2:string name, 3:bool deleteData)
292 |                        throws(1:NoSuchObjectException o1, 2:MetaException o3)
293 |   list<string> get_tables(1: string db_name, 2: string pattern) throws (1: MetaException o1)
294 |   list<string> get_all_tables(1: string db_name) throws (1: MetaException o1)
295 | 
296 |   Table get_table(1:string dbname, 2:string tbl_name)
297 |                        throws (1:MetaException o1, 2:NoSuchObjectException o2)
298 |   list<Table> get_table_objects_by_name(1:string dbname, 2:list<string> tbl_names)
299 | 				   throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3)
300 | 
301 |   // Get a list of table names that match a filter.
302 |   // The filter operators are LIKE, <, <=, >, >=, =, <>
303 |   //
304 |   // In the filter statement, values interpreted as strings must be enclosed in quotes,
305 |   // while values interpreted as integers should not be.  Strings and integers are the only
306 |   // supported value types.
307 |   //
308 |   // The currently supported key names in the filter are:
309 |   // Constants.HIVE_FILTER_FIELD_OWNER, which filters on the tables' owner's name
310 |   //   and supports all filter operators
311 |   // Constants.HIVE_FILTER_FIELD_LAST_ACCESS, which filters on the last access times
312 |   //   and supports all filter operators except LIKE
313 |   // Constants.HIVE_FILTER_FIELD_PARAMS, which filters on the tables' parameter keys and values
314 |   //   and only supports the filter operators = and <>.
315 |   //   Append the parameter key name to HIVE_FILTER_FIELD_PARAMS in the filter statement.
316 |   //   For example, to filter on parameter keys called "retention", the key name in the filter
317 |   //   statement should be Constants.HIVE_FILTER_FIELD_PARAMS + "retention"
318 |   //   Also, = and <> only work for keys that exist
319 |   //   in the tables. E.g., if you are looking for tables where key1 <> value, it will only
320 |   //   look at tables that have a value for the parameter key1.
321 |   // Some example filter statements include:
322 |   // filter = Constants.HIVE_FILTER_FIELD_OWNER + " like \".*test.*\" and " +
323 |   //   Constants.HIVE_FILTER_FIELD_LAST_ACCESS + " = 0";
324 |   // filter = Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"30\" or " +
325 |   //   Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"90\""
326 |   // @param dbName
327 |   //          The name of the database from which you will retrieve the table names
328 |   // @param filterType
329 |   //          The type of filter
330 |   // @param filter
331 |   //          The filter string
332 |   // @param max_tables
333 |   //          The maximum number of tables returned
334 |   // @return  A list of table names that match the desired filter
335 |   list<string> get_table_names_by_filter(1:string dbname, 2:string filter, 3:i16 max_tables=-1)
336 |                        throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3)
337 | 
338 |   // alter table applies to only future partitions not for existing partitions
339 |   // * See notes on DDL_TIME
340 |   void alter_table(1:string dbname, 2:string tbl_name, 3:Table new_tbl)
341 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
342 |   void alter_table_with_environment_context(1:string dbname, 2:string tbl_name,
343 |       3:Table new_tbl, 4:EnvironmentContext environment_context)
344 |       throws (1:InvalidOperationException o1, 2:MetaException o2)
345 |   // the following applies to only tables that have partitions
346 |   // * See notes on DDL_TIME
347 |   Partition add_partition(1:Partition new_part)
348 |                        throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
349 |   Partition add_partition_with_environment_context(1:Partition new_part,
350 |       2:EnvironmentContext environment_context)
351 |       throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2,
352 |       3:MetaException o3)
353 |   i32 add_partitions(1:list<Partition> new_parts)
354 |                        throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
355 |   Partition append_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
356 |                        throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
357 |   Partition append_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name)
358 |                        throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
359 |   bool drop_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:bool deleteData)
360 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
361 |   bool drop_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name, 4:bool deleteData)
362 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
363 |   Partition get_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
364 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
365 | 
366 |   Partition get_partition_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals,
367 |       4: string user_name, 5: list<string> group_names) throws(1:MetaException o1, 2:NoSuchObjectException o2)
368 | 
369 |   Partition get_partition_by_name(1:string db_name 2:string tbl_name, 3:string part_name)
370 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
371 | 
372 |   // returns all the partitions for this table in reverse chronological order.
373 |   // If max parts is given then it will return only that many.
374 |   list<Partition> get_partitions(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
375 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
376 |   list<Partition> get_partitions_with_auth(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1,
377 |      4: string user_name, 5: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
378 | 
379 |   list<string> get_partition_names(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
380 |                        throws(1:MetaException o2)
381 | 
382 |   // get_partition*_ps methods allow filtering by a partial partition specification,
383 |   // as needed for dynamic partitions. The values that are not restricted should
384 |   // be empty strings. Nulls were considered (instead of "") but caused errors in
385 |   // generated Python code. The size of part_vals may be smaller than the
386 |   // number of partition columns - the unspecified values are considered the same
387 |   // as "".
388 |   list<Partition> get_partitions_ps(1:string db_name 2:string tbl_name
389 |   	3:list<string> part_vals, 4:i16 max_parts=-1)
390 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
391 |   list<Partition> get_partitions_ps_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1,
392 |      5: string user_name, 6: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
393 | 
394 |   list<string> get_partition_names_ps(1:string db_name,
395 |   	2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1)
396 |   	                   throws(1:MetaException o1, 2:NoSuchObjectException o2)
397 | 
398 |   // get the partitions matching the given partition filter
399 |   list<Partition> get_partitions_by_filter(1:string db_name 2:string tbl_name
400 |     3:string filter, 4:i16 max_parts=-1)
401 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
402 | 
403 |   // get partitions give a list of partition names
404 |   list<Partition> get_partitions_by_names(1:string db_name 2:string tbl_name 3:list<string> names)
405 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
406 | 
407 |   // changes the partition to the new partition object. partition is identified from the part values
408 |   // in the new_part
409 |   // * See notes on DDL_TIME
410 |   void alter_partition(1:string db_name, 2:string tbl_name, 3:Partition new_part)
411 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
412 | 
413 |   void alter_partition_with_environment_context(1:string db_name,
414 |       2:string tbl_name, 3:Partition new_part,
415 |       4:EnvironmentContext environment_context)
416 |       throws (1:InvalidOperationException o1, 2:MetaException o2)
417 | 
418 |   // rename the old partition to the new partition object by changing old part values to the part values
419 |   // in the new_part. old partition is identified from part_vals.
420 |   // partition keys in new_part should be the same as those in old partition.
421 |   void rename_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:Partition new_part)
422 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
423 | 
424 |   // gets the value of the configuration key in the metastore server. returns
425 |   // defaultValue if the key does not exist. if the configuration key does not
426 |   // begin with "hive", "mapred", or "hdfs", a ConfigValSecurityException is
427 |   // thrown.
428 |   string get_config_value(1:string name, 2:string defaultValue)
429 |                           throws(1:ConfigValSecurityException o1)
430 | 
431 |   // converts a partition name into a partition values array
432 |   list<string> partition_name_to_vals(1: string part_name)
433 |                           throws(1: MetaException o1)
434 |   // converts a partition name into a partition specification (a mapping from
435 |   // the partition cols to the values)
436 |   map<string, string> partition_name_to_spec(1: string part_name)
437 |                           throws(1: MetaException o1)
438 | 
439 |   void markPartitionForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
440 |                   4:PartitionEventType eventType) throws (1: MetaException o1, 2: NoSuchObjectException o2,
441 |                   3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
442 |                   6: InvalidPartitionException o6)
443 |   bool isPartitionMarkedForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
444 |                   4: PartitionEventType eventType) throws (1: MetaException o1, 2:NoSuchObjectException o2,
445 |                   3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
446 |                   6: InvalidPartitionException o6)
447 | 
448 |   //index
449 |   Index add_index(1:Index new_index, 2: Table index_table)
450 |                        throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
451 |   void alter_index(1:string dbname, 2:string base_tbl_name, 3:string idx_name, 4:Index new_idx)
452 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
453 |   bool drop_index_by_name(1:string db_name, 2:string tbl_name, 3:string index_name, 4:bool deleteData)
454 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
455 |   Index get_index_by_name(1:string db_name 2:string tbl_name, 3:string index_name)
456 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
457 | 
458 |   list<Index> get_indexes(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
459 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
460 |   list<string> get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
461 |                        throws(1:MetaException o2)
462 | 
463 |   //authorization privileges
464 | 
465 |   bool create_role(1:Role role) throws(1:MetaException o1)
466 |   bool drop_role(1:string role_name) throws(1:MetaException o1)
467 |   list<string> get_role_names() throws(1:MetaException o1)
468 |   bool grant_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type,
469 |     4:string grantor, 5:PrincipalType grantorType, 6:bool grant_option) throws(1:MetaException o1)
470 |   bool revoke_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type)
471 |                         throws(1:MetaException o1)
472 |   list<Role> list_roles(1:string principal_name, 2:PrincipalType principal_type) throws(1:MetaException o1)
473 | 
474 |   PrincipalPrivilegeSet get_privilege_set(1:HiveObjectRef hiveObject, 2:string user_name,
475 |     3: list<string> group_names) throws(1:MetaException o1)
476 |   list<HiveObjectPrivilege> list_privileges(1:string principal_name, 2:PrincipalType principal_type,
477 |     3: HiveObjectRef hiveObject) throws(1:MetaException o1)
478 | 
479 |   bool grant_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
480 |   bool revoke_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
481 | 
482 |   // this is used by metastore client to send UGI information to metastore server immediately
483 |   // after setting up a connection.
484 |   list<string> set_ugi(1:string user_name, 2:list<string> group_names) throws (1:MetaException o1)
485 | 
486 |   //Authentication (delegation token) interfaces
487 | 
488 |   // get metastore server delegation token for use from the map/reduce tasks to authenticate
489 |   // to metastore server
490 |   string get_delegation_token(1:string token_owner, 2:string renewer_kerberos_principal_name)
491 |     throws (1:MetaException o1)
492 | 
493 |   // method to renew delegation token obtained from metastore server
494 |   i64 renew_delegation_token(1:string token_str_form) throws (1:MetaException o1)
495 | 
496 |   // method to cancel delegation token obtained from metastore server
497 |   void cancel_delegation_token(1:string token_str_form) throws (1:MetaException o1)
498 | }
499 | 
500 | // * Note about the DDL_TIME: When creating or altering a table or a partition,
501 | // if the DDL_TIME is not set, the current time will be used.
502 | 
503 | // For storing info about archived partitions in parameters
504 | 
505 | // Whether the partition is archived
506 | const string IS_ARCHIVED = "is_archived",
507 | // The original location of the partition, before archiving. After archiving,
508 | // this directory will contain the archive. When the partition
509 | // is dropped, this directory will be deleted
510 | const string ORIGINAL_LOCATION = "original_location",
511 | 
512 | // these should be needed only for backward compatibility with filestore
513 | const string META_TABLE_COLUMNS   = "columns",
514 | const string META_TABLE_COLUMN_TYPES   = "columns.types",
515 | const string BUCKET_FIELD_NAME    = "bucket_field_name",
516 | const string BUCKET_COUNT         = "bucket_count",
517 | const string FIELD_TO_DIMENSION   = "field_to_dimension",
518 | const string META_TABLE_NAME      = "name",
519 | const string META_TABLE_DB        = "db",
520 | const string META_TABLE_LOCATION  = "location",
521 | const string META_TABLE_SERDE     = "serde",
522 | const string META_TABLE_PARTITION_COLUMNS = "partition_columns",
523 | const string FILE_INPUT_FORMAT    = "file.inputformat",
524 | const string FILE_OUTPUT_FORMAT   = "file.outputformat",
525 | const string META_TABLE_STORAGE   = "storage_handler",
526 | 
527 | 
528 | 
529 | 


--------------------------------------------------------------------------------
/herringbone-main/src/test/resources/test.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stripe-archive/herringbone/4f0524287ef47fc897702d654572bbeee1004879/herringbone-main/src/test/resources/test.parquet


--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/FlattenJobTest.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.test
 2 | 
 3 | import com.stripe.herringbone.flatten._
 4 | import org.scalatest._
 5 | import parquet.example.Paper
 6 | import parquet.io.api.Binary
 7 | 
 8 | class FlattenJobTest extends FlatSpec with Matchers {
 9 |   def toBinary(x: Array[Byte]) = Binary.fromByteArray(x)
10 | 
11 |   "truncate" should "truncate to correct length" in {
12 |     val consumer = new FlatConsumer(Paper.r1, "__", false)
13 |     val bytes = toBinary(Array[Byte](1,2,3,4))
14 |     assert(consumer.truncate(bytes, 3).getBytes().sameElements(Array[Byte](1,2,3)))
15 |   }
16 | 
17 |   "truncate" should "not truncate if unnecessary" in {
18 |     val consumer = new FlatConsumer(Paper.r1, "__", false)
19 |     val bytes = toBinary(Array[Byte](1,2,3,4))
20 |     assert(consumer.truncate(bytes, 8) == bytes)
21 |   }
22 | }
23 | 


--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/FlatConverterTest.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.test
 2 | 
 3 | import com.stripe.herringbone.flatten.{FlatConverter,TypeFlattener}
 4 | 
 5 | import org.scalatest._
 6 | import org.apache.hadoop.fs.Path
 7 | 
 8 | import parquet.example.Paper
 9 | import parquet.example.data.simple.SimpleGroup
10 | import parquet.example.data.GroupWriter
11 | import parquet.schema.MessageType
12 | import parquet.schema.PrimitiveType
13 | import parquet.schema.Type.Repetition.OPTIONAL
14 | import parquet.schema.Type.Repetition.REQUIRED
15 | import parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY
16 | 
17 | import scala.collection.mutable.StringBuilder
18 | import java.io.StringWriter
19 | 
20 | class FlatConverterTest extends FlatSpec with Matchers {
21 | 
22 |   def nestedGroupFixture =
23 |     new {
24 |       val group = Paper.r1
25 |       val schema = Paper.schema
26 |       val flatSchema = TypeFlattener.flatten(schema, None, "__", true)
27 |       val flatGroup = FlatConverter.flattenGroup(group, flatSchema, "__", true)
28 |     }
29 | 
30 |   def flatGroupFixture =
31 |     new {
32 |       val flatSchema =
33 |         new MessageType("Charge",
34 |           new PrimitiveType(REQUIRED, BINARY, "_id"),
35 |           new PrimitiveType(OPTIONAL, BINARY, "email"),
36 |           new PrimitiveType(REQUIRED, BINARY, "merchant")
37 |         )
38 |       val flatGroupMissingFields = new SimpleGroup(flatSchema)
39 |       flatGroupMissingFields.add("_id", "ch_1")
40 |       flatGroupMissingFields.add("merchant", "acct_1")
41 |       val flatGroupAllFields = new SimpleGroup(flatSchema)
42 |       flatGroupAllFields.add("email", "bob@stripe.com")
43 |       flatGroupAllFields.add("merchant", "acct_1")
44 |       flatGroupAllFields.add("_id", "ch_1")
45 |     }
46 | 
47 |   "groupToTSV" should "convert a flattened group" in {
48 |     val f = nestedGroupFixture
49 |     val groupTSV = FlatConverter.groupToTSV(f.flatGroup, f.flatSchema, "__", true)
50 |     assert(groupTSV == "10\t\t20,40,60")
51 |   }
52 | 
53 |   "groupToTSV" should "respect schema ordering, handle optional fields" in {
54 |     val f = flatGroupFixture
55 |     val missingTSV = FlatConverter.groupToTSV(f.flatGroupMissingFields, f.flatSchema, "__", true)
56 |     assert(missingTSV == "ch_1\t\tacct_1")
57 |     val allTSV = FlatConverter.groupToTSV(f.flatGroupAllFields, f.flatSchema, "__", true)
58 |     assert(allTSV == "ch_1\tbob@stripe.com\tacct_1")
59 |   }
60 | }
61 | 
62 | 


--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/TypeFlattenerTest.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.test
 2 | 
 3 | import com.stripe.herringbone.flatten.TypeFlattener
 4 | 
 5 | import org.scalatest._
 6 | 
 7 | import parquet.schema.GroupType
 8 | import parquet.schema.MessageType
 9 | import parquet.schema.PrimitiveType
10 | import parquet.schema.Type.Repetition.OPTIONAL
11 | import parquet.schema.Type.Repetition.REPEATED
12 | import parquet.schema.Type.Repetition.REQUIRED
13 | import parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY
14 | import parquet.schema.PrimitiveType.PrimitiveTypeName.INT64
15 | 
16 | class TypeFlattenerTest extends FlatSpec with Matchers {
17 | 
18 |   "flatten" should "omit the idField in nested fieldname if specified" in {
19 |     val input = new MessageType("Document",
20 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
21 |       new GroupType(OPTIONAL, "Page",
22 |           new PrimitiveType(OPTIONAL, BINARY, "_id")))
23 | 
24 |     val expected = new MessageType("Document",
25 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
26 |       new PrimitiveType(OPTIONAL, BINARY, "Page"))
27 | 
28 |     val result = TypeFlattener.flatten(input, None, "__", true)
29 |     assert(expected == result)
30 |   }
31 | 
32 |   "flatten" should "not omit the idField in nested fieldname if none is specified" in {
33 |     val input = new MessageType("Document",
34 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
35 |       new GroupType(OPTIONAL, "Page",
36 |           new PrimitiveType(OPTIONAL, BINARY, "_id")))
37 | 
38 |     val expected = new MessageType("Document",
39 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
40 |       new PrimitiveType(OPTIONAL, BINARY, "Page___id"))
41 | 
42 |     val result = TypeFlattener.flatten(input, None, "__", false)
43 |     assert(expected == result)
44 |   }
45 | 
46 |   "flatten" should "not include repeated groups" in {
47 |     val input = new MessageType("Document",
48 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
49 |       new GroupType(REPEATED, "Nope",
50 |           new PrimitiveType(REPEATED, INT64, "Never")))
51 | 
52 |     val expected = new MessageType("Document",
53 |       new PrimitiveType(OPTIONAL, BINARY, "_id"))
54 | 
55 |     val result = TypeFlattener.flatten(input, None, "__", true)
56 |     assert(expected == result)
57 |   }
58 | 
59 |   "flatten" should "set all fields as optional" in {
60 |     val input = new MessageType("Document",
61 |       new GroupType(OPTIONAL, "Yep",
62 |           new GroupType(REQUIRED, "Grouped",
63 |               new PrimitiveType(REQUIRED, BINARY, "Yes"),
64 |               new PrimitiveType(REPEATED, BINARY, "Maybe")),
65 |           new PrimitiveType(OPTIONAL, BINARY, "Sometimes")))
66 | 
67 |     val expected = new MessageType("Document",
68 |       new PrimitiveType(OPTIONAL, BINARY, "Yep__Grouped__Yes"),
69 |       new PrimitiveType(OPTIONAL, BINARY, "Yep__Grouped__Maybe"),
70 |       new PrimitiveType(OPTIONAL, BINARY, "Yep__Sometimes"))
71 | 
72 |     val result = TypeFlattener.flatten(input, None, "__", true)
73 |     assert(expected == result)
74 |   }
75 | 
76 |   "flatten" should "preserve the order of previously flattened fields" in {
77 |     val input = new MessageType("Document",
78 |       new PrimitiveType(REQUIRED, BINARY, "Old__Two"),
79 |       new GroupType(OPTIONAL, "New",
80 |         new PrimitiveType(REQUIRED, BINARY, "One")),
81 |       new PrimitiveType(REQUIRED, BINARY, "Old__One"))
82 | 
83 |     val old = new MessageType("Document",
84 |       new PrimitiveType(OPTIONAL, BINARY, "Old__One"),
85 |       new PrimitiveType(OPTIONAL, BINARY, "Old__Two"))
86 | 
87 |     val expected = new MessageType("Document",
88 |       new PrimitiveType(OPTIONAL, BINARY, "Old__One"),
89 |       new PrimitiveType(OPTIONAL, BINARY, "Old__Two"),
90 |       new PrimitiveType(OPTIONAL, BINARY, "New__One"))
91 | 
92 |     val result = TypeFlattener.flatten(input, Some(old), "__", true)
93 |     assert(expected == result)
94 |   }
95 | }
96 | 


--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/load/FieldUtilsTest.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.test.load
 2 | 
 3 | import com.stripe.herringbone.load.{FieldUtils, HadoopFs, ImpalaHiveSchemaTypeMapper}
 4 | import org.apache.hadoop.fs._
 5 | import org.scalamock.scalatest.MockFactory
 6 | import org.scalatest._
 7 | import parquet.schema.{PrimitiveType, Type}
 8 | 
 9 | class FieldUtilsTest extends FlatSpec with Matchers with MockFactory {
10 | 
11 |   "findPartitionFields" should "find the partition field names and types" in {
12 |     val hadoopFs = mock[HadoopFs]
13 |     val path = new Path("path")
14 | 
15 |     val partitions = List(("day", "123"), ("type", "foo"))
16 |     (hadoopFs.findPartitions _).expects(path).returning(partitions)
17 | 
18 |     val expected = List("`day` int", "`type` string")
19 |     FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper).findPartitionFields(path) should equal (expected)
20 |   }
21 | 
22 |   "tableFieldsFromSchemaFields" should "find the table fields from the parquet schema" in {
23 |     val hadoopFs = mock[HadoopFs]
24 |     val optional = Type.Repetition.valueOf("OPTIONAL")
25 |     val input = List(
26 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("BINARY"), "a"),
27 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT32"), "b"),
28 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT64"), "c"),
29 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT96"), "d"),
30 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("DOUBLE"), "e"),
31 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("BOOLEAN"), "f"),
32 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("FLOAT"), "g"),
33 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("FIXED_LEN_BYTE_ARRAY"), "h")
34 |     )
35 | 
36 |     val expected = List(
37 |       "`a` STRING",
38 |       "`b` INT",
39 |       "`c` BIGINT",
40 |       "`d` BIGINT",
41 |       "`e` DOUBLE",
42 |       "`f` BOOLEAN",
43 |       "`g` FLOAT",
44 |       "`h` BINARY"
45 |     )
46 | 
47 |     FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper).tableFieldsFromSchemaFields(input) should equal (expected)
48 |   }
49 | }
50 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 | 
 5 |   <groupId>com.stripe</groupId>
 6 |   <artifactId>herringbone</artifactId>
 7 |   <version>0.0.1</version>
 8 |   <packaging>pom</packaging>
 9 | 
10 |   <name>Herringbone</name>
11 | 
12 |   <modules>
13 |     <module>herringbone-impala</module>
14 |     <module>herringbone-main</module>
15 |   </modules>
16 | </project>
17 | 
18 | 


--------------------------------------------------------------------------------