├── .gitignore
├── LICENSE
├── README.md
├── bin
│   └── herringbone
├── herringbone-impala
│   ├── pom.xml
│   └── src
│       └── main
│           ├── scala
│           │   └── com
│           │       └── stripe
│           │           └── herringbone
│           │               └── impala
│           │                   ├── Connection.scala
│           │                   ├── Cursor.scala
│           │                   ├── Exceptions.scala
│           │                   ├── ImpalaClient.scala
│           │                   └── ImpalaValue.scala
│           └── thrift
│               ├── ImpalaService.thrift
│               ├── Status.thrift
│               ├── beeswax.thrift
│               ├── cli_service.thrift
│               ├── fb303.thrift
│               └── hive_metastore.thrift
├── herringbone-main
│   ├── pom.xml
│   └── src
│       ├── main
│       │   ├── scala
│       │   │   └── com
│       │   │       └── stripe
│       │   │           └── herringbone
│       │   │               ├── CompactInputFormat.scala
│       │   │               ├── CompactJob.scala
│       │   │               ├── FlattenJob.scala
│       │   │               ├── ParquetLoad.scala
│       │   │               ├── TsvJob.scala
│       │   │               ├── flatten
│       │   │               │   ├── FlatConsumer.scala
│       │   │               │   ├── FlatConverter.scala
│       │   │               │   ├── ParquetFlatConf.scala
│       │   │               │   ├── ParquetFlatMapper.scala
│       │   │               │   └── TypeFlattener.scala
│       │   │               ├── load
│       │   │               │   ├── FieldUtils.scala
│       │   │               │   ├── HadoopFs.scala
│       │   │               │   ├── HiveLoader.scala
│       │   │               │   ├── HiveServer2Connection.scala
│       │   │               │   ├── ImpalaLoader.scala
│       │   │               │   ├── ParquetLoadConf.scala
│       │   │               │   └── ParquetLoader.scala
│       │   │               └── util
│       │   │                   └── ParquetUtils.scala
│       │   └── thrift
│       │       ├── ImpalaService.thrift
│       │       ├── Status.thrift
│       │       ├── beeswax.thrift
│       │       ├── cli_service.thrift
│       │       ├── fb303.thrift
│       │       └── hive_metastore.thrift
│       └── test
│           ├── resources
│           │   └── test.parquet
│           └── scala
│               └── com
│                   └── stripe
│                       └── herringbone
│                           ├── FlattenJobTest.scala
│                           ├── flatten
│                           │   ├── FlatConverterTest.scala
│                           │   └── TypeFlattenerTest.scala
│                           └── load
│                               └── FieldUtilsTest.scala
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | data/
3 | .idea/
4 | *.pyc
5 | *.iml
6 | # ignore ROC plots
7 | *.pdf
8 | .tddium*
9 |
10 | .DS_Store
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014- Stripe, Inc. (https://stripe.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Herringbone
2 | ===========
3 |
4 | > _**Herringbone is deprecated and is no longer being actively maintained.**_
5 |
6 | Herringbone is a suite of tools for working with parquet files on hdfs, and with impala and hive.
7 |
8 | The available commands are:
9 |
10 | `flatten`: transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive (neither of which supports nested schemas). Default output directory is `/path/to/input/directory-flat`.
11 |
12 | $ herringbone flatten -i /path/to/input/directory [-o /path/to/non/default/output/directory]
13 |
14 | `load`: load a directory of parquet files (which must have a flat schema) into impala or hive (defaulting to impala). Use the --nocompute-stats option for faster loading into impala (but probably slower querying later on!)
15 |
16 | $ herringbone load [--hive] [-u] [--nocompute-stats] -d db_name -t table -p /path/to/parquet/directory
17 |
18 | `tsv`: transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`). Default output directory is `/path/to/input/directory-tsv`.
19 |
20 | $ herringbone tsv -i /path/to/input/directory [-o /path/to/non/default/output/directory]
21 |
22 | `compact`: transform a directory of parquet files into a directory of fewer larger parquet files. Default output directory is `/path/to/input/directory-compact`.
23 |
24 | $ herringbone compact -i /path/to/input/directory [-o /path/to/non/default/output/directory]
25 |
26 | See `herringbone COMMAND --help` for more information on a specific command.
27 |
28 | Building
29 | --------
30 |
31 | You'll need thrift 0.9.1 on your path.
32 |
33 | $ git clone https://github.com/stripe/herringbone
34 | $ cd herringbone
35 | $ mvn package
36 |
37 | Authors
38 | -------
39 |
40 | - [Avi Bryant](http://twitter.com/avibryant)
41 | - [Danielle Sucher](http://twitter.com/daniellesucher)
42 | - [Jeff Balogh](http://twitter.com/jbalogh)
43 |
--------------------------------------------------------------------------------
/bin/herringbone:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | usage = <<-USAGE
4 | Herringbone is a suite of tools for working with parquet files on hdfs.
5 |
6 | The available commands are:
7 |
8 | flatten: Transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive
9 |
10 | load: Load a directory of parquet files (which must have a flat schema) into impala or hive (defaults to impala). Use the --nocompute-stats option for faster loading into impala (but probably slower querying later on!)
11 |
12 | tsv: Transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`)
13 |
14 | compact: Transform a directory of parquet files into a directory of fewer larger parquet files
15 |
16 |
17 | Example usage:
18 |
19 | `herringbone flatten -i /path/to/input/directory -o /path/to/output/directory`
20 |
21 | `herringbone load [--hive] [-u] [--nocompute-stats] -d db_name -t table -p /path/to/parquet/directory`
22 |
23 | `herringbone tsv -i /path/to/input/directory -o /path/to/output/directory`
24 |
25 | `herringbone compact -i /path/to/input/directory -o /path/to/output/directory`
26 |
27 |
28 | See 'herringbone COMMAND --help' for more information on a specific command.
29 |
30 |
31 | USAGE
32 |
33 | command_jobs = {
34 | 'compact' => 'CompactJob',
35 | 'load' => 'ParquetLoad',
36 | 'flatten' => 'FlattenJob',
37 | 'tsv' => 'TsvJob',
38 | }
39 |
40 | # Validate the given command and print usage if needed.
41 | command = ARGV.shift
42 | JOB = command_jobs[command]
43 |
44 | if ['-h', '--help'].include?(command)
45 | puts usage
46 | exit 0
47 | elsif !JOB
48 | STDERR.puts "\nError: #{command} is not an available command\n\n"
49 | puts "#{'=' * 30}\n\n"
50 | puts usage
51 | exit 1
52 | end
53 |
54 | jar_path = File.join(
55 | File.dirname(__FILE__),
56 | '../',
57 | 'herringbone-main',
58 | 'target',
59 | 'herringbone-0.0.1-jar-with-dependencies.jar'
60 | )
61 | JAR = File.expand_path(jar_path)
62 |
63 | ENV["HADOOP_CLASSPATH"] = JAR
64 | ENV["HADOOP_USER_CLASSPATH_FIRST"] = "true"
65 |
66 | exec(
67 | "hadoop",
68 | "jar",
69 | JAR,
70 | "com.stripe.herringbone.#{JOB}",
71 | *ARGV
72 | )
73 |
--------------------------------------------------------------------------------
/herringbone-impala/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.stripe</groupId>
  <artifactId>herringbone-impala</artifactId>
  <version>0.0.2</version>
  <packaging>jar</packaging>

  <name>Herringbone Impala</name>

  <pluginRepositories>
    <pluginRepository>
      <id>dtrott</id>
      <url>https://maven.davidtrott.com/repository</url>
    </pluginRepository>
  </pluginRepositories>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.1</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-jar-plugin</artifactId>
        <version>2.3.1</version>
      </plugin>
      <plugin>
        <artifactId>maven-resources-plugin</artifactId>
        <version>2.4.3</version>
      </plugin>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <version>3.1.6</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.thrift.tools</groupId>
        <artifactId>maven-thrift-plugin</artifactId>
        <version>0.1.11</version>
        <configuration>
          <checkStaleness>true</checkStaleness>
          <thriftExecutable>thrift</thriftExecutable>
        </configuration>
        <executions>
          <execution>
            <id>thrift-sources</id>
            <phase>generate-sources</phase>
            <goals>
              <goal>compile</goal>
            </goals>
          </execution>
          <execution>
            <id>thrift-test-sources</id>
            <phase>generate-test-sources</phase>
            <goals>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <scala.version>2.10.4</scala.version>
    <maven.compiler.source>1.6</maven.compiler.source>
    <maven.compiler.target>1.6</maven.compiler.target>
  </properties>

  <repositories>
    <repository>
      <id>cloudera-releases</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos</url>
      <releases>
        <enabled>true</enabled>
      </releases>
      <snapshots>
        <enabled>false</enabled>
      </snapshots>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.apache.thrift</groupId>
      <artifactId>libthrift</artifactId>
      <version>0.12.0</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.5.2</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Connection.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.impala
2 |
3 | import org.apache.thrift.transport.TSocket
4 | import org.apache.thrift.protocol.TBinaryProtocol
5 |
6 | import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient}
7 | import com.cloudera.beeswax.api._
8 |
9 | import scala.annotation.tailrec
10 | import scala.collection.JavaConversions._
11 |
12 | case class Connection(host: String, port: Int) {
13 | var isOpen = false
14 | val logContext = "herringbone-impala"
15 | lazy val socket = new TSocket(host, port)
16 | lazy val client = new ClouderaImpalaClient(new TBinaryProtocol(socket))
17 |
18 | open
19 |
20 | def open = {
21 | if (!isOpen) {
22 | socket.open
23 | client.ResetCatalog
24 | isOpen = true
25 | }
26 | }
27 |
28 | def close = {
29 | if (isOpen) {
30 | socket.close
31 | isOpen = false
32 | }
33 | }
34 |
35 | // Refresh the metadata store.
36 | def refresh = {
37 | if (!isOpen) throw ConnectionException("Connection closed")
38 | client.ResetCatalog
39 | }
40 |
41 | // Perform a query, and pass in a function that will be called with each
42 | // row of the results
43 | def query(raw: String)(fn: Seq[ImpalaValue] => Unit) {
44 | val cursor = execute(raw)
45 | cursor.foreach { row => fn(row) }
46 | cursor.close
47 | }
48 |
49 | // Perform a query and return a cursor for iterating over the results.
50 | // You probably want to call cursor.close when you're done with it.
51 | def execute(raw: String): Cursor = {
52 | if (!isOpen) throw ConnectionException("Connection closed")
53 | validateQuery(raw)
54 |
55 | val query = new Query
56 | query.query = raw
57 |
58 | val handle = client.executeAndWait(query, logContext)
59 | Cursor(handle, client)
60 | }
61 |
62 | private def validateQuery(raw: String) = {
63 | val words = raw.split("\\s+")
64 | if (words.isEmpty) throw InvalidQueryException("Empty query")
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
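
For orientation, here is a minimal sketch (not part of the repo) of using `Connection` directly. It assumes an impalad reachable at a hypothetical host on the usual Beeswax port (21000) and a made-up table name; everything else is the API shown above (`refresh`, `query`, `close`).

    import com.stripe.herringbone.impala.Connection

    object ConnectionExample {
      def main(args: Array[String]): Unit = {
        // Hypothetical impalad host; 21000 is the usual Beeswax port.
        val connection = Connection("impala-daemon.example.com", 21000)

        // Reload catalog metadata so recently created tables are visible.
        connection.refresh

        // query runs the statement and calls the block once per result row,
        // closing the underlying cursor when the rows are exhausted.
        connection.query("SELECT * FROM my_flat_table LIMIT 5") { row =>
          println(row.map(_.raw).mkString("\t"))
        }

        connection.close
      }
    }
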
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Cursor.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.impala
2 |
3 | import org.apache.hadoop.hive.metastore.api.FieldSchema
4 |
5 | import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient}
6 | import com.cloudera.beeswax.api._
7 |
8 | import scala.collection.mutable.ArrayBuffer
9 | import scala.collection.JavaConversions._
10 |
11 | case class Cursor(handle: QueryHandle, client: ClouderaImpalaClient) {
12 | var done = false
13 | var isOpen = true
14 | var rowBuffer = ArrayBuffer.empty[Seq[ImpalaValue]]
15 | val bufferSize = 1024
16 | private lazy val metadata: ResultsMetadata = client.get_results_metadata(handle)
17 |
18 | def foreach(fn: Seq[ImpalaValue] => Unit) = {
19 | var row = fetchRow
20 | while (row.isDefined) {
21 | fn(row.get)
22 | row = fetchRow
23 | }
24 | }
25 |
26 | def fetchRow: Option[Seq[ImpalaValue]] = {
27 | if (rowBuffer.isEmpty) {
28 | if (done) {
29 | None
30 | } else {
31 | fetchMore
32 | fetchRow
33 | }
34 | } else {
35 | val row = rowBuffer.head
36 | rowBuffer = rowBuffer.tail
37 | Some(row)
38 | }
39 | }
40 |
41 | // Close the cursor on the remote server. Once a cursor is closed, you
42 | // can no longer fetch any rows from it.
43 | def close = {
44 | if (isOpen) {
45 | isOpen = false
46 | client.close(handle)
47 | }
48 | }
49 |
50 | // Returns true if there are any more rows to fetch.
51 | def hasMore = !done || !rowBuffer.isEmpty
52 |
53 | def runtime_profile = client.GetRuntimeProfile(handle)
54 |
55 | private def fetchMore = {
56 | while (!done && rowBuffer.size < bufferSize) {
57 | fetchBatch
58 | }
59 | }
60 |
61 | private def fetchBatch = {
62 | if (!isOpen) throw CursorException("Cursor has expired or been closed")
63 |
64 | try {
65 | val response = client.fetch(handle, false, bufferSize)
66 | validateQueryState(client.get_state(handle))
67 |
68 | val rows = response.data.map { row => parseRow(row) }
69 | rowBuffer ++= rows
70 |
71 | if (!response.has_more) {
72 | done = true
73 | close
74 | }
75 | } catch {
76 | case e: BeeswaxException => {
77 | isOpen = false
78 | throw e
79 | }
80 | case e: Exception => throw e
81 | }
82 | }
83 |
84 | private def parseRow(row: String) = {
85 | val fields = row.split(metadata.delim)
86 |
87 | metadata.schema.getFieldSchemas.zip(fields).map { case(schema, rawValue) =>
88 | ImpalaValue(rawValue, schema.getName, schema.getType)
89 | }
90 | }
91 |
92 | private def validateQueryState(state: QueryState) = {
93 | if (state == QueryState.EXCEPTION) {
94 | close
95 | throw CursorException("The query was aborted")
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
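
A sketch of driving `Cursor` by hand instead of going through `Connection.query` (hypothetical host, port, and table; the calls themselves are the API defined above):

    import com.stripe.herringbone.impala.Connection

    object CursorExample {
      def main(args: Array[String]): Unit = {
        val connection = Connection("impala-daemon.example.com", 21000)

        // execute returns a Cursor; rows are fetched lazily in batches of 1024.
        val cursor = connection.execute("SELECT id, name FROM my_flat_table")

        while (cursor.hasMore) {
          cursor.fetchRow.foreach { row =>
            println(row.map(_.raw).mkString(", "))
          }
        }

        cursor.close
        connection.close
      }
    }
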
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Exceptions.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.impala
2 |
3 | case class ConnectionException(message: String) extends Exception(message)
4 | case class CursorException(message: String) extends Exception(message)
5 | case class InvalidQueryException(message: String) extends Exception(message)
6 | case class ParsingException(message: String) extends Exception(message)
7 |
8 |
--------------------------------------------------------------------------------
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaClient.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.impala
2 |
3 | case class ImpalaClient(host: String, port: Int) {
4 | lazy val connection = Connection(host, port)
5 |
6 | def execute(raw: String) {
7 | query(raw){ row =>
8 | println(row.map { _.raw }.mkString(" "))
9 | }
10 | }
11 |
12 | def query(raw: String)(fn: Seq[ImpalaValue] => Unit) {
13 | println(raw)
14 | connection.query(raw){ row => fn(row) }
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
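
And the same thing through the `ImpalaClient` convenience wrapper, again as a sketch with a hypothetical host and table name:

    import com.stripe.herringbone.impala.ImpalaClient

    object ImpalaClientExample {
      def main(args: Array[String]): Unit = {
        val client = ImpalaClient("impala-daemon.example.com", 21000)

        // execute prints each row's raw values separated by spaces.
        client.execute("SHOW TABLES")

        // query hands each row to the callback as a Seq[ImpalaValue].
        client.query("SELECT count(*) FROM my_flat_table") { row =>
          row.foreach { value => println(value.fieldName + " = " + value.raw) }
        }
      }
    }
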
/herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaValue.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.impala
2 |
3 | import java.text.SimpleDateFormat
4 |
5 | case class ImpalaValue(raw: String, fieldName: String, fieldType: String) {
6 | lazy val convertedValue = convertRawValue(raw)
7 |
8 | private def convertRawValue(raw: String): Option[Any] = {
9 | if (raw == "NULL") {
10 | None
11 | } else {
12 | val converted = fieldType match {
13 | case "string" => raw
14 | case "boolean" => convertBoolean(raw)
15 | case "tinyint" | "smallint" | "int" | "bigint" => raw.toInt
16 | case "double" | "float" | "decimal" => raw.toDouble
17 | case "timestamp" => convertTimestamp(raw)
18 | case _ => throw ParsingException("Unknown type: " + fieldType)
19 | }
20 | Some(converted)
21 | }
22 | }
23 |
24 | private def convertBoolean(raw: String) = {
25 | try {
26 | raw.toBoolean
27 | } catch {
28 | case e: java.lang.IllegalArgumentException =>
29 | throw ParsingException("Invalid value for boolean: " + raw)
30 | }
31 | }
32 |
33 | private def convertTimestamp(raw: String) = {
34 | val formatStr = if (raw.indexOf(".") == -1) {
35 | "YYYY-MM-DD HH:MM:SS"
36 | } else {
37 | "YYYY-MM-DD HH:MM:SS.sssssssss"
38 | }
39 |
40 | val dateFormat = new SimpleDateFormat(formatStr)
41 | dateFormat.parse(raw)
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
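
A small sketch of how `convertedValue` behaves. The field names, types, and raw strings below are invented; in practice `Cursor.parseRow` builds these from the Beeswax results metadata.

    import com.stripe.herringbone.impala.ImpalaValue

    object ImpalaValueExample {
      def main(args: Array[String]): Unit = {
        val values = Seq(
          ImpalaValue("42", "id", "bigint"),        // Some(42L)
          ImpalaValue("3.14", "score", "double"),   // Some(3.14)
          ImpalaValue("true", "active", "boolean"), // Some(true)
          ImpalaValue("NULL", "comment", "string")  // None
        )

        values.foreach { v =>
          println(v.fieldName + ": " + v.convertedValue.getOrElse("NULL"))
        }
      }
    }
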
/herringbone-impala/src/main/thrift/ImpalaService.thrift:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Cloudera Inc.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | namespace cpp impala
16 | namespace java com.cloudera.impala.thrift
17 | namespace rb impala.protocol
18 |
19 | include "Status.thrift"
20 | include "beeswax.thrift"
21 | include "cli_service.thrift"
22 |
23 | // ImpalaService accepts query execution options through beeswax.Query.configuration in
24 | // key:value form. For example, the list of strings could be:
25 | // "num_nodes:1", "abort_on_error:false"
26 | // The valid keys are listed in this enum. They map to TQueryOptions.
27 | // Note: If you add an option or change the default, you also need to update:
28 | // - ImpalaInternalService.thrift: TQueryOptions
29 | // - ImpaladClientExecutor.getBeeswaxQueryConfigurations()
30 | // - ImpalaServer::SetQueryOptions()
31 | // - ImpalaServer::TQueryOptionsToMap()
32 | enum TImpalaQueryOptions {
33 | // if true, abort execution on the first error
34 | ABORT_ON_ERROR,
35 |
36 | // maximum # of errors to be reported; Unspecified or 0 indicates backend default
37 | MAX_ERRORS,
38 |
39 | // if true, disable llvm codegen
40 | DISABLE_CODEGEN,
41 |
42 | // batch size to be used by backend; Unspecified or a size of 0 indicates backend
43 | // default
44 | BATCH_SIZE,
45 |
46 | // a per-machine approximate limit on the memory consumption of this query;
47 | // unspecified or a limit of 0 means no limit;
48 | // otherwise specified either as:
49 | // a) an int (= number of bytes);
50 | // b) a float followed by "M" (MB) or "G" (GB)
51 | MEM_LIMIT,
52 |
53 | // specifies the degree of parallelism with which to execute the query;
54 | // 1: single-node execution
55 | // NUM_NODES_ALL: executes on all nodes that contain relevant data
56 | // NUM_NODES_ALL_RACKS: executes on one node per rack that holds relevant data
57 | // > 1: executes on at most that many nodes at any point in time (ie, there can be
58 | // more nodes than numNodes with plan fragments for this query, but at most
59 | // numNodes would be active at any point in time)
60 | // Constants (NUM_NODES_ALL, NUM_NODES_ALL_RACKS) are defined in JavaConstants.thrift.
61 | NUM_NODES,
62 |
63 | // maximum length of the scan range; only applicable to HDFS scan range; Unspecified or
64 | // a length of 0 indicates backend default;
65 | MAX_SCAN_RANGE_LENGTH,
66 |
67 | // Maximum number of io buffers (per disk)
68 | MAX_IO_BUFFERS,
69 |
70 | // Number of scanner threads.
71 | NUM_SCANNER_THREADS,
72 |
73 | // If true, Impala will try to execute on file formats that are not fully supported yet
74 | ALLOW_UNSUPPORTED_FORMATS,
75 |
76 | // if set and > -1, specifies the default limit applied to a top-level SELECT statement
77 | // with an ORDER BY but without a LIMIT clause (ie, if the SELECT statement also has
78 | // a LIMIT clause, this default is ignored)
79 | DEFAULT_ORDER_BY_LIMIT,
80 |
81 | // DEBUG ONLY:
82 | // If set to
83 | // "[:]::",
84 | // the exec node with the given id will perform the specified action in the given
85 | // phase. If the optional backend number (starting from 0) is specified, only that
86 | // backend instance will perform the debug action, otherwise all backends will behave
87 | // in that way.
88 | // If the string doesn't have the required format or if any of its components is
89 | // invalid, the option is ignored.
90 | DEBUG_ACTION,
91 |
92 | // If true, raise an error when the DEFAULT_ORDER_BY_LIMIT has been reached.
93 | ABORT_ON_DEFAULT_LIMIT_EXCEEDED,
94 |
95 | // Compression codec for parquet when inserting into parquet tables.
96 | // Valid values are "snappy", "gzip" and "none"
97 | // Leave blank to use default.
98 | PARQUET_COMPRESSION_CODEC,
99 |
100 | // HBase scan query option. If set and > 0, HBASE_CACHING is the value for
101 | // "hbase.client.Scan.setCaching()" when querying HBase table. Otherwise, use backend
102 | // default.
103 | // If the value is too high, then the hbase region server will have a hard time (GC
104 | // pressure and long response times). If the value is too small, then there will be
105 | // extra trips to the hbase region server.
106 | HBASE_CACHING,
107 |
108 | // HBase scan query option. If set, HBase scan will always set
109 | // "hbase.client.setCacheBlocks" to CACHE_BLOCKS. Default is false.
110 | // If the table is large and the query is doing big scan, set it to false to
111 | // avoid polluting the cache in the hbase region server.
112 | // If the table is small and the table is used several time, set it to true to improve
113 | // performance.
114 | HBASE_CACHE_BLOCKS,
115 | }
116 |
117 | // The summary of an insert.
118 | struct TInsertResult {
119 | // Number of appended rows per modified partition. Only applies to HDFS tables.
120 | // The keys represent partitions to create, coded as k1=v1/k2=v2/k3=v3..., with the
121 | // root in an unpartitioned table being the empty string.
122 | 1: required map<string, i64> rows_appended
123 | }
124 |
125 | // Response from a call to PingImpalaService
126 | struct TPingImpalaServiceResp {
127 | // The Impala service's version string.
128 | 1: string version
129 | }
130 |
131 | // Parameters for a ResetTable request which will invalidate a table's metadata.
132 | // DEPRECATED.
133 | struct TResetTableReq {
134 | // Name of the table's parent database.
135 | 1: required string db_name
136 |
137 | // Name of the table.
138 | 2: required string table_name
139 | }
140 |
141 | // For all rpc that return a TStatus as part of their result type,
142 | // if the status_code field is set to anything other than OK, the contents
143 | // of the remainder of the result type is undefined (typically not set)
144 | service ImpalaService extends beeswax.BeeswaxService {
145 | // Cancel execution of query. Returns RUNTIME_ERROR if query_id
146 | // unknown.
147 | // This terminates all threads running on behalf of this query at
148 | // all nodes that were involved in the execution.
149 | // Throws BeeswaxException if the query handle is invalid (this doesn't
150 | // necessarily indicate an error: the query might have finished).
151 | Status.TStatus Cancel(1:beeswax.QueryHandle query_id)
152 | throws(1:beeswax.BeeswaxException error);
153 |
154 | // Invalidates all catalog metadata, forcing a reload
155 | // DEPRECATED; execute query "invalidate metadata" to refresh metadata
156 | Status.TStatus ResetCatalog();
157 |
158 | // Invalidates a specific table's catalog metadata, forcing a reload on the next access
159 | // DEPRECATED; execute query "refresh " to refresh metadata
160 | Status.TStatus ResetTable(1:TResetTableReq request)
161 |
162 | // Returns the runtime profile string for the given query handle.
163 | string GetRuntimeProfile(1:beeswax.QueryHandle query_id)
164 | throws(1:beeswax.BeeswaxException error);
165 |
166 | // Closes the query handle and return the result summary of the insert.
167 | TInsertResult CloseInsert(1:beeswax.QueryHandle handle)
168 | throws(1:beeswax.QueryNotFoundException error, 2:beeswax.BeeswaxException error2);
169 |
170 | // Client calls this RPC to verify that the server is an ImpalaService. Returns the
171 | // server version.
172 | TPingImpalaServiceResp PingImpalaService();
173 | }
174 |
175 | // Impala HiveServer2 service
176 | service ImpalaHiveServer2Service extends cli_service.TCLIService {
177 | }
178 |
--------------------------------------------------------------------------------
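
For reference, `Connection.scala` above talks to this service through the Thrift-generated `com.cloudera.impala.thrift.ImpalaService.Client`. A hedged sketch of using that generated client directly (hypothetical host; assumes the code generated from these .thrift files is on the classpath):

    import org.apache.thrift.protocol.TBinaryProtocol
    import org.apache.thrift.transport.TSocket

    import com.cloudera.impala.thrift.ImpalaService.{Client => RawImpalaClient}

    object RawThriftExample {
      def main(args: Array[String]): Unit = {
        val socket = new TSocket("impala-daemon.example.com", 21000)
        val client = new RawImpalaClient(new TBinaryProtocol(socket))
        socket.open()

        // PingImpalaService returns the server's version string.
        println(client.PingImpalaService().version)

        socket.close()
      }
    }
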
/herringbone-impala/src/main/thrift/Status.thrift:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Cloudera Inc.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | namespace cpp impala
16 | namespace java com.cloudera.impala.thrift
17 | namespace rb impala.protocol
18 |
19 | enum TStatusCode {
20 | OK,
21 | CANCELLED,
22 | ANALYSIS_ERROR,
23 | NOT_IMPLEMENTED_ERROR,
24 | RUNTIME_ERROR,
25 | MEM_LIMIT_EXCEEDED,
26 | INTERNAL_ERROR
27 | }
28 |
29 | struct TStatus {
30 | 1: required TStatusCode status_code
31 | 2: list<string> error_msgs
32 | }
33 |
--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/beeswax.thrift:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Cloudera, Inc. under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Cloudera, Inc. licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | *
18 | * Interface for interacting with Beeswax Server
19 | */
20 |
21 | namespace java com.cloudera.beeswax.api
22 | namespace py beeswaxd
23 | namespace cpp beeswax
24 | namespace rb impala.protocol.beeswax
25 |
26 | include "hive_metastore.thrift"
27 |
28 | // A Query
29 | struct Query {
30 | 1: string query;
31 | // A list of HQL commands to execute before the query.
32 | // This is typically defining UDFs, setting settings, and loading resources.
33 | 3: list<string> configuration;
34 |
35 | // User and groups to "act as" for purposes of Hadoop.
36 | 4: string hadoop_user;
37 | }
38 |
39 | typedef string LogContextId
40 |
41 | enum QueryState {
42 | CREATED,
43 | INITIALIZED,
44 | COMPILED,
45 | RUNNING,
46 | FINISHED,
47 | EXCEPTION
48 | }
49 |
50 | struct QueryHandle {
51 | 1: string id;
52 | 2: LogContextId log_context;
53 | }
54 |
55 | struct QueryExplanation {
56 | 1: string textual
57 | }
58 |
59 | struct Results {
60 | // If set, data is valid. Otherwise, results aren't ready yet.
61 | 1: bool ready,
62 | // Columns for the results
63 | 2: list<string> columns,
64 | // A set of results
65 | 3: list<string> data,
66 | // The starting row of the results
67 | 4: i64 start_row,
68 | // Whether there are more results to fetch
69 | 5: bool has_more
70 | }
71 |
72 | /**
73 | * Metadata information about the results.
74 | * Applicable only for SELECT.
75 | */
76 | struct ResultsMetadata {
77 | /** The schema of the results */
78 | 1: hive_metastore.Schema schema,
79 | /** The directory containing the results. Not applicable for partition table. */
80 | 2: string table_dir,
81 | /** If the results are straight from an existing table, the table name. */
82 | 3: string in_tablename,
83 | /** Field delimiter */
84 | 4: string delim,
85 | }
86 |
87 | exception BeeswaxException {
88 | 1: string message,
89 | // Use get_log(log_context) to retrieve any log related to this exception
90 | 2: LogContextId log_context,
91 | // (Optional) The QueryHandle that caused this exception
92 | 3: QueryHandle handle,
93 | 4: optional i32 errorCode = 0,
94 | 5: optional string SQLState = " "
95 | }
96 |
97 | exception QueryNotFoundException {
98 | }
99 |
100 | /** Represents a Hadoop-style configuration variable. */
101 | struct ConfigVariable {
102 | 1: string key,
103 | 2: string value,
104 | 3: string description
105 | }
106 |
107 | service BeeswaxService {
108 | /**
109 | * Submit a query and return a handle (QueryHandle). The query runs asynchronously.
110 | */
111 | QueryHandle query(1:Query query) throws(1:BeeswaxException error),
112 |
113 | /**
114 | * run a query synchronously and return a handle (QueryHandle).
115 | */
116 | QueryHandle executeAndWait(1:Query query, 2:LogContextId clientCtx)
117 | throws(1:BeeswaxException error),
118 |
119 | /**
120 | * Get the query plan for a query.
121 | */
122 | QueryExplanation explain(1:Query query)
123 | throws(1:BeeswaxException error),
124 |
125 | /**
126 | * Get the results of a query. This is non-blocking. Caller should check
127 | * Results.ready to determine if the results are in yet. The call requests
128 | * the batch size of fetch.
129 | */
130 | Results fetch(1:QueryHandle query_id, 2:bool start_over, 3:i32 fetch_size=-1)
131 | throws(1:QueryNotFoundException error, 2:BeeswaxException error2),
132 |
133 | /**
134 | * Get the state of the query
135 | */
136 | QueryState get_state(1:QueryHandle handle) throws(1:QueryNotFoundException error),
137 |
138 | /**
139 | * Get the result metadata
140 | */
141 | ResultsMetadata get_results_metadata(1:QueryHandle handle)
142 | throws(1:QueryNotFoundException error),
143 |
144 | /**
145 | * Used to test connection to server. A "noop" command.
146 | */
147 | string echo(1:string s)
148 |
149 | /**
150 | * Returns a string representation of the configuration object being used.
151 | * Handy for debugging.
152 | */
153 | string dump_config()
154 |
155 | /**
156 | * Get the log messages related to the given context.
157 | */
158 | string get_log(1:LogContextId context) throws(1:QueryNotFoundException error)
159 |
160 | /*
161 | * Returns "default" configuration.
162 | */
163 | list<ConfigVariable> get_default_configuration(1:bool include_hadoop)
164 |
165 | /*
166 | * closes the query with given handle
167 | */
168 | void close(1:QueryHandle handle) throws(1:QueryNotFoundException error,
169 | 2:BeeswaxException error2)
170 |
171 | /*
172 | * clean the log context for given id
173 | */
174 | void clean(1:LogContextId log_context)
175 | }
176 |
--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/cli_service.thrift:
--------------------------------------------------------------------------------
1 | // Licensed to the Apache Software Foundation (ASF) under one
2 | // or more contributor license agreements. See the NOTICE file
3 | // distributed with this work for additional information
4 | // regarding copyright ownership. The ASF licenses this file
5 | // to you under the Apache License, Version 2.0 (the
6 | // "License"); you may not use this file except in compliance
7 | // with the License. You may obtain a copy of the License at
8 | //
9 | // http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing, software
12 | // distributed under the License is distributed on an "AS IS" BASIS,
13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | // See the License for the specific language governing permissions and
15 | // limitations under the License.
16 |
17 | // Coding Conventions for this file:
18 | //
19 | // Structs/Enums/Unions
20 | // * Struct, Enum, and Union names begin with a "T",
21 | // and use a capital letter for each new word, with no underscores.
22 | // * All fields should be declared as either optional or required.
23 | //
24 | // Functions
25 | // * Function names start with a capital letter and have a capital letter for
26 | // each new word, with no underscores.
27 | // * Each function should take exactly one parameter, named TFunctionNameReq,
28 | // and should return either void or TFunctionNameResp. This convention allows
29 | // incremental updates.
30 | //
31 | // Services
32 | // * Service names begin with the letter "T", use a capital letter for each
33 | // new word (with no underscores), and end with the word "Service".
34 |
35 | namespace java org.apache.hive.service.cli.thrift
36 | namespace cpp apache.hive.service.cli.thrift
37 | namespace rb impala.protocol.hive
38 |
39 | // List of protocol versions. A new token should be
40 | // added to the end of this list every time a change is made.
41 | enum TProtocolVersion {
42 | HIVE_CLI_SERVICE_PROTOCOL_V1
43 | }
44 |
45 | enum TTypeId {
46 | BOOLEAN_TYPE,
47 | TINYINT_TYPE,
48 | SMALLINT_TYPE,
49 | INT_TYPE,
50 | BIGINT_TYPE,
51 | FLOAT_TYPE,
52 | DOUBLE_TYPE,
53 | STRING_TYPE,
54 | TIMESTAMP_TYPE,
55 | BINARY_TYPE,
56 | ARRAY_TYPE,
57 | MAP_TYPE,
58 | STRUCT_TYPE,
59 | UNION_TYPE,
60 | USER_DEFINED_TYPE,
61 | DECIMAL_TYPE
62 | }
63 |
64 | const set<TTypeId> PRIMITIVE_TYPES = [
65 | TTypeId.BOOLEAN_TYPE
66 | TTypeId.TINYINT_TYPE
67 | TTypeId.SMALLINT_TYPE
68 | TTypeId.INT_TYPE
69 | TTypeId.BIGINT_TYPE
70 | TTypeId.FLOAT_TYPE
71 | TTypeId.DOUBLE_TYPE
72 | TTypeId.STRING_TYPE
73 | TTypeId.TIMESTAMP_TYPE
74 | TTypeId.BINARY_TYPE,
75 | TTypeId.DECIMAL_TYPE
76 | ]
77 |
78 | const set<TTypeId> COMPLEX_TYPES = [
79 | TTypeId.ARRAY_TYPE
80 | TTypeId.MAP_TYPE
81 | TTypeId.STRUCT_TYPE
82 | TTypeId.UNION_TYPE
83 | TTypeId.USER_DEFINED_TYPE
84 | ]
85 |
86 | const set<TTypeId> COLLECTION_TYPES = [
87 | TTypeId.ARRAY_TYPE
88 | TTypeId.MAP_TYPE
89 | ]
90 |
91 | const map<TTypeId,string> TYPE_NAMES = {
92 | TTypeId.BOOLEAN_TYPE: "BOOLEAN",
93 | TTypeId.TINYINT_TYPE: "TINYINT",
94 | TTypeId.SMALLINT_TYPE: "SMALLINT",
95 | TTypeId.INT_TYPE: "INT",
96 | TTypeId.BIGINT_TYPE: "BIGINT",
97 | TTypeId.FLOAT_TYPE: "FLOAT",
98 | TTypeId.DOUBLE_TYPE: "DOUBLE",
99 | TTypeId.STRING_TYPE: "STRING",
100 | TTypeId.TIMESTAMP_TYPE: "TIMESTAMP",
101 | TTypeId.BINARY_TYPE: "BINARY",
102 | TTypeId.ARRAY_TYPE: "ARRAY",
103 | TTypeId.MAP_TYPE: "MAP",
104 | TTypeId.STRUCT_TYPE: "STRUCT",
105 | TTypeId.UNION_TYPE: "UNIONTYPE"
106 | TTypeId.DECIMAL_TYPE: "DECIMAL"
107 | }
108 |
109 | // Thrift does not support recursively defined types or forward declarations,
110 | // which makes it difficult to represent Hive's nested types.
111 | // To get around these limitations TTypeDesc employs a type list that maps
112 | // integer "pointers" to TTypeEntry objects. The following examples show
113 | // how different types are represented using this scheme:
114 | //
115 | // "INT":
116 | // TTypeDesc {
117 | // types = [
118 | // TTypeEntry.primitive_entry {
119 | // type = INT_TYPE
120 | // }
121 | // ]
122 | // }
123 | //
124 | // "ARRAY":
125 | // TTypeDesc {
126 | // types = [
127 | // TTypeEntry.array_entry {
128 | // object_type_ptr = 1
129 | // },
130 | // TTypeEntry.primitive_entry {
131 | // type = INT_TYPE
132 | // }
133 | // ]
134 | // }
135 | //
136 | // "MAP":
137 | // TTypeDesc {
138 | // types = [
139 | // TTypeEntry.map_entry {
140 | // key_type_ptr = 1
141 | // value_type_ptr = 2
142 | // },
143 | // TTypeEntry.primitive_entry {
144 | // type = INT_TYPE
145 | // },
146 | // TTypeEntry.primitive_entry {
147 | // type = STRING_TYPE
148 | // }
149 | // ]
150 | // }
151 |
152 | typedef i32 TTypeEntryPtr
153 |
154 | // Type entry for a primitive type.
155 | struct TPrimitiveTypeEntry {
156 | // The primitive type token. This must satisfy the condition
157 | // that type is in the PRIMITIVE_TYPES set.
158 | 1: required TTypeId type
159 | }
160 |
161 | // Type entry for an ARRAY type.
162 | struct TArrayTypeEntry {
163 | 1: required TTypeEntryPtr objectTypePtr
164 | }
165 |
166 | // Type entry for a MAP type.
167 | struct TMapTypeEntry {
168 | 1: required TTypeEntryPtr keyTypePtr
169 | 2: required TTypeEntryPtr valueTypePtr
170 | }
171 |
172 | // Type entry for a STRUCT type.
173 | struct TStructTypeEntry {
174 | 1: required map<string, TTypeEntryPtr> nameToTypePtr
175 | }
176 |
177 | // Type entry for a UNIONTYPE type.
178 | struct TUnionTypeEntry {
179 | 1: required map<string, TTypeEntryPtr> nameToTypePtr
180 | }
181 |
182 | struct TUserDefinedTypeEntry {
183 | // The fully qualified name of the class implementing this type.
184 | 1: required string typeClassName
185 | }
186 |
187 | // We use a union here since Thrift does not support inheritance.
188 | union TTypeEntry {
189 | 1: TPrimitiveTypeEntry primitiveEntry
190 | 2: TArrayTypeEntry arrayEntry
191 | 3: TMapTypeEntry mapEntry
192 | 4: TStructTypeEntry structEntry
193 | 5: TUnionTypeEntry unionEntry
194 | 6: TUserDefinedTypeEntry userDefinedTypeEntry
195 | }
196 |
197 | // Type descriptor for columns.
198 | struct TTypeDesc {
199 | // The "top" type is always the first element of the list.
200 | // If the top type is an ARRAY, MAP, STRUCT, or UNIONTYPE
201 | // type, then subsequent elements represent nested types.
202 | 1: required list<TTypeEntry> types
203 | }
204 |
205 | // A result set column descriptor.
206 | struct TColumnDesc {
207 | // The name of the column
208 | 1: required string columnName
209 |
210 | // The type descriptor for this column
211 | 2: required TTypeDesc typeDesc
212 |
213 | // The ordinal position of this column in the schema
214 | 3: required i32 position
215 |
216 | 4: optional string comment
217 | }
218 |
219 | // Metadata used to describe the schema (column names, types, comments)
220 | // of result sets.
221 | struct TTableSchema {
222 | 1: required list<TColumnDesc> columns
223 | }
224 |
225 | // A Boolean column value.
226 | struct TBoolValue {
227 | // NULL if value is unset.
228 | 1: optional bool value
229 | }
230 |
231 | // A Byte column value.
232 | struct TByteValue {
233 | // NULL if value is unset.
234 | 1: optional byte value
235 | }
236 |
237 | // A signed, 16 bit column value.
238 | struct TI16Value {
239 | // NULL if value is unset
240 | 1: optional i16 value
241 | }
242 |
243 | // A signed, 32 bit column value
244 | struct TI32Value {
245 | // NULL if value is unset
246 | 1: optional i32 value
247 | }
248 |
249 | // A signed 64 bit column value
250 | struct TI64Value {
251 | // NULL if value is unset
252 | 1: optional i64 value
253 | }
254 |
255 | // A floating point 64 bit column value
256 | struct TDoubleValue {
257 | // NULL if value is unset
258 | 1: optional double value
259 | }
260 |
261 | struct TStringValue {
262 | // NULL if value is unset
263 | 1: optional string value
264 | }
265 |
266 | union TColumn {
267 | 1: list<TBoolValue> boolColumn
268 | 2: list<TByteValue> byteColumn
269 | 3: list<TI16Value> i16Column
270 | 4: list<TI32Value> i32Column
271 | 5: list<TI64Value> i64Column
272 | 6: list<TDoubleValue> doubleColumn
273 | 7: list<TStringValue> stringColumn
274 | }
275 |
276 | // A single column value in a result set.
277 | // Note that Hive's type system is richer than Thrift's,
278 | // so in some cases we have to map multiple Hive types
279 | // to the same Thrift type. On the client-side this is
280 | // disambiguated by looking at the Schema of the
281 | // result set.
282 | union TColumnValue {
283 | 1: TBoolValue boolVal // BOOLEAN
284 | 2: TByteValue byteVal // TINYINT
285 | 3: TI16Value i16Val // SMALLINT
286 | 4: TI32Value i32Val // INT
287 | 5: TI64Value i64Val // BIGINT, TIMESTAMP
288 | 6: TDoubleValue doubleVal // FLOAT, DOUBLE
289 | 7: TStringValue stringVal // STRING, LIST, MAP, STRUCT, UNIONTYPE, BINARY, DECIMAL
290 | }
291 |
292 | // Represents a row in a rowset.
293 | struct TRow {
294 | 1: required list<TColumnValue> colVals
295 | }
296 |
297 | // Represents a rowset
298 | struct TRowSet {
299 | // The starting row offset of this rowset.
300 | 1: required i64 startRowOffset
301 | 2: required list<TRow> rows
302 | 3: optional list<TColumn> columns
303 | }
304 |
305 | // The return status code contained in each response.
306 | enum TStatusCode {
307 | SUCCESS_STATUS,
308 | SUCCESS_WITH_INFO_STATUS,
309 | STILL_EXECUTING_STATUS,
310 | ERROR_STATUS,
311 | INVALID_HANDLE_STATUS
312 | }
313 |
314 | // The return status of a remote request
315 | struct TStatus {
316 | 1: required TStatusCode statusCode
317 |
318 | // If status is SUCCESS_WITH_INFO, info_msgs may be populated with
319 | // additional diagnostic information.
320 | 2: optional list<string> infoMessages
321 |
322 | // If status is ERROR, then the following fields may be set
323 | 3: optional string sqlState // as defined in the ISO/IEF CLI specification
324 | 4: optional i32 errorCode // internal error code
325 | 5: optional string errorMessage
326 | }
327 |
328 | // The state of an operation (i.e. a query or other
329 | // asynchronous operation that generates a result set)
330 | // on the server.
331 | enum TOperationState {
332 | // The operation has been initialized
333 | INITIALIZED_STATE,
334 |
335 | // The operation is running. In this state the result
336 | // set is not available.
337 | RUNNING_STATE,
338 |
339 | // The operation has completed. When an operation is in
340 | // this state its result set may be fetched.
341 | FINISHED_STATE,
342 |
343 | // The operation was canceled by a client
344 | CANCELED_STATE,
345 |
346 | // The operation was closed by a client
347 | CLOSED_STATE,
348 |
349 | // The operation failed due to an error
350 | ERROR_STATE,
351 |
352 | // The operation is in an unrecognized state
353 | UKNOWN_STATE,
354 | }
355 |
356 |
357 | // A string identifier. This is interpreted literally.
358 | typedef string TIdentifier
359 |
360 | // A search pattern.
361 | //
362 | // Valid search pattern characters:
363 | // '_': Any single character.
364 | // '%': Any sequence of zero or more characters.
365 | // '\': Escape character used to include special characters,
366 | // e.g. '_', '%', '\'. If a '\' precedes a non-special
367 | // character it has no special meaning and is interpreted
368 | // literally.
369 | typedef string TPattern
370 |
371 |
372 | // A search pattern or identifier. Used as input
373 | // parameter for many of the catalog functions.
374 | typedef string TPatternOrIdentifier
375 |
376 | struct THandleIdentifier {
377 | // 16 byte globally unique identifier
378 | // This is the public ID of the handle and
379 | // can be used for reporting.
380 | 1: required binary guid,
381 |
382 | // 16 byte secret generated by the server
383 | // and used to verify that the handle is not
384 | // being hijacked by another user.
385 | 2: required binary secret,
386 | }
387 |
388 | // Client-side handle to persistent
389 | // session information on the server-side.
390 | struct TSessionHandle {
391 | 1: required THandleIdentifier sessionId
392 | }
393 |
394 | // The subtype of an OperationHandle.
395 | enum TOperationType {
396 | EXECUTE_STATEMENT,
397 | GET_TYPE_INFO,
398 | GET_CATALOGS,
399 | GET_SCHEMAS,
400 | GET_TABLES,
401 | GET_TABLE_TYPES,
402 | GET_COLUMNS,
403 | GET_FUNCTIONS,
404 | UNKNOWN,
405 | }
406 |
407 | // Client-side reference to a task running
408 | // asynchronously on the server.
409 | struct TOperationHandle {
410 | 1: required THandleIdentifier operationId
411 | 2: required TOperationType operationType
412 |
413 | // If hasResultSet = TRUE, then this operation
414 | // generates a result set that can be fetched.
415 | // Note that the result set may be empty.
416 | //
417 | // If hasResultSet = FALSE, then this operation
418 | // does not generate a result set, and calling
419 | // GetResultSetMetadata or FetchResults against
420 | // this OperationHandle will generate an error.
421 | 3: required bool hasResultSet
422 |
423 | // For operations that don't generate result sets,
424 | // modifiedRowCount is either:
425 | //
426 | // 1) The number of rows that were modified by
427 | // the DML operation (e.g. number of rows inserted,
428 | // number of rows deleted, etc).
429 | //
430 | // 2) 0 for operations that don't modify or add rows.
431 | //
432 | // 3) < 0 if the operation is capable of modifying rows,
433 | // but Hive is unable to determine how many rows were
434 | // modified. For example, Hive's LOAD DATA command
435 | // doesn't generate row count information because
436 | // Hive doesn't inspect the data as it is loaded.
437 | //
438 | // modifiedRowCount is unset if the operation generates
439 | // a result set.
440 | 4: optional double modifiedRowCount
441 | }
442 |
443 |
444 | // OpenSession()
445 | //
446 | // Open a session (connection) on the server against
447 | // which operations may be executed.
448 | struct TOpenSessionReq {
449 | // The version of the HiveServer2 protocol that the client is using.
450 | 1: required TProtocolVersion client_protocol = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1
451 |
452 | // Username and password for authentication.
453 | // Depending on the authentication scheme being used,
454 | // this information may instead be provided by a lower
455 | // protocol layer, in which case these fields may be
456 | // left unset.
457 | 2: optional string username
458 | 3: optional string password
459 |
460 | // Configuration overlay which is applied when the session is
461 | // first created.
462 | 4: optional map<string, string> configuration
463 | }
464 |
465 | struct TOpenSessionResp {
466 | 1: required TStatus status
467 |
468 | // The protocol version that the server is using.
469 | 2: required TProtocolVersion serverProtocolVersion = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1
470 |
471 | // Session Handle
472 | 3: optional TSessionHandle sessionHandle
473 |
474 | // The configuration settings for this session.
475 | 4: optional map<string, string> configuration
476 | }
477 |
478 |
479 | // CloseSession()
480 | //
481 | // Closes the specified session and frees any resources
482 | // currently allocated to that session. Any open
483 | // operations in that session will be canceled.
484 | struct TCloseSessionReq {
485 | 1: required TSessionHandle sessionHandle
486 | }
487 |
488 | struct TCloseSessionResp {
489 | 1: required TStatus status
490 | }
491 |
492 |
493 |
494 | enum TGetInfoType {
495 | CLI_MAX_DRIVER_CONNECTIONS = 0,
496 | CLI_MAX_CONCURRENT_ACTIVITIES = 1,
497 | CLI_DATA_SOURCE_NAME = 2,
498 | CLI_FETCH_DIRECTION = 8,
499 | CLI_SERVER_NAME = 13,
500 | CLI_SEARCH_PATTERN_ESCAPE = 14,
501 | CLI_DBMS_NAME = 17,
502 | CLI_DBMS_VER = 18,
503 | CLI_ACCESSIBLE_TABLES = 19,
504 | CLI_ACCESSIBLE_PROCEDURES = 20,
505 | CLI_CURSOR_COMMIT_BEHAVIOR = 23,
506 | CLI_DATA_SOURCE_READ_ONLY = 25,
507 | CLI_DEFAULT_TXN_ISOLATION = 26,
508 | CLI_IDENTIFIER_CASE = 28,
509 | CLI_IDENTIFIER_QUOTE_CHAR = 29,
510 | CLI_MAX_COLUMN_NAME_LEN = 30,
511 | CLI_MAX_CURSOR_NAME_LEN = 31,
512 | CLI_MAX_SCHEMA_NAME_LEN = 32,
513 | CLI_MAX_CATALOG_NAME_LEN = 34,
514 | CLI_MAX_TABLE_NAME_LEN = 35,
515 | CLI_SCROLL_CONCURRENCY = 43,
516 | CLI_TXN_CAPABLE = 46,
517 | CLI_USER_NAME = 47,
518 | CLI_TXN_ISOLATION_OPTION = 72,
519 | CLI_INTEGRITY = 73,
520 | CLI_GETDATA_EXTENSIONS = 81,
521 | CLI_NULL_COLLATION = 85,
522 | CLI_ALTER_TABLE = 86,
523 | CLI_ORDER_BY_COLUMNS_IN_SELECT = 90,
524 | CLI_SPECIAL_CHARACTERS = 94,
525 | CLI_MAX_COLUMNS_IN_GROUP_BY = 97,
526 | CLI_MAX_COLUMNS_IN_INDEX = 98,
527 | CLI_MAX_COLUMNS_IN_ORDER_BY = 99,
528 | CLI_MAX_COLUMNS_IN_SELECT = 100,
529 | CLI_MAX_COLUMNS_IN_TABLE = 101,
530 | CLI_MAX_INDEX_SIZE = 102,
531 | CLI_MAX_ROW_SIZE = 104,
532 | CLI_MAX_STATEMENT_LEN = 105,
533 | CLI_MAX_TABLES_IN_SELECT = 106,
534 | CLI_MAX_USER_NAME_LEN = 107,
535 | CLI_OJ_CAPABILITIES = 115,
536 |
537 | CLI_XOPEN_CLI_YEAR = 10000,
538 | CLI_CURSOR_SENSITIVITY = 10001,
539 | CLI_DESCRIBE_PARAMETER = 10002,
540 | CLI_CATALOG_NAME = 10003,
541 | CLI_COLLATION_SEQ = 10004,
542 | CLI_MAX_IDENTIFIER_LEN = 10005,
543 | }
544 |
545 | union TGetInfoValue {
546 | 1: string stringValue
547 | 2: i16 smallIntValue
548 | 3: i32 integerBitmask
549 | 4: i32 integerFlag
550 | 5: i32 binaryValue
551 | 6: i64 lenValue
552 | }
553 |
554 | // GetInfo()
555 | //
556 | // This function is based on ODBC's CLIGetInfo() function.
557 | // The function returns general information about the data source
558 | // using the same keys as ODBC.
559 | struct TGetInfoReq {
560 | // The session to run this request against
561 | 1: required TSessionHandle sessionHandle
562 |
563 | 2: required TGetInfoType infoType
564 | }
565 |
566 | struct TGetInfoResp {
567 | 1: required TStatus status
568 |
569 | 2: required TGetInfoValue infoValue
570 | }
571 |
572 |
573 | // ExecuteStatement()
574 | //
575 | // Execute a statement.
576 | // The returned OperationHandle can be used to check on the
577 | // status of the statement, and to fetch results once the
578 | // statement has finished executing.
579 | struct TExecuteStatementReq {
580 | // The session to execute the statement against
581 | 1: required TSessionHandle sessionHandle
582 |
583 | // The statement to be executed (DML, DDL, SET, etc)
584 | 2: required string statement
585 |
586 | // Configuration properties that are overlayed on top of the
587 | // the existing session configuration before this statement
588 | // is executed. These properties apply to this statement
589 | // only and will not affect the subsequent state of the Session.
590 | 3: optional map<string, string> confOverlay
591 | }
592 |
593 | struct TExecuteStatementResp {
594 | 1: required TStatus status
595 | 2: optional TOperationHandle operationHandle
596 | }
597 |
598 |
599 | // GetTypeInfo()
600 | //
601 | // Get information about types supported by the HiveServer instance.
602 | // The information is returned as a result set which can be fetched
603 | // using the OperationHandle provided in the response.
604 | //
605 | // Refer to the documentation for ODBC's CLIGetTypeInfo function for
606 | // the format of the result set.
607 | struct TGetTypeInfoReq {
608 | // The session to run this request against.
609 | 1: required TSessionHandle sessionHandle
610 | }
611 |
612 | struct TGetTypeInfoResp {
613 | 1: required TStatus status
614 | 2: optional TOperationHandle operationHandle
615 | }
616 |
617 |
618 | // GetCatalogs()
619 | //
620 | // Returns the list of catalogs (databases)
621 | // Results are ordered by TABLE_CATALOG
622 | //
623 | // Resultset columns :
624 | // col1
625 | // name: TABLE_CAT
626 | // type: STRING
627 | // desc: Catalog name. NULL if not applicable.
628 | //
629 | struct TGetCatalogsReq {
630 | // Session to run this request against
631 | 1: required TSessionHandle sessionHandle
632 | }
633 |
634 | struct TGetCatalogsResp {
635 | 1: required TStatus status
636 | 2: optional TOperationHandle operationHandle
637 | }
638 |
639 |
640 | // GetSchemas()
641 | //
642 | // Retrieves the schema names available in this database.
643 | // The results are ordered by TABLE_CATALOG and TABLE_SCHEM.
644 | // col1
645 | // name: TABLE_SCHEM
646 | // type: STRING
647 | // desc: schema name
648 | // col2
649 | // name: TABLE_CATALOG
650 | // type: STRING
651 | // desc: catalog name
652 | struct TGetSchemasReq {
653 | // Session to run this request against
654 | 1: required TSessionHandle sessionHandle
655 |
656 | // Name of the catalog. Must not contain a search pattern.
657 | 2: optional TIdentifier catalogName
658 |
659 | // schema name or pattern
660 | 3: optional TPatternOrIdentifier schemaName
661 | }
662 |
663 | struct TGetSchemasResp {
664 | 1: required TStatus status
665 | 2: optional TOperationHandle operationHandle
666 | }
667 |
668 |
669 | // GetTables()
670 | //
671 | // Returns a list of tables with catalog, schema, and table
672 | // type information. The information is returned as a result
673 | // set which can be fetched using the OperationHandle
674 | // provided in the response.
675 | // Results are ordered by TABLE_TYPE, TABLE_CAT, TABLE_SCHEM, and TABLE_NAME
676 | //
677 | // Result Set Columns:
678 | //
679 | // col1
680 | // name: TABLE_CAT
681 | // type: STRING
682 | // desc: Catalog name. NULL if not applicable.
683 | //
684 | // col2
685 | // name: TABLE_SCHEM
686 | // type: STRING
687 | // desc: Schema name.
688 | //
689 | // col3
690 | // name: TABLE_NAME
691 | // type: STRING
692 | // desc: Table name.
693 | //
694 | // col4
695 | // name: TABLE_TYPE
696 | // type: STRING
697 | // desc: The table type, e.g. "TABLE", "VIEW", etc.
698 | //
699 | // col5
700 | // name: REMARKS
701 | // type: STRING
702 | // desc: Comments about the table
703 | //
704 | struct TGetTablesReq {
705 | // Session to run this request against
706 | 1: required TSessionHandle sessionHandle
707 |
708 | // Name of the catalog or a search pattern.
709 | 2: optional TPatternOrIdentifier catalogName
710 |
711 | // Name of the schema or a search pattern.
712 | 3: optional TPatternOrIdentifier schemaName
713 |
714 | // Name of the table or a search pattern.
715 | 4: optional TPatternOrIdentifier tableName
716 |
717 | // List of table types to match
718 | // e.g. "TABLE", "VIEW", "SYSTEM TABLE", "GLOBAL TEMPORARY",
719 | // "LOCAL TEMPORARY", "ALIAS", "SYNONYM", etc.
720 | 5: optional list<string> tableTypes
721 | }
722 |
723 | struct TGetTablesResp {
724 | 1: required TStatus status
725 | 2: optional TOperationHandle operationHandle
726 | }
727 |
728 |
729 | // GetTableTypes()
730 | //
731 | // Returns the table types available in this database.
732 | // The results are ordered by table type.
733 | //
734 | // col1
735 | // name: TABLE_TYPE
736 | // type: STRING
737 | // desc: Table type name.
738 | struct TGetTableTypesReq {
739 | // Session to run this request against
740 | 1: required TSessionHandle sessionHandle
741 | }
742 |
743 | struct TGetTableTypesResp {
744 | 1: required TStatus status
745 | 2: optional TOperationHandle operationHandle
746 | }
747 |
748 |
749 | // GetColumns()
750 | //
751 | // Returns a list of columns in the specified tables.
752 | // The information is returned as a result set which can be fetched
753 | // using the OperationHandle provided in the response.
754 | // Results are ordered by TABLE_CAT, TABLE_SCHEM, TABLE_NAME,
755 | // and ORDINAL_POSITION.
756 | //
757 | // Result Set Columns are the same as those for the ODBC CLIColumns
758 | // function.
759 | //
760 | struct TGetColumnsReq {
761 | // Session to run this request against
762 | 1: required TSessionHandle sessionHandle
763 |
764 | // Name of the catalog. Must not contain a search pattern.
765 | 2: optional TIdentifier catalogName
766 |
767 | // Schema name or search pattern
768 | 3: optional TPatternOrIdentifier schemaName
769 |
770 | // Table name or search pattern
771 | 4: optional TPatternOrIdentifier tableName
772 |
773 | // Column name or search pattern
774 | 5: optional TPatternOrIdentifier columnName
775 | }
776 |
777 | struct TGetColumnsResp {
778 | 1: required TStatus status
779 | 2: optional TOperationHandle operationHandle
780 | }
781 |
782 |
783 | // GetFunctions()
784 | //
785 | // Returns a list of functions supported by the data source. The
786 | // behavior of this function matches
787 | // java.sql.DatabaseMetaData.getFunctions() both in terms of
788 | // inputs and outputs.
789 | //
790 | // Result Set Columns:
791 | //
792 | // col1
793 | // name: FUNCTION_CAT
794 | // type: STRING
795 | // desc: Function catalog (may be null)
796 | //
797 | // col2
798 | // name: FUNCTION_SCHEM
799 | // type: STRING
800 | // desc: Function schema (may be null)
801 | //
802 | // col3
803 | // name: FUNCTION_NAME
804 | // type: STRING
805 | // desc: Function name. This is the name used to invoke the function.
806 | //
807 | // col4
808 | // name: REMARKS
809 | // type: STRING
810 | // desc: Explanatory comment on the function.
811 | //
812 | // col5
813 | // name: FUNCTION_TYPE
814 | // type: SMALLINT
815 | // desc: Kind of function. One of:
816 | // * functionResultUnknown - Cannot determine if a return value or a table
817 | // will be returned.
818 | // * functionNoTable - Does not return a table.
819 | // * functionReturnsTable - Returns a table.
820 | //
821 | // col6
822 | // name: SPECIFIC_NAME
823 | // type: STRING
824 | // desc: The name which uniquely identifies this function within its schema.
825 | // In this case this is the fully qualified class name of the class
826 | // that implements this function.
827 | //
828 | struct TGetFunctionsReq {
829 | // Session to run this request against
830 | 1: required TSessionHandle sessionHandle
831 |
832 | // A catalog name; must match the catalog name as it is stored in the
833 | // database; "" retrieves those without a catalog; null means
834 | // that the catalog name should not be used to narrow the search.
835 | 2: optional TIdentifier catalogName
836 |
837 | // A schema name pattern; must match the schema name as it is stored
838 | // in the database; "" retrieves those without a schema; null means
839 | // that the schema name should not be used to narrow the search.
840 | 3: optional TPatternOrIdentifier schemaName
841 |
842 | // A function name pattern; must match the function name as it is stored
843 | // in the database.
844 | 4: required TPatternOrIdentifier functionName
845 | }
846 |
847 | struct TGetFunctionsResp {
848 | 1: required TStatus status
849 | 2: optional TOperationHandle operationHandle
850 | }
851 |
852 |
853 | // GetOperationStatus()
854 | //
855 | // Get the status of an operation running on the server.
856 | struct TGetOperationStatusReq {
857 | // Operation to get the status of
858 | 1: required TOperationHandle operationHandle
859 | }
860 |
861 | struct TGetOperationStatusResp {
862 | 1: required TStatus status
863 | 2: optional TOperationState operationState
864 | }
865 |
866 |
867 | // CancelOperation()
868 | //
869 | // Cancels processing on the specified operation handle and
870 | // frees any resources which were allocated.
871 | struct TCancelOperationReq {
872 | // Operation to cancel
873 | 1: required TOperationHandle operationHandle
874 | }
875 |
876 | struct TCancelOperationResp {
877 | 1: required TStatus status
878 | }
879 |
880 |
881 | // CloseOperation()
882 | //
883 | // Given an operation in the FINISHED, CANCELED,
884 | // or ERROR states, CloseOperation() will free
885 | // all of the resources which were allocated on
886 | // the server to service the operation.
887 | struct TCloseOperationReq {
888 | 1: required TOperationHandle operationHandle
889 | }
890 |
891 | struct TCloseOperationResp {
892 | 1: required TStatus status
893 | }
894 |
895 |
896 | // GetResultSetMetadata()
897 | //
898 | // Retrieves schema information for the specified operation
899 | struct TGetResultSetMetadataReq {
900 | // Operation for which to fetch result set schema information
901 | 1: required TOperationHandle operationHandle
902 | }
903 |
904 | struct TGetResultSetMetadataResp {
905 | 1: required TStatus status
906 | 2: optional TTableSchema schema
907 | }
908 |
909 |
910 | enum TFetchOrientation {
911 | // Get the next rowset. The fetch offset is ignored.
912 | FETCH_NEXT,
913 |
914 | // Get the previous rowset. The fetch offset is ignored.
915 | // NOT SUPPORTED
916 | FETCH_PRIOR,
917 |
918 | // Return the rowset at the given fetch offset relative
919 | // to the current rowset.
920 | // NOT SUPPORTED
921 | FETCH_RELATIVE,
922 |
923 | // Return the rowset at the specified fetch offset.
924 | // NOT SUPPORTED
925 | FETCH_ABSOLUTE,
926 |
927 | // Get the first rowset in the result set.
928 | FETCH_FIRST,
929 |
930 | // Get the last rowset in the result set.
931 | // NOT SUPPORTED
932 | FETCH_LAST
933 | }
934 |
935 | // FetchResults()
936 | //
937 | // Fetch rows from the server corresponding to
938 | // a particular OperationHandle.
939 | struct TFetchResultsReq {
940 | // Operation from which to fetch results.
941 | 1: required TOperationHandle operationHandle
942 |
943 | // The fetch orientation. For V1 this must be either
944 | // FETCH_NEXT or FETCH_FIRST. Defaults to FETCH_NEXT.
945 | 2: required TFetchOrientation orientation = TFetchOrientation.FETCH_NEXT
946 |
947 | // Max number of rows that should be returned in
948 | // the rowset.
949 | 3: required i64 maxRows
950 | }
951 |
952 | struct TFetchResultsResp {
953 | 1: required TStatus status
954 |
955 | // TRUE if there are more rows left to fetch from the server.
956 | 2: optional bool hasMoreRows
957 |
958 | // The rowset. This is optional so that we have the
959 | // option in the future of adding alternate formats for
960 | // representing result set data, e.g. delimited strings,
961 | // binary encoded, etc.
962 | 3: optional TRowSet results
963 | }
964 |
965 | // GetLog()
966 | //
967 | // Fetch operation log from the server corresponding to
968 | // a particular OperationHandle.
969 | struct TGetLogReq {
970 | // Operation whose log is requested
971 | 1: required TOperationHandle operationHandle
972 | }
973 |
974 | struct TGetLogResp {
975 | 1: required TStatus status
976 |
977 | 2: required string log
978 | }
979 |
980 | service TCLIService {
981 |
982 | TOpenSessionResp OpenSession(1:TOpenSessionReq req);
983 |
984 | TCloseSessionResp CloseSession(1:TCloseSessionReq req);
985 |
986 | TGetInfoResp GetInfo(1:TGetInfoReq req);
987 |
988 | TExecuteStatementResp ExecuteStatement(1:TExecuteStatementReq req);
989 |
990 | TGetTypeInfoResp GetTypeInfo(1:TGetTypeInfoReq req);
991 |
992 | TGetCatalogsResp GetCatalogs(1:TGetCatalogsReq req);
993 |
994 | TGetSchemasResp GetSchemas(1:TGetSchemasReq req);
995 |
996 | TGetTablesResp GetTables(1:TGetTablesReq req);
997 |
998 | TGetTableTypesResp GetTableTypes(1:TGetTableTypesReq req);
999 |
1000 | TGetColumnsResp GetColumns(1:TGetColumnsReq req);
1001 |
1002 | TGetFunctionsResp GetFunctions(1:TGetFunctionsReq req);
1003 |
1004 | TGetOperationStatusResp GetOperationStatus(1:TGetOperationStatusReq req);
1005 |
1006 | TCancelOperationResp CancelOperation(1:TCancelOperationReq req);
1007 |
1008 | TCloseOperationResp CloseOperation(1:TCloseOperationReq req);
1009 |
1010 | TGetResultSetMetadataResp GetResultSetMetadata(1:TGetResultSetMetadataReq req);
1011 |
1012 | TFetchResultsResp FetchResults(1:TFetchResultsReq req);
1013 |
1014 | TGetLogResp GetLog(1:TGetLogReq req);
1015 | }
1016 |
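For orientation, the request/response pairs above are always driven in the same open → execute → fetch → close sequence. A minimal sketch, assuming the Thrift-generated Java classes (TCLIService.Client and the T*Req/T*Resp structs) and standard libthrift transports; the host, port, and query are illustrative:

    import org.apache.thrift.protocol.TBinaryProtocol
    import org.apache.thrift.transport.TSocket

    val transport = new TSocket("localhost", 10000)   // illustrative endpoint
    transport.open()
    val client = new TCLIService.Client(new TBinaryProtocol(transport))

    val session = client.OpenSession(new TOpenSessionReq())
    val exec = client.ExecuteStatement(
      new TExecuteStatementReq(session.getSessionHandle, "SELECT 1"))

    // Page through results with FETCH_NEXT until the server reports no more rows.
    var more = true
    while (more) {
      val resp = client.FetchResults(
        new TFetchResultsReq(exec.getOperationHandle, TFetchOrientation.FETCH_NEXT, 1000L))
      more = resp.isSetHasMoreRows && resp.isHasMoreRows
    }

    client.CloseOperation(new TCloseOperationReq(exec.getOperationHandle))
    client.CloseSession(new TCloseSessionReq(session.getSessionHandle))
    transport.close()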
--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/fb303.thrift:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 |
20 | /**
21 | * fb303.thrift
22 | */
23 |
24 | namespace java com.facebook.fb303
25 | namespace cpp facebook.fb303
26 | namespace rb Impala.Protocol.fb303
27 |
28 | /**
29 | * Common status reporting mechanism across all services
30 | */
31 | enum fb_status {
32 | DEAD = 0,
33 | STARTING = 1,
34 | ALIVE = 2,
35 | STOPPING = 3,
36 | STOPPED = 4,
37 | WARNING = 5,
38 | }
39 |
40 | /**
41 | * Standard base service
42 | */
43 | service FacebookService {
44 |
45 | /**
46 | * Returns a descriptive name of the service
47 | */
48 | string getName(),
49 |
50 | /**
51 | * Returns the version of the service
52 | */
53 | string getVersion(),
54 |
55 | /**
56 | * Gets the status of this service
57 | */
58 | fb_status getStatus(),
59 |
60 | /**
61 | * User friendly description of status, such as why the service is in
62 | * the dead or warning state, or what is being started or stopped.
63 | */
64 | string getStatusDetails(),
65 |
66 | /**
67 | * Gets the counters for this service
68 | */
69 | map<string, i64> getCounters(),
70 |
71 | /**
72 | * Gets the value of a single counter
73 | */
74 | i64 getCounter(1: string key),
75 |
76 | /**
77 | * Sets an option
78 | */
79 | void setOption(1: string key, 2: string value),
80 |
81 | /**
82 | * Gets an option
83 | */
84 | string getOption(1: string key),
85 |
86 | /**
87 | * Gets all options
88 | */
89 | map<string, string> getOptions(),
90 |
91 | /**
92 | * Returns a CPU profile over the given time interval (client and server
93 | * must agree on the profile format).
94 | */
95 | string getCpuProfile(1: i32 profileDurationInSec),
96 |
97 | /**
98 | * Returns the unix time that the server has been running since
99 | */
100 | i64 aliveSince(),
101 |
102 | /**
103 | * Tell the server to reload its configuration, reopen log files, etc
104 | */
105 | oneway void reinitialize(),
106 |
107 | /**
108 | * Suggest a shutdown to the server
109 | */
110 | oneway void shutdown(),
111 |
112 | }
113 |
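A service that extends FacebookService (for example ThriftHiveMetastore below) inherits these calls, so the generated client doubles as a basic health check. A rough sketch, assuming an already-opened Thrift protocol named `protocol`:

    import com.facebook.fb303.FacebookService

    val fb = new FacebookService.Client(protocol)
    println(fb.getName)      // descriptive service name
    println(fb.getStatus)    // fb_status, e.g. ALIVE
    println(fb.getCounters)  // map of counter name -> i64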
--------------------------------------------------------------------------------
/herringbone-impala/src/main/thrift/hive_metastore.thrift:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/thrift -java
2 |
3 | /**
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | */
20 |
21 | #
22 | # Thrift Service that the MetaStore is built on
23 | #
24 |
25 | include "fb303.thrift"
26 |
27 | namespace java org.apache.hadoop.hive.metastore.api
28 | namespace php metastore
29 | namespace cpp Apache.Hadoop.Hive
30 | namespace rb Impala.Protocol.HiveMetastore
31 |
32 | const string DDL_TIME = "transient_lastDdlTime"
33 |
34 | struct Version {
35 | 1: string version,
36 | 2: string comments
37 | }
38 |
39 | struct FieldSchema {
40 | 1: string name, // name of the field
41 | 2: string type, // type of the field. primitive types defined above, specify list<TYPE_NAME>, map<key_TYPE_NAME, value_TYPE_NAME> for lists & maps
42 | 3: string comment
43 | }
44 |
45 | struct Type {
46 | 1: string name, // one of the types in PrimitiveTypes or CollectionTypes or User defined types
47 | 2: optional string type1, // object type if the name is 'list' (LIST_TYPE), key type if the name is 'map' (MAP_TYPE)
48 | 3: optional string type2, // val type if the name is 'map' (MAP_TYPE)
49 | //4: optional list<FieldSchema> fields // if the name is one of the user defined types
50 | }
51 |
52 | enum HiveObjectType {
53 | GLOBAL = 1,
54 | DATABASE = 2,
55 | TABLE = 3,
56 | PARTITION = 4,
57 | COLUMN = 5,
58 | }
59 |
60 | enum PrincipalType {
61 | USER = 1,
62 | ROLE = 2,
63 | GROUP = 3,
64 | }
65 |
66 | const string HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__"
67 | const string HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__"
68 | const string HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__"
69 |
70 | enum PartitionEventType {
71 | LOAD_DONE = 1,
72 | }
73 |
74 | struct HiveObjectRef{
75 | 1: HiveObjectType objectType,
76 | 2: string dbName,
77 | 3: string objectName,
78 | 4: list<string> partValues,
79 | 5: string columnName,
80 | }
81 |
82 | struct PrivilegeGrantInfo {
83 | 1: string privilege,
84 | 2: i32 createTime,
85 | 3: string grantor,
86 | 4: PrincipalType grantorType,
87 | 5: bool grantOption,
88 | }
89 |
90 | struct HiveObjectPrivilege {
91 | 1: HiveObjectRef hiveObject,
92 | 2: string principalName,
93 | 3: PrincipalType principalType,
94 | 4: PrivilegeGrantInfo grantInfo,
95 | }
96 |
97 | struct PrivilegeBag {
98 | 1: list<HiveObjectPrivilege> privileges,
99 | }
100 |
101 | struct PrincipalPrivilegeSet {
102 | 1: map<string, list<PrivilegeGrantInfo>> userPrivileges, // user name -> privilege grant info
103 | 2: map<string, list<PrivilegeGrantInfo>> groupPrivileges, // group name -> privilege grant info
104 | 3: map<string, list<PrivilegeGrantInfo>> rolePrivileges, //role name -> privilege grant info
105 | }
106 |
107 | struct Role {
108 | 1: string roleName,
109 | 2: i32 createTime,
110 | 3: string ownerName,
111 | }
112 |
113 | // namespace for tables
114 | struct Database {
115 | 1: string name,
116 | 2: string description,
117 | 3: string locationUri,
118 | 4: map<string, string> parameters, // properties associated with the database
119 | 5: optional PrincipalPrivilegeSet privileges
120 | }
121 |
122 | // This object holds the information needed by SerDes
123 | struct SerDeInfo {
124 | 1: string name, // name of the serde, table name by default
125 | 2: string serializationLib, // usually the class that implements the extractor & loader
126 | 3: map<string, string> parameters // initialization parameters
127 | }
128 |
129 | // sort order of a column (column name along with asc(1)/desc(0))
130 | struct Order {
131 | 1: string col, // sort column name
132 | 2: i32 order // asc(1) or desc(0)
133 | }
134 |
135 | // this object holds all the information about physical storage of the data belonging to a table
136 | struct StorageDescriptor {
137 | 1: list<FieldSchema> cols, // required (refer to types defined above)
138 | 2: string location, // defaults to <warehouse loc>/<db loc>/tablename
139 | 3: string inputFormat, // SequenceFileInputFormat (binary) or TextInputFormat or custom format
140 | 4: string outputFormat, // SequenceFileOutputFormat (binary) or IgnoreKeyTextOutputFormat or custom format
141 | 5: bool compressed, // compressed or not
142 | 6: i32 numBuckets, // this must be specified if there are any dimension columns
143 | 7: SerDeInfo serdeInfo, // serialization and deserialization information
144 | 8: list<string> bucketCols, // reducer grouping columns and clustering columns and bucketing columns
145 | 9: list<Order> sortCols, // sort order of the data in each bucket
146 | 10: map<string, string> parameters // any user supplied key value hash
147 | }
148 |
149 | // table information
150 | struct Table {
151 | 1: string tableName, // name of the table
152 | 2: string dbName, // database name ('default')
153 | 3: string owner, // owner of this table
154 | 4: i32 createTime, // creation time of the table
155 | 5: i32 lastAccessTime, // last access time (usually this will be filled from HDFS and shouldn't be relied on)
156 | 6: i32 retention, // retention time
157 | 7: StorageDescriptor sd, // storage descriptor of the table
158 | 8: list<FieldSchema> partitionKeys, // partition keys of the table. only primitive types are supported
159 | 9: map<string, string> parameters, // to store comments or any other user level parameters
160 | 10: string viewOriginalText, // original view text, null for non-view
161 | 11: string viewExpandedText, // expanded view text, null for non-view
162 | 12: string tableType, // table type enum, e.g. EXTERNAL_TABLE
163 | 13: optional PrincipalPrivilegeSet privileges,
164 | }
165 |
166 | struct Partition {
167 | 1: list<string> values // string value is converted to appropriate partition key type
168 | 2: string dbName,
169 | 3: string tableName,
170 | 4: i32 createTime,
171 | 5: i32 lastAccessTime,
172 | 6: StorageDescriptor sd,
173 | 7: map<string, string> parameters,
174 | 8: optional PrincipalPrivilegeSet privileges
175 | }
176 |
177 | struct Index {
178 | 1: string indexName, // unique with in the whole database namespace
179 | 2: string indexHandlerClass, // reserved
180 | 3: string dbName,
181 | 4: string origTableName,
182 | 5: i32 createTime,
183 | 6: i32 lastAccessTime,
184 | 7: string indexTableName,
185 | 8: StorageDescriptor sd,
186 | 9: map<string, string> parameters,
187 | 10: bool deferredRebuild
188 | }
189 |
190 | // schema of the table/query results etc.
191 | struct Schema {
192 | // column names, types, comments
193 | 1: list<FieldSchema> fieldSchemas, // delimiters etc
194 | 2: map<string, string> properties
195 | }
196 |
197 | // Key-value store to be used with selected
198 | // Metastore APIs (create, alter methods).
199 | // The client can pass environment properties / configs that can be
200 | // accessed in hooks.
201 | struct EnvironmentContext {
202 | 1: map<string, string> properties
203 | }
204 |
205 | exception MetaException {
206 | 1: string message
207 | }
208 |
209 | exception UnknownTableException {
210 | 1: string message
211 | }
212 |
213 | exception UnknownDBException {
214 | 1: string message
215 | }
216 |
217 | exception AlreadyExistsException {
218 | 1: string message
219 | }
220 |
221 | exception InvalidPartitionException {
222 | 1: string message
223 | }
224 |
225 | exception UnknownPartitionException {
226 | 1: string message
227 | }
228 |
229 | exception InvalidObjectException {
230 | 1: string message
231 | }
232 |
233 | exception NoSuchObjectException {
234 | 1: string message
235 | }
236 |
237 | exception IndexAlreadyExistsException {
238 | 1: string message
239 | }
240 |
241 | exception InvalidOperationException {
242 | 1: string message
243 | }
244 |
245 | exception ConfigValSecurityException {
246 | 1: string message
247 | }
248 |
249 | /**
250 | * This interface is live.
251 | */
252 | service ThriftHiveMetastore extends fb303.FacebookService
253 | {
254 | void create_database(1:Database database) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3)
255 | Database get_database(1:string name) throws(1:NoSuchObjectException o1, 2:MetaException o2)
256 | void drop_database(1:string name, 2:bool deleteData, 3:bool cascade) throws(1:NoSuchObjectException o1, 2:InvalidOperationException o2, 3:MetaException o3)
257 | list<string> get_databases(1:string pattern) throws(1:MetaException o1)
258 | list<string> get_all_databases() throws(1:MetaException o1)
259 | void alter_database(1:string dbname, 2:Database db) throws(1:MetaException o1, 2:NoSuchObjectException o2)
260 |
261 | // returns the type with given name (make separate calls for the dependent types if needed)
262 | Type get_type(1:string name) throws(1:MetaException o1, 2:NoSuchObjectException o2)
263 | bool create_type(1:Type type) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3)
264 | bool drop_type(1:string type) throws(1:MetaException o1, 2:NoSuchObjectException o2)
265 | map<string, Type> get_type_all(1:string name)
266 | throws(1:MetaException o2)
267 |
268 | // Gets a list of FieldSchemas describing the columns of a particular table
269 | list<FieldSchema> get_fields(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3),
270 |
271 | // Gets a list of FieldSchemas describing both the columns and the partition keys of a particular table
272 | list<FieldSchema> get_schema(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3)
273 |
274 | // create a Hive table. Following fields must be set
275 | // tableName
276 | // database (only 'default' for now until Hive QL supports databases)
277 | // owner (not needed, but good to have for tracking purposes)
278 | // sd.cols (list of field schemas)
279 | // sd.inputFormat (SequenceFileInputFormat (binary like falcon tables or u_full) or TextInputFormat)
280 | // sd.outputFormat (SequenceFileInputFormat (binary) or TextInputFormat)
281 | // sd.serdeInfo.serializationLib (SerDe class name eg org.apache.hadoop.hive.serde.simple_meta.MetadataTypedColumnsetSerDe
282 | // * See notes on DDL_TIME
283 | void create_table(1:Table tbl) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3, 4:NoSuchObjectException o4)
284 | void create_table_with_environment_context(1:Table tbl,
285 | 2:EnvironmentContext environment_context)
286 | throws (1:AlreadyExistsException o1,
287 | 2:InvalidObjectException o2, 3:MetaException o3,
288 | 4:NoSuchObjectException o4)
289 | // drops the table and all the partitions associated with it if the table has partitions
290 | // delete data (including partitions) if deleteData is set to true
291 | void drop_table(1:string dbname, 2:string name, 3:bool deleteData)
292 | throws(1:NoSuchObjectException o1, 2:MetaException o3)
293 | list<string> get_tables(1: string db_name, 2: string pattern) throws (1: MetaException o1)
294 | list<string> get_all_tables(1: string db_name) throws (1: MetaException o1)
295 |
296 | Table get_table(1:string dbname, 2:string tbl_name)
297 | throws (1:MetaException o1, 2:NoSuchObjectException o2)
298 | list<Table> get_table_objects_by_name(1:string dbname, 2:list<string> tbl_names)
299 | throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3)
300 |
301 | // Get a list of table names that match a filter.
302 | // The filter operators are LIKE, <, <=, >, >=, =, <>
303 | //
304 | // In the filter statement, values interpreted as strings must be enclosed in quotes,
305 | // while values interpreted as integers should not be. Strings and integers are the only
306 | // supported value types.
307 | //
308 | // The currently supported key names in the filter are:
309 | // Constants.HIVE_FILTER_FIELD_OWNER, which filters on the tables' owner's name
310 | // and supports all filter operators
311 | // Constants.HIVE_FILTER_FIELD_LAST_ACCESS, which filters on the last access times
312 | // and supports all filter operators except LIKE
313 | // Constants.HIVE_FILTER_FIELD_PARAMS, which filters on the tables' parameter keys and values
314 | // and only supports the filter operators = and <>.
315 | // Append the parameter key name to HIVE_FILTER_FIELD_PARAMS in the filter statement.
316 | // For example, to filter on parameter keys called "retention", the key name in the filter
317 | // statement should be Constants.HIVE_FILTER_FIELD_PARAMS + "retention"
318 | // Also, = and <> only work for keys that exist
319 | // in the tables. E.g., if you are looking for tables where key1 <> value, it will only
320 | // look at tables that have a value for the parameter key1.
321 | // Some example filter statements include:
322 | // filter = Constants.HIVE_FILTER_FIELD_OWNER + " like \".*test.*\" and " +
323 | // Constants.HIVE_FILTER_FIELD_LAST_ACCESS + " = 0";
324 | // filter = Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"30\" or " +
325 | // Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"90\""
326 | // @param dbName
327 | // The name of the database from which you will retrieve the table names
328 | // @param filterType
329 | // The type of filter
330 | // @param filter
331 | // The filter string
332 | // @param max_tables
333 | // The maximum number of tables returned
334 | // @return A list of table names that match the desired filter
335 | list<string> get_table_names_by_filter(1:string dbname, 2:string filter, 3:i16 max_tables=-1)
336 | throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3)
337 |
338 | // alter table applies to only future partitions not for existing partitions
339 | // * See notes on DDL_TIME
340 | void alter_table(1:string dbname, 2:string tbl_name, 3:Table new_tbl)
341 | throws (1:InvalidOperationException o1, 2:MetaException o2)
342 | void alter_table_with_environment_context(1:string dbname, 2:string tbl_name,
343 | 3:Table new_tbl, 4:EnvironmentContext environment_context)
344 | throws (1:InvalidOperationException o1, 2:MetaException o2)
345 | // the following applies to only tables that have partitions
346 | // * See notes on DDL_TIME
347 | Partition add_partition(1:Partition new_part)
348 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
349 | Partition add_partition_with_environment_context(1:Partition new_part,
350 | 2:EnvironmentContext environment_context)
351 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2,
352 | 3:MetaException o3)
353 | i32 add_partitions(1:list<Partition> new_parts)
354 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
355 | Partition append_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
356 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
357 | Partition append_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name)
358 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
359 | bool drop_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:bool deleteData)
360 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
361 | bool drop_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name, 4:bool deleteData)
362 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
363 | Partition get_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
364 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
365 |
366 | Partition get_partition_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals,
367 | 4: string user_name, 5: list<string> group_names) throws(1:MetaException o1, 2:NoSuchObjectException o2)
368 |
369 | Partition get_partition_by_name(1:string db_name 2:string tbl_name, 3:string part_name)
370 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
371 |
372 | // returns all the partitions for this table in reverse chronological order.
373 | // If max parts is given then it will return only that many.
374 | list<Partition> get_partitions(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
375 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
376 | list<Partition> get_partitions_with_auth(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1,
377 | 4: string user_name, 5: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
378 |
379 | list<string> get_partition_names(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
380 | throws(1:MetaException o2)
381 |
382 | // get_partition*_ps methods allow filtering by a partial partition specification,
383 | // as needed for dynamic partitions. The values that are not restricted should
384 | // be empty strings. Nulls were considered (instead of "") but caused errors in
385 | // generated Python code. The size of part_vals may be smaller than the
386 | // number of partition columns - the unspecified values are considered the same
387 | // as "".
388 | list<Partition> get_partitions_ps(1:string db_name 2:string tbl_name
389 | 3:list<string> part_vals, 4:i16 max_parts=-1)
390 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
391 | list<Partition> get_partitions_ps_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1,
392 | 5: string user_name, 6: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
393 |
394 | list<string> get_partition_names_ps(1:string db_name,
395 | 2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1)
396 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
397 |
398 | // get the partitions matching the given partition filter
399 | list<Partition> get_partitions_by_filter(1:string db_name 2:string tbl_name
400 | 3:string filter, 4:i16 max_parts=-1)
401 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
402 |
403 | // get partitions given a list of partition names
404 | list<Partition> get_partitions_by_names(1:string db_name 2:string tbl_name 3:list<string> names)
405 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
406 |
407 | // changes the partition to the new partition object. partition is identified from the part values
408 | // in the new_part
409 | // * See notes on DDL_TIME
410 | void alter_partition(1:string db_name, 2:string tbl_name, 3:Partition new_part)
411 | throws (1:InvalidOperationException o1, 2:MetaException o2)
412 |
413 | void alter_partition_with_environment_context(1:string db_name,
414 | 2:string tbl_name, 3:Partition new_part,
415 | 4:EnvironmentContext environment_context)
416 | throws (1:InvalidOperationException o1, 2:MetaException o2)
417 |
418 | // rename the old partition to the new partition object by changing old part values to the part values
419 | // in the new_part. old partition is identified from part_vals.
420 | // partition keys in new_part should be the same as those in old partition.
421 | void rename_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:Partition new_part)
422 | throws (1:InvalidOperationException o1, 2:MetaException o2)
423 |
424 | // gets the value of the configuration key in the metastore server. returns
425 | // defaultValue if the key does not exist. if the configuration key does not
426 | // begin with "hive", "mapred", or "hdfs", a ConfigValSecurityException is
427 | // thrown.
428 | string get_config_value(1:string name, 2:string defaultValue)
429 | throws(1:ConfigValSecurityException o1)
430 |
431 | // converts a partition name into a partition values array
432 | list<string> partition_name_to_vals(1: string part_name)
433 | throws(1: MetaException o1)
434 | // converts a partition name into a partition specification (a mapping from
435 | // the partition cols to the values)
436 | map<string, string> partition_name_to_spec(1: string part_name)
437 | throws(1: MetaException o1)
438 |
439 | void markPartitionForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
440 | 4:PartitionEventType eventType) throws (1: MetaException o1, 2: NoSuchObjectException o2,
441 | 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
442 | 6: InvalidPartitionException o6)
443 | bool isPartitionMarkedForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
444 | 4: PartitionEventType eventType) throws (1: MetaException o1, 2:NoSuchObjectException o2,
445 | 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
446 | 6: InvalidPartitionException o6)
447 |
448 | //index
449 | Index add_index(1:Index new_index, 2: Table index_table)
450 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
451 | void alter_index(1:string dbname, 2:string base_tbl_name, 3:string idx_name, 4:Index new_idx)
452 | throws (1:InvalidOperationException o1, 2:MetaException o2)
453 | bool drop_index_by_name(1:string db_name, 2:string tbl_name, 3:string index_name, 4:bool deleteData)
454 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
455 | Index get_index_by_name(1:string db_name 2:string tbl_name, 3:string index_name)
456 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
457 |
458 | list<Index> get_indexes(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
459 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
460 | list<string> get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
461 | throws(1:MetaException o2)
462 |
463 | //authorization privileges
464 |
465 | bool create_role(1:Role role) throws(1:MetaException o1)
466 | bool drop_role(1:string role_name) throws(1:MetaException o1)
467 | list<string> get_role_names() throws(1:MetaException o1)
468 | bool grant_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type,
469 | 4:string grantor, 5:PrincipalType grantorType, 6:bool grant_option) throws(1:MetaException o1)
470 | bool revoke_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type)
471 | throws(1:MetaException o1)
472 | list<Role> list_roles(1:string principal_name, 2:PrincipalType principal_type) throws(1:MetaException o1)
473 |
474 | PrincipalPrivilegeSet get_privilege_set(1:HiveObjectRef hiveObject, 2:string user_name,
475 | 3: list<string> group_names) throws(1:MetaException o1)
476 | list<HiveObjectPrivilege> list_privileges(1:string principal_name, 2:PrincipalType principal_type,
477 | 3: HiveObjectRef hiveObject) throws(1:MetaException o1)
478 |
479 | bool grant_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
480 | bool revoke_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
481 |
482 | // this is used by metastore client to send UGI information to metastore server immediately
483 | // after setting up a connection.
484 | list<string> set_ugi(1:string user_name, 2:list<string> group_names) throws (1:MetaException o1)
485 |
486 | //Authentication (delegation token) interfaces
487 |
488 | // get metastore server delegation token for use from the map/reduce tasks to authenticate
489 | // to metastore server
490 | string get_delegation_token(1:string token_owner, 2:string renewer_kerberos_principal_name)
491 | throws (1:MetaException o1)
492 |
493 | // method to renew delegation token obtained from metastore server
494 | i64 renew_delegation_token(1:string token_str_form) throws (1:MetaException o1)
495 |
496 | // method to cancel delegation token obtained from metastore server
497 | void cancel_delegation_token(1:string token_str_form) throws (1:MetaException o1)
498 | }
499 |
500 | // * Note about the DDL_TIME: When creating or altering a table or a partition,
501 | // if the DDL_TIME is not set, the current time will be used.
502 |
503 | // For storing info about archived partitions in parameters
504 |
505 | // Whether the partition is archived
506 | const string IS_ARCHIVED = "is_archived",
507 | // The original location of the partition, before archiving. After archiving,
508 | // this directory will contain the archive. When the partition
509 | // is dropped, this directory will be deleted
510 | const string ORIGINAL_LOCATION = "original_location",
511 |
512 | // these should be needed only for backward compatibility with filestore
513 | const string META_TABLE_COLUMNS = "columns",
514 | const string META_TABLE_COLUMN_TYPES = "columns.types",
515 | const string BUCKET_FIELD_NAME = "bucket_field_name",
516 | const string BUCKET_COUNT = "bucket_count",
517 | const string FIELD_TO_DIMENSION = "field_to_dimension",
518 | const string META_TABLE_NAME = "name",
519 | const string META_TABLE_DB = "db",
520 | const string META_TABLE_LOCATION = "location",
521 | const string META_TABLE_SERDE = "serde",
522 | const string META_TABLE_PARTITION_COLUMNS = "partition_columns",
523 | const string FILE_INPUT_FORMAT = "file.inputformat",
524 | const string FILE_OUTPUT_FORMAT = "file.outputformat",
525 | const string META_TABLE_STORAGE = "storage_handler",
526 |
527 |
528 |
529 |
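The filter grammar documented above for get_table_names_by_filter is easiest to see with a concrete call. A minimal sketch, assuming the Thrift-generated Java client (ThriftHiveMetastore.Client) and an already-opened protocol named `protocol`; the database name and filter are illustrative:

    val metastore = new ThriftHiveMetastore.Client(protocol)

    // Tables owned by users matching ".*test.*" that have never been accessed.
    // The key names are the HIVE_FILTER_FIELD_* constant values defined near the top of this file.
    val filter =
      "hive_filter_field_owner__ like \".*test.*\" and hive_filter_field_last_access__ = 0"
    val maxTables: Short = -1  // matches the IDL default
    val names = metastore.get_table_names_by_filter("default", filter, maxTables)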
--------------------------------------------------------------------------------
/herringbone-main/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.stripe
6 | herringbone-main
7 | 0.0.1
8 | jar
9 |
10 | Herringbone Main
11 |
12 |
13 |
14 | dtrott
15 | https://maven.davidtrott.com/repository
16 |
17 |
18 |
19 |
20 |
21 |
22 | org.scalatest
23 | scalatest-maven-plugin
24 | 1.0-M2
25 |
26 | ${project.build.directory}/surefire-reports
27 | .
28 | WDF TestSuite.txt
29 | ${project.build.directory}/html/scalatest
30 | false
31 |
32 |
33 |
34 | test
35 |
36 | test
37 |
38 |
39 |
40 |
41 |
42 |
43 | org.apache.maven.plugins
44 | maven-compiler-plugin
45 | 3.1
46 |
47 | 1.6
48 | 1.6
49 |
50 |
51 |
52 | maven-jar-plugin
53 | 2.3.1
54 |
55 |
56 |
57 | maven-resources-plugin
58 | 2.4.3
59 |
60 |
61 |
62 | net.alchim31.maven
63 | scala-maven-plugin
64 | 3.1.6
65 |
66 | incremental
67 | true
68 |
69 |
70 |
71 |
72 | compile
73 | testCompile
74 |
75 |
76 |
77 |
78 |
79 |
80 | org.apache.maven.plugins
81 | maven-shade-plugin
82 | 2.3
83 |
84 | false
85 | target/herringbone-${project.version}-jar-with-dependencies.jar
86 |
87 |
88 |
89 | package
90 |
91 | shade
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 | 1.6.0rc7
101 | UTF-8
102 | 2.10.4
103 | 1.7
104 | 1.7
105 |
106 |
107 |
108 |
109 | com.twitter
110 | parquet-common
111 | ${parquet.version}
112 |
113 |
114 | com.twitter
115 | parquet-encoding
116 | ${parquet.version}
117 |
118 |
119 | com.twitter
120 | parquet-column
121 | ${parquet.version}
122 |
123 |
124 | com.twitter
125 | parquet-hadoop
126 | ${parquet.version}
127 |
128 |
129 | org.apache.hadoop
130 | hadoop-client
131 | 2.5.2
132 | provided
133 |
134 |
135 | org.apache.hive
136 | hive-jdbc
137 | 0.14.0
138 |
139 |
140 | com.twitter
141 | parquet-hadoop-bundle
142 |
143 |
144 |
145 |
146 | org.rogach
147 | scallop_2.10
148 | 0.9.5
149 |
150 |
151 | org.scala-lang
152 | jline
153 | 2.9.0-1
154 |
155 |
156 | org.scalatest
157 | scalatest_2.10
158 | 2.0
159 | test
160 |
161 |
162 | org.scalamock
163 | scalamock-scalatest-support_2.10
164 | 3.1.RC1
165 | test
166 |
167 |
168 | com.stripe
169 | herringbone-impala
170 | 0.0.2
171 |
172 |
173 | org.apache.thrift
174 | libthrift
175 |
176 |
177 |
178 |
179 |
180 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/CompactInputFormat.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone
2 |
3 | import java.util.{List => JavaList}
4 | import java.io.DataOutput
5 | import java.io.DataInput
6 |
7 | import scala.collection.mutable.MutableList
8 | import scala.collection.JavaConverters._
9 | import scala.collection.JavaConversions._
10 |
11 | import org.apache.hadoop.io.Writable
12 | import org.apache.hadoop.mapreduce.{InputSplit,Job,JobContext,Mapper,TaskAttemptContext}
13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
15 | import parquet.hadoop.api.ReadSupport
16 | import parquet.hadoop.{ParquetInputFormat,ParquetInputSplit,ParquetOutputFormat,ParquetRecordReader}
17 | import parquet.hadoop.example.{ExampleOutputFormat,GroupReadSupport}
18 | import parquet.hadoop.util.ContextUtil
19 | import parquet.example.data.{Group,GroupWriter}
20 | import parquet.example.data.simple.SimpleGroup
21 |
22 |
23 | class CompactInputFormat[T](readSupportClass: Class[_ <: ReadSupport[T]]) extends ParquetInputFormat[T](readSupportClass) {
24 |
25 | // Our HDFS block size is 1024MB so we'll get pretty close.
26 | val TARGET = 1024 * 1024 * 1024 // 1024MB.
27 |
28 | override def getSplits(context: JobContext): JavaList[InputSplit] = {
29 | // Limit the splits to 100MB so it's easy to assemble them into 1024MB
30 | // chunks. This is not actually reliable. Chunks can come back bigger than
31 | // 100MB, but it does limit the size of most chunks.
32 | val conf = ContextUtil.getConfiguration(context)
33 | conf.set("mapred.max.split.size", (100 * 1024 * 1024).toString)
34 |
35 | val splits = super.getSplits(conf, getFooters(context)).asScala.toList
36 | val m = if (splits.isEmpty) splits else mergeSplits(splits)
37 | m.asInstanceOf[List[InputSplit]].asJava
38 | }
39 |
40 | def mergeSplits(splits: List[ParquetInputSplit]): List[MergedInputSplit] = {
41 | val sizes = splits.map { _.getLength }
42 | println(s"""${splits.length} initial splits were generated.
43 | | Max: ${mb(sizes.max)}
44 | | Min: ${mb(sizes.min)}
45 | | Avg: ${mb(sizes.sum.toDouble / sizes.length)}""".stripMargin)
46 |
47 | // TODO: get a CS undergrad to give us better bin packing.
48 | var buckets = MutableList[MutableList[ParquetInputSplit]](MutableList(splits.head))
49 | splits.tail.foreach { split =>
50 | val bucket = buckets.minBy { b => b.map { _.getLength }.sum }
51 | if ((split.getLength + bucket.map { _.getLength }.sum) < TARGET) {
52 | bucket += split
53 | } else {
54 | buckets += MutableList(split)
55 | }
56 | }
57 |
58 | val newSizes = buckets.map { _.map { _.getLength }.sum }.toList
59 | println(s"""${buckets.length} merged splits were generated.
60 | | Max: ${mb(newSizes.max)}
61 | | Min: ${mb(newSizes.min)}
62 | | Avg: ${mb(newSizes.sum.toDouble / newSizes.length)}""".stripMargin)
63 |
64 | buckets.map { b => new MergedInputSplit(b.toList) }.toList
65 | }
66 |
67 | override def createRecordReader(split: InputSplit, context: TaskAttemptContext): MergedRecordReader[T] = {
68 | val readSupport = ParquetInputFormat.getReadSupportInstance[T](ContextUtil.getConfiguration(context))
69 | split match {
70 | case s: MergedInputSplit => new MergedRecordReader[T](s, context, readSupport)
71 | case _ => throw new Exception(s"Expected a MergedInputSplit. Found a $split.")
72 | }
73 | }
74 |
75 | // Helper for pretty-printing byte values.
76 | def mb(n: Double): String = {
77 | val K = 1024
78 | val M = K * K
79 | val G = K * M
80 | if (n < K) f"$n%.2fB"
81 | else if (n < M) f"${n / K}%.2fK"
82 | else if (n < G) f"${n / M}%.2fM"
83 | else f"${n / G}%.2fG"
84 | }
85 | }
86 |
87 | class MergedInputSplit(var splits: List[ParquetInputSplit]) extends InputSplit with Writable {
88 | def this() = this(List())
89 |
90 | var splitNumber = 0
91 |
92 | def currentSplit: ParquetInputSplit = splits(splitNumber)
93 | def nextSplit: Option[ParquetInputSplit] = {
94 | if (splitNumber < splits.length - 1) {
95 | splitNumber += 1
96 | Some(currentSplit)
97 | } else {
98 | None
99 | }
100 | }
101 |
102 | // write and readFields are paired for serialization/deserialization.
103 | override def write(out: DataOutput) = {
104 | out.writeInt(splits.length)
105 | splits.foreach { s => s.write(out) }
106 | }
107 |
108 | override def readFields(in: DataInput) = {
109 | val count = in.readInt
110 | splits = for (i <- List.range(0, count)) yield {
111 | val s = new ParquetInputSplit
112 | s.readFields(in)
113 | s
114 | }
115 | }
116 |
117 | override def getLength: Long = splits.map { _.getLength }.sum
118 | override def getLocations: Array[String] = splits.flatMap { _.getLocations }.toArray
119 | override def toString = ""
120 | }
121 |
122 | class MergedRecordReader[T](split: MergedInputSplit,
123 | taskContext: TaskAttemptContext,
124 | readSupport: ReadSupport[T]) extends ParquetRecordReader[T](readSupport) {
125 | val totalLength = split.getLength
126 | var progress = 0L
127 |
128 | override def initialize(split: InputSplit, context: TaskAttemptContext) {
129 | super.initialize(split.asInstanceOf[MergedInputSplit].currentSplit, context)
130 | }
131 |
132 | def startNextSplit(split: MergedInputSplit, context: TaskAttemptContext): Boolean = {
133 | split.nextSplit match {
134 | case Some(s) => {
135 | super.initialize(s, context)
136 | true
137 | }
138 | case None => false
139 | }
140 | }
141 |
142 | // nextKeyValue is used to ask for the next tuple and returns false when the
143 | // recordReader has no more tuples. Since we're wrapping multiple splits, and
144 | // therefore multiple record readers, we detect when the current internal
145 | // reader is done and move to the next reader.
146 | override def nextKeyValue: Boolean = {
147 | val next = super.nextKeyValue
148 | if (next) {
149 | next
150 | } else {
151 | super.close
152 | progress += split.currentSplit.getLength
153 |
154 | if (startNextSplit(split, taskContext)) {
155 | nextKeyValue
156 | } else {
157 | false
158 | }
159 | }
160 | }
161 |
162 | override def toString = ""
163 | override def getProgress: Float = progress.toFloat / totalLength // float division, so progress isn't truncated to 0
164 | }
165 |
166 |
167 | class CompactGroupInputFormat extends CompactInputFormat[Group](classOf[GroupReadSupport]) { }
168 |
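The merge above is a greedy packing: each split goes into the currently smallest bucket unless that would push it past TARGET, in which case a new bucket is started. A standalone sketch of the same idea over plain lengths (illustrative only; like the original it assumes a non-empty input):

    import scala.collection.mutable.ListBuffer

    def pack(sizes: List[Long], target: Long): List[List[Long]] = {
      val buckets = ListBuffer(ListBuffer(sizes.head))
      sizes.tail.foreach { s =>
        val smallest = buckets.minBy(_.sum)
        if (smallest.sum + s < target) smallest += s else buckets += ListBuffer(s)
      }
      buckets.map(_.toList).toList
    }

    // pack(List(100L, 100L, 100L, 900L, 50L), 1000L) groups the small lengths together
    // and gives the 900 its own bucket: List(List(100, 100, 100, 50), List(900)).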
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/CompactJob.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone
2 |
3 | import com.stripe.herringbone.util.ParquetUtils
4 |
5 | import java.util.{List => JavaList}
6 | import java.io.DataOutput
7 | import java.io.DataInput
8 |
9 | import scala.collection.mutable.MutableList
10 | import scala.collection.JavaConverters._
11 |
12 | import org.apache.hadoop.conf.{Configuration,Configured}
13 | import org.apache.hadoop.fs.{FileSystem,Path}
14 | import org.apache.hadoop.mapreduce.{Job,Mapper}
15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
17 | import org.apache.hadoop.util.{Tool,ToolRunner}
18 |
19 | import org.codehaus.jackson.map.ObjectMapper
20 | import org.codehaus.jackson.`type`.TypeReference
21 |
22 | import org.rogach.scallop.ScallopConf
23 |
24 | import parquet.example.data.{Group,GroupWriter}
25 | import parquet.hadoop.{BadConfigurationException,ParquetInputFormat,ParquetOutputFormat}
26 | import parquet.hadoop.api.{DelegatingWriteSupport,WriteSupport}
27 | import parquet.hadoop.api.WriteSupport.FinalizedWriteContext
28 | import parquet.hadoop.example.{GroupReadSupport,GroupWriteSupport}
29 |
30 | class ParquetCompactConf(arguments: Seq[String]) extends ScallopConf(arguments) {
31 | val inputPath = opt[String](required = true)
32 | val outputPath = opt[String](descr = "Default is input path with `-compact` appended")
33 | }
34 |
35 | class ParquetCompactWriteSupport extends DelegatingWriteSupport[Group](new GroupWriteSupport) {
36 | var extraMetadata: java.util.Map[String, String] = _
37 |
38 | override def init(configuration: Configuration): WriteSupport.WriteContext = {
39 | extractMetadata(configuration)
40 | super.init(configuration)
41 | }
42 |
43 | override def finalizeWrite(): FinalizedWriteContext = {
44 | new FinalizedWriteContext(extraMetadata)
45 | }
46 |
47 | def extractMetadata(configuration: Configuration) = {
48 | val metadataJson = configuration.get(ParquetCompactWriteSupport.ExtraMetadataKey)
49 | try {
50 | extraMetadata = new ObjectMapper().readValue(metadataJson, new TypeReference[java.util.Map[String,String]](){})
51 | } catch { case e: java.io.IOException =>
52 | throw new BadConfigurationException("Unable to deserialize extra metadata: " + metadataJson, e)
53 | }
54 | }
55 | }
56 |
57 | object ParquetCompactWriteSupport {
58 | val ExtraMetadataKey = "herringbone.compact.extrametadata"
59 | }
60 |
61 | class CompactJob extends Configured with Tool {
62 | override def run(arguments: Array[String]) = {
63 | val conf = new ParquetCompactConf(arguments)
64 | val inputPath = new Path(conf.inputPath())
65 | val fs = inputPath.getFileSystem(getConf)
66 | val outputPathString = conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-compact"))
67 | val outputPath = new Path(outputPathString)
68 |
69 | // Pass along metadata (which includes the thrift schema) to the results.
70 | val metadata = ParquetUtils.readKeyValueMetaData(inputPath)
71 | val metadataJson = new ObjectMapper().writeValueAsString(metadata)
72 | getConf.set(ParquetCompactWriteSupport.ExtraMetadataKey, metadataJson)
73 |
74 | if (fs.exists(outputPath)) {
75 | println(s"Deleting existing $outputPath")
76 | fs.delete(outputPath, true)
77 | }
78 |
79 | val job = new Job(getConf)
80 |
81 | FileInputFormat.setInputPaths(job, inputPath)
82 | FileOutputFormat.setOutputPath(job, outputPath)
83 | ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport])
84 | ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetCompactWriteSupport])
85 | GroupWriteSupport.setSchema(ParquetUtils.readSchema(inputPath), job.getConfiguration)
86 |
87 | job.setJobName("compact " + conf.inputPath() + " → " + outputPathString)
88 | job.setInputFormatClass(classOf[CompactGroupInputFormat]);
89 | job.setOutputFormatClass(classOf[ParquetOutputFormat[Group]])
90 | job.setMapperClass(classOf[Mapper[Void,Group,Void,Group]])
91 | job.setJarByClass(classOf[CompactJob])
92 | job.getConfiguration.setBoolean("mapreduce.job.user.classpath.first", true)
93 | job.getConfiguration.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false)
94 | job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false);
95 | job.setNumReduceTasks(0)
96 |
97 | if(job.waitForCompletion(true)) 0 else 1
98 | }
99 | }
100 |
101 | object CompactJob {
102 |
103 | def main(args: Array[String]) = {
104 | val result = ToolRunner.run(new Configuration, new CompactJob, args)
105 | System.exit(result)
106 | }
107 | }
108 |
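Scallop derives the command-line flags from the option vals in ParquetCompactConf (inputPath should surface as --input-path under Scallop's default name mangling, though that is an assumption worth verifying). The job can also be driven programmatically the same way main does; the paths here are illustrative:

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.util.ToolRunner

    val exitCode = ToolRunner.run(
      new Configuration,
      new CompactJob,
      Array("--input-path", "/data/events", "--output-path", "/data/events-compact"))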
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/FlattenJob.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone
2 |
3 | import com.stripe.herringbone.flatten.{ParquetFlatConf,ParquetFlatMapper,TypeFlattener}
4 | import com.stripe.herringbone.flatten.FlatConverter
5 | import com.stripe.herringbone.util.ParquetUtils
6 |
7 | import org.apache.hadoop.mapreduce._
8 | import org.apache.hadoop.mapreduce.lib.input._
9 | import org.apache.hadoop.mapreduce.lib.output._
10 | import org.apache.hadoop.util._
11 | import org.apache.hadoop.fs._
12 | import org.apache.hadoop.conf._
13 |
14 | import parquet.example.data._
15 | import parquet.example.data.simple._
16 | import parquet.hadoop._
17 | import parquet.hadoop.example._
18 | import parquet.io.api._
19 | import parquet.schema._
20 |
21 | import org.rogach.scallop._
22 |
23 | class FlattenMapper extends ParquetFlatMapper[Group] {
24 | def valueOut(value: Group) = {
25 | FlatConverter.flattenGroup(value, flattenedSchema, separator, renameId)
26 | }
27 | }
28 |
29 | class FlattenJob extends Configured with Tool {
30 | override def run(args: Array[String]) = {
31 | val conf = new ParquetFlatConf(args)
32 | val fs = FileSystem.get(getConf)
33 | val inputPath = new Path(conf.inputPath())
34 | val outputPathString = conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-flat"))
35 | val outputPath = new Path(outputPathString)
36 | val previousPath = conf.previousPath.get.map{new Path(_)}
37 |
38 | val separator = conf.separator()
39 | getConf.set(ParquetFlatMapper.SeparatorKey, separator)
40 |
41 | val renameId = conf.renameId()
42 | getConf.set(ParquetFlatMapper.RenameIdKey, renameId.toString)
43 |
44 | if (fs.exists(outputPath)) {
45 | println(s"Deleting existing $outputPath")
46 | fs.delete(outputPath, true)
47 | }
48 |
49 | val flattenedSchema = TypeFlattener.flatten(
50 | ParquetUtils.readSchema(inputPath),
51 | previousPath.map { ParquetUtils.readSchema(_) },
52 | separator,
53 | renameId
54 | )
55 |
56 | val jobName = "flatten " + conf.inputPath() + " -> " + outputPathString
57 | val job = new Job(getConf, jobName)
58 |
59 | FileInputFormat.setInputPaths(job, inputPath)
60 | FileOutputFormat.setOutputPath(job, outputPath)
61 | ExampleOutputFormat.setSchema(job, flattenedSchema)
62 | ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport])
63 |
64 | job.setInputFormatClass(classOf[CompactGroupInputFormat]);
65 | job.setOutputFormatClass(classOf[ExampleOutputFormat])
66 | job.setMapperClass(classOf[FlattenMapper])
67 | job.setJarByClass(classOf[FlattenJob])
68 | job.getConfiguration.setBoolean("mapreduce.job.user.classpath.first", true)
69 | job.getConfiguration.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false)
70 | job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false);
71 | job.setNumReduceTasks(0)
72 |
73 | if (job.waitForCompletion(true)) 0 else 1
74 | }
75 | }
76 |
77 | object FlattenJob {
78 | def main(args: Array[String]) = {
79 | val result = ToolRunner.run(new Configuration, new FlattenJob, args)
80 | System.exit(result)
81 | }
82 | }
83 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/ParquetLoad.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone
2 |
3 | import com.stripe.herringbone.load._
4 |
5 | import org.apache.hadoop.conf._
6 | import org.apache.hadoop.util._
7 |
8 | class ParquetLoad extends Configured with Tool {
9 | override def run(args: Array[String]): Int = {
10 | val conf = new ParquetLoadConf(args)
11 | val hadoopFs = new HadoopFs()
12 | val fieldUtils = FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper)
13 |
14 | val loader: ParquetLoader = if (conf.hive()) {
15 | HiveLoader(conf, hadoopFs, fieldUtils)
16 | } else {
17 | ImpalaLoader(conf, hadoopFs, fieldUtils)
18 | }
19 |
20 | if (conf.updatePartitions()) {
21 | val tableExists = loader.checkTableExists(conf.table(), conf.database())
22 |
23 | (conf.path.get, tableExists) match {
24 | case (_, true) => loader.updateTable(conf.table(), conf.database())
25 | case (Some(path), false) => loader.createTable(path, conf.table(), conf.database())
26 | case (None, false) => {
27 | println("ERROR - path not specified and table not yet created. Specify path from which to create the table")
28 | return 1
29 | }
30 | }
31 | } else {
32 | loader.createTable(conf.path(), conf.table(), conf.database())
33 | }
34 | loader.closeConnection
35 |
36 | 0
37 | }
38 | }
39 |
40 | object ParquetLoad {
41 | def main(args: Array[String]) = {
42 | val result = ToolRunner.run(new Configuration, new ParquetLoad, args)
43 | System.exit(result)
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/TsvJob.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone
2 |
3 | import com.stripe.herringbone.flatten.{ParquetFlatConf,ParquetFlatMapper,TypeFlattener}
4 | import com.stripe.herringbone.flatten.FlatConverter
5 | import com.stripe.herringbone.util.ParquetUtils
6 |
7 | import java.io.{BufferedWriter, OutputStreamWriter}
8 |
9 | import org.apache.hadoop.mapreduce._
10 | import org.apache.hadoop.mapreduce.lib.input._
11 | import org.apache.hadoop.mapreduce.lib.output._
12 | import org.apache.hadoop.util._
13 | import org.apache.hadoop.fs._
14 | import org.apache.hadoop.conf._
15 | import org.apache.hadoop.io.Text
16 |
17 | import org.rogach.scallop._
18 |
19 | import parquet.example.data._
20 | import parquet.example.data.simple._
21 | import parquet.hadoop._
22 | import parquet.hadoop.example._
23 | import parquet.io.api._
24 | import parquet.schema._
25 |
26 | import scala.collection.JavaConversions._
27 |
28 | class TsvMapper extends ParquetFlatMapper[Text] {
29 | def valueOut(value: Group) = {
30 | val tsvLine = FlatConverter.groupToTSV(value, flattenedSchema, separator, renameId)
31 | new Text(tsvLine)
32 | }
33 | }
34 |
35 | class TsvJob extends Configured with Tool {
36 | override def run(args: Array[String]) = {
37 | val conf = new ParquetFlatConf(args)
38 | val fs = FileSystem.get(getConf)
39 | val inputPath = new Path(conf.inputPath())
40 | val outputPathString = conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-tsv"))
41 | val outputPath = new Path(outputPathString)
42 | val previousPath = conf.previousPath.get.map{new Path(_)}
43 |
44 | val separator = conf.separator()
45 | getConf.set(ParquetFlatMapper.SeparatorKey, separator)
46 |
47 | val renameId = conf.renameId()
48 | getConf.set(ParquetFlatMapper.RenameIdKey, renameId.toString)
49 |
50 | if (fs.exists(outputPath)) {
51 | println(s"Deleting existing $outputPath")
52 | fs.delete(outputPath, true)
53 | }
54 |
55 | val flattenedSchema = TypeFlattener.flatten(
56 | ParquetUtils.readSchema(inputPath),
57 | previousPath.map { ParquetUtils.readSchema(_) },
58 | separator,
59 | renameId
60 | )
61 |
62 | val jobName = "tsv " + conf.inputPath() + " -> " + outputPathString
63 | val job = new Job(getConf, jobName)
64 |
65 | FileInputFormat.setInputPaths(job, inputPath)
66 | FileOutputFormat.setOutputPath(job, outputPath)
67 | ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport])
68 | ExampleOutputFormat.setSchema(job, flattenedSchema)
69 |
70 | job.setInputFormatClass(classOf[CompactGroupInputFormat])
71 | job.setOutputFormatClass(classOf[TextOutputFormat[Text, Text]].asInstanceOf[Class[Nothing]])
72 | job.setMapperClass(classOf[TsvMapper])
73 | job.setJarByClass(classOf[TsvJob])
74 | job.getConfiguration.set("mapreduce.job.user.classpath.first", "true")
75 | job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false)
76 | job.setNumReduceTasks(0)
77 |
78 | if (job.waitForCompletion(true)) {
79 | val headerPath = new Path(outputPathString + "/_header.tsv")
80 | writeHeader(fs, headerPath, flattenedSchema)
81 | 0
82 | } else {
83 | 1
84 | }
85 | }
86 |
87 | def writeHeader(fs: FileSystem, outputPath: Path, schema: MessageType) {
88 | val header = FlatConverter.constructHeader(schema)
89 | val writer = new BufferedWriter(new OutputStreamWriter(fs.create(outputPath, true)))
90 | writer.write(header)
91 | writer.write("\n")
92 | writer.close()
93 | }
94 | }
95 |
96 | object TsvJob {
97 | def main(args: Array[String]) = {
98 | val result = ToolRunner.run(new Configuration, new TsvJob, args)
99 | System.exit(result)
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConsumer.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.flatten
2 |
3 | import org.apache.hadoop.mapreduce._
4 | import org.apache.hadoop.mapreduce.lib.input._
5 | import org.apache.hadoop.mapreduce.lib.output._
6 | import org.apache.hadoop.util._
7 | import org.apache.hadoop.fs._
8 | import org.apache.hadoop.conf._
9 |
10 | import parquet.example.data._
11 | import parquet.example.data.simple._
12 | import parquet.hadoop._
13 | import parquet.hadoop.example._
14 | import parquet.io.api._
15 | import parquet.schema._
16 |
17 | class FlatConsumer(output: Group, separator: String, renameId: Boolean) extends RecordConsumer {
18 |
19 | case class StackFrame(field: String, var values: List[Binary])
20 | var stack = List[StackFrame]()
21 |   // Impala stops working when a field becomes too long. The docs
22 |   // suggest the limit should be 32k; however, a binary search on a
23 |   // too-long field yielded 6776 as the maximum working value.
24 | val MaxStringBytes = 6776
25 |
26 | def startMessage {}
27 | def endMessage {}
28 | def startGroup {}
29 | def endGroup {}
30 |
31 | def startField(field: String, index: Int) {
32 | stack ::= StackFrame(field, Nil)
33 | }
34 |
35 | def endField(field: String, index: Int) {
36 | if (stack.head.values.size == 1) {
37 | withField{name => output.add(name, stack.head.values.head)}
38 | } else if (stack.head.values.size > 1) {
39 | withField {name =>
40 | val joined = Binary.fromString(
41 | stack
42 | .head
43 | .values
44 | .reverse
45 | .map{_.toStringUsingUTF8}
46 | .mkString(",")
47 | .replace("\t", " ")
48 | )
49 | val truncated = truncate(joined, MaxStringBytes)
50 | output.add(name, truncated)
51 | }
52 | }
53 | stack = stack.tail
54 | }
55 |
56 | def addInteger(value: Int) {
57 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
58 | }
59 |
60 | def addLong(value: Long) {
61 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
62 | }
63 |
64 | def addBoolean(value: Boolean) {
65 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
66 | }
67 |
68 | def truncate(value: Binary, length: Integer): Binary = {
69 | if (value.length <= length) {
70 | value
71 | } else {
72 | val bytesTruncated = new Array[Byte](length)
73 | value.toByteBuffer.get(bytesTruncated, 0, length)
74 | Binary.fromByteArray(bytesTruncated)
75 | }
76 | }
77 |
78 | def addBinary(value: Binary) {
79 | // Truncate strings so Impala doesn't break
80 | val truncated = truncate(value, MaxStringBytes)
81 | writeField(truncated){name => output.add(name, truncated)}
82 | }
83 |
84 | def addFloat(value: Float) {
85 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
86 | }
87 |
88 | def addDouble(value: Double) {
89 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)}
90 | }
91 |
92 | def withField(fn: String=>Unit) {
93 | val path = if (TypeFlattener.omitIdField(stack.head.field, stack.size, renameId))
94 | stack.tail
95 | else
96 | stack
97 |
98 | val name = path.reverse.map{_.field}.mkString(separator)
99 | if(output.getType.containsField(name))
100 | fn(name)
101 | }
102 |
103 | def writeField(binRep: =>Binary)(fn: String => Unit) {
104 | withField{name =>
105 | val fieldType = output.getType.getType(name)
106 | if(fieldType.asInstanceOf[PrimitiveType].getPrimitiveTypeName == PrimitiveType.PrimitiveTypeName.BINARY)
107 | stack.head.values ::= binRep
108 | else
109 | fn(name)
110 | }
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConverter.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.flatten
2 |
3 | import org.apache.hadoop.fs.Path
4 | import org.apache.hadoop.conf.Configuration
5 |
6 | import parquet.example.data.Group
7 | import parquet.example.data.GroupWriter
8 | import parquet.example.data.simple.SimpleGroup
9 | import parquet.schema.MessageType
10 |
11 | import scala.collection.JavaConversions._
12 |
13 | object FlatConverter {
14 | def groupToTSV(group: Group, flatSchema: MessageType, separator: String, renameId: Boolean): String = {
15 | val flatGroup = flattenGroup(group, flatSchema, separator, renameId)
16 | val fieldValues = (0 until flatSchema.getFieldCount).map{ field =>
17 | val valueCount = flatGroup.getFieldRepetitionCount(field)
18 | if (valueCount == 0) {
19 | ""
20 | } else if (valueCount == 1) {
21 | escapeString(flatGroup.getValueToString(field, 0))
22 | } else {
23 |         System.err.println("Warning: Field contains multiple values, extracting only the first")
24 |         System.err.println(flatGroup.toString)
25 |         escapeString(flatGroup.getValueToString(field, 0))
26 | }
27 | }
28 | fieldValues.mkString("\t")
29 | }
30 |
31 | def constructHeader(schema: MessageType) = {
32 | schema
33 | .getPaths()
34 | .toList
35 | .map{_(0)}
36 | .mkString("\t")
37 | }
38 |
39 | def flattenGroup(group: Group, flatSchema: MessageType, separator: String, renameId: Boolean) = {
40 | var flatGroup = new SimpleGroup(flatSchema)
41 | val writer = new GroupWriter(new FlatConsumer(flatGroup, separator, renameId), group.getType)
42 | writer.write(group)
43 | flatGroup
44 | }
45 |
46 | private def escapeString(s: String) = {
47 | val quote = "\""
48 | if (s.contains("\t"))
49 | // This is how pandas escapes tabs and quotes
50 | quote + s.replace(quote, "\"\"") + quote
51 | else
52 | s
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
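For illustration only (not part of the repository), the sketch below shows what `constructHeader` produces for a small, already-flattened schema: the first component of each column path, joined with tabs. The schema is hypothetical.

    // Hypothetical usage of FlatConverter.constructHeader; the schema is made up.
    import parquet.schema.MessageTypeParser
    import com.stripe.herringbone.flatten.FlatConverter

    object HeaderExample {
      def main(args: Array[String]): Unit = {
        val flatSchema = MessageTypeParser.parseMessageType(
          "message event { optional binary user__email; optional int64 user__age; }")
        // Prints the tab-separated header line: user__email<TAB>user__age
        println(FlatConverter.constructHeader(flatSchema))
      }
    }
--------------------------------------------------------------------------------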
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatConf.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.flatten
2 |
3 | import org.rogach.scallop._
4 |
5 | class ParquetFlatConf(arguments: Seq[String]) extends ScallopConf(arguments) {
6 | val inputPath = opt[String](required = true)
7 | val outputPath = opt[String](descr = "Default is input path with `-flat` or `-tsv` appended as appropriate")
8 | val previousPath = opt[String](descr = "Path of previously generated flat output, so field ordering can be maintained (optional)")
9 | val separator = opt[String](default = Some("__"))
10 | val renameId = opt[Boolean](descr = "Flatten a.b.id as a__b instead of a__b__id")
11 | }
12 |
--------------------------------------------------------------------------------
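A minimal sketch (not from the repository) of constructing this conf directly, mirroring how FlattenJob/TsvJob do it. The paths are hypothetical; the long flag names are the ones Scallop derives from the opt names above (e.g. inputPath becomes --input-path).

    // Hypothetical ParquetFlatConf usage; paths are made up.
    import com.stripe.herringbone.flatten.ParquetFlatConf

    object FlatConfExample {
      def main(args: Array[String]): Unit = {
        val conf = new ParquetFlatConf(Seq(
          "--input-path", "/data/events",
          "--separator", "__",
          "--rename-id"))
        println(conf.inputPath())    // /data/events
        println(conf.renameId())     // true
        // outputPath was not given, so the jobs fall back to the "-flat"/"-tsv" default.
        println(conf.outputPath.get) // None
      }
    }
--------------------------------------------------------------------------------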
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatMapper.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.flatten
2 |
3 | import org.apache.hadoop.mapreduce.Mapper
4 | import parquet.example.data.Group
5 | import parquet.schema.{MessageType,MessageTypeParser}
6 |
7 | abstract class ParquetFlatMapper[ValueOut] extends Mapper[Void,Group,Void,ValueOut] {
8 | var flattenedSchema: MessageType = _
9 | var separator: String = _
10 | var renameId: Boolean = _
11 |
12 | override def setup(context: Mapper[Void,Group,Void,ValueOut]#Context) {
13 | // the schema is stored in the job context when we call ExampleOutputFormat.setSchema
14 | flattenedSchema = MessageTypeParser.parseMessageType(context.getConfiguration.get("parquet.example.schema"))
15 | separator = context.getConfiguration.get(ParquetFlatMapper.SeparatorKey)
16 | renameId = context.getConfiguration.get(ParquetFlatMapper.RenameIdKey) == "true"
17 | }
18 |
19 | override def map(key: Void, value: Group, context: Mapper[Void,Group,Void,ValueOut]#Context) {
20 | context.write(key, valueOut(value))
21 | }
22 |
23 | def valueOut(value: Group): ValueOut
24 | }
25 |
26 | object ParquetFlatMapper {
27 | val SeparatorKey = "herringbone.flatten.separator"
28 | val RenameIdKey = "herringbone.flatten.rename.id"
29 | }
30 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/flatten/TypeFlattener.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.flatten
2 |
3 | import parquet.schema._
4 | import java.util.{List=>JList}
5 | import scala.collection.JavaConverters._
6 |
7 | class TypeFlattener(separator: String, renameId: Boolean) extends TypeConverter[List[Type]] {
8 | def convertPrimitiveType(path: JList[GroupType], primitiveType: PrimitiveType) = {
9 | val typeName =
10 | if(TypeFlattener.isRepeated(primitiveType))
11 | PrimitiveType.PrimitiveTypeName.BINARY
12 | else
13 | primitiveType.getPrimitiveTypeName
14 |
15 | val types = if (TypeFlattener.omitIdField(primitiveType.getName, path.size, renameId))
16 | path.asScala.tail
17 | else
18 | (path.asScala.tail :+ primitiveType)
19 |
20 | val name = types.map{_.getName}.mkString(separator)
21 | List(new PrimitiveType(Type.Repetition.OPTIONAL, typeName, primitiveType.getTypeLength, name))
22 | }
23 |
24 | def convertGroupType(path: JList[GroupType], groupType: GroupType, children: JList[List[Type]]) = {
25 | if(TypeFlattener.isRepeated(groupType))
26 | Nil
27 | else
28 | flatten(children)
29 | }
30 |
31 | def convertMessageType(messageType: MessageType, children: JList[List[Type]]) = flatten(children)
32 |
33 | def flatten(children: JList[List[Type]]) = children.asScala.flatten.toList
34 | }
35 |
36 | object TypeFlattener {
37 | def flatten(messageType: MessageType,
38 | previousMessageType: Option[MessageType],
39 | separator: String,
40 | renameId: Boolean) = {
41 | val flattened = messageType.convertWith(new TypeFlattener(separator, renameId))
42 | val fieldsToUse = previousMessageType match {
43 | case Some(prevMessageType) => {
44 | // if passed a previous flattened schema, preserve that field ordering,
45 | // and append any new fields
46 | val prevFields = prevMessageType.getFields.asScala.toList
47 | prevFields ::: flattened.filterNot{prevFields.contains(_)}
48 | }
49 | case None => flattened
50 | }
51 | new MessageType(messageType.getName, fieldsToUse.asJava)
52 | }
53 |
54 | def isRepeated(t: Type) = t.isRepetition(Type.Repetition.REPEATED)
55 |
56 | def omitIdField(fieldName: String, numberOfFields: Integer, renameId: Boolean) = {
57 | renameId && numberOfFields > 1 && (fieldName == "id" || fieldName == "_id")
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
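To make the flattening concrete, here is an illustrative sketch (not part of the repository) using a tiny hand-written schema: nested field names are joined with the separator, repeated groups are dropped, and with renameId a nested `id` collapses onto its parent (`user.id` becomes `user` rather than `user__id`).

    // Hypothetical TypeFlattener usage on a small nested schema.
    import parquet.schema.MessageTypeParser
    import com.stripe.herringbone.flatten.TypeFlattener

    object FlattenExample {
      def main(args: Array[String]): Unit = {
        val nested = MessageTypeParser.parseMessageType(
          """message event {
            |  required binary id;
            |  optional group user {
            |    optional binary id;
            |    optional int64 age;
            |  }
            |}""".stripMargin)
        // Prints a flat schema with optional columns: id, user, user__age
        println(TypeFlattener.flatten(nested, None, "__", true))
      }
    }
--------------------------------------------------------------------------------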
/herringbone-main/src/main/scala/com/stripe/herringbone/load/FieldUtils.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.load
2 |
3 | import com.stripe.herringbone.util.ParquetUtils
4 |
5 | import org.apache.hadoop.fs._
6 |
7 | import parquet.schema.{ PrimitiveType, Type }
8 | import parquet.schema.PrimitiveType.PrimitiveTypeName
9 | import parquet.schema.PrimitiveType.PrimitiveTypeName._
10 |
11 | import scala.collection.JavaConversions._
12 |
13 | case class FieldUtils(hadoopFs: HadoopFs, schemaTypeMapper: SchemaTypeMapper) {
14 | def findPartitionFields(path: Path) = {
15 | hadoopFs.findPartitions(path).map {
16 | case (name, example) if (example.forall{_.isDigit}) =>
17 | "`%s` int".format(name)
18 | case (name, _) =>
19 | "`%s` string".format(name)
20 | }
21 | }
22 |
23 | def findTableFields(path: Path) = {
24 | val schema = ParquetUtils.readSchema(path)
25 | tableFieldsFromSchemaFields(schema.getFields)
26 | }
27 |
28 | def tableFieldsFromSchemaFields(fields: Seq[Type]) = {
29 | fields
30 | .filter { f => f.isPrimitive }
31 | .map { f =>
32 | "`%s` %s".format(f.getName, schemaTypeMapper.getSchemaType(f.asInstanceOf[PrimitiveType].getPrimitiveTypeName))
33 | }.toList
34 | }
35 | }
36 |
37 | trait SchemaTypeMapper {
38 | def getSchemaType(pt: PrimitiveTypeName): String
39 | }
40 |
41 | object ImpalaHiveSchemaTypeMapper extends SchemaTypeMapper {
42 | def getSchemaType(pt: PrimitiveTypeName) = {
43 | pt match {
44 | case BINARY => "STRING"
45 | case INT32 => "INT"
46 | case INT64 | INT96 => "BIGINT"
47 | case DOUBLE => "DOUBLE"
48 | case BOOLEAN => "BOOLEAN"
49 | case FLOAT => "FLOAT"
50 | case FIXED_LEN_BYTE_ARRAY => "BINARY"
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
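As an illustration (not from the repository), the mapper above translates each primitive Parquet type into an Impala/Hive column definition; for example, a BINARY field named email becomes the DDL fragment `email` STRING. The fields below are hand-built for the example.

    // Hypothetical use of FieldUtils.tableFieldsFromSchemaFields with made-up fields.
    import parquet.schema.{PrimitiveType, Type}
    import parquet.schema.PrimitiveType.PrimitiveTypeName
    import com.stripe.herringbone.load.{FieldUtils, HadoopFs, ImpalaHiveSchemaTypeMapper}

    object FieldMappingExample {
      def main(args: Array[String]): Unit = {
        val fields: Seq[Type] = Seq(
          new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "email"),
          new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveTypeName.INT64, "created"))
        val fieldUtils = FieldUtils(new HadoopFs, ImpalaHiveSchemaTypeMapper)
        // Prints: List(`email` STRING, `created` BIGINT)
        println(fieldUtils.tableFieldsFromSchemaFields(fields))
      }
    }
--------------------------------------------------------------------------------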
/herringbone-main/src/main/scala/com/stripe/herringbone/load/HadoopFs.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.load
2 |
3 | import com.stripe.herringbone.util.ParquetUtils
4 |
5 | import org.apache.hadoop.conf._
6 | import org.apache.hadoop.fs._
7 | import org.apache.hadoop.util._
8 |
9 | class HadoopFs {
10 | def findAbsolutePath(path: Path) = {
11 | path.getFileSystem(new Configuration).getFileStatus(path).getPath.toUri.toString
12 | }
13 |
14 | def findSortedLeafPaths(path: Path): List[Path] =
15 | findLeafPaths(path).sortBy{case (path,time) => time}.map{_._1}
16 |
17 | def findLeafPaths(path: Path): List[(Path,Long)] = {
18 | val fs = path.getFileSystem(new Configuration)
19 | val parquetFileStatuses = fs.listStatus(path, ParquetUtils.parquetFilter)
20 | if (parquetFileStatuses.size > 0)
21 | List((path, parquetFileStatuses.head.getModificationTime))
22 | else {
23 | fs.listStatus(path, ParquetUtils.partitionFilter)
24 | .toList
25 | .map{_.getPath}
26 | .flatMap{findLeafPaths(_)}
27 | }
28 | }
29 |
30 | def findPartitions(path: Path) = {
31 | path.toUri.getPath.split("/")
32 | .filter{_.contains("=")}
33 | .map{segment =>
34 | val parts = segment.split("=")
35 | (parts(0), parts(1))
36 | }.toList
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
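For reference, findPartitions is pure string parsing of key=value path segments; an illustrative run (the path is hypothetical, not from the repository):

    // Hypothetical HadoopFs.findPartitions example; the path is made up.
    import org.apache.hadoop.fs.Path
    import com.stripe.herringbone.load.HadoopFs

    object PartitionParseExample {
      def main(args: Array[String]): Unit = {
        val path = new Path("/data/events/day=2014-06-01/shard=3/part-00000.parquet")
        // Prints: List((day,2014-06-01), (shard,3))
        println(new HadoopFs().findPartitions(path))
      }
    }
--------------------------------------------------------------------------------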
/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveLoader.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone
2 |
3 | import com.stripe.herringbone.load._
4 |
5 | import java.sql.ResultSet
6 |
7 | import org.apache.hadoop.conf._
8 | import org.apache.hadoop.fs._
9 | import org.apache.hadoop.util._
10 |
11 | case class HiveLoader(conf: ParquetLoadConf,
12 | hadoopFs: HadoopFs,
13 | fieldUtils: FieldUtils) extends ParquetLoader {
14 |
15 | val connection = HiveServer2Connection(conf.connectionUrl() + ":" + conf.connectionPort())
16 |
17 | def checkTableExists(table: String, database: String): Boolean = {
18 | useDatabase(database)
19 | var exists: Boolean = false
20 | connection.executeQuery("SHOW TABLES") { resultSet =>
21 | val existingTable = resultSet.getString(1).trim
22 | if (existingTable == table)
23 | exists = true
24 | }
25 | exists
26 | }
27 |
28 | def createTable(pathString: String, table: String, database: String = "default") {
29 | val path = new Path(pathString)
30 | val location = hadoopFs.findAbsolutePath(path)
31 | val leafPaths = hadoopFs.findSortedLeafPaths(path)
32 |
33 | if (leafPaths.isEmpty)
34 | error("Could not find parquet files under " + path)
35 |
36 | val tableFields = fieldUtils.findTableFields(leafPaths.last)
37 | val partitionFields = fieldUtils.findPartitionFields(leafPaths.last)
38 | val tableWhileImporting = table + "__import"
39 |
40 | useDatabase(database)
41 |
42 | createTableWithPartitionFields(location, tableWhileImporting, tableFields, partitionFields)
43 |
44 | connection.execute("DROP TABLE IF EXISTS %s".format(table))
45 | connection.execute("ALTER TABLE %s RENAME TO %s".format(tableWhileImporting, table))
46 |
47 | if (!partitionFields.isEmpty)
48 | updateTable(table, database)
49 | }
50 |
51 | def createTableWithPartitionFields(location: String, table: String, tableFields: List[String],
52 | partitionFields: List[String]) {
53 |
54 | connection.execute("DROP TABLE IF EXISTS `%s`".format (table))
55 |
56 | val tableClause = "CREATE EXTERNAL TABLE IF NOT EXISTS `%s` (%s)".format(
57 | table, tableFields.mkString(", "))
58 |
59 | val partitionClause =
60 | if (partitionFields.isEmpty)
61 | ""
62 | else
63 |         " PARTITIONED BY (%s)".format(partitionFields.mkString(", "))
64 |
65 | val storedClause = " STORED AS PARQUET LOCATION \"%s\"".format(location)
66 |
67 | connection.execute(tableClause + partitionClause + storedClause)
68 | }
69 |
70 | def updateTable(table: String, database: String) = {
71 | connection.execute("MSCK REPAIR TABLE %s".format(table))
72 | }
73 |
74 | def closeConnection() = connection.close
75 |
76 | private def useDatabase(database: String) = {
77 | connection.execute("CREATE DATABASE IF NOT EXISTS %s".format(database))
78 | connection.execute("USE %s".format(database))
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveServer2Connection.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.load
2 |
3 | import java.sql.{ Connection, DriverManager, ResultSet }
4 |
5 | case class HiveServer2Connection(connectionUrl: String) {
6 | lazy val connection: Connection = {
7 | Class.forName("org.apache.hive.jdbc.HiveDriver")
8 | DriverManager.getConnection(connectionUrl)
9 | }
10 |
11 | def execute(query: String) {
12 | try {
13 | println(query)
14 | val statement = connection.createStatement
15 | statement.execute(query)
16 | } catch {
17 | case e: Throwable => e.printStackTrace
18 | }
19 | }
20 |
21 | def executeQuery(query: String)(fn: ResultSet => Unit) {
22 | try {
23 | println(query)
24 | val statement = connection.createStatement
25 | val resultSet = statement.executeQuery(query)
26 | while (resultSet.next) {
27 | fn(resultSet)
28 | }
29 | } catch {
30 | case e: Throwable => e.printStackTrace
31 | }
32 | }
33 |
34 | def close = connection.close
35 | }
36 |
--------------------------------------------------------------------------------
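A minimal usage sketch (not part of the repository): the JDBC URL is hypothetical, and the Hive JDBC driver must be on the classpath for the Class.forName call above to succeed.

    // Hypothetical HiveServer2Connection usage; the URL is made up.
    import com.stripe.herringbone.load.HiveServer2Connection

    object HiveConnectionExample {
      def main(args: Array[String]): Unit = {
        val conn = HiveServer2Connection("jdbc:hive2://hive.example.com:10000")
        conn.executeQuery("SHOW TABLES") { resultSet =>
          println(resultSet.getString(1))
        }
        conn.close
      }
    }
--------------------------------------------------------------------------------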
/herringbone-main/src/main/scala/com/stripe/herringbone/load/ImpalaLoader.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.load
2 |
3 | import com.stripe.herringbone.impala.{ImpalaClient,ImpalaValue}
4 |
5 | import org.apache.hadoop.conf._
6 | import org.apache.hadoop.util._
7 | import org.apache.hadoop.fs._
8 |
9 | case class ImpalaLoader(conf: ParquetLoadConf,
10 | hadoopFs: HadoopFs,
11 | fieldUtils: FieldUtils) extends ParquetLoader {
12 |
13 | lazy val impalaClient = ImpalaClient(conf.connectionUrl(),
14 | conf.connectionPort().toInt)
15 |
16 | def checkTableExists(table: String, database: String): Boolean = {
17 | useDatabase(database)
18 | var exists: Boolean = false
19 | query("SHOW TABLES"){row =>
20 | row.foreach { value =>
21 | if (value.raw == table) exists = true
22 | }
23 | }
24 | exists
25 | }
26 |
27 | def createTable(pathString: String, table: String, database: String = "default") {
28 | val path = new Path(pathString)
29 | val location = hadoopFs.findAbsolutePath(path)
30 | val leafPaths = hadoopFs.findSortedLeafPaths(path)
31 |
32 | if(leafPaths.isEmpty)
33 | error("Could not find parquet files under " + path)
34 |
35 | val tableFields = fieldUtils.findTableFields(leafPaths.last)
36 | val partitionFields = fieldUtils.findPartitionFields(leafPaths.last)
37 |
38 | useDatabase("importing")
39 |
40 | createTableWithPartitionFields(location, table, tableFields, partitionFields)
41 |
42 | if(partitionFields.size > 0)
43 | addPartitions(table, leafPaths.map{hadoopFs.findPartitions(_)})
44 |
45 | useDatabase(database)
46 | execute("DROP TABLE IF EXISTS %s.%s".format(database, table))
47 | execute("ALTER TABLE importing.%s RENAME TO %s.%s".format(table, database, table))
48 | if (partitionFields.isEmpty && conf.computeStats()) execute("COMPUTE STATS %s.%s".format(database, table))
49 | }
50 |
51 | def updateTable(table: String, database: String) {
52 | useDatabase(database)
53 |
54 | val basePath = findBasePath(table)
55 | val tablePartitions = findTablePartitions(table)
56 | val leafPaths = hadoopFs.findSortedLeafPaths(new Path(basePath))
57 | leafPaths.reverse.foreach{path =>
58 | val partitions = hadoopFs.findPartitions(path)
59 | if(!tablePartitions.contains(partitions.map{_._2}))
60 | addPartition(table, partitions)
61 | }
62 | }
63 |
64 | def findBasePath(table: String) = {
65 | var location: String = null
66 | query("DESCRIBE FORMATTED %s".format(table)){row =>
67 | if(row(0).raw.startsWith("Location:"))
68 | location = row(1).raw
69 | }
70 | location
71 | }
72 |
73 | def findTablePartitions(table: String) = {
74 | var partitions: List[List[String]] = Nil
75 | query("SHOW TABLE STATS %s".format(table)){row =>
76 | if(row.size > 4)
77 | partitions ::= List(row(0).raw)
78 | }
79 | partitions
80 | }
81 |
82 | def createTableWithPartitionFields(location: String, table: String, tableFields: List[String], partitionFields: List[String]) {
83 | execute("DROP TABLE IF EXISTS `%s`".format (table))
84 |
85 | val tableClause = "CREATE EXTERNAL TABLE IF NOT EXISTS `%s` (%s)".format(table, tableFields.mkString(", "))
86 | val partitionClause =
87 | if(partitionFields.isEmpty)
88 | ""
89 | else
90 |         " PARTITIONED BY (%s)".format(partitionFields.mkString(", "))
91 | val storedClause = " STORED AS PARQUETFILE LOCATION \"%s\"".format(location)
92 |
93 | execute(tableClause + partitionClause + storedClause)
94 | }
95 |
96 | def addPartitions(table: String, partitions: List[List[(String, String)]]) {
97 | partitions.foreach{addPartition(table, _)}
98 | }
99 |
100 | def addPartition(table: String, partitions: List[(String,String)]) {
101 | val partitionClause =
102 | partitions.map {
103 | case (name, value) if(value.forall{_.isDigit}) =>
104 | "`%s`=%s".format(name, value)
105 | case (name, value) =>
106 | "`%s`='%s'".format(name, value)
107 | }.mkString(", ")
108 |
109 | execute("ALTER TABLE %s ADD IF NOT EXISTS PARTITION (%s)".format(table, partitionClause))
110 | }
111 |
112 | def closeConnection() = {}
113 |
114 | private def useDatabase(database: String) = {
115 | execute("CREATE DATABASE IF NOT EXISTS %s".format(database))
116 | execute("USE %s".format(database))
117 | }
118 |
119 | private def execute(stmt: String) {
120 | impalaClient.execute(stmt)
121 | }
122 |
123 | private def query(stmt: String)(fn: Seq[ImpalaValue] => Unit) {
124 | impalaClient.query(stmt){ r => fn(r) }
125 | }
126 | }
127 |
--------------------------------------------------------------------------------
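For illustration (not from the repository), this standalone sketch restates the clause-building rule in addPartition: digit-only partition values are emitted bare, everything else is single-quoted. Table and partition names are hypothetical.

    // Hypothetical restatement of ImpalaLoader.addPartition's clause construction.
    object PartitionClauseExample {
      def clause(partitions: List[(String, String)]): String =
        partitions.map {
          case (name, value) if value.forall(_.isDigit) => "`%s`=%s".format(name, value)
          case (name, value) => "`%s`='%s'".format(name, value)
        }.mkString(", ")

      def main(args: Array[String]): Unit = {
        // Prints: ALTER TABLE events ADD IF NOT EXISTS PARTITION (`day`='2014-06-01', `shard`=3)
        println("ALTER TABLE events ADD IF NOT EXISTS PARTITION (%s)"
          .format(clause(List("day" -> "2014-06-01", "shard" -> "3"))))
      }
    }
--------------------------------------------------------------------------------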
/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoadConf.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.load
2 |
3 | import org.rogach.scallop._
4 |
5 | class ParquetLoadConf(arguments: Seq[String]) extends ScallopConf(arguments) {
6 | val database = opt[String](default = Some("default"))
7 | val table = opt[String](required = true)
8 | val path = opt[String]()
9 | val hive = opt[Boolean]("hive")
10 | val connectionUrl = opt[String](required = true)
11 | val connectionPort = opt[String](required = true)
12 | val computeStats = toggle(descrYes = "Compute table stats after loading files into impala. Turn this off for faster loading into impala (but probably slower querying later on!)", default = Some(true))
13 | val updatePartitions = toggle(descrYes = "Create table if not present, otherwise update with new partitions. If a schema change is being made to an existing table, turn this off.", default = Some(false))
14 | validateOpt (path, updatePartitions) {
15 | case (None, None) => Left("You must specify at least one of path or update-partitions")
16 | case _ => Right(Unit)
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoader.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.load
2 |
3 | trait ParquetLoader {
4 | def checkTableExists(table: String, db: String): Boolean
5 | def updateTable(table: String, db: String): Unit
6 | def createTable(path: String, table: String, db: String): Unit
7 | def closeConnection(): Unit
8 | }
9 |
10 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/scala/com/stripe/herringbone/util/ParquetUtils.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.util
2 |
3 | import org.apache.hadoop.conf._
4 | import org.apache.hadoop.util._
5 | import org.apache.hadoop.fs._
6 |
7 | import parquet.hadoop.ParquetFileReader
8 |
9 | object ParquetUtils {
10 | def getParquetMetadata(path: Path) = {
11 | // Just use the first parquet file to figure out the impala fields
12 | // This also dodges the problem of any non-parquet files stashed
13 | // in the path.
14 | val fs = path.getFileSystem(new Configuration)
15 | val parquetFileStatuses = fs.listStatus(path, parquetFilter)
16 | val representativeParquetPath = parquetFileStatuses.head.getPath
17 |
18 | val footers = ParquetFileReader.readFooters(new Configuration, representativeParquetPath)
19 | footers.get(0).getParquetMetadata
20 | }
21 |
22 | def readSchema(path: Path) = {
23 | getParquetMetadata(path).getFileMetaData.getSchema
24 | }
25 |
26 | def readKeyValueMetaData(path: Path) = {
27 | getParquetMetadata(path).getFileMetaData.getKeyValueMetaData
28 | }
29 |
30 | val parquetFilter = new PathFilter {
31 | def accept(path: Path) = path.getName.endsWith(".parquet")
32 | }
33 |
34 | val partitionFilter = new PathFilter {
35 | def accept(path: Path) = path.getName.contains("=")
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
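An illustrative sketch (the HDFS path is hypothetical, not from the repository) of reading the schema of a directory of Parquet files with the helper above; readSchema inspects the footer of the first *.parquet file it finds.

    // Hypothetical ParquetUtils.readSchema usage; the path is made up.
    import org.apache.hadoop.fs.Path
    import com.stripe.herringbone.util.ParquetUtils

    object ReadSchemaExample {
      def main(args: Array[String]): Unit = {
        val schema = ParquetUtils.readSchema(new Path("hdfs:///data/events"))
        println(schema)  // the Parquet MessageType used for table field generation
      }
    }
--------------------------------------------------------------------------------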
/herringbone-main/src/main/thrift/ImpalaService.thrift:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Cloudera Inc.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | namespace cpp impala
16 | namespace java com.cloudera.impala.thrift
17 | namespace rb impala.protocol
18 |
19 | include "Status.thrift"
20 | include "beeswax.thrift"
21 | include "cli_service.thrift"
22 |
23 | // ImpalaService accepts query execution options through beeswax.Query.configuration in
24 | // key:value form. For example, the list of strings could be:
25 | // "num_nodes:1", "abort_on_error:false"
26 | // The valid keys are listed in this enum. They map to TQueryOptions.
27 | // Note: If you add an option or change the default, you also need to update:
28 | // - ImpalaInternalService.thrift: TQueryOptions
29 | // - ImpaladClientExecutor.getBeeswaxQueryConfigurations()
30 | // - ImpalaServer::SetQueryOptions()
31 | // - ImpalaServer::TQueryOptionsToMap()
32 | enum TImpalaQueryOptions {
33 | // if true, abort execution on the first error
34 | ABORT_ON_ERROR,
35 |
36 | // maximum # of errors to be reported; Unspecified or 0 indicates backend default
37 | MAX_ERRORS,
38 |
39 | // if true, disable llvm codegen
40 | DISABLE_CODEGEN,
41 |
42 | // batch size to be used by backend; Unspecified or a size of 0 indicates backend
43 | // default
44 | BATCH_SIZE,
45 |
46 | // a per-machine approximate limit on the memory consumption of this query;
47 | // unspecified or a limit of 0 means no limit;
48 | // otherwise specified either as:
49 | // a) an int (= number of bytes);
50 | // b) a float followed by "M" (MB) or "G" (GB)
51 | MEM_LIMIT,
52 |
53 | // specifies the degree of parallelism with which to execute the query;
54 | // 1: single-node execution
55 | // NUM_NODES_ALL: executes on all nodes that contain relevant data
56 | // NUM_NODES_ALL_RACKS: executes on one node per rack that holds relevant data
57 | // > 1: executes on at most that many nodes at any point in time (ie, there can be
58 | // more nodes than numNodes with plan fragments for this query, but at most
59 | // numNodes would be active at any point in time)
60 | // Constants (NUM_NODES_ALL, NUM_NODES_ALL_RACKS) are defined in JavaConstants.thrift.
61 | NUM_NODES,
62 |
63 | // maximum length of the scan range; only applicable to HDFS scan range; Unspecified or
64 | // a length of 0 indicates backend default;
65 | MAX_SCAN_RANGE_LENGTH,
66 |
67 | // Maximum number of io buffers (per disk)
68 | MAX_IO_BUFFERS,
69 |
70 | // Number of scanner threads.
71 | NUM_SCANNER_THREADS,
72 |
73 | // If true, Impala will try to execute on file formats that are not fully supported yet
74 | ALLOW_UNSUPPORTED_FORMATS,
75 |
76 | // if set and > -1, specifies the default limit applied to a top-level SELECT statement
77 | // with an ORDER BY but without a LIMIT clause (ie, if the SELECT statement also has
78 | // a LIMIT clause, this default is ignored)
79 | DEFAULT_ORDER_BY_LIMIT,
80 |
81 | // DEBUG ONLY:
82 | // If set to
83 |   //   "[<backend number>:]<node id>:<exec phase>:<debug action>",
84 | // the exec node with the given id will perform the specified action in the given
85 | // phase. If the optional backend number (starting from 0) is specified, only that
86 | // backend instance will perform the debug action, otherwise all backends will behave
87 | // in that way.
88 | // If the string doesn't have the required format or if any of its components is
89 | // invalid, the option is ignored.
90 | DEBUG_ACTION,
91 |
92 | // If true, raise an error when the DEFAULT_ORDER_BY_LIMIT has been reached.
93 | ABORT_ON_DEFAULT_LIMIT_EXCEEDED,
94 |
95 | // Compression codec for parquet when inserting into parquet tables.
96 | // Valid values are "snappy", "gzip" and "none"
97 | // Leave blank to use default.
98 | PARQUET_COMPRESSION_CODEC,
99 |
100 | // HBase scan query option. If set and > 0, HBASE_CACHING is the value for
101 | // "hbase.client.Scan.setCaching()" when querying HBase table. Otherwise, use backend
102 | // default.
103 | // If the value is too high, then the hbase region server will have a hard time (GC
104 | // pressure and long response times). If the value is too small, then there will be
105 | // extra trips to the hbase region server.
106 | HBASE_CACHING,
107 |
108 | // HBase scan query option. If set, HBase scan will always set
109 | // "hbase.client.setCacheBlocks" to CACHE_BLOCKS. Default is false.
110 | // If the table is large and the query is doing big scan, set it to false to
111 | // avoid polluting the cache in the hbase region server.
112 |   // If the table is small and the table is used several times, set it to true to improve
113 | // performance.
114 | HBASE_CACHE_BLOCKS,
115 | }
116 |
117 | // The summary of an insert.
118 | struct TInsertResult {
119 | // Number of appended rows per modified partition. Only applies to HDFS tables.
120 | // The keys represent partitions to create, coded as k1=v1/k2=v2/k3=v3..., with the
121 | // root in an unpartitioned table being the empty string.
122 |   1: required map<string, i64> rows_appended
123 | }
124 |
125 | // Response from a call to PingImpalaService
126 | struct TPingImpalaServiceResp {
127 | // The Impala service's version string.
128 | 1: string version
129 | }
130 |
131 | // Parameters for a ResetTable request which will invalidate a table's metadata.
132 | // DEPRECATED.
133 | struct TResetTableReq {
134 | // Name of the table's parent database.
135 | 1: required string db_name
136 |
137 | // Name of the table.
138 | 2: required string table_name
139 | }
140 |
141 | // For all rpc that return a TStatus as part of their result type,
142 | // if the status_code field is set to anything other than OK, the contents
143 | // of the remainder of the result type is undefined (typically not set)
144 | service ImpalaService extends beeswax.BeeswaxService {
145 | // Cancel execution of query. Returns RUNTIME_ERROR if query_id
146 | // unknown.
147 | // This terminates all threads running on behalf of this query at
148 | // all nodes that were involved in the execution.
149 | // Throws BeeswaxException if the query handle is invalid (this doesn't
150 | // necessarily indicate an error: the query might have finished).
151 | Status.TStatus Cancel(1:beeswax.QueryHandle query_id)
152 | throws(1:beeswax.BeeswaxException error);
153 |
154 | // Invalidates all catalog metadata, forcing a reload
155 | // DEPRECATED; execute query "invalidate metadata" to refresh metadata
156 | Status.TStatus ResetCatalog();
157 |
158 | // Invalidates a specific table's catalog metadata, forcing a reload on the next access
159 |   // DEPRECATED; execute query "refresh <table name>" to refresh metadata
160 | Status.TStatus ResetTable(1:TResetTableReq request)
161 |
162 | // Returns the runtime profile string for the given query handle.
163 | string GetRuntimeProfile(1:beeswax.QueryHandle query_id)
164 | throws(1:beeswax.BeeswaxException error);
165 |
166 | // Closes the query handle and return the result summary of the insert.
167 | TInsertResult CloseInsert(1:beeswax.QueryHandle handle)
168 | throws(1:beeswax.QueryNotFoundException error, 2:beeswax.BeeswaxException error2);
169 |
170 | // Client calls this RPC to verify that the server is an ImpalaService. Returns the
171 | // server version.
172 | TPingImpalaServiceResp PingImpalaService();
173 | }
174 |
175 | // Impala HiveServer2 service
176 | service ImpalaHiveServer2Service extends cli_service.TCLIService {
177 | }
178 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/Status.thrift:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Cloudera Inc.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | namespace cpp impala
16 | namespace java com.cloudera.impala.thrift
17 | namespace rb impala.protocol
18 |
19 | enum TStatusCode {
20 | OK,
21 | CANCELLED,
22 | ANALYSIS_ERROR,
23 | NOT_IMPLEMENTED_ERROR,
24 | RUNTIME_ERROR,
25 | MEM_LIMIT_EXCEEDED,
26 | INTERNAL_ERROR
27 | }
28 |
29 | struct TStatus {
30 | 1: required TStatusCode status_code
31 |   2: list<string> error_msgs
32 | }
33 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/beeswax.thrift:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to Cloudera, Inc. under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. Cloudera, Inc. licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | *
18 | * Interface for interacting with Beeswax Server
19 | */
20 |
21 | namespace java com.cloudera.beeswax.api
22 | namespace py beeswaxd
23 | namespace cpp beeswax
24 | namespace rb impala.protocol.beeswax
25 |
26 | include "hive_metastore.thrift"
27 |
28 | // A Query
29 | struct Query {
30 | 1: string query;
31 | // A list of HQL commands to execute before the query.
32 | // This is typically defining UDFs, setting settings, and loading resources.
33 |   3: list<string> configuration;
34 |
35 | // User and groups to "act as" for purposes of Hadoop.
36 | 4: string hadoop_user;
37 | }
38 |
39 | typedef string LogContextId
40 |
41 | enum QueryState {
42 | CREATED,
43 | INITIALIZED,
44 | COMPILED,
45 | RUNNING,
46 | FINISHED,
47 | EXCEPTION
48 | }
49 |
50 | struct QueryHandle {
51 | 1: string id;
52 | 2: LogContextId log_context;
53 | }
54 |
55 | struct QueryExplanation {
56 | 1: string textual
57 | }
58 |
59 | struct Results {
60 | // If set, data is valid. Otherwise, results aren't ready yet.
61 | 1: bool ready,
62 | // Columns for the results
63 |   2: list<string> columns,
64 |   // A set of results
65 |   3: list<string> data,
66 | // The starting row of the results
67 | 4: i64 start_row,
68 | // Whether there are more results to fetch
69 | 5: bool has_more
70 | }
71 |
72 | /**
73 | * Metadata information about the results.
74 | * Applicable only for SELECT.
75 | */
76 | struct ResultsMetadata {
77 | /** The schema of the results */
78 | 1: hive_metastore.Schema schema,
79 | /** The directory containing the results. Not applicable for partition table. */
80 | 2: string table_dir,
81 | /** If the results are straight from an existing table, the table name. */
82 | 3: string in_tablename,
83 | /** Field delimiter */
84 | 4: string delim,
85 | }
86 |
87 | exception BeeswaxException {
88 | 1: string message,
89 | // Use get_log(log_context) to retrieve any log related to this exception
90 | 2: LogContextId log_context,
91 | // (Optional) The QueryHandle that caused this exception
92 | 3: QueryHandle handle,
93 | 4: optional i32 errorCode = 0,
94 | 5: optional string SQLState = " "
95 | }
96 |
97 | exception QueryNotFoundException {
98 | }
99 |
100 | /** Represents a Hadoop-style configuration variable. */
101 | struct ConfigVariable {
102 | 1: string key,
103 | 2: string value,
104 | 3: string description
105 | }
106 |
107 | service BeeswaxService {
108 | /**
109 | * Submit a query and return a handle (QueryHandle). The query runs asynchronously.
110 | */
111 | QueryHandle query(1:Query query) throws(1:BeeswaxException error),
112 |
113 | /**
114 | * run a query synchronously and return a handle (QueryHandle).
115 | */
116 | QueryHandle executeAndWait(1:Query query, 2:LogContextId clientCtx)
117 | throws(1:BeeswaxException error),
118 |
119 | /**
120 | * Get the query plan for a query.
121 | */
122 | QueryExplanation explain(1:Query query)
123 | throws(1:BeeswaxException error),
124 |
125 | /**
126 | * Get the results of a query. This is non-blocking. Caller should check
127 | * Results.ready to determine if the results are in yet. The call requests
128 | * the batch size of fetch.
129 | */
130 | Results fetch(1:QueryHandle query_id, 2:bool start_over, 3:i32 fetch_size=-1)
131 | throws(1:QueryNotFoundException error, 2:BeeswaxException error2),
132 |
133 | /**
134 | * Get the state of the query
135 | */
136 | QueryState get_state(1:QueryHandle handle) throws(1:QueryNotFoundException error),
137 |
138 | /**
139 | * Get the result metadata
140 | */
141 | ResultsMetadata get_results_metadata(1:QueryHandle handle)
142 | throws(1:QueryNotFoundException error),
143 |
144 | /**
145 | * Used to test connection to server. A "noop" command.
146 | */
147 | string echo(1:string s)
148 |
149 | /**
150 | * Returns a string representation of the configuration object being used.
151 | * Handy for debugging.
152 | */
153 | string dump_config()
154 |
155 | /**
156 | * Get the log messages related to the given context.
157 | */
158 | string get_log(1:LogContextId context) throws(1:QueryNotFoundException error)
159 |
160 | /*
161 | * Returns "default" configuration.
162 | */
163 |   list<ConfigVariable> get_default_configuration(1:bool include_hadoop)
164 |
165 | /*
166 | * closes the query with given handle
167 | */
168 | void close(1:QueryHandle handle) throws(1:QueryNotFoundException error,
169 | 2:BeeswaxException error2)
170 |
171 | /*
172 | * clean the log context for given id
173 | */
174 | void clean(1:LogContextId log_context)
175 | }
176 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/fb303.thrift:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one
3 | * or more contributor license agreements. See the NOTICE file
4 | * distributed with this work for additional information
5 | * regarding copyright ownership. The ASF licenses this file
6 | * to you under the Apache License, Version 2.0 (the
7 | * "License"); you may not use this file except in compliance
8 | * with the License. You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing,
13 | * software distributed under the License is distributed on an
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 | * KIND, either express or implied. See the License for the
16 | * specific language governing permissions and limitations
17 | * under the License.
18 | */
19 |
20 | /**
21 | * fb303.thrift
22 | */
23 |
24 | namespace java com.facebook.fb303
25 | namespace cpp facebook.fb303
26 | namespace rb Impala.Protocol.fb303
27 |
28 | /**
29 | * Common status reporting mechanism across all services
30 | */
31 | enum fb_status {
32 | DEAD = 0,
33 | STARTING = 1,
34 | ALIVE = 2,
35 | STOPPING = 3,
36 | STOPPED = 4,
37 | WARNING = 5,
38 | }
39 |
40 | /**
41 | * Standard base service
42 | */
43 | service FacebookService {
44 |
45 | /**
46 | * Returns a descriptive name of the service
47 | */
48 | string getName(),
49 |
50 | /**
51 | * Returns the version of the service
52 | */
53 | string getVersion(),
54 |
55 | /**
56 | * Gets the status of this service
57 | */
58 | fb_status getStatus(),
59 |
60 | /**
61 | * User friendly description of status, such as why the service is in
62 | * the dead or warning state, or what is being started or stopped.
63 | */
64 | string getStatusDetails(),
65 |
66 | /**
67 | * Gets the counters for this service
68 | */
69 |   map<string, i64> getCounters(),
70 |
71 | /**
72 | * Gets the value of a single counter
73 | */
74 | i64 getCounter(1: string key),
75 |
76 | /**
77 | * Sets an option
78 | */
79 | void setOption(1: string key, 2: string value),
80 |
81 | /**
82 | * Gets an option
83 | */
84 | string getOption(1: string key),
85 |
86 | /**
87 | * Gets all options
88 | */
89 |   map<string, string> getOptions(),
90 |
91 | /**
92 | * Returns a CPU profile over the given time interval (client and server
93 | * must agree on the profile format).
94 | */
95 | string getCpuProfile(1: i32 profileDurationInSec),
96 |
97 | /**
98 | * Returns the unix time that the server has been running since
99 | */
100 | i64 aliveSince(),
101 |
102 | /**
103 | * Tell the server to reload its configuration, reopen log files, etc
104 | */
105 | oneway void reinitialize(),
106 |
107 | /**
108 | * Suggest a shutdown to the server
109 | */
110 | oneway void shutdown(),
111 |
112 | }
113 |
--------------------------------------------------------------------------------
/herringbone-main/src/main/thrift/hive_metastore.thrift:
--------------------------------------------------------------------------------
1 | #!/usr/local/bin/thrift -java
2 |
3 | /**
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | */
20 |
21 | #
22 | # Thrift Service that the MetaStore is built on
23 | #
24 |
25 | include "fb303.thrift"
26 |
27 | namespace java org.apache.hadoop.hive.metastore.api
28 | namespace php metastore
29 | namespace cpp Apache.Hadoop.Hive
30 | namespace rb Impala.Protocol.HiveMetastore
31 |
32 | const string DDL_TIME = "transient_lastDdlTime"
33 |
34 | struct Version {
35 | 1: string version,
36 | 2: string comments
37 | }
38 |
39 | struct FieldSchema {
40 | 1: string name, // name of the field
41 |   2: string type, // type of the field. primitive types defined above, specify list<TYPE_NAME>, map<TYPE_NAME, TYPE_NAME> for lists & maps
42 | 3: string comment
43 | }
44 |
45 | struct Type {
46 | 1: string name, // one of the types in PrimitiveTypes or CollectionTypes or User defined types
47 | 2: optional string type1, // object type if the name is 'list' (LIST_TYPE), key type if the name is 'map' (MAP_TYPE)
48 | 3: optional string type2, // val type if the name is 'map' (MAP_TYPE)
49 |   //4: optional list<FieldSchema> fields // if the name is one of the user defined types
50 | }
51 |
52 | enum HiveObjectType {
53 | GLOBAL = 1,
54 | DATABASE = 2,
55 | TABLE = 3,
56 | PARTITION = 4,
57 | COLUMN = 5,
58 | }
59 |
60 | enum PrincipalType {
61 | USER = 1,
62 | ROLE = 2,
63 | GROUP = 3,
64 | }
65 |
66 | const string HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__"
67 | const string HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__"
68 | const string HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__"
69 |
70 | enum PartitionEventType {
71 | LOAD_DONE = 1,
72 | }
73 |
74 | struct HiveObjectRef{
75 | 1: HiveObjectType objectType,
76 | 2: string dbName,
77 | 3: string objectName,
78 |   4: list<string> partValues,
79 | 5: string columnName,
80 | }
81 |
82 | struct PrivilegeGrantInfo {
83 | 1: string privilege,
84 | 2: i32 createTime,
85 | 3: string grantor,
86 | 4: PrincipalType grantorType,
87 | 5: bool grantOption,
88 | }
89 |
90 | struct HiveObjectPrivilege {
91 | 1: HiveObjectRef hiveObject,
92 | 2: string principalName,
93 | 3: PrincipalType principalType,
94 | 4: PrivilegeGrantInfo grantInfo,
95 | }
96 |
97 | struct PrivilegeBag {
98 |   1: list<HiveObjectPrivilege> privileges,
99 | }
100 |
101 | struct PrincipalPrivilegeSet {
102 |   1: map<string, list<PrivilegeGrantInfo>> userPrivileges, // user name -> privilege grant info
103 |   2: map<string, list<PrivilegeGrantInfo>> groupPrivileges, // group name -> privilege grant info
104 |   3: map<string, list<PrivilegeGrantInfo>> rolePrivileges, //role name -> privilege grant info
105 | }
106 |
107 | struct Role {
108 | 1: string roleName,
109 | 2: i32 createTime,
110 | 3: string ownerName,
111 | }
112 |
113 | // namespace for tables
114 | struct Database {
115 | 1: string name,
116 | 2: string description,
117 | 3: string locationUri,
118 |   4: map<string, string> parameters, // properties associated with the database
119 | 5: optional PrincipalPrivilegeSet privileges
120 | }
121 |
122 | // This object holds the information needed by SerDes
123 | struct SerDeInfo {
124 | 1: string name, // name of the serde, table name by default
125 | 2: string serializationLib, // usually the class that implements the extractor & loader
126 |   3: map<string, string> parameters // initialization parameters
127 | }
128 |
129 | // sort order of a column (column name along with asc(1)/desc(0))
130 | struct Order {
131 | 1: string col, // sort column name
132 | 2: i32 order // asc(1) or desc(0)
133 | }
134 |
135 | // this object holds all the information about physical storage of the data belonging to a table
136 | struct StorageDescriptor {
137 |   1: list<FieldSchema> cols, // required (refer to types defined above)
138 |   2: string location, // defaults to <warehouse loc>/<db loc>/tablename
139 |   3: string inputFormat, // SequenceFileInputFormat (binary) or TextInputFormat` or custom format
140 |   4: string outputFormat, // SequenceFileOutputFormat (binary) or IgnoreKeyTextOutputFormat or custom format
141 |   5: bool compressed, // compressed or not
142 |   6: i32 numBuckets, // this must be specified if there are any dimension columns
143 |   7: SerDeInfo serdeInfo, // serialization and deserialization information
144 |   8: list<string> bucketCols, // reducer grouping columns and clustering columns and bucketing columns`
145 |   9: list<Order> sortCols, // sort order of the data in each bucket
146 |   10: map<string, string> parameters // any user supplied key value hash
147 | }
148 |
149 | // table information
150 | struct Table {
151 | 1: string tableName, // name of the table
152 | 2: string dbName, // database name ('default')
153 | 3: string owner, // owner of this table
154 | 4: i32 createTime, // creation time of the table
155 | 5: i32 lastAccessTime, // last access time (usually this will be filled from HDFS and shouldn't be relied on)
156 | 6: i32 retention, // retention time
157 | 7: StorageDescriptor sd, // storage descriptor of the table
158 |   8: list<FieldSchema> partitionKeys, // partition keys of the table. only primitive types are supported
159 |   9: map<string, string> parameters, // to store comments or any other user level parameters
160 | 10: string viewOriginalText, // original view text, null for non-view
161 | 11: string viewExpandedText, // expanded view text, null for non-view
162 | 12: string tableType, // table type enum, e.g. EXTERNAL_TABLE
163 | 13: optional PrincipalPrivilegeSet privileges,
164 | }
165 |
166 | struct Partition {
167 |   1: list<string> values // string value is converted to appropriate partition key type
168 | 2: string dbName,
169 | 3: string tableName,
170 | 4: i32 createTime,
171 | 5: i32 lastAccessTime,
172 | 6: StorageDescriptor sd,
173 |   7: map<string, string> parameters,
174 | 8: optional PrincipalPrivilegeSet privileges
175 | }
176 |
177 | struct Index {
178 | 1: string indexName, // unique with in the whole database namespace
179 | 2: string indexHandlerClass, // reserved
180 | 3: string dbName,
181 | 4: string origTableName,
182 | 5: i32 createTime,
183 | 6: i32 lastAccessTime,
184 | 7: string indexTableName,
185 | 8: StorageDescriptor sd,
186 |   9: map<string, string> parameters,
187 | 10: bool deferredRebuild
188 | }
189 |
190 | // schema of the table/query results etc.
191 | struct Schema {
192 | // column names, types, comments
193 |  1: list<FieldSchema> fieldSchemas, // delimiters etc
194 |  2: map<string, string> properties
195 | }
196 |
197 | // Key-value store to be used with selected
198 | // Metastore APIs (create, alter methods).
199 | // The client can pass environment properties / configs that can be
200 | // accessed in hooks.
201 | struct EnvironmentContext {
202 |   1: map<string, string> properties
203 | }
204 |
205 | exception MetaException {
206 | 1: string message
207 | }
208 |
209 | exception UnknownTableException {
210 | 1: string message
211 | }
212 |
213 | exception UnknownDBException {
214 | 1: string message
215 | }
216 |
217 | exception AlreadyExistsException {
218 | 1: string message
219 | }
220 |
221 | exception InvalidPartitionException {
222 | 1: string message
223 | }
224 |
225 | exception UnknownPartitionException {
226 | 1: string message
227 | }
228 |
229 | exception InvalidObjectException {
230 | 1: string message
231 | }
232 |
233 | exception NoSuchObjectException {
234 | 1: string message
235 | }
236 |
237 | exception IndexAlreadyExistsException {
238 | 1: string message
239 | }
240 |
241 | exception InvalidOperationException {
242 | 1: string message
243 | }
244 |
245 | exception ConfigValSecurityException {
246 | 1: string message
247 | }
248 |
249 | /**
250 | * This interface is live.
251 | */
252 | service ThriftHiveMetastore extends fb303.FacebookService
253 | {
254 | void create_database(1:Database database) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3)
255 | Database get_database(1:string name) throws(1:NoSuchObjectException o1, 2:MetaException o2)
256 | void drop_database(1:string name, 2:bool deleteData, 3:bool cascade) throws(1:NoSuchObjectException o1, 2:InvalidOperationException o2, 3:MetaException o3)
257 |   list<string> get_databases(1:string pattern) throws(1:MetaException o1)
258 |   list<string> get_all_databases() throws(1:MetaException o1)
259 | void alter_database(1:string dbname, 2:Database db) throws(1:MetaException o1, 2:NoSuchObjectException o2)
260 |
261 |   // returns the type with given name (make separate calls for the dependent types if needed)
262 | Type get_type(1:string name) throws(1:MetaException o1, 2:NoSuchObjectException o2)
263 | bool create_type(1:Type type) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3)
264 | bool drop_type(1:string type) throws(1:MetaException o1, 2:NoSuchObjectException o2)
265 |   map<string, Type> get_type_all(1:string name)
266 | throws(1:MetaException o2)
267 |
268 | // Gets a list of FieldSchemas describing the columns of a particular table
269 |   list<FieldSchema> get_fields(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3),
270 |
271 | // Gets a list of FieldSchemas describing both the columns and the partition keys of a particular table
272 |   list<FieldSchema> get_schema(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3)
273 |
274 | // create a Hive table. Following fields must be set
275 | // tableName
276 | // database (only 'default' for now until Hive QL supports databases)
277 | // owner (not needed, but good to have for tracking purposes)
278 | // sd.cols (list of field schemas)
279 | // sd.inputFormat (SequenceFileInputFormat (binary like falcon tables or u_full) or TextInputFormat)
280 | // sd.outputFormat (SequenceFileInputFormat (binary) or TextInputFormat)
281 | // sd.serdeInfo.serializationLib (SerDe class name eg org.apache.hadoop.hive.serde.simple_meta.MetadataTypedColumnsetSerDe
282 | // * See notes on DDL_TIME
283 | void create_table(1:Table tbl) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3, 4:NoSuchObjectException o4)
284 | void create_table_with_environment_context(1:Table tbl,
285 | 2:EnvironmentContext environment_context)
286 | throws (1:AlreadyExistsException o1,
287 | 2:InvalidObjectException o2, 3:MetaException o3,
288 | 4:NoSuchObjectException o4)
289 | // drops the table and all the partitions associated with it if the table has partitions
290 | // delete data (including partitions) if deleteData is set to true
291 | void drop_table(1:string dbname, 2:string name, 3:bool deleteData)
292 | throws(1:NoSuchObjectException o1, 2:MetaException o3)
293 |   list<string> get_tables(1: string db_name, 2: string pattern) throws (1: MetaException o1)
294 |   list<string> get_all_tables(1: string db_name) throws (1: MetaException o1)
295 |
296 | Table get_table(1:string dbname, 2:string tbl_name)
297 | throws (1:MetaException o1, 2:NoSuchObjectException o2)
298 |   list<Table> get_table_objects_by_name(1:string dbname, 2:list<string> tbl_names)
299 | throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3)
300 |
301 | // Get a list of table names that match a filter.
302 | // The filter operators are LIKE, <, <=, >, >=, =, <>
303 | //
304 | // In the filter statement, values interpreted as strings must be enclosed in quotes,
305 | // while values interpreted as integers should not be. Strings and integers are the only
306 | // supported value types.
307 | //
308 | // The currently supported key names in the filter are:
309 | // Constants.HIVE_FILTER_FIELD_OWNER, which filters on the tables' owner's name
310 | // and supports all filter operators
311 | // Constants.HIVE_FILTER_FIELD_LAST_ACCESS, which filters on the last access times
312 | // and supports all filter operators except LIKE
313 | // Constants.HIVE_FILTER_FIELD_PARAMS, which filters on the tables' parameter keys and values
314 | // and only supports the filter operators = and <>.
315 | // Append the parameter key name to HIVE_FILTER_FIELD_PARAMS in the filter statement.
316 | // For example, to filter on parameter keys called "retention", the key name in the filter
317 | // statement should be Constants.HIVE_FILTER_FIELD_PARAMS + "retention"
318 | // Also, = and <> only work for keys that exist
319 | // in the tables. E.g., if you are looking for tables where key1 <> value, it will only
320 | // look at tables that have a value for the parameter key1.
321 | // Some example filter statements include:
322 | // filter = Constants.HIVE_FILTER_FIELD_OWNER + " like \".*test.*\" and " +
323 | // Constants.HIVE_FILTER_FIELD_LAST_ACCESS + " = 0";
324 | // filter = Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"30\" or " +
325 | // Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"90\""
326 | // @param dbName
327 | // The name of the database from which you will retrieve the table names
328 | // @param filterType
329 | // The type of filter
330 | // @param filter
331 | // The filter string
332 | // @param max_tables
333 | // The maximum number of tables returned
334 | // @return A list of table names that match the desired filter
335 | list<string> get_table_names_by_filter(1:string dbname, 2:string filter, 3:i16 max_tables=-1)
336 | throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3)
337 |
338 | // alter_table applies only to future partitions, not to existing partitions
339 | // * See notes on DDL_TIME
340 | void alter_table(1:string dbname, 2:string tbl_name, 3:Table new_tbl)
341 | throws (1:InvalidOperationException o1, 2:MetaException o2)
342 | void alter_table_with_environment_context(1:string dbname, 2:string tbl_name,
343 | 3:Table new_tbl, 4:EnvironmentContext environment_context)
344 | throws (1:InvalidOperationException o1, 2:MetaException o2)
345 | // the following applies to only tables that have partitions
346 | // * See notes on DDL_TIME
347 | Partition add_partition(1:Partition new_part)
348 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
349 | Partition add_partition_with_environment_context(1:Partition new_part,
350 | 2:EnvironmentContext environment_context)
351 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2,
352 | 3:MetaException o3)
353 | i32 add_partitions(1:list<Partition> new_parts)
354 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
355 | Partition append_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
356 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
357 | Partition append_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name)
358 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
359 | bool drop_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:bool deleteData)
360 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
361 | bool drop_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name, 4:bool deleteData)
362 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
363 | Partition get_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
364 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
365 |
366 | Partition get_partition_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals,
367 | 4: string user_name, 5: list<string> group_names) throws(1:MetaException o1, 2:NoSuchObjectException o2)
368 |
369 | Partition get_partition_by_name(1:string db_name 2:string tbl_name, 3:string part_name)
370 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
371 |
372 | // returns all the partitions for this table in reverse chronological order.
373 | // If max parts is given then it will return only that many.
374 | list<Partition> get_partitions(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
375 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
376 | list<Partition> get_partitions_with_auth(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1,
377 | 4: string user_name, 5: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
378 |
379 | list<string> get_partition_names(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
380 | throws(1:MetaException o2)
381 |
382 | // get_partition*_ps methods allow filtering by a partial partition specification,
383 | // as needed for dynamic partitions. The values that are not restricted should
384 | // be empty strings. Nulls were considered (instead of "") but caused errors in
385 | // generated Python code. The size of part_vals may be smaller than the
386 | // number of partition columns - the unspecified values are considered the same
387 | // as "".
388 | list<Partition> get_partitions_ps(1:string db_name 2:string tbl_name
389 | 3:list<string> part_vals, 4:i16 max_parts=-1)
390 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
391 | list<Partition> get_partitions_ps_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1,
392 | 5: string user_name, 6: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
393 |
394 | list<string> get_partition_names_ps(1:string db_name,
395 | 2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1)
396 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
397 |
398 | // get the partitions matching the given partition filter
399 | list<Partition> get_partitions_by_filter(1:string db_name 2:string tbl_name
400 | 3:string filter, 4:i16 max_parts=-1)
401 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
402 |
403 | // get partitions given a list of partition names
404 | list<Partition> get_partitions_by_names(1:string db_name 2:string tbl_name 3:list<string> names)
405 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
406 |
407 | // changes the partition to the new partition object. partition is identified from the part values
408 | // in the new_part
409 | // * See notes on DDL_TIME
410 | void alter_partition(1:string db_name, 2:string tbl_name, 3:Partition new_part)
411 | throws (1:InvalidOperationException o1, 2:MetaException o2)
412 |
413 | void alter_partition_with_environment_context(1:string db_name,
414 | 2:string tbl_name, 3:Partition new_part,
415 | 4:EnvironmentContext environment_context)
416 | throws (1:InvalidOperationException o1, 2:MetaException o2)
417 |
418 | // rename the old partition to the new partition object by changing old part values to the part values
419 | // in the new_part. old partition is identified from part_vals.
420 | // partition keys in new_part should be the same as those in old partition.
421 | void rename_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:Partition new_part)
422 | throws (1:InvalidOperationException o1, 2:MetaException o2)
423 |
424 | // gets the value of the configuration key in the metastore server. returns
425 | // defaultValue if the key does not exist. if the configuration key does not
426 | // begin with "hive", "mapred", or "hdfs", a ConfigValSecurityException is
427 | // thrown.
428 | string get_config_value(1:string name, 2:string defaultValue)
429 | throws(1:ConfigValSecurityException o1)
430 |
431 | // converts a partition name into a partition values array
432 | list<string> partition_name_to_vals(1: string part_name)
433 | throws(1: MetaException o1)
434 | // converts a partition name into a partition specification (a mapping from
435 | // the partition cols to the values)
436 | map<string, string> partition_name_to_spec(1: string part_name)
437 | throws(1: MetaException o1)
438 |
439 | void markPartitionForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
440 | 4:PartitionEventType eventType) throws (1: MetaException o1, 2: NoSuchObjectException o2,
441 | 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
442 | 6: InvalidPartitionException o6)
443 | bool isPartitionMarkedForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
444 | 4: PartitionEventType eventType) throws (1: MetaException o1, 2:NoSuchObjectException o2,
445 | 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
446 | 6: InvalidPartitionException o6)
447 |
448 | //index
449 | Index add_index(1:Index new_index, 2: Table index_table)
450 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
451 | void alter_index(1:string dbname, 2:string base_tbl_name, 3:string idx_name, 4:Index new_idx)
452 | throws (1:InvalidOperationException o1, 2:MetaException o2)
453 | bool drop_index_by_name(1:string db_name, 2:string tbl_name, 3:string index_name, 4:bool deleteData)
454 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
455 | Index get_index_by_name(1:string db_name 2:string tbl_name, 3:string index_name)
456 | throws(1:MetaException o1, 2:NoSuchObjectException o2)
457 |
458 | list<Index> get_indexes(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
459 | throws(1:NoSuchObjectException o1, 2:MetaException o2)
460 | list<string> get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
461 | throws(1:MetaException o2)
462 |
463 | //authorization privileges
464 |
465 | bool create_role(1:Role role) throws(1:MetaException o1)
466 | bool drop_role(1:string role_name) throws(1:MetaException o1)
467 | list<string> get_role_names() throws(1:MetaException o1)
468 | bool grant_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type,
469 | 4:string grantor, 5:PrincipalType grantorType, 6:bool grant_option) throws(1:MetaException o1)
470 | bool revoke_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type)
471 | throws(1:MetaException o1)
472 | list<Role> list_roles(1:string principal_name, 2:PrincipalType principal_type) throws(1:MetaException o1)
473 |
474 | PrincipalPrivilegeSet get_privilege_set(1:HiveObjectRef hiveObject, 2:string user_name,
475 | 3: list<string> group_names) throws(1:MetaException o1)
476 | list<HiveObjectPrivilege> list_privileges(1:string principal_name, 2:PrincipalType principal_type,
477 | 3: HiveObjectRef hiveObject) throws(1:MetaException o1)
478 |
479 | bool grant_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
480 | bool revoke_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
481 |
482 | // this is used by metastore client to send UGI information to metastore server immediately
483 | // after setting up a connection.
484 | list<string> set_ugi(1:string user_name, 2:list<string> group_names) throws (1:MetaException o1)
485 |
486 | //Authentication (delegation token) interfaces
487 |
488 | // get metastore server delegation token for use from the map/reduce tasks to authenticate
489 | // to metastore server
490 | string get_delegation_token(1:string token_owner, 2:string renewer_kerberos_principal_name)
491 | throws (1:MetaException o1)
492 |
493 | // method to renew delegation token obtained from metastore server
494 | i64 renew_delegation_token(1:string token_str_form) throws (1:MetaException o1)
495 |
496 | // method to cancel delegation token obtained from metastore server
497 | void cancel_delegation_token(1:string token_str_form) throws (1:MetaException o1)
498 | }
499 |
500 | // * Note about the DDL_TIME: When creating or altering a table or a partition,
501 | // if the DDL_TIME is not set, the current time will be used.
502 |
503 | // For storing info about archived partitions in parameters
504 |
505 | // Whether the partition is archived
506 | const string IS_ARCHIVED = "is_archived",
507 | // The original location of the partition, before archiving. After archiving,
508 | // this directory will contain the archive. When the partition
509 | // is dropped, this directory will be deleted
510 | const string ORIGINAL_LOCATION = "original_location",
511 |
512 | // these should be needed only for backward compatibility with filestore
513 | const string META_TABLE_COLUMNS = "columns",
514 | const string META_TABLE_COLUMN_TYPES = "columns.types",
515 | const string BUCKET_FIELD_NAME = "bucket_field_name",
516 | const string BUCKET_COUNT = "bucket_count",
517 | const string FIELD_TO_DIMENSION = "field_to_dimension",
518 | const string META_TABLE_NAME = "name",
519 | const string META_TABLE_DB = "db",
520 | const string META_TABLE_LOCATION = "location",
521 | const string META_TABLE_SERDE = "serde",
522 | const string META_TABLE_PARTITION_COLUMNS = "partition_columns",
523 | const string FILE_INPUT_FORMAT = "file.inputformat",
524 | const string FILE_OUTPUT_FORMAT = "file.outputformat",
525 | const string META_TABLE_STORAGE = "storage_handler",
526 |
527 |
528 |
529 |
--------------------------------------------------------------------------------
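Editorial note: the filter mini-language documented above for get_table_names_by_filter (LIKE/comparison operators, the HIVE_FILTER_FIELD_* key names) is easiest to see in an actual call. The following is a minimal, hypothetical Scala sketch, not part of this repository. It assumes a client generated from the hive_metastore.thrift shown above under the stock org.apache.hadoop.hive.metastore.api namespace, a metastore Thrift service on localhost:9083, and that the literal key prefix matches the HIVE_FILTER_FIELD_PARAMS constant declared in the file; the "retention" parameter name is illustrative.

import org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore
import org.apache.thrift.protocol.TBinaryProtocol
import org.apache.thrift.transport.TSocket

import scala.collection.JavaConverters._

object TableFilterExample {
  def main(args: Array[String]): Unit = {
    // Assumed metastore host/port; adjust for your deployment.
    val transport = new TSocket("localhost", 9083)
    transport.open()
    try {
      val client = new ThriftHiveMetastore.Client(new TBinaryProtocol(transport))

      // Filter on the "retention" table parameter. Per the comments in the IDL,
      // only = and <> are supported for HIVE_FILTER_FIELD_PARAMS keys, and the
      // parameter key name is appended directly to the constant's value.
      val filter = "hive_filter_field_params__retention = \"30\""
      val unlimited: Short = -1 // max_tables = -1 means no limit
      val names = client.get_table_names_by_filter("default", filter, unlimited)
      names.asScala.foreach(println)
    } finally {
      transport.close()
    }
  }
}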
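Editorial note: the partial partition specification convention for the get_partition*_ps calls (one positional value per partition column, with "" meaning unrestricted) can likewise be sketched with the same assumed generated client. The database, table, and partition values below are made up for illustration.

import java.util.Arrays

import org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore
import org.apache.thrift.protocol.TBinaryProtocol
import org.apache.thrift.transport.TSocket

import scala.collection.JavaConverters._

object PartialPartitionSpecExample {
  def main(args: Array[String]): Unit = {
    val transport = new TSocket("localhost", 9083) // assumed metastore host/port
    transport.open()
    try {
      val client = new ThriftHiveMetastore.Client(new TBinaryProtocol(transport))

      // For a table partitioned by (day, type): restrict day, leave type as ""
      // so every `type` partition under that day matches. part_vals is positional
      // and may be shorter than the number of partition columns.
      val partVals = Arrays.asList("2014-01-01", "")
      val unlimited: Short = -1
      val names = client.get_partition_names_ps("default", "events", partVals, unlimited)
      names.asScala.foreach(println)
    } finally {
      transport.close()
    }
  }
}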
/herringbone-main/src/test/resources/test.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stripe-archive/herringbone/4f0524287ef47fc897702d654572bbeee1004879/herringbone-main/src/test/resources/test.parquet
--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/FlattenJobTest.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.test
2 |
3 | import com.stripe.herringbone.flatten._
4 | import org.scalatest._
5 | import parquet.example.Paper
6 | import parquet.io.api.Binary
7 |
8 | class FlattenJobTest extends FlatSpec with Matchers {
9 | def toBinary(x: Array[Byte]) = Binary.fromByteArray(x)
10 |
11 | "truncate" should "truncate to correct length" in {
12 | val consumer = new FlatConsumer(Paper.r1, "__", false)
13 | val bytes = toBinary(Array[Byte](1,2,3,4))
14 | assert(consumer.truncate(bytes, 3).getBytes().sameElements(Array[Byte](1,2,3)))
15 | }
16 |
17 | "truncate" should "not truncate if unnecessary" in {
18 | val consumer = new FlatConsumer(Paper.r1, "__", false)
19 | val bytes = toBinary(Array[Byte](1,2,3,4))
20 | assert(consumer.truncate(bytes, 8) == bytes)
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/FlatConverterTest.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.test
2 |
3 | import com.stripe.herringbone.flatten.{FlatConverter,TypeFlattener}
4 |
5 | import org.scalatest._
6 | import org.apache.hadoop.fs.Path
7 |
8 | import parquet.example.Paper
9 | import parquet.example.data.simple.SimpleGroup
10 | import parquet.example.data.GroupWriter
11 | import parquet.schema.MessageType
12 | import parquet.schema.PrimitiveType
13 | import parquet.schema.Type.Repetition.OPTIONAL
14 | import parquet.schema.Type.Repetition.REQUIRED
15 | import parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY
16 |
17 | import scala.collection.mutable.StringBuilder
18 | import java.io.StringWriter
19 |
20 | class FlatConverterTest extends FlatSpec with Matchers {
21 |
22 | def nestedGroupFixture =
23 | new {
24 | val group = Paper.r1
25 | val schema = Paper.schema
26 | val flatSchema = TypeFlattener.flatten(schema, None, "__", true)
27 | val flatGroup = FlatConverter.flattenGroup(group, flatSchema, "__", true)
28 | }
29 |
30 | def flatGroupFixture =
31 | new {
32 | val flatSchema =
33 | new MessageType("Charge",
34 | new PrimitiveType(REQUIRED, BINARY, "_id"),
35 | new PrimitiveType(OPTIONAL, BINARY, "email"),
36 | new PrimitiveType(REQUIRED, BINARY, "merchant")
37 | )
38 | val flatGroupMissingFields = new SimpleGroup(flatSchema)
39 | flatGroupMissingFields.add("_id", "ch_1")
40 | flatGroupMissingFields.add("merchant", "acct_1")
41 | val flatGroupAllFields = new SimpleGroup(flatSchema)
42 | flatGroupAllFields.add("email", "bob@stripe.com")
43 | flatGroupAllFields.add("merchant", "acct_1")
44 | flatGroupAllFields.add("_id", "ch_1")
45 | }
46 |
47 | "groupToTSV" should "convert a flattened group" in {
48 | val f = nestedGroupFixture
49 | val groupTSV = FlatConverter.groupToTSV(f.flatGroup, f.flatSchema, "__", true)
50 | assert(groupTSV == "10\t\t20,40,60")
51 | }
52 |
53 | "groupToTSV" should "respect schema ordering, handle optional fields" in {
54 | val f = flatGroupFixture
55 | val missingTSV = FlatConverter.groupToTSV(f.flatGroupMissingFields, f.flatSchema, "__", true)
56 | assert(missingTSV == "ch_1\t\tacct_1")
57 | val allTSV = FlatConverter.groupToTSV(f.flatGroupAllFields, f.flatSchema, "__", true)
58 | assert(allTSV == "ch_1\tbob@stripe.com\tacct_1")
59 | }
60 | }
61 |
62 |
--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/TypeFlattenerTest.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.test
2 |
3 | import com.stripe.herringbone.flatten.TypeFlattener
4 |
5 | import org.scalatest._
6 |
7 | import parquet.schema.GroupType
8 | import parquet.schema.MessageType
9 | import parquet.schema.PrimitiveType
10 | import parquet.schema.Type.Repetition.OPTIONAL
11 | import parquet.schema.Type.Repetition.REPEATED
12 | import parquet.schema.Type.Repetition.REQUIRED
13 | import parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY
14 | import parquet.schema.PrimitiveType.PrimitiveTypeName.INT64
15 |
16 | class TypeFlattenerTest extends FlatSpec with Matchers {
17 |
18 | "flatten" should "omit the idField in nested fieldname if specified" in {
19 | val input = new MessageType("Document",
20 | new PrimitiveType(OPTIONAL, BINARY, "_id"),
21 | new GroupType(OPTIONAL, "Page",
22 | new PrimitiveType(OPTIONAL, BINARY, "_id")))
23 |
24 | val expected = new MessageType("Document",
25 | new PrimitiveType(OPTIONAL, BINARY, "_id"),
26 | new PrimitiveType(OPTIONAL, BINARY, "Page"))
27 |
28 | val result = TypeFlattener.flatten(input, None, "__", true)
29 | assert(expected == result)
30 | }
31 |
32 | "flatten" should "not omit the idField in nested fieldname if none is specified" in {
33 | val input = new MessageType("Document",
34 | new PrimitiveType(OPTIONAL, BINARY, "_id"),
35 | new GroupType(OPTIONAL, "Page",
36 | new PrimitiveType(OPTIONAL, BINARY, "_id")))
37 |
38 | val expected = new MessageType("Document",
39 | new PrimitiveType(OPTIONAL, BINARY, "_id"),
40 | new PrimitiveType(OPTIONAL, BINARY, "Page___id"))
41 |
42 | val result = TypeFlattener.flatten(input, None, "__", false)
43 | assert(expected == result)
44 | }
45 |
46 | "flatten" should "not include repeated groups" in {
47 | val input = new MessageType("Document",
48 | new PrimitiveType(OPTIONAL, BINARY, "_id"),
49 | new GroupType(REPEATED, "Nope",
50 | new PrimitiveType(REPEATED, INT64, "Never")))
51 |
52 | val expected = new MessageType("Document",
53 | new PrimitiveType(OPTIONAL, BINARY, "_id"))
54 |
55 | val result = TypeFlattener.flatten(input, None, "__", true)
56 | assert(expected == result)
57 | }
58 |
59 | "flatten" should "set all fields as optional" in {
60 | val input = new MessageType("Document",
61 | new GroupType(OPTIONAL, "Yep",
62 | new GroupType(REQUIRED, "Grouped",
63 | new PrimitiveType(REQUIRED, BINARY, "Yes"),
64 | new PrimitiveType(REPEATED, BINARY, "Maybe")),
65 | new PrimitiveType(OPTIONAL, BINARY, "Sometimes")))
66 |
67 | val expected = new MessageType("Document",
68 | new PrimitiveType(OPTIONAL, BINARY, "Yep__Grouped__Yes"),
69 | new PrimitiveType(OPTIONAL, BINARY, "Yep__Grouped__Maybe"),
70 | new PrimitiveType(OPTIONAL, BINARY, "Yep__Sometimes"))
71 |
72 | val result = TypeFlattener.flatten(input, None, "__", true)
73 | assert(expected == result)
74 | }
75 |
76 | "flatten" should "preserve the order of previously flattened fields" in {
77 | val input = new MessageType("Document",
78 | new PrimitiveType(REQUIRED, BINARY, "Old__Two"),
79 | new GroupType(OPTIONAL, "New",
80 | new PrimitiveType(REQUIRED, BINARY, "One")),
81 | new PrimitiveType(REQUIRED, BINARY, "Old__One"))
82 |
83 | val old = new MessageType("Document",
84 | new PrimitiveType(OPTIONAL, BINARY, "Old__One"),
85 | new PrimitiveType(OPTIONAL, BINARY, "Old__Two"))
86 |
87 | val expected = new MessageType("Document",
88 | new PrimitiveType(OPTIONAL, BINARY, "Old__One"),
89 | new PrimitiveType(OPTIONAL, BINARY, "Old__Two"),
90 | new PrimitiveType(OPTIONAL, BINARY, "New__One"))
91 |
92 | val result = TypeFlattener.flatten(input, Some(old), "__", true)
93 | assert(expected == result)
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/load/FieldUtilsTest.scala:
--------------------------------------------------------------------------------
1 | package com.stripe.herringbone.test.load
2 |
3 | import com.stripe.herringbone.load.{FieldUtils, HadoopFs, ImpalaHiveSchemaTypeMapper}
4 | import org.apache.hadoop.fs._
5 | import org.scalamock.scalatest.MockFactory
6 | import org.scalatest._
7 | import parquet.schema.{PrimitiveType, Type}
8 |
9 | class FieldUtilsTest extends FlatSpec with Matchers with MockFactory {
10 |
11 | "findPartitionFields" should "find the partition field names and types" in {
12 | val hadoopFs = mock[HadoopFs]
13 | val path = new Path("path")
14 |
15 | val partitions = List(("day", "123"), ("type", "foo"))
16 | (hadoopFs.findPartitions _).expects(path).returning(partitions)
17 |
18 | val expected = List("`day` int", "`type` string")
19 | FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper).findPartitionFields(path) should equal (expected)
20 | }
21 |
22 | "tableFieldsFromSchemaFields" should "find the table fields from the parquet schema" in {
23 | val hadoopFs = mock[HadoopFs]
24 | val optional = Type.Repetition.valueOf("OPTIONAL")
25 | val input = List(
26 | new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("BINARY"), "a"),
27 | new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT32"), "b"),
28 | new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT64"), "c"),
29 | new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT96"), "d"),
30 | new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("DOUBLE"), "e"),
31 | new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("BOOLEAN"), "f"),
32 | new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("FLOAT"), "g"),
33 | new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("FIXED_LEN_BYTE_ARRAY"), "h")
34 | )
35 |
36 | val expected = List(
37 | "`a` STRING",
38 | "`b` INT",
39 | "`c` BIGINT",
40 | "`d` BIGINT",
41 | "`e` DOUBLE",
42 | "`f` BOOLEAN",
43 | "`g` FLOAT",
44 | "`h` BINARY"
45 | )
46 |
47 | FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper).tableFieldsFromSchemaFields(input) should equal (expected)
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>com.stripe</groupId>
6 |   <artifactId>herringbone</artifactId>
7 |   <version>0.0.1</version>
8 |   <packaging>pom</packaging>
9 |
10 |   <name>Herringbone</name>
11 |
12 |   <modules>
13 |     <module>herringbone-impala</module>
14 |     <module>herringbone-main</module>
15 |   </modules>
16 |
17 | </project>
18 |
--------------------------------------------------------------------------------