├── .gitignore ├── LICENSE ├── README.md ├── bin └── herringbone ├── herringbone-impala ├── pom.xml └── src │ └── main │ ├── scala │ └── com │ │ └── stripe │ │ └── herringbone │ │ └── impala │ │ ├── Connection.scala │ │ ├── Cursor.scala │ │ ├── Exceptions.scala │ │ ├── ImpalaClient.scala │ │ └── ImpalaValue.scala │ └── thrift │ ├── ImpalaService.thrift │ ├── Status.thrift │ ├── beeswax.thrift │ ├── cli_service.thrift │ ├── fb303.thrift │ └── hive_metastore.thrift ├── herringbone-main ├── pom.xml └── src │ ├── main │ ├── scala │ │ └── com │ │ │ └── stripe │ │ │ └── herringbone │ │ │ ├── CompactInputFormat.scala │ │ │ ├── CompactJob.scala │ │ │ ├── FlattenJob.scala │ │ │ ├── ParquetLoad.scala │ │ │ ├── TsvJob.scala │ │ │ ├── flatten │ │ │ ├── FlatConsumer.scala │ │ │ ├── FlatConverter.scala │ │ │ ├── ParquetFlatConf.scala │ │ │ ├── ParquetFlatMapper.scala │ │ │ └── TypeFlattener.scala │ │ │ ├── load │ │ │ ├── FieldUtils.scala │ │ │ ├── HadoopFs.scala │ │ │ ├── HiveLoader.scala │ │ │ ├── HiveServer2Connection.scala │ │ │ ├── ImpalaLoader.scala │ │ │ ├── ParquetLoadConf.scala │ │ │ └── ParquetLoader.scala │ │ │ └── util │ │ │ └── ParquetUtils.scala │ └── thrift │ │ ├── ImpalaService.thrift │ │ ├── Status.thrift │ │ ├── beeswax.thrift │ │ ├── cli_service.thrift │ │ ├── fb303.thrift │ │ └── hive_metastore.thrift │ └── test │ ├── resources │ └── test.parquet │ └── scala │ └── com │ └── stripe │ └── herringbone │ ├── FlattenJobTest.scala │ ├── flatten │ ├── FlatConverterTest.scala │ └── TypeFlattenerTest.scala │ └── load │ └── FieldUtilsTest.scala └── pom.xml /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | data/ 3 | .idea/ 4 | *.pyc 5 | *.iml 6 | # ignore ROC plots 7 | *.pdf 8 | .tddium* 9 | 10 | .DS_Store 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014- Stripe, Inc. (https://stripe.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Herringbone 2 | =========== 3 | 4 | > _**Herringbone is deprecated and is no longer being actively maintained.**_ 5 | 6 | Herringbone is a suite of tools for working with parquet files on hdfs, and with impala and hive. 7 | 8 | The available commands are: 9 | 10 | `flatten`: transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive (neither of which support nested schemas). Default output directory is `/path/to/input/directory-flat`. 11 | 12 | $ herringbone flatten -i /path/to/input/directory [-o /path/to/non/default/output/directory] 13 | 14 | `load`: load a directory of parquet files (which must have a flat schema) into impala or hive (defaulting to impala). Use the --nocompute-stats option for faster loading into impala (but probably slower querying later on!) 15 | 16 | $ herringbone load [--hive] [-u] [--nocompute-stats] -d db_name -t table -p /path/to/parquet/directory 17 | 18 | `tsv`: transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`). Default output directory is `/path/to/input/directory-tsv`. 19 | 20 | $ herringbone tsv -i /path/to/input/directory [-o /path/to/non/default/output/directory] 21 | 22 | `compact`: transform a directory of parquet files into a directory of fewer larger parquet files. Default output directory is `/path/to/input/directory-compact`. 23 | 24 | $ herringbone compact -i /path/to/input/directory [-o /path/to/non/default/output/directory] 25 | 26 | See `herringbone COMMAND --help` for more information on a specific command. 27 | 28 | Building 29 | -------- 30 | 31 | You'll need thrift 0.9.1 on your path. 32 | 33 | $ git clone github.com/stripe/herringbone 34 | $ cd herringbone 35 | $ mvn package 36 | 37 | Authors 38 | ------- 39 | 40 | - [Avi Bryant](http://twitter.com/avibryant) 41 | - [Danielle Sucher](http://twitter.com/daniellesucher) 42 | - [Jeff Balogh](http://twitter.com/jbalogh) 43 | -------------------------------------------------------------------------------- /bin/herringbone: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | usage = <<-USAGE 4 | Herringbone is a suite of tools for working with parquet files on hdfs. 5 | 6 | The available commands are: 7 | 8 | flatten: Transform a directory of parquet files with a nested structure into a directory of parquet files with a flat schema that can be loaded into impala or hive 9 | 10 | load: Load a directory of parquet files (which must have a flat schema) into impala or hive (defaults to impala). Use the --nocompute-stats option for faster loading into impala (but probably slower querying later on!) 
11 | 12 | tsv: Transform a directory of parquet files into a directory of tsv files (which you can concat properly later with `hadoop fs -getmerge /path/to/tsvs`) 13 | 14 | compact: Transform a directory of parquet files into a directory of fewer larger parquet files 15 | 16 | 17 | Example usage: 18 | 19 | `herringbone flatten -i /path/to/input/directory -o /path/to/output/directory` 20 | 21 | `herringbone load [--hive] [-u] [--nocompute-stats] -d db_name -t table -p /path/to/parquet/directory` 22 | 23 | `herringbone tsv -i /path/to/input/directory -o /path/to/output/directory` 24 | 25 | `herringbone compact -i /path/to/input/directory -o /path/to/output/directory` 26 | 27 | 28 | See 'herringbone COMMAND --help' for more information on a specific command. 29 | 30 | 31 | USAGE 32 | 33 | command_jobs = { 34 | 'compact' => 'CompactJob', 35 | 'load' => 'ParquetLoad', 36 | 'flatten' => 'FlattenJob', 37 | 'tsv' => 'TsvJob', 38 | } 39 | 40 | # Validate the given command and print usage if needed. 41 | command = ARGV.shift 42 | JOB = command_jobs[command] 43 | 44 | if ['-h', '--help'].include?(command) 45 | puts usage 46 | exit 0 47 | elsif !JOB 48 | STDERR.puts "\nError: #{command} is not an available command\n\n" 49 | puts "#{'=' * 30}\n\n" 50 | puts usage 51 | exit 1 52 | end 53 | 54 | jar_path = File.join( 55 | File.dirname(__FILE__), 56 | '../', 57 | 'herringbone-main', 58 | 'target', 59 | 'herringbone-0.0.1-jar-with-dependencies.jar' 60 | ) 61 | JAR = File.expand_path(jar_path) 62 | 63 | ENV["HADOOP_CLASSPATH"] = JAR 64 | ENV["HADOOP_USER_CLASSPATH_FIRST"] = "true" 65 | 66 | exec( 67 | "hadoop", 68 | "jar", 69 | JAR, 70 | "com.stripe.herringbone.#{JOB}", 71 | *ARGV 72 | ) 73 | -------------------------------------------------------------------------------- /herringbone-impala/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.stripe 6 | herringbone-impala 7 | 0.0.2 8 | jar 9 | 10 | Herringbone Impala 11 | 12 | 13 | 14 | dtrott 15 | https://maven.davidtrott.com/repository 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.apache.maven.plugins 23 | maven-compiler-plugin 24 | 3.1 25 | 26 | 1.6 27 | 1.6 28 | 29 | 30 | 31 | 32 | maven-jar-plugin 33 | 2.3.1 34 | 35 | 36 | 37 | maven-resources-plugin 38 | 2.4.3 39 | 40 | 41 | 42 | net.alchim31.maven 43 | scala-maven-plugin 44 | 3.1.6 45 | 46 | 47 | 48 | compile 49 | testCompile 50 | 51 | 52 | 53 | 54 | 55 | 56 | org.apache.thrift.tools 57 | maven-thrift-plugin 58 | 0.1.11 59 | 60 | true 61 | thrift 62 | 63 | 64 | 65 | thrift-sources 66 | generate-sources 67 | 68 | compile 69 | 70 | 71 | 72 | thrift-test-sources 73 | generate-test-sources 74 | 75 | testCompile 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | UTF-8 86 | 2.10.4 87 | 1.6 88 | 1.6 89 | 90 | 91 | 92 | 93 | cloudera-releases 94 | https://repository.cloudera.com/artifactory/cloudera-repos 95 | 96 | true 97 | 98 | 99 | false 100 | 101 | 102 | 103 | 104 | 105 | 106 | org.apache.thrift 107 | libthrift 108 | 0.12.0 109 | 110 | 111 | org.slf4j 112 | slf4j-log4j12 113 | 1.5.2 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Connection.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.impala 2 | 3 | import org.apache.thrift.transport.TSocket 4 | import org.apache.thrift.protocol.TBinaryProtocol 5 | 6 | import 
com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient} 7 | import com.cloudera.beeswax.api._ 8 | 9 | import scala.annotation.tailrec 10 | import scala.collection.JavaConversions._ 11 | 12 | case class Connection(host: String, port: Int) { 13 | var isOpen = false 14 | val logContext = "herringbone-impala" 15 | lazy val socket = new TSocket(host, port) 16 | lazy val client = new ClouderaImpalaClient(new TBinaryProtocol(socket)) 17 | 18 | open 19 | 20 | def open = { 21 | if (!isOpen) { 22 | socket.open 23 | client.ResetCatalog 24 | isOpen = true 25 | } 26 | } 27 | 28 | def close = { 29 | if (isOpen) { 30 | socket.close 31 | isOpen = false 32 | } 33 | } 34 | 35 | // Refresh the metadata store. 36 | def refresh = { 37 | if (!isOpen) throw ConnectionException("Connection closed") 38 | client.ResetCatalog 39 | } 40 | 41 | // Perform a query, and pass in a function that will be called with each 42 | // row of the results 43 | def query(raw: String)(fn: Seq[ImpalaValue] => Unit) { 44 | val cursor = execute(raw) 45 | cursor.foreach { row => fn(row) } 46 | cursor.close 47 | } 48 | 49 | // Perform a query and return a cursor for iterating over the results. 50 | // You probably want to call cursor.close when you're done with it. 51 | def execute(raw: String): Cursor = { 52 | if (!isOpen) throw ConnectionException("Connection closed") 53 | validateQuery(raw) 54 | 55 | val query = new Query 56 | query.query = raw 57 | 58 | val handle = client.executeAndWait(query, logContext) 59 | Cursor(handle, client) 60 | } 61 | 62 | private def validateQuery(raw: String) = { 63 | val words = raw.split("\\s+") 64 | if (words.isEmpty) throw InvalidQueryException("Empty query") 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Cursor.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.impala 2 | 3 | import org.apache.hadoop.hive.metastore.api.FieldSchema 4 | 5 | import com.cloudera.impala.thrift.ImpalaService.{Client => ClouderaImpalaClient} 6 | import com.cloudera.beeswax.api._ 7 | 8 | import scala.collection.mutable.ArrayBuffer 9 | import scala.collection.JavaConversions._ 10 | 11 | case class Cursor(handle: QueryHandle, client: ClouderaImpalaClient) { 12 | var done = false 13 | var isOpen = true 14 | var rowBuffer = ArrayBuffer.empty[Seq[ImpalaValue]] 15 | val bufferSize = 1024 16 | private lazy val metadata: ResultsMetadata = client.get_results_metadata(handle) 17 | 18 | def foreach(fn: Seq[ImpalaValue] => Unit) = { 19 | var row = fetchRow 20 | while (row.isDefined) { 21 | fn(row.get) 22 | row = fetchRow 23 | } 24 | } 25 | 26 | def fetchRow: Option[Seq[ImpalaValue]] = { 27 | if (rowBuffer.isEmpty) { 28 | if (done) { 29 | None 30 | } else { 31 | fetchMore 32 | fetchRow 33 | } 34 | } else { 35 | val row = rowBuffer.head 36 | rowBuffer = rowBuffer.tail 37 | Some(row) 38 | } 39 | } 40 | 41 | // Close the cursor on the remote server. Once a cursor is closed, you 42 | // can no longer fetch any rows from it. 43 | def close = { 44 | if (isOpen) { 45 | isOpen = false 46 | client.close(handle) 47 | } 48 | } 49 | 50 | // Returns true if there are any more rows to fetch.
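// Illustrative sketch only, assuming a `connection: Connection` already opened to an
// impalad (the query text and column handling below are hypothetical): the cursor can
// also be drained by hand with `hasMore` and `fetchRow`, e.g.
//   val cursor = connection.execute("SELECT * FROM some_table")
//   while (cursor.hasMore) {
//     cursor.fetchRow.foreach { row => println(row.map(_.raw).mkString("\t")) }
//   }
//   cursor.close
// `foreach` above wraps this same fetchRow loop, and fetchBatch closes the cursor once
// the server reports there are no more rows.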
51 | def hasMore = !done || !rowBuffer.isEmpty 52 | 53 | def runtime_profile = client.GetRuntimeProfile(handle) 54 | 55 | private def fetchMore = { 56 | while (!done && rowBuffer.size < bufferSize) { 57 | fetchBatch 58 | } 59 | } 60 | 61 | private def fetchBatch = { 62 | if (!isOpen) throw CursorException("Cursor has expired or been closed") 63 | 64 | try { 65 | val response = client.fetch(handle, false, bufferSize) 66 | validateQueryState(client.get_state(handle)) 67 | 68 | val rows = response.data.map { row => parseRow(row) } 69 | rowBuffer ++= rows 70 | 71 | if (!response.has_more) { 72 | done = true 73 | close 74 | } 75 | } catch { 76 | case e: BeeswaxException => { 77 | isOpen = false 78 | throw e 79 | } 80 | case e: Exception => throw e 81 | } 82 | } 83 | 84 | private def parseRow(row: String) = { 85 | val fields = row.split(metadata.delim) 86 | 87 | metadata.schema.getFieldSchemas.zip(fields).map { case(schema, rawValue) => 88 | ImpalaValue(rawValue, schema.getName, schema.getType) 89 | } 90 | } 91 | 92 | private def validateQueryState(state: QueryState) = { 93 | if (state == QueryState.EXCEPTION) { 94 | close 95 | throw CursorException("The query was aborted") 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/scala/com/stripe/herringbone/impala/Exceptions.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.impala 2 | 3 | case class ConnectionException(message: String) extends Exception 4 | case class CursorException(message: String) extends Exception 5 | case class InvalidQueryException(message: String) extends Exception 6 | case class ParsingException(message: String) extends Exception 7 | 8 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaClient.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.impala 2 | 3 | case class ImpalaClient(host: String, port: Int) { 4 | lazy val connection = Connection(host, port) 5 | 6 | def execute(raw: String) { 7 | query(raw){ row => 8 | println(row.map { _.raw }.mkString(" ")) 9 | } 10 | } 11 | 12 | def query(raw: String)(fn: Seq[ImpalaValue] => Unit) { 13 | println(raw) 14 | connection.query(raw){ row => fn(row) } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/scala/com/stripe/herringbone/impala/ImpalaValue.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.impala 2 | 3 | import java.text.SimpleDateFormat 4 | 5 | case class ImpalaValue(raw: String, fieldName: String, fieldType: String) { 6 | lazy val convertedValue = convertRawValue(raw) 7 | 8 | private def convertRawValue(raw: String): Option[Any] = { 9 | if (raw == "NULL") { 10 | None 11 | } else { 12 | val converted = fieldType match { 13 | case "string" => raw 14 | case "boolean" => convertBoolean(raw) 15 | case "tinyint" | "smallint" | "int" | "bigint" => raw.toInt 16 | case "double" | "float" | "decimal" => raw.toDouble 17 | case "timestamp" => convertTimestamp(raw) 18 | case _ => throw ParsingException("Unknown type: " + fieldType) 19 | } 20 | Some(converted) 21 | } 22 | } 23 | 24 | private def convertBoolean(raw: String) = { 25 | try { 26 | raw.toBoolean 27 | } catch { 28 | case e: 
java.lang.IllegalArgumentException => 29 | throw ParsingException("Invalid value for boolean: " + raw) 30 | } 31 | } 32 | 33 | private def convertTimestamp(raw: String) = { 34 | val formatStr = if (raw.indexOf(".") == -1) { 35 | "YYYY-MM-DD HH:MM:SS" 36 | } else { 37 | "YYYY-MM-DD HH:MM:SS.sssssssss" 38 | } 39 | 40 | val dateFormat = new SimpleDateFormat(formatStr) 41 | dateFormat.parse(raw) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/thrift/ImpalaService.thrift: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Cloudera Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | namespace cpp impala 16 | namespace java com.cloudera.impala.thrift 17 | namespace rb impala.protocol 18 | 19 | include "Status.thrift" 20 | include "beeswax.thrift" 21 | include "cli_service.thrift" 22 | 23 | // ImpalaService accepts query execution options through beeswax.Query.configuration in 24 | // key:value form. For example, the list of strings could be: 25 | // "num_nodes:1", "abort_on_error:false" 26 | // The valid keys are listed in this enum. They map to TQueryOptions. 27 | // Note: If you add an option or change the default, you also need to update: 28 | // - ImpalaInternalService.thrift: TQueryOptions 29 | // - ImpaladClientExecutor.getBeeswaxQueryConfigurations() 30 | // - ImpalaServer::SetQueryOptions() 31 | // - ImpalaServer::TQueryOptionsToMap() 32 | enum TImpalaQueryOptions { 33 | // if true, abort execution on the first error 34 | ABORT_ON_ERROR, 35 | 36 | // maximum # of errors to be reported; Unspecified or 0 indicates backend default 37 | MAX_ERRORS, 38 | 39 | // if true, disable llvm codegen 40 | DISABLE_CODEGEN, 41 | 42 | // batch size to be used by backend; Unspecified or a size of 0 indicates backend 43 | // default 44 | BATCH_SIZE, 45 | 46 | // a per-machine approximate limit on the memory consumption of this query; 47 | // unspecified or a limit of 0 means no limit; 48 | // otherwise specified either as: 49 | // a) an int (= number of bytes); 50 | // b) a float followed by "M" (MB) or "G" (GB) 51 | MEM_LIMIT, 52 | 53 | // specifies the degree of parallelism with which to execute the query; 54 | // 1: single-node execution 55 | // NUM_NODES_ALL: executes on all nodes that contain relevant data 56 | // NUM_NODES_ALL_RACKS: executes on one node per rack that holds relevant data 57 | // > 1: executes on at most that many nodes at any point in time (ie, there can be 58 | // more nodes than numNodes with plan fragments for this query, but at most 59 | // numNodes would be active at any point in time) 60 | // Constants (NUM_NODES_ALL, NUM_NODES_ALL_RACKS) are defined in JavaConstants.thrift. 
61 | NUM_NODES, 62 | 63 | // maximum length of the scan range; only applicable to HDFS scan range; Unspecified or 64 | // a length of 0 indicates backend default; 65 | MAX_SCAN_RANGE_LENGTH, 66 | 67 | // Maximum number of io buffers (per disk) 68 | MAX_IO_BUFFERS, 69 | 70 | // Number of scanner threads. 71 | NUM_SCANNER_THREADS, 72 | 73 | // If true, Impala will try to execute on file formats that are not fully supported yet 74 | ALLOW_UNSUPPORTED_FORMATS, 75 | 76 | // if set and > -1, specifies the default limit applied to a top-level SELECT statement 77 | // with an ORDER BY but without a LIMIT clause (ie, if the SELECT statement also has 78 | // a LIMIT clause, this default is ignored) 79 | DEFAULT_ORDER_BY_LIMIT, 80 | 81 | // DEBUG ONLY: 82 | // If set to 83 | // "[:]::", 84 | // the exec node with the given id will perform the specified action in the given 85 | // phase. If the optional backend number (starting from 0) is specified, only that 86 | // backend instance will perform the debug action, otherwise all backends will behave 87 | // in that way. 88 | // If the string doesn't have the required format or if any of its components is 89 | // invalid, the option is ignored. 90 | DEBUG_ACTION, 91 | 92 | // If true, raise an error when the DEFAULT_ORDER_BY_LIMIT has been reached. 93 | ABORT_ON_DEFAULT_LIMIT_EXCEEDED, 94 | 95 | // Compression codec for parquet when inserting into parquet tables. 96 | // Valid values are "snappy", "gzip" and "none" 97 | // Leave blank to use default. 98 | PARQUET_COMPRESSION_CODEC, 99 | 100 | // HBase scan query option. If set and > 0, HBASE_CACHING is the value for 101 | // "hbase.client.Scan.setCaching()" when querying HBase table. Otherwise, use backend 102 | // default. 103 | // If the value is too high, then the hbase region server will have a hard time (GC 104 | // pressure and long response times). If the value is too small, then there will be 105 | // extra trips to the hbase region server. 106 | HBASE_CACHING, 107 | 108 | // HBase scan query option. If set, HBase scan will always set 109 | // "hbase.client.setCacheBlocks" to CACHE_BLOCKS. Default is false. 110 | // If the table is large and the query is doing big scan, set it to false to 111 | // avoid polluting the cache in the hbase region server. 112 | // If the table is small and the table is used several time, set it to true to improve 113 | // performance. 114 | HBASE_CACHE_BLOCKS, 115 | } 116 | 117 | // The summary of an insert. 118 | struct TInsertResult { 119 | // Number of appended rows per modified partition. Only applies to HDFS tables. 120 | // The keys represent partitions to create, coded as k1=v1/k2=v2/k3=v3..., with the 121 | // root in an unpartitioned table being the empty string. 122 | 1: required map rows_appended 123 | } 124 | 125 | // Response from a call to PingImpalaService 126 | struct TPingImpalaServiceResp { 127 | // The Impala service's version string. 128 | 1: string version 129 | } 130 | 131 | // Parameters for a ResetTable request which will invalidate a table's metadata. 132 | // DEPRECATED. 133 | struct TResetTableReq { 134 | // Name of the table's parent database. 135 | 1: required string db_name 136 | 137 | // Name of the table. 
138 | 2: required string table_name 139 | } 140 | 141 | // For all rpc that return a TStatus as part of their result type, 142 | // if the status_code field is set to anything other than OK, the contents 143 | // of the remainder of the result type is undefined (typically not set) 144 | service ImpalaService extends beeswax.BeeswaxService { 145 | // Cancel execution of query. Returns RUNTIME_ERROR if query_id 146 | // unknown. 147 | // This terminates all threads running on behalf of this query at 148 | // all nodes that were involved in the execution. 149 | // Throws BeeswaxException if the query handle is invalid (this doesn't 150 | // necessarily indicate an error: the query might have finished). 151 | Status.TStatus Cancel(1:beeswax.QueryHandle query_id) 152 | throws(1:beeswax.BeeswaxException error); 153 | 154 | // Invalidates all catalog metadata, forcing a reload 155 | // DEPRECATED; execute query "invalidate metadata" to refresh metadata 156 | Status.TStatus ResetCatalog(); 157 | 158 | // Invalidates a specific table's catalog metadata, forcing a reload on the next access 159 | // DEPRECATED; execute query "refresh " to refresh metadata 160 | Status.TStatus ResetTable(1:TResetTableReq request) 161 | 162 | // Returns the runtime profile string for the given query handle. 163 | string GetRuntimeProfile(1:beeswax.QueryHandle query_id) 164 | throws(1:beeswax.BeeswaxException error); 165 | 166 | // Closes the query handle and return the result summary of the insert. 167 | TInsertResult CloseInsert(1:beeswax.QueryHandle handle) 168 | throws(1:beeswax.QueryNotFoundException error, 2:beeswax.BeeswaxException error2); 169 | 170 | // Client calls this RPC to verify that the server is an ImpalaService. Returns the 171 | // server version. 172 | TPingImpalaServiceResp PingImpalaService(); 173 | } 174 | 175 | // Impala HiveServer2 service 176 | service ImpalaHiveServer2Service extends cli_service.TCLIService { 177 | } 178 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/thrift/Status.thrift: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Cloudera Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | namespace cpp impala 16 | namespace java com.cloudera.impala.thrift 17 | namespace rb impala.protocol 18 | 19 | enum TStatusCode { 20 | OK, 21 | CANCELLED, 22 | ANALYSIS_ERROR, 23 | NOT_IMPLEMENTED_ERROR, 24 | RUNTIME_ERROR, 25 | MEM_LIMIT_EXCEEDED, 26 | INTERNAL_ERROR 27 | } 28 | 29 | struct TStatus { 30 | 1: required TStatusCode status_code 31 | 2: list error_msgs 32 | } 33 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/thrift/beeswax.thrift: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Cloudera, Inc. under one 3 | * or more contributor license agreements. 
See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Cloudera, Inc. licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * Interface for interacting with Beeswax Server 19 | */ 20 | 21 | namespace java com.cloudera.beeswax.api 22 | namespace py beeswaxd 23 | namespace cpp beeswax 24 | namespace rb impala.protocol.beeswax 25 | 26 | include "hive_metastore.thrift" 27 | 28 | // A Query 29 | struct Query { 30 | 1: string query; 31 | // A list of HQL commands to execute before the query. 32 | // This is typically defining UDFs, setting settings, and loading resources. 33 | 3: list configuration; 34 | 35 | // User and groups to "act as" for purposes of Hadoop. 36 | 4: string hadoop_user; 37 | } 38 | 39 | typedef string LogContextId 40 | 41 | enum QueryState { 42 | CREATED, 43 | INITIALIZED, 44 | COMPILED, 45 | RUNNING, 46 | FINISHED, 47 | EXCEPTION 48 | } 49 | 50 | struct QueryHandle { 51 | 1: string id; 52 | 2: LogContextId log_context; 53 | } 54 | 55 | struct QueryExplanation { 56 | 1: string textual 57 | } 58 | 59 | struct Results { 60 | // If set, data is valid. Otherwise, results aren't ready yet. 61 | 1: bool ready, 62 | // Columns for the results 63 | 2: list columns, 64 | // A set of results 65 | 3: list data, 66 | // The starting row of the results 67 | 4: i64 start_row, 68 | // Whether there are more results to fetch 69 | 5: bool has_more 70 | } 71 | 72 | /** 73 | * Metadata information about the results. 74 | * Applicable only for SELECT. 75 | */ 76 | struct ResultsMetadata { 77 | /** The schema of the results */ 78 | 1: hive_metastore.Schema schema, 79 | /** The directory containing the results. Not applicable for partition table. */ 80 | 2: string table_dir, 81 | /** If the results are straight from an existing table, the table name. */ 82 | 3: string in_tablename, 83 | /** Field delimiter */ 84 | 4: string delim, 85 | } 86 | 87 | exception BeeswaxException { 88 | 1: string message, 89 | // Use get_log(log_context) to retrieve any log related to this exception 90 | 2: LogContextId log_context, 91 | // (Optional) The QueryHandle that caused this exception 92 | 3: QueryHandle handle, 93 | 4: optional i32 errorCode = 0, 94 | 5: optional string SQLState = " " 95 | } 96 | 97 | exception QueryNotFoundException { 98 | } 99 | 100 | /** Represents a Hadoop-style configuration variable. */ 101 | struct ConfigVariable { 102 | 1: string key, 103 | 2: string value, 104 | 3: string description 105 | } 106 | 107 | service BeeswaxService { 108 | /** 109 | * Submit a query and return a handle (QueryHandle). The query runs asynchronously. 110 | */ 111 | QueryHandle query(1:Query query) throws(1:BeeswaxException error), 112 | 113 | /** 114 | * run a query synchronously and return a handle (QueryHandle). 115 | */ 116 | QueryHandle executeAndWait(1:Query query, 2:LogContextId clientCtx) 117 | throws(1:BeeswaxException error), 118 | 119 | /** 120 | * Get the query plan for a query. 
121 | */ 122 | QueryExplanation explain(1:Query query) 123 | throws(1:BeeswaxException error), 124 | 125 | /** 126 | * Get the results of a query. This is non-blocking. Caller should check 127 | * Results.ready to determine if the results are in yet. The call requests 128 | * the batch size of fetch. 129 | */ 130 | Results fetch(1:QueryHandle query_id, 2:bool start_over, 3:i32 fetch_size=-1) 131 | throws(1:QueryNotFoundException error, 2:BeeswaxException error2), 132 | 133 | /** 134 | * Get the state of the query 135 | */ 136 | QueryState get_state(1:QueryHandle handle) throws(1:QueryNotFoundException error), 137 | 138 | /** 139 | * Get the result metadata 140 | */ 141 | ResultsMetadata get_results_metadata(1:QueryHandle handle) 142 | throws(1:QueryNotFoundException error), 143 | 144 | /** 145 | * Used to test connection to server. A "noop" command. 146 | */ 147 | string echo(1:string s) 148 | 149 | /** 150 | * Returns a string representation of the configuration object being used. 151 | * Handy for debugging. 152 | */ 153 | string dump_config() 154 | 155 | /** 156 | * Get the log messages related to the given context. 157 | */ 158 | string get_log(1:LogContextId context) throws(1:QueryNotFoundException error) 159 | 160 | /* 161 | * Returns "default" configuration. 162 | */ 163 | list get_default_configuration(1:bool include_hadoop) 164 | 165 | /* 166 | * closes the query with given handle 167 | */ 168 | void close(1:QueryHandle handle) throws(1:QueryNotFoundException error, 169 | 2:BeeswaxException error2) 170 | 171 | /* 172 | * clean the log context for given id 173 | */ 174 | void clean(1:LogContextId log_context) 175 | } 176 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/thrift/cli_service.thrift: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. You may obtain a copy of the License at 8 | // 9 | // http://www.apache.org/licenses/LICENSE-2.0 10 | // 11 | // Unless required by applicable law or agreed to in writing, software 12 | // distributed under the License is distributed on an "AS IS" BASIS, 13 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | // See the License for the specific language governing permissions and 15 | // limitations under the License. 16 | 17 | // Coding Conventions for this file: 18 | // 19 | // Structs/Enums/Unions 20 | // * Struct, Enum, and Union names begin with a "T", 21 | // and use a capital letter for each new word, with no underscores. 22 | // * All fields should be declared as either optional or required. 23 | // 24 | // Functions 25 | // * Function names start with a capital letter and have a capital letter for 26 | // each new word, with no underscores. 27 | // * Each function should take exactly one parameter, named TFunctionNameReq, 28 | // and should return either void or TFunctionNameResp. This convention allows 29 | // incremental updates. 30 | // 31 | // Services 32 | // * Service names begin with the letter "T", use a capital letter for each 33 | // new word (with no underscores), and end with the word "Service". 
34 | 35 | namespace java org.apache.hive.service.cli.thrift 36 | namespace cpp apache.hive.service.cli.thrift 37 | namespace rb impala.protocol.hive 38 | 39 | // List of protocol versions. A new token should be 40 | // added to the end of this list every time a change is made. 41 | enum TProtocolVersion { 42 | HIVE_CLI_SERVICE_PROTOCOL_V1 43 | } 44 | 45 | enum TTypeId { 46 | BOOLEAN_TYPE, 47 | TINYINT_TYPE, 48 | SMALLINT_TYPE, 49 | INT_TYPE, 50 | BIGINT_TYPE, 51 | FLOAT_TYPE, 52 | DOUBLE_TYPE, 53 | STRING_TYPE, 54 | TIMESTAMP_TYPE, 55 | BINARY_TYPE, 56 | ARRAY_TYPE, 57 | MAP_TYPE, 58 | STRUCT_TYPE, 59 | UNION_TYPE, 60 | USER_DEFINED_TYPE, 61 | DECIMAL_TYPE 62 | } 63 | 64 | const set PRIMITIVE_TYPES = [ 65 | TTypeId.BOOLEAN_TYPE 66 | TTypeId.TINYINT_TYPE 67 | TTypeId.SMALLINT_TYPE 68 | TTypeId.INT_TYPE 69 | TTypeId.BIGINT_TYPE 70 | TTypeId.FLOAT_TYPE 71 | TTypeId.DOUBLE_TYPE 72 | TTypeId.STRING_TYPE 73 | TTypeId.TIMESTAMP_TYPE 74 | TTypeId.BINARY_TYPE, 75 | TTypeId.DECIMAL_TYPE 76 | ] 77 | 78 | const set COMPLEX_TYPES = [ 79 | TTypeId.ARRAY_TYPE 80 | TTypeId.MAP_TYPE 81 | TTypeId.STRUCT_TYPE 82 | TTypeId.UNION_TYPE 83 | TTypeId.USER_DEFINED_TYPE 84 | ] 85 | 86 | const set COLLECTION_TYPES = [ 87 | TTypeId.ARRAY_TYPE 88 | TTypeId.MAP_TYPE 89 | ] 90 | 91 | const map TYPE_NAMES = { 92 | TTypeId.BOOLEAN_TYPE: "BOOLEAN", 93 | TTypeId.TINYINT_TYPE: "TINYINT", 94 | TTypeId.SMALLINT_TYPE: "SMALLINT", 95 | TTypeId.INT_TYPE: "INT", 96 | TTypeId.BIGINT_TYPE: "BIGINT", 97 | TTypeId.FLOAT_TYPE: "FLOAT", 98 | TTypeId.DOUBLE_TYPE: "DOUBLE", 99 | TTypeId.STRING_TYPE: "STRING", 100 | TTypeId.TIMESTAMP_TYPE: "TIMESTAMP", 101 | TTypeId.BINARY_TYPE: "BINARY", 102 | TTypeId.ARRAY_TYPE: "ARRAY", 103 | TTypeId.MAP_TYPE: "MAP", 104 | TTypeId.STRUCT_TYPE: "STRUCT", 105 | TTypeId.UNION_TYPE: "UNIONTYPE" 106 | TTypeId.DECIMAL_TYPE: "DECIMAL" 107 | } 108 | 109 | // Thrift does not support recursively defined types or forward declarations, 110 | // which makes it difficult to represent Hive's nested types. 111 | // To get around these limitations TTypeDesc employs a type list that maps 112 | // integer "pointers" to TTypeEntry objects. The following examples show 113 | // how different types are represented using this scheme: 114 | // 115 | // "INT": 116 | // TTypeDesc { 117 | // types = [ 118 | // TTypeEntry.primitive_entry { 119 | // type = INT_TYPE 120 | // } 121 | // ] 122 | // } 123 | // 124 | // "ARRAY": 125 | // TTypeDesc { 126 | // types = [ 127 | // TTypeEntry.array_entry { 128 | // object_type_ptr = 1 129 | // }, 130 | // TTypeEntry.primitive_entry { 131 | // type = INT_TYPE 132 | // } 133 | // ] 134 | // } 135 | // 136 | // "MAP": 137 | // TTypeDesc { 138 | // types = [ 139 | // TTypeEntry.map_entry { 140 | // key_type_ptr = 1 141 | // value_type_ptr = 2 142 | // }, 143 | // TTypeEntry.primitive_entry { 144 | // type = INT_TYPE 145 | // }, 146 | // TTypeEntry.primitive_entry { 147 | // type = STRING_TYPE 148 | // } 149 | // ] 150 | // } 151 | 152 | typedef i32 TTypeEntryPtr 153 | 154 | // Type entry for a primitive type. 155 | struct TPrimitiveTypeEntry { 156 | // The primitive type token. This must satisfy the condition 157 | // that type is in the PRIMITIVE_TYPES set. 158 | 1: required TTypeId type 159 | } 160 | 161 | // Type entry for an ARRAY type. 162 | struct TArrayTypeEntry { 163 | 1: required TTypeEntryPtr objectTypePtr 164 | } 165 | 166 | // Type entry for a MAP type. 
167 | struct TMapTypeEntry { 168 | 1: required TTypeEntryPtr keyTypePtr 169 | 2: required TTypeEntryPtr valueTypePtr 170 | } 171 | 172 | // Type entry for a STRUCT type. 173 | struct TStructTypeEntry { 174 | 1: required map nameToTypePtr 175 | } 176 | 177 | // Type entry for a UNIONTYPE type. 178 | struct TUnionTypeEntry { 179 | 1: required map nameToTypePtr 180 | } 181 | 182 | struct TUserDefinedTypeEntry { 183 | // The fully qualified name of the class implementing this type. 184 | 1: required string typeClassName 185 | } 186 | 187 | // We use a union here since Thrift does not support inheritance. 188 | union TTypeEntry { 189 | 1: TPrimitiveTypeEntry primitiveEntry 190 | 2: TArrayTypeEntry arrayEntry 191 | 3: TMapTypeEntry mapEntry 192 | 4: TStructTypeEntry structEntry 193 | 5: TUnionTypeEntry unionEntry 194 | 6: TUserDefinedTypeEntry userDefinedTypeEntry 195 | } 196 | 197 | // Type descriptor for columns. 198 | struct TTypeDesc { 199 | // The "top" type is always the first element of the list. 200 | // If the top type is an ARRAY, MAP, STRUCT, or UNIONTYPE 201 | // type, then subsequent elements represent nested types. 202 | 1: required list types 203 | } 204 | 205 | // A result set column descriptor. 206 | struct TColumnDesc { 207 | // The name of the column 208 | 1: required string columnName 209 | 210 | // The type descriptor for this column 211 | 2: required TTypeDesc typeDesc 212 | 213 | // The ordinal position of this column in the schema 214 | 3: required i32 position 215 | 216 | 4: optional string comment 217 | } 218 | 219 | // Metadata used to describe the schema (column names, types, comments) 220 | // of result sets. 221 | struct TTableSchema { 222 | 1: required list columns 223 | } 224 | 225 | // A Boolean column value. 226 | struct TBoolValue { 227 | // NULL if value is unset. 228 | 1: optional bool value 229 | } 230 | 231 | // A Byte column value. 232 | struct TByteValue { 233 | // NULL if value is unset. 234 | 1: optional byte value 235 | } 236 | 237 | // A signed, 16 bit column value. 238 | struct TI16Value { 239 | // NULL if value is unset 240 | 1: optional i16 value 241 | } 242 | 243 | // A signed, 32 bit column value 244 | struct TI32Value { 245 | // NULL if value is unset 246 | 1: optional i32 value 247 | } 248 | 249 | // A signed 64 bit column value 250 | struct TI64Value { 251 | // NULL if value is unset 252 | 1: optional i64 value 253 | } 254 | 255 | // A floating point 64 bit column value 256 | struct TDoubleValue { 257 | // NULL if value is unset 258 | 1: optional double value 259 | } 260 | 261 | struct TStringValue { 262 | // NULL if value is unset 263 | 1: optional string value 264 | } 265 | 266 | union TColumn { 267 | 1: list boolColumn 268 | 2: list byteColumn 269 | 3: list i16Column 270 | 4: list i32Column 271 | 5: list i64Column 272 | 6: list doubleColumn 273 | 7: list stringColumn 274 | } 275 | 276 | // A single column value in a result set. 277 | // Note that Hive's type system is richer than Thrift's, 278 | // so in some cases we have to map multiple Hive types 279 | // to the same Thrift type. On the client-side this is 280 | // disambiguated by looking at the Schema of the 281 | // result set. 
282 | union TColumnValue { 283 | 1: TBoolValue boolVal // BOOLEAN 284 | 2: TByteValue byteVal // TINYINT 285 | 3: TI16Value i16Val // SMALLINT 286 | 4: TI32Value i32Val // INT 287 | 5: TI64Value i64Val // BIGINT, TIMESTAMP 288 | 6: TDoubleValue doubleVal // FLOAT, DOUBLE 289 | 7: TStringValue stringVal // STRING, LIST, MAP, STRUCT, UNIONTYPE, BINARY, DECIMAL 290 | } 291 | 292 | // Represents a row in a rowset. 293 | struct TRow { 294 | 1: required list colVals 295 | } 296 | 297 | // Represents a rowset 298 | struct TRowSet { 299 | // The starting row offset of this rowset. 300 | 1: required i64 startRowOffset 301 | 2: required list rows 302 | 3: optional list columns 303 | } 304 | 305 | // The return status code contained in each response. 306 | enum TStatusCode { 307 | SUCCESS_STATUS, 308 | SUCCESS_WITH_INFO_STATUS, 309 | STILL_EXECUTING_STATUS, 310 | ERROR_STATUS, 311 | INVALID_HANDLE_STATUS 312 | } 313 | 314 | // The return status of a remote request 315 | struct TStatus { 316 | 1: required TStatusCode statusCode 317 | 318 | // If status is SUCCESS_WITH_INFO, info_msgs may be populated with 319 | // additional diagnostic information. 320 | 2: optional list infoMessages 321 | 322 | // If status is ERROR, then the following fields may be set 323 | 3: optional string sqlState // as defined in the ISO/IEF CLI specification 324 | 4: optional i32 errorCode // internal error code 325 | 5: optional string errorMessage 326 | } 327 | 328 | // The state of an operation (i.e. a query or other 329 | // asynchronous operation that generates a result set) 330 | // on the server. 331 | enum TOperationState { 332 | // The operation has been initialized 333 | INITIALIZED_STATE, 334 | 335 | // The operation is running. In this state the result 336 | // set is not available. 337 | RUNNING_STATE, 338 | 339 | // The operation has completed. When an operation is in 340 | // this state its result set may be fetched. 341 | FINISHED_STATE, 342 | 343 | // The operation was canceled by a client 344 | CANCELED_STATE, 345 | 346 | // The operation was closed by a client 347 | CLOSED_STATE, 348 | 349 | // The operation failed due to an error 350 | ERROR_STATE, 351 | 352 | // The operation is in an unrecognized state 353 | UKNOWN_STATE, 354 | } 355 | 356 | 357 | // A string identifier. This is interpreted literally. 358 | typedef string TIdentifier 359 | 360 | // A search pattern. 361 | // 362 | // Valid search pattern characters: 363 | // '_': Any single character. 364 | // '%': Any sequence of zero or more characters. 365 | // '\': Escape character used to include special characters, 366 | // e.g. '_', '%', '\'. If a '\' precedes a non-special 367 | // character it has no special meaning and is interpreted 368 | // literally. 369 | typedef string TPattern 370 | 371 | 372 | // A search pattern or identifier. Used as input 373 | // parameter for many of the catalog functions. 374 | typedef string TPatternOrIdentifier 375 | 376 | struct THandleIdentifier { 377 | // 16 byte globally unique identifier 378 | // This is the public ID of the handle and 379 | // can be used for reporting. 380 | 1: required binary guid, 381 | 382 | // 16 byte secret generated by the server 383 | // and used to verify that the handle is not 384 | // being hijacked by another user. 385 | 2: required binary secret, 386 | } 387 | 388 | // Client-side handle to persistent 389 | // session information on the server-side. 
390 | struct TSessionHandle { 391 | 1: required THandleIdentifier sessionId 392 | } 393 | 394 | // The subtype of an OperationHandle. 395 | enum TOperationType { 396 | EXECUTE_STATEMENT, 397 | GET_TYPE_INFO, 398 | GET_CATALOGS, 399 | GET_SCHEMAS, 400 | GET_TABLES, 401 | GET_TABLE_TYPES, 402 | GET_COLUMNS, 403 | GET_FUNCTIONS, 404 | UNKNOWN, 405 | } 406 | 407 | // Client-side reference to a task running 408 | // asynchronously on the server. 409 | struct TOperationHandle { 410 | 1: required THandleIdentifier operationId 411 | 2: required TOperationType operationType 412 | 413 | // If hasResultSet = TRUE, then this operation 414 | // generates a result set that can be fetched. 415 | // Note that the result set may be empty. 416 | // 417 | // If hasResultSet = FALSE, then this operation 418 | // does not generate a result set, and calling 419 | // GetResultSetMetadata or FetchResults against 420 | // this OperationHandle will generate an error. 421 | 3: required bool hasResultSet 422 | 423 | // For operations that don't generate result sets, 424 | // modifiedRowCount is either: 425 | // 426 | // 1) The number of rows that were modified by 427 | // the DML operation (e.g. number of rows inserted, 428 | // number of rows deleted, etc). 429 | // 430 | // 2) 0 for operations that don't modify or add rows. 431 | // 432 | // 3) < 0 if the operation is capable of modifiying rows, 433 | // but Hive is unable to determine how many rows were 434 | // modified. For example, Hive's LOAD DATA command 435 | // doesn't generate row count information because 436 | // Hive doesn't inspect the data as it is loaded. 437 | // 438 | // modifiedRowCount is unset if the operation generates 439 | // a result set. 440 | 4: optional double modifiedRowCount 441 | } 442 | 443 | 444 | // OpenSession() 445 | // 446 | // Open a session (connection) on the server against 447 | // which operations may be executed. 448 | struct TOpenSessionReq { 449 | // The version of the HiveServer2 protocol that the client is using. 450 | 1: required TProtocolVersion client_protocol = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1 451 | 452 | // Username and password for authentication. 453 | // Depending on the authentication scheme being used, 454 | // this information may instead be provided by a lower 455 | // protocol layer, in which case these fields may be 456 | // left unset. 457 | 2: optional string username 458 | 3: optional string password 459 | 460 | // Configuration overlay which is applied when the session is 461 | // first created. 462 | 4: optional map configuration 463 | } 464 | 465 | struct TOpenSessionResp { 466 | 1: required TStatus status 467 | 468 | // The protocol version that the server is using. 469 | 2: required TProtocolVersion serverProtocolVersion = TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V1 470 | 471 | // Session Handle 472 | 3: optional TSessionHandle sessionHandle 473 | 474 | // The configuration settings for this session. 475 | 4: optional map configuration 476 | } 477 | 478 | 479 | // CloseSession() 480 | // 481 | // Closes the specified session and frees any resources 482 | // currently allocated to that session. Any open 483 | // operations in that session will be canceled. 
484 | struct TCloseSessionReq { 485 | 1: required TSessionHandle sessionHandle 486 | } 487 | 488 | struct TCloseSessionResp { 489 | 1: required TStatus status 490 | } 491 | 492 | 493 | 494 | enum TGetInfoType { 495 | CLI_MAX_DRIVER_CONNECTIONS = 0, 496 | CLI_MAX_CONCURRENT_ACTIVITIES = 1, 497 | CLI_DATA_SOURCE_NAME = 2, 498 | CLI_FETCH_DIRECTION = 8, 499 | CLI_SERVER_NAME = 13, 500 | CLI_SEARCH_PATTERN_ESCAPE = 14, 501 | CLI_DBMS_NAME = 17, 502 | CLI_DBMS_VER = 18, 503 | CLI_ACCESSIBLE_TABLES = 19, 504 | CLI_ACCESSIBLE_PROCEDURES = 20, 505 | CLI_CURSOR_COMMIT_BEHAVIOR = 23, 506 | CLI_DATA_SOURCE_READ_ONLY = 25, 507 | CLI_DEFAULT_TXN_ISOLATION = 26, 508 | CLI_IDENTIFIER_CASE = 28, 509 | CLI_IDENTIFIER_QUOTE_CHAR = 29, 510 | CLI_MAX_COLUMN_NAME_LEN = 30, 511 | CLI_MAX_CURSOR_NAME_LEN = 31, 512 | CLI_MAX_SCHEMA_NAME_LEN = 32, 513 | CLI_MAX_CATALOG_NAME_LEN = 34, 514 | CLI_MAX_TABLE_NAME_LEN = 35, 515 | CLI_SCROLL_CONCURRENCY = 43, 516 | CLI_TXN_CAPABLE = 46, 517 | CLI_USER_NAME = 47, 518 | CLI_TXN_ISOLATION_OPTION = 72, 519 | CLI_INTEGRITY = 73, 520 | CLI_GETDATA_EXTENSIONS = 81, 521 | CLI_NULL_COLLATION = 85, 522 | CLI_ALTER_TABLE = 86, 523 | CLI_ORDER_BY_COLUMNS_IN_SELECT = 90, 524 | CLI_SPECIAL_CHARACTERS = 94, 525 | CLI_MAX_COLUMNS_IN_GROUP_BY = 97, 526 | CLI_MAX_COLUMNS_IN_INDEX = 98, 527 | CLI_MAX_COLUMNS_IN_ORDER_BY = 99, 528 | CLI_MAX_COLUMNS_IN_SELECT = 100, 529 | CLI_MAX_COLUMNS_IN_TABLE = 101, 530 | CLI_MAX_INDEX_SIZE = 102, 531 | CLI_MAX_ROW_SIZE = 104, 532 | CLI_MAX_STATEMENT_LEN = 105, 533 | CLI_MAX_TABLES_IN_SELECT = 106, 534 | CLI_MAX_USER_NAME_LEN = 107, 535 | CLI_OJ_CAPABILITIES = 115, 536 | 537 | CLI_XOPEN_CLI_YEAR = 10000, 538 | CLI_CURSOR_SENSITIVITY = 10001, 539 | CLI_DESCRIBE_PARAMETER = 10002, 540 | CLI_CATALOG_NAME = 10003, 541 | CLI_COLLATION_SEQ = 10004, 542 | CLI_MAX_IDENTIFIER_LEN = 10005, 543 | } 544 | 545 | union TGetInfoValue { 546 | 1: string stringValue 547 | 2: i16 smallIntValue 548 | 3: i32 integerBitmask 549 | 4: i32 integerFlag 550 | 5: i32 binaryValue 551 | 6: i64 lenValue 552 | } 553 | 554 | // GetInfo() 555 | // 556 | // This function is based on ODBC's CLIGetInfo() function. 557 | // The function returns general information about the data source 558 | // using the same keys as ODBC. 559 | struct TGetInfoReq { 560 | // The sesssion to run this request against 561 | 1: required TSessionHandle sessionHandle 562 | 563 | 2: required TGetInfoType infoType 564 | } 565 | 566 | struct TGetInfoResp { 567 | 1: required TStatus status 568 | 569 | 2: required TGetInfoValue infoValue 570 | } 571 | 572 | 573 | // ExecuteStatement() 574 | // 575 | // Execute a statement. 576 | // The returned OperationHandle can be used to check on the 577 | // status of the statement, and to fetch results once the 578 | // statement has finished executing. 579 | struct TExecuteStatementReq { 580 | // The session to exexcute the statement against 581 | 1: required TSessionHandle sessionHandle 582 | 583 | // The statement to be executed (DML, DDL, SET, etc) 584 | 2: required string statement 585 | 586 | // Configuration properties that are overlayed on top of the 587 | // the existing session configuration before this statement 588 | // is executed. These properties apply to this statement 589 | // only and will not affect the subsequent state of the Session. 
590 | 3: optional map confOverlay 591 | } 592 | 593 | struct TExecuteStatementResp { 594 | 1: required TStatus status 595 | 2: optional TOperationHandle operationHandle 596 | } 597 | 598 | 599 | // GetTypeInfo() 600 | // 601 | // Get information about types supported by the HiveServer instance. 602 | // The information is returned as a result set which can be fetched 603 | // using the OperationHandle provided in the response. 604 | // 605 | // Refer to the documentation for ODBC's CLIGetTypeInfo function for 606 | // the format of the result set. 607 | struct TGetTypeInfoReq { 608 | // The session to run this request against. 609 | 1: required TSessionHandle sessionHandle 610 | } 611 | 612 | struct TGetTypeInfoResp { 613 | 1: required TStatus status 614 | 2: optional TOperationHandle operationHandle 615 | } 616 | 617 | 618 | // GetCatalogs() 619 | // 620 | // Returns the list of catalogs (databases) 621 | // Results are ordered by TABLE_CATALOG 622 | // 623 | // Resultset columns : 624 | // col1 625 | // name: TABLE_CAT 626 | // type: STRING 627 | // desc: Catalog name. NULL if not applicable. 628 | // 629 | struct TGetCatalogsReq { 630 | // Session to run this request against 631 | 1: required TSessionHandle sessionHandle 632 | } 633 | 634 | struct TGetCatalogsResp { 635 | 1: required TStatus status 636 | 2: optional TOperationHandle operationHandle 637 | } 638 | 639 | 640 | // GetSchemas() 641 | // 642 | // Retrieves the schema names available in this database. 643 | // The results are ordered by TABLE_CATALOG and TABLE_SCHEM. 644 | // col1 645 | // name: TABLE_SCHEM 646 | // type: STRING 647 | // desc: schema name 648 | // col2 649 | // name: TABLE_CATALOG 650 | // type: STRING 651 | // desc: catalog name 652 | struct TGetSchemasReq { 653 | // Session to run this request against 654 | 1: required TSessionHandle sessionHandle 655 | 656 | // Name of the catalog. Must not contain a search pattern. 657 | 2: optional TIdentifier catalogName 658 | 659 | // schema name or pattern 660 | 3: optional TPatternOrIdentifier schemaName 661 | } 662 | 663 | struct TGetSchemasResp { 664 | 1: required TStatus status 665 | 2: optional TOperationHandle operationHandle 666 | } 667 | 668 | 669 | // GetTables() 670 | // 671 | // Returns a list of tables with catalog, schema, and table 672 | // type information. The information is returned as a result 673 | // set which can be fetched using the OperationHandle 674 | // provided in the response. 675 | // Results are ordered by TABLE_TYPE, TABLE_CAT, TABLE_SCHEM, and TABLE_NAME 676 | // 677 | // Result Set Columns: 678 | // 679 | // col1 680 | // name: TABLE_CAT 681 | // type: STRING 682 | // desc: Catalog name. NULL if not applicable. 683 | // 684 | // col2 685 | // name: TABLE_SCHEM 686 | // type: STRING 687 | // desc: Schema name. 688 | // 689 | // col3 690 | // name: TABLE_NAME 691 | // type: STRING 692 | // desc: Table name. 693 | // 694 | // col4 695 | // name: TABLE_TYPE 696 | // type: STRING 697 | // desc: The table type, e.g. "TABLE", "VIEW", etc. 698 | // 699 | // col5 700 | // name: REMARKS 701 | // type: STRING 702 | // desc: Comments about the table 703 | // 704 | struct TGetTablesReq { 705 | // Session to run this request against 706 | 1: required TSessionHandle sessionHandle 707 | 708 | // Name of the catalog or a search pattern. 709 | 2: optional TPatternOrIdentifier catalogName 710 | 711 | // Name of the schema or a search pattern. 712 | 3: optional TPatternOrIdentifier schemaName 713 | 714 | // Name of the table or a search pattern. 
715 | 4: optional TPatternOrIdentifier tableName 716 | 717 | // List of table types to match 718 | // e.g. "TABLE", "VIEW", "SYSTEM TABLE", "GLOBAL TEMPORARY", 719 | // "LOCAL TEMPORARY", "ALIAS", "SYNONYM", etc. 720 | 5: optional list tableTypes 721 | } 722 | 723 | struct TGetTablesResp { 724 | 1: required TStatus status 725 | 2: optional TOperationHandle operationHandle 726 | } 727 | 728 | 729 | // GetTableTypes() 730 | // 731 | // Returns the table types available in this database. 732 | // The results are ordered by table type. 733 | // 734 | // col1 735 | // name: TABLE_TYPE 736 | // type: STRING 737 | // desc: Table type name. 738 | struct TGetTableTypesReq { 739 | // Session to run this request against 740 | 1: required TSessionHandle sessionHandle 741 | } 742 | 743 | struct TGetTableTypesResp { 744 | 1: required TStatus status 745 | 2: optional TOperationHandle operationHandle 746 | } 747 | 748 | 749 | // GetColumns() 750 | // 751 | // Returns a list of columns in the specified tables. 752 | // The information is returned as a result set which can be fetched 753 | // using the OperationHandle provided in the response. 754 | // Results are ordered by TABLE_CAT, TABLE_SCHEM, TABLE_NAME, 755 | // and ORDINAL_POSITION. 756 | // 757 | // Result Set Columns are the same as those for the ODBC CLIColumns 758 | // function. 759 | // 760 | struct TGetColumnsReq { 761 | // Session to run this request against 762 | 1: required TSessionHandle sessionHandle 763 | 764 | // Name of the catalog. Must not contain a search pattern. 765 | 2: optional TIdentifier catalogName 766 | 767 | // Schema name or search pattern 768 | 3: optional TPatternOrIdentifier schemaName 769 | 770 | // Table name or search pattern 771 | 4: optional TPatternOrIdentifier tableName 772 | 773 | // Column name or search pattern 774 | 5: optional TPatternOrIdentifier columnName 775 | } 776 | 777 | struct TGetColumnsResp { 778 | 1: required TStatus status 779 | 2: optional TOperationHandle operationHandle 780 | } 781 | 782 | 783 | // GetFunctions() 784 | // 785 | // Returns a list of functions supported by the data source. The 786 | // behavior of this function matches 787 | // java.sql.DatabaseMetaData.getFunctions() both in terms of 788 | // inputs and outputs. 789 | // 790 | // Result Set Columns: 791 | // 792 | // col1 793 | // name: FUNCTION_CAT 794 | // type: STRING 795 | // desc: Function catalog (may be null) 796 | // 797 | // col2 798 | // name: FUNCTION_SCHEM 799 | // type: STRING 800 | // desc: Function schema (may be null) 801 | // 802 | // col3 803 | // name: FUNCTION_NAME 804 | // type: STRING 805 | // desc: Function name. This is the name used to invoke the function. 806 | // 807 | // col4 808 | // name: REMARKS 809 | // type: STRING 810 | // desc: Explanatory comment on the function. 811 | // 812 | // col5 813 | // name: FUNCTION_TYPE 814 | // type: SMALLINT 815 | // desc: Kind of function. One of: 816 | // * functionResultUnknown - Cannot determine if a return value or a table 817 | // will be returned. 818 | // * functionNoTable - Does not a return a table. 819 | // * functionReturnsTable - Returns a table. 820 | // 821 | // col6 822 | // name: SPECIFIC_NAME 823 | // type: STRING 824 | // desc: The name which uniquely identifies this function within its schema. 825 | // In this case this is the fully qualified class name of the class 826 | // that implements this function. 
827 | // 828 | struct TGetFunctionsReq { 829 | // Session to run this request against 830 | 1: required TSessionHandle sessionHandle 831 | 832 | // A catalog name; must match the catalog name as it is stored in the 833 | // database; "" retrieves those without a catalog; null means 834 | // that the catalog name should not be used to narrow the search. 835 | 2: optional TIdentifier catalogName 836 | 837 | // A schema name pattern; must match the schema name as it is stored 838 | // in the database; "" retrieves those without a schema; null means 839 | // that the schema name should not be used to narrow the search. 840 | 3: optional TPatternOrIdentifier schemaName 841 | 842 | // A function name pattern; must match the function name as it is stored 843 | // in the database. 844 | 4: required TPatternOrIdentifier functionName 845 | } 846 | 847 | struct TGetFunctionsResp { 848 | 1: required TStatus status 849 | 2: optional TOperationHandle operationHandle 850 | } 851 | 852 | 853 | // GetOperationStatus() 854 | // 855 | // Get the status of an operation running on the server. 856 | struct TGetOperationStatusReq { 857 | // Session to run this request against 858 | 1: required TOperationHandle operationHandle 859 | } 860 | 861 | struct TGetOperationStatusResp { 862 | 1: required TStatus status 863 | 2: optional TOperationState operationState 864 | } 865 | 866 | 867 | // CancelOperation() 868 | // 869 | // Cancels processing on the specified operation handle and 870 | // frees any resources which were allocated. 871 | struct TCancelOperationReq { 872 | // Operation to cancel 873 | 1: required TOperationHandle operationHandle 874 | } 875 | 876 | struct TCancelOperationResp { 877 | 1: required TStatus status 878 | } 879 | 880 | 881 | // CloseOperation() 882 | // 883 | // Given an operation in the FINISHED, CANCELED, 884 | // or ERROR states, CloseOperation() will free 885 | // all of the resources which were allocated on 886 | // the server to service the operation. 887 | struct TCloseOperationReq { 888 | 1: required TOperationHandle operationHandle 889 | } 890 | 891 | struct TCloseOperationResp { 892 | 1: required TStatus status 893 | } 894 | 895 | 896 | // GetResultSetMetadata() 897 | // 898 | // Retrieves schema information for the specified operation 899 | struct TGetResultSetMetadataReq { 900 | // Operation for which to fetch result set schema information 901 | 1: required TOperationHandle operationHandle 902 | } 903 | 904 | struct TGetResultSetMetadataResp { 905 | 1: required TStatus status 906 | 2: optional TTableSchema schema 907 | } 908 | 909 | 910 | enum TFetchOrientation { 911 | // Get the next rowset. The fetch offset is ignored. 912 | FETCH_NEXT, 913 | 914 | // Get the previous rowset. The fetch offset is ignored. 915 | // NOT SUPPORTED 916 | FETCH_PRIOR, 917 | 918 | // Return the rowset at the given fetch offset relative 919 | // to the curren rowset. 920 | // NOT SUPPORTED 921 | FETCH_RELATIVE, 922 | 923 | // Return the rowset at the specified fetch offset. 924 | // NOT SUPPORTED 925 | FETCH_ABSOLUTE, 926 | 927 | // Get the first rowset in the result set. 928 | FETCH_FIRST, 929 | 930 | // Get the last rowset in the result set. 931 | // NOT SUPPORTED 932 | FETCH_LAST 933 | } 934 | 935 | // FetchResults() 936 | // 937 | // Fetch rows from the server corresponding to 938 | // a particular OperationHandle. 939 | struct TFetchResultsReq { 940 | // Operation from which to fetch results. 941 | 1: required TOperationHandle operationHandle 942 | 943 | // The fetch orientation. 
For V1 this must be either 944 | // FETCH_NEXT or FETCH_FIRST. Defaults to FETCH_NEXT. 945 | 2: required TFetchOrientation orientation = TFetchOrientation.FETCH_NEXT 946 | 947 | // Max number of rows that should be returned in 948 | // the rowset. 949 | 3: required i64 maxRows 950 | } 951 | 952 | struct TFetchResultsResp { 953 | 1: required TStatus status 954 | 955 | // TRUE if there are more rows left to fetch from the server. 956 | 2: optional bool hasMoreRows 957 | 958 | // The rowset. This is optional so that we have the 959 | // option in the future of adding alternate formats for 960 | // representing result set data, e.g. delimited strings, 961 | // binary encoded, etc. 962 | 3: optional TRowSet results 963 | } 964 | 965 | // GetLog() 966 | // 967 | // Fetch operation log from the server corresponding to 968 | // a particular OperationHandle. 969 | struct TGetLogReq { 970 | // Operation whose log is requested 971 | 1: required TOperationHandle operationHandle 972 | } 973 | 974 | struct TGetLogResp { 975 | 1: required TStatus status 976 | 977 | 2: required string log 978 | } 979 | 980 | service TCLIService { 981 | 982 | TOpenSessionResp OpenSession(1:TOpenSessionReq req); 983 | 984 | TCloseSessionResp CloseSession(1:TCloseSessionReq req); 985 | 986 | TGetInfoResp GetInfo(1:TGetInfoReq req); 987 | 988 | TExecuteStatementResp ExecuteStatement(1:TExecuteStatementReq req); 989 | 990 | TGetTypeInfoResp GetTypeInfo(1:TGetTypeInfoReq req); 991 | 992 | TGetCatalogsResp GetCatalogs(1:TGetCatalogsReq req); 993 | 994 | TGetSchemasResp GetSchemas(1:TGetSchemasReq req); 995 | 996 | TGetTablesResp GetTables(1:TGetTablesReq req); 997 | 998 | TGetTableTypesResp GetTableTypes(1:TGetTableTypesReq req); 999 | 1000 | TGetColumnsResp GetColumns(1:TGetColumnsReq req); 1001 | 1002 | TGetFunctionsResp GetFunctions(1:TGetFunctionsReq req); 1003 | 1004 | TGetOperationStatusResp GetOperationStatus(1:TGetOperationStatusReq req); 1005 | 1006 | TCancelOperationResp CancelOperation(1:TCancelOperationReq req); 1007 | 1008 | TCloseOperationResp CloseOperation(1:TCloseOperationReq req); 1009 | 1010 | TGetResultSetMetadataResp GetResultSetMetadata(1:TGetResultSetMetadataReq req); 1011 | 1012 | TFetchResultsResp FetchResults(1:TFetchResultsReq req); 1013 | 1014 | TGetLogResp GetLog(1:TGetLogReq req); 1015 | } 1016 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/thrift/fb303.thrift: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 
18 | */ 19 | 20 | /** 21 | * fb303.thrift 22 | */ 23 | 24 | namespace java com.facebook.fb303 25 | namespace cpp facebook.fb303 26 | namespace rb Impala.Protocol.fb303 27 | 28 | /** 29 | * Common status reporting mechanism across all services 30 | */ 31 | enum fb_status { 32 | DEAD = 0, 33 | STARTING = 1, 34 | ALIVE = 2, 35 | STOPPING = 3, 36 | STOPPED = 4, 37 | WARNING = 5, 38 | } 39 | 40 | /** 41 | * Standard base service 42 | */ 43 | service FacebookService { 44 | 45 | /** 46 | * Returns a descriptive name of the service 47 | */ 48 | string getName(), 49 | 50 | /** 51 | * Returns the version of the service 52 | */ 53 | string getVersion(), 54 | 55 | /** 56 | * Gets the status of this service 57 | */ 58 | fb_status getStatus(), 59 | 60 | /** 61 | * User friendly description of status, such as why the service is in 62 | * the dead or warning state, or what is being started or stopped. 63 | */ 64 | string getStatusDetails(), 65 | 66 | /** 67 | * Gets the counters for this service 68 | */ 69 | map getCounters(), 70 | 71 | /** 72 | * Gets the value of a single counter 73 | */ 74 | i64 getCounter(1: string key), 75 | 76 | /** 77 | * Sets an option 78 | */ 79 | void setOption(1: string key, 2: string value), 80 | 81 | /** 82 | * Gets an option 83 | */ 84 | string getOption(1: string key), 85 | 86 | /** 87 | * Gets all options 88 | */ 89 | map getOptions(), 90 | 91 | /** 92 | * Returns a CPU profile over the given time interval (client and server 93 | * must agree on the profile format). 94 | */ 95 | string getCpuProfile(1: i32 profileDurationInSec), 96 | 97 | /** 98 | * Returns the unix time that the server has been running since 99 | */ 100 | i64 aliveSince(), 101 | 102 | /** 103 | * Tell the server to reload its configuration, reopen log files, etc 104 | */ 105 | oneway void reinitialize(), 106 | 107 | /** 108 | * Suggest a shutdown to the server 109 | */ 110 | oneway void shutdown(), 111 | 112 | } 113 | -------------------------------------------------------------------------------- /herringbone-impala/src/main/thrift/hive_metastore.thrift: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/thrift -java 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | 21 | # 22 | # Thrift Service that the MetaStore is built on 23 | # 24 | 25 | include "fb303.thrift" 26 | 27 | namespace java org.apache.hadoop.hive.metastore.api 28 | namespace php metastore 29 | namespace cpp Apache.Hadoop.Hive 30 | namespace rb Impala.Protocol.HiveMetastore 31 | 32 | const string DDL_TIME = "transient_lastDdlTime" 33 | 34 | struct Version { 35 | 1: string version, 36 | 2: string comments 37 | } 38 | 39 | struct FieldSchema { 40 | 1: string name, // name of the field 41 | 2: string type, // type of the field. primitive types defined above, specify list, map for lists & maps 42 | 3: string comment 43 | } 44 | 45 | struct Type { 46 | 1: string name, // one of the types in PrimitiveTypes or CollectionTypes or User defined types 47 | 2: optional string type1, // object type if the name is 'list' (LIST_TYPE), key type if the name is 'map' (MAP_TYPE) 48 | 3: optional string type2, // val type if the name is 'map' (MAP_TYPE) 49 | //4: optional list fields // if the name is one of the user defined types 50 | } 51 | 52 | enum HiveObjectType { 53 | GLOBAL = 1, 54 | DATABASE = 2, 55 | TABLE = 3, 56 | PARTITION = 4, 57 | COLUMN = 5, 58 | } 59 | 60 | enum PrincipalType { 61 | USER = 1, 62 | ROLE = 2, 63 | GROUP = 3, 64 | } 65 | 66 | const string HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__" 67 | const string HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__" 68 | const string HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__" 69 | 70 | enum PartitionEventType { 71 | LOAD_DONE = 1, 72 | } 73 | 74 | struct HiveObjectRef{ 75 | 1: HiveObjectType objectType, 76 | 2: string dbName, 77 | 3: string objectName, 78 | 4: list partValues, 79 | 5: string columnName, 80 | } 81 | 82 | struct PrivilegeGrantInfo { 83 | 1: string privilege, 84 | 2: i32 createTime, 85 | 3: string grantor, 86 | 4: PrincipalType grantorType, 87 | 5: bool grantOption, 88 | } 89 | 90 | struct HiveObjectPrivilege { 91 | 1: HiveObjectRef hiveObject, 92 | 2: string principalName, 93 | 3: PrincipalType principalType, 94 | 4: PrivilegeGrantInfo grantInfo, 95 | } 96 | 97 | struct PrivilegeBag { 98 | 1: list privileges, 99 | } 100 | 101 | struct PrincipalPrivilegeSet { 102 | 1: map> userPrivileges, // user name -> privilege grant info 103 | 2: map> groupPrivileges, // group name -> privilege grant info 104 | 3: map> rolePrivileges, //role name -> privilege grant info 105 | } 106 | 107 | struct Role { 108 | 1: string roleName, 109 | 2: i32 createTime, 110 | 3: string ownerName, 111 | } 112 | 113 | // namespace for tables 114 | struct Database { 115 | 1: string name, 116 | 2: string description, 117 | 3: string locationUri, 118 | 4: map parameters, // properties associated with the database 119 | 5: optional PrincipalPrivilegeSet privileges 120 | } 121 | 122 | // This object holds the information needed by SerDes 123 | struct SerDeInfo { 124 | 1: string name, // name of the serde, table name by default 125 | 2: string serializationLib, // usually the class that implements the extractor & loader 126 | 3: map parameters // initialization parameters 127 | } 128 | 129 | // sort order of a column (column name along with asc(1)/desc(0)) 130 | struct Order { 131 | 1: string col, // sort column name 132 | 2: i32 order // asc(1) or desc(0) 133 | } 134 | 135 | // this object holds all the information about physical storage of the data belonging to a table 136 | struct StorageDescriptor { 137 | 1: list cols, // required (refer to types defined above) 138 | 2: string location, // defaults to //tablename 
139 | 3: string inputFormat, // SequenceFileInputFormat (binary) or TextInputFormat` or custom format 140 | 4: string outputFormat, // SequenceFileOutputFormat (binary) or IgnoreKeyTextOutputFormat or custom format 141 | 5: bool compressed, // compressed or not 142 | 6: i32 numBuckets, // this must be specified if there are any dimension columns 143 | 7: SerDeInfo serdeInfo, // serialization and deserialization information 144 | 8: list bucketCols, // reducer grouping columns and clustering columns and bucketing columns` 145 | 9: list sortCols, // sort order of the data in each bucket 146 | 10: map parameters // any user supplied key value hash 147 | } 148 | 149 | // table information 150 | struct Table { 151 | 1: string tableName, // name of the table 152 | 2: string dbName, // database name ('default') 153 | 3: string owner, // owner of this table 154 | 4: i32 createTime, // creation time of the table 155 | 5: i32 lastAccessTime, // last access time (usually this will be filled from HDFS and shouldn't be relied on) 156 | 6: i32 retention, // retention time 157 | 7: StorageDescriptor sd, // storage descriptor of the table 158 | 8: list partitionKeys, // partition keys of the table. only primitive types are supported 159 | 9: map parameters, // to store comments or any other user level parameters 160 | 10: string viewOriginalText, // original view text, null for non-view 161 | 11: string viewExpandedText, // expanded view text, null for non-view 162 | 12: string tableType, // table type enum, e.g. EXTERNAL_TABLE 163 | 13: optional PrincipalPrivilegeSet privileges, 164 | } 165 | 166 | struct Partition { 167 | 1: list values // string value is converted to appropriate partition key type 168 | 2: string dbName, 169 | 3: string tableName, 170 | 4: i32 createTime, 171 | 5: i32 lastAccessTime, 172 | 6: StorageDescriptor sd, 173 | 7: map parameters, 174 | 8: optional PrincipalPrivilegeSet privileges 175 | } 176 | 177 | struct Index { 178 | 1: string indexName, // unique with in the whole database namespace 179 | 2: string indexHandlerClass, // reserved 180 | 3: string dbName, 181 | 4: string origTableName, 182 | 5: i32 createTime, 183 | 6: i32 lastAccessTime, 184 | 7: string indexTableName, 185 | 8: StorageDescriptor sd, 186 | 9: map parameters, 187 | 10: bool deferredRebuild 188 | } 189 | 190 | // schema of the table/query results etc. 191 | struct Schema { 192 | // column names, types, comments 193 | 1: list fieldSchemas, // delimiters etc 194 | 2: map properties 195 | } 196 | 197 | // Key-value store to be used with selected 198 | // Metastore APIs (create, alter methods). 199 | // The client can pass environment properties / configs that can be 200 | // accessed in hooks. 
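// For example (an illustrative sketch only; the property key below is made
// up, not part of this interface), a loader could pass
//   EnvironmentContext(properties = {"ddl.source" : "herringbone-load"})
// to create_table_with_environment_context() so that server-side hooks can
// tell where the DDL originated.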
201 | struct EnvironmentContext { 202 | 1: map properties 203 | } 204 | 205 | exception MetaException { 206 | 1: string message 207 | } 208 | 209 | exception UnknownTableException { 210 | 1: string message 211 | } 212 | 213 | exception UnknownDBException { 214 | 1: string message 215 | } 216 | 217 | exception AlreadyExistsException { 218 | 1: string message 219 | } 220 | 221 | exception InvalidPartitionException { 222 | 1: string message 223 | } 224 | 225 | exception UnknownPartitionException { 226 | 1: string message 227 | } 228 | 229 | exception InvalidObjectException { 230 | 1: string message 231 | } 232 | 233 | exception NoSuchObjectException { 234 | 1: string message 235 | } 236 | 237 | exception IndexAlreadyExistsException { 238 | 1: string message 239 | } 240 | 241 | exception InvalidOperationException { 242 | 1: string message 243 | } 244 | 245 | exception ConfigValSecurityException { 246 | 1: string message 247 | } 248 | 249 | /** 250 | * This interface is live. 251 | */ 252 | service ThriftHiveMetastore extends fb303.FacebookService 253 | { 254 | void create_database(1:Database database) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3) 255 | Database get_database(1:string name) throws(1:NoSuchObjectException o1, 2:MetaException o2) 256 | void drop_database(1:string name, 2:bool deleteData, 3:bool cascade) throws(1:NoSuchObjectException o1, 2:InvalidOperationException o2, 3:MetaException o3) 257 | list get_databases(1:string pattern) throws(1:MetaException o1) 258 | list get_all_databases() throws(1:MetaException o1) 259 | void alter_database(1:string dbname, 2:Database db) throws(1:MetaException o1, 2:NoSuchObjectException o2) 260 | 261 | // returns the type with given name (make seperate calls for the dependent types if needed) 262 | Type get_type(1:string name) throws(1:MetaException o1, 2:NoSuchObjectException o2) 263 | bool create_type(1:Type type) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3) 264 | bool drop_type(1:string type) throws(1:MetaException o1, 2:NoSuchObjectException o2) 265 | map get_type_all(1:string name) 266 | throws(1:MetaException o2) 267 | 268 | // Gets a list of FieldSchemas describing the columns of a particular table 269 | list get_fields(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3), 270 | 271 | // Gets a list of FieldSchemas describing both the columns and the partition keys of a particular table 272 | list get_schema(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3) 273 | 274 | // create a Hive table. 
Following fields must be set 275 | // tableName 276 | // database (only 'default' for now until Hive QL supports databases) 277 | // owner (not needed, but good to have for tracking purposes) 278 | // sd.cols (list of field schemas) 279 | // sd.inputFormat (SequenceFileInputFormat (binary like falcon tables or u_full) or TextInputFormat) 280 | // sd.outputFormat (SequenceFileInputFormat (binary) or TextInputFormat) 281 | // sd.serdeInfo.serializationLib (SerDe class name eg org.apache.hadoop.hive.serde.simple_meta.MetadataTypedColumnsetSerDe 282 | // * See notes on DDL_TIME 283 | void create_table(1:Table tbl) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3, 4:NoSuchObjectException o4) 284 | void create_table_with_environment_context(1:Table tbl, 285 | 2:EnvironmentContext environment_context) 286 | throws (1:AlreadyExistsException o1, 287 | 2:InvalidObjectException o2, 3:MetaException o3, 288 | 4:NoSuchObjectException o4) 289 | // drops the table and all the partitions associated with it if the table has partitions 290 | // delete data (including partitions) if deleteData is set to true 291 | void drop_table(1:string dbname, 2:string name, 3:bool deleteData) 292 | throws(1:NoSuchObjectException o1, 2:MetaException o3) 293 | list get_tables(1: string db_name, 2: string pattern) throws (1: MetaException o1) 294 | list get_all_tables(1: string db_name) throws (1: MetaException o1) 295 | 296 | Table get_table(1:string dbname, 2:string tbl_name) 297 | throws (1:MetaException o1, 2:NoSuchObjectException o2) 298 | list
get_table_objects_by_name(1:string dbname, 2:list tbl_names) 299 | throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3) 300 | 301 | // Get a list of table names that match a filter. 302 | // The filter operators are LIKE, <, <=, >, >=, =, <> 303 | // 304 | // In the filter statement, values interpreted as strings must be enclosed in quotes, 305 | // while values interpreted as integers should not be. Strings and integers are the only 306 | // supported value types. 307 | // 308 | // The currently supported key names in the filter are: 309 | // Constants.HIVE_FILTER_FIELD_OWNER, which filters on the tables' owner's name 310 | // and supports all filter operators 311 | // Constants.HIVE_FILTER_FIELD_LAST_ACCESS, which filters on the last access times 312 | // and supports all filter operators except LIKE 313 | // Constants.HIVE_FILTER_FIELD_PARAMS, which filters on the tables' parameter keys and values 314 | // and only supports the filter operators = and <>. 315 | // Append the parameter key name to HIVE_FILTER_FIELD_PARAMS in the filter statement. 316 | // For example, to filter on parameter keys called "retention", the key name in the filter 317 | // statement should be Constants.HIVE_FILTER_FIELD_PARAMS + "retention" 318 | // Also, = and <> only work for keys that exist 319 | // in the tables. E.g., if you are looking for tables where key1 <> value, it will only 320 | // look at tables that have a value for the parameter key1. 321 | // Some example filter statements include: 322 | // filter = Constants.HIVE_FILTER_FIELD_OWNER + " like \".*test.*\" and " + 323 | // Constants.HIVE_FILTER_FIELD_LAST_ACCESS + " = 0"; 324 | // filter = Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"30\" or " + 325 | // Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"90\"" 326 | // @param dbName 327 | // The name of the database from which you will retrieve the table names 328 | // @param filterType 329 | // The type of filter 330 | // @param filter 331 | // The filter string 332 | // @param max_tables 333 | // The maximum number of tables returned 334 | // @return A list of table names that match the desired filter 335 | list get_table_names_by_filter(1:string dbname, 2:string filter, 3:i16 max_tables=-1) 336 | throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3) 337 | 338 | // alter table applies to only future partitions not for existing partitions 339 | // * See notes on DDL_TIME 340 | void alter_table(1:string dbname, 2:string tbl_name, 3:Table new_tbl) 341 | throws (1:InvalidOperationException o1, 2:MetaException o2) 342 | void alter_table_with_environment_context(1:string dbname, 2:string tbl_name, 343 | 3:Table new_tbl, 4:EnvironmentContext environment_context) 344 | throws (1:InvalidOperationException o1, 2:MetaException o2) 345 | // the following applies to only tables that have partitions 346 | // * See notes on DDL_TIME 347 | Partition add_partition(1:Partition new_part) 348 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) 349 | Partition add_partition_with_environment_context(1:Partition new_part, 350 | 2:EnvironmentContext environment_context) 351 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 352 | 3:MetaException o3) 353 | i32 add_partitions(1:list new_parts) 354 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) 355 | Partition append_partition(1:string db_name, 2:string tbl_name, 3:list part_vals) 356 | throws 
(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) 357 | Partition append_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name) 358 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) 359 | bool drop_partition(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:bool deleteData) 360 | throws(1:NoSuchObjectException o1, 2:MetaException o2) 361 | bool drop_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name, 4:bool deleteData) 362 | throws(1:NoSuchObjectException o1, 2:MetaException o2) 363 | Partition get_partition(1:string db_name, 2:string tbl_name, 3:list part_vals) 364 | throws(1:MetaException o1, 2:NoSuchObjectException o2) 365 | 366 | Partition get_partition_with_auth(1:string db_name, 2:string tbl_name, 3:list part_vals, 367 | 4: string user_name, 5: list group_names) throws(1:MetaException o1, 2:NoSuchObjectException o2) 368 | 369 | Partition get_partition_by_name(1:string db_name 2:string tbl_name, 3:string part_name) 370 | throws(1:MetaException o1, 2:NoSuchObjectException o2) 371 | 372 | // returns all the partitions for this table in reverse chronological order. 373 | // If max parts is given then it will return only that many. 374 | list get_partitions(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1) 375 | throws(1:NoSuchObjectException o1, 2:MetaException o2) 376 | list get_partitions_with_auth(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1, 377 | 4: string user_name, 5: list group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2) 378 | 379 | list get_partition_names(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1) 380 | throws(1:MetaException o2) 381 | 382 | // get_partition*_ps methods allow filtering by a partial partition specification, 383 | // as needed for dynamic partitions. The values that are not restricted should 384 | // be empty strings. Nulls were considered (instead of "") but caused errors in 385 | // generated Python code. The size of part_vals may be smaller than the 386 | // number of partition columns - the unspecified values are considered the same 387 | // as "". 388 | list get_partitions_ps(1:string db_name 2:string tbl_name 389 | 3:list part_vals, 4:i16 max_parts=-1) 390 | throws(1:MetaException o1, 2:NoSuchObjectException o2) 391 | list get_partitions_ps_with_auth(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:i16 max_parts=-1, 392 | 5: string user_name, 6: list group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2) 393 | 394 | list get_partition_names_ps(1:string db_name, 395 | 2:string tbl_name, 3:list part_vals, 4:i16 max_parts=-1) 396 | throws(1:MetaException o1, 2:NoSuchObjectException o2) 397 | 398 | // get the partitions matching the given partition filter 399 | list get_partitions_by_filter(1:string db_name 2:string tbl_name 400 | 3:string filter, 4:i16 max_parts=-1) 401 | throws(1:MetaException o1, 2:NoSuchObjectException o2) 402 | 403 | // get partitions give a list of partition names 404 | list get_partitions_by_names(1:string db_name 2:string tbl_name 3:list names) 405 | throws(1:MetaException o1, 2:NoSuchObjectException o2) 406 | 407 | // changes the partition to the new partition object. 
partition is identified from the part values 408 | // in the new_part 409 | // * See notes on DDL_TIME 410 | void alter_partition(1:string db_name, 2:string tbl_name, 3:Partition new_part) 411 | throws (1:InvalidOperationException o1, 2:MetaException o2) 412 | 413 | void alter_partition_with_environment_context(1:string db_name, 414 | 2:string tbl_name, 3:Partition new_part, 415 | 4:EnvironmentContext environment_context) 416 | throws (1:InvalidOperationException o1, 2:MetaException o2) 417 | 418 | // rename the old partition to the new partition object by changing old part values to the part values 419 | // in the new_part. old partition is identified from part_vals. 420 | // partition keys in new_part should be the same as those in old partition. 421 | void rename_partition(1:string db_name, 2:string tbl_name, 3:list part_vals, 4:Partition new_part) 422 | throws (1:InvalidOperationException o1, 2:MetaException o2) 423 | 424 | // gets the value of the configuration key in the metastore server. returns 425 | // defaultValue if the key does not exist. if the configuration key does not 426 | // begin with "hive", "mapred", or "hdfs", a ConfigValSecurityException is 427 | // thrown. 428 | string get_config_value(1:string name, 2:string defaultValue) 429 | throws(1:ConfigValSecurityException o1) 430 | 431 | // converts a partition name into a partition values array 432 | list partition_name_to_vals(1: string part_name) 433 | throws(1: MetaException o1) 434 | // converts a partition name into a partition specification (a mapping from 435 | // the partition cols to the values) 436 | map partition_name_to_spec(1: string part_name) 437 | throws(1: MetaException o1) 438 | 439 | void markPartitionForEvent(1:string db_name, 2:string tbl_name, 3:map part_vals, 440 | 4:PartitionEventType eventType) throws (1: MetaException o1, 2: NoSuchObjectException o2, 441 | 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5, 442 | 6: InvalidPartitionException o6) 443 | bool isPartitionMarkedForEvent(1:string db_name, 2:string tbl_name, 3:map part_vals, 444 | 4: PartitionEventType eventType) throws (1: MetaException o1, 2:NoSuchObjectException o2, 445 | 3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5, 446 | 6: InvalidPartitionException o6) 447 | 448 | //index 449 | Index add_index(1:Index new_index, 2: Table index_table) 450 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) 451 | void alter_index(1:string dbname, 2:string base_tbl_name, 3:string idx_name, 4:Index new_idx) 452 | throws (1:InvalidOperationException o1, 2:MetaException o2) 453 | bool drop_index_by_name(1:string db_name, 2:string tbl_name, 3:string index_name, 4:bool deleteData) 454 | throws(1:NoSuchObjectException o1, 2:MetaException o2) 455 | Index get_index_by_name(1:string db_name 2:string tbl_name, 3:string index_name) 456 | throws(1:MetaException o1, 2:NoSuchObjectException o2) 457 | 458 | list get_indexes(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1) 459 | throws(1:NoSuchObjectException o1, 2:MetaException o2) 460 | list get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1) 461 | throws(1:MetaException o2) 462 | 463 | //authorization privileges 464 | 465 | bool create_role(1:Role role) throws(1:MetaException o1) 466 | bool drop_role(1:string role_name) throws(1:MetaException o1) 467 | list get_role_names() throws(1:MetaException o1) 468 | bool grant_role(1:string role_name, 2:string principal_name, 
3:PrincipalType principal_type, 469 | 4:string grantor, 5:PrincipalType grantorType, 6:bool grant_option) throws(1:MetaException o1) 470 | bool revoke_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type) 471 | throws(1:MetaException o1) 472 | list list_roles(1:string principal_name, 2:PrincipalType principal_type) throws(1:MetaException o1) 473 | 474 | PrincipalPrivilegeSet get_privilege_set(1:HiveObjectRef hiveObject, 2:string user_name, 475 | 3: list group_names) throws(1:MetaException o1) 476 | list list_privileges(1:string principal_name, 2:PrincipalType principal_type, 477 | 3: HiveObjectRef hiveObject) throws(1:MetaException o1) 478 | 479 | bool grant_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1) 480 | bool revoke_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1) 481 | 482 | // this is used by metastore client to send UGI information to metastore server immediately 483 | // after setting up a connection. 484 | list set_ugi(1:string user_name, 2:list group_names) throws (1:MetaException o1) 485 | 486 | //Authentication (delegation token) interfaces 487 | 488 | // get metastore server delegation token for use from the map/reduce tasks to authenticate 489 | // to metastore server 490 | string get_delegation_token(1:string token_owner, 2:string renewer_kerberos_principal_name) 491 | throws (1:MetaException o1) 492 | 493 | // method to renew delegation token obtained from metastore server 494 | i64 renew_delegation_token(1:string token_str_form) throws (1:MetaException o1) 495 | 496 | // method to cancel delegation token obtained from metastore server 497 | void cancel_delegation_token(1:string token_str_form) throws (1:MetaException o1) 498 | } 499 | 500 | // * Note about the DDL_TIME: When creating or altering a table or a partition, 501 | // if the DDL_TIME is not set, the current time will be used. 502 | 503 | // For storing info about archived partitions in parameters 504 | 505 | // Whether the partition is archived 506 | const string IS_ARCHIVED = "is_archived", 507 | // The original location of the partition, before archiving. After archiving, 508 | // this directory will contain the archive. 
When the partition 509 | // is dropped, this directory will be deleted 510 | const string ORIGINAL_LOCATION = "original_location", 511 | 512 | // these should be needed only for backward compatibility with filestore 513 | const string META_TABLE_COLUMNS = "columns", 514 | const string META_TABLE_COLUMN_TYPES = "columns.types", 515 | const string BUCKET_FIELD_NAME = "bucket_field_name", 516 | const string BUCKET_COUNT = "bucket_count", 517 | const string FIELD_TO_DIMENSION = "field_to_dimension", 518 | const string META_TABLE_NAME = "name", 519 | const string META_TABLE_DB = "db", 520 | const string META_TABLE_LOCATION = "location", 521 | const string META_TABLE_SERDE = "serde", 522 | const string META_TABLE_PARTITION_COLUMNS = "partition_columns", 523 | const string FILE_INPUT_FORMAT = "file.inputformat", 524 | const string FILE_OUTPUT_FORMAT = "file.outputformat", 525 | const string META_TABLE_STORAGE = "storage_handler", 526 | 527 | 528 | 529 | -------------------------------------------------------------------------------- /herringbone-main/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.stripe 6 | herringbone-main 7 | 0.0.1 8 | jar 9 | 10 | Herringbone Main 11 | 12 | 13 | 14 | dtrott 15 | https://maven.davidtrott.com/repository 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.scalatest 23 | scalatest-maven-plugin 24 | 1.0-M2 25 | 26 | ${project.build.directory}/surefire-reports 27 | . 28 | WDF TestSuite.txt 29 | ${project.build.directory}/html/scalatest 30 | false 31 | 32 | 33 | 34 | test 35 | 36 | test 37 | 38 | 39 | 40 | 41 | 42 | 43 | org.apache.maven.plugins 44 | maven-compiler-plugin 45 | 3.1 46 | 47 | 1.6 48 | 1.6 49 | 50 | 51 | 52 | maven-jar-plugin 53 | 2.3.1 54 | 55 | 56 | 57 | maven-resources-plugin 58 | 2.4.3 59 | 60 | 61 | 62 | net.alchim31.maven 63 | scala-maven-plugin 64 | 3.1.6 65 | 66 | incremental 67 | true 68 | 69 | 70 | 71 | 72 | compile 73 | testCompile 74 | 75 | 76 | 77 | 78 | 79 | 80 | org.apache.maven.plugins 81 | maven-shade-plugin 82 | 2.3 83 | 84 | false 85 | target/herringbone-${project.version}-jar-with-dependencies.jar 86 | 87 | 88 | 89 | package 90 | 91 | shade 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 1.6.0rc7 101 | UTF-8 102 | 2.10.4 103 | 1.7 104 | 1.7 105 | 106 | 107 | 108 | 109 | com.twitter 110 | parquet-common 111 | ${parquet.version} 112 | 113 | 114 | com.twitter 115 | parquet-encoding 116 | ${parquet.version} 117 | 118 | 119 | com.twitter 120 | parquet-column 121 | ${parquet.version} 122 | 123 | 124 | com.twitter 125 | parquet-hadoop 126 | ${parquet.version} 127 | 128 | 129 | org.apache.hadoop 130 | hadoop-client 131 | 2.5.2 132 | provided 133 | 134 | 135 | org.apache.hive 136 | hive-jdbc 137 | 0.14.0 138 | 139 | 140 | com.twitter 141 | parquet-hadoop-bundle 142 | 143 | 144 | 145 | 146 | org.rogach 147 | scallop_2.10 148 | 0.9.5 149 | 150 | 151 | org.scala-lang 152 | jline 153 | 2.9.0-1 154 | 155 | 156 | org.scalatest 157 | scalatest_2.10 158 | 2.0 159 | test 160 | 161 | 162 | org.scalamock 163 | scalamock-scalatest-support_2.10 164 | 3.1.RC1 165 | test 166 | 167 | 168 | com.stripe 169 | herringbone-impala 170 | 0.0.2 171 | 172 | 173 | org.apache.thrift 174 | libthrift 175 | 176 | 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/CompactInputFormat.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone 2 | 3 | 
import java.util.{List => JavaList} 4 | import java.io.DataOutput 5 | import java.io.DataInput 6 | 7 | import scala.collection.mutable.MutableList 8 | import scala.collection.JavaConverters._ 9 | import scala.collection.JavaConversions._ 10 | 11 | import org.apache.hadoop.io.Writable 12 | import org.apache.hadoop.mapreduce.{InputSplit,Job,JobContext,Mapper,TaskAttemptContext} 13 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat 14 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat 15 | import parquet.hadoop.api.ReadSupport 16 | import parquet.hadoop.{ParquetInputFormat,ParquetInputSplit,ParquetOutputFormat,ParquetRecordReader} 17 | import parquet.hadoop.example.{ExampleOutputFormat,GroupReadSupport} 18 | import parquet.hadoop.util.ContextUtil 19 | import parquet.example.data.{Group,GroupWriter} 20 | import parquet.example.data.simple.SimpleGroup 21 | 22 | 23 | class CompactInputFormat[T](readSupportClass: Class[_ <: ReadSupport[T]]) extends ParquetInputFormat[T](readSupportClass) { 24 | 25 | // Our HDFS block size is 1024MB so we'll get pretty close. 26 | val TARGET = 1024 * 1024 * 1024 // 1024MB. 27 | 28 | override def getSplits(context: JobContext): JavaList[InputSplit] = { 29 | // Limit the splits to 100MB so it's easy to assemble them into 1024MB 30 | // chunks. This is not actually reliable. Chunks can come back bigger than 31 | // 100MB, but it does limit the size of most chunks. 32 | val conf = ContextUtil.getConfiguration(context) 33 | conf.set("mapred.max.split.size", (100 * 1024 * 1024).toString) 34 | 35 | val splits = super.getSplits(conf, getFooters(context)).asScala.toList 36 | val m = if (splits.isEmpty) splits else mergeSplits(splits) 37 | m.asInstanceOf[List[InputSplit]].asJava 38 | } 39 | 40 | def mergeSplits(splits: List[ParquetInputSplit]): List[MergedInputSplit] = { 41 | val sizes = splits.map { _.getLength } 42 | println(s"""${splits.length} initial splits were generated. 43 | | Max: ${mb(sizes.max)} 44 | | Min: ${mb(sizes.min)} 45 | | Avg: ${mb(sizes.sum.toDouble / sizes.length)}""".stripMargin) 46 | 47 | // TODO: get a CS undergrad to give us better bin packing. 48 | var buckets = MutableList[MutableList[ParquetInputSplit]](MutableList(splits.head)) 49 | splits.tail.foreach { split => 50 | val bucket = buckets.minBy { b => b.map { _.getLength }.sum } 51 | if ((split.getLength + bucket.map { _.getLength }.sum) < TARGET) { 52 | bucket += split 53 | } else { 54 | buckets += MutableList(split) 55 | } 56 | } 57 | 58 | val newSizes = buckets.map { _.map { _.getLength }.sum }.toList 59 | println(s"""${buckets.length} merged splits were generated. 60 | | Max: ${mb(newSizes.max)} 61 | | Min: ${mb(newSizes.min)} 62 | | Avg: ${mb(newSizes.sum.toDouble / newSizes.length)}""".stripMargin) 63 | 64 | buckets.map { b => new MergedInputSplit(b.toList) }.toList 65 | } 66 | 67 | override def createRecordReader(split: InputSplit, context: TaskAttemptContext): MergedRecordReader[T] = { 68 | val readSupport = ParquetInputFormat.getReadSupportInstance[T](ContextUtil.getConfiguration(context)) 69 | split match { 70 | case s: MergedInputSplit => new MergedRecordReader[T](s, context, readSupport) 71 | case _ => throw new Exception(s"Expected a MergedInputSplit. Found a $split.") 72 | } 73 | } 74 | 75 | // Helper for pretty-printing byte values. 
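  // A worked example of the greedy packing above, with illustrative sizes:
  // given splits of 90MB, 80MB and 950MB, the first two land in one bucket
  // (170MB total), while the 950MB split starts a new bucket because adding it
  // to the 170MB bucket would push it past TARGET.
  // The helper below turns byte counts into the human-readable sizes used in
  // the log lines printed by mergeSplits.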
76 | def mb(n: Double): String = { 77 | val K = 1024 78 | val M = K * K 79 | val G = K * M 80 | if (n < K) f"$n%.2fB" 81 | else if (n < M) f"${n / K}%.2fK" 82 | else if (n < G) f"${n / M}%.2fM" 83 | else f"${n / G}%.2fG" 84 | } 85 | } 86 | 87 | class MergedInputSplit(var splits: List[ParquetInputSplit]) extends InputSplit with Writable { 88 | def this() = this(List()) 89 | 90 | var splitNumber = 0 91 | 92 | def currentSplit: ParquetInputSplit = splits(splitNumber) 93 | def nextSplit: Option[ParquetInputSplit] = { 94 | if (splitNumber < splits.length - 1) { 95 | splitNumber += 1 96 | Some(currentSplit) 97 | } else { 98 | None 99 | } 100 | } 101 | 102 | // write and readFields are paired for serialization/deserialization. 103 | override def write(out: DataOutput) = { 104 | out.writeInt(splits.length) 105 | splits.foreach { s => s.write(out) } 106 | } 107 | 108 | override def readFields(in: DataInput) = { 109 | val count = in.readInt 110 | splits = for (i <- List.range(0, count)) yield { 111 | val s = new ParquetInputSplit 112 | s.readFields(in) 113 | s 114 | } 115 | } 116 | 117 | override def getLength: Long = splits.map { _.getLength }.sum 118 | override def getLocations: Array[String] = splits.flatMap { _.getLocations }.toArray 119 | override def toString = "" 120 | } 121 | 122 | class MergedRecordReader[T](split: MergedInputSplit, 123 | taskContext: TaskAttemptContext, 124 | readSupport: ReadSupport[T]) extends ParquetRecordReader[T](readSupport) { 125 | val totalLength = split.getLength 126 | var progress = 0L 127 | 128 | override def initialize(split: InputSplit, context: TaskAttemptContext) { 129 | super.initialize(split.asInstanceOf[MergedInputSplit].currentSplit, context) 130 | } 131 | 132 | def startNextSplit(split: MergedInputSplit, context: TaskAttemptContext): Boolean = { 133 | split.nextSplit match { 134 | case Some(s) => { 135 | super.initialize(s, context) 136 | true 137 | } 138 | case None => false 139 | } 140 | } 141 | 142 | // nextKeyValue is used to ask for the next tuple and returns false when the 143 | // recordReader has no more tuples. Since we're wrapping multiple splits, and 144 | // therefore multiple record readers, we detect when the current inernal 145 | // reader is done and move to the next reader. 
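  // In outline, the method below does (simplified sketch, not extra behaviour):
  //
  //   if (super.nextKeyValue) true
  //   else {
  //     close the finished reader and add its split length to `progress`
  //     if (startNextSplit(split, taskContext)) nextKeyValue else false
  //   }
  //
  // Because `progress` is only bumped at split boundaries, getProgress is
  // fairly coarse-grained across a merged split.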
146 | override def nextKeyValue: Boolean = { 147 | val next = super.nextKeyValue 148 | if (next) { 149 | next 150 | } else { 151 | super.close 152 | progress += split.currentSplit.getLength 153 | 154 | if (startNextSplit(split, taskContext)) { 155 | nextKeyValue 156 | } else { 157 | false 158 | } 159 | } 160 | } 161 | 162 | override def toString = "" 163 | override def getProgress: Float = progress / totalLength 164 | } 165 | 166 | 167 | class CompactGroupInputFormat extends CompactInputFormat[Group](classOf[GroupReadSupport]) { } 168 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/CompactJob.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone 2 | 3 | import com.stripe.herringbone.util.ParquetUtils 4 | 5 | import java.util.{List => JavaList} 6 | import java.io.DataOutput 7 | import java.io.DataInput 8 | 9 | import scala.collection.mutable.MutableList 10 | import scala.collection.JavaConverters._ 11 | 12 | import org.apache.hadoop.conf.{Configuration,Configured} 13 | import org.apache.hadoop.fs.{FileSystem,Path} 14 | import org.apache.hadoop.mapreduce.{Job,Mapper} 15 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat 16 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat 17 | import org.apache.hadoop.util.{Tool,ToolRunner} 18 | 19 | import org.codehaus.jackson.map.ObjectMapper 20 | import org.codehaus.jackson.`type`.TypeReference 21 | 22 | import org.rogach.scallop.ScallopConf 23 | 24 | import parquet.example.data.{Group,GroupWriter} 25 | import parquet.hadoop.{BadConfigurationException,ParquetInputFormat,ParquetOutputFormat} 26 | import parquet.hadoop.api.{DelegatingWriteSupport,WriteSupport} 27 | import parquet.hadoop.api.WriteSupport.FinalizedWriteContext 28 | import parquet.hadoop.example.{GroupReadSupport,GroupWriteSupport} 29 | 30 | class ParquetCompactConf(arguments: Seq[String]) extends ScallopConf(arguments) { 31 | val inputPath = opt[String](required = true) 32 | val outputPath = opt[String](descr = "Default is input path with `-compact` appended") 33 | } 34 | 35 | class ParquetCompactWriteSupport extends DelegatingWriteSupport[Group](new GroupWriteSupport) { 36 | var extraMetadata: java.util.Map[String, String] = _ 37 | 38 | override def init(configuration: Configuration): WriteSupport.WriteContext = { 39 | extractMetadata(configuration) 40 | super.init(configuration) 41 | } 42 | 43 | override def finalizeWrite(): FinalizedWriteContext = { 44 | new FinalizedWriteContext(extraMetadata) 45 | } 46 | 47 | def extractMetadata(configuration: Configuration) = { 48 | val metadataJson = configuration.get(ParquetCompactWriteSupport.ExtraMetadataKey) 49 | try { 50 | extraMetadata = new ObjectMapper().readValue(metadataJson, new TypeReference[java.util.Map[String,String]](){}) 51 | } catch { case e: java.io.IOException => 52 | throw new BadConfigurationException("Unable to deserialize extra extra metadata: " + metadataJson, e) 53 | } 54 | } 55 | } 56 | 57 | object ParquetCompactWriteSupport { 58 | val ExtraMetadataKey = "herringbone.compact.extrametadata" 59 | } 60 | 61 | class CompactJob extends Configured with Tool { 62 | override def run(arguments: Array[String]) = { 63 | val conf = new ParquetCompactConf(arguments) 64 | val inputPath = new Path(conf.inputPath()) 65 | val fs = inputPath.getFileSystem(getConf) 66 | val outputPathString = 
conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-compact")) 67 | val outputPath = new Path(outputPathString) 68 | 69 | // Pass along metadata (which includes the thrift schema) to the results. 70 | val metadata = ParquetUtils.readKeyValueMetaData(inputPath) 71 | val metadataJson = new ObjectMapper().writeValueAsString(metadata) 72 | getConf.set(ParquetCompactWriteSupport.ExtraMetadataKey, metadataJson) 73 | 74 | if (fs.exists(outputPath)) { 75 | println(s"Deleting existing $outputPath") 76 | fs.delete(outputPath, true) 77 | } 78 | 79 | val job = new Job(getConf) 80 | 81 | FileInputFormat.setInputPaths(job, inputPath) 82 | FileOutputFormat.setOutputPath(job, outputPath) 83 | ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport]) 84 | ParquetOutputFormat.setWriteSupportClass(job, classOf[ParquetCompactWriteSupport]) 85 | GroupWriteSupport.setSchema(ParquetUtils.readSchema(inputPath), job.getConfiguration) 86 | 87 | job.setJobName("compact " + conf.inputPath() + " → " + outputPathString) 88 | job.setInputFormatClass(classOf[CompactGroupInputFormat]); 89 | job.setOutputFormatClass(classOf[ParquetOutputFormat[Group]]) 90 | job.setMapperClass(classOf[Mapper[Void,Group,Void,Group]]) 91 | job.setJarByClass(classOf[CompactJob]) 92 | job.getConfiguration.setBoolean("mapreduce.job.user.classpath.first", true) 93 | job.getConfiguration.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false) 94 | job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false); 95 | job.setNumReduceTasks(0) 96 | 97 | if(job.waitForCompletion(true)) 0 else 1 98 | } 99 | } 100 | 101 | object CompactJob { 102 | 103 | def main(args: Array[String]) = { 104 | val result = ToolRunner.run(new Configuration, new CompactJob, args) 105 | System.exit(result) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/FlattenJob.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone 2 | 3 | import com.stripe.herringbone.flatten.{ParquetFlatConf,ParquetFlatMapper,TypeFlattener} 4 | import com.stripe.herringbone.flatten.FlatConverter 5 | import com.stripe.herringbone.util.ParquetUtils 6 | 7 | import org.apache.hadoop.mapreduce._ 8 | import org.apache.hadoop.mapreduce.lib.input._ 9 | import org.apache.hadoop.mapreduce.lib.output._ 10 | import org.apache.hadoop.util._ 11 | import org.apache.hadoop.fs._ 12 | import org.apache.hadoop.conf._ 13 | 14 | import parquet.example.data._ 15 | import parquet.example.data.simple._ 16 | import parquet.hadoop._ 17 | import parquet.hadoop.example._ 18 | import parquet.io.api._ 19 | import parquet.schema._ 20 | 21 | import org.rogach.scallop._ 22 | 23 | class FlattenMapper extends ParquetFlatMapper[Group] { 24 | def valueOut(value: Group) = { 25 | FlatConverter.flattenGroup(value, flattenedSchema, separator, renameId) 26 | } 27 | } 28 | 29 | class FlattenJob extends Configured with Tool { 30 | override def run(args: Array[String]) = { 31 | val conf = new ParquetFlatConf(args) 32 | val fs = FileSystem.get(getConf) 33 | val inputPath = new Path(conf.inputPath()) 34 | val outputPathString = conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-flat")) 35 | val outputPath = new Path(outputPathString) 36 | val previousPath = conf.previousPath.get.map{new Path(_)} 37 | 38 | val separator = conf.separator() 39 | getConf.set(ParquetFlatMapper.SeparatorKey, separator) 40 | 
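    // The separator (and the rename-id flag set just below) reach the mappers
    // through the job Configuration; FlatConsumer then builds each flattened
    // column name by joining the nested field path with this separator.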
41 | val renameId = conf.renameId() 42 | getConf.set(ParquetFlatMapper.RenameIdKey, renameId.toString) 43 | 44 | if (fs.exists(outputPath)) { 45 | println(s"Deleting existing $outputPath") 46 | fs.delete(outputPath, true) 47 | } 48 | 49 | val flattenedSchema = TypeFlattener.flatten( 50 | ParquetUtils.readSchema(inputPath), 51 | previousPath.map { ParquetUtils.readSchema(_) }, 52 | separator, 53 | renameId 54 | ) 55 | 56 | val jobName = "flatten " + conf.inputPath() + " -> " + outputPathString 57 | val job = new Job(getConf, jobName) 58 | 59 | FileInputFormat.setInputPaths(job, inputPath) 60 | FileOutputFormat.setOutputPath(job, outputPath) 61 | ExampleOutputFormat.setSchema(job, flattenedSchema) 62 | ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport]) 63 | 64 | job.setInputFormatClass(classOf[CompactGroupInputFormat]); 65 | job.setOutputFormatClass(classOf[ExampleOutputFormat]) 66 | job.setMapperClass(classOf[FlattenMapper]) 67 | job.setJarByClass(classOf[FlattenJob]) 68 | job.getConfiguration.setBoolean("mapreduce.job.user.classpath.first", true) 69 | job.getConfiguration.setBoolean(ParquetOutputFormat.ENABLE_JOB_SUMMARY, false) 70 | job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false); 71 | job.setNumReduceTasks(0) 72 | 73 | if (job.waitForCompletion(true)) 0 else 1 74 | } 75 | } 76 | 77 | object FlattenJob { 78 | def main(args: Array[String]) = { 79 | val result = ToolRunner.run(new Configuration, new FlattenJob, args) 80 | System.exit(result) 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/ParquetLoad.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone 2 | 3 | import com.stripe.herringbone.load._ 4 | 5 | import org.apache.hadoop.conf._ 6 | import org.apache.hadoop.util._ 7 | 8 | class ParquetLoad extends Configured with Tool { 9 | override def run(args: Array[String]): Int = { 10 | val conf = new ParquetLoadConf(args) 11 | val hadoopFs = new HadoopFs() 12 | val fieldUtils = FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper) 13 | 14 | val loader: ParquetLoader = if (conf.hive()) { 15 | HiveLoader(conf, hadoopFs, fieldUtils) 16 | } else { 17 | ImpalaLoader(conf, hadoopFs, fieldUtils) 18 | } 19 | 20 | if (conf.updatePartitions()) { 21 | val tableExists = loader.checkTableExists(conf.table(), conf.database()) 22 | 23 | (conf.path.get, tableExists) match { 24 | case (_, true) => loader.updateTable(conf.table(), conf.database()) 25 | case (Some(path), false) => loader.createTable(path, conf.table(), conf.database()) 26 | case (None, false) => { 27 | println("ERROR - path not specified and table not yet created. 
Specify path from which to create the table") 28 | return 1 29 | } 30 | } 31 | } else { 32 | loader.createTable(conf.path(), conf.table(), conf.database()) 33 | } 34 | loader.closeConnection 35 | 36 | 0 37 | } 38 | } 39 | 40 | object ParquetLoad { 41 | def main(args: Array[String]) = { 42 | val result = ToolRunner.run(new Configuration, new ParquetLoad, args) 43 | System.exit(result) 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/TsvJob.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone 2 | 3 | import com.stripe.herringbone.flatten.{ParquetFlatConf,ParquetFlatMapper,TypeFlattener} 4 | import com.stripe.herringbone.flatten.FlatConverter 5 | import com.stripe.herringbone.util.ParquetUtils 6 | 7 | import java.io.{BufferedWriter, OutputStreamWriter} 8 | 9 | import org.apache.hadoop.mapreduce._ 10 | import org.apache.hadoop.mapreduce.lib.input._ 11 | import org.apache.hadoop.mapreduce.lib.output._ 12 | import org.apache.hadoop.util._ 13 | import org.apache.hadoop.fs._ 14 | import org.apache.hadoop.conf._ 15 | import org.apache.hadoop.io.Text 16 | 17 | import org.rogach.scallop._ 18 | 19 | import parquet.example.data._ 20 | import parquet.example.data.simple._ 21 | import parquet.hadoop._ 22 | import parquet.hadoop.example._ 23 | import parquet.io.api._ 24 | import parquet.schema._ 25 | 26 | import scala.collection.JavaConversions._ 27 | 28 | class TsvMapper extends ParquetFlatMapper[Text] { 29 | def valueOut(value: Group) = { 30 | val tsvLine = FlatConverter.groupToTSV(value, flattenedSchema, separator, renameId) 31 | new Text(tsvLine) 32 | } 33 | } 34 | 35 | class TsvJob extends Configured with Tool { 36 | override def run(args: Array[String]) = { 37 | val conf = new ParquetFlatConf(args) 38 | val fs = FileSystem.get(getConf) 39 | val inputPath = new Path(conf.inputPath()) 40 | val outputPathString = conf.outputPath.get.getOrElse(conf.inputPath().stripSuffix("/").concat("-tsv")) 41 | val outputPath = new Path(outputPathString) 42 | val previousPath = conf.previousPath.get.map{new Path(_)} 43 | 44 | val separator = conf.separator() 45 | getConf.set(ParquetFlatMapper.SeparatorKey, separator) 46 | 47 | val renameId = conf.renameId() 48 | getConf.set(ParquetFlatMapper.RenameIdKey, renameId.toString) 49 | 50 | if (fs.exists(outputPath)) { 51 | println(s"Deleting existing $outputPath") 52 | fs.delete(outputPath, true) 53 | } 54 | 55 | val flattenedSchema = TypeFlattener.flatten( 56 | ParquetUtils.readSchema(inputPath), 57 | previousPath.map { ParquetUtils.readSchema(_) }, 58 | separator, 59 | renameId 60 | ) 61 | 62 | val jobName = "tsv " + conf.inputPath() + " -> " + outputPathString 63 | val job = new Job(getConf, jobName) 64 | 65 | FileInputFormat.setInputPaths(job, inputPath) 66 | FileOutputFormat.setOutputPath(job, outputPath) 67 | ParquetInputFormat.setReadSupportClass(job, classOf[GroupReadSupport]) 68 | ExampleOutputFormat.setSchema(job, flattenedSchema) 69 | 70 | job.setInputFormatClass(classOf[CompactGroupInputFormat]) 71 | job.setOutputFormatClass(classOf[TextOutputFormat[Text, Text]].asInstanceOf[Class[Nothing]]) 72 | job.setMapperClass(classOf[TsvMapper]) 73 | job.setJarByClass(classOf[TsvJob]) 74 | job.getConfiguration.set("mapreduce.job.user.classpath.first", "true") 75 | job.getConfiguration.setBoolean(ParquetInputFormat.TASK_SIDE_METADATA, false) 76 | job.setNumReduceTasks(0) 77 | 78 | if 
(job.waitForCompletion(true)) { 79 | val headerPath = new Path(outputPathString + "/_header.tsv") 80 | writeHeader(fs, headerPath, flattenedSchema) 81 | 0 82 | } else { 83 | 1 84 | } 85 | } 86 | 87 | def writeHeader(fs: FileSystem, outputPath: Path, schema: MessageType) { 88 | val header = FlatConverter.constructHeader(schema) 89 | val writer = new BufferedWriter(new OutputStreamWriter(fs.create(outputPath, true))) 90 | writer.write(header) 91 | writer.write("\n") 92 | writer.close() 93 | } 94 | } 95 | 96 | object TsvJob { 97 | def main(args: Array[String]) = { 98 | val result = ToolRunner.run(new Configuration, new TsvJob, args) 99 | System.exit(result) 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConsumer.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.flatten 2 | 3 | import org.apache.hadoop.mapreduce._ 4 | import org.apache.hadoop.mapreduce.lib.input._ 5 | import org.apache.hadoop.mapreduce.lib.output._ 6 | import org.apache.hadoop.util._ 7 | import org.apache.hadoop.fs._ 8 | import org.apache.hadoop.conf._ 9 | 10 | import parquet.example.data._ 11 | import parquet.example.data.simple._ 12 | import parquet.hadoop._ 13 | import parquet.hadoop.example._ 14 | import parquet.io.api._ 15 | import parquet.schema._ 16 | 17 | class FlatConsumer(output: Group, separator: String, renameId: Boolean) extends RecordConsumer { 18 | 19 | case class StackFrame(field: String, var values: List[Binary]) 20 | var stack = List[StackFrame]() 21 | // Impala stops working after a field becomes too long. The docs 22 | // indicate that we should have 32k. However, a binary search on a 23 | // too-long field yielded 6776 as the maximum working value. 
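  // endField (below) joins repeated values with "," and swaps tabs for spaces
  // before truncating, and addBinary truncates individual string values on the
  // way in; the cap below is that empirically-found limit.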
24 | val MaxStringBytes = 6776 25 | 26 | def startMessage {} 27 | def endMessage {} 28 | def startGroup {} 29 | def endGroup {} 30 | 31 | def startField(field: String, index: Int) { 32 | stack ::= StackFrame(field, Nil) 33 | } 34 | 35 | def endField(field: String, index: Int) { 36 | if (stack.head.values.size == 1) { 37 | withField{name => output.add(name, stack.head.values.head)} 38 | } else if (stack.head.values.size > 1) { 39 | withField {name => 40 | val joined = Binary.fromString( 41 | stack 42 | .head 43 | .values 44 | .reverse 45 | .map{_.toStringUsingUTF8} 46 | .mkString(",") 47 | .replace("\t", " ") 48 | ) 49 | val truncated = truncate(joined, MaxStringBytes) 50 | output.add(name, truncated) 51 | } 52 | } 53 | stack = stack.tail 54 | } 55 | 56 | def addInteger(value: Int) { 57 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)} 58 | } 59 | 60 | def addLong(value: Long) { 61 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)} 62 | } 63 | 64 | def addBoolean(value: Boolean) { 65 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)} 66 | } 67 | 68 | def truncate(value: Binary, length: Integer): Binary = { 69 | if (value.length <= length) { 70 | value 71 | } else { 72 | val bytesTruncated = new Array[Byte](length) 73 | value.toByteBuffer.get(bytesTruncated, 0, length) 74 | Binary.fromByteArray(bytesTruncated) 75 | } 76 | } 77 | 78 | def addBinary(value: Binary) { 79 | // Truncate strings so Impala doesn't break 80 | val truncated = truncate(value, MaxStringBytes) 81 | writeField(truncated){name => output.add(name, truncated)} 82 | } 83 | 84 | def addFloat(value: Float) { 85 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)} 86 | } 87 | 88 | def addDouble(value: Double) { 89 | writeField{Binary.fromString(value.toString)}{name => output.add(name, value)} 90 | } 91 | 92 | def withField(fn: String=>Unit) { 93 | val path = if (TypeFlattener.omitIdField(stack.head.field, stack.size, renameId)) 94 | stack.tail 95 | else 96 | stack 97 | 98 | val name = path.reverse.map{_.field}.mkString(separator) 99 | if(output.getType.containsField(name)) 100 | fn(name) 101 | } 102 | 103 | def writeField(binRep: =>Binary)(fn: String => Unit) { 104 | withField{name => 105 | val fieldType = output.getType.getType(name) 106 | if(fieldType.asInstanceOf[PrimitiveType].getPrimitiveTypeName == PrimitiveType.PrimitiveTypeName.BINARY) 107 | stack.head.values ::= binRep 108 | else 109 | fn(name) 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/flatten/FlatConverter.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.flatten 2 | 3 | import org.apache.hadoop.fs.Path 4 | import org.apache.hadoop.conf.Configuration 5 | 6 | import parquet.example.data.Group 7 | import parquet.example.data.GroupWriter 8 | import parquet.example.data.simple.SimpleGroup 9 | import parquet.schema.MessageType 10 | 11 | import scala.collection.JavaConversions._ 12 | 13 | object FlatConverter { 14 | def groupToTSV(group: Group, flatSchema: MessageType, separator: String, renameId: Boolean): String = { 15 | val flatGroup = flattenGroup(group, flatSchema, separator, renameId) 16 | val fieldValues = (0 until flatSchema.getFieldCount).map{ field => 17 | val valueCount = flatGroup.getFieldRepetitionCount(field) 18 | if (valueCount == 0) { 19 | "" 20 | 
} else if (valueCount == 1) { 21 | escapeString(flatGroup.getValueToString(field, 0)) 22 | } else { 23 | escapeString(flatGroup.getValueToString(field, 0)) 24 | System.err.println("Warning: Field contains multiple values, extracting only the first") 25 | System.err.println(flatGroup.toString) 26 | } 27 | } 28 | fieldValues.mkString("\t") 29 | } 30 | 31 | def constructHeader(schema: MessageType) = { 32 | schema 33 | .getPaths() 34 | .toList 35 | .map{_(0)} 36 | .mkString("\t") 37 | } 38 | 39 | def flattenGroup(group: Group, flatSchema: MessageType, separator: String, renameId: Boolean) = { 40 | var flatGroup = new SimpleGroup(flatSchema) 41 | val writer = new GroupWriter(new FlatConsumer(flatGroup, separator, renameId), group.getType) 42 | writer.write(group) 43 | flatGroup 44 | } 45 | 46 | private def escapeString(s: String) = { 47 | val quote = "\"" 48 | if (s.contains("\t")) 49 | // This is how pandas escapes tabs and quotes 50 | quote + s.replace(quote, "\"\"") + quote 51 | else 52 | s 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatConf.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.flatten 2 | 3 | import org.rogach.scallop._ 4 | 5 | class ParquetFlatConf(arguments: Seq[String]) extends ScallopConf(arguments) { 6 | val inputPath = opt[String](required = true) 7 | val outputPath = opt[String](descr = "Default is input path with `-flat` or `-tsv` appended as appropriate") 8 | val previousPath = opt[String](descr = "Path of previously generated flat output, so field ordering can be maintained (optional)") 9 | val separator = opt[String](default = Some("__")) 10 | val renameId = opt[Boolean](descr = "Flatten a.b.id as a__b instead of a__b__id") 11 | } 12 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/flatten/ParquetFlatMapper.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.flatten 2 | 3 | import org.apache.hadoop.mapreduce.Mapper 4 | import parquet.example.data.Group 5 | import parquet.schema.{MessageType,MessageTypeParser} 6 | 7 | abstract class ParquetFlatMapper[ValueOut] extends Mapper[Void,Group,Void,ValueOut] { 8 | var flattenedSchema: MessageType = _ 9 | var separator: String = _ 10 | var renameId: Boolean = _ 11 | 12 | override def setup(context: Mapper[Void,Group,Void,ValueOut]#Context) { 13 | // the schema is stored in the job context when we call ExampleOutputFormat.setSchema 14 | flattenedSchema = MessageTypeParser.parseMessageType(context.getConfiguration.get("parquet.example.schema")) 15 | separator = context.getConfiguration.get(ParquetFlatMapper.SeparatorKey) 16 | renameId = context.getConfiguration.get(ParquetFlatMapper.RenameIdKey) == "true" 17 | } 18 | 19 | override def map(key: Void, value: Group, context: Mapper[Void,Group,Void,ValueOut]#Context) { 20 | context.write(key, valueOut(value)) 21 | } 22 | 23 | def valueOut(value: Group): ValueOut 24 | } 25 | 26 | object ParquetFlatMapper { 27 | val SeparatorKey = "herringbone.flatten.separator" 28 | val RenameIdKey = "herringbone.flatten.rename.id" 29 | } 30 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/flatten/TypeFlattener.scala: 
-------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.flatten 2 | 3 | import parquet.schema._ 4 | import java.util.{List=>JList} 5 | import scala.collection.JavaConverters._ 6 | 7 | class TypeFlattener(separator: String, renameId: Boolean) extends TypeConverter[List[Type]] { 8 | def convertPrimitiveType(path: JList[GroupType], primitiveType: PrimitiveType) = { 9 | val typeName = 10 | if(TypeFlattener.isRepeated(primitiveType)) 11 | PrimitiveType.PrimitiveTypeName.BINARY 12 | else 13 | primitiveType.getPrimitiveTypeName 14 | 15 | val types = if (TypeFlattener.omitIdField(primitiveType.getName, path.size, renameId)) 16 | path.asScala.tail 17 | else 18 | (path.asScala.tail :+ primitiveType) 19 | 20 | val name = types.map{_.getName}.mkString(separator) 21 | List(new PrimitiveType(Type.Repetition.OPTIONAL, typeName, primitiveType.getTypeLength, name)) 22 | } 23 | 24 | def convertGroupType(path: JList[GroupType], groupType: GroupType, children: JList[List[Type]]) = { 25 | if(TypeFlattener.isRepeated(groupType)) 26 | Nil 27 | else 28 | flatten(children) 29 | } 30 | 31 | def convertMessageType(messageType: MessageType, children: JList[List[Type]]) = flatten(children) 32 | 33 | def flatten(children: JList[List[Type]]) = children.asScala.flatten.toList 34 | } 35 | 36 | object TypeFlattener { 37 | def flatten(messageType: MessageType, 38 | previousMessageType: Option[MessageType], 39 | separator: String, 40 | renameId: Boolean) = { 41 | val flattened = messageType.convertWith(new TypeFlattener(separator, renameId)) 42 | val fieldsToUse = previousMessageType match { 43 | case Some(prevMessageType) => { 44 | // if passed a previous flattened schema, preserve that field ordering, 45 | // and append any new fields 46 | val prevFields = prevMessageType.getFields.asScala.toList 47 | prevFields ::: flattened.filterNot{prevFields.contains(_)} 48 | } 49 | case None => flattened 50 | } 51 | new MessageType(messageType.getName, fieldsToUse.asJava) 52 | } 53 | 54 | def isRepeated(t: Type) = t.isRepetition(Type.Repetition.REPEATED) 55 | 56 | def omitIdField(fieldName: String, numberOfFields: Integer, renameId: Boolean) = { 57 | renameId && numberOfFields > 1 && (fieldName == "id" || fieldName == "_id") 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/load/FieldUtils.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.load 2 | 3 | import com.stripe.herringbone.util.ParquetUtils 4 | 5 | import org.apache.hadoop.fs._ 6 | 7 | import parquet.schema.{ PrimitiveType, Type } 8 | import parquet.schema.PrimitiveType.PrimitiveTypeName 9 | import parquet.schema.PrimitiveType.PrimitiveTypeName._ 10 | 11 | import scala.collection.JavaConversions._ 12 | 13 | case class FieldUtils(hadoopFs: HadoopFs, schemaTypeMapper: SchemaTypeMapper) { 14 | def findPartitionFields(path: Path) = { 15 | hadoopFs.findPartitions(path).map { 16 | case (name, example) if (example.forall{_.isDigit}) => 17 | "`%s` int".format(name) 18 | case (name, _) => 19 | "`%s` string".format(name) 20 | } 21 | } 22 | 23 | def findTableFields(path: Path) = { 24 | val schema = ParquetUtils.readSchema(path) 25 | tableFieldsFromSchemaFields(schema.getFields) 26 | } 27 | 28 | def tableFieldsFromSchemaFields(fields: Seq[Type]) = { 29 | fields 30 | .filter { f => f.isPrimitive } 31 | .map { f => 32 | "`%s` %s".format(f.getName, 
schemaTypeMapper.getSchemaType(f.asInstanceOf[PrimitiveType].getPrimitiveTypeName)) 33 | }.toList 34 | } 35 | } 36 | 37 | trait SchemaTypeMapper { 38 | def getSchemaType(pt: PrimitiveTypeName): String 39 | } 40 | 41 | object ImpalaHiveSchemaTypeMapper extends SchemaTypeMapper { 42 | def getSchemaType(pt: PrimitiveTypeName) = { 43 | pt match { 44 | case BINARY => "STRING" 45 | case INT32 => "INT" 46 | case INT64 | INT96 => "BIGINT" 47 | case DOUBLE => "DOUBLE" 48 | case BOOLEAN => "BOOLEAN" 49 | case FLOAT => "FLOAT" 50 | case FIXED_LEN_BYTE_ARRAY => "BINARY" 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/load/HadoopFs.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.load 2 | 3 | import com.stripe.herringbone.util.ParquetUtils 4 | 5 | import org.apache.hadoop.conf._ 6 | import org.apache.hadoop.fs._ 7 | import org.apache.hadoop.util._ 8 | 9 | class HadoopFs { 10 | def findAbsolutePath(path: Path) = { 11 | path.getFileSystem(new Configuration).getFileStatus(path).getPath.toUri.toString 12 | } 13 | 14 | def findSortedLeafPaths(path: Path): List[Path] = 15 | findLeafPaths(path).sortBy{case (path,time) => time}.map{_._1} 16 | 17 | def findLeafPaths(path: Path): List[(Path,Long)] = { 18 | val fs = path.getFileSystem(new Configuration) 19 | val parquetFileStatuses = fs.listStatus(path, ParquetUtils.parquetFilter) 20 | if (parquetFileStatuses.size > 0) 21 | List((path, parquetFileStatuses.head.getModificationTime)) 22 | else { 23 | fs.listStatus(path, ParquetUtils.partitionFilter) 24 | .toList 25 | .map{_.getPath} 26 | .flatMap{findLeafPaths(_)} 27 | } 28 | } 29 | 30 | def findPartitions(path: Path) = { 31 | path.toUri.getPath.split("/") 32 | .filter{_.contains("=")} 33 | .map{segment => 34 | val parts = segment.split("=") 35 | (parts(0), parts(1)) 36 | }.toList 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveLoader.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone 2 | 3 | import com.stripe.herringbone.load._ 4 | 5 | import java.sql.ResultSet 6 | 7 | import org.apache.hadoop.conf._ 8 | import org.apache.hadoop.fs._ 9 | import org.apache.hadoop.util._ 10 | 11 | case class HiveLoader(conf: ParquetLoadConf, 12 | hadoopFs: HadoopFs, 13 | fieldUtils: FieldUtils) extends ParquetLoader { 14 | 15 | val connection = HiveServer2Connection(conf.connectionUrl() + ":" + conf.connectionPort()) 16 | 17 | def checkTableExists(table: String, database: String): Boolean = { 18 | useDatabase(database) 19 | var exists: Boolean = false 20 | connection.executeQuery("SHOW TABLES") { resultSet => 21 | val existingTable = resultSet.getString(1).trim 22 | if (existingTable == table) 23 | exists = true 24 | } 25 | exists 26 | } 27 | 28 | def createTable(pathString: String, table: String, database: String = "default") { 29 | val path = new Path(pathString) 30 | val location = hadoopFs.findAbsolutePath(path) 31 | val leafPaths = hadoopFs.findSortedLeafPaths(path) 32 | 33 | if (leafPaths.isEmpty) 34 | error("Could not find parquet files under " + path) 35 | 36 | val tableFields = fieldUtils.findTableFields(leafPaths.last) 37 | val partitionFields = fieldUtils.findPartitionFields(leafPaths.last) 38 | val tableWhileImporting = table + "__import" 39 | 40 | 
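// Build the new table under the temporary "__import" name, then swap it into
// place by dropping any existing table of the target name and renaming (below).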
useDatabase(database) 41 | 42 | createTableWithPartitionFields(location, tableWhileImporting, tableFields, partitionFields) 43 | 44 | connection.execute("DROP TABLE IF EXISTS %s".format(table)) 45 | connection.execute("ALTER TABLE %s RENAME TO %s".format(tableWhileImporting, table)) 46 | 47 | if (!partitionFields.isEmpty) 48 | updateTable(table, database) 49 | } 50 | 51 | def createTableWithPartitionFields(location: String, table: String, tableFields: List[String], 52 | partitionFields: List[String]) { 53 | 54 | connection.execute("DROP TABLE IF EXISTS `%s`".format (table)) 55 | 56 | val tableClause = "CREATE EXTERNAL TABLE IF NOT EXISTS `%s` (%s)".format( 57 | table, tableFields.mkString(", ")) 58 | 59 | val partitionClause = 60 | if (partitionFields.isEmpty) 61 | "" 62 | else 63 | " PARTITIONED BY (%s)".format(partitionFields.mkString(" ,")) 64 | 65 | val storedClause = " STORED AS PARQUET LOCATION \"%s\"".format(location) 66 | 67 | connection.execute(tableClause + partitionClause + storedClause) 68 | } 69 | 70 | def updateTable(table: String, database: String) = { 71 | connection.execute("MSCK REPAIR TABLE %s".format(table)) 72 | } 73 | 74 | def closeConnection() = connection.close 75 | 76 | private def useDatabase(database: String) = { 77 | connection.execute("CREATE DATABASE IF NOT EXISTS %s".format(database)) 78 | connection.execute("USE %s".format(database)) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/load/HiveServer2Connection.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.load 2 | 3 | import java.sql.{ Connection, DriverManager, ResultSet } 4 | 5 | case class HiveServer2Connection(connectionUrl: String) { 6 | lazy val connection: Connection = { 7 | Class.forName("org.apache.hive.jdbc.HiveDriver") 8 | DriverManager.getConnection(connectionUrl) 9 | } 10 | 11 | def execute(query: String) { 12 | try { 13 | println(query) 14 | val statement = connection.createStatement 15 | statement.execute(query) 16 | } catch { 17 | case e: Throwable => e.printStackTrace 18 | } 19 | } 20 | 21 | def executeQuery(query: String)(fn: ResultSet => Unit) { 22 | try { 23 | println(query) 24 | val statement = connection.createStatement 25 | val resultSet = statement.executeQuery(query) 26 | while (resultSet.next) { 27 | fn(resultSet) 28 | } 29 | } catch { 30 | case e: Throwable => e.printStackTrace 31 | } 32 | } 33 | 34 | def close = connection.close 35 | } 36 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/load/ImpalaLoader.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.load 2 | 3 | import com.stripe.herringbone.impala.{ImpalaClient,ImpalaValue} 4 | 5 | import org.apache.hadoop.conf._ 6 | import org.apache.hadoop.util._ 7 | import org.apache.hadoop.fs._ 8 | 9 | case class ImpalaLoader(conf: ParquetLoadConf, 10 | hadoopFs: HadoopFs, 11 | fieldUtils: FieldUtils) extends ParquetLoader { 12 | 13 | lazy val impalaClient = ImpalaClient(conf.connectionUrl(), 14 | conf.connectionPort().toInt) 15 | 16 | def checkTableExists(table: String, database: String): Boolean = { 17 | useDatabase(database) 18 | var exists: Boolean = false 19 | query("SHOW TABLES"){row => 20 | row.foreach { value => 21 | if (value.raw == table) exists = true 22 | } 23 | } 24 | exists 25 | } 26 | 
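// Builds the table in the scratch "importing" database, registers any partitions
// found on disk, then swaps it into the target database via DROP + RENAME
// (and, for unpartitioned tables, optionally runs COMPUTE STATS).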
27 | def createTable(pathString: String, table: String, database: String = "default") { 28 | val path = new Path(pathString) 29 | val location = hadoopFs.findAbsolutePath(path) 30 | val leafPaths = hadoopFs.findSortedLeafPaths(path) 31 | 32 | if(leafPaths.isEmpty) 33 | error("Could not find parquet files under " + path) 34 | 35 | val tableFields = fieldUtils.findTableFields(leafPaths.last) 36 | val partitionFields = fieldUtils.findPartitionFields(leafPaths.last) 37 | 38 | useDatabase("importing") 39 | 40 | createTableWithPartitionFields(location, table, tableFields, partitionFields) 41 | 42 | if(partitionFields.size > 0) 43 | addPartitions(table, leafPaths.map{hadoopFs.findPartitions(_)}) 44 | 45 | useDatabase(database) 46 | execute("DROP TABLE IF EXISTS %s.%s".format(database, table)) 47 | execute("ALTER TABLE importing.%s RENAME TO %s.%s".format(table, database, table)) 48 | if (partitionFields.isEmpty && conf.computeStats()) execute("COMPUTE STATS %s.%s".format(database, table)) 49 | } 50 | 51 | def updateTable(table: String, database: String) { 52 | useDatabase(database) 53 | 54 | val basePath = findBasePath(table) 55 | val tablePartitions = findTablePartitions(table) 56 | val leafPaths = hadoopFs.findSortedLeafPaths(new Path(basePath)) 57 | leafPaths.reverse.foreach{path => 58 | val partitions = hadoopFs.findPartitions(path) 59 | if(!tablePartitions.contains(partitions.map{_._2})) 60 | addPartition(table, partitions) 61 | } 62 | } 63 | 64 | def findBasePath(table: String) = { 65 | var location: String = null 66 | query("DESCRIBE FORMATTED %s".format(table)){row => 67 | if(row(0).raw.startsWith("Location:")) 68 | location = row(1).raw 69 | } 70 | location 71 | } 72 | 73 | def findTablePartitions(table: String) = { 74 | var partitions: List[List[String]] = Nil 75 | query("SHOW TABLE STATS %s".format(table)){row => 76 | if(row.size > 4) 77 | partitions ::= List(row(0).raw) 78 | } 79 | partitions 80 | } 81 | 82 | def createTableWithPartitionFields(location: String, table: String, tableFields: List[String], partitionFields: List[String]) { 83 | execute("DROP TABLE IF EXISTS `%s`".format (table)) 84 | 85 | val tableClause = "CREATE EXTERNAL TABLE IF NOT EXISTS `%s` (%s)".format(table, tableFields.mkString(", ")) 86 | val partitionClause = 87 | if(partitionFields.isEmpty) 88 | "" 89 | else 90 | " PARTITIONED BY (%s)".format(partitionFields.mkString(" ,")) 91 | val storedClause = " STORED AS PARQUETFILE LOCATION \"%s\"".format(location) 92 | 93 | execute(tableClause + partitionClause + storedClause) 94 | } 95 | 96 | def addPartitions(table: String, partitions: List[List[(String, String)]]) { 97 | partitions.foreach{addPartition(table, _)} 98 | } 99 | 100 | def addPartition(table: String, partitions: List[(String,String)]) { 101 | val partitionClause = 102 | partitions.map { 103 | case (name, value) if(value.forall{_.isDigit}) => 104 | "`%s`=%s".format(name, value) 105 | case (name, value) => 106 | "`%s`='%s'".format(name, value) 107 | }.mkString(", ") 108 | 109 | execute("ALTER TABLE %s ADD IF NOT EXISTS PARTITION (%s)".format(table, partitionClause)) 110 | } 111 | 112 | def closeConnection() = {} 113 | 114 | private def useDatabase(database: String) = { 115 | execute("CREATE DATABASE IF NOT EXISTS %s".format(database)) 116 | execute("USE %s".format(database)) 117 | } 118 | 119 | private def execute(stmt: String) { 120 | impalaClient.execute(stmt) 121 | } 122 | 123 | private def query(stmt: String)(fn: Seq[ImpalaValue] => Unit) { 124 | impalaClient.query(stmt){ r => fn(r) } 125 | } 126 | } 
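For orientation, a minimal sketch (not part of the repository) of how the load classes above fit together, along the lines of the ParquetLoad driver; the object name `LoadSketch`, the argument strings in the comments, and the exact branching are illustrative assumptions rather than the project's actual entry point:

    import com.stripe.herringbone.load._

    // Hypothetical driver wiring ParquetLoadConf, HadoopFs, FieldUtils and ImpalaLoader.
    object LoadSketch {
      def main(args: Array[String]): Unit = {
        val conf     = new ParquetLoadConf(args) // e.g. -t events -p /data/events plus connection url/port
        val hadoopFs = new HadoopFs
        val loader   = ImpalaLoader(conf, hadoopFs, FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper))

        // Either pick up new partitions on an existing table, or (re)create it from
        // the parquet files under conf.path() (assumed to have been supplied).
        if (conf.updatePartitions() && loader.checkTableExists(conf.table(), conf.database()))
          loader.updateTable(conf.table(), conf.database())
        else
          loader.createTable(conf.path(), conf.table(), conf.database())

        loader.closeConnection()
      }
    }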
127 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoadConf.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.load 2 | 3 | import org.rogach.scallop._ 4 | 5 | class ParquetLoadConf(arguments: Seq[String]) extends ScallopConf(arguments) { 6 | val database = opt[String](default = Some("default")) 7 | val table = opt[String](required = true) 8 | val path = opt[String]() 9 | val hive = opt[Boolean]("hive") 10 | val connectionUrl = opt[String](required = true) 11 | val connectionPort = opt[String](required = true) 12 | val computeStats = toggle(descrYes = "Compute table stats after loading files into impala. Turn this off for faster loading into impala (but probably slower querying later on!)", default = Some(true)) 13 | val updatePartitions = toggle(descrYes = "Create table if not present, otherwise update with new partitions. If a schema change is being made to an existing table, turn this off.", default = Some(false)) 14 | validateOpt (path, updatePartitions) { 15 | case (None, None) => Left("You must specify at least one of path or update-partitions") 16 | case _ => Right(Unit) 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/load/ParquetLoader.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.load 2 | 3 | trait ParquetLoader { 4 | def checkTableExists(table: String, db: String): Boolean 5 | def updateTable(table: String, db: String): Unit 6 | def createTable(path: String, table: String, db: String): Unit 7 | def closeConnection(): Unit 8 | } 9 | 10 | -------------------------------------------------------------------------------- /herringbone-main/src/main/scala/com/stripe/herringbone/util/ParquetUtils.scala: -------------------------------------------------------------------------------- 1 | package com.stripe.herringbone.util 2 | 3 | import org.apache.hadoop.conf._ 4 | import org.apache.hadoop.util._ 5 | import org.apache.hadoop.fs._ 6 | 7 | import parquet.hadoop.ParquetFileReader 8 | 9 | object ParquetUtils { 10 | def getParquetMetadata(path: Path) = { 11 | // Just use the first parquet file to figure out the impala fields 12 | // This also dodges the problem of any non-parquet files stashed 13 | // in the path. 
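// Note (added): this assumes at least one .parquet file sits directly under `path`;
// otherwise parquetFileStatuses.head below will throw.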
14 | val fs = path.getFileSystem(new Configuration) 15 | val parquetFileStatuses = fs.listStatus(path, parquetFilter) 16 | val representativeParquetPath = parquetFileStatuses.head.getPath 17 | 18 | val footers = ParquetFileReader.readFooters(new Configuration, representativeParquetPath) 19 | footers.get(0).getParquetMetadata 20 | } 21 | 22 | def readSchema(path: Path) = { 23 | getParquetMetadata(path).getFileMetaData.getSchema 24 | } 25 | 26 | def readKeyValueMetaData(path: Path) = { 27 | getParquetMetadata(path).getFileMetaData.getKeyValueMetaData 28 | } 29 | 30 | val parquetFilter = new PathFilter { 31 | def accept(path: Path) = path.getName.endsWith(".parquet") 32 | } 33 | 34 | val partitionFilter = new PathFilter { 35 | def accept(path: Path) = path.getName.contains("=") 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /herringbone-main/src/main/thrift/ImpalaService.thrift: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Cloudera Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | namespace cpp impala 16 | namespace java com.cloudera.impala.thrift 17 | namespace rb impala.protocol 18 | 19 | include "Status.thrift" 20 | include "beeswax.thrift" 21 | include "cli_service.thrift" 22 | 23 | // ImpalaService accepts query execution options through beeswax.Query.configuration in 24 | // key:value form. For example, the list of strings could be: 25 | // "num_nodes:1", "abort_on_error:false" 26 | // The valid keys are listed in this enum. They map to TQueryOptions. 
27 | // Note: If you add an option or change the default, you also need to update: 28 | // - ImpalaInternalService.thrift: TQueryOptions 29 | // - ImpaladClientExecutor.getBeeswaxQueryConfigurations() 30 | // - ImpalaServer::SetQueryOptions() 31 | // - ImpalaServer::TQueryOptionsToMap() 32 | enum TImpalaQueryOptions { 33 | // if true, abort execution on the first error 34 | ABORT_ON_ERROR, 35 | 36 | // maximum # of errors to be reported; Unspecified or 0 indicates backend default 37 | MAX_ERRORS, 38 | 39 | // if true, disable llvm codegen 40 | DISABLE_CODEGEN, 41 | 42 | // batch size to be used by backend; Unspecified or a size of 0 indicates backend 43 | // default 44 | BATCH_SIZE, 45 | 46 | // a per-machine approximate limit on the memory consumption of this query; 47 | // unspecified or a limit of 0 means no limit; 48 | // otherwise specified either as: 49 | // a) an int (= number of bytes); 50 | // b) a float followed by "M" (MB) or "G" (GB) 51 | MEM_LIMIT, 52 | 53 | // specifies the degree of parallelism with which to execute the query; 54 | // 1: single-node execution 55 | // NUM_NODES_ALL: executes on all nodes that contain relevant data 56 | // NUM_NODES_ALL_RACKS: executes on one node per rack that holds relevant data 57 | // > 1: executes on at most that many nodes at any point in time (ie, there can be 58 | // more nodes than numNodes with plan fragments for this query, but at most 59 | // numNodes would be active at any point in time) 60 | // Constants (NUM_NODES_ALL, NUM_NODES_ALL_RACKS) are defined in JavaConstants.thrift. 61 | NUM_NODES, 62 | 63 | // maximum length of the scan range; only applicable to HDFS scan range; Unspecified or 64 | // a length of 0 indicates backend default; 65 | MAX_SCAN_RANGE_LENGTH, 66 | 67 | // Maximum number of io buffers (per disk) 68 | MAX_IO_BUFFERS, 69 | 70 | // Number of scanner threads. 71 | NUM_SCANNER_THREADS, 72 | 73 | // If true, Impala will try to execute on file formats that are not fully supported yet 74 | ALLOW_UNSUPPORTED_FORMATS, 75 | 76 | // if set and > -1, specifies the default limit applied to a top-level SELECT statement 77 | // with an ORDER BY but without a LIMIT clause (ie, if the SELECT statement also has 78 | // a LIMIT clause, this default is ignored) 79 | DEFAULT_ORDER_BY_LIMIT, 80 | 81 | // DEBUG ONLY: 82 | // If set to 83 | // "[:]::", 84 | // the exec node with the given id will perform the specified action in the given 85 | // phase. If the optional backend number (starting from 0) is specified, only that 86 | // backend instance will perform the debug action, otherwise all backends will behave 87 | // in that way. 88 | // If the string doesn't have the required format or if any of its components is 89 | // invalid, the option is ignored. 90 | DEBUG_ACTION, 91 | 92 | // If true, raise an error when the DEFAULT_ORDER_BY_LIMIT has been reached. 93 | ABORT_ON_DEFAULT_LIMIT_EXCEEDED, 94 | 95 | // Compression codec for parquet when inserting into parquet tables. 96 | // Valid values are "snappy", "gzip" and "none" 97 | // Leave blank to use default. 98 | PARQUET_COMPRESSION_CODEC, 99 | 100 | // HBase scan query option. If set and > 0, HBASE_CACHING is the value for 101 | // "hbase.client.Scan.setCaching()" when querying HBase table. Otherwise, use backend 102 | // default. 103 | // If the value is too high, then the hbase region server will have a hard time (GC 104 | // pressure and long response times). If the value is too small, then there will be 105 | // extra trips to the hbase region server. 
106 | HBASE_CACHING, 107 | 108 | // HBase scan query option. If set, HBase scan will always set 109 | // "hbase.client.setCacheBlocks" to CACHE_BLOCKS. Default is false. 110 | // If the table is large and the query is doing big scan, set it to false to 111 | // avoid polluting the cache in the hbase region server. 112 | // If the table is small and the table is used several time, set it to true to improve 113 | // performance. 114 | HBASE_CACHE_BLOCKS, 115 | } 116 | 117 | // The summary of an insert. 118 | struct TInsertResult { 119 | // Number of appended rows per modified partition. Only applies to HDFS tables. 120 | // The keys represent partitions to create, coded as k1=v1/k2=v2/k3=v3..., with the 121 | // root in an unpartitioned table being the empty string. 122 | 1: required map rows_appended 123 | } 124 | 125 | // Response from a call to PingImpalaService 126 | struct TPingImpalaServiceResp { 127 | // The Impala service's version string. 128 | 1: string version 129 | } 130 | 131 | // Parameters for a ResetTable request which will invalidate a table's metadata. 132 | // DEPRECATED. 133 | struct TResetTableReq { 134 | // Name of the table's parent database. 135 | 1: required string db_name 136 | 137 | // Name of the table. 138 | 2: required string table_name 139 | } 140 | 141 | // For all rpc that return a TStatus as part of their result type, 142 | // if the status_code field is set to anything other than OK, the contents 143 | // of the remainder of the result type is undefined (typically not set) 144 | service ImpalaService extends beeswax.BeeswaxService { 145 | // Cancel execution of query. Returns RUNTIME_ERROR if query_id 146 | // unknown. 147 | // This terminates all threads running on behalf of this query at 148 | // all nodes that were involved in the execution. 149 | // Throws BeeswaxException if the query handle is invalid (this doesn't 150 | // necessarily indicate an error: the query might have finished). 151 | Status.TStatus Cancel(1:beeswax.QueryHandle query_id) 152 | throws(1:beeswax.BeeswaxException error); 153 | 154 | // Invalidates all catalog metadata, forcing a reload 155 | // DEPRECATED; execute query "invalidate metadata" to refresh metadata 156 | Status.TStatus ResetCatalog(); 157 | 158 | // Invalidates a specific table's catalog metadata, forcing a reload on the next access 159 | // DEPRECATED; execute query "refresh
" to refresh metadata 160 | Status.TStatus ResetTable(1:TResetTableReq request) 161 | 162 | // Returns the runtime profile string for the given query handle. 163 | string GetRuntimeProfile(1:beeswax.QueryHandle query_id) 164 | throws(1:beeswax.BeeswaxException error); 165 | 166 | // Closes the query handle and return the result summary of the insert. 167 | TInsertResult CloseInsert(1:beeswax.QueryHandle handle) 168 | throws(1:beeswax.QueryNotFoundException error, 2:beeswax.BeeswaxException error2); 169 | 170 | // Client calls this RPC to verify that the server is an ImpalaService. Returns the 171 | // server version. 172 | TPingImpalaServiceResp PingImpalaService(); 173 | } 174 | 175 | // Impala HiveServer2 service 176 | service ImpalaHiveServer2Service extends cli_service.TCLIService { 177 | } 178 | -------------------------------------------------------------------------------- /herringbone-main/src/main/thrift/Status.thrift: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Cloudera Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | namespace cpp impala 16 | namespace java com.cloudera.impala.thrift 17 | namespace rb impala.protocol 18 | 19 | enum TStatusCode { 20 | OK, 21 | CANCELLED, 22 | ANALYSIS_ERROR, 23 | NOT_IMPLEMENTED_ERROR, 24 | RUNTIME_ERROR, 25 | MEM_LIMIT_EXCEEDED, 26 | INTERNAL_ERROR 27 | } 28 | 29 | struct TStatus { 30 | 1: required TStatusCode status_code 31 | 2: list error_msgs 32 | } 33 | -------------------------------------------------------------------------------- /herringbone-main/src/main/thrift/beeswax.thrift: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to Cloudera, Inc. under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. Cloudera, Inc. licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | * 18 | * Interface for interacting with Beeswax Server 19 | */ 20 | 21 | namespace java com.cloudera.beeswax.api 22 | namespace py beeswaxd 23 | namespace cpp beeswax 24 | namespace rb impala.protocol.beeswax 25 | 26 | include "hive_metastore.thrift" 27 | 28 | // A Query 29 | struct Query { 30 | 1: string query; 31 | // A list of HQL commands to execute before the query. 
32 | // This is typically defining UDFs, setting settings, and loading resources. 33 | 3: list configuration; 34 | 35 | // User and groups to "act as" for purposes of Hadoop. 36 | 4: string hadoop_user; 37 | } 38 | 39 | typedef string LogContextId 40 | 41 | enum QueryState { 42 | CREATED, 43 | INITIALIZED, 44 | COMPILED, 45 | RUNNING, 46 | FINISHED, 47 | EXCEPTION 48 | } 49 | 50 | struct QueryHandle { 51 | 1: string id; 52 | 2: LogContextId log_context; 53 | } 54 | 55 | struct QueryExplanation { 56 | 1: string textual 57 | } 58 | 59 | struct Results { 60 | // If set, data is valid. Otherwise, results aren't ready yet. 61 | 1: bool ready, 62 | // Columns for the results 63 | 2: list columns, 64 | // A set of results 65 | 3: list data, 66 | // The starting row of the results 67 | 4: i64 start_row, 68 | // Whether there are more results to fetch 69 | 5: bool has_more 70 | } 71 | 72 | /** 73 | * Metadata information about the results. 74 | * Applicable only for SELECT. 75 | */ 76 | struct ResultsMetadata { 77 | /** The schema of the results */ 78 | 1: hive_metastore.Schema schema, 79 | /** The directory containing the results. Not applicable for partition table. */ 80 | 2: string table_dir, 81 | /** If the results are straight from an existing table, the table name. */ 82 | 3: string in_tablename, 83 | /** Field delimiter */ 84 | 4: string delim, 85 | } 86 | 87 | exception BeeswaxException { 88 | 1: string message, 89 | // Use get_log(log_context) to retrieve any log related to this exception 90 | 2: LogContextId log_context, 91 | // (Optional) The QueryHandle that caused this exception 92 | 3: QueryHandle handle, 93 | 4: optional i32 errorCode = 0, 94 | 5: optional string SQLState = " " 95 | } 96 | 97 | exception QueryNotFoundException { 98 | } 99 | 100 | /** Represents a Hadoop-style configuration variable. */ 101 | struct ConfigVariable { 102 | 1: string key, 103 | 2: string value, 104 | 3: string description 105 | } 106 | 107 | service BeeswaxService { 108 | /** 109 | * Submit a query and return a handle (QueryHandle). The query runs asynchronously. 110 | */ 111 | QueryHandle query(1:Query query) throws(1:BeeswaxException error), 112 | 113 | /** 114 | * run a query synchronously and return a handle (QueryHandle). 115 | */ 116 | QueryHandle executeAndWait(1:Query query, 2:LogContextId clientCtx) 117 | throws(1:BeeswaxException error), 118 | 119 | /** 120 | * Get the query plan for a query. 121 | */ 122 | QueryExplanation explain(1:Query query) 123 | throws(1:BeeswaxException error), 124 | 125 | /** 126 | * Get the results of a query. This is non-blocking. Caller should check 127 | * Results.ready to determine if the results are in yet. The call requests 128 | * the batch size of fetch. 129 | */ 130 | Results fetch(1:QueryHandle query_id, 2:bool start_over, 3:i32 fetch_size=-1) 131 | throws(1:QueryNotFoundException error, 2:BeeswaxException error2), 132 | 133 | /** 134 | * Get the state of the query 135 | */ 136 | QueryState get_state(1:QueryHandle handle) throws(1:QueryNotFoundException error), 137 | 138 | /** 139 | * Get the result metadata 140 | */ 141 | ResultsMetadata get_results_metadata(1:QueryHandle handle) 142 | throws(1:QueryNotFoundException error), 143 | 144 | /** 145 | * Used to test connection to server. A "noop" command. 146 | */ 147 | string echo(1:string s) 148 | 149 | /** 150 | * Returns a string representation of the configuration object being used. 151 | * Handy for debugging. 
152 | */ 153 | string dump_config() 154 | 155 | /** 156 | * Get the log messages related to the given context. 157 | */ 158 | string get_log(1:LogContextId context) throws(1:QueryNotFoundException error) 159 | 160 | /* 161 | * Returns "default" configuration. 162 | */ 163 | list get_default_configuration(1:bool include_hadoop) 164 | 165 | /* 166 | * closes the query with given handle 167 | */ 168 | void close(1:QueryHandle handle) throws(1:QueryNotFoundException error, 169 | 2:BeeswaxException error2) 170 | 171 | /* 172 | * clean the log context for given id 173 | */ 174 | void clean(1:LogContextId log_context) 175 | } 176 | -------------------------------------------------------------------------------- /herringbone-main/src/main/thrift/fb303.thrift: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one 3 | * or more contributor license agreements. See the NOTICE file 4 | * distributed with this work for additional information 5 | * regarding copyright ownership. The ASF licenses this file 6 | * to you under the Apache License, Version 2.0 (the 7 | * "License"); you may not use this file except in compliance 8 | * with the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, 13 | * software distributed under the License is distributed on an 14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 | * KIND, either express or implied. See the License for the 16 | * specific language governing permissions and limitations 17 | * under the License. 18 | */ 19 | 20 | /** 21 | * fb303.thrift 22 | */ 23 | 24 | namespace java com.facebook.fb303 25 | namespace cpp facebook.fb303 26 | namespace rb Impala.Protocol.fb303 27 | 28 | /** 29 | * Common status reporting mechanism across all services 30 | */ 31 | enum fb_status { 32 | DEAD = 0, 33 | STARTING = 1, 34 | ALIVE = 2, 35 | STOPPING = 3, 36 | STOPPED = 4, 37 | WARNING = 5, 38 | } 39 | 40 | /** 41 | * Standard base service 42 | */ 43 | service FacebookService { 44 | 45 | /** 46 | * Returns a descriptive name of the service 47 | */ 48 | string getName(), 49 | 50 | /** 51 | * Returns the version of the service 52 | */ 53 | string getVersion(), 54 | 55 | /** 56 | * Gets the status of this service 57 | */ 58 | fb_status getStatus(), 59 | 60 | /** 61 | * User friendly description of status, such as why the service is in 62 | * the dead or warning state, or what is being started or stopped. 63 | */ 64 | string getStatusDetails(), 65 | 66 | /** 67 | * Gets the counters for this service 68 | */ 69 | map getCounters(), 70 | 71 | /** 72 | * Gets the value of a single counter 73 | */ 74 | i64 getCounter(1: string key), 75 | 76 | /** 77 | * Sets an option 78 | */ 79 | void setOption(1: string key, 2: string value), 80 | 81 | /** 82 | * Gets an option 83 | */ 84 | string getOption(1: string key), 85 | 86 | /** 87 | * Gets all options 88 | */ 89 | map getOptions(), 90 | 91 | /** 92 | * Returns a CPU profile over the given time interval (client and server 93 | * must agree on the profile format). 
94 | */ 95 | string getCpuProfile(1: i32 profileDurationInSec), 96 | 97 | /** 98 | * Returns the unix time that the server has been running since 99 | */ 100 | i64 aliveSince(), 101 | 102 | /** 103 | * Tell the server to reload its configuration, reopen log files, etc 104 | */ 105 | oneway void reinitialize(), 106 | 107 | /** 108 | * Suggest a shutdown to the server 109 | */ 110 | oneway void shutdown(), 111 | 112 | } 113 | -------------------------------------------------------------------------------- /herringbone-main/src/main/thrift/hive_metastore.thrift: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/thrift -java 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | # 22 | # Thrift Service that the MetaStore is built on 23 | # 24 | 25 | include "fb303.thrift" 26 | 27 | namespace java org.apache.hadoop.hive.metastore.api 28 | namespace php metastore 29 | namespace cpp Apache.Hadoop.Hive 30 | namespace rb Impala.Protocol.HiveMetastore 31 | 32 | const string DDL_TIME = "transient_lastDdlTime" 33 | 34 | struct Version { 35 | 1: string version, 36 | 2: string comments 37 | } 38 | 39 | struct FieldSchema { 40 | 1: string name, // name of the field 41 | 2: string type, // type of the field. 
primitive types defined above, specify list, map for lists & maps 42 | 3: string comment 43 | } 44 | 45 | struct Type { 46 | 1: string name, // one of the types in PrimitiveTypes or CollectionTypes or User defined types 47 | 2: optional string type1, // object type if the name is 'list' (LIST_TYPE), key type if the name is 'map' (MAP_TYPE) 48 | 3: optional string type2, // val type if the name is 'map' (MAP_TYPE) 49 | //4: optional list fields // if the name is one of the user defined types 50 | } 51 | 52 | enum HiveObjectType { 53 | GLOBAL = 1, 54 | DATABASE = 2, 55 | TABLE = 3, 56 | PARTITION = 4, 57 | COLUMN = 5, 58 | } 59 | 60 | enum PrincipalType { 61 | USER = 1, 62 | ROLE = 2, 63 | GROUP = 3, 64 | } 65 | 66 | const string HIVE_FILTER_FIELD_OWNER = "hive_filter_field_owner__" 67 | const string HIVE_FILTER_FIELD_PARAMS = "hive_filter_field_params__" 68 | const string HIVE_FILTER_FIELD_LAST_ACCESS = "hive_filter_field_last_access__" 69 | 70 | enum PartitionEventType { 71 | LOAD_DONE = 1, 72 | } 73 | 74 | struct HiveObjectRef{ 75 | 1: HiveObjectType objectType, 76 | 2: string dbName, 77 | 3: string objectName, 78 | 4: list partValues, 79 | 5: string columnName, 80 | } 81 | 82 | struct PrivilegeGrantInfo { 83 | 1: string privilege, 84 | 2: i32 createTime, 85 | 3: string grantor, 86 | 4: PrincipalType grantorType, 87 | 5: bool grantOption, 88 | } 89 | 90 | struct HiveObjectPrivilege { 91 | 1: HiveObjectRef hiveObject, 92 | 2: string principalName, 93 | 3: PrincipalType principalType, 94 | 4: PrivilegeGrantInfo grantInfo, 95 | } 96 | 97 | struct PrivilegeBag { 98 | 1: list privileges, 99 | } 100 | 101 | struct PrincipalPrivilegeSet { 102 | 1: map> userPrivileges, // user name -> privilege grant info 103 | 2: map> groupPrivileges, // group name -> privilege grant info 104 | 3: map> rolePrivileges, //role name -> privilege grant info 105 | } 106 | 107 | struct Role { 108 | 1: string roleName, 109 | 2: i32 createTime, 110 | 3: string ownerName, 111 | } 112 | 113 | // namespace for tables 114 | struct Database { 115 | 1: string name, 116 | 2: string description, 117 | 3: string locationUri, 118 | 4: map parameters, // properties associated with the database 119 | 5: optional PrincipalPrivilegeSet privileges 120 | } 121 | 122 | // This object holds the information needed by SerDes 123 | struct SerDeInfo { 124 | 1: string name, // name of the serde, table name by default 125 | 2: string serializationLib, // usually the class that implements the extractor & loader 126 | 3: map parameters // initialization parameters 127 | } 128 | 129 | // sort order of a column (column name along with asc(1)/desc(0)) 130 | struct Order { 131 | 1: string col, // sort column name 132 | 2: i32 order // asc(1) or desc(0) 133 | } 134 | 135 | // this object holds all the information about physical storage of the data belonging to a table 136 | struct StorageDescriptor { 137 | 1: list cols, // required (refer to types defined above) 138 | 2: string location, // defaults to //tablename 139 | 3: string inputFormat, // SequenceFileInputFormat (binary) or TextInputFormat` or custom format 140 | 4: string outputFormat, // SequenceFileOutputFormat (binary) or IgnoreKeyTextOutputFormat or custom format 141 | 5: bool compressed, // compressed or not 142 | 6: i32 numBuckets, // this must be specified if there are any dimension columns 143 | 7: SerDeInfo serdeInfo, // serialization and deserialization information 144 | 8: list bucketCols, // reducer grouping columns and clustering columns and bucketing columns` 145 | 9: list 
sortCols, // sort order of the data in each bucket 146 | 10: map parameters // any user supplied key value hash 147 | } 148 | 149 | // table information 150 | struct Table { 151 | 1: string tableName, // name of the table 152 | 2: string dbName, // database name ('default') 153 | 3: string owner, // owner of this table 154 | 4: i32 createTime, // creation time of the table 155 | 5: i32 lastAccessTime, // last access time (usually this will be filled from HDFS and shouldn't be relied on) 156 | 6: i32 retention, // retention time 157 | 7: StorageDescriptor sd, // storage descriptor of the table 158 | 8: list partitionKeys, // partition keys of the table. only primitive types are supported 159 | 9: map parameters, // to store comments or any other user level parameters 160 | 10: string viewOriginalText, // original view text, null for non-view 161 | 11: string viewExpandedText, // expanded view text, null for non-view 162 | 12: string tableType, // table type enum, e.g. EXTERNAL_TABLE 163 | 13: optional PrincipalPrivilegeSet privileges, 164 | } 165 | 166 | struct Partition { 167 | 1: list values // string value is converted to appropriate partition key type 168 | 2: string dbName, 169 | 3: string tableName, 170 | 4: i32 createTime, 171 | 5: i32 lastAccessTime, 172 | 6: StorageDescriptor sd, 173 | 7: map parameters, 174 | 8: optional PrincipalPrivilegeSet privileges 175 | } 176 | 177 | struct Index { 178 | 1: string indexName, // unique with in the whole database namespace 179 | 2: string indexHandlerClass, // reserved 180 | 3: string dbName, 181 | 4: string origTableName, 182 | 5: i32 createTime, 183 | 6: i32 lastAccessTime, 184 | 7: string indexTableName, 185 | 8: StorageDescriptor sd, 186 | 9: map parameters, 187 | 10: bool deferredRebuild 188 | } 189 | 190 | // schema of the table/query results etc. 191 | struct Schema { 192 | // column names, types, comments 193 | 1: list fieldSchemas, // delimiters etc 194 | 2: map properties 195 | } 196 | 197 | // Key-value store to be used with selected 198 | // Metastore APIs (create, alter methods). 199 | // The client can pass environment properties / configs that can be 200 | // accessed in hooks. 201 | struct EnvironmentContext { 202 | 1: map properties 203 | } 204 | 205 | exception MetaException { 206 | 1: string message 207 | } 208 | 209 | exception UnknownTableException { 210 | 1: string message 211 | } 212 | 213 | exception UnknownDBException { 214 | 1: string message 215 | } 216 | 217 | exception AlreadyExistsException { 218 | 1: string message 219 | } 220 | 221 | exception InvalidPartitionException { 222 | 1: string message 223 | } 224 | 225 | exception UnknownPartitionException { 226 | 1: string message 227 | } 228 | 229 | exception InvalidObjectException { 230 | 1: string message 231 | } 232 | 233 | exception NoSuchObjectException { 234 | 1: string message 235 | } 236 | 237 | exception IndexAlreadyExistsException { 238 | 1: string message 239 | } 240 | 241 | exception InvalidOperationException { 242 | 1: string message 243 | } 244 | 245 | exception ConfigValSecurityException { 246 | 1: string message 247 | } 248 | 249 | /** 250 | * This interface is live. 
251 | */ 252 | service ThriftHiveMetastore extends fb303.FacebookService 253 | { 254 | void create_database(1:Database database) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3) 255 | Database get_database(1:string name) throws(1:NoSuchObjectException o1, 2:MetaException o2) 256 | void drop_database(1:string name, 2:bool deleteData, 3:bool cascade) throws(1:NoSuchObjectException o1, 2:InvalidOperationException o2, 3:MetaException o3) 257 | list get_databases(1:string pattern) throws(1:MetaException o1) 258 | list get_all_databases() throws(1:MetaException o1) 259 | void alter_database(1:string dbname, 2:Database db) throws(1:MetaException o1, 2:NoSuchObjectException o2) 260 | 261 | // returns the type with given name (make seperate calls for the dependent types if needed) 262 | Type get_type(1:string name) throws(1:MetaException o1, 2:NoSuchObjectException o2) 263 | bool create_type(1:Type type) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3) 264 | bool drop_type(1:string type) throws(1:MetaException o1, 2:NoSuchObjectException o2) 265 | map get_type_all(1:string name) 266 | throws(1:MetaException o2) 267 | 268 | // Gets a list of FieldSchemas describing the columns of a particular table 269 | list get_fields(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3), 270 | 271 | // Gets a list of FieldSchemas describing both the columns and the partition keys of a particular table 272 | list get_schema(1: string db_name, 2: string table_name) throws (1: MetaException o1, 2: UnknownTableException o2, 3: UnknownDBException o3) 273 | 274 | // create a Hive table. Following fields must be set 275 | // tableName 276 | // database (only 'default' for now until Hive QL supports databases) 277 | // owner (not needed, but good to have for tracking purposes) 278 | // sd.cols (list of field schemas) 279 | // sd.inputFormat (SequenceFileInputFormat (binary like falcon tables or u_full) or TextInputFormat) 280 | // sd.outputFormat (SequenceFileInputFormat (binary) or TextInputFormat) 281 | // sd.serdeInfo.serializationLib (SerDe class name eg org.apache.hadoop.hive.serde.simple_meta.MetadataTypedColumnsetSerDe 282 | // * See notes on DDL_TIME 283 | void create_table(1:Table tbl) throws(1:AlreadyExistsException o1, 2:InvalidObjectException o2, 3:MetaException o3, 4:NoSuchObjectException o4) 284 | void create_table_with_environment_context(1:Table tbl, 285 | 2:EnvironmentContext environment_context) 286 | throws (1:AlreadyExistsException o1, 287 | 2:InvalidObjectException o2, 3:MetaException o3, 288 | 4:NoSuchObjectException o4) 289 | // drops the table and all the partitions associated with it if the table has partitions 290 | // delete data (including partitions) if deleteData is set to true 291 | void drop_table(1:string dbname, 2:string name, 3:bool deleteData) 292 | throws(1:NoSuchObjectException o1, 2:MetaException o3) 293 | list get_tables(1: string db_name, 2: string pattern) throws (1: MetaException o1) 294 | list get_all_tables(1: string db_name) throws (1: MetaException o1) 295 | 296 | Table get_table(1:string dbname, 2:string tbl_name) 297 | throws (1:MetaException o1, 2:NoSuchObjectException o2) 298 | list
get_table_objects_by_name(1:string dbname, 2:list tbl_names) 299 | throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3) 300 | 301 | // Get a list of table names that match a filter. 302 | // The filter operators are LIKE, <, <=, >, >=, =, <> 303 | // 304 | // In the filter statement, values interpreted as strings must be enclosed in quotes, 305 | // while values interpreted as integers should not be. Strings and integers are the only 306 | // supported value types. 307 | // 308 | // The currently supported key names in the filter are: 309 | // Constants.HIVE_FILTER_FIELD_OWNER, which filters on the tables' owner's name 310 | // and supports all filter operators 311 | // Constants.HIVE_FILTER_FIELD_LAST_ACCESS, which filters on the last access times 312 | // and supports all filter operators except LIKE 313 | // Constants.HIVE_FILTER_FIELD_PARAMS, which filters on the tables' parameter keys and values 314 | // and only supports the filter operators = and <>. 315 | // Append the parameter key name to HIVE_FILTER_FIELD_PARAMS in the filter statement. 316 | // For example, to filter on parameter keys called "retention", the key name in the filter 317 | // statement should be Constants.HIVE_FILTER_FIELD_PARAMS + "retention" 318 | // Also, = and <> only work for keys that exist 319 | // in the tables. E.g., if you are looking for tables where key1 <> value, it will only 320 | // look at tables that have a value for the parameter key1. 321 | // Some example filter statements include: 322 | // filter = Constants.HIVE_FILTER_FIELD_OWNER + " like \".*test.*\" and " + 323 | // Constants.HIVE_FILTER_FIELD_LAST_ACCESS + " = 0"; 324 | // filter = Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"30\" or " + 325 | // Constants.HIVE_FILTER_FIELD_PARAMS + "retention = \"90\"" 326 | // @param dbName 327 | // The name of the database from which you will retrieve the table names 328 | // @param filterType 329 | // The type of filter 330 | // @param filter 331 | // The filter string 332 | // @param max_tables 333 | // The maximum number of tables returned 334 | // @return A list of table names that match the desired filter 335 | list get_table_names_by_filter(1:string dbname, 2:string filter, 3:i16 max_tables=-1) 336 | throws (1:MetaException o1, 2:InvalidOperationException o2, 3:UnknownDBException o3) 337 | 338 | // alter table applies to only future partitions not for existing partitions 339 | // * See notes on DDL_TIME 340 | void alter_table(1:string dbname, 2:string tbl_name, 3:Table new_tbl) 341 | throws (1:InvalidOperationException o1, 2:MetaException o2) 342 | void alter_table_with_environment_context(1:string dbname, 2:string tbl_name, 343 | 3:Table new_tbl, 4:EnvironmentContext environment_context) 344 | throws (1:InvalidOperationException o1, 2:MetaException o2) 345 | // the following applies to only tables that have partitions 346 | // * See notes on DDL_TIME 347 | Partition add_partition(1:Partition new_part) 348 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) 349 | Partition add_partition_with_environment_context(1:Partition new_part, 350 | 2:EnvironmentContext environment_context) 351 | throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 352 | 3:MetaException o3) 353 | i32 add_partitions(1:list new_parts) 354 | throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3) 355 | Partition append_partition(1:string db_name, 2:string tbl_name, 3:list part_vals) 356 | throws 
357 |   Partition append_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name)
358 |                        throws (1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
359 |   bool drop_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:bool deleteData)
360 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
361 |   bool drop_partition_by_name(1:string db_name, 2:string tbl_name, 3:string part_name, 4:bool deleteData)
362 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
363 |   Partition get_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals)
364 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
365 |
366 |   Partition get_partition_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals,
367 |       4: string user_name, 5: list<string> group_names) throws(1:MetaException o1, 2:NoSuchObjectException o2)
368 |
369 |   Partition get_partition_by_name(1:string db_name 2:string tbl_name, 3:string part_name)
370 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
371 |
372 |   // returns all the partitions for this table in reverse chronological order.
373 |   // If max parts is given then it will return only that many.
374 |   list<Partition> get_partitions(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
375 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
376 |   list<Partition> get_partitions_with_auth(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1,
377 |       4: string user_name, 5: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
378 |
379 |   list<string> get_partition_names(1:string db_name, 2:string tbl_name, 3:i16 max_parts=-1)
380 |                        throws(1:MetaException o2)
381 |
382 |   // get_partition*_ps methods allow filtering by a partial partition specification,
383 |   // as needed for dynamic partitions. The values that are not restricted should
384 |   // be empty strings. Nulls were considered (instead of "") but caused errors in
385 |   // generated Python code. The size of part_vals may be smaller than the
386 |   // number of partition columns - the unspecified values are considered the same
387 |   // as "".
388 |   list<Partition> get_partitions_ps(1:string db_name 2:string tbl_name
389 |       3:list<string> part_vals, 4:i16 max_parts=-1)
390 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
391 |   list<Partition> get_partitions_ps_with_auth(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1,
392 |       5: string user_name, 6: list<string> group_names) throws(1:NoSuchObjectException o1, 2:MetaException o2)
393 |
394 |   list<string> get_partition_names_ps(1:string db_name,
395 |       2:string tbl_name, 3:list<string> part_vals, 4:i16 max_parts=-1)
396 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
397 |
398 |   // get the partitions matching the given partition filter
399 |   list<Partition> get_partitions_by_filter(1:string db_name 2:string tbl_name
400 |       3:string filter, 4:i16 max_parts=-1)
401 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
402 |
403 |   // get partitions given a list of partition names
404 |   list<Partition> get_partitions_by_names(1:string db_name 2:string tbl_name 3:list<string> names)
405 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
406 |
407 |   // changes the partition to the new partition object. partition is identified from the part values
408 |   // in the new_part
409 |   // * See notes on DDL_TIME
410 |   void alter_partition(1:string db_name, 2:string tbl_name, 3:Partition new_part)
411 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
412 |
413 |   void alter_partition_with_environment_context(1:string db_name,
414 |       2:string tbl_name, 3:Partition new_part,
415 |       4:EnvironmentContext environment_context)
416 |       throws (1:InvalidOperationException o1, 2:MetaException o2)
417 |
418 |   // rename the old partition to the new partition object by changing old part values to the part values
419 |   // in the new_part. old partition is identified from part_vals.
420 |   // partition keys in new_part should be the same as those in old partition.
421 |   void rename_partition(1:string db_name, 2:string tbl_name, 3:list<string> part_vals, 4:Partition new_part)
422 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
423 |
424 |   // gets the value of the configuration key in the metastore server. returns
425 |   // defaultValue if the key does not exist. if the configuration key does not
426 |   // begin with "hive", "mapred", or "hdfs", a ConfigValSecurityException is
427 |   // thrown.
428 |   string get_config_value(1:string name, 2:string defaultValue)
429 |                           throws(1:ConfigValSecurityException o1)
430 |
431 |   // converts a partition name into a partition values array
432 |   list<string> partition_name_to_vals(1: string part_name)
433 |                           throws(1: MetaException o1)
434 |   // converts a partition name into a partition specification (a mapping from
435 |   // the partition cols to the values)
436 |   map<string, string> partition_name_to_spec(1: string part_name)
437 |                           throws(1: MetaException o1)
438 |
439 |   void markPartitionForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
440 |       4:PartitionEventType eventType) throws (1: MetaException o1, 2: NoSuchObjectException o2,
441 |       3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
442 |       6: InvalidPartitionException o6)
443 |   bool isPartitionMarkedForEvent(1:string db_name, 2:string tbl_name, 3:map<string,string> part_vals,
444 |       4: PartitionEventType eventType) throws (1: MetaException o1, 2:NoSuchObjectException o2,
445 |       3: UnknownDBException o3, 4: UnknownTableException o4, 5: UnknownPartitionException o5,
446 |       6: InvalidPartitionException o6)
447 |
448 |   //index
449 |   Index add_index(1:Index new_index, 2: Table index_table)
450 |                        throws(1:InvalidObjectException o1, 2:AlreadyExistsException o2, 3:MetaException o3)
451 |   void alter_index(1:string dbname, 2:string base_tbl_name, 3:string idx_name, 4:Index new_idx)
452 |                        throws (1:InvalidOperationException o1, 2:MetaException o2)
453 |   bool drop_index_by_name(1:string db_name, 2:string tbl_name, 3:string index_name, 4:bool deleteData)
454 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
455 |   Index get_index_by_name(1:string db_name 2:string tbl_name, 3:string index_name)
456 |                        throws(1:MetaException o1, 2:NoSuchObjectException o2)
457 |
458 |   list<Index> get_indexes(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
459 |                        throws(1:NoSuchObjectException o1, 2:MetaException o2)
460 |   list<string> get_index_names(1:string db_name, 2:string tbl_name, 3:i16 max_indexes=-1)
461 |                        throws(1:MetaException o2)
462 |
463 |   //authorization privileges
464 |
465 |   bool create_role(1:Role role) throws(1:MetaException o1)
466 |   bool drop_role(1:string role_name) throws(1:MetaException o1)
467 |   list<string> get_role_names() throws(1:MetaException o1)
468 |   bool grant_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type,
469 |       4:string grantor, 5:PrincipalType grantorType, 6:bool grant_option) throws(1:MetaException o1)
470 |   bool revoke_role(1:string role_name, 2:string principal_name, 3:PrincipalType principal_type)
471 |                        throws(1:MetaException o1)
472 |   list<Role> list_roles(1:string principal_name, 2:PrincipalType principal_type) throws(1:MetaException o1)
473 |
474 |   PrincipalPrivilegeSet get_privilege_set(1:HiveObjectRef hiveObject, 2:string user_name,
475 |       3: list<string> group_names) throws(1:MetaException o1)
476 |   list<HiveObjectPrivilege> list_privileges(1:string principal_name, 2:PrincipalType principal_type,
477 |       3: HiveObjectRef hiveObject) throws(1:MetaException o1)
478 |
479 |   bool grant_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
480 |   bool revoke_privileges(1:PrivilegeBag privileges) throws(1:MetaException o1)
481 |
482 |   // this is used by metastore client to send UGI information to metastore server immediately
483 |   // after setting up a connection.
484 |   list<string> set_ugi(1:string user_name, 2:list<string> group_names) throws (1:MetaException o1)
485 |
486 |   //Authentication (delegation token) interfaces
487 |
488 |   // get metastore server delegation token for use from the map/reduce tasks to authenticate
489 |   // to metastore server
490 |   string get_delegation_token(1:string token_owner, 2:string renewer_kerberos_principal_name)
491 |     throws (1:MetaException o1)
492 |
493 |   // method to renew delegation token obtained from metastore server
494 |   i64 renew_delegation_token(1:string token_str_form) throws (1:MetaException o1)
495 |
496 |   // method to cancel delegation token obtained from metastore server
497 |   void cancel_delegation_token(1:string token_str_form) throws (1:MetaException o1)
498 | }
499 |
500 | // * Note about the DDL_TIME: When creating or altering a table or a partition,
501 | // if the DDL_TIME is not set, the current time will be used.
502 |
503 | // For storing info about archived partitions in parameters
504 |
505 | // Whether the partition is archived
506 | const string IS_ARCHIVED = "is_archived",
507 | // The original location of the partition, before archiving. After archiving,
508 | // this directory will contain the archive. When the partition
509 | // is dropped, this directory will be deleted
510 | const string ORIGINAL_LOCATION = "original_location",
511 |
512 | // these should be needed only for backward compatibility with filestore
513 | const string META_TABLE_COLUMNS = "columns",
514 | const string META_TABLE_COLUMN_TYPES = "columns.types",
515 | const string BUCKET_FIELD_NAME = "bucket_field_name",
516 | const string BUCKET_COUNT = "bucket_count",
517 | const string FIELD_TO_DIMENSION = "field_to_dimension",
518 | const string META_TABLE_NAME = "name",
519 | const string META_TABLE_DB = "db",
520 | const string META_TABLE_LOCATION = "location",
521 | const string META_TABLE_SERDE = "serde",
522 | const string META_TABLE_PARTITION_COLUMNS = "partition_columns",
523 | const string FILE_INPUT_FORMAT = "file.inputformat",
524 | const string FILE_OUTPUT_FORMAT = "file.outputformat",
525 | const string META_TABLE_STORAGE = "storage_handler",
526 |
527 |
528 |
529 |
--------------------------------------------------------------------------------
/herringbone-main/src/test/resources/test.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stripe-archive/herringbone/4f0524287ef47fc897702d654572bbeee1004879/herringbone-main/src/test/resources/test.parquet
--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/FlattenJobTest.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.test
 2 |
 3 | import com.stripe.herringbone.flatten._
 4 | import org.scalatest._
 5 | import parquet.example.Paper
 6 | import parquet.io.api.Binary
 7 |
 8 | class FlattenJobTest extends FlatSpec with Matchers {
 9 |   def toBinary(x: Array[Byte]) = Binary.fromByteArray(x)
10 |
11 |   "truncate" should "truncate to correct length" in {
12 |     val consumer = new FlatConsumer(Paper.r1, "__", false)
13 |     val bytes = toBinary(Array[Byte](1,2,3,4))
14 |     assert(consumer.truncate(bytes, 3).getBytes().sameElements(Array[Byte](1,2,3)))
15 |   }
16 |
17 |   "truncate" should "not truncate if unnecessary" in {
18 |     val consumer = new FlatConsumer(Paper.r1, "__", false)
19 |     val bytes = toBinary(Array[Byte](1,2,3,4))
20 |     assert(consumer.truncate(bytes, 8) == bytes)
21 |   }
22 | }
23 |
--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/FlatConverterTest.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.test
 2 |
 3 | import com.stripe.herringbone.flatten.{FlatConverter,TypeFlattener}
 4 |
 5 | import org.scalatest._
 6 | import org.apache.hadoop.fs.Path
 7 |
 8 | import parquet.example.Paper
 9 | import parquet.example.data.simple.SimpleGroup
10 | import parquet.example.data.GroupWriter
11 | import parquet.schema.MessageType
12 | import parquet.schema.PrimitiveType
13 | import parquet.schema.Type.Repetition.OPTIONAL
14 | import parquet.schema.Type.Repetition.REQUIRED
15 | import parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY
16 |
17 | import scala.collection.mutable.StringBuilder
18 | import java.io.StringWriter
19 |
20 | class FlatConverterTest extends FlatSpec with Matchers {
21 |
22 |   def nestedGroupFixture =
23 |     new {
24 |       val group = Paper.r1
25 |       val schema = Paper.schema
26 |       val flatSchema = TypeFlattener.flatten(schema, None, "__", true)
27 |       val flatGroup = FlatConverter.flattenGroup(group, flatSchema, "__", true)
28 |     }
29 |
30 |   def flatGroupFixture =
31 |     new {
32 |       val flatSchema =
33 |         new MessageType("Charge",
34 |           new PrimitiveType(REQUIRED, BINARY, "_id"),
35 |           new PrimitiveType(OPTIONAL, BINARY, "email"),
36 |           new PrimitiveType(REQUIRED, BINARY, "merchant")
37 |         )
38 |       val flatGroupMissingFields = new SimpleGroup(flatSchema)
39 |       flatGroupMissingFields.add("_id", "ch_1")
40 |       flatGroupMissingFields.add("merchant", "acct_1")
41 |       val flatGroupAllFields = new SimpleGroup(flatSchema)
42 |       flatGroupAllFields.add("email", "bob@stripe.com")
43 |       flatGroupAllFields.add("merchant", "acct_1")
44 |       flatGroupAllFields.add("_id", "ch_1")
45 |     }
46 |
47 |   "groupToTSV" should "convert a flattened group" in {
48 |     val f = nestedGroupFixture
49 |     val groupTSV = FlatConverter.groupToTSV(f.flatGroup, f.flatSchema, "__", true)
50 |     assert(groupTSV == "10\t\t20,40,60")
51 |   }
52 |
53 |   "groupToTSV" should "respect schema ordering, handle optional fields" in {
54 |     val f = flatGroupFixture
55 |     val missingTSV = FlatConverter.groupToTSV(f.flatGroupMissingFields, f.flatSchema, "__", true)
56 |     assert(missingTSV == "ch_1\t\tacct_1")
57 |     val allTSV = FlatConverter.groupToTSV(f.flatGroupAllFields, f.flatSchema, "__", true)
58 |     assert(allTSV == "ch_1\tbob@stripe.com\tacct_1")
59 |   }
60 | }
61 |
62 |
--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/flatten/TypeFlattenerTest.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.test
 2 |
 3 | import com.stripe.herringbone.flatten.TypeFlattener
 4 |
 5 | import org.scalatest._
 6 |
 7 | import parquet.schema.GroupType
 8 | import parquet.schema.MessageType
 9 | import parquet.schema.PrimitiveType
10 | import parquet.schema.Type.Repetition.OPTIONAL
11 | import parquet.schema.Type.Repetition.REPEATED
12 | import parquet.schema.Type.Repetition.REQUIRED
13 | import parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY
14 | import parquet.schema.PrimitiveType.PrimitiveTypeName.INT64
15 |
16 | class TypeFlattenerTest extends FlatSpec with Matchers {
17 |
18 |   "flatten" should "omit the idField in nested fieldname if specified" in {
19 |     val input = new MessageType("Document",
20 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
21 |       new GroupType(OPTIONAL, "Page",
22 |         new PrimitiveType(OPTIONAL, BINARY, "_id")))
23 |
24 |     val expected = new MessageType("Document",
25 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
26 |       new PrimitiveType(OPTIONAL, BINARY, "Page"))
27 |
28 |     val result = TypeFlattener.flatten(input, None, "__", true)
29 |     assert(expected == result)
30 |   }
31 |
32 |   "flatten" should "not omit the idField in nested fieldname if none is specified" in {
33 |     val input = new MessageType("Document",
34 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
35 |       new GroupType(OPTIONAL, "Page",
36 |         new PrimitiveType(OPTIONAL, BINARY, "_id")))
37 |
38 |     val expected = new MessageType("Document",
39 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
40 |       new PrimitiveType(OPTIONAL, BINARY, "Page___id"))
41 |
42 |     val result = TypeFlattener.flatten(input, None, "__", false)
43 |     assert(expected == result)
44 |   }
45 |
46 |   "flatten" should "not include repeated groups" in {
47 |     val input = new MessageType("Document",
48 |       new PrimitiveType(OPTIONAL, BINARY, "_id"),
49 |       new GroupType(REPEATED, "Nope",
50 |         new PrimitiveType(REPEATED, INT64, "Never")))
51 |
52 |     val expected = new MessageType("Document",
53 |       new PrimitiveType(OPTIONAL, BINARY, "_id"))
54 |
55 |     val result = TypeFlattener.flatten(input, None, "__", true)
56 |     assert(expected == result)
57 |   }
58 |
59 |   "flatten" should "set all fields as optional" in {
60 |     val input = new MessageType("Document",
61 |       new GroupType(OPTIONAL, "Yep",
62 |         new GroupType(REQUIRED, "Grouped",
63 |           new PrimitiveType(REQUIRED, BINARY, "Yes"),
64 |           new PrimitiveType(REPEATED, BINARY, "Maybe")),
65 |         new PrimitiveType(OPTIONAL, BINARY, "Sometimes")))
66 |
67 |     val expected = new MessageType("Document",
68 |       new PrimitiveType(OPTIONAL, BINARY, "Yep__Grouped__Yes"),
69 |       new PrimitiveType(OPTIONAL, BINARY, "Yep__Grouped__Maybe"),
70 |       new PrimitiveType(OPTIONAL, BINARY, "Yep__Sometimes"))
71 |
72 |     val result = TypeFlattener.flatten(input, None, "__", true)
73 |     assert(expected == result)
74 |   }
75 |
76 |   "flatten" should "preserve the order of previously flattened fields" in {
77 |     val input = new MessageType("Document",
78 |       new PrimitiveType(REQUIRED, BINARY, "Old__Two"),
79 |       new GroupType(OPTIONAL, "New",
80 |         new PrimitiveType(REQUIRED, BINARY, "One")),
81 |       new PrimitiveType(REQUIRED, BINARY, "Old__One"))
82 |
83 |     val old = new MessageType("Document",
84 |       new PrimitiveType(OPTIONAL, BINARY, "Old__One"),
85 |       new PrimitiveType(OPTIONAL, BINARY, "Old__Two"))
86 |
87 |     val expected = new MessageType("Document",
88 |       new PrimitiveType(OPTIONAL, BINARY, "Old__One"),
89 |       new PrimitiveType(OPTIONAL, BINARY, "Old__Two"),
90 |       new PrimitiveType(OPTIONAL, BINARY, "New__One"))
91 |
92 |     val result = TypeFlattener.flatten(input, Some(old), "__", true)
93 |     assert(expected == result)
94 |   }
95 | }
96 |
--------------------------------------------------------------------------------
/herringbone-main/src/test/scala/com/stripe/herringbone/load/FieldUtilsTest.scala:
--------------------------------------------------------------------------------
 1 | package com.stripe.herringbone.test.load
 2 |
 3 | import com.stripe.herringbone.load.{FieldUtils, HadoopFs, ImpalaHiveSchemaTypeMapper}
 4 | import org.apache.hadoop.fs._
 5 | import org.scalamock.scalatest.MockFactory
 6 | import org.scalatest._
 7 | import parquet.schema.{PrimitiveType, Type}
 8 |
 9 | class FieldUtilsTest extends FlatSpec with Matchers with MockFactory {
10 |
11 |   "findPartitionFields" should "find the partition field names and types" in {
12 |     val hadoopFs = mock[HadoopFs]
13 |     val path = new Path("path")
14 |
15 |     val partitions = List(("day", "123"), ("type", "foo"))
16 |     (hadoopFs.findPartitions _).expects(path).returning(partitions)
17 |
18 |     val expected = List("`day` int", "`type` string")
19 |     FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper).findPartitionFields(path) should equal (expected)
20 |   }
21 |
22 |   "tableFieldsFromSchemaFields" should "find the table fields from the parquet schema" in {
23 |     val hadoopFs = mock[HadoopFs]
24 |     val optional = Type.Repetition.valueOf("OPTIONAL")
25 |     val input = List(
26 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("BINARY"), "a"),
27 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT32"), "b"),
28 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT64"), "c"),
29 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("INT96"), "d"),
30 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("DOUBLE"), "e"),
31 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("BOOLEAN"), "f"),
32 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("FLOAT"), "g"),
33 |       new PrimitiveType(optional, PrimitiveType.PrimitiveTypeName.valueOf("FIXED_LEN_BYTE_ARRAY"), "h")
34 |     )
35 |
36 |     val expected = List(
37 |       "`a` STRING",
38 |       "`b` INT",
39 |       "`c` BIGINT",
40 |       "`d` BIGINT",
41 |       "`e` DOUBLE",
42 |       "`f` BOOLEAN",
43 |       "`g` FLOAT",
44 |       "`h` BINARY"
45 |     )
46 |
47 |     FieldUtils(hadoopFs, ImpalaHiveSchemaTypeMapper).tableFieldsFromSchemaFields(input) should equal (expected)
48 |   }
49 | }
50 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |
 5 |   <groupId>com.stripe</groupId>
 6 |   <artifactId>herringbone</artifactId>
 7 |   <version>0.0.1</version>
 8 |   <packaging>pom</packaging>
 9 |
10 |   <name>Herringbone</name>
11 |
12 |   <modules>
13 |     <module>herringbone-impala</module>
14 |     <module>herringbone-main</module>
15 |   </modules>
16 |
17 | </project>
18 |
--------------------------------------------------------------------------------
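
As a point of reference for the vendored hive_metastore.thrift IDL shown above: the Thrift compiler turns the ThriftHiveMetastore service into a generated client class, and the sketch below (not part of the repository) shows roughly what a call against that interface looks like from Scala. It assumes libthrift and the Thrift-generated Java classes are on the classpath, that the generated package is the usual org.apache.hadoop.hive.metastore.api, and that metastore.example.com:9083 is a placeholder for an unsecured metastore endpoint; a Kerberized deployment would additionally need a SASL transport.

import org.apache.thrift.protocol.TBinaryProtocol
import org.apache.thrift.transport.TSocket

// Generated by the Thrift compiler from the `service ThriftHiveMetastore` definition above.
// The Java namespace is declared earlier in the .thrift file; upstream Hive uses this package.
import org.apache.hadoop.hive.metastore.api.ThriftHiveMetastore

import scala.collection.JavaConverters._

object MetastoreClientSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder host/port of an unsecured metastore Thrift endpoint.
    val transport = new TSocket("metastore.example.com", 9083)
    transport.open()
    try {
      val client = new ThriftHiveMetastore.Client(new TBinaryProtocol(transport))
      // list<string> get_all_databases(), as declared in the IDL
      val databases = client.get_all_databases().asScala
      databases.foreach { db =>
        // list<string> get_all_tables(1: string db_name)
        println(s"$db -> ${client.get_all_tables(db).asScala.mkString(", ")}")
      }
    } finally {
      transport.close()
    }
  }
}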