├── .gitignore ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── settings.gradle ├── src └── main │ ├── proto │ └── Example.proto │ └── java │ └── com │ └── example │ └── ProtobufToParquet.java ├── gradlew.bat ├── README.md └── gradlew /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | *.ipr 3 | *.iws 4 | .idea/** 5 | .gradle/** 6 | build/** 7 | 8 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rdblue/parquet-avro-protobuf/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Wed Oct 07 14:59:35 PDT 2015 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.7-bin.zip 7 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | /* 2 | * This settings file was auto generated by the Gradle buildInit task 3 | * by 'blue' at '10/7/15 2:59 PM' with Gradle 2.7 4 | * 5 | * The settings file is used to specify which projects to include in your build. 6 | * In a single project build this file can be empty or even removed. 
7 | * 8 | * Detailed information about configuring a multi-project build in Gradle can be found 9 | * in the user guide at https://docs.gradle.org/2.7/userguide/multi_project_builds.html 10 | */ 11 | 12 | /* 13 | // To declare projects as part of a multi-project build use the 'include' method 14 | include 'shared' 15 | include 'api' 16 | include 'services:webservice' 17 | */ 18 | 19 | rootProject.name = 'parquet-avro-protobuf' 20 | -------------------------------------------------------------------------------- /src/main/proto/Example.proto: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Cloudera, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package ExampleProtobuf; 18 | 19 | option java_package = "com.example"; 20 | 21 | message ExampleMessage { 22 | required int64 id = 1; 23 | repeated string strings = 2; 24 | } 25 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 16 | set APP_BASE_NAME=%~n0 17 | set APP_HOME=%DIRNAME% 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 
45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windowz variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Converting Protobuf to Parquet via Avro 2 | 3 | ### Why? 4 | 5 | This example shows how to convert a Protobuf file to a Parquet file using 6 | Parquet's Avro object model and Avro's support for protobuf objects. Parquet 7 | has a module to work directly with Protobuf objects, but this isn't always a 8 | good option when writing data for other readers, like Hive. 9 | 10 | The reason is that Parquet and Protobuf use the same schema definitions. 
Both 11 | support required, optional, and repeated data fields and use repeated to encode 12 | arrays. The mapping from Protobuf to Parquet is always 1-to-1. 13 | 14 | Other object models, like Avro, allow arrays to be null or to contain null 15 | elements and have an annotation, [LIST][list-annotation-docs], for encoding 16 | these more complicated structures in Parquet's schema format using extra hidden 17 | layers. More object models use this structure than bare repeated fields, so it 18 | is desirable to use it when converting. 19 | 20 | The easiest way to use the complex LIST structure for protobuf data is to write 21 | using parquet-avro and use Avro's support for Protobuf objects, avro-protobuf. 22 | 23 | [list-annotation-docs]: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md 24 | 25 | ### Code 26 | 27 | Conversion is done in the [`writeProtobufToParquetAvro` method][write-proto-method]. 28 | The first step is to get a handle to Avro's Protobuf object model using 29 | `ProtobufData.get()`. 30 | 31 | ```Java 32 | ProtobufData model = ProtobufData.get(); 33 | ``` 34 | 35 | The Protobuf object model is used to convert the Protobuf data class, 36 | `ExampleMessage`, into an Avro schema. 37 | 38 | ```Java 39 | Schema schema = model.getSchema(ExampleMessage.class); 40 | ``` 41 | 42 | Then, the Protobuf object model is passed to the builder when creating a 43 | `ParquetWriter`. 44 | 45 | ```Java 46 | ParquetWriter parquetWriter = AvroParquetWriter 47 | .builder(new Path(parquetFile)) 48 | .withDataModel(model) // use the protobuf data model 49 | .withSchema(schema) // Avro schema for the protobuf data 50 | .build(); 51 | ``` 52 | 53 | Once the parquet-avro writer is configured to use Avro's protobuf support, it 54 | is able to write protobuf messages to the outgoing Parquet file. 
55 | 56 | ```Java 57 | ExampleMessage m; 58 | while ((m = ExampleMessage.parseDelimitedFrom(protoStream)) != null) { 59 | parquetWriter.write(m); 60 | } 61 | ``` 62 | 63 | [write-proto-method]: https://github.com/rdblue/parquet-avro-protobuf/blob/master/src/main/java/com/example/ProtobufToParquet.java#L59 64 | 65 | ### Result 66 | 67 | After running the example, you will end up with `example.parquet` in `/tmp`. 68 | Using `parquet-tools` to view the schema shows the correct 3-level list 69 | representation. 70 | 71 | ``` 72 | message com.example.Example$.ExampleMessage { 73 | required int64 id; 74 | required group strings (LIST) { 75 | repeated group list { 76 | required binary element (UTF8); 77 | } 78 | } 79 | } 80 | ``` 81 | 82 | The original protobuf schema did not include the LIST annotation or the 83 | additional levels needed for compatibility. 84 | 85 | ``` 86 | message ExampleMessage { 87 | required int64 id = 1; 88 | repeated string strings = 2; 89 | } 90 | ``` 91 | 92 | The data looks like this when converted to JSON: 93 | 94 | ``` 95 | {"id": 0, "strings": ["a", "b", "c"]} 96 | {"id": 1, "strings": ["b", "c", "d"]} 97 | {"id": 2, "strings": ["c", "d", "e"]} 98 | {"id": 3, "strings": ["d", "e", "f"]} 99 | {"id": 4, "strings": ["e", "f", "g"]} 100 | ``` 101 | -------------------------------------------------------------------------------- /src/main/java/com/example/ProtobufToParquet.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2015 Cloudera, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.example; 18 | 19 | import com.example.Example.ExampleMessage; 20 | import org.apache.avro.Schema; 21 | import org.apache.avro.protobuf.ProtobufData; 22 | import org.apache.hadoop.conf.Configuration; 23 | import org.apache.hadoop.fs.Path; 24 | import org.apache.parquet.avro.AvroParquetWriter; 25 | import org.apache.parquet.hadoop.ParquetWriter; 26 | import java.io.File; 27 | import java.io.FileInputStream; 28 | import java.io.FileOutputStream; 29 | import java.io.IOException; 30 | import java.util.Arrays; 31 | 32 | public class ProtobufToParquet { 33 | public static final String ALPHABET = "abcdefghijklmnopqrstuvwxyz"; 34 | 35 | public static String letter(int ordinal) { 36 | int start = (ordinal % ALPHABET.length()); 37 | return ALPHABET.substring(start, start + 1); 38 | } 39 | 40 | public static void writeProtoFile(String path) throws IOException { 41 | File file = new File(path); 42 | file.deleteOnExit(); 43 | 44 | FileOutputStream out = new FileOutputStream(file); 45 | try { 46 | for (int i = 0; i < 1000; i += 1) { 47 | ExampleMessage message = ExampleMessage.newBuilder() 48 | .setId(i) 49 | .addAllStrings(Arrays.asList( 50 | letter(i), letter(i + 1), letter(i + 2))) 51 | .build(); 52 | message.writeDelimitedTo(out); 53 | } 54 | } finally { 55 | out.close(); 56 | } 57 | } 58 | 59 | public static void writeProtobufToParquetAvro(String protoFile, 60 | String parquetFile) 61 | throws IOException { 62 | ProtobufData model = ProtobufData.get(); 63 | 64 | Schema schema = 
model.getSchema(ExampleMessage.class); 65 | System.err.println("Using Avro schema: " + schema.toString(true)); 66 | 67 | // use the 3-level structure instead of the 2-level 68 | // 2-level is the default for forward-compatibility until 2.x 69 | Configuration conf = new Configuration(); 70 | conf.setBoolean("parquet.avro.write-old-list-structure", false); 71 | 72 | ParquetWriter writer = AvroParquetWriter 73 | .builder(new Path(parquetFile)) 74 | .withConf(conf) // conf set to use 3-level lists 75 | .withDataModel(model) // use the protobuf data model 76 | .withSchema(schema) // Avro schema for the protobuf data 77 | .build(); 78 | 79 | FileInputStream protoStream = new FileInputStream(new File(protoFile)); 80 | try { 81 | ExampleMessage m; 82 | while ((m = ExampleMessage.parseDelimitedFrom(protoStream)) != null) { 83 | writer.write(m); 84 | } 85 | } finally { 86 | protoStream.close(); 87 | } 88 | 89 | writer.close(); 90 | } 91 | 92 | public static void main(String[] argv) throws IOException { 93 | String protoFile = "/tmp/example.proto"; 94 | String parquetFile = "/tmp/example.parquet"; 95 | new File(parquetFile).delete(); 96 | writeProtoFile(protoFile); 97 | writeProtobufToParquetAvro(protoFile, parquetFile); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 10 | DEFAULT_JVM_OPTS="" 11 | 12 | APP_NAME="Gradle" 13 | APP_BASE_NAME=`basename "$0"` 14 | 15 | # Use the maximum available, or set MAX_FD != -1 to use that value. 
MAX_FD="maximum"

# Print a warning message.
warn ( ) {
    echo "$*"
}

# Print an error message surrounded by blank lines and exit with status 1.
die ( ) {
    echo
    echo "$*"
    echo
    exit 1
}

# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
case "`uname`" in
  CYGWIN* )
    cygwin=true
    ;;
  Darwin* )
    darwin=true
    ;;
  MINGW* )
    msys=true
    ;;
esac

# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
    ls=`ls -ld "$PRG"`
    link=`expr "$ls" : '.*-> \(.*\)$'`
    if expr "$link" : '/.*' > /dev/null; then
        PRG="$link"
    else
        PRG=`dirname "$PRG"`"/$link"
    fi
done
SAVED="`pwd`"
# Discard anything cd prints by redirecting to /dev/null. Closing stdout with
# ">&-" instead makes cd's own write fail whenever it has something to print
# (for example when CDPATH is set).
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null

CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar

# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
        # IBM's JDK on AIX uses strange locations for the executables
        JAVACMD="$JAVA_HOME/jre/sh/java"
    else
        JAVACMD="$JAVA_HOME/bin/java"
    fi
    if [ ! -x "$JAVACMD" ] ; then
        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
    fi
else
    JAVACMD="java"
    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi

# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then
    MAX_FD_LIMIT=`ulimit -H -n`
    if [ $? -eq 0 ] ; then
        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
            MAX_FD="$MAX_FD_LIMIT"
        fi
        ulimit -n $MAX_FD
        if [ $? -ne 0 ] ; then
            warn "Could not set maximum file descriptor limit: $MAX_FD"
        fi
    else
        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
    fi
fi

# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi

# For Cygwin, switch paths to Windows format before running java
if $cygwin ; then
    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
    JAVACMD=`cygpath --unix "$JAVACMD"`

    # We build the pattern for arguments to be converted via cygpath
    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
    SEP=""
    for dir in $ROOTDIRSRAW ; do
        ROOTDIRS="$ROOTDIRS$SEP$dir"
        SEP="|"
    done
    OURCYGPATTERN="(^($ROOTDIRS))"
    # Add a user-defined pattern to the cygpath arguments
    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
    fi
    # Now convert the arguments - kludge to limit ourselves to /bin/sh
    i=0
    for arg in "$@" ; do
        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option

        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
        else
            eval `echo args$i`="\"$arg\""
        fi
        i=$((i+1))
    done
    # Re-install the converted arguments as the positional parameters.
    # Only 0-9 arguments are handled here; with more, no branch matches and
    # the original (unconverted) positional parameters are left in place.
    case $i in
        (0) set -- ;;
        (1) set -- "$args0" ;;
        (2) set -- "$args0" "$args1" ;;
        (3) set -- "$args0" "$args1" "$args2" ;;
        (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
        (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
        (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
        (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
        (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
        (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
    esac
fi

# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules
function splitJvmOpts() {
    JVM_OPTS=("$@")
}
eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS
# Append the application name property as the last JVM option.
JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME"

# Replace this shell with the JVM running the Gradle wrapper, forwarding all arguments.
exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@"