├── .github └── workflows │ └── maven.yml ├── .gitignore ├── .scalafmt.conf ├── LICENSE ├── README.md ├── pom.xml └── src ├── main └── java │ └── uk │ └── co │ └── realb │ └── flink │ └── orc │ ├── HadoopOutputStreamAdapter.java │ ├── OrcUtils.java │ ├── OrcWriters.java │ ├── StreamFileSystem.java │ ├── encoder │ ├── CustomEncoderOrcBuilder.java │ ├── EncoderOrcBuilder.java │ ├── EncoderOrcWriter.java │ ├── EncoderOrcWriterFactory.java │ ├── EncoderWriter.java │ └── OrcRowEncoder.java │ └── hive │ ├── GenericRecordHiveOrcBuilder.java │ ├── GenericRecordOrcWriter.java │ ├── GenericRecordOrcWriterFactory.java │ ├── HiveOrcBuilder.java │ ├── HiveOrcWriter.java │ ├── HiveOrcWriterFactory.java │ └── ReflectHiveOrcBuilder.java └── test ├── java └── uk │ └── co │ └── realb │ └── flink │ └── orc │ ├── EncoderSinkTest.java │ ├── GenericRecordSinkTest.java │ ├── ReflectSinkTest.java │ ├── TestBucketAssigner.java │ ├── TestDataJava.java │ └── TestTupleEncoder.java └── scala └── uk └── co └── realb └── flink └── orc ├── EncoderSinkSpec.scala ├── GenericRecordSinkSpec.scala ├── ReflectSinkSpec.scala ├── TestData.scala ├── TestEncoder.scala └── TestUtils.scala /.github/workflows/maven.yml: -------------------------------------------------------------------------------- 1 | name: Java CI 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v1 12 | - name: Set up JDK 1.8 13 | uses: actions/setup-java@v1 14 | with: 15 | java-version: 1.8 16 | - name: Build with Maven 17 | run: mvn -B package --file pom.xml 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .idea 3 | target 4 | dependency-reduced-pom.xml 5 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version=2.3.2 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Flink ORC Streaming File Sink 2 | 3 | Adds ORC support to the Flink Streaming File Sink. 4 | 5 | ## Project configuration 6 | 7 | ### Dependencies 8 | 9 | [![](https://jitpack.io/v/mattczyz/flink-orc.svg)](https://jitpack.io/#mattczyz/flink-orc) 10 | 11 | ``` 12 | repositories { 13 | maven { url 'https://jitpack.io' } 14 | } 15 | 16 | dependencies { 17 | compileOnly 'org.apache.hadoop:hadoop-common:2.8.3' 18 | compile 'com.github.mattczyz:flink-orc:release-0.3' 19 | // For reflection-based writers 20 | compile ('org.apache.hive:hive-exec:2.3.4:core') 21 | } 22 | ``` 23 | 24 | ## Usage 25 | 26 | ### Encoder 27 | To configure the sink with an encoder, provide an implementation of `OrcRowEncoder[T]` containing the logic to transform a user record `T` into `ColumnVector`s and populate the `VectorizedRowBatch`. 28 | 29 | Helper methods: 30 | * `nextIndex(batch)` - returns the next row index as an `Int` 31 | * `incrementBatchSize(batch)` - completes the row and increments the internal `VectorizedRowBatch` counter 32 | 33 | ``` 34 | class Encoder extends OrcRowEncoder[(Int, String, String)]() with Serializable { 35 | override def encodeAndAdd( 36 | datum: (Int, String, String), 37 | batch: VectorizedRowBatch 38 | ): Unit = { 39 | val row = nextIndex(batch) 40 | batch.cols(0).asInstanceOf[LongColumnVector].vector(row) = datum._1 41 | batch 42 | .cols(1) 43 | .asInstanceOf[BytesColumnVector] 44 | .setVal(row, datum._2.getBytes) 45 | batch 46 | .cols(2) 47 | .asInstanceOf[BytesColumnVector] 48 | .setVal(row, datum._3.getBytes) 49 | incrementBatchSize(batch) 50 | } 51 | } 52 | ``` 53 | 54 | See the ORC [documentation](https://orc.apache.org/docs/core-java.html) for more information on `VectorizedRowBatch`. 55 | 56 | The sink is built with the `writerFactory` returned from 57 | ```OrcWriters.withCustomEncoder[(Int, String, String)](encoder, schema, props)``` 58 | passing the encoder, the output schema and any additional ORC configuration. 59 | 60 | * `[(Int, String, String)]` - input data type 61 | * encoder - implementation of `OrcRowEncoder[T]` 62 | * schema - ORC `TypeDescription` 63 | * props - non-default ORC configuration as `Properties` 64 | 65 | ``` 66 | val props = new Properties() 67 | props.setProperty("orc.compress", "SNAPPY") 68 | props.setProperty("orc.bloom.filter.columns", "x") 69 | 70 | val schemaString = """struct<x:int,y:string,z:string>""" 71 | val schema = TypeDescription.fromString(schemaString) 72 | 73 | stream 74 | .addSink(StreamingFileSink 75 | .forBulkFormat( 76 | new Path(out), 77 | OrcWriters 78 | .withCustomEncoder[(Int, String, String)](new Encoder, schema, props) 79 | ) 80 | .withBucketAssigner(new BucketAssigner) 81 | .build()) 82 | 83 | ``` 84 | 85 | ### Reflection 86 | The sink can be configured to use reflection to inspect types and encode records. It requires a Java POJO or Scala case class type to be specified when instantiating the sink. Internally it uses Hive's `ObjectInspector`. 87 | 88 | The sink is built with the `writerFactory` returned from 89 | ```OrcWriters.forReflectRecord(classOf[TestData], props)``` 90 | specifying the incoming data type and any additional ORC configuration.
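The `TestData` and `BucketAssigner` classes referenced in these snippets are user-supplied, not part of this library. As a point of reference, the repository's own tests use a three-field record (`x: Int`, `y: String`, `z: String`) and bucket records by the `y` field; a minimal sketch of both, with illustrative names, could look like the following (for the Encoder example above, the sink's input type would be `(Int, String, String)` instead of `TestData`):

```
import org.apache.flink.core.io.SimpleVersionedSerializer
import org.apache.flink.streaming.api.functions.sink.filesystem.{BucketAssigner => FlinkBucketAssigner}
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer

// Illustrative record type encoded via reflection: one field per ORC column (x:int, y:string, z:string).
case class TestData(x: Int, y: String, z: String)

// Illustrative assigner: each record goes into a bucket (sub-directory) named after its y field.
// The Flink interface is imported under an alias so the class can keep the name used in the snippets.
class BucketAssigner extends FlinkBucketAssigner[TestData, String] {
  override def getBucketId(in: TestData, context: FlinkBucketAssigner.Context): String = in.y

  override def getSerializer: SimpleVersionedSerializer[String] =
    SimpleVersionedStringSerializer.INSTANCE
}
```
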
91 | 92 | * `classOf[TestData]` - input data type `Class` of Java POJO or Scala Case Class 93 | * props - non-default ORC configuration as `Properties` 94 | 95 | ``` 96 | val props = new Properties() 97 | stream 98 | .addSink(StreamingFileSink 99 | .forBulkFormat( 100 | new Path(out), 101 | OrcWriters.forReflectRecord(classOf[TestData], props) 102 | ) 103 | .withBucketAssigner(new BucketAssigner) 104 | .build()) 105 | ``` 106 | 107 | ### Avro GenericRecord 108 | Sink can encode Avro GenericRecord directly to ORC. It requires Avro schema provided when instantiating the sink. 109 | 110 | Sink is built with `writerFactory` returned from 111 | ```OrcWriters.forGenericRecord[GenericRecord](avroSchemaString, props)``` 112 | with schema of incoming data and additional ORC configuration. 113 | 114 | * `avroSchemaString` - Avro schema as `JSON String` 115 | * props - non-default ORC configuration as `Properties` 116 | 117 | ``` 118 | val schema = """{ 119 | | "name": "record", 120 | | "type": "record", 121 | | "fields": [{ 122 | | "name": "x", 123 | | "type": "int", 124 | | "doc": "" 125 | | }, { 126 | | "name": "y", 127 | | "type": "string", 128 | | "doc": "" 129 | | }, { 130 | | "name": "z", 131 | | "type": "string", 132 | | "doc": "" 133 | | }] 134 | |}""".stripMargin 135 | 136 | val props = new Properties() 137 | stream 138 | .addSink(StreamingFileSink 139 | .forBulkFormat( 140 | new Path(out), 141 | OrcWriters.forGenericRecord[GenericRecord](schema, props) 142 | ) 143 | .withBucketAssigner(new BucketAssigner) 144 | .build()) 145 | ``` 146 | 147 | ## Releases 148 | 149 | * 0.4 150 | * Avro GenericRecord Writer 151 | * Removed deprecated EncoderOrcWriters class 152 | * 0.3 - Reflection based Writer 153 | * 0.2 - VectorizedRowBatch based Writer 154 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | uk.co.realb.flink.orc 4 | flink-orc 5 | 0.4 6 | jar 7 | 8 | Flink Streaming File ORC Sink 9 | https://github.com/mattczyz/flink-orc 10 | 11 | 12 | UTF-8 13 | 1.9.1 14 | 2.11 15 | 2.11.12 16 | 1.8 17 | 1.8 18 | 19 | 20 | 21 | github 22 | GitHub Apache Maven Packages 23 | https://maven.pkg.github.com/mattczyz/flink-orc 24 | 25 | 26 | 27 | 28 | org.apache.flink 29 | flink-scala_${scala.binary.version} 30 | ${flink.version} 31 | provided 32 | 33 | 34 | org.apache.flink 35 | flink-streaming-scala_${scala.binary.version} 36 | ${flink.version} 37 | provided 38 | 39 | 40 | org.scala-lang 41 | scala-library 42 | ${scala.version} 43 | provided 44 | 45 | 46 | 47 | org.apache.orc 48 | orc-core 49 | 1.6.2 50 | 51 | 52 | 53 | org.apache.flink 54 | flink-shaded-hadoop-2-uber 55 | 2.7.5-8.0 56 | provided 57 | 58 | 59 | 60 | org.apache.hive 61 | hive-exec 62 | 2.3.4 63 | core 64 | provided 65 | 66 | 67 | 68 | org.scalatest 69 | scalatest_${scala.binary.version} 70 | 3.1.0 71 | test 72 | 73 | 74 | org.apache.flink 75 | flink-test-utils_${scala.binary.version} 76 | ${flink.version} 77 | test 78 | 79 | 80 | org.apache.flink 81 | flink-streaming-java_${scala.binary.version} 82 | ${flink.version} 83 | tests 84 | test 85 | 86 | 87 | junit 88 | junit 89 | 4.13 90 | test 91 | 92 | 93 | org.scalatestplus 94 | scalatestplus-junit_${scala.binary.version} 95 | 1.0.0-M2 96 | test 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | org.apache.maven.plugins 105 | maven-compiler-plugin 106 | 3.1 107 | 108 | 1.8 109 | 1.8 110 | 111 | 112 | 113 | compile 114 | 115 | compile 116 | 117 | 118 | 119 | 120 | 
121 | 122 | 123 | net.alchim31.maven 124 | scala-maven-plugin 125 | 3.2.2 126 | 127 | 128 | scala-compile-first 129 | process-resources 130 | 131 | add-source 132 | compile 133 | 134 | 135 | 136 | scala-test-compile 137 | process-test-resources 138 | 139 | testCompile 140 | 141 | 142 | 143 | 144 | 145 | org.antipathy 146 | mvn-scalafmt_${scala.binary.version} 147 | 1.0.3 148 | 149 | ${project.basedir}/.scalafmt.conf 150 | true 151 | 152 | 153 | 154 | validate 155 | 156 | format 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | org.apache.maven.plugins 166 | maven-surefire-plugin 167 | 3.0.0-M4 168 | 169 | 170 | **/*Test.* 171 | **/*Spec.* 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/HadoopOutputStreamAdapter.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.flink.core.fs.FSDataOutputStream; 4 | import java.io.IOException; 5 | 6 | public class HadoopOutputStreamAdapter extends org.apache.hadoop.fs.FSDataOutputStream { 7 | 8 | final private FSDataOutputStream out; 9 | 10 | public HadoopOutputStreamAdapter(FSDataOutputStream out) throws IOException { 11 | super(out, null); 12 | this.out = out; 13 | } 14 | 15 | @Override 16 | public long getPos() throws IOException { 17 | return out.getPos(); 18 | } 19 | 20 | @Override 21 | public void close() throws IOException {} 22 | 23 | @Override 24 | public void hsync() throws IOException { 25 | out.sync(); 26 | } 27 | 28 | @Override 29 | public synchronized void write(byte[] b, int off, int len) throws IOException { 30 | out.write(b, off, len); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/OrcUtils.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | 5 | import java.util.Properties; 6 | 7 | public class OrcUtils { 8 | public static Configuration getConfiguration(Properties props) { 9 | Configuration config = new Configuration(); 10 | props.stringPropertyNames() 11 | .stream() 12 | .filter(p -> props.getProperty(p) != null) 13 | .forEach(p -> config.set(p, props.getProperty(p))); 14 | return config; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/OrcWriters.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.avro.generic.GenericRecord; 4 | import org.apache.orc.TypeDescription; 5 | import uk.co.realb.flink.orc.encoder.CustomEncoderOrcBuilder; 6 | import uk.co.realb.flink.orc.encoder.EncoderOrcBuilder; 7 | import uk.co.realb.flink.orc.encoder.EncoderOrcWriterFactory; 8 | import uk.co.realb.flink.orc.encoder.OrcRowEncoder; 9 | import uk.co.realb.flink.orc.hive.GenericRecordHiveOrcBuilder; 10 | import uk.co.realb.flink.orc.hive.GenericRecordOrcWriterFactory; 11 | import uk.co.realb.flink.orc.hive.HiveOrcWriterFactory; 12 | import uk.co.realb.flink.orc.hive.ReflectHiveOrcBuilder; 13 | 14 | import java.io.Serializable; 15 | import java.util.Properties; 16 | 17 | public class OrcWriters implements Serializable { 18 | 19 | public static HiveOrcWriterFactory forReflectRecord(Class type, Properties props) { 20 | ReflectHiveOrcBuilder builder = new 
ReflectHiveOrcBuilder<>(type, props); 21 | return new HiveOrcWriterFactory<>(builder); 22 | } 23 | 24 | public static EncoderOrcWriterFactory withCustomEncoder(OrcRowEncoder encoder, 25 | TypeDescription schema, 26 | Properties props) { 27 | EncoderOrcBuilder builder = new CustomEncoderOrcBuilder<>(encoder, schema, props); 28 | return new EncoderOrcWriterFactory<>(builder); 29 | } 30 | 31 | public static GenericRecordOrcWriterFactory forGenericRecord(String avroSchemaString, Properties props) { 32 | GenericRecordHiveOrcBuilder builder = new GenericRecordHiveOrcBuilder(avroSchemaString, props); 33 | return new GenericRecordOrcWriterFactory<>(builder); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/StreamFileSystem.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.hadoop.fs.FSDataInputStream; 4 | import org.apache.hadoop.fs.FSDataOutputStream; 5 | import org.apache.hadoop.fs.FileStatus; 6 | import org.apache.hadoop.fs.FileSystem; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.hadoop.fs.permission.FsPermission; 9 | import org.apache.hadoop.util.Progressable; 10 | 11 | import java.io.FileNotFoundException; 12 | import java.io.IOException; 13 | import java.net.URI; 14 | 15 | public class StreamFileSystem extends FileSystem { 16 | final private org.apache.flink.core.fs.FSDataOutputStream out; 17 | public StreamFileSystem(org.apache.flink.core.fs.FSDataOutputStream out) { 18 | this.out = out; 19 | } 20 | 21 | @Override 22 | public URI getUri() { 23 | return null; 24 | } 25 | 26 | @Override 27 | public FSDataInputStream open(Path path, int i) throws IOException { 28 | return null; 29 | } 30 | 31 | @Override 32 | public FSDataOutputStream create(Path path, FsPermission fsPermission, boolean b, int i, short i1, long l, Progressable progressable) throws IOException { 33 | return new HadoopOutputStreamAdapter(out); 34 | } 35 | 36 | @Override 37 | public FSDataOutputStream append(Path path, int i, Progressable progressable) throws IOException { 38 | return null; 39 | } 40 | 41 | @Override 42 | public boolean rename(Path path, Path path1) throws IOException { 43 | return false; 44 | } 45 | 46 | @Override 47 | public boolean delete(Path path, boolean b) throws IOException { 48 | return false; 49 | } 50 | 51 | @Override 52 | public FileStatus[] listStatus(Path path) throws FileNotFoundException, IOException { 53 | return new FileStatus[0]; 54 | } 55 | 56 | @Override 57 | public void setWorkingDirectory(Path path) { 58 | 59 | } 60 | 61 | @Override 62 | public Path getWorkingDirectory() { 63 | return null; 64 | } 65 | 66 | @Override 67 | public boolean mkdirs(Path path, FsPermission fsPermission) throws IOException { 68 | return false; 69 | } 70 | 71 | @Override 72 | public FileStatus getFileStatus(Path path) throws IOException { 73 | return null; 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/encoder/CustomEncoderOrcBuilder.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.encoder; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.orc.OrcFile; 6 | import org.apache.orc.TypeDescription; 7 | import org.apache.orc.Writer; 8 | import uk.co.realb.flink.orc.OrcUtils; 9 | 10 | import 
java.io.IOException; 11 | import java.io.Serializable; 12 | import java.util.Properties; 13 | 14 | public class CustomEncoderOrcBuilder implements EncoderOrcBuilder, Serializable { 15 | final private Properties props; 16 | final private TypeDescription schema; 17 | final private OrcRowEncoder encoder; 18 | 19 | public CustomEncoderOrcBuilder(OrcRowEncoder encoder, TypeDescription schema, Properties props) { 20 | this.props = props; 21 | this.schema = schema; 22 | this.encoder = encoder; 23 | } 24 | 25 | @Override 26 | public EncoderWriter createWriter(FileSystem stream) throws IOException { 27 | Writer writer = org.apache.orc.OrcFile.createWriter( 28 | new Path(Integer.toString(stream.hashCode())), 29 | OrcFile 30 | .writerOptions(OrcUtils.getConfiguration(this.props)) 31 | .setSchema(schema) 32 | .fileSystem(stream)); 33 | 34 | return EncoderWriter 35 | .builder(writer) 36 | .withEncoder(encoder) 37 | .withSchema(schema) 38 | .build(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/encoder/EncoderOrcBuilder.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.encoder; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | import java.io.IOException; 5 | 6 | public interface EncoderOrcBuilder { 7 | EncoderWriter createWriter(FileSystem stream) throws IOException; 8 | } 9 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/encoder/EncoderOrcWriter.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.encoder; 2 | 3 | import org.apache.flink.api.common.serialization.BulkWriter; 4 | import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; 5 | import org.apache.orc.TypeDescription; 6 | import org.apache.orc.Writer; 7 | 8 | import java.io.IOException; 9 | 10 | public class EncoderOrcWriter implements BulkWriter { 11 | 12 | final private OrcRowEncoder encoder; 13 | final private Writer writer; 14 | final private VectorizedRowBatch buffer; 15 | 16 | public EncoderOrcWriter(EncoderWriter orcWriter) { 17 | TypeDescription typeDescription = orcWriter.getSchema(); 18 | this.encoder = orcWriter.getEncoder(); 19 | this.writer = orcWriter.getWriter(); 20 | this.buffer = typeDescription.createRowBatch(); 21 | } 22 | 23 | @Override 24 | public void addElement(T element) throws IOException { 25 | encoder.encodeAndAdd(element, buffer); 26 | if (buffer.size == buffer.getMaxSize()) { 27 | flush(); 28 | } 29 | } 30 | 31 | @Override 32 | public void flush() throws IOException { 33 | if (buffer.size > 0) { 34 | writer.addRowBatch(buffer); 35 | buffer.reset(); 36 | } 37 | } 38 | 39 | @Override 40 | public void finish() throws IOException { 41 | flush(); 42 | writer.close(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/encoder/EncoderOrcWriterFactory.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.encoder; 2 | 3 | import org.apache.flink.api.common.serialization.BulkWriter; 4 | import org.apache.flink.core.fs.FSDataOutputStream; 5 | import uk.co.realb.flink.orc.StreamFileSystem; 6 | 7 | import java.io.IOException; 8 | 9 | public class EncoderOrcWriterFactory implements BulkWriter.Factory { 10 | final private EncoderOrcBuilder writerBuilder; 11 | 12 | public 
EncoderOrcWriterFactory(EncoderOrcBuilder writerBuilder) { 13 | this.writerBuilder = writerBuilder; 14 | } 15 | 16 | @Override 17 | public BulkWriter create(FSDataOutputStream fsDataOutputStream) throws IOException { 18 | StreamFileSystem stream = new StreamFileSystem(fsDataOutputStream); 19 | EncoderWriter writer = writerBuilder.createWriter(stream); 20 | return new EncoderOrcWriter<>(writer); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/encoder/EncoderWriter.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.encoder; 2 | 3 | import org.apache.orc.TypeDescription; 4 | import org.apache.orc.Writer; 5 | 6 | public class EncoderWriter { 7 | final private Writer writer; 8 | final private OrcRowEncoder encoder; 9 | final private TypeDescription schema; 10 | 11 | public static Builder builder(Writer writer) { 12 | return new Builder<>(writer); 13 | } 14 | 15 | private EncoderWriter(Writer writer, OrcRowEncoder encoder, TypeDescription schema) { 16 | this.writer = writer; 17 | this.encoder = encoder; 18 | this.schema = schema; 19 | } 20 | 21 | public Writer getWriter() { 22 | return writer; 23 | } 24 | 25 | public OrcRowEncoder getEncoder() { 26 | return encoder; 27 | } 28 | 29 | public TypeDescription getSchema() { 30 | return schema; 31 | } 32 | 33 | public static class Builder { 34 | final private Writer writer; 35 | private OrcRowEncoder encoder; 36 | private TypeDescription schema; 37 | 38 | public Builder(Writer writer) { 39 | this.writer = writer; 40 | } 41 | 42 | public Builder withEncoder(OrcRowEncoder encoder){ 43 | this.encoder = encoder; 44 | return this; 45 | } 46 | 47 | public Builder withSchema(TypeDescription schema){ 48 | this.schema = schema; 49 | return this; 50 | } 51 | 52 | public EncoderWriter build(){ 53 | return new EncoderWriter<>(writer, encoder, schema); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/encoder/OrcRowEncoder.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.encoder; 2 | 3 | import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; 4 | 5 | public abstract class OrcRowEncoder { 6 | 7 | public abstract void encodeAndAdd(T datum, VectorizedRowBatch batch); 8 | 9 | public Integer nextIndex(VectorizedRowBatch batch) { 10 | return batch.size; 11 | } 12 | 13 | public void incrementBatchSize(VectorizedRowBatch batch) { 14 | batch.size += 1; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/hive/GenericRecordHiveOrcBuilder.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.hive; 2 | 3 | import org.apache.avro.Schema; 4 | import org.apache.hadoop.fs.FileSystem; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.hive.ql.io.orc.OrcFile; 7 | import org.apache.hadoop.hive.ql.io.orc.Writer; 8 | import org.apache.hadoop.hive.serde2.SerDeException; 9 | import org.apache.hadoop.hive.serde2.avro.AvroObjectInspectorGenerator; 10 | import uk.co.realb.flink.orc.OrcUtils; 11 | 12 | import java.io.IOException; 13 | import java.io.Serializable; 14 | import java.util.Properties; 15 | 16 | public class GenericRecordHiveOrcBuilder implements HiveOrcBuilder, Serializable { 17 | final 
private Properties props; 18 | final private String schemaString; 19 | 20 | public GenericRecordHiveOrcBuilder(String schemaString, Properties props) { 21 | this.props = props; 22 | this.schemaString = schemaString; 23 | } 24 | 25 | @Override 26 | public Writer createWriter(FileSystem stream) throws IOException { 27 | try { 28 | Schema schema = new Schema.Parser().parse(schemaString); 29 | AvroObjectInspectorGenerator generator = new AvroObjectInspectorGenerator(schema); 30 | 31 | return OrcFile.createWriter( 32 | new Path(Integer.toString(stream.hashCode())), 33 | OrcFile 34 | .writerOptions(OrcUtils.getConfiguration(this.props)) 35 | .inspector(generator.getObjectInspector()) 36 | .fileSystem(stream)); 37 | } catch (SerDeException e) { 38 | throw new IOException(e); 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/hive/GenericRecordOrcWriter.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.hive; 2 | 3 | import org.apache.avro.generic.GenericRecord; 4 | import org.apache.flink.api.common.serialization.BulkWriter; 5 | import org.apache.hadoop.hive.ql.io.orc.Writer; 6 | 7 | import java.io.IOException; 8 | 9 | public class GenericRecordOrcWriter implements BulkWriter { 10 | final private Writer writer; 11 | 12 | public GenericRecordOrcWriter(Writer orcWriter) { 13 | this.writer = orcWriter; 14 | } 15 | 16 | @Override 17 | public void addElement(T element) throws IOException { 18 | int fieldCount = element 19 | .getSchema() 20 | .getFields() 21 | .size(); 22 | 23 | Object[] row = new Object[fieldCount]; 24 | 25 | for(int i=0; i < fieldCount; i++) { 26 | row[i] = element.get(i); 27 | } 28 | 29 | writer.addRow(row); 30 | } 31 | 32 | @Override 33 | public void flush() {} 34 | 35 | @Override 36 | public void finish() throws IOException { 37 | writer.close(); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/hive/GenericRecordOrcWriterFactory.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.hive; 2 | 3 | import org.apache.avro.generic.GenericRecord; 4 | import org.apache.flink.api.common.serialization.BulkWriter; 5 | import org.apache.flink.core.fs.FSDataOutputStream; 6 | import org.apache.hadoop.hive.ql.io.orc.Writer; 7 | import uk.co.realb.flink.orc.StreamFileSystem; 8 | 9 | import java.io.IOException; 10 | 11 | public class GenericRecordOrcWriterFactory implements BulkWriter.Factory { 12 | final private HiveOrcBuilder writerBuilder; 13 | 14 | public GenericRecordOrcWriterFactory(HiveOrcBuilder writerBuilder) { 15 | this.writerBuilder = writerBuilder; 16 | } 17 | 18 | @Override 19 | public BulkWriter create(FSDataOutputStream fsDataOutputStream) throws IOException { 20 | StreamFileSystem stream = new StreamFileSystem(fsDataOutputStream); 21 | Writer writer = writerBuilder.createWriter(stream); 22 | return new GenericRecordOrcWriter<>(writer); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/hive/HiveOrcBuilder.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.hive; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | import org.apache.hadoop.hive.ql.io.orc.Writer; 5 | import java.io.IOException; 6 | 7 | public interface 
HiveOrcBuilder { 8 | Writer createWriter(FileSystem stream) throws IOException; 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/hive/HiveOrcWriter.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.hive; 2 | 3 | import org.apache.flink.api.common.serialization.BulkWriter; 4 | import org.apache.hadoop.hive.ql.io.orc.Writer; 5 | 6 | import java.io.IOException; 7 | 8 | public class HiveOrcWriter implements BulkWriter { 9 | 10 | final private Writer writer; 11 | 12 | public HiveOrcWriter(Writer orcWriter) { 13 | this.writer = orcWriter; 14 | } 15 | 16 | @Override 17 | public void addElement(T element) throws IOException { 18 | writer.addRow(element); 19 | } 20 | 21 | @Override 22 | public void flush() {} 23 | 24 | @Override 25 | public void finish() throws IOException { 26 | flush(); 27 | writer.close(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/hive/HiveOrcWriterFactory.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.hive; 2 | 3 | import org.apache.flink.api.common.serialization.BulkWriter; 4 | import org.apache.flink.core.fs.FSDataOutputStream; 5 | import org.apache.hadoop.hive.ql.io.orc.Writer; 6 | import uk.co.realb.flink.orc.StreamFileSystem; 7 | 8 | import java.io.IOException; 9 | 10 | public class HiveOrcWriterFactory implements BulkWriter.Factory { 11 | final private HiveOrcBuilder writerBuilder; 12 | 13 | public HiveOrcWriterFactory(HiveOrcBuilder writerBuilder) { 14 | this.writerBuilder = writerBuilder; 15 | } 16 | 17 | @Override 18 | public BulkWriter create(FSDataOutputStream fsDataOutputStream) throws IOException { 19 | StreamFileSystem stream = new StreamFileSystem(fsDataOutputStream); 20 | Writer writer = writerBuilder.createWriter(stream); 21 | return new HiveOrcWriter<>(writer); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/uk/co/realb/flink/orc/hive/ReflectHiveOrcBuilder.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc.hive; 2 | 3 | import org.apache.hadoop.fs.FileSystem; 4 | import org.apache.hadoop.fs.Path; 5 | import org.apache.hadoop.hive.ql.io.orc.OrcFile; 6 | import org.apache.hadoop.hive.ql.io.orc.Writer; 7 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 8 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 9 | import uk.co.realb.flink.orc.OrcUtils; 10 | 11 | import java.io.IOException; 12 | import java.io.Serializable; 13 | import java.util.Properties; 14 | 15 | public class ReflectHiveOrcBuilder implements HiveOrcBuilder, Serializable { 16 | final private Properties props; 17 | final private Class type; 18 | 19 | public ReflectHiveOrcBuilder(Class type, Properties props) { 20 | this.props = props; 21 | this.type = type; 22 | } 23 | 24 | @Override 25 | public Writer createWriter(FileSystem stream) throws IOException { 26 | ObjectInspector inspector = ObjectInspectorFactory.getReflectionObjectInspector( 27 | type, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); 28 | 29 | return OrcFile.createWriter( 30 | new Path(Integer.toString(stream.hashCode())), 31 | OrcFile 32 | .writerOptions(OrcUtils.getConfiguration(this.props)) 33 | .inspector(inspector) 34 | 
.fileSystem(stream)); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/test/java/uk/co/realb/flink/orc/EncoderSinkTest.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple3; 4 | import org.apache.flink.core.fs.Path; 5 | import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; 6 | import org.apache.flink.streaming.api.operators.StreamSink; 7 | import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; 8 | import org.apache.orc.TypeDescription; 9 | import org.junit.Before; 10 | import org.junit.Rule; 11 | import org.junit.Test; 12 | import org.junit.rules.TemporaryFolder; 13 | import uk.co.realb.flink.orc.encoder.EncoderOrcWriterFactory; 14 | 15 | import java.io.File; 16 | import java.nio.file.Paths; 17 | import java.util.Arrays; 18 | import java.util.Collections; 19 | import java.util.List; 20 | import java.util.Properties; 21 | 22 | import static org.junit.Assert.assertEquals; 23 | import static org.junit.Assert.assertTrue; 24 | import static scala.collection.JavaConversions.asScalaBuffer; 25 | import static scala.collection.JavaConversions.seqAsJavaList; 26 | 27 | public class EncoderSinkTest { 28 | @Rule 29 | public TemporaryFolder folder = new TemporaryFolder(); 30 | 31 | private OneInputStreamOperatorTestHarness, Object> testHarness; 32 | 33 | private TypeDescription schema = TypeDescription.fromString("struct"); 34 | 35 | @Before 36 | public void setupTestHarness() throws Exception { 37 | TestTupleEncoder encoder = new TestTupleEncoder(); 38 | 39 | Properties conf = new Properties(); 40 | conf.setProperty("orc.compress", "SNAPPY"); 41 | conf.setProperty("orc.bloom.filter.columns", "x"); 42 | 43 | String out = folder.getRoot().getAbsolutePath(); 44 | EncoderOrcWriterFactory> writer = OrcWriters.withCustomEncoder(encoder, schema, conf); 45 | StreamingFileSink> sink = StreamingFileSink 46 | .forBulkFormat(new Path(out), writer) 47 | .withBucketAssigner(new TestBucketAssigner()) 48 | .build(); 49 | testHarness = new OneInputStreamOperatorTestHarness<>(new StreamSink<>(sink)); 50 | testHarness.setup(); 51 | testHarness.open(); 52 | } 53 | 54 | @Test 55 | public void testJavaEncoderSink() throws Exception { 56 | testHarness.processElement(Tuple3.of(1, "partition", "test"), 100L); 57 | testHarness.processElement(Tuple3.of(2, "partition", "test2"), 101L); 58 | testHarness.snapshot(1L, 10001L); 59 | testHarness.notifyOfCompletedCheckpoint(10002L); 60 | File result = Paths.get(folder.getRoot().getAbsolutePath(), "partition", "part-0-0").toFile(); 61 | 62 | String tempDir = folder.getRoot().getAbsolutePath(); 63 | 64 | String paths = TestUtils.testFile(asScalaBuffer(Arrays.asList("partition", "part-0-0")), tempDir); 65 | 66 | List> rows = seqAsJavaList(TestUtils.testTupleReader( 67 | schema, 68 | asScalaBuffer(Collections.singletonList(paths)) 69 | )); 70 | 71 | assertTrue(result.exists()); 72 | assertEquals(2, rows.size()); 73 | assertEquals(2, rows.get(1)._1()); 74 | assertEquals("partition", rows.get(1)._2()); 75 | assertEquals("test2", rows.get(1)._3()); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/test/java/uk/co/realb/flink/orc/GenericRecordSinkTest.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.avro.Schema; 4 | 
import org.apache.avro.generic.GenericData; 5 | import org.apache.avro.generic.GenericRecord; 6 | import org.apache.flink.core.fs.Path; 7 | import org.apache.flink.core.io.SimpleVersionedSerializer; 8 | import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner; 9 | import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; 10 | import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer; 11 | import org.apache.flink.streaming.api.operators.StreamSink; 12 | import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; 13 | import org.apache.orc.TypeDescription; 14 | import org.junit.Before; 15 | import org.junit.Rule; 16 | import org.junit.Test; 17 | import org.junit.rules.TemporaryFolder; 18 | import uk.co.realb.flink.orc.hive.GenericRecordOrcWriterFactory; 19 | 20 | import java.io.File; 21 | import java.nio.file.Paths; 22 | import java.util.Arrays; 23 | import java.util.Collections; 24 | import java.util.List; 25 | import java.util.Properties; 26 | 27 | import static org.junit.Assert.assertEquals; 28 | import static org.junit.Assert.assertTrue; 29 | import static scala.collection.JavaConversions.asScalaBuffer; 30 | import static scala.collection.JavaConversions.seqAsJavaList; 31 | 32 | public class GenericRecordSinkTest { 33 | @Rule 34 | public TemporaryFolder folder = new TemporaryFolder(); 35 | 36 | private OneInputStreamOperatorTestHarness testHarness; 37 | 38 | private TypeDescription schema = TypeDescription.fromString("struct"); 39 | 40 | private String avroSchemaString = "" + 41 | "{\n" + 42 | "\t\"name\": \"record\",\n" + 43 | "\t\"type\": \"record\",\n" + 44 | "\t\"fields\": [{\n" + 45 | "\t\t\"name\": \"x\",\n" + 46 | "\t\t\"type\": \"int\",\n" + 47 | "\t\t\"doc\": \"x\"\n" + 48 | "\t}, {\n" + 49 | "\t\t\"name\": \"y\",\n" + 50 | "\t\t\"type\": \"string\",\n" + 51 | "\t\t\"doc\": \"y\"\n" + 52 | "\t}, {\n" + 53 | "\t\t\"name\": \"z\",\n" + 54 | "\t\t\"type\": \"string\",\n" + 55 | "\t\t\"doc\": \"z\"\n" + 56 | "\t}]\n" + 57 | "}"; 58 | 59 | @Before 60 | public void setupTestHarness() throws Exception { 61 | Properties conf = new Properties(); 62 | conf.setProperty("orc.compress", "SNAPPY"); 63 | conf.setProperty("orc.bloom.filter.columns", "x"); 64 | 65 | String out = folder.getRoot().getAbsolutePath(); 66 | GenericRecordOrcWriterFactory writer = OrcWriters.forGenericRecord(avroSchemaString, conf); 67 | StreamingFileSink sink = StreamingFileSink 68 | .forBulkFormat(new Path(out), writer) 69 | .withBucketAssigner(new BucketAssigner() { 70 | @Override 71 | public String getBucketId(GenericRecord javaTestData, Context context) { 72 | return javaTestData.get(1).toString(); 73 | } 74 | 75 | @Override 76 | public SimpleVersionedSerializer getSerializer() { 77 | return SimpleVersionedStringSerializer.INSTANCE; 78 | } 79 | }) 80 | .build(); 81 | testHarness = new OneInputStreamOperatorTestHarness<>(new StreamSink<>(sink)); 82 | testHarness.setup(); 83 | testHarness.open(); 84 | } 85 | 86 | @Test 87 | public void testJavaGenericRecordSink() throws Exception { 88 | 89 | Schema avroSchema = new Schema.Parser().parse(avroSchemaString); 90 | 91 | GenericRecord r = new GenericData.Record(avroSchema); 92 | r.put("x", 1); 93 | r.put("y", "partition"); 94 | r.put("z", "test"); 95 | testHarness.processElement(r, 100L); 96 | 97 | GenericRecord r2 = new GenericData.Record(avroSchema); 98 | r2.put("x", 2); 99 | r2.put("y", "partition"); 100 | r2.put("z", "test2"); 101 | 
testHarness.processElement(r2, 101L); 102 | 103 | testHarness.snapshot(1L, 10001L); 104 | testHarness.notifyOfCompletedCheckpoint(10002L); 105 | 106 | File result = Paths.get(folder.getRoot().getAbsolutePath(), "partition", "part-0-0").toFile(); 107 | 108 | String tempDir = folder.getRoot().getAbsolutePath(); 109 | 110 | String paths = TestUtils.testFile(asScalaBuffer(Arrays.asList("partition", "part-0-0")), tempDir); 111 | 112 | List> rows = seqAsJavaList(TestUtils.testTupleReader( 113 | schema, 114 | asScalaBuffer(Collections.singletonList(paths)) 115 | )); 116 | 117 | assertTrue(result.exists()); 118 | assertEquals(2, rows.size()); 119 | assertEquals(2, rows.get(1)._1()); 120 | assertEquals("partition", rows.get(1)._2()); 121 | assertEquals("test2", rows.get(1)._3()); 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/test/java/uk/co/realb/flink/orc/ReflectSinkTest.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.flink.core.fs.Path; 4 | import org.apache.flink.core.io.SimpleVersionedSerializer; 5 | import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner; 6 | import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink; 7 | import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer; 8 | import org.apache.flink.streaming.api.operators.StreamSink; 9 | import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness; 10 | import org.apache.orc.TypeDescription; 11 | import org.junit.Before; 12 | import org.junit.Rule; 13 | import org.junit.Test; 14 | import org.junit.rules.TemporaryFolder; 15 | import uk.co.realb.flink.orc.hive.HiveOrcWriterFactory; 16 | 17 | import java.io.File; 18 | import java.nio.file.Paths; 19 | import java.util.Arrays; 20 | import java.util.Collections; 21 | import java.util.List; 22 | import java.util.Properties; 23 | 24 | import static org.junit.Assert.assertEquals; 25 | import static org.junit.Assert.assertTrue; 26 | import static scala.collection.JavaConversions.asScalaBuffer; 27 | import static scala.collection.JavaConversions.seqAsJavaList; 28 | 29 | public class ReflectSinkTest { 30 | @Rule 31 | public TemporaryFolder folder = new TemporaryFolder(); 32 | 33 | private OneInputStreamOperatorTestHarness testHarness; 34 | 35 | private TypeDescription schema = TypeDescription.fromString("struct"); 36 | 37 | @Before 38 | public void setupTestHarness() throws Exception { 39 | Properties conf = new Properties(); 40 | conf.setProperty("orc.compress", "SNAPPY"); 41 | conf.setProperty("orc.bloom.filter.columns", "x"); 42 | 43 | String out = folder.getRoot().getAbsolutePath(); 44 | HiveOrcWriterFactory writer = OrcWriters.forReflectRecord(TestDataJava.class, conf); 45 | StreamingFileSink sink = StreamingFileSink 46 | .forBulkFormat(new Path(out), writer) 47 | .withBucketAssigner(new BucketAssigner() { 48 | @Override 49 | public String getBucketId(TestDataJava javaTestData, Context context) { 50 | return javaTestData.y; 51 | } 52 | 53 | @Override 54 | public SimpleVersionedSerializer getSerializer() { 55 | return SimpleVersionedStringSerializer.INSTANCE; 56 | } 57 | }) 58 | .build(); 59 | testHarness = new OneInputStreamOperatorTestHarness<>(new StreamSink<>(sink)); 60 | testHarness.setup(); 61 | testHarness.open(); 62 | } 63 | 64 | @Test 65 | public void testJavaReflectSink() throws Exception { 66 | 
testHarness.processElement(new TestDataJava(1, "partition", "test"), 100L); 67 | testHarness.processElement(new TestDataJava(2, "partition", "test2"), 101L); 68 | testHarness.snapshot(1L, 10001L); 69 | testHarness.notifyOfCompletedCheckpoint(10002L); 70 | File result = Paths.get(folder.getRoot().getAbsolutePath(), "partition", "part-0-0").toFile(); 71 | 72 | String tempDir = folder.getRoot().getAbsolutePath(); 73 | 74 | String paths = TestUtils.testFile(asScalaBuffer(Arrays.asList("partition", "part-0-0")), tempDir); 75 | 76 | List> rows = seqAsJavaList(TestUtils.testTupleReader( 77 | schema, 78 | asScalaBuffer(Collections.singletonList(paths)) 79 | )); 80 | 81 | assertTrue(result.exists()); 82 | assertEquals(2, rows.size()); 83 | assertEquals(2, rows.get(1)._1()); 84 | assertEquals("partition", rows.get(1)._2()); 85 | assertEquals("test2", rows.get(1)._3()); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/test/java/uk/co/realb/flink/orc/TestBucketAssigner.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple3; 4 | import org.apache.flink.core.io.SimpleVersionedSerializer; 5 | import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner; 6 | import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer; 7 | 8 | public class TestBucketAssigner implements BucketAssigner, String> { 9 | private static final long serialVersionUID = 987325769970523327L; 10 | @Override 11 | public String getBucketId(Tuple3 integerStringStringTuple3, Context context) { 12 | return integerStringStringTuple3.f1; 13 | } 14 | 15 | @Override 16 | public SimpleVersionedSerializer getSerializer() { 17 | return SimpleVersionedStringSerializer.INSTANCE; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/test/java/uk/co/realb/flink/orc/TestDataJava.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | public class TestDataJava { 4 | Integer x; 5 | String y; 6 | String z; 7 | 8 | public TestDataJava(Integer x, String y, String z) { 9 | this.x = x; 10 | this.y = y; 11 | this.z = z; 12 | } 13 | 14 | public Integer getX() { 15 | return x; 16 | } 17 | 18 | public void setX(Integer x) { 19 | this.x = x; 20 | } 21 | 22 | public String getY() { 23 | return y; 24 | } 25 | 26 | public void setY(String y) { 27 | this.y = y; 28 | } 29 | 30 | public String getZ() { 31 | return z; 32 | } 33 | 34 | public void setZ(String z) { 35 | this.z = z; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/test/java/uk/co/realb/flink/orc/TestTupleEncoder.java: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc; 2 | 3 | import org.apache.flink.api.java.tuple.Tuple3; 4 | import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector; 5 | import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; 6 | import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; 7 | import uk.co.realb.flink.orc.encoder.OrcRowEncoder; 8 | 9 | import java.io.Serializable; 10 | 11 | public class TestTupleEncoder extends OrcRowEncoder> implements Serializable { 12 | @Override 13 | public void encodeAndAdd(Tuple3 datum, VectorizedRowBatch batch) { 14 | int row = nextIndex(batch); 15 | 
LongColumnVector col0 = (LongColumnVector) batch.cols[0]; 16 | col0.vector[row] = datum.f0; 17 | BytesColumnVector col1 = (BytesColumnVector) batch.cols[1]; 18 | col1.setVal(row, datum.f1.getBytes()); 19 | BytesColumnVector col2 = (BytesColumnVector) batch.cols[2]; 20 | col2.setVal(row, datum.f2.getBytes()); 21 | incrementBatchSize(batch); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/scala/uk/co/realb/flink/orc/EncoderSinkSpec.scala: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc 2 | 3 | import java.nio.file.Paths 4 | import java.util.Properties 5 | 6 | import org.apache.flink.core.fs.Path 7 | import org.apache.flink.core.io.SimpleVersionedSerializer 8 | import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer 9 | import org.apache.flink.streaming.api.functions.sink.filesystem.{ 10 | BucketAssigner, 11 | StreamingFileSink 12 | } 13 | import org.apache.flink.streaming.api.operators.StreamSink 14 | import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness 15 | import org.apache.hadoop.conf.Configuration 16 | import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch 17 | import org.apache.orc.{TypeDescription, Writer} 18 | import org.junit.rules.TemporaryFolder 19 | import org.junit.{Rule, Test} 20 | import org.scalatest.matchers.should.Matchers 21 | import uk.co.realb.flink.orc.TestUtils._ 22 | import uk.co.realb.flink.orc.encoder.OrcRowEncoder 23 | 24 | import scala.annotation.meta.getter 25 | import scala.util.hashing.MurmurHash3 26 | 27 | class EncoderSinkSpec extends Matchers { 28 | 29 | @(Rule @getter) 30 | val streamingOutput = new TemporaryFolder 31 | @(Rule @getter) 32 | val orcOutput = new TemporaryFolder 33 | 34 | private val schemaString = """struct""" 35 | 36 | private val conf = new Properties 37 | conf.setProperty("orc.compress", "SNAPPY") 38 | conf.setProperty("orc.bloom.filter.columns", "x") 39 | conf.setProperty("orc.stripe.size", "4194304") 40 | conf.setProperty("orc.create.index", "true") 41 | 42 | private val schema = TypeDescription.fromString(schemaString) 43 | private val encoder = new TestEncoder() 44 | 45 | private val bucketAssigner = 46 | new BucketAssigner[(Int, String, String), String] { 47 | override def getBucketId( 48 | in: (Int, String, String), 49 | context: BucketAssigner.Context 50 | ): String = in._2 51 | 52 | override def getSerializer: SimpleVersionedSerializer[String] = 53 | SimpleVersionedStringSerializer.INSTANCE 54 | } 55 | 56 | @Test def testCompareOrcWriterOutputWithFlink() { 57 | val streamingTempDir = streamingOutput.getRoot.getAbsolutePath 58 | val orcTempDir = orcOutput.getRoot.getAbsolutePath 59 | val testData = (0 to 1000000) 60 | .map(i => (i, "testText", MurmurHash3.stringHash(i.toString).toString)) 61 | 62 | val sink = StreamingFileSink 63 | .forBulkFormat( 64 | new Path(streamingTempDir), 65 | OrcWriters 66 | .withCustomEncoder[(Int, String, String)](encoder, schema, conf) 67 | ) 68 | .withBucketAssigner(bucketAssigner) 69 | .build() 70 | 71 | val testHarness = 72 | new OneInputStreamOperatorTestHarness[ 73 | (Int, String, String), 74 | AnyRef 75 | ](new StreamSink[(Int, String, String)](sink), 4, 4, 3) 76 | 77 | testHarness.setup() 78 | testHarness.open() 79 | testData.foreach(d => testHarness.processElement(d, d._1)) 80 | 81 | testHarness.snapshot(1L, 10001L) 82 | testHarness.notifyOfCompletedCheckpoint(10002L) 83 | 84 | val 
testWriterFile = 85 | Paths.get(orcTempDir, "test.orc").toAbsolutePath.toString 86 | 87 | val writer = createWriter(conf, schema, testWriterFile) 88 | val batch: VectorizedRowBatch = schema.createRowBatch() 89 | 90 | testData.foreach(d => writeTestRow(d, batch, writer, encoder)) 91 | flush(batch, writer) 92 | 93 | writer.close() 94 | 95 | val testStreamingFile = Paths 96 | .get(streamingTempDir, "testText", "part-3-0") 97 | .toAbsolutePath 98 | .toString 99 | 100 | fileHash(testWriterFile) should be( 101 | fileHash(testStreamingFile) 102 | ) 103 | 104 | val reader = TestUtils.createReader(new Configuration(), testStreamingFile) 105 | 106 | reader.getFileTail.getPostscript.getCompression.toString should be( 107 | "SNAPPY" 108 | ) 109 | reader.getFileTail.getFooter.getStripesList.size() should be(4) 110 | } 111 | 112 | @Test def testSinkIntoMultipleBuckets() { 113 | val tempDir = streamingOutput.getRoot.getAbsolutePath 114 | val testData = (0 to 10000) 115 | .map(i => 116 | (i, "testText" + i % 3, MurmurHash3.stringHash(i.toString).toString) 117 | ) 118 | 119 | val sink = StreamingFileSink 120 | .forBulkFormat( 121 | new Path(tempDir), 122 | OrcWriters 123 | .withCustomEncoder[(Int, String, String)](encoder, schema, conf) 124 | ) 125 | .withBucketAssigner(bucketAssigner) 126 | .build() 127 | 128 | val testHarness = 129 | new OneInputStreamOperatorTestHarness[ 130 | (Int, String, String), 131 | AnyRef 132 | ](new StreamSink[(Int, String, String)](sink), 4, 4, 0) 133 | 134 | testHarness.setup() 135 | testHarness.open() 136 | testData.foreach(d => testHarness.processElement(d, d._1)) 137 | 138 | testHarness.snapshot(1L, 10001L) 139 | testHarness.notifyOfCompletedCheckpoint(10002L) 140 | 141 | val result = testTupleReader( 142 | schema, 143 | Seq( 144 | testFile(Seq("testText0", "part-0-0"), tempDir), 145 | testFile(Seq("testText1", "part-0-1"), tempDir), 146 | testFile(Seq("testText2", "part-0-2"), tempDir) 147 | ) 148 | ) 149 | 150 | result.sortBy(_._1) should be(testData) 151 | } 152 | 153 | private def writeTestRow( 154 | row: (Int, String, String), 155 | batch: VectorizedRowBatch, 156 | writer: Writer, 157 | encoder: OrcRowEncoder[(Int, String, String)] 158 | ): Unit = { 159 | encoder.encodeAndAdd(row, batch) 160 | if (batch.size == batch.getMaxSize) { 161 | flush(batch, writer) 162 | } 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/test/scala/uk/co/realb/flink/orc/GenericRecordSinkSpec.scala: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc 2 | 3 | import java.nio.file.Paths 4 | import java.util.Properties 5 | 6 | import org.apache.avro.Schema 7 | import org.apache.avro.generic.{GenericData, GenericRecord} 8 | import org.apache.flink.core.fs.Path 9 | import org.apache.flink.core.io.SimpleVersionedSerializer 10 | import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer 11 | import org.apache.flink.streaming.api.functions.sink.filesystem.{ 12 | BucketAssigner, 13 | StreamingFileSink 14 | } 15 | import org.apache.flink.streaming.api.operators.StreamSink 16 | import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness 17 | import org.junit.rules.TemporaryFolder 18 | import org.junit.{Rule, Test} 19 | import org.scalatest.matchers.should.Matchers 20 | import uk.co.realb.flink.orc.TestUtils.fileHash 21 | 22 | import scala.annotation.meta.getter 23 | import scala.collection.JavaConverters._ 24 | import 
scala.util.hashing.MurmurHash3 25 | 26 | class GenericRecordSinkSpec extends Matchers { 27 | 28 | @(Rule @getter) 29 | val streamingOutput = new TemporaryFolder 30 | @(Rule @getter) 31 | val orcOutput = new TemporaryFolder 32 | 33 | private val conf = new Properties 34 | conf.setProperty("orc.compress", "SNAPPY") 35 | conf.setProperty("orc.bloom.filter.columns", "id") 36 | 37 | private val avroSchemaString = 38 | """ 39 | |{ 40 | | "name": "record", 41 | | "type": "record", 42 | | "fields": [{ 43 | | "name": "x", 44 | | "type": "int", 45 | | "doc": "" 46 | | }, { 47 | | "name": "y", 48 | | "type": "string", 49 | | "doc": "" 50 | | }, { 51 | | "name": "z", 52 | | "type": "string", 53 | | "doc": "" 54 | | }] 55 | |} 56 | |""".stripMargin 57 | 58 | private val avroSchema = new Schema.Parser().parse(avroSchemaString) 59 | 60 | private val bucketAssigner = 61 | new BucketAssigner[GenericRecord, String] { 62 | override def getBucketId( 63 | in: GenericRecord, 64 | context: BucketAssigner.Context 65 | ): String = in.get(1).asInstanceOf[String] 66 | 67 | override def getSerializer: SimpleVersionedSerializer[String] = 68 | SimpleVersionedStringSerializer.INSTANCE 69 | } 70 | 71 | @Test def testCompareGenericRecordOrcWriterOutputWithFlink(): Unit = { 72 | val streamingTempDir = streamingOutput.getRoot.getAbsolutePath 73 | val orcTempDir = orcOutput.getRoot.getAbsolutePath 74 | 75 | val testData = (0 to 10000) 76 | .map(i => { 77 | val r: GenericRecord = new GenericData.Record(avroSchema) 78 | r.put("x", i) 79 | r.put("y", "testText") 80 | r.put("z", MurmurHash3.stringHash(i.toString).toString) 81 | r 82 | }) 83 | 84 | val sink = StreamingFileSink 85 | .forBulkFormat( 86 | new Path(streamingTempDir), 87 | OrcWriters 88 | .forGenericRecord[GenericRecord](avroSchemaString, conf) 89 | ) 90 | .withBucketAssigner(bucketAssigner) 91 | .build() 92 | 93 | val testHarness = 94 | new OneInputStreamOperatorTestHarness[ 95 | GenericRecord, 96 | AnyRef 97 | ](new StreamSink[GenericRecord](sink), 4, 4, 3) 98 | 99 | testHarness.setup() 100 | testHarness.open() 101 | testData.foreach(d => 102 | testHarness.processElement(d, d.get(0).asInstanceOf[Int]) 103 | ) 104 | 105 | testHarness.snapshot(1L, 10001L) 106 | testHarness.notifyOfCompletedCheckpoint(10002L) 107 | 108 | val testWriterFile = 109 | Paths.get(orcTempDir, "test.orc").toAbsolutePath.toString 110 | val writer = TestUtils 111 | .createGenericRecordWriter(avroSchema, conf, testWriterFile) 112 | 113 | testData.foreach(d => 114 | writer.addRow( 115 | d.getSchema.getFields.asScala.map(f => d.get(f.name())).asJava 116 | ) 117 | ) 118 | writer.close() 119 | 120 | val testStreamingFile = Paths 121 | .get(streamingTempDir, "testText", "part-3-0") 122 | .toAbsolutePath 123 | .toString 124 | 125 | fileHash(testWriterFile) should be( 126 | fileHash(testStreamingFile) 127 | ) 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test/scala/uk/co/realb/flink/orc/ReflectSinkSpec.scala: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc 2 | 3 | import java.nio.file.Paths 4 | import java.util.Properties 5 | 6 | import org.apache.flink.core.fs.Path 7 | import org.apache.flink.core.io.SimpleVersionedSerializer 8 | import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.SimpleVersionedStringSerializer 9 | import org.apache.flink.streaming.api.functions.sink.filesystem.{ 10 | BucketAssigner, 11 | StreamingFileSink 12 | } 13 | import 
org.apache.flink.streaming.api.operators.StreamSink 14 | import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness 15 | import org.junit.rules.TemporaryFolder 16 | import org.junit.{Rule, Test} 17 | import org.scalatest.matchers.should.Matchers 18 | import uk.co.realb.flink.orc.TestUtils.fileHash 19 | 20 | import scala.annotation.meta.getter 21 | import scala.util.hashing.MurmurHash3 22 | 23 | class ReflectSinkSpec extends Matchers { 24 | 25 | @(Rule @getter) 26 | val streamingOutput = new TemporaryFolder 27 | @(Rule @getter) 28 | val orcOutput = new TemporaryFolder 29 | 30 | private val conf = new Properties 31 | conf.setProperty("orc.compress", "SNAPPY") 32 | conf.setProperty("orc.bloom.filter.columns", "id") 33 | 34 | private val bucketAssigner = 35 | new BucketAssigner[TestData, String] { 36 | override def getBucketId( 37 | in: TestData, 38 | context: BucketAssigner.Context 39 | ): String = in.y 40 | 41 | override def getSerializer: SimpleVersionedSerializer[String] = 42 | SimpleVersionedStringSerializer.INSTANCE 43 | } 44 | 45 | @Test def testComparePOJOOrcWriterOutputWithFlink(): Unit = { 46 | val streamingTempDir = streamingOutput.getRoot.getAbsolutePath 47 | val orcTempDir = orcOutput.getRoot.getAbsolutePath 48 | 49 | val testData = (0 to 10000) 50 | .map(i => 51 | TestData( 52 | i, 53 | "testText", 54 | MurmurHash3.stringHash(i.toString).toString 55 | ) 56 | ) 57 | 58 | val sink = StreamingFileSink 59 | .forBulkFormat( 60 | new Path(streamingTempDir), 61 | OrcWriters 62 | .forReflectRecord[TestData](classOf[TestData], conf) 63 | ) 64 | .withBucketAssigner(bucketAssigner) 65 | .build() 66 | 67 | val testHarness = 68 | new OneInputStreamOperatorTestHarness[ 69 | TestData, 70 | AnyRef 71 | ](new StreamSink[TestData](sink), 4, 4, 3) 72 | 73 | testHarness.setup() 74 | testHarness.open() 75 | testData.foreach(d => testHarness.processElement(d, d.x)) 76 | 77 | testHarness.snapshot(1L, 10001L) 78 | testHarness.notifyOfCompletedCheckpoint(10002L) 79 | 80 | val testWriterFile = 81 | Paths.get(orcTempDir, "test.orc").toAbsolutePath.toString 82 | val writer = TestUtils 83 | .createHiveWriter[TestData](conf, classOf[TestData], testWriterFile) 84 | 85 | testData.foreach(d => writer.addRow(d)) 86 | writer.close() 87 | 88 | val testStreamingFile = Paths 89 | .get(streamingTempDir, "testText", "part-3-0") 90 | .toAbsolutePath 91 | .toString 92 | 93 | fileHash(testWriterFile) should be( 94 | fileHash(testStreamingFile) 95 | ) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/test/scala/uk/co/realb/flink/orc/TestData.scala: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc 2 | 3 | case class TestData(x: Int, y: String, z: String) 4 | -------------------------------------------------------------------------------- /src/test/scala/uk/co/realb/flink/orc/TestEncoder.scala: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc 2 | 3 | import org.apache.hadoop.hive.ql.exec.vector.{ 4 | BytesColumnVector, 5 | LongColumnVector, 6 | VectorizedRowBatch 7 | } 8 | import uk.co.realb.flink.orc.encoder.OrcRowEncoder 9 | 10 | class TestEncoder 11 | extends OrcRowEncoder[(Int, String, String)]() 12 | with Serializable { 13 | override def encodeAndAdd( 14 | datum: (Int, String, String), 15 | batch: VectorizedRowBatch 16 | ): Unit = { 17 | val row = nextIndex(batch) 18 | 
batch.cols(0).asInstanceOf[LongColumnVector].vector(row) = datum._1 19 | batch 20 | .cols(1) 21 | .asInstanceOf[BytesColumnVector] 22 | .setVal(row, datum._2.getBytes) 23 | batch 24 | .cols(2) 25 | .asInstanceOf[BytesColumnVector] 26 | .setVal(row, datum._3.getBytes) 27 | incrementBatchSize(batch) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/uk/co/realb/flink/orc/TestUtils.scala: -------------------------------------------------------------------------------- 1 | package uk.co.realb.flink.orc 2 | 3 | import java.nio.file.Paths 4 | import java.security.MessageDigest 5 | import java.util.Properties 6 | 7 | import org.apache.avro.Schema 8 | import org.apache.hadoop.conf.Configuration 9 | import org.apache.hadoop.fs 10 | import org.apache.hadoop.hive.ql.exec.vector.{ 11 | BytesColumnVector, 12 | LongColumnVector, 13 | VectorizedRowBatch 14 | } 15 | import org.apache.hadoop.hive.ql.io.orc 16 | import org.apache.hadoop.hive.serde2.avro.AvroObjectInspectorGenerator 17 | import org.apache.hadoop.hive.serde2.objectinspector.{ 18 | ObjectInspector, 19 | ObjectInspectorFactory 20 | } 21 | import org.apache.orc.{OrcFile, Reader, TypeDescription, Writer} 22 | 23 | import scala.collection.mutable.ArrayBuffer 24 | import scala.reflect.io.File 25 | 26 | object TestUtils { 27 | def createReader(conf: Configuration, testFile: String): Reader = { 28 | OrcFile.createReader( 29 | new fs.Path(testFile), 30 | OrcFile.readerOptions(conf) 31 | ) 32 | } 33 | 34 | def createWriter( 35 | props: Properties, 36 | schema: TypeDescription, 37 | testFile: String 38 | ): Writer = { 39 | OrcFile.createWriter( 40 | new fs.Path(testFile), 41 | OrcFile 42 | .writerOptions(OrcUtils.getConfiguration(props)) 43 | .setSchema(schema) 44 | ) 45 | } 46 | 47 | def createHiveWriter[T]( 48 | props: Properties, 49 | classType: Class[T], 50 | testFile: String 51 | ): orc.Writer = { 52 | 53 | val inspector = ObjectInspectorFactory.getReflectionObjectInspector( 54 | classType, 55 | ObjectInspectorFactory.ObjectInspectorOptions.JAVA 56 | ) 57 | 58 | orcHiveWriter(props, testFile, inspector) 59 | } 60 | 61 | def createGenericRecordWriter( 62 | avroSchema: Schema, 63 | props: Properties, 64 | testFile: String 65 | ): orc.Writer = { 66 | 67 | val inspector = new AvroObjectInspectorGenerator(avroSchema).getObjectInspector 68 | 69 | orcHiveWriter(props, testFile, inspector) 70 | } 71 | 72 | private def orcHiveWriter( 73 | props: Properties, 74 | testFile: String, 75 | inspector: ObjectInspector 76 | ) = { 77 | val writer: orc.Writer = 78 | org.apache.hadoop.hive.ql.io.orc.OrcFile.createWriter( 79 | new fs.Path(testFile), 80 | org.apache.hadoop.hive.ql.io.orc.OrcFile 81 | .writerOptions(props, new Configuration()) 82 | .inspector(inspector) 83 | ) 84 | writer 85 | } 86 | 87 | def testTupleReader( 88 | schema: TypeDescription, 89 | testFiles: Seq[String] 90 | ): Seq[(Int, String, String)] = { 91 | val result = ArrayBuffer[(Int, String, String)]() 92 | testFiles.foreach(testFile => { 93 | val batch = schema.createRowBatch 94 | val rr = createReader( 95 | new Configuration(), 96 | testFile 97 | ).rows() 98 | 99 | while (rr.nextBatch(batch)) { 100 | (0 until batch.size) 101 | .map(i => { 102 | val x = batch.cols(0).asInstanceOf[LongColumnVector].vector.toSeq 103 | val y = batch 104 | .cols(1) 105 | .asInstanceOf[BytesColumnVector] 106 | val z = batch 107 | .cols(2) 108 | .asInstanceOf[BytesColumnVector] 109 | result.append((x(i).toInt, y.toString(i), z.toString(i))) 110 | 111 | }) 
112 | } 113 | }) 114 | 115 | result 116 | } 117 | 118 | def testFile(x: Seq[String], tempDir: String): String = { 119 | Paths 120 | .get(tempDir, x: _*) 121 | .toAbsolutePath 122 | .toString 123 | } 124 | 125 | def flush(batch: VectorizedRowBatch, writer: Writer): Unit = { 126 | if (batch.size > 0) { 127 | writer.addRowBatch(batch) 128 | batch.reset() 129 | } 130 | } 131 | 132 | def fileHash(file: String): String = 133 | MessageDigest 134 | .getInstance("MD5") 135 | .digest(File(file).toByteArray()) 136 | .map("%02x".format(_)) 137 | .mkString 138 | } 139 | --------------------------------------------------------------------------------