├── LICENSE ├── README.md ├── build-single-assembly.sh ├── log4j.properties ├── pom.xml ├── run-read.sh ├── run-write.sh └── src └── main └── java └── com └── github └── animeshtrivedi └── arrowexample ├── ArrowExampleClass.java ├── ArrowOutputStream.java ├── ArrowRead.java ├── ArrowWrite.java └── Utils.java /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ArrowExample 2 | Java read and write example for Apache Arrow 3 | 4 | For more details please see 5 | [https://github.com/animeshtrivedi/animeshtrivedi.github.io/blob/master/blog/2017-12-26-arrow.md](https://github.com/animeshtrivedi/animeshtrivedi.github.io/blob/master/blog/2017-12-26-arrow.md) 6 | 7 | ## How to run 8 | 9 | Run `build-single-assembly.sh` to make a single uber jar containing 10 | all classes. 11 | 12 | To write a file run `run-write.sh`. This will generate random 13 | data and write it to the `./example.arrow` file. 14 | 15 | Then run `run-read.sh`; this will read the file and display the 16 | data. 
-------------------------------------------------------------------------------- /build-single-assembly.sh: -------------------------------------------------------------------------------- 1 | mvn -T 1C clean compile assembly:single 2 | -------------------------------------------------------------------------------- /log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to ERROR and its only appender to A1. 2 | log4j.rootLogger=ERROR, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 10 | 11 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.github.animeshtrivedi 4 | arrowexample 5 | 1.0 6 | ${project.artifactId} 7 | Java read/write example for Apache Arrow 8 | 2016 9 | 10 | 11 | 12 | Apache 2.0 License 13 | http://www.apache.org/licenses/LICENSE-2.0.html 14 | repo 15 | 16 | 17 | 18 | 19 | atr 20 | Animesh Trivedi 21 | atr@zurich.ibm.com 22 | IBM Research, Zurich 23 | 24 | 25 | 26 | 27 | 1.8 28 | 1.8 29 | UTF-8 30 | 31 | 32 | 33 | 34 | 35 | org.apache.arrow 36 | arrow-memory 37 | 0.8.0 38 | 39 | 40 | 41 | org.apache.arrow 42 | arrow-vector 43 | 0.8.0 44 | 45 | 46 | 47 | junit 48 | junit 49 | 4.11 50 | test 51 | 52 | 53 | 54 | 55 | 56 | 57 | maven-compiler-plugin 58 | 3.1 59 | 60 | 1.8 61 | 1.8 62 | 63 | 64 | 65 | org.apache.maven.plugins 66 | maven-surefire-plugin 67 | 2.18.1 68 | 69 | 70 | 71 | org.apache.maven.plugins 72 | maven-jar-plugin 73 | 2.4 74 | 75 | 76 | 77 | true 78 | lib/ 79 | com.github.animeshtrivedi.arrowexample.ArrowRead 80 | 81 | 82 | 83 | 84 | 85 | maven-assembly-plugin 86 | 87 | 88 | 89 | 
com.github.animeshtrivedi.arrowexample.ArrowRead 90 | 91 | 92 | 93 | jar-with-dependencies 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /run-read.sh: -------------------------------------------------------------------------------- 1 | java -cp .:./target/arrowexample-1.0-jar-with-dependencies.jar com.github.animeshtrivedi.arrowexample.ArrowRead $@ 2 | -------------------------------------------------------------------------------- /run-write.sh: -------------------------------------------------------------------------------- 1 | rm example.arrow 2 | java -cp .:./target/arrowexample-1.0-jar-with-dependencies.jar com.github.animeshtrivedi.arrowexample.ArrowWrite $@ 3 | -------------------------------------------------------------------------------- /src/main/java/com/github/animeshtrivedi/arrowexample/ArrowExampleClass.java: -------------------------------------------------------------------------------- 1 | package com.github.animeshtrivedi.arrowexample; 2 | 3 | import java.util.Random; 4 | 5 | /** 6 | * Created by atr on 15.12.17. 
7 | */ 8 | public class ArrowExampleClass { 9 | public int anInt; 10 | public long aLong; 11 | public byte[] arr; 12 | public float aFloat; 13 | public Random random; 14 | 15 | public ArrowExampleClass(Random random, int index){ 16 | this.random = random; 17 | this.anInt = this.random.nextInt(1024); 18 | this.aLong = this.random.nextInt(Integer.MAX_VALUE); 19 | this.arr = new byte[this.random.nextInt(1024)]; 20 | this.random.nextBytes(this.arr); 21 | this.aFloat = this.random.nextFloat(); 22 | } 23 | 24 | public static String firstX(byte[] data, int items){ 25 | int toProcess = Math.min(items, data.length); 26 | StringBuilder sb = new StringBuilder(); 27 | for(int i = 0; i < toProcess; i++) { 28 | sb.append(String.format("0x%02x", data[i])+ " "); 29 | } 30 | return sb.toString(); 31 | } 32 | 33 | public static long hashArray(byte[] data){ 34 | long ret = 0; 35 | for(int i = 0; i < data.length;i++) 36 | ret+=data[i]; 37 | return ret; 38 | } 39 | 40 | public String toString() { 41 | return anInt + "\t | " + 42 | + aLong + "\t | " + 43 | " arr[" + this.arr.length + "] " + firstX(this.arr, 5) + "\t | " + 44 | + aFloat; 45 | } 46 | 47 | public long getSumHash(){ 48 | long ret = 0; 49 | ret+=anInt; 50 | ret+=aLong; 51 | ret+=ArrowExampleClass.hashArray(this.arr); 52 | ret+=aFloat; 53 | return ret; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/com/github/animeshtrivedi/arrowexample/ArrowOutputStream.java: -------------------------------------------------------------------------------- 1 | package com.github.animeshtrivedi.arrowexample; 2 | 3 | import java.io.FileOutputStream; 4 | import java.io.IOException; 5 | import java.nio.ByteBuffer; 6 | import java.nio.channels.WritableByteChannel; 7 | 8 | /** 9 | * Created by atr on 21.12.17. 
/**
 * A {@link WritableByteChannel} that forwards every written buffer to a
 * {@link FileOutputStream}.
 *
 * <p>Buffers that expose a backing array are written straight from that array;
 * all other buffers (direct buffers, read-only heap buffers) are copied through
 * a 1 MiB scratch array first. The channel counts the bytes it has forwarded
 * and prints the total to standard output when it is closed.
 */
public class ArrowOutputStream implements WritableByteChannel {
    private FileOutputStream outStream;
    // primitive on purpose: a boxed Boolean adds nothing here and could be null
    private boolean isOpen;
    // scratch space used to move bytes out of buffers that have no accessible array
    private byte[] tempBuffer;
    // running total of bytes handed to outStream
    private long bytesSoFar;

    /**
     * Creates an open channel on top of the given stream.
     *
     * @param outStream destination stream; it is closed when this channel is closed
     */
    public ArrowOutputStream(FileOutputStream outStream) {
        this.outStream = outStream;
        this.isOpen = true;
        this.tempBuffer = new byte[1024 * 1024]; // 1MB buffering
        this.bytesSoFar = 0;
    }

    /**
     * Drains {@code src} in chunks of at most {@code tempBuffer.length} bytes.
     * Used for any buffer whose backing array cannot be accessed.
     */
    private int writeViaTempBuffer(ByteBuffer src) throws IOException {
        final int total = src.remaining();
        int written = 0;
        while (written < total) {
            int chunk = Math.min(total - written, this.tempBuffer.length);
            // relative bulk get: advances src.position() by chunk
            src.get(this.tempBuffer, 0, chunk);
            this.outStream.write(this.tempBuffer, 0, chunk);
            written += chunk;
        }
        this.bytesSoFar += total;
        return total;
    }

    /**
     * Writes the remaining bytes of an array-backed buffer without copying them.
     */
    private int writeArrayBackedBuffer(ByteBuffer src) throws IOException {
        final int total = src.remaining();
        // position() is relative to the buffer, not to the backing array: a sliced
        // buffer starts at arrayOffset() inside array(), so the offset must be added.
        this.outStream.write(src.array(), src.arrayOffset() + src.position(), total);
        src.position(src.position() + total);
        this.bytesSoFar += total;
        return total;
    }

    /**
     * Writes all remaining bytes of {@code src} to the underlying stream and
     * advances the buffer's position to its limit.
     *
     * @param src buffer to drain
     * @return number of bytes written, i.e. {@code src.remaining()} at call time
     * @throws java.nio.channels.ClosedChannelException if this channel has been closed
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int write(ByteBuffer src) throws IOException {
        if (!this.isOpen) {
            throw new java.nio.channels.ClosedChannelException();
        }
        // hasArray() rather than !isDirect(): a read-only heap buffer is not direct,
        // yet array() throws ReadOnlyBufferException for it.
        if (src.hasArray()) {
            return writeArrayBackedBuffer(src);
        }
        return writeViaTempBuffer(src);
    }

    @Override
    public boolean isOpen() {
        return this.isOpen;
    }

    /**
     * Flushes and closes the underlying stream, then reports the byte total on
     * standard output. Calling it again on a closed channel does nothing.
     *
     * @throws IOException if flushing or closing the underlying stream fails
     */
    @Override
    public void close() throws IOException {
        if (!this.isOpen) {
            return;
        }
        this.isOpen = false;
        try {
            this.outStream.flush();
        } finally {
            this.outStream.close();
        }
        System.out.println("Output stream wrote " + this.bytesSoFar + " bytes");
    }
}
/src/main/java/com/github/animeshtrivedi/arrowexample/ArrowRead.java: -------------------------------------------------------------------------------- 1 | package com.github.animeshtrivedi.arrowexample; 2 | 3 | import com.google.common.collect.ImmutableList; 4 | import org.apache.arrow.memory.RootAllocator; 5 | import org.apache.arrow.vector.*; 6 | import org.apache.arrow.vector.dictionary.DictionaryProvider; 7 | import org.apache.arrow.vector.ipc.ArrowFileReader; 8 | import org.apache.arrow.vector.ipc.SeekableReadChannel; 9 | import org.apache.arrow.vector.ipc.message.ArrowBlock; 10 | import org.apache.arrow.vector.types.Types; 11 | import org.apache.arrow.vector.types.pojo.ArrowType; 12 | import org.apache.arrow.vector.types.pojo.Field; 13 | import org.apache.arrow.vector.types.pojo.FieldType; 14 | import org.apache.arrow.vector.types.pojo.Schema; 15 | 16 | import java.io.File; 17 | import java.io.FileInputStream; 18 | import java.io.IOException; 19 | import java.util.List; 20 | 21 | import static com.github.animeshtrivedi.arrowexample.Utils.validateFile; 22 | import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; 23 | 24 | /** 25 | * Created by atr on 15.12.17. 
26 | */ 27 | public class ArrowRead { 28 | private RootAllocator ra = null; 29 | private long checkSumx; 30 | private long intCsum; 31 | private long longCsum; 32 | private long arrCsum; 33 | private long floatCsum; 34 | private long nullEntries; 35 | 36 | public ArrowRead(){ 37 | this.ra = new RootAllocator(Integer.MAX_VALUE); 38 | this.nullEntries = 0; 39 | this.checkSumx = 0; 40 | this.intCsum = 0; 41 | this.longCsum = 0; 42 | this.arrCsum = 0; 43 | this.floatCsum = 0; 44 | } 45 | 46 | public void makeRead(String filename) throws Exception { 47 | File arrowFile = validateFile(filename, true); 48 | FileInputStream fileInputStream = new FileInputStream(arrowFile); 49 | DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); 50 | 51 | ArrowFileReader arrowFileReader = new ArrowFileReader(new SeekableReadChannel(fileInputStream.getChannel()), 52 | this.ra); 53 | System.out.println("\nReading the arrow file : " + filename); 54 | VectorSchemaRoot root = arrowFileReader.getVectorSchemaRoot(); 55 | System.out.println("File size : " + arrowFile.length() + 56 | " schema is " + root.getSchema().toString()); 57 | 58 | List arrowBlocks = arrowFileReader.getRecordBlocks(); 59 | System.out.println("Number of arrow blocks are " + arrowBlocks.size()); 60 | for (int i = 0; i < arrowBlocks.size(); i++) { 61 | ArrowBlock rbBlock = arrowBlocks.get(i); 62 | if (!arrowFileReader.loadRecordBatch(rbBlock)) { 63 | throw new IOException("Expected to read record batch"); 64 | } 65 | System.out.println("\t["+i+"] ArrowBlock, offset: " + rbBlock.getOffset() + 66 | ", metadataLength: " + rbBlock.getMetadataLength() + 67 | ", bodyLength " + rbBlock.getBodyLength()); 68 | /* we can now process this block, it is now loaded */ 69 | System.out.println("\t["+i+"] row count for this block is " + root.getRowCount()); 70 | List fieldVector = root.getFieldVectors(); 71 | System.out.println("\t["+i+"] number of fieldVectors (corresponding to columns) : " + 
fieldVector.size()); 72 | for(int j = 0; j < fieldVector.size(); j++){ 73 | Types.MinorType mt = fieldVector.get(j).getMinorType(); 74 | switch(mt){ 75 | case INT: showIntAccessor(fieldVector.get(j)); break; 76 | case BIGINT: showBigIntAccessor(fieldVector.get(j)); break; 77 | case VARBINARY: showVarBinaryAccessor(fieldVector.get(j)); break; 78 | case FLOAT4: showFloat4Accessor(fieldVector.get(j));break; 79 | case FLOAT8: showFloat8Accessor(fieldVector.get(j));break; 80 | default: throw new Exception(" MinorType " + mt); 81 | } 82 | //showAccessor(fieldVector.get(j).getAccessor()); 83 | //System.out.println("\t["+i+"] accessor " + j + " | " + getAccessorString(accessor)); 84 | } 85 | } 86 | System.out.println("Done processing the file"); 87 | arrowFileReader.close(); 88 | long s1 = this.intCsum + this.longCsum + this.arrCsum + this.floatCsum; 89 | System.out.println("intSum " + intCsum + " longSum " + longCsum + " arrSum " + arrCsum + " floatSum " + floatCsum + " = " + s1); 90 | System.err.println("Colsum Checksum > " + this.checkSumx + " , difference " + (s1 - this.checkSumx)); 91 | } 92 | 93 | private String getAccessorString(ValueVector accessor){ 94 | return "accessorType: " + accessor.getClass().getCanonicalName() 95 | + " valueCount " + accessor.getValueCount() 96 | + " nullCount " + accessor.getNullCount(); 97 | } 98 | 99 | private void showAccessor(ValueVector accessor){ 100 | for(int j = 0; j < accessor.getValueCount(); j++){ 101 | if(!accessor.isNull(j)){ 102 | System.out.println("\t\t accessorType: " + accessor.getClass().getCanonicalName() 103 | + " value[" + j +"] " + accessor.getObject(j)); 104 | } else { 105 | this.nullEntries++; 106 | System.out.println("\t\t accessorType: " + accessor.getClass().getCanonicalName() + " NULL at " + j); 107 | } 108 | } 109 | } 110 | 111 | private void showIntAccessor(FieldVector fx){ 112 | IntVector intVector = ((IntVector) fx); 113 | for(int j = 0; j < intVector.getValueCount(); j++){ 114 | if(!intVector.isNull(j)){ 
115 | int value = intVector.get(j); 116 | System.out.println("\t\t intAccessor[" + j +"] " + value); 117 | intCsum+=value; 118 | this.checkSumx+=value; 119 | } else { 120 | this.nullEntries++; 121 | System.out.println("\t\t intAccessor[" + j +"] : NULL "); 122 | } 123 | } 124 | } 125 | 126 | private void showBigIntAccessor(FieldVector fx){ 127 | BigIntVector bigIntVector = ((BigIntVector)fx); 128 | for(int j = 0; j < bigIntVector.getValueCount(); j++){ 129 | if(!bigIntVector.isNull(j)){ 130 | long value = bigIntVector.get(j); 131 | System.out.println("\t\t bigIntAccessor[" + j +"] " + value); 132 | longCsum+=value; 133 | this.checkSumx+=value; 134 | } else { 135 | this.nullEntries++; 136 | System.out.println("\t\t bigIntAccessor[" + j +"] : NULL "); 137 | } 138 | } 139 | } 140 | 141 | private void showVarBinaryAccessor(FieldVector fx){ 142 | VarBinaryVector varBinaryVector =((VarBinaryVector) fx); 143 | for(int j = 0; j < varBinaryVector.getValueCount(); j++){ 144 | if(!varBinaryVector.isNull(j)){ 145 | byte[] value = varBinaryVector.get(j); 146 | long valHash = ArrowExampleClass.hashArray(value); 147 | System.out.println("\t\t varBinaryAccessor[" + j +"] " + ArrowExampleClass.firstX(value, 5)); 148 | arrCsum += valHash; 149 | this.checkSumx+=valHash; 150 | } else { 151 | this.nullEntries++; 152 | System.out.println("\t\t varBinaryAccessor[" + j +"] : NULL "); 153 | } 154 | } 155 | } 156 | 157 | private void showFloat4Accessor(FieldVector fx){ 158 | Float4Vector float4Vector = ((Float4Vector)fx); 159 | for(int j = 0; j < float4Vector.getValueCount(); j++){ 160 | if(!float4Vector.isNull(j)){ 161 | float value = float4Vector.get(j); 162 | System.out.println("\t\t float4[" + j +"] " + value); 163 | floatCsum+=value; 164 | this.checkSumx+=value; 165 | } else { 166 | this.nullEntries++; 167 | System.out.println("\t\t float4[" + j +"] : NULL "); 168 | } 169 | } 170 | } 171 | 172 | private void showFloat8Accessor(FieldVector fx){ 173 | Float8Vector float8Vector = 
((Float8Vector)fx); 174 | for(int j = 0; j < float8Vector.getValueCount(); j++){ 175 | if(!float8Vector.isNull(j)){ 176 | double value = float8Vector.get(j); 177 | System.out.println("\t\t float8[" + j +"] " + value); 178 | floatCsum+=value; 179 | this.checkSumx+=value; 180 | } else { 181 | this.nullEntries++; 182 | System.out.println("\t\t float8[" + j +"] : NULL "); 183 | } 184 | } 185 | } 186 | 187 | // 188 | // int batchNumber = 0; 189 | // int runningIndex = 0; 190 | // while(hasMore){ 191 | // System.out.println("\t processing batch ..." + batchNumber); 192 | // // do something here 193 | // FieldVector fv = root.getVector("int"); 194 | // ValueVector.Accessor accessor = fv.getAccessor(); 195 | // System.out.println("\tintAccessor | nullCount: " + accessor.getNullCount() + " getValueCount: " + accessor.getValueCount()); 196 | // System.out.println("\t isNull at 0 " + accessor.isNull(0) + " value " + accessor.getObject(0)); 197 | // // and then accounting stuff 198 | // batchNumber++; 199 | // hasMore = arrowFileReader.loadNextBatch(); 200 | // } 201 | 202 | public void someDeadCode(ArrowFileReader arrowFileReader) throws IOException { 203 | for (ArrowBlock rbBlock : arrowFileReader.getRecordBlocks()) { 204 | if (!arrowFileReader.loadRecordBatch(rbBlock)) { 205 | throw new IOException("Expected to read record batch"); 206 | } 207 | System.out.println(" \t\t " + rbBlock); 208 | } 209 | } 210 | 211 | public static void main(String[] args) { 212 | System.out.println("Hello World!"); // Display the string. 
213 | ArrowRead ex = new ArrowRead(); 214 | try { 215 | System.out.println(" args are " + args.length); 216 | if(args.length == 2) { 217 | ex.makeRead(args[1]); 218 | } else { 219 | ex.makeRead("./example.arrow"); 220 | } 221 | } catch (Exception e) { 222 | e.printStackTrace(); 223 | } 224 | System.out.println(" >>> null entires " + ex.nullEntries); 225 | } 226 | } 227 | -------------------------------------------------------------------------------- /src/main/java/com/github/animeshtrivedi/arrowexample/ArrowWrite.java: -------------------------------------------------------------------------------- 1 | package com.github.animeshtrivedi.arrowexample; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.util.List; 6 | import java.util.Random; 7 | 8 | import com.google.common.collect.ImmutableList; 9 | import io.netty.buffer.ArrowBuf; 10 | import org.apache.arrow.memory.*; 11 | import org.apache.arrow.vector.*; 12 | import org.apache.arrow.vector.dictionary.DictionaryProvider; 13 | import org.apache.arrow.vector.ipc.ArrowFileWriter; 14 | import org.apache.arrow.vector.types.pojo.*; 15 | 16 | import static com.github.animeshtrivedi.arrowexample.Utils.validateFile; 17 | import static org.apache.arrow.vector.types.FloatingPointPrecision.SINGLE; 18 | 19 | /** 20 | * Created by atr on 14.12.17. 
21 | */ 22 | public class ArrowWrite { 23 | // lets say we want to make a schema of int, long, binary, double 24 | // we have 1 element 25 | 26 | private ArrowExampleClass data[]; 27 | private RootAllocator ra = null; 28 | private Random random; 29 | private int entries; 30 | private int maxEntries; 31 | private long checkSum; 32 | private long nullEntries; 33 | private boolean useNullValues; 34 | private FileOutputStream fileOutputStream; 35 | private VectorSchemaRoot root; 36 | private ArrowFileWriter arrowFileWriter; 37 | private int batchSize; 38 | 39 | public ArrowWrite(){ 40 | this.useNullValues = false; 41 | this.nullEntries = 0; 42 | this.maxEntries = 1024; 43 | this.checkSum = 0; 44 | this.batchSize = 100; 45 | random = new Random(System.nanoTime()); 46 | this.entries = this.random.nextInt(this.maxEntries); 47 | this.data = new ArrowExampleClass[this.entries]; 48 | for(int i =0; i < this.entries; i++){ 49 | this.data[i] = new ArrowExampleClass(this.random, i); 50 | long csum = this.data[i].getSumHash(); 51 | //System.out.println(this.data[i].toString() + " csum: " + csum); 52 | checkSum+=csum; 53 | } 54 | long s1 = showColumnSum(); 55 | System.out.println(); 56 | //essentially here is the problem - this sum should match 57 | //System.out.println("They match : " + (s1 == checkSum) + " colSum " + s1 + " rowSum " + this.checkSum + " difference is " + (this.checkSum - s1)); 58 | this.ra = new RootAllocator(Integer.MAX_VALUE); 59 | } 60 | 61 | private long showColumnSum(){ 62 | long intSum = 0; 63 | long longSum = 0; 64 | long arrSum = 0; 65 | long floatSum = 0; 66 | for(int i =0; i < this.entries; i++){ 67 | intSum+=this.data[i].anInt; 68 | longSum+=this.data[i].aLong; 69 | arrSum+=ArrowExampleClass.hashArray(this.data[i].arr); 70 | floatSum+=this.data[i].aFloat; 71 | } 72 | System.out.println("intSum " + intSum + " longSum " + longSum + " arrSum " + arrSum + " floatSum " + floatSum); 73 | return intSum + longSum + arrSum + floatSum; 74 | } 75 | 76 | 77 | 
private Schema makeSchema(){ 78 | ImmutableList.Builder childrenBuilder = ImmutableList.builder(); 79 | childrenBuilder.add(new Field("int", FieldType.nullable(new ArrowType.Int(32, true)), null)); 80 | childrenBuilder.add(new Field("long", FieldType.nullable(new ArrowType.Int(64, true)), null)); 81 | childrenBuilder.add(new Field("binary", FieldType.nullable(new ArrowType.Binary()), null)); 82 | childrenBuilder.add(new Field("double", FieldType.nullable(new ArrowType.FloatingPoint(SINGLE)), null)); 83 | return new Schema(childrenBuilder.build(), null); 84 | } 85 | 86 | public void setupWrite(String filename, boolean useCustom) throws Exception { 87 | File arrowFile = validateFile(filename, false); 88 | this.fileOutputStream = new FileOutputStream(arrowFile); 89 | Schema schema = makeSchema(); 90 | this.root = VectorSchemaRoot.create(schema, this.ra); 91 | DictionaryProvider.MapDictionaryProvider provider = new DictionaryProvider.MapDictionaryProvider(); 92 | if (!useCustom) { 93 | /* default java implementation of the channel */ 94 | this.arrowFileWriter = new ArrowFileWriter(root, 95 | provider, 96 | this.fileOutputStream.getChannel()); 97 | } else { 98 | /* custom channel implementation in ArrowOutputStream */ 99 | this.arrowFileWriter = new ArrowFileWriter(root, 100 | provider, 101 | new ArrowOutputStream(this.fileOutputStream)); 102 | } 103 | 104 | if (false) { 105 | // show some stuff about the schema and layout 106 | for (Field field : root.getSchema().getFields()) { 107 | FieldVector vector = root.getVector(field.getName()); 108 | showFieldLayout(field, vector); 109 | } 110 | } 111 | System.out.println("Generated " + this.entries + " data entries , batch size " + batchSize + " usingCustomWriter: " + useCustom + " useNullValues " + this.useNullValues); 112 | } 113 | public void writeData() throws Exception{ 114 | // writing logic starts here 115 | this.batchSize = 100; 116 | arrowFileWriter.start(); 117 | for(int i = 0; i < this.entries;) { 118 | int 
toProcessItems = Math.min(this.batchSize, this.entries - i); 119 | // set the batch row count 120 | root.setRowCount(toProcessItems); 121 | for (Field field : root.getSchema().getFields()) { 122 | FieldVector vector = root.getVector(field.getName()); 123 | switch (vector.getMinorType()) { 124 | case INT: 125 | writeFieldInt(vector, i, toProcessItems); 126 | break; 127 | case BIGINT: 128 | writeFieldLong(vector, i, toProcessItems); 129 | break; 130 | case VARBINARY: 131 | writeFieldVarBinary(vector, i, toProcessItems); 132 | break; 133 | case FLOAT4: 134 | writeFieldFloat4(vector, i, toProcessItems); 135 | break; 136 | default: 137 | throw new Exception(" Not supported yet type: " + vector.getMinorType()); 138 | } 139 | } 140 | arrowFileWriter.writeBatch(); 141 | i+=toProcessItems; 142 | } 143 | arrowFileWriter.end(); 144 | arrowFileWriter.close(); 145 | fileOutputStream.flush(); 146 | fileOutputStream.close(); 147 | System.err.println("****** : " + this.checkSum); 148 | } 149 | 150 | private int isSet(){ 151 | if(useNullValues) { 152 | if (this.random.nextInt() % 10 == 0) { 153 | this.nullEntries++; 154 | return 0; 155 | } 156 | } 157 | return 1; 158 | } 159 | 160 | private void writeFieldInt(FieldVector fieldVector, int from, int items){ 161 | IntVector intVector = (IntVector) fieldVector; 162 | intVector.setInitialCapacity(items); 163 | intVector.allocateNew(); 164 | for(int i = 0; i < items; i++){ 165 | intVector.setSafe(i, isSet(), this.data[from + i].anInt); 166 | } 167 | // how many are set 168 | fieldVector.setValueCount(items); 169 | } 170 | 171 | private void writeFieldLong(FieldVector fieldVector, int from, int items){ 172 | BigIntVector bigIntVector = (BigIntVector) fieldVector; 173 | bigIntVector.setInitialCapacity(items); 174 | bigIntVector.allocateNew(); 175 | for(int i = 0; i < items; i++){ 176 | bigIntVector.setSafe(i, isSet(), this.data[from + i].aLong); 177 | } 178 | // how many are set 179 | bigIntVector.setValueCount(items); 180 | } 181 | 182 | 
private void writeFieldVarBinary(FieldVector fieldVector, int from, int items){ 183 | VarBinaryVector varBinaryVector = (VarBinaryVector) fieldVector; 184 | varBinaryVector.setInitialCapacity(items); 185 | varBinaryVector.allocateNew(); 186 | for(int i = 0; i < items; i++){ 187 | if(isSet() == 0){ 188 | varBinaryVector.setNull(i); 189 | } else { 190 | varBinaryVector.setIndexDefined(i); 191 | varBinaryVector.setValueLengthSafe(i, this.data[from + i].arr.length); 192 | varBinaryVector.setSafe(i, this.data[from + i].arr); 193 | } 194 | } 195 | // how many are set 196 | varBinaryVector.setValueCount(items); 197 | } 198 | 199 | private void writeFieldFloat4(FieldVector fieldVector, int from, int items){ 200 | Float4Vector float4Vector = (Float4Vector ) fieldVector; 201 | float4Vector.setInitialCapacity(items); 202 | float4Vector.allocateNew(); 203 | for(int i = 0; i < items; i++){ 204 | float4Vector.setSafe(i, isSet(), this.data[from + i].aFloat); 205 | } 206 | // how many are set 207 | float4Vector.setValueCount(items); 208 | } 209 | 210 | private void showFieldLayout(Field field, FieldVector fieldVector){ 211 | // per field execution 212 | TypeLayout typeLayout = TypeLayout.getTypeLayout(field.getType()); 213 | List vectorTypes = typeLayout.getBufferTypes(); 214 | ArrowBuf[] vectorBuffers = new ArrowBuf[vectorTypes.size()]; 215 | 216 | if (vectorTypes.size() != vectorBuffers.length) { 217 | throw new IllegalArgumentException("vector types and vector buffers are not the same size: " + vectorTypes.size() + " != " + vectorBuffers.length); 218 | } 219 | System.out.println(" ----- [ " + field.toString() + " ] -------- "); 220 | System.out.println("FieldVector type: " + fieldVector.getClass().getCanonicalName()); 221 | System.out.println("TypeLayout is " + typeLayout.toString() + " vectorSize is " + vectorTypes.size()); 222 | for(int i = 0; i < vectorTypes.size(); i++){ 223 | /* fields in the vector type tells how to locate, for primitive types it has only 2 validity and 
data 224 | whereas for binary it has 3, validity, offset and data. I suppose if I remove the nullable part, then 225 | it will be only 1 and 2 types - can confirm? TODO: 226 | */ 227 | System.out.println(" \t vector type entries [" + i + "] " + vectorTypes.get(i).toString()); 228 | } 229 | System.out.println("*********************************************"); 230 | // 231 | // fieldVector.allocateNew(); 232 | // fieldVector.getMutator().setValueCount(this.entries); 233 | // System.out.println(" Setting up the mutator count to be " + this.entries + " ** " + fieldVector.getMutator().getClass().getCanonicalName() + " >>>>> " + field.getFieldType().getType().getTypeID()); 234 | // /* based upon the schema */ 235 | // for (int v = 0; v < vectorTypes.size(); v++){ 236 | // /* we get specific ArrowVectorType and associated BufferBacked */ 237 | // ArrowVectorType vectorType = vectorTypes.get(v); 238 | // BufferBacked bufferBacked = fieldInnerVectors.get(v); 239 | // /* I don't like this explicit casting ? 
*/ 240 | // ValueVector valueVector = (ValueVector) bufferBacked; 241 | // System.out.println("\t valueVector minor type is : " + valueVector.getMinorType()); 242 | // valueVector.setInitialCapacity(this.entries); 243 | // valueVector.allocateNew(); 244 | // valueVector.clear(); 245 | // } 246 | } 247 | 248 | public static void main(String[] args) { 249 | ArrowWrite ex = new ArrowWrite(); 250 | try { 251 | System.out.println("Number of arguments " + args.length); 252 | if(args.length == 2){ 253 | ex.useNullValues = true; 254 | ex.setupWrite("./example.arrow", true); 255 | } else if(args.length == 1){ 256 | ex.setupWrite("./example.arrow", true); 257 | } else{ 258 | ex.setupWrite("./example.arrow", false); 259 | } 260 | ex.writeData(); 261 | } catch (Exception e) { 262 | e.printStackTrace(); 263 | } 264 | System.out.println("null entries " + ex.nullEntries); 265 | } 266 | } -------------------------------------------------------------------------------- /src/main/java/com/github/animeshtrivedi/arrowexample/Utils.java: -------------------------------------------------------------------------------- 1 | package com.github.animeshtrivedi.arrowexample; 2 | 3 | import java.io.File; 4 | 5 | /** 6 | * Created by atr on 20.12.17. 7 | */ 8 | public class Utils { 9 | static File validateFile(String fileName, boolean shouldExist) { 10 | if (fileName == null) { 11 | throw new IllegalArgumentException("missing file parameter"); 12 | } 13 | File f = new File(fileName); 14 | if (shouldExist && (!f.exists() || f.isDirectory())) { 15 | throw new IllegalArgumentException(fileName + " file not found: " + f.getAbsolutePath()); 16 | } 17 | if (!shouldExist && f.exists()) { 18 | throw new IllegalArgumentException(fileName + " file already exists: " + f.getAbsolutePath()); 19 | } 20 | return f; 21 | } 22 | } 23 | --------------------------------------------------------------------------------