├── .travis.yml ├── LICENSE ├── README.md ├── build.gradle ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle └── src ├── main └── scala │ └── com │ └── landoop │ └── avro │ ├── DataBlock.java │ ├── FastDataFileWriter.scala │ ├── FastDataFileWriterBuilder.scala │ ├── codec │ ├── BZip2Codec.java │ ├── CodecFactory.java │ ├── CodecFactory.scala │ ├── DeflateCodec.java │ ├── NullCodec.java │ ├── SnappyCodec.java │ └── XZCodec.java │ └── concurrent │ ├── ExecutorExtension.scala │ └── FutureAwaitWithFailFastFn.scala └── test ├── resources └── log4j.properties └── scala └── com └── landoop └── avro ├── AvroFileWriter.scala ├── FastDataFileWriterTest.scala ├── FastWriteProgram.scala ├── StandardWriteProgram.scala ├── StockQuote.scala └── Timed.scala /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.8 4 | 5 | jdk: 6 | - oraclejdk8 7 | 8 | # sudo: true 9 | 10 | # Enable if you want to use gradlew 11 | before_install: 12 | - chmod +x gradlew 13 | 14 | # If you omit install, travis will always run gradle assemble 15 | install: echo "skip 'gradle assembly'" 16 | 17 | script: 18 | - ./gradlew clean build 19 | 20 | cache: 21 | directories: 22 | - $HOME/.gradle/caches/ 23 | - $HOME/.gradle/wrapper/ 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/Landoop/fast-avro-write.svg?branch=master)](https://travis-ci.org/Landoop/fast-avro-write) 2 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.landoop/fast-avro-write/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.landoop/fast-avro-write) 3 | [![GitHub license](https://img.shields.io/github/license/Landoop/fast-avro-write.svg)]() 4 | 5 | # fast-avro-write 6 | A small library allowing you to parallelize the write to an avro file 7 | thus achieving much better throughput 8 | 9 | 10 | How to use it: 11 | ```scala 12 | val datumWriter = new GenericDatumWriter[GenericRecord](schema) 13 | val builder = FastDataFileWriterBuilder(datumWriter, out, schema) 14 | .withCodec(CodecFactory.snappyCodec()) 15 | .withFlushOnEveryBlock(false) 16 | .withParallelization(parallelization) 17 | 18 | builder.encoderFactory.configureBufferSize(4 * 1048576) 19 | builder.encoderFactory.configureBlockSize(4 * 1048576) 20 | 21 | val fileWriter = builder.build() 22 | fileWriter.write(records) 23 | ``` 24 | This will write all the records to the file. If the records count passes a threshold it will parallelize the write. 25 | You can set the threshold as well; the write method takes a default parameter threshold. 26 | Simple! 27 | 28 | ## Blog article 29 | 30 | http://www.landoop.com/blog/2017/05/fast-avro-write/ 31 | 32 | ## Release History 33 | 34 | 0.2 - [2017-09-18] Upgrade to Avro 1.8.2 35 | 36 | 0.1 - [2017-04-02] Initial release 37 | 38 | ## Performance 39 | 40 | Run on 8GB, i7-4650U, SSD 41 | Here is the class from which the GenericRecords are created 42 | 43 | ```scala 44 | case class StockQuote(symbol: String, 45 | timestamp: Long, 46 | ask: Double, 47 | askSize: Int, 48 | bid: Double, 49 | bidSize: Int, 50 | dayHigh: Double, 51 | dayLow: Double, 52 | lastTradeSize: Int, 53 | lastTradeTime: Long, 54 | open: Double, 55 | previousClose: Double, 56 | price: Double, 57 | priceAvg50: Double, 58 | priceAvg200: Double, 59 | volume: Long, 60 | yearHigh: Double, 61 | yearLow: Double, 62 | f1:String="value", 63 | f2:String="value", 64 | f3:String="value", 65 | f4:String="value", 66 | f5:String="value", 67 | f6:String="value", 68 | f7:String="value", 69 | f8:String="value", 70 | f9:String="value", 71 | f10:String="value", 72 | f11:String="value", 73 | f12:String="value", 74 | f13:String="value", 75 | f14:String="value", 76 | f15:String="value", 77 | f16:String="value", 78 | f17:String="value", 79 | f18:String="value", 80 | f19:String="value", 81 | f20:String="value", 82 | f21:String="value", 83 | f22:String="value", 84 | f23:String="value", 85 | f24:String="value", 86 | f25:String="value", 87 | f26:String="value", 88 | f27:String="value", 89 | f28:String="value", 90 | f29:String="value", 91 | f30:String="value", 92 | f31:String="value", 93 | f32:String="value", 94 | f33:String="value", 95 | f34:String="value", 96 | f35:String="value", 97 | f36:String="value", 98 | f37:String="value", 99 | f38:String="value", 100 | f39:String="value", 101 | f40:String="value", 102 | f41:String="value", 103 | f42:String="value", 104 | f43:String="value", 105 | f44:String="value", 106 | f45:String="value", 107 | f46:String="value", 108 | f47:String="value", 109 | f48:String="value", 110 | f49:String="value", 111 | f50:String="value", 112 | f51:String="value", 113 | 
f52:String="value", 114 | f53:String="value", 115 | f54:String="value", 116 | f55:String="value", 117 | f56:String="value", 118 | f57:String="value", 119 | f58:String="value", 120 | f59:String="value", 121 | f60:String="value" 122 | ) 123 | ``` 124 | 125 | For each record count, 10 runs have been made sequentially and the min and max values have been retained. All the values are in milliseconds. 126 | For Fast writes, different parallelization factors have been used - see p in the header. 127 | 128 | |Record Count| Standard Min| Standard Max|Fast Min (p=8)|Fast Max (p=8)|Fast Min (p=4)|Fast Max (p=4)|Fast Min (p=6)|Fast Max (p=6)| 129 | |------------|-------------|-------------|--------------|--------------|--------------|--------------|--------------|--------------| 130 | |100K |490 |530 |286 |365 |306 |562 |284 |316 | 131 | |200K |981 |1097 |570 |692 |545 |783 |586 |777 | 132 | |500K |2534 |2755 |1443 |1575 |1313 |1607 |1365 |1402 | 133 | |1M |5079 |5322 |2853 |2948 |2571 |2820 |2816 |2984 | 134 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | jcenter() 4 | maven { 5 | url 'https://plugins.gradle.org/m2/' 6 | } 7 | } 8 | dependencies { 9 | classpath 'com.github.maiflai:gradle-scalatest:0.14' 10 | classpath 'io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.5.3' 11 | classpath 'net.researchgate:gradle-release:2.3.4' 12 | } 13 | } 14 | 15 | apply plugin: 'signing' 16 | apply plugin: 'io.codearte.nexus-staging' 17 | apply plugin: 'net.researchgate.release' 18 | 19 | 20 | allprojects { 21 | group = 'com.landoop' 22 | version = version 23 | description = "Small utility lib to speed up Avro write" 24 | 25 | apply plugin: 'scala' 26 | apply plugin: 'maven' 27 | apply plugin: 'com.github.maiflai.scalatest' 28 | sourceCompatibility = 1.8 29 | targetCompatibility = 1.8 30 | 31 | ext { 32 | scalaMajorVersion = '2.11' 33 | scala = '2.11.8' 34 | scalaCheck = '1.11.1' 35 | scalaTest = '2.2.6' 36 | junitVersion = '4.12' 37 | avroVersion = '1.8.2' 38 | avro4sVersion = "1.8.0" 39 | scalaLoggingVersion = '3.5.0' 40 | } 41 | 42 | repositories { 43 | mavenLocal() 44 | mavenCentral() 45 | maven { url "http://repo.typesafe.com/typesafe/releases/" } 46 | } 47 | 48 | configurations { 49 | provided 50 | compile.extendsFrom provided 51 | 52 | } 53 | 54 | dependencies { 55 | compile "org.scala-lang:scala-library:$scala" 56 | compile "org.apache.avro:avro:$avroVersion" 57 | 58 | testCompile "com.sksamuel.avro4s:avro4s-core_$scalaMajorVersion:${avro4sVersion}" 59 | testCompile "org.scalacheck:scalacheck_$scalaMajorVersion:$scalaCheck" 60 | testCompile "org.scalatest:scalatest_$scalaMajorVersion:$scalaTest" 61 | testCompile "com.typesafe.scala-logging:scala-logging_$scalaMajorVersion:$scalaLoggingVersion" 62 | testRuntime 'org.pegdown:pegdown:1.1.0' 63 | } 64 | 65 | test { 66 | maxParallelForks = 1 67 | minHeapSize '256m' 68 | maxHeapSize '2048m' 69 | systemProperty 'keystore', projectDir.canonicalPath + "/src/test/resources/stc_keystore.jks" 70 | systemProperty 'truststore', projectDir.canonicalPath + "/src/test/resources/stc_truststore.jks" 71 | } 72 | 73 | task testJar(type: Jar, dependsOn: testClasses) { 74 | baseName = "test-${project.archivesBaseName}" 75 | from sourceSets.test.output 76 | } 77 | 78 | configurations { 79 | tests 80 | } 81 | 82 | task sourcesJar(type: Jar) { 83 | classifier = 'sources' 84 | from sourceSets.main.allSource 85 | }
86 | 87 | task javadocJar(type: Jar) { 88 | classifier = 'javadoc' 89 | from javadoc 90 | } 91 | 92 | task scaladocJar(type: Jar) { 93 | classifier = 'scaladoc' 94 | from '../LICENSE' 95 | from scaladoc 96 | } 97 | 98 | tasks.withType(Tar) { 99 | compression Compression.GZIP 100 | extension = 'tgz' 101 | } 102 | 103 | artifacts { 104 | archives javadocJar, scaladocJar, sourcesJar 105 | } 106 | 107 | task compile(dependsOn: 'compileScala') 108 | 109 | signing { 110 | required { gradle.taskGraph.hasTask("uploadArchives") } 111 | sign configurations.archives 112 | } 113 | 114 | // OSSRH publication 115 | if (project.hasProperty('release')) { 116 | uploadArchives { 117 | repositories { 118 | mavenDeployer { 119 | // POM signature 120 | beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) } 121 | // Target repository 122 | repository(url: "https://oss.sonatype.org/service/local/staging/deploy/maven2/") { 123 | authentication(userName: ossrhUsername, password: ossrhPassword) 124 | } 125 | pom.project { 126 | name project.name 127 | description project.description 128 | packaging 'jar' 129 | url 'https://github.com/datamountaineer/kafka-connect-common' 130 | 131 | scm { 132 | connection 'scm:git:https://github.com/datamountaineer/kafka-connect-common.git' 133 | developerConnection 'scm:git:git@github.com:datamountaineer/kafka-connect-common.git' 134 | url 'https://github.com/datamountaineer/kafka-connect-common.git' 135 | } 136 | 137 | licenses { 138 | license { 139 | name 'Apache License 2.0' 140 | url 'http://www.apache.org/licenses/LICENSE-2.0.html' 141 | distribution 'repo' 142 | } 143 | } 144 | 145 | developers { 146 | developer { 147 | id = 'stheppi' 148 | name = 'Stefan Bocutiu' 149 | email = 'stefan@datamountaineer.com' 150 | } 151 | developer { 152 | id = 'Antwnis' 153 | name = 'Antonios Chalkiopoulos' 154 | email = 'antonios@datamountaineer.com' 155 | } 156 | } 157 | } 158 | } 159 | } 160 | } 161 | 162 | nexusStaging { 163 | packageGroup = project.getGroup() 164 | username = ossrhUsername 165 | password = ossrhPassword 166 | } 167 | } 168 | } 169 | 170 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | version=0.2 2 | ossrhUsername=me 3 | ossrhPassword=you -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lensesio/fast-avro-write/dd54bad41f0323bbef52cc35cdbe890d26c037ed/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Mon Sep 18 14:42:46 BST 2017 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-3.4.1-all.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | 
############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn ( ) { 37 | echo "$*" 38 | } 39 | 40 | die ( ) { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save ( ) { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | 
if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'fast-avro-write' 2 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/DataBlock.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.landoop.avro; 17 | 18 | import org.apache.avro.file.Codec; 19 | import org.apache.avro.io.BinaryEncoder; 20 | 21 | import java.io.IOException; 22 | import java.nio.ByteBuffer; 23 | 24 | class DataBlock { 25 | private byte[] data; 26 | private long numEntries; 27 | private int blockSize; 28 | private int offset = 0; 29 | private boolean flushOnWrite = true; 30 | 31 | private DataBlock(long numEntries, int blockSize) { 32 | this.data = new byte[blockSize]; 33 | this.numEntries = numEntries; 34 | this.blockSize = blockSize; 35 | } 36 | 37 | DataBlock(ByteBuffer block, long numEntries) { 38 | this.data = block.array(); 39 | this.blockSize = block.remaining(); 40 | this.offset = block.arrayOffset() + block.position(); 41 | this.numEntries = numEntries; 42 | } 43 | 44 | byte[] getData() { 45 | return data; 46 | } 47 | 48 | long getNumEntries() { 49 | return numEntries; 50 | } 51 | 52 | int getBlockSize() { 53 | return blockSize; 54 | } 55 | 56 | boolean isFlushOnWrite() { 57 | return flushOnWrite; 58 | } 59 | 60 | void setFlushOnWrite(boolean flushOnWrite) { 61 | this.flushOnWrite = flushOnWrite; 62 | } 63 | 64 | ByteBuffer getAsByteBuffer() { 65 | return ByteBuffer.wrap(data, offset, blockSize); 66 | } 67 | 68 | void decompressUsing(Codec c) throws IOException { 69 | ByteBuffer result = c.decompress(getAsByteBuffer()); 70 | data = result.array(); 71 | blockSize = result.remaining(); 72 | } 73 | 74 | void compressUsing(Codec c) throws IOException { 75 | ByteBuffer result = c.compress(getAsByteBuffer()); 76 | data = result.array(); 77 | blockSize = result.remaining(); 78 | } 79 | 80 | void writeBlockTo(BinaryEncoder e, byte[] sync) throws IOException { 81 | e.writeLong(this.numEntries); 82 | e.writeLong(this.blockSize); 83 | e.writeFixed(this.data, offset, this.blockSize); 84 | e.writeFixed(sync); 85 | if (flushOnWrite) { 86 | e.flush(); 87 | } 88 | } 89 | 90 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/FastDataFileWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.landoop.avro 17 | 18 | import java.io._ 19 | import java.nio.ByteBuffer 20 | import java.security.{MessageDigest, NoSuchAlgorithmException} 21 | import java.util.UUID 22 | import java.util.concurrent.Executors 23 | 24 | import com.landoop.avro.FastDataFileWriter.NonCopyingByteArrayOutputStream 25 | import com.landoop.avro.codec.CodecFactory 26 | import com.landoop.avro.concurrent.FutureAwaitWithFailFastFn 27 | import org.apache.avro.file.{Codec, DataFileConstants, Syncable} 28 | import org.apache.avro.io.{BinaryEncoder, DatumWriter, EncoderFactory} 29 | import org.apache.avro.{AvroRuntimeException, Schema} 30 | 31 | import scala.concurrent.duration.{Duration, _} 32 | 33 | 34 | /** Stores in a file a sequence of data conforming to a schema. 
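 * (In this writer, blocks of records may be serialized by several worker threads in
 * parallel; appending a finished block to the underlying stream is synchronized, so
 * the output remains a standard Avro container file.)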
The schema is 35 | * stored in the file with the data. Each datum in a file is of the same 36 | * schema. Data is written with a {@link DatumWriter}. Data is grouped into 37 | * blocks. A synchronization marker is written between blocks, so that 38 | * files may be split. Blocks may be compressed. Extensible metadata is 39 | * stored at the end of the file. Files may be appended to. 40 | * 41 | * @see DataFileReader 42 | */ 43 | class FastDataFileWriter[D] private[avro](datumWriter: DatumWriter[D], 44 | out: OutputStream, 45 | schema: Schema, 46 | codecFactory: CodecFactory, 47 | val flushOnEveryBlock: Boolean, 48 | val syncInterval: Int, 49 | syncMarker: Array[Byte], 50 | val parallelization: Int, 51 | meta: Map[String, Array[Byte]], 52 | encoderFactory: EncoderFactory) extends Closeable with Flushable { 53 | require(datumWriter != null, "Invalid DatumWriter") 54 | require(out != null, "Invalid output stream") 55 | require(schema != null, "Invalid schema") 56 | require(codecFactory != null, "Invalid codecFactory") 57 | require(syncMarker != null && syncMarker.length == 16, "Invalid syncMarker") 58 | require(parallelization > 0, "Invalid parallelization") 59 | require(syncInterval > 32 && syncInterval < (1 << 30), "Invalid syncInterval value: " + syncInterval) 60 | 61 | private val vout: BinaryEncoder = encoderFactory.binaryEncoder(out, null) 62 | private var isOpen = true 63 | private val lock = new Object 64 | private val codecs = (1 to parallelization).map(_ => codecFactory.createInstance()).toArray 65 | 66 | datumWriter.setSchema(schema) 67 | vout.writeFixed(DataFileConstants.MAGIC) // write magic 68 | vout.writeMapStart() // write metadata 69 | vout.setItemCount(meta.size) 70 | meta.foreach { case (key, value) => 71 | vout.startItem() 72 | vout.writeString(key) 73 | vout.writeBytes(value) 74 | } 75 | vout.writeMapEnd() 76 | vout.writeFixed(syncMarker) // write initial sync 77 | vout.flush() //vout may be buffered, flush before writing to out 78 | 79 | 80 | private def assertOpen() = { 81 | if (!isOpen) throw new AvroRuntimeException("not open") 82 | } 83 | 84 | private def assertNotOpen() = { 85 | if (isOpen) throw new AvroRuntimeException("already open") 86 | } 87 | 88 | /** Write datum to the file. 
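 * If fewer than `threshold` records are supplied, the data is serialized on the
 * calling thread; otherwise it is split into `parallelization` roughly equal chunks,
 * each chunk is serialized by its own codec instance on a worker thread, and the
 * finished blocks are appended to the stream under a lock.
 *
 * @param data      the records to write
 * @param threshold record count above which the write is parallelized
 * @param duration  how long to wait for the parallel writes to complete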
89 | * 90 | * @see AppendWriteException 91 | */ 92 | @throws[IOException] 93 | def write(data: IndexedSeq[D], threshold: Int = 50000, duration: Duration = 1.hour): Unit = { 94 | assertOpen() 95 | if (data.length < threshold) { 96 | implicit val codec = codecs(0) 97 | writeData(data, 0, data.length, false) 98 | } else { 99 | val threadPool = Executors.newFixedThreadPool(parallelization) 100 | val chunk = data.length / parallelization 101 | import com.landoop.avro.concurrent.ExecutorExtension._ 102 | val futures = (0 until parallelization).map { i => 103 | implicit val codec = codecs(i) 104 | if (i == parallelization - 1) { 105 | threadPool.submit { 106 | writeData(data, i * chunk, data.length, true) 107 | } 108 | } else { 109 | threadPool.submit { 110 | writeData(data, i * chunk, (i + 1) * chunk, true) 111 | } 112 | } 113 | } 114 | FutureAwaitWithFailFastFn(threadPool, futures, duration) 115 | } 116 | } 117 | 118 | private def writeData(data: IndexedSeq[D], from: Int, to: Int, synchronize: Boolean)(implicit codec: Codec) = { 119 | implicit val buffer = new FastDataFileWriter.NonCopyingByteArrayOutputStream(Math.min((syncInterval * 1.25).toInt, Integer.MAX_VALUE / 2 - 1)) 120 | implicit val bufOut = encoderFactory.binaryEncoder(buffer, null) 121 | implicit val withSynchronization = synchronize 122 | var blockCount = 0 123 | for (i <- from until to) { 124 | datumWriter.write(data(i), bufOut) 125 | blockCount += 1 126 | if (writeIfBlockFull(blockCount, synchronize)) { 127 | blockCount = 0 128 | } 129 | } 130 | writeBlock(blockCount, synchronize) 131 | } 132 | 133 | private def bufferInUse(implicit 134 | bufOut: BinaryEncoder, 135 | buffer: NonCopyingByteArrayOutputStream) = buffer.size + bufOut.bytesBuffered 136 | 137 | private def writeIfBlockFull(blockCount: Int, synchronize: Boolean) 138 | (implicit codec: Codec, bufOut: BinaryEncoder, buffer: NonCopyingByteArrayOutputStream) = { 139 | if (bufferInUse >= syncInterval) 140 | writeBlock(blockCount, synchronize) 141 | else 142 | false 143 | } 144 | 145 | @throws[IOException] 146 | private def writeBlock(blockCount: Int, synchronize: Boolean) 147 | (implicit codec: Codec, bufOut: BinaryEncoder, buffer: NonCopyingByteArrayOutputStream) = { 148 | if (blockCount > 0) { 149 | bufOut.flush() 150 | val uncompressed = buffer.getByteArrayAsByteBuffer 151 | val block = new DataBlock(uncompressed, blockCount) 152 | block.setFlushOnWrite(flushOnEveryBlock) 153 | block.compressUsing(codec) 154 | if (synchronize) { 155 | lock.synchronized { 156 | block.writeBlockTo(vout, syncMarker) 157 | } 158 | } else { 159 | block.writeBlockTo(vout, syncMarker) 160 | } 161 | buffer.reset() 162 | true 163 | } else false 164 | } 165 | 166 | 167 | /** 168 | * Flushes the current state of the file. 169 | */ 170 | override def flush(): Unit = { 171 | vout.flush() 172 | } 173 | 174 | /** 175 | * If this writer was instantiated using a File or using an 176 | * {@linkplain Syncable} instance, this method flushes all buffers for this 177 | * writer to disk. In other cases, this method behaves exactly 178 | * like {@linkplain #flush()}. 179 | * 180 | * @throws IOException 181 | */ 182 | @throws[IOException] 183 | def fSync(): Unit = { 184 | flush() 185 | out match { 186 | case s: Syncable => s.sync() 187 | case _ => 188 | } 189 | } 190 | 191 | /** Flush and close the file. 
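 * Closing is idempotent; once closed, further `write` calls fail with an
 * `AvroRuntimeException("not open")`.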
*/ 192 | @throws[IOException] 193 | override def close(): Unit = { 194 | if (isOpen) { 195 | flush() 196 | out.close() 197 | isOpen = false 198 | } 199 | } 200 | } 201 | 202 | object FastDataFileWriter { 203 | private[avro] def generateSync = { 204 | try { 205 | val digester = MessageDigest.getInstance("MD5") 206 | val time = System.currentTimeMillis 207 | digester.update((UUID.randomUUID + "@" + time).getBytes) 208 | digester.digest 209 | } catch { 210 | case e: NoSuchAlgorithmException => throw new RuntimeException(e) 211 | } 212 | } 213 | 214 | def isReservedMeta(key: String): Boolean = key.startsWith("avro.") 215 | 216 | private class NonCopyingByteArrayOutputStream private[avro](val initialSize: Int) extends ByteArrayOutputStream(initialSize) { 217 | private[avro] def getByteArrayAsByteBuffer = ByteBuffer.wrap(buf, 0, count) 218 | } 219 | 220 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/FastDataFileWriterBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.landoop.avro 17 | 18 | import java.io.OutputStream 19 | 20 | import com.landoop.avro.codec.CodecFactory 21 | import org.apache.avro.file.DataFileConstants 22 | import org.apache.avro.io.{DatumWriter, EncoderFactory} 23 | import org.apache.avro.{AvroRuntimeException, Schema} 24 | 25 | /** 26 | * Created by stefan on 01/04/2017. 
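 *
 * An immutable builder for [[FastDataFileWriter]]; every `withX` method returns an
 * updated copy. A minimal usage sketch, mirroring the README example (`schema`, `out`
 * and `records` are assumed to be supplied by the caller):
 * {{{
 * val datumWriter = new GenericDatumWriter[GenericRecord](schema)
 * val fileWriter = FastDataFileWriterBuilder(datumWriter, out, schema)
 *   .withCodec(CodecFactory.snappyCodec())
 *   .withFlushOnEveryBlock(false)
 *   .withParallelization(4)
 *   .build()
 * fileWriter.write(records)
 * fileWriter.close()
 * }}}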
27 | */ 28 | case class FastDataFileWriterBuilder[D] private(datumout: DatumWriter[D], 29 | outputStream: OutputStream, 30 | schema: Schema, 31 | codecFactory: CodecFactory, 32 | flushOnEveryBlock: Boolean = true, 33 | syncInterval: Int = DataFileConstants.DEFAULT_SYNC_INTERVAL, 34 | sync: Array[Byte] = null, 35 | parallelization: Int = 4, 36 | metaMap: Map[String, Array[Byte]] = Map.empty[String, Array[Byte]], 37 | encoderFactory: EncoderFactory = new EncoderFactory) { 38 | require(datumout != null, "Invalid DatumWriter") 39 | require(outputStream != null, "Invalid output stream") 40 | require(schema != null, "Invalid schema") 41 | 42 | 43 | def this(datumWriter: DatumWriter[D], 44 | outputStream: OutputStream, 45 | schema: Schema) = { 46 | this(datumWriter, outputStream, schema, CodecFactory.nullCodec()) 47 | } 48 | 49 | /** 50 | * Creates a new instance of {@link FastDataFileWriter} 51 | * 52 | * @return 53 | */ 54 | def build(): FastDataFileWriter[D] = { 55 | val syncMarker = { 56 | if (sync == null) FastDataFileWriter.generateSync 57 | else sync 58 | } 59 | 60 | new FastDataFileWriter[D]( 61 | datumout, 62 | outputStream, 63 | schema, 64 | codecFactory, 65 | flushOnEveryBlock, 66 | syncInterval, 67 | syncMarker, 68 | parallelization, 69 | metaMap + 70 | (DataFileConstants.CODEC -> codecFactory.createInstance().getName.getBytes("UTF-8")) + 71 | (DataFileConstants.SCHEMA -> schema.toString().getBytes("UTF-8")), 72 | encoderFactory) 73 | } 74 | 75 | /** 76 | * Sets the codec to be used 77 | * 78 | * @param codecFactory - An instance of a codec factory 79 | * @return 80 | */ 81 | def withCodec(codecFactory: CodecFactory): FastDataFileWriterBuilder[D] = { 82 | require(codecFactory != null, "Invalid codecFactory") 83 | copy(codecFactory = codecFactory, metaMap = metaMap + (DataFileConstants.CODEC -> codecFactory.createInstance().getName.getBytes("UTF-8"))) 84 | } 85 | 86 | /** 87 | * Set whether this writer should flush the block to the stream every time 88 | * a sync marker is written. By default, the writer will flush the buffer 89 | * each time a sync marker is written (if the block size limit is reached 90 | * or the {@link FastDataFileWriter#sync()} is called. 91 | * 92 | * @param flag - If set to false, this writer will not flush 93 | * the block to the stream until { @linkplain 94 | * #flush()} is explicitly called. 
95 | */ 96 | def withFlushOnEveryBlock(flag: Boolean): FastDataFileWriterBuilder[D] = copy(flushOnEveryBlock = flag) 97 | 98 | /** 99 | * Adds metadata property 100 | * 101 | * @param key 102 | * @param value 103 | * @return the new instance of the builder 104 | */ 105 | def withMeta(key: String, value: Array[Byte]): FastDataFileWriterBuilder[D] = { 106 | if (FastDataFileWriter.isReservedMeta(key)) throw new AvroRuntimeException("Cannot set reserved meta key: " + key) 107 | require(key != null && key.trim.nonEmpty, "Invalid key") 108 | require(value != null && value.nonEmpty, "Invalid value") 109 | copy(metaMap = metaMap + (key -> value)) 110 | } 111 | 112 | 113 | /** 114 | * Adds metadata property 115 | * 116 | * @param key 117 | * @param value 118 | * @return the new instance of the builder 119 | */ 120 | def withMeta(key: String, value: String): FastDataFileWriterBuilder[D] = { 121 | require(value != null && value.trim.nonEmpty, "Invalid value") 122 | withMeta(key, value.getBytes("UTF-8")) 123 | } 124 | 125 | 126 | /** 127 | * Adds metadata property 128 | * 129 | * @param key 130 | * @param value 131 | * @return the new instance of the builder 132 | */ 133 | def withMeta(key: String, value: Long): FastDataFileWriterBuilder[D] = withMeta(key, java.lang.Long.toString(value)) 134 | 135 | /** 136 | * Set the synchronization interval for this file, in bytes. 137 | * Valid values range from 32 to `2^30` 138 | * Suggested values are between 2K and 2M 139 | * 140 | * The stream is flushed by default at the end of each synchronization 141 | * interval. 142 | * 143 | * If {@linkplain #setFlushOnEveryBlock(boolean)} is 144 | * called with param set to false, then the block may not be flushed to the 145 | * stream after the sync marker is written. In this case, 146 | * the {@linkplain #flush()} must be called to flush the stream. 147 | * 148 | * Invalid values throw IllegalArgumentException 149 | * 150 | * @param syncInterval 151 | * the approximate number of uncompressed bytes to write in each block 152 | * @return 153 | * this DataFileWriter 154 | */ 155 | def withSyncInterval(syncInterval: Int): FastDataFileWriterBuilder[D] = { 156 | require(syncInterval > 32 && syncInterval < (1 << 30), "Invalid syncInterval value: " + syncInterval) 157 | copy(syncInterval = syncInterval) 158 | } 159 | 160 | /** 161 | * How many worker threads will be used to serialize to Avro. 
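 * The value must be greater than 1 (the builder defaults to 4). Note that a call to
 * `write` still runs entirely on the calling thread when the record count is below
 * the write threshold.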
162 | * 163 | * @param parallelization - Number of threads to run when serializing to avro 164 | * @return 165 | */ 166 | def withParallelization(parallelization: Int): FastDataFileWriterBuilder[D] = { 167 | require(parallelization > 1, "Invalid parallelization") 168 | copy(parallelization = parallelization) 169 | } 170 | 171 | def withSync(sync: Array[Byte]): FastDataFileWriterBuilder[D] = { 172 | require(sync != null && sync.length == 16, "Invalid sync") 173 | copy(sync = sync) 174 | } 175 | } 176 | 177 | 178 | object FastDataFileWriterBuilder { 179 | def apply[D](datumWriter: DatumWriter[D], outputStream: OutputStream, schema: Schema) = { 180 | new FastDataFileWriterBuilder[D](datumWriter, outputStream, schema) 181 | } 182 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/BZip2Codec.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.file.Codec; 4 | import org.apache.avro.file.DataFileConstants; 5 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; 6 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; 7 | 8 | import java.io.ByteArrayInputStream; 9 | import java.io.ByteArrayOutputStream; 10 | import java.io.IOException; 11 | import java.nio.ByteBuffer; 12 | 13 | /** 14 | * Implements bzip2 compression and decompression. 15 | */ 16 | public class BZip2Codec extends Codec { 17 | 18 | public static final int DEFAULT_BUFFER_SIZE = 64 * 1024; 19 | private ByteArrayOutputStream outputBuffer; 20 | 21 | @Override 22 | public String getName() { 23 | return DataFileConstants.BZIP2_CODEC; 24 | } 25 | 26 | @Override 27 | public ByteBuffer compress(ByteBuffer uncompressedData) throws IOException { 28 | 29 | ByteArrayOutputStream baos = getOutputBuffer(uncompressedData.remaining()); 30 | BZip2CompressorOutputStream outputStream = new BZip2CompressorOutputStream(baos); 31 | 32 | try { 33 | outputStream.write(uncompressedData.array(), 34 | uncompressedData.position(), 35 | uncompressedData.remaining()); 36 | } finally { 37 | outputStream.close(); 38 | } 39 | 40 | ByteBuffer result = ByteBuffer.wrap(baos.toByteArray()); 41 | return result; 42 | } 43 | 44 | @Override 45 | public ByteBuffer decompress(ByteBuffer compressedData) throws IOException { 46 | ByteArrayInputStream bais = new ByteArrayInputStream(compressedData.array()); 47 | BZip2CompressorInputStream inputStream = new BZip2CompressorInputStream(bais); 48 | try { 49 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 50 | 51 | byte[] buffer = new byte[DEFAULT_BUFFER_SIZE]; 52 | 53 | int readCount = -1; 54 | 55 | while ((readCount = inputStream.read(buffer, compressedData.position(), buffer.length)) > 0) { 56 | baos.write(buffer, 0, readCount); 57 | } 58 | 59 | ByteBuffer result = ByteBuffer.wrap(baos.toByteArray()); 60 | return result; 61 | } finally { 62 | inputStream.close(); 63 | } 64 | } 65 | 66 | @Override 67 | public int hashCode() { 68 | return getName().hashCode(); 69 | } 70 | 71 | @Override 72 | public boolean equals(Object obj) { 73 | if (this == obj) 74 | return true; 75 | if (obj == null || obj.getClass() != getClass()) 76 | return false; 77 | return true; 78 | } 79 | 80 | //get and initialize the output buffer for use. 
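    // Note: the buffer is reused across calls, so a codec instance is not safe for
    // concurrent use; FastDataFileWriter creates one codec instance per worker thread.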
81 | private ByteArrayOutputStream getOutputBuffer(int suggestedLength) { 82 | if (null == outputBuffer) { 83 | outputBuffer = new ByteArrayOutputStream(suggestedLength); 84 | } 85 | outputBuffer.reset(); 86 | return outputBuffer; 87 | } 88 | 89 | 90 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/CodecFactory.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.AvroRuntimeException; 4 | import org.apache.avro.file.Codec; 5 | import org.tukaani.xz.LZMA2Options; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import java.util.zip.Deflater; 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/CodecFactory.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec 2 | 3 | import org.apache.avro.file.Codec 4 | 5 | sealed trait CodecFactory { 6 | def createInstance(): Codec 7 | } 8 | 9 | object CodecFactory { 10 | def nullCodec() = new CodecFactory { 11 | override def createInstance(): Codec = NullCodec.INSTANCE 12 | } 13 | 14 | def deflateCodec(compressionLevel: Int) = new CodecFactory { 15 | override def createInstance(): Codec = new DeflateCodec(compressionLevel) 16 | } 17 | 18 | def xzCodec(compressionLevel: Int) = new CodecFactory { 19 | override def createInstance(): Codec = new XZCodec(compressionLevel) 20 | } 21 | 22 | def bzip2Codec() = new CodecFactory { 23 | override def createInstance(): Codec = new BZip2Codec() 24 | } 25 | 26 | def snappyCodec() = new CodecFactory { 27 | override def createInstance(): Codec = new SnappyCodec() 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/DeflateCodec.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.file.Codec; 4 | import org.apache.avro.file.DataFileConstants; 5 | 6 | import java.io.ByteArrayOutputStream; 7 | import java.io.IOException; 8 | import java.io.OutputStream; 9 | import java.nio.ByteBuffer; 10 | import java.util.zip.Deflater; 11 | import java.util.zip.DeflaterOutputStream; 12 | import java.util.zip.Inflater; 13 | import java.util.zip.InflaterOutputStream; 14 | 15 | /** 16 | * Implements DEFLATE (RFC1951) compression and decompression. 17 | *

18 | * Note that there is a distinction between RFC1951 (deflate) 19 | * and RFC1950 (zlib). zlib adds an extra 2-byte header 20 | * at the front, and a 4-byte checksum at the end. The 21 | * code here, by passing "true" as the "nowrap" option to 22 | * {@link Inflater} and {@link Deflater}, is using 23 | * RFC1951. 24 | */ 25 | class DeflateCodec extends Codec { 26 | 27 | private ByteArrayOutputStream outputBuffer; 28 | private Deflater deflater; 29 | private Inflater inflater; 30 | //currently only do 'nowrap' -- RFC 1951, not zlib 31 | private boolean nowrap = true; 32 | private int compressionLevel; 33 | 34 | public DeflateCodec(int compressionLevel) { 35 | this.compressionLevel = compressionLevel; 36 | } 37 | 38 | @Override 39 | public String getName() { 40 | return DataFileConstants.DEFLATE_CODEC; 41 | } 42 | 43 | @Override 44 | public ByteBuffer compress(ByteBuffer data) throws IOException { 45 | ByteArrayOutputStream baos = getOutputBuffer(data.remaining()); 46 | DeflaterOutputStream ios = new DeflaterOutputStream(baos, getDeflater()); 47 | writeAndClose(data, ios); 48 | ByteBuffer result = ByteBuffer.wrap(baos.toByteArray()); 49 | return result; 50 | } 51 | 52 | @Override 53 | public ByteBuffer decompress(ByteBuffer data) throws IOException { 54 | ByteArrayOutputStream baos = getOutputBuffer(data.remaining()); 55 | InflaterOutputStream ios = new InflaterOutputStream(baos, getInflater()); 56 | writeAndClose(data, ios); 57 | ByteBuffer result = ByteBuffer.wrap(baos.toByteArray()); 58 | return result; 59 | } 60 | 61 | private void writeAndClose(ByteBuffer data, OutputStream to) throws IOException { 62 | byte[] input = data.array(); 63 | int offset = data.arrayOffset() + data.position(); 64 | int length = data.remaining(); 65 | try { 66 | to.write(input, offset, length); 67 | } finally { 68 | to.close(); 69 | } 70 | } 71 | 72 | // get and initialize the inflater for use. 73 | private Inflater getInflater() { 74 | if (null == inflater) { 75 | inflater = new Inflater(nowrap); 76 | } 77 | inflater.reset(); 78 | return inflater; 79 | } 80 | 81 | // get and initialize the deflater for use. 82 | private Deflater getDeflater() { 83 | if (null == deflater) { 84 | deflater = new Deflater(compressionLevel, nowrap); 85 | } 86 | deflater.reset(); 87 | return deflater; 88 | } 89 | 90 | // get and initialize the output buffer for use. 91 | private ByteArrayOutputStream getOutputBuffer(int suggestedLength) { 92 | if (null == outputBuffer) { 93 | outputBuffer = new ByteArrayOutputStream(suggestedLength); 94 | } 95 | outputBuffer.reset(); 96 | return outputBuffer; 97 | } 98 | 99 | @Override 100 | public int hashCode() { 101 | return nowrap ? 0 : 1; 102 | } 103 | 104 | @Override 105 | public boolean equals(Object obj) { 106 | if (this == obj) 107 | return true; 108 | if (obj == null || obj.getClass() != getClass()) 109 | return false; 110 | DeflateCodec other = (DeflateCodec) obj; 111 | return (this.nowrap == other.nowrap); 112 | } 113 | 114 | @Override 115 | public String toString() { 116 | return getName() + "-" + compressionLevel; 117 | } 118 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/NullCodec.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.landoop.avro.codec; 17 | 18 | import org.apache.avro.file.Codec; 19 | import org.apache.avro.file.DataFileConstants; 20 | 21 | import java.io.IOException; 22 | import java.nio.ByteBuffer; 23 | 24 | /** Implements "null" (pass through) codec. */ 25 | final class NullCodec extends Codec { 26 | 27 | public static final NullCodec INSTANCE = new NullCodec(); 28 | 29 | private NullCodec(){ 30 | 31 | } 32 | 33 | @Override 34 | public String getName() { 35 | return DataFileConstants.NULL_CODEC; 36 | } 37 | 38 | @Override 39 | public ByteBuffer compress(ByteBuffer buffer) throws IOException { 40 | return buffer; 41 | } 42 | 43 | @Override 44 | public ByteBuffer decompress(ByteBuffer data) throws IOException { 45 | return data; 46 | } 47 | 48 | @Override 49 | public boolean equals(Object other) { 50 | return this == other || (other != null && other.getClass() == getClass()); 51 | } 52 | 53 | @Override 54 | public int hashCode() { 55 | return 2; 56 | } 57 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/SnappyCodec.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.file.Codec; 4 | import org.apache.avro.file.DataFileConstants; 5 | import org.xerial.snappy.Snappy; 6 | 7 | import java.io.IOException; 8 | import java.nio.ByteBuffer; 9 | import java.util.zip.CRC32; 10 | 11 | /** 12 | * Implements Snappy compression and decompression. 
13 | */ 14 | class SnappyCodec extends Codec { 15 | private CRC32 crc32 = new CRC32(); 16 | 17 | @Override 18 | public String getName() { 19 | return DataFileConstants.SNAPPY_CODEC; 20 | } 21 | 22 | @Override 23 | public ByteBuffer compress(ByteBuffer in) throws IOException { 24 | ByteBuffer out = ByteBuffer.allocate(Snappy.maxCompressedLength(in.remaining()) + 4); 25 | int size = Snappy.compress(in.array(), in.position(), in.remaining(), 26 | out.array(), 0); 27 | crc32.reset(); 28 | crc32.update(in.array(), in.position(), in.remaining()); 29 | out.putInt(size, (int) crc32.getValue()); 30 | 31 | out.limit(size + 4); 32 | 33 | return out; 34 | } 35 | 36 | @Override 37 | public ByteBuffer decompress(ByteBuffer in) throws IOException { 38 | ByteBuffer out = ByteBuffer.allocate 39 | (Snappy.uncompressedLength(in.array(), in.position(), in.remaining() - 4)); 40 | int size = Snappy.uncompress(in.array(), in.position(), in.remaining() - 4, 41 | out.array(), 0); 42 | out.limit(size); 43 | 44 | crc32.reset(); 45 | crc32.update(out.array(), 0, size); 46 | if (in.getInt(in.limit() - 4) != (int) crc32.getValue()) 47 | throw new IOException("Checksum failure"); 48 | 49 | return out; 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return getName().hashCode(); 55 | } 56 | 57 | @Override 58 | public boolean equals(Object obj) { 59 | if (this == obj) 60 | return true; 61 | if (obj == null || obj.getClass() != getClass()) 62 | return false; 63 | return true; 64 | } 65 | 66 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/XZCodec.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.file.Codec; 4 | import org.apache.avro.file.DataFileConstants; 5 | import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; 6 | import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; 7 | import org.apache.commons.compress.utils.IOUtils; 8 | 9 | import java.io.*; 10 | import java.nio.ByteBuffer; 11 | 12 | /** 13 | * Implements xz compression and decompression. 
14 | */ 15 | public class XZCodec extends Codec { 16 | 17 | 18 | private ByteArrayOutputStream outputBuffer; 19 | private int compressionLevel; 20 | 21 | public XZCodec(int compressionLevel) { 22 | this.compressionLevel = compressionLevel; 23 | } 24 | 25 | @Override 26 | public String getName() { 27 | return DataFileConstants.XZ_CODEC; 28 | } 29 | 30 | @Override 31 | public ByteBuffer compress(ByteBuffer data) throws IOException { 32 | ByteArrayOutputStream baos = getOutputBuffer(data.remaining()); 33 | OutputStream ios = new XZCompressorOutputStream(baos, compressionLevel); 34 | writeAndClose(data, ios); 35 | return ByteBuffer.wrap(baos.toByteArray()); 36 | } 37 | 38 | @Override 39 | public ByteBuffer decompress(ByteBuffer data) throws IOException { 40 | ByteArrayOutputStream baos = getOutputBuffer(data.remaining()); 41 | InputStream bytesIn = new ByteArrayInputStream( 42 | data.array(), 43 | data.arrayOffset() + data.position(), 44 | data.remaining()); 45 | InputStream ios = new XZCompressorInputStream(bytesIn); 46 | try { 47 | IOUtils.copy(ios, baos); 48 | } finally { 49 | ios.close(); 50 | } 51 | return ByteBuffer.wrap(baos.toByteArray()); 52 | } 53 | 54 | private void writeAndClose(ByteBuffer data, OutputStream to) throws IOException { 55 | byte[] input = data.array(); 56 | int offset = data.arrayOffset() + data.position(); 57 | int length = data.remaining(); 58 | try { 59 | to.write(input, offset, length); 60 | } finally { 61 | to.close(); 62 | } 63 | } 64 | 65 | // get and initialize the output buffer for use. 66 | private ByteArrayOutputStream getOutputBuffer(int suggestedLength) { 67 | if (null == outputBuffer) { 68 | outputBuffer = new ByteArrayOutputStream(suggestedLength); 69 | } 70 | outputBuffer.reset(); 71 | return outputBuffer; 72 | } 73 | 74 | @Override 75 | public int hashCode() { 76 | return compressionLevel; 77 | } 78 | 79 | @Override 80 | public boolean equals(Object obj) { 81 | if (this == obj) 82 | return true; 83 | if (obj == null || obj.getClass() != getClass()) 84 | return false; 85 | XZCodec other = (XZCodec) obj; 86 | return (this.compressionLevel == other.compressionLevel); 87 | } 88 | 89 | @Override 90 | public String toString() { 91 | return getName() + "-" + compressionLevel; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/concurrent/ExecutorExtension.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.landoop.avro.concurrent 17 | 18 | import java.util.concurrent.Executor 19 | 20 | import scala.concurrent.{Future, Promise} 21 | 22 | object ExecutorExtension { 23 | 24 | implicit class RunnableWrapper(val executor: Executor) extends AnyVal { 25 | def submit[T](thunk: => T): Future[T] = { 26 | val promise = Promise[T]() 27 | executor.execute(new Runnable { 28 | override def run(): Unit = { 29 | try { 30 | val t = thunk 31 | promise.success(t) 32 | } catch { 33 | case t: Throwable => promise.failure(t) 34 | } 35 | } 36 | }) 37 | promise.future 38 | } 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/concurrent/FutureAwaitWithFailFastFn.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Datamountaineer. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.landoop.avro.concurrent 18 | 19 | import java.util.concurrent.{ExecutorService, TimeUnit} 20 | 21 | import scala.concurrent.ExecutionContext.Implicits.global 22 | import scala.concurrent.duration._ 23 | import scala.concurrent.{Await, Future, Promise} 24 | import scala.util.Failure 25 | 26 | object FutureAwaitWithFailFastFn { 27 | 28 | def apply(executorService: ExecutorService, futures: Seq[Future[Unit]], duration: Duration): Unit = { 29 | //make sure we ask the executor to shutdown to ensure the process exits 30 | executorService.shutdown() 31 | 32 | val promise = Promise[Boolean]() 33 | 34 | //stop on the first failure 35 | futures.foreach { f => 36 | f.onFailure { case t => 37 | if (promise.tryFailure(t)) { 38 | executorService.shutdownNow() 39 | } 40 | } 41 | } 42 | 43 | val fut = Future.sequence(futures) 44 | fut.foreach { case t => 45 | if (promise.trySuccess(true)) { 46 | val failed = executorService.shutdownNow() 47 | if (failed.size() > 0) { 48 | //do something?! 
49 | } 50 | } 51 | } 52 | 53 | Await.ready(promise.future, duration).value match { 54 | case Some(Failure(t)) => 55 | executorService.awaitTermination(1, TimeUnit.MINUTES) 56 | //throw the underlying error 57 | throw t 58 | 59 | case _ => 60 | executorService.awaitTermination(1, TimeUnit.MINUTES) 61 | } 62 | } 63 | 64 | def apply[T](executorService: ExecutorService, futures: Seq[Future[T]], duration: Duration = 1.hours): Seq[T] = { 65 | //make sure we ask the executor to shutdown to ensure the process exits 66 | executorService.shutdown() 67 | 68 | val promise = Promise[Boolean]() 69 | 70 | //stop on the first failure 71 | futures.foreach { f => 72 | f.onFailure { case t => 73 | if (promise.tryFailure(t)) { 74 | executorService.shutdownNow() 75 | } 76 | } 77 | } 78 | 79 | val fut = Future.sequence(futures) 80 | fut.foreach { case t => 81 | if (promise.trySuccess(true)) { 82 | val failed = executorService.shutdownNow() 83 | if (failed.size() > 0) { 84 | //do something?!logging 85 | } 86 | } 87 | } 88 | 89 | Await.ready(promise.future, duration).value match { 90 | case Some(Failure(t)) => 91 | executorService.awaitTermination(1, TimeUnit.MINUTES) 92 | //throw the underlying error 93 | throw t 94 | 95 | case _ => 96 | executorService.awaitTermination(1, TimeUnit.MINUTES) 97 | //return the result from each of the futures 98 | Await.result(Future.sequence(futures), 1.minute) 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Datamountaineer. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # suppress inspection "UnusedProperty" for whole file 18 | log4j.rootLogger=INFO,stdout 19 | 20 | #stdout 21 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 22 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.stdout.layout.conversionPattern=%d{ISO8601} %-5p [%t] [%c] [%M:%L] %m%n 24 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/AvroFileWriter.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.io.{BufferedOutputStream, File, FileOutputStream} 4 | 5 | import com.landoop.avro.codec.CodecFactory 6 | import org.apache.avro.Schema 7 | import org.apache.avro.file.DataFileWriter 8 | import org.apache.avro.generic.GenericRecord 9 | 10 | object AvroFileWriter { 11 | def fastWrite(file: File, 12 | count: Int, 13 | parallelization: Int, 14 | schema: Schema, 15 | records: IndexedSeq[GenericRecord]) = { 16 | val out = new BufferedOutputStream(new FileOutputStream(file), 4 * 1048576) 17 | 18 | import org.apache.avro.generic.GenericDatumWriter 19 | val datumWriter = new GenericDatumWriter[GenericRecord](schema) 20 | val builder = FastDataFileWriterBuilder(datumWriter, out, schema) 21 | .withCodec(CodecFactory.snappyCodec()) 22 | .withFlushOnEveryBlock(false) 23 | .withParallelization(parallelization) 24 | 25 | builder.encoderFactory.configureBufferSize(4 * 1048576) 26 | builder.encoderFactory.configureBlockSize(4 * 1048576) 27 | 28 | val fileWriter = builder.build() 29 | fileWriter.write(records) 30 | fileWriter.close() 31 | } 32 | 33 | def write(file: File, 34 | count: Int, 35 | schema: Schema, 36 | records: Seq[GenericRecord]) = { 37 | val out = new BufferedOutputStream(new FileOutputStream(file), 4 * 1048576) 38 | 39 | import org.apache.avro.generic.GenericDatumWriter 40 | val datumWriter = new GenericDatumWriter[GenericRecord](schema) 41 | val writer = new DataFileWriter(datumWriter) 42 | .setCodec(org.apache.avro.file.CodecFactory.snappyCodec()) 43 | .create(schema, out) 44 | 45 | writer.setFlushOnEveryBlock(false) 46 | 47 | records.foreach(writer.append) 48 | writer.close() 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/FastDataFileWriterTest.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.io.{BufferedOutputStream, File, FileOutputStream} 4 | import java.util.UUID 5 | 6 | import com.landoop.avro.codec.CodecFactory 7 | import com.sksamuel.avro4s.{RecordFormat, SchemaFor} 8 | import org.apache.avro.Schema 9 | import org.apache.avro.generic.{GenericDatumReader, GenericRecord} 10 | import org.scalatest.{Matchers, WordSpec} 11 | 12 | class FastDataFileWriterTest extends WordSpec with Matchers { 13 | "FastDataFileWriter" should { 14 | "write 50000 Stock Quotes" in { 15 | runTest(50000, 4) 16 | } 17 | 18 | "write 123341 Stock Quotes" in { 19 | runTest(123341, 4) 20 | } 21 | 22 | "write 1000000 Stock Quotes" in { 23 | runTest(1000000, 8) 24 | } 25 | 26 | } 27 | 28 | private def runTest(count: Int, parallelization: Int) = { 29 | val file = new File(UUID.randomUUID().toString + ".avro") 30 | file.deleteOnExit() 31 | try { 32 | 33 | val out = new BufferedOutputStream(new FileOutputStream(file), 4 * 1048576) 34 | val schema = SchemaFor[StockQuote]() 35 | val recordFormat = RecordFormat[StockQuote] 36 | val records = 
StockQuote.generate(count) 37 | import org.apache.avro.generic.GenericDatumWriter 38 | val datumWriter = new GenericDatumWriter[GenericRecord](schema) 39 | val builder = FastDataFileWriterBuilder(datumWriter, out, schema) 40 | .withCodec(CodecFactory.snappyCodec()) 41 | .withFlushOnEveryBlock(false) 42 | .withParallelization(parallelization) 43 | 44 | builder.encoderFactory.configureBufferSize(4 * 1048576) 45 | builder.encoderFactory.configureBlockSize(4 * 1048576) 46 | 47 | val fileWriter = builder.build() 48 | fileWriter.write(records) 49 | fileWriter.close() 50 | 51 | import org.apache.avro.file.{DataFileConstants, DataFileReader} 52 | val datumReader = new GenericDatumReader[GenericRecord]() 53 | val reader = new DataFileReader[GenericRecord](file, datumReader) 54 | 55 | val scheamText = new String(reader.getMeta(DataFileConstants.SCHEMA)) 56 | val actualSchema = new Schema.Parser().parse(scheamText) 57 | actualSchema shouldBe schema 58 | 59 | val codecMeta = reader.getMetaString(DataFileConstants.CODEC) 60 | codecMeta shouldBe CodecFactory.snappyCodec().createInstance().getName 61 | val iter = new Iterator[GenericRecord] { 62 | override def hasNext: Boolean = reader.hasNext 63 | 64 | override def next(): GenericRecord = reader.next() 65 | } 66 | val actualRecordsCount = iter.foldLeft(0) { case (total, r) => 67 | val quote = recordFormat.from(r) 68 | quote.symbol shouldBe StockQuote.SampleQuote.symbol 69 | 70 | total + 1 71 | } 72 | 73 | actualRecordsCount shouldBe count 74 | reader.close() 75 | } 76 | finally { 77 | if (file.exists()) file.delete() 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/FastWriteProgram.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.io.File 4 | import java.util.UUID 5 | 6 | import com.sksamuel.avro4s.{RecordFormat, SchemaFor} 7 | 8 | object FastWriteProgram extends App with Timed { 9 | 10 | val recordsCount = 1000000 11 | val schema = SchemaFor[StockQuote]() 12 | val recordFormat = RecordFormat[StockQuote] 13 | val records = StockQuote.generate(recordsCount) 14 | 15 | val runs = 10 16 | val files = (1 to runs + 1).map(_ => new File(UUID.randomUUID().toString + ".avro")) 17 | .toVector 18 | 19 | files.foreach(_.deleteOnExit()) 20 | AvroFileWriter.fastWrite(files.last, recordsCount, 8, schema, records) 21 | val stats = (1 to runs).map(files).map { f => 22 | withTime { 23 | AvroFileWriter.fastWrite(f, recordsCount, 8, schema, records) 24 | } 25 | } 26 | 27 | stats.zipWithIndex.foreach { case (d, i) => 28 | logger.info(s"Run number $i took ${d.toMillis} ms") 29 | println(s"Run number $i took ${d.toMillis} ms") 30 | } 31 | 32 | logger.info(s"Min run took ${stats.min.toMillis} ms") 33 | println(s"Min run took ${stats.min.toMillis} ms") 34 | logger.info(s"Max run took ${stats.max.toMillis} ms") 35 | println(s"Max run took ${stats.max.toMillis} ms") 36 | logger.info(s"Avg run took ${stats.map(_.toMillis).sum / runs} ms") 37 | println(s"Avg run took ${stats.map(_.toMillis).sum / runs} ms") 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/StandardWriteProgram.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.io.File 4 | import java.util.UUID 5 | 6 | import com.sksamuel.avro4s.{RecordFormat, SchemaFor} 7 | 8 | object 
StandardWriteProgram extends App with Timed { 9 | 10 | val recordsCount = 1000000 11 | val schema = SchemaFor[StockQuote]() 12 | val recordFormat = RecordFormat[StockQuote] 13 | val records = StockQuote.generate(recordsCount) 14 | 15 | val runs = 10 16 | val files = (1 to runs + 1).map(_ => new File(UUID.randomUUID().toString + ".avro")) 17 | .toVector 18 | 19 | files.foreach(_.deleteOnExit()) 20 | AvroFileWriter.write(files.last, recordsCount, schema, records) 21 | val stats = (1 to runs).map(files).map { f => 22 | withTime { 23 | AvroFileWriter.write(f, recordsCount, schema, records) 24 | } 25 | }.toVector 26 | 27 | stats.zipWithIndex.foreach { case (d, i) => 28 | logger.info(s"Run number $i took ${d.toMillis} ms") 29 | println(s"Run number $i took ${d.toMillis} ms") 30 | } 31 | 32 | logger.info(s"Min run took ${stats.min.toMillis} ms") 33 | println(s"Min run took ${stats.min.toMillis} ms") 34 | logger.info(s"Max run took ${stats.max.toMillis} ms") 35 | println(s"Max run took ${stats.max.toMillis} ms") 36 | logger.info(s"Avg run took ${stats.map(_.toMillis).sum / runs} ms") 37 | println(s"Avg run took ${stats.map(_.toMillis).sum / runs} ms") 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/StockQuote.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import com.sksamuel.avro4s.RecordFormat 4 | import org.apache.avro.generic.GenericRecord 5 | 6 | case class StockQuote(symbol: String, 7 | timestamp: Long, 8 | ask: Double, 9 | askSize: Int, 10 | bid: Double, 11 | bidSize: Int, 12 | dayHigh: Double, 13 | dayLow: Double, 14 | lastTradeSize: Int, 15 | lastTradeTime: Long, 16 | open: Double, 17 | previousClose: Double, 18 | price: Double, 19 | priceAvg50: Double, 20 | priceAvg200: Double, 21 | volume: Long, 22 | yearHigh: Double, 23 | yearLow: Double, 24 | f1:String="value", 25 | f2:String="value", 26 | f3:String="value", 27 | f4:String="value", 28 | f5:String="value", 29 | f6:String="value", 30 | f7:String="value", 31 | f8:String="value", 32 | f9:String="value", 33 | f10:String="value", 34 | f11:String="value", 35 | f12:String="value", 36 | f13:String="value", 37 | f14:String="value", 38 | f15:String="value", 39 | f16:String="value", 40 | f17:String="value", 41 | f18:String="value", 42 | f19:String="value", 43 | f20:String="value", 44 | f21:String="value", 45 | f22:String="value", 46 | f23:String="value", 47 | f24:String="value", 48 | f25:String="value", 49 | f26:String="value", 50 | f27:String="value", 51 | f28:String="value", 52 | f29:String="value", 53 | f30:String="value", 54 | f31:String="value", 55 | f32:String="value", 56 | f33:String="value", 57 | f34:String="value", 58 | f35:String="value", 59 | f36:String="value", 60 | f37:String="value", 61 | f38:String="value", 62 | f39:String="value", 63 | f40:String="value", 64 | f41:String="value", 65 | f42:String="value", 66 | f43:String="value", 67 | f44:String="value", 68 | f45:String="value", 69 | f46:String="value", 70 | f47:String="value", 71 | f48:String="value", 72 | f49:String="value", 73 | f50:String="value", 74 | f51:String="value", 75 | f52:String="value", 76 | f53:String="value", 77 | f54:String="value", 78 | f55:String="value", 79 | f56:String="value", 80 | f57:String="value", 81 | f58:String="value", 82 | f59:String="value", 83 | f60:String="value" 84 | ) 85 | 86 | 87 | object StockQuote { 88 | private implicit val format = RecordFormat[StockQuote] 89 | 90 | val SampleQuote = 
StockQuote("MSFT", 91 | System.currentTimeMillis(), 92 | 52.29, 93 | 1000, 94 | 52.21, 95 | 1259, 96 | 52.36, 97 | 51.01, 98 | 100, 99 | System.currentTimeMillis(), 100 | 51.73, 101 | 51.38, 102 | 52.30, 103 | 52.11, 104 | 52.01, 105 | 3000000, 106 | 56.85, 107 | 47.85) 108 | 109 | def generate(count: Int): Vector[GenericRecord] = { 110 | (1 to count) 111 | .foldLeft(Vector.empty[GenericRecord]) { case (col, _) => 112 | val quote = SampleQuote 113 | col :+ format.to(quote) 114 | } 115 | } 116 | } -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/Timed.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import com.typesafe.scalalogging.StrictLogging 6 | 7 | import scala.concurrent.duration._ 8 | 9 | trait Timed extends StrictLogging { 10 | def withTime[T](message: String)(thunk: => T): T = { 11 | val start = System.nanoTime() 12 | val r = thunk 13 | val end = System.nanoTime() 14 | val duration = Duration.create(end - start, TimeUnit.NANOSECONDS).toMillis 15 | logger.info(s"$message took $duration ms") 16 | r 17 | } 18 | 19 | def withTime(thunk: => Unit): Duration = { 20 | val start = System.nanoTime() 21 | val r = thunk 22 | val end = System.nanoTime() 23 | Duration.create(end - start, TimeUnit.NANOSECONDS) 24 | } 25 | } 26 | --------------------------------------------------------------------------------