├── .travis.yml ├── LICENSE ├── README.md ├── build.gradle ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle └── src ├── main └── scala │ └── com │ └── landoop │ └── avro │ ├── DataBlock.java │ ├── FastDataFileWriter.scala │ ├── FastDataFileWriterBuilder.scala │ ├── codec │ ├── BZip2Codec.java │ ├── CodecFactory.java │ ├── CodecFactory.scala │ ├── DeflateCodec.java │ ├── NullCodec.java │ ├── SnappyCodec.java │ └── XZCodec.java │ └── concurrent │ ├── ExecutorExtension.scala │ └── FutureAwaitWithFailFastFn.scala └── test ├── resources └── log4j.properties └── scala └── com └── landoop └── avro ├── AvroFileWriter.scala ├── FastDataFileWriterTest.scala ├── FastWriteProgram.scala ├── StandardWriteProgram.scala ├── StockQuote.scala └── Timed.scala /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - 2.11.8 4 | 5 | jdk: 6 | - oraclejdk8 7 | 8 | # sudo: true 9 | 10 | # Enable if you want to use gradlew 11 | before_install: 12 | - chmod +x gradlew 13 | 14 | # If you omit install, travis will always run gradle assemble 15 | install: echo "skip 'gradle assembly'" 16 | 17 | script: 18 | - ./gradlew clean build 19 | 20 | cache: 21 | directories: 22 | - $HOME/.gradle/caches/ 23 | - $HOME/.gradle/wrapper/ 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/Landoop/fast-avro-write.svg?branch=master)](https://travis-ci.org/Landoop/fast-avro-write) 2 | [![Maven Central](https://maven-badges.herokuapp.com/maven-central/com.landoop/fast-avro-write/badge.svg)](https://maven-badges.herokuapp.com/maven-central/com.landoop/fast-avro-write) 3 | [![GitHub license](https://img.shields.io/github/license/Landoop/fast-avro-write.svg)]() 4 | 5 | # fast-avro-write 6 | A small library allowing you to parallelize the write to an avro file 7 | thus achieving much better throughput 8 | 9 | 10 | How to use it: 11 | ```scala 12 | val datumWriter = new GenericDatumWriter[GenericRecord](schema) 13 | val builder = FastDataFileWriterBuilder(datumWriter, out, schema) 14 | .withCodec(CodecFactory.snappyCodec()) 15 | .withFlushOnEveryBlock(false) 16 | .withParallelization(parallelization) 17 | 18 | builder.encoderFactory.configureBufferSize(4 * 1048576) 19 | builder.encoderFactory.configureBlockSize(4 * 1048576) 20 | 21 | val fileWriter = builder.build() 22 | fileWriter.write(records) 23 | ``` 24 | This will write all the records to the file. If the records count passes a threshold it will parallelize the write. 25 | You can set the threshold as well; the write method takes a default parameter threshold. 26 | Simple! 27 | 28 | ## Blog article 29 | 30 | http://www.landoop.com/blog/2017/05/fast-avro-write/ 31 | 32 | ## Release History 33 | 34 | 0.2 - [2017-09-18] Upgrade to Avro 1.8.2 35 | 36 | 0.1 - [2017-04-02] Initial release 37 | 38 | ## Performance 39 | 40 | Run on 8GB, i7-4650U, SSD 41 | Here is the class from which the GenericRecords are created 42 | 43 | ```scala 44 | case class StockQuote(symbol: String, 45 | timestamp: Long, 46 | ask: Double, 47 | askSize: Int, 48 | bid: Double, 49 | bidSize: Int, 50 | dayHigh: Double, 51 | dayLow: Double, 52 | lastTradeSize: Int, 53 | lastTradeTime: Long, 54 | open: Double, 55 | previousClose: Double, 56 | price: Double, 57 | priceAvg50: Double, 58 | priceAvg200: Double, 59 | volume: Long, 60 | yearHigh: Double, 61 | yearLow: Double, 62 | f1:String="value", 63 | f2:String="value", 64 | f3:String="value", 65 | f4:String="value", 66 | f5:String="value", 67 | f6:String="value", 68 | f7:String="value", 69 | f8:String="value", 70 | f9:String="value", 71 | f10:String="value", 72 | f11:String="value", 73 | f12:String="value", 74 | f13:String="value", 75 | f14:String="value", 76 | f15:String="value", 77 | f16:String="value", 78 | f17:String="value", 79 | f18:String="value", 80 | f19:String="value", 81 | f20:String="value", 82 | f21:String="value", 83 | f22:String="value", 84 | f23:String="value", 85 | f24:String="value", 86 | f25:String="value", 87 | f26:String="value", 88 | f27:String="value", 89 | f28:String="value", 90 | f29:String="value", 91 | f30:String="value", 92 | f31:String="value", 93 | f32:String="value", 94 | f33:String="value", 95 | f34:String="value", 96 | f35:String="value", 97 | f36:String="value", 98 | f37:String="value", 99 | f38:String="value", 100 | f39:String="value", 101 | f40:String="value", 102 | f41:String="value", 103 | f42:String="value", 104 | f43:String="value", 105 | f44:String="value", 106 | f45:String="value", 107 | f46:String="value", 108 | f47:String="value", 109 | f48:String="value", 110 | f49:String="value", 111 | f50:String="value", 112 | f51:String="value", 113 | 
f52:String="value", 114 | f53:String="value", 115 | f54:String="value", 116 | f55:String="value", 117 | f56:String="value", 118 | f57:String="value", 119 | f58:String="value", 120 | f59:String="value", 121 | f60:String="value" 122 | ) 123 | ``` 124 | 125 | For each record count, 10 runs have been made sequentially and the min and max values have been retained. All the values are in milliseconds. 126 | For Fast writes, different parallelization factors have been used - see p in the header. 127 | 128 | |Record Count| Standard Min| Standard Max|Fast Min (p=8)|Fast Max (p=8)|Fast Min (p=4)|Fast Max (p=4)|Fast Min (p=6)|Fast Max (p=6)| 129 | |------------|-------------|-------------|--------------|--------------|--------------|--------------|--------------|--------------| 130 | |100K |490 |530 |286 |365 |306 |562 |284 |316 | 131 | |200K |981 |1097 |570 |692 |545 |783 |586 |777 | 132 | |500K |2534 |2755 |1443 |1575 |1313 |1607 |1365 |1402 | 133 | |1M |5079 |5322 |2853 |2948 |2571 |2820 |2816 |2984 | 134 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | jcenter() 4 | maven { 5 | url 'https://plugins.gradle.org/m2/' 6 | } 7 | } 8 | dependencies { 9 | classpath 'com.github.maiflai:gradle-scalatest:0.14' 10 | classpath 'io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.5.3' 11 | classpath 'net.researchgate:gradle-release:2.3.4' 12 | } 13 | } 14 | 15 | apply plugin: 'signing' 16 | apply plugin: 'io.codearte.nexus-staging' 17 | apply plugin: 'net.researchgate.release' 18 | 19 | 20 | allprojects { 21 | group = 'com.landoop' 22 | version = version 23 | description = "Small utility lib to speed up Avro write" 24 | 25 | apply plugin: 'scala' 26 | apply plugin: 'maven' 27 | apply plugin: 'com.github.maiflai.scalatest' 28 | sourceCompatibility = 1.8 29 | targetCompatibility = 1.8 30 | 31 | ext { 32 | scalaMajorVersion = '2.11' 33 | scala = '2.11.8' 34 | scalaCheck = '1.11.1' 35 | scalaTest = '2.2.6' 36 | junitVersion = '4.12' 37 | avroVersion = '1.8.2' 38 | avro4sVersion = "1.8.0" 39 | scalaLoggingVersion = '3.5.0' 40 | } 41 | 42 | repositories { 43 | mavenLocal() 44 | mavenCentral() 45 | maven { url "http://repo.typesafe.com/typesafe/releases/" } 46 | } 47 | 48 | configurations { 49 | provided 50 | compile.extendsFrom provided 51 | 52 | } 53 | 54 | dependencies { 55 | compile "org.scala-lang:scala-library:$scala" 56 | compile "org.apache.avro:avro:$avroVersion" 57 | 58 | testCompile "com.sksamuel.avro4s:avro4s-core_$scalaMajorVersion:${avro4sVersion}" 59 | testCompile "org.scalacheck:scalacheck_$scalaMajorVersion:$scalaCheck" 60 | testCompile "org.scalatest:scalatest_$scalaMajorVersion:$scalaTest" 61 | testCompile "com.typesafe.scala-logging:scala-logging_$scalaMajorVersion:$scalaLoggingVersion" 62 | testRuntime 'org.pegdown:pegdown:1.1.0' 63 | } 64 | 65 | test { 66 | maxParallelForks = 1 67 | minHeapSize '256m' 68 | maxHeapSize '2048m' 69 | systemProperty 'keystore', projectDir.canonicalPath + "/src/test/resources/stc_keystore.jks" 70 | systemProperty 'truststore', projectDir.canonicalPath + "/src/test/resources/stc_truststore.jks" 71 | } 72 | 73 | task testJar(type: Jar, dependsOn: testClasses) { 74 | baseName = "test-${project.archivesBaseName}" 75 | from sourceSets.test.output 76 | } 77 | 78 | configurations { 79 | tests 80 | } 81 | 82 | task sourcesJar(type: Jar) { 83 | classifier = 'sources' 84 | from sourceSets.main.allSource 85 | }
86 | 87 | task javadocJar(type: Jar) { 88 | classifier = 'javadoc' 89 | from javadoc 90 | } 91 | 92 | task scaladocJar(type: Jar) { 93 | classifier = 'scaladoc' 94 | from '../LICENSE' 95 | from scaladoc 96 | } 97 | 98 | tasks.withType(Tar) { 99 | compression Compression.GZIP 100 | extension = 'tgz' 101 | } 102 | 103 | artifacts { 104 | archives javadocJar, scaladocJar, sourcesJar 105 | } 106 | 107 | task compile(dependsOn: 'compileScala') 108 | 109 | signing { 110 | required { gradle.taskGraph.hasTask("uploadArchives") } 111 | sign configurations.archives 112 | } 113 | 114 | // OSSRH publication 115 | if (project.hasProperty('release')) { 116 | uploadArchives { 117 | repositories { 118 | mavenDeployer { 119 | // POM signature 120 | beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) } 121 | // Target repository 122 | repository(url: "https://oss.sonatype.org/service/local/staging/deploy/maven2/") { 123 | authentication(userName: ossrhUsername, password: ossrhPassword) 124 | } 125 | pom.project { 126 | name project.name 127 | description project.description 128 | packaging 'jar' 129 | url 'https://github.com/datamountaineer/kafka-connect-common' 130 | 131 | scm { 132 | connection 'scm:git:https://github.com/datamountaineer/kafka-connect-common.git' 133 | developerConnection 'scm:git:git@github.com:datamountaineer/kafka-connect-common.git' 134 | url 'https://github.com/datamountaineer/kafka-connect-common.git' 135 | } 136 | 137 | licenses { 138 | license { 139 | name 'Apache License 2.0' 140 | url 'http://www.apache.org/licenses/LICENSE-2.0.html' 141 | distribution 'repo' 142 | } 143 | } 144 | 145 | developers { 146 | developer { 147 | id = 'stheppi' 148 | name = 'Stefan Bocutiu' 149 | email = 'stefan@datamountaineer.com' 150 | } 151 | developer { 152 | id = 'Antwnis' 153 | name = 'Antonios Chalkiopoulos' 154 | email = 'antonios@datamountaineer.com' 155 | } 156 | } 157 | } 158 | } 159 | } 160 | } 161 | 162 | nexusStaging { 163 | packageGroup = project.getGroup() 164 | username = ossrhUsername 165 | password = ossrhPassword 166 | } 167 | } 168 | } 169 | 170 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | version=0.2 2 | ossrhUsername=me 3 | ossrhPassword=you -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lensesio/fast-avro-write/dd54bad41f0323bbef52cc35cdbe890d26c037ed/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Mon Sep 18 14:42:46 BST 2017 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-3.4.1-all.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | 
############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn ( ) { 37 | echo "$*" 38 | } 39 | 40 | die ( ) { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save ( ) { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | 
if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'fast-avro-write' 2 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/DataBlock.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.landoop.avro; 17 | 18 | import org.apache.avro.file.Codec; 19 | import org.apache.avro.io.BinaryEncoder; 20 | 21 | import java.io.IOException; 22 | import java.nio.ByteBuffer; 23 | 24 | class DataBlock { 25 | private byte[] data; 26 | private long numEntries; 27 | private int blockSize; 28 | private int offset = 0; 29 | private boolean flushOnWrite = true; 30 | 31 | private DataBlock(long numEntries, int blockSize) { 32 | this.data = new byte[blockSize]; 33 | this.numEntries = numEntries; 34 | this.blockSize = blockSize; 35 | } 36 | 37 | DataBlock(ByteBuffer block, long numEntries) { 38 | this.data = block.array(); 39 | this.blockSize = block.remaining(); 40 | this.offset = block.arrayOffset() + block.position(); 41 | this.numEntries = numEntries; 42 | } 43 | 44 | byte[] getData() { 45 | return data; 46 | } 47 | 48 | long getNumEntries() { 49 | return numEntries; 50 | } 51 | 52 | int getBlockSize() { 53 | return blockSize; 54 | } 55 | 56 | boolean isFlushOnWrite() { 57 | return flushOnWrite; 58 | } 59 | 60 | void setFlushOnWrite(boolean flushOnWrite) { 61 | this.flushOnWrite = flushOnWrite; 62 | } 63 | 64 | ByteBuffer getAsByteBuffer() { 65 | return ByteBuffer.wrap(data, offset, blockSize); 66 | } 67 | 68 | void decompressUsing(Codec c) throws IOException { 69 | ByteBuffer result = c.decompress(getAsByteBuffer()); 70 | data = result.array(); 71 | blockSize = result.remaining(); 72 | } 73 | 74 | void compressUsing(Codec c) throws IOException { 75 | ByteBuffer result = c.compress(getAsByteBuffer()); 76 | data = result.array(); 77 | blockSize = result.remaining(); 78 | } 79 | 80 | void writeBlockTo(BinaryEncoder e, byte[] sync) throws IOException { 81 | e.writeLong(this.numEntries); 82 | e.writeLong(this.blockSize); 83 | e.writeFixed(this.data, offset, this.blockSize); 84 | e.writeFixed(sync); 85 | if (flushOnWrite) { 86 | e.flush(); 87 | } 88 | } 89 | 90 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/FastDataFileWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.landoop.avro 17 | 18 | import java.io._ 19 | import java.nio.ByteBuffer 20 | import java.security.{MessageDigest, NoSuchAlgorithmException} 21 | import java.util.UUID 22 | import java.util.concurrent.Executors 23 | 24 | import com.landoop.avro.FastDataFileWriter.NonCopyingByteArrayOutputStream 25 | import com.landoop.avro.codec.CodecFactory 26 | import com.landoop.avro.concurrent.FutureAwaitWithFailFastFn 27 | import org.apache.avro.file.{Codec, DataFileConstants, Syncable} 28 | import org.apache.avro.io.{BinaryEncoder, DatumWriter, EncoderFactory} 29 | import org.apache.avro.{AvroRuntimeException, Schema} 30 | 31 | import scala.concurrent.duration.{Duration, _} 32 | 33 | 34 | /** Stores in a file a sequence of data conforming to a schema. 
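 * (In this writer, blocks of records may be serialized by several worker threads in
 * parallel; appending a finished block to the underlying stream is synchronized, so
 * the output remains a standard Avro container file.)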
The schema is 35 | * stored in the file with the data. Each datum in a file is of the same 36 | * schema. Data is written with a {@link DatumWriter}. Data is grouped into 37 | * blocks. A synchronization marker is written between blocks, so that 38 | * files may be split. Blocks may be compressed. Extensible metadata is 39 | * stored at the end of the file. Files may be appended to. 40 | * 41 | * @see DataFileReader 42 | */ 43 | class FastDataFileWriter[D] private[avro](datumWriter: DatumWriter[D], 44 | out: OutputStream, 45 | schema: Schema, 46 | codecFactory: CodecFactory, 47 | val flushOnEveryBlock: Boolean, 48 | val syncInterval: Int, 49 | syncMarker: Array[Byte], 50 | val parallelization: Int, 51 | meta: Map[String, Array[Byte]], 52 | encoderFactory: EncoderFactory) extends Closeable with Flushable { 53 | require(datumWriter != null, "Invalid DatumWriter") 54 | require(out != null, "Invalid output stream") 55 | require(schema != null, "Invalid schema") 56 | require(codecFactory != null, "Invalid codecFactory") 57 | require(syncMarker != null && syncMarker.length == 16, "Invalid syncMarker") 58 | require(parallelization > 0, "Invalid parallelization") 59 | require(syncInterval > 32 && syncInterval < (1 << 30), "Invalid syncInterval value: " + syncInterval) 60 | 61 | private val vout: BinaryEncoder = encoderFactory.binaryEncoder(out, null) 62 | private var isOpen = true 63 | private val lock = new Object 64 | private val codecs = (1 to parallelization).map(_ => codecFactory.createInstance()).toArray 65 | 66 | datumWriter.setSchema(schema) 67 | vout.writeFixed(DataFileConstants.MAGIC) // write magic 68 | vout.writeMapStart() // write metadata 69 | vout.setItemCount(meta.size) 70 | meta.foreach { case (key, value) => 71 | vout.startItem() 72 | vout.writeString(key) 73 | vout.writeBytes(value) 74 | } 75 | vout.writeMapEnd() 76 | vout.writeFixed(syncMarker) // write initial sync 77 | vout.flush() //vout may be buffered, flush before writing to out 78 | 79 | 80 | private def assertOpen() = { 81 | if (!isOpen) throw new AvroRuntimeException("not open") 82 | } 83 | 84 | private def assertNotOpen() = { 85 | if (isOpen) throw new AvroRuntimeException("already open") 86 | } 87 | 88 | /** Write datum to the file. 
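 * If fewer than `threshold` records are supplied, the data is serialized on the
 * calling thread; otherwise it is split into `parallelization` roughly equal chunks,
 * each chunk is serialized by its own codec instance on a worker thread, and the
 * finished blocks are appended to the stream under a lock.
 *
 * @param data      the records to write
 * @param threshold record count above which the write is parallelized
 * @param duration  how long to wait for the parallel writes to complete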
89 | * 90 | * @see AppendWriteException 91 | */ 92 | @throws[IOException] 93 | def write(data: IndexedSeq[D], threshold: Int = 50000, duration: Duration = 1.hour): Unit = { 94 | assertOpen() 95 | if (data.length < threshold) { 96 | implicit val codec = codecs(0) 97 | writeData(data, 0, data.length, false) 98 | } else { 99 | val threadPool = Executors.newFixedThreadPool(parallelization) 100 | val chunk = data.length / parallelization 101 | import com.landoop.avro.concurrent.ExecutorExtension._ 102 | val futures = (0 until parallelization).map { i => 103 | implicit val codec = codecs(i) 104 | if (i == parallelization - 1) { 105 | threadPool.submit { 106 | writeData(data, i * chunk, data.length, true) 107 | } 108 | } else { 109 | threadPool.submit { 110 | writeData(data, i * chunk, (i + 1) * chunk, true) 111 | } 112 | } 113 | } 114 | FutureAwaitWithFailFastFn(threadPool, futures, duration) 115 | } 116 | } 117 | 118 | private def writeData(data: IndexedSeq[D], from: Int, to: Int, synchronize: Boolean)(implicit codec: Codec) = { 119 | implicit val buffer = new FastDataFileWriter.NonCopyingByteArrayOutputStream(Math.min((syncInterval * 1.25).toInt, Integer.MAX_VALUE / 2 - 1)) 120 | implicit val bufOut = encoderFactory.binaryEncoder(buffer, null) 121 | implicit val withSynchronization = synchronize 122 | var blockCount = 0 123 | for (i <- from until to) { 124 | datumWriter.write(data(i), bufOut) 125 | blockCount += 1 126 | if (writeIfBlockFull(blockCount, synchronize)) { 127 | blockCount = 0 128 | } 129 | } 130 | writeBlock(blockCount, synchronize) 131 | } 132 | 133 | private def bufferInUse(implicit 134 | bufOut: BinaryEncoder, 135 | buffer: NonCopyingByteArrayOutputStream) = buffer.size + bufOut.bytesBuffered 136 | 137 | private def writeIfBlockFull(blockCount: Int, synchronize: Boolean) 138 | (implicit codec: Codec, bufOut: BinaryEncoder, buffer: NonCopyingByteArrayOutputStream) = { 139 | if (bufferInUse >= syncInterval) 140 | writeBlock(blockCount, synchronize) 141 | else 142 | false 143 | } 144 | 145 | @throws[IOException] 146 | private def writeBlock(blockCount: Int, synchronize: Boolean) 147 | (implicit codec: Codec, bufOut: BinaryEncoder, buffer: NonCopyingByteArrayOutputStream) = { 148 | if (blockCount > 0) { 149 | bufOut.flush() 150 | val uncompressed = buffer.getByteArrayAsByteBuffer 151 | val block = new DataBlock(uncompressed, blockCount) 152 | block.setFlushOnWrite(flushOnEveryBlock) 153 | block.compressUsing(codec) 154 | if (synchronize) { 155 | lock.synchronized { 156 | block.writeBlockTo(vout, syncMarker) 157 | } 158 | } else { 159 | block.writeBlockTo(vout, syncMarker) 160 | } 161 | buffer.reset() 162 | true 163 | } else false 164 | } 165 | 166 | 167 | /** 168 | * Flushes the current state of the file. 169 | */ 170 | override def flush(): Unit = { 171 | vout.flush() 172 | } 173 | 174 | /** 175 | * If this writer was instantiated using a File or using an 176 | * {@linkplain Syncable} instance, this method flushes all buffers for this 177 | * writer to disk. In other cases, this method behaves exactly 178 | * like {@linkplain #flush()}. 179 | * 180 | * @throws IOException 181 | */ 182 | @throws[IOException] 183 | def fSync(): Unit = { 184 | flush() 185 | out match { 186 | case s: Syncable => s.sync() 187 | case _ => 188 | } 189 | } 190 | 191 | /** Flush and close the file. 
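 * Closing is idempotent; once closed, further `write` calls fail with an
 * `AvroRuntimeException("not open")`.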
*/ 192 | @throws[IOException] 193 | override def close(): Unit = { 194 | if (isOpen) { 195 | flush() 196 | out.close() 197 | isOpen = false 198 | } 199 | } 200 | } 201 | 202 | object FastDataFileWriter { 203 | private[avro] def generateSync = { 204 | try { 205 | val digester = MessageDigest.getInstance("MD5") 206 | val time = System.currentTimeMillis 207 | digester.update((UUID.randomUUID + "@" + time).getBytes) 208 | digester.digest 209 | } catch { 210 | case e: NoSuchAlgorithmException => throw new RuntimeException(e) 211 | } 212 | } 213 | 214 | def isReservedMeta(key: String): Boolean = key.startsWith("avro.") 215 | 216 | private class NonCopyingByteArrayOutputStream private[avro](val initialSize: Int) extends ByteArrayOutputStream(initialSize) { 217 | private[avro] def getByteArrayAsByteBuffer = ByteBuffer.wrap(buf, 0, count) 218 | } 219 | 220 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/FastDataFileWriterBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.landoop.avro 17 | 18 | import java.io.OutputStream 19 | 20 | import com.landoop.avro.codec.CodecFactory 21 | import org.apache.avro.file.DataFileConstants 22 | import org.apache.avro.io.{DatumWriter, EncoderFactory} 23 | import org.apache.avro.{AvroRuntimeException, Schema} 24 | 25 | /** 26 | * Created by stefan on 01/04/2017. 
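 *
 * An immutable builder for [[FastDataFileWriter]]; every `withX` method returns an
 * updated copy. A minimal usage sketch, mirroring the README example (`schema`, `out`
 * and `records` are assumed to be supplied by the caller):
 * {{{
 * val datumWriter = new GenericDatumWriter[GenericRecord](schema)
 * val fileWriter = FastDataFileWriterBuilder(datumWriter, out, schema)
 *   .withCodec(CodecFactory.snappyCodec())
 *   .withFlushOnEveryBlock(false)
 *   .withParallelization(4)
 *   .build()
 * fileWriter.write(records)
 * fileWriter.close()
 * }}}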
27 | */ 28 | case class FastDataFileWriterBuilder[D] private(datumout: DatumWriter[D], 29 | outputStream: OutputStream, 30 | schema: Schema, 31 | codecFactory: CodecFactory, 32 | flushOnEveryBlock: Boolean = true, 33 | syncInterval: Int = DataFileConstants.DEFAULT_SYNC_INTERVAL, 34 | sync: Array[Byte] = null, 35 | parallelization: Int = 4, 36 | metaMap: Map[String, Array[Byte]] = Map.empty[String, Array[Byte]], 37 | encoderFactory: EncoderFactory = new EncoderFactory) { 38 | require(datumout != null, "Invalid DatumWriter") 39 | require(outputStream != null, "Invalid output stream") 40 | require(schema != null, "Invalid schema") 41 | 42 | 43 | def this(datumWriter: DatumWriter[D], 44 | outputStream: OutputStream, 45 | schema: Schema) = { 46 | this(datumWriter, outputStream, schema, CodecFactory.nullCodec()) 47 | } 48 | 49 | /** 50 | * Creates a new instance of {@link FastDataFileWriter} 51 | * 52 | * @return 53 | */ 54 | def build(): FastDataFileWriter[D] = { 55 | val syncMarker = { 56 | if (sync == null) FastDataFileWriter.generateSync 57 | else sync 58 | } 59 | 60 | new FastDataFileWriter[D]( 61 | datumout, 62 | outputStream, 63 | schema, 64 | codecFactory, 65 | flushOnEveryBlock, 66 | syncInterval, 67 | syncMarker, 68 | parallelization, 69 | metaMap + 70 | (DataFileConstants.CODEC -> codecFactory.createInstance().getName.getBytes("UTF-8")) + 71 | (DataFileConstants.SCHEMA -> schema.toString().getBytes("UTF-8")), 72 | encoderFactory) 73 | } 74 | 75 | /** 76 | * Sets the codec to be used 77 | * 78 | * @param codecFactory - An instance of a codec factory 79 | * @return 80 | */ 81 | def withCodec(codecFactory: CodecFactory): FastDataFileWriterBuilder[D] = { 82 | require(codecFactory != null, "Invalid codecFactory") 83 | copy(codecFactory = codecFactory, metaMap = metaMap + (DataFileConstants.CODEC -> codecFactory.createInstance().getName.getBytes("UTF-8"))) 84 | } 85 | 86 | /** 87 | * Set whether this writer should flush the block to the stream every time 88 | * a sync marker is written. By default, the writer will flush the buffer 89 | * each time a sync marker is written (if the block size limit is reached 90 | * or the {@link FastDataFileWriter#sync()} is called. 91 | * 92 | * @param flag - If set to false, this writer will not flush 93 | * the block to the stream until { @linkplain 94 | * #flush()} is explicitly called. 
95 | */ 96 | def withFlushOnEveryBlock(flag: Boolean): FastDataFileWriterBuilder[D] = copy(flushOnEveryBlock = flag) 97 | 98 | /** 99 | * Adds metadata property 100 | * 101 | * @param key 102 | * @param value 103 | * @return the new instance of the builder 104 | */ 105 | def withMeta(key: String, value: Array[Byte]): FastDataFileWriterBuilder[D] = { 106 | if (FastDataFileWriter.isReservedMeta(key)) throw new AvroRuntimeException("Cannot set reserved meta key: " + key) 107 | require(key != null && key.trim.nonEmpty, "Invalid key") 108 | require(value != null && value.nonEmpty, "Invalid value") 109 | copy(metaMap = metaMap + (key -> value)) 110 | } 111 | 112 | 113 | /** 114 | * Adds metadata property 115 | * 116 | * @param key 117 | * @param value 118 | * @return the new instance of the builder 119 | */ 120 | def withMeta(key: String, value: String): FastDataFileWriterBuilder[D] = { 121 | require(value != null && value.trim.nonEmpty, "Invalid value") 122 | withMeta(key, value.getBytes("UTF-8")) 123 | } 124 | 125 | 126 | /** 127 | * Adds metadata property 128 | * 129 | * @param key 130 | * @param value 131 | * @return the new instance of the builder 132 | */ 133 | def withMeta(key: String, value: Long): FastDataFileWriterBuilder[D] = withMeta(key, java.lang.Long.toString(value)) 134 | 135 | /** 136 | * Set the synchronization interval for this file, in bytes. 137 | * Valid values range from 32 to `2^30` 138 | * Suggested values are between 2K and 2M 139 | * 140 | * The stream is flushed by default at the end of each synchronization 141 | * interval. 142 | * 143 | * If {@linkplain #setFlushOnEveryBlock(boolean)} is 144 | * called with param set to false, then the block may not be flushed to the 145 | * stream after the sync marker is written. In this case, 146 | * the {@linkplain #flush()} must be called to flush the stream. 147 | * 148 | * Invalid values throw IllegalArgumentException 149 | * 150 | * @param syncInterval 151 | * the approximate number of uncompressed bytes to write in each block 152 | * @return 153 | * this DataFileWriter 154 | */ 155 | def withSyncInterval(syncInterval: Int): FastDataFileWriterBuilder[D] = { 156 | require(syncInterval > 32 && syncInterval < (1 << 30), "Invalid syncInterval value: " + syncInterval) 157 | copy(syncInterval = syncInterval) 158 | } 159 | 160 | /** 161 | * How many worker threads will be used to serialize to Avro. 
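 * The value must be greater than 1 (the builder defaults to 4). Note that a call to
 * `write` still runs entirely on the calling thread when the record count is below
 * the write threshold.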
162 | * 163 | * @param parallelization - Number of threads to run when serializing to avro 164 | * @return 165 | */ 166 | def withParallelization(parallelization: Int): FastDataFileWriterBuilder[D] = { 167 | require(parallelization > 1, "Invalid parallelization") 168 | copy(parallelization = parallelization) 169 | } 170 | 171 | def withSync(sync: Array[Byte]): FastDataFileWriterBuilder[D] = { 172 | require(sync != null && sync.length == 16, "Invalid sync") 173 | copy(sync = sync) 174 | } 175 | } 176 | 177 | 178 | object FastDataFileWriterBuilder { 179 | def apply[D](datumWriter: DatumWriter[D], outputStream: OutputStream, schema: Schema) = { 180 | new FastDataFileWriterBuilder[D](datumWriter, outputStream, schema) 181 | } 182 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/BZip2Codec.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.file.Codec; 4 | import org.apache.avro.file.DataFileConstants; 5 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; 6 | import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; 7 | 8 | import java.io.ByteArrayInputStream; 9 | import java.io.ByteArrayOutputStream; 10 | import java.io.IOException; 11 | import java.nio.ByteBuffer; 12 | 13 | /** 14 | * Implements bzip2 compression and decompression. 15 | */ 16 | public class BZip2Codec extends Codec { 17 | 18 | public static final int DEFAULT_BUFFER_SIZE = 64 * 1024; 19 | private ByteArrayOutputStream outputBuffer; 20 | 21 | @Override 22 | public String getName() { 23 | return DataFileConstants.BZIP2_CODEC; 24 | } 25 | 26 | @Override 27 | public ByteBuffer compress(ByteBuffer uncompressedData) throws IOException { 28 | 29 | ByteArrayOutputStream baos = getOutputBuffer(uncompressedData.remaining()); 30 | BZip2CompressorOutputStream outputStream = new BZip2CompressorOutputStream(baos); 31 | 32 | try { 33 | outputStream.write(uncompressedData.array(), 34 | uncompressedData.position(), 35 | uncompressedData.remaining()); 36 | } finally { 37 | outputStream.close(); 38 | } 39 | 40 | ByteBuffer result = ByteBuffer.wrap(baos.toByteArray()); 41 | return result; 42 | } 43 | 44 | @Override 45 | public ByteBuffer decompress(ByteBuffer compressedData) throws IOException { 46 | ByteArrayInputStream bais = new ByteArrayInputStream(compressedData.array()); 47 | BZip2CompressorInputStream inputStream = new BZip2CompressorInputStream(bais); 48 | try { 49 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); 50 | 51 | byte[] buffer = new byte[DEFAULT_BUFFER_SIZE]; 52 | 53 | int readCount = -1; 54 | 55 | while ((readCount = inputStream.read(buffer, compressedData.position(), buffer.length)) > 0) { 56 | baos.write(buffer, 0, readCount); 57 | } 58 | 59 | ByteBuffer result = ByteBuffer.wrap(baos.toByteArray()); 60 | return result; 61 | } finally { 62 | inputStream.close(); 63 | } 64 | } 65 | 66 | @Override 67 | public int hashCode() { 68 | return getName().hashCode(); 69 | } 70 | 71 | @Override 72 | public boolean equals(Object obj) { 73 | if (this == obj) 74 | return true; 75 | if (obj == null || obj.getClass() != getClass()) 76 | return false; 77 | return true; 78 | } 79 | 80 | //get and initialize the output buffer for use. 
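    // Note: the buffer is reused across calls, so a codec instance is not safe for
    // concurrent use; FastDataFileWriter creates one codec instance per worker thread.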
81 | private ByteArrayOutputStream getOutputBuffer(int suggestedLength) { 82 | if (null == outputBuffer) { 83 | outputBuffer = new ByteArrayOutputStream(suggestedLength); 84 | } 85 | outputBuffer.reset(); 86 | return outputBuffer; 87 | } 88 | 89 | 90 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/CodecFactory.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.AvroRuntimeException; 4 | import org.apache.avro.file.Codec; 5 | import org.tukaani.xz.LZMA2Options; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import java.util.zip.Deflater; 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/CodecFactory.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec 2 | 3 | import org.apache.avro.file.Codec 4 | 5 | sealed trait CodecFactory { 6 | def createInstance(): Codec 7 | } 8 | 9 | object CodecFactory { 10 | def nullCodec() = new CodecFactory { 11 | override def createInstance(): Codec = NullCodec.INSTANCE 12 | } 13 | 14 | def deflateCodec(compressionLevel: Int) = new CodecFactory { 15 | override def createInstance(): Codec = new DeflateCodec(compressionLevel) 16 | } 17 | 18 | def xzCodec(compressionLevel: Int) = new CodecFactory { 19 | override def createInstance(): Codec = new XZCodec(compressionLevel) 20 | } 21 | 22 | def bzip2Codec() = new CodecFactory { 23 | override def createInstance(): Codec = new BZip2Codec() 24 | } 25 | 26 | def snappyCodec() = new CodecFactory { 27 | override def createInstance(): Codec = new SnappyCodec() 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/DeflateCodec.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.file.Codec; 4 | import org.apache.avro.file.DataFileConstants; 5 | 6 | import java.io.ByteArrayOutputStream; 7 | import java.io.IOException; 8 | import java.io.OutputStream; 9 | import java.nio.ByteBuffer; 10 | import java.util.zip.Deflater; 11 | import java.util.zip.DeflaterOutputStream; 12 | import java.util.zip.Inflater; 13 | import java.util.zip.InflaterOutputStream; 14 | 15 | /** 16 | * Implements DEFLATE (RFC1951) compression and decompression. 17 | *

18 | * Note that there is a distinction between RFC1951 (deflate) 19 | * and RFC1950 (zlib). zlib adds an extra 2-byte header 20 | * at the front, and a 4-byte checksum at the end. The 21 | * code here, by passing "true" as the "nowrap" option to 22 | * {@link Inflater} and {@link Deflater}, is using 23 | * RFC1951. 24 | */ 25 | class DeflateCodec extends Codec { 26 | 27 | private ByteArrayOutputStream outputBuffer; 28 | private Deflater deflater; 29 | private Inflater inflater; 30 | //currently only do 'nowrap' -- RFC 1951, not zlib 31 | private boolean nowrap = true; 32 | private int compressionLevel; 33 | 34 | public DeflateCodec(int compressionLevel) { 35 | this.compressionLevel = compressionLevel; 36 | } 37 | 38 | @Override 39 | public String getName() { 40 | return DataFileConstants.DEFLATE_CODEC; 41 | } 42 | 43 | @Override 44 | public ByteBuffer compress(ByteBuffer data) throws IOException { 45 | ByteArrayOutputStream baos = getOutputBuffer(data.remaining()); 46 | DeflaterOutputStream ios = new DeflaterOutputStream(baos, getDeflater()); 47 | writeAndClose(data, ios); 48 | ByteBuffer result = ByteBuffer.wrap(baos.toByteArray()); 49 | return result; 50 | } 51 | 52 | @Override 53 | public ByteBuffer decompress(ByteBuffer data) throws IOException { 54 | ByteArrayOutputStream baos = getOutputBuffer(data.remaining()); 55 | InflaterOutputStream ios = new InflaterOutputStream(baos, getInflater()); 56 | writeAndClose(data, ios); 57 | ByteBuffer result = ByteBuffer.wrap(baos.toByteArray()); 58 | return result; 59 | } 60 | 61 | private void writeAndClose(ByteBuffer data, OutputStream to) throws IOException { 62 | byte[] input = data.array(); 63 | int offset = data.arrayOffset() + data.position(); 64 | int length = data.remaining(); 65 | try { 66 | to.write(input, offset, length); 67 | } finally { 68 | to.close(); 69 | } 70 | } 71 | 72 | // get and initialize the inflater for use. 73 | private Inflater getInflater() { 74 | if (null == inflater) { 75 | inflater = new Inflater(nowrap); 76 | } 77 | inflater.reset(); 78 | return inflater; 79 | } 80 | 81 | // get and initialize the deflater for use. 82 | private Deflater getDeflater() { 83 | if (null == deflater) { 84 | deflater = new Deflater(compressionLevel, nowrap); 85 | } 86 | deflater.reset(); 87 | return deflater; 88 | } 89 | 90 | // get and initialize the output buffer for use. 91 | private ByteArrayOutputStream getOutputBuffer(int suggestedLength) { 92 | if (null == outputBuffer) { 93 | outputBuffer = new ByteArrayOutputStream(suggestedLength); 94 | } 95 | outputBuffer.reset(); 96 | return outputBuffer; 97 | } 98 | 99 | @Override 100 | public int hashCode() { 101 | return nowrap ? 0 : 1; 102 | } 103 | 104 | @Override 105 | public boolean equals(Object obj) { 106 | if (this == obj) 107 | return true; 108 | if (obj == null || obj.getClass() != getClass()) 109 | return false; 110 | DeflateCodec other = (DeflateCodec) obj; 111 | return (this.nowrap == other.nowrap); 112 | } 113 | 114 | @Override 115 | public String toString() { 116 | return getName() + "-" + compressionLevel; 117 | } 118 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/NullCodec.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.landoop.avro.codec; 17 | 18 | import org.apache.avro.file.Codec; 19 | import org.apache.avro.file.DataFileConstants; 20 | 21 | import java.io.IOException; 22 | import java.nio.ByteBuffer; 23 | 24 | /** Implements "null" (pass through) codec. */ 25 | final class NullCodec extends Codec { 26 | 27 | public static final NullCodec INSTANCE = new NullCodec(); 28 | 29 | private NullCodec(){ 30 | 31 | } 32 | 33 | @Override 34 | public String getName() { 35 | return DataFileConstants.NULL_CODEC; 36 | } 37 | 38 | @Override 39 | public ByteBuffer compress(ByteBuffer buffer) throws IOException { 40 | return buffer; 41 | } 42 | 43 | @Override 44 | public ByteBuffer decompress(ByteBuffer data) throws IOException { 45 | return data; 46 | } 47 | 48 | @Override 49 | public boolean equals(Object other) { 50 | return this == other || (other != null && other.getClass() == getClass()); 51 | } 52 | 53 | @Override 54 | public int hashCode() { 55 | return 2; 56 | } 57 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/SnappyCodec.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.file.Codec; 4 | import org.apache.avro.file.DataFileConstants; 5 | import org.xerial.snappy.Snappy; 6 | 7 | import java.io.IOException; 8 | import java.nio.ByteBuffer; 9 | import java.util.zip.CRC32; 10 | 11 | /** 12 | * Implements Snappy compression and decompression. 
13 | */ 14 | class SnappyCodec extends Codec { 15 | private CRC32 crc32 = new CRC32(); 16 | 17 | @Override 18 | public String getName() { 19 | return DataFileConstants.SNAPPY_CODEC; 20 | } 21 | 22 | @Override 23 | public ByteBuffer compress(ByteBuffer in) throws IOException { 24 | ByteBuffer out = ByteBuffer.allocate(Snappy.maxCompressedLength(in.remaining()) + 4); 25 | int size = Snappy.compress(in.array(), in.position(), in.remaining(), 26 | out.array(), 0); 27 | crc32.reset(); 28 | crc32.update(in.array(), in.position(), in.remaining()); 29 | out.putInt(size, (int) crc32.getValue()); 30 | 31 | out.limit(size + 4); 32 | 33 | return out; 34 | } 35 | 36 | @Override 37 | public ByteBuffer decompress(ByteBuffer in) throws IOException { 38 | ByteBuffer out = ByteBuffer.allocate 39 | (Snappy.uncompressedLength(in.array(), in.position(), in.remaining() - 4)); 40 | int size = Snappy.uncompress(in.array(), in.position(), in.remaining() - 4, 41 | out.array(), 0); 42 | out.limit(size); 43 | 44 | crc32.reset(); 45 | crc32.update(out.array(), 0, size); 46 | if (in.getInt(in.limit() - 4) != (int) crc32.getValue()) 47 | throw new IOException("Checksum failure"); 48 | 49 | return out; 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return getName().hashCode(); 55 | } 56 | 57 | @Override 58 | public boolean equals(Object obj) { 59 | if (this == obj) 60 | return true; 61 | if (obj == null || obj.getClass() != getClass()) 62 | return false; 63 | return true; 64 | } 65 | 66 | } -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/codec/XZCodec.java: -------------------------------------------------------------------------------- 1 | package com.landoop.avro.codec; 2 | 3 | import org.apache.avro.file.Codec; 4 | import org.apache.avro.file.DataFileConstants; 5 | import org.apache.commons.compress.compressors.xz.XZCompressorInputStream; 6 | import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream; 7 | import org.apache.commons.compress.utils.IOUtils; 8 | 9 | import java.io.*; 10 | import java.nio.ByteBuffer; 11 | 12 | /** 13 | * Implements xz compression and decompression. 
14 | */ 15 | public class XZCodec extends Codec { 16 | 17 | 18 | private ByteArrayOutputStream outputBuffer; 19 | private int compressionLevel; 20 | 21 | public XZCodec(int compressionLevel) { 22 | this.compressionLevel = compressionLevel; 23 | } 24 | 25 | @Override 26 | public String getName() { 27 | return DataFileConstants.XZ_CODEC; 28 | } 29 | 30 | @Override 31 | public ByteBuffer compress(ByteBuffer data) throws IOException { 32 | ByteArrayOutputStream baos = getOutputBuffer(data.remaining()); 33 | OutputStream ios = new XZCompressorOutputStream(baos, compressionLevel); 34 | writeAndClose(data, ios); 35 | return ByteBuffer.wrap(baos.toByteArray()); 36 | } 37 | 38 | @Override 39 | public ByteBuffer decompress(ByteBuffer data) throws IOException { 40 | ByteArrayOutputStream baos = getOutputBuffer(data.remaining()); 41 | InputStream bytesIn = new ByteArrayInputStream( 42 | data.array(), 43 | data.arrayOffset() + data.position(), 44 | data.remaining()); 45 | InputStream ios = new XZCompressorInputStream(bytesIn); 46 | try { 47 | IOUtils.copy(ios, baos); 48 | } finally { 49 | ios.close(); 50 | } 51 | return ByteBuffer.wrap(baos.toByteArray()); 52 | } 53 | 54 | private void writeAndClose(ByteBuffer data, OutputStream to) throws IOException { 55 | byte[] input = data.array(); 56 | int offset = data.arrayOffset() + data.position(); 57 | int length = data.remaining(); 58 | try { 59 | to.write(input, offset, length); 60 | } finally { 61 | to.close(); 62 | } 63 | } 64 | 65 | // get and initialize the output buffer for use. 66 | private ByteArrayOutputStream getOutputBuffer(int suggestedLength) { 67 | if (null == outputBuffer) { 68 | outputBuffer = new ByteArrayOutputStream(suggestedLength); 69 | } 70 | outputBuffer.reset(); 71 | return outputBuffer; 72 | } 73 | 74 | @Override 75 | public int hashCode() { 76 | return compressionLevel; 77 | } 78 | 79 | @Override 80 | public boolean equals(Object obj) { 81 | if (this == obj) 82 | return true; 83 | if (obj == null || obj.getClass() != getClass()) 84 | return false; 85 | XZCodec other = (XZCodec) obj; 86 | return (this.compressionLevel == other.compressionLevel); 87 | } 88 | 89 | @Override 90 | public String toString() { 91 | return getName() + "-" + compressionLevel; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/concurrent/ExecutorExtension.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Landoop. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.landoop.avro.concurrent 17 | 18 | import java.util.concurrent.Executor 19 | 20 | import scala.concurrent.{Future, Promise} 21 | 22 | object ExecutorExtension { 23 | 24 | implicit class RunnableWrapper(val executor: Executor) extends AnyVal { 25 | def submit[T](thunk: => T): Future[T] = { 26 | val promise = Promise[T]() 27 | executor.execute(new Runnable { 28 | override def run(): Unit = { 29 | try { 30 | val t = thunk 31 | promise.success(t) 32 | } catch { 33 | case t: Throwable => promise.failure(t) 34 | } 35 | } 36 | }) 37 | promise.future 38 | } 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/landoop/avro/concurrent/FutureAwaitWithFailFastFn.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Datamountaineer. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.landoop.avro.concurrent 18 | 19 | import java.util.concurrent.{ExecutorService, TimeUnit} 20 | 21 | import scala.concurrent.ExecutionContext.Implicits.global 22 | import scala.concurrent.duration._ 23 | import scala.concurrent.{Await, Future, Promise} 24 | import scala.util.Failure 25 | 26 | object FutureAwaitWithFailFastFn { 27 | 28 | def apply(executorService: ExecutorService, futures: Seq[Future[Unit]], duration: Duration): Unit = { 29 | //make sure we ask the executor to shutdown to ensure the process exits 30 | executorService.shutdown() 31 | 32 | val promise = Promise[Boolean]() 33 | 34 | //stop on the first failure 35 | futures.foreach { f => 36 | f.onFailure { case t => 37 | if (promise.tryFailure(t)) { 38 | executorService.shutdownNow() 39 | } 40 | } 41 | } 42 | 43 | val fut = Future.sequence(futures) 44 | fut.foreach { case t => 45 | if (promise.trySuccess(true)) { 46 | val failed = executorService.shutdownNow() 47 | if (failed.size() > 0) { 48 | //do something?! 
49 | } 50 | } 51 | } 52 | 53 | Await.ready(promise.future, duration).value match { 54 | case Some(Failure(t)) => 55 | executorService.awaitTermination(1, TimeUnit.MINUTES) 56 | //throw the underlying error 57 | throw t 58 | 59 | case _ => 60 | executorService.awaitTermination(1, TimeUnit.MINUTES) 61 | } 62 | } 63 | 64 | def apply[T](executorService: ExecutorService, futures: Seq[Future[T]], duration: Duration = 1.hours): Seq[T] = { 65 | //make sure we ask the executor to shutdown to ensure the process exits 66 | executorService.shutdown() 67 | 68 | val promise = Promise[Boolean]() 69 | 70 | //stop on the first failure 71 | futures.foreach { f => 72 | f.onFailure { case t => 73 | if (promise.tryFailure(t)) { 74 | executorService.shutdownNow() 75 | } 76 | } 77 | } 78 | 79 | val fut = Future.sequence(futures) 80 | fut.foreach { case t => 81 | if (promise.trySuccess(true)) { 82 | val failed = executorService.shutdownNow() 83 | if (failed.size() > 0) { 84 | //do something?!logging 85 | } 86 | } 87 | } 88 | 89 | Await.ready(promise.future, duration).value match { 90 | case Some(Failure(t)) => 91 | executorService.awaitTermination(1, TimeUnit.MINUTES) 92 | //throw the underlying error 93 | throw t 94 | 95 | case _ => 96 | executorService.awaitTermination(1, TimeUnit.MINUTES) 97 | //return the result from each of the futures 98 | Await.result(Future.sequence(futures), 1.minute) 99 | } 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2017 Datamountaineer. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | # suppress inspection "UnusedProperty" for whole file 18 | log4j.rootLogger=INFO,stdout 19 | 20 | #stdout 21 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 22 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.stdout.layout.conversionPattern=%d{ISO8601} %-5p [%t] [%c] [%M:%L] %m%n 24 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/AvroFileWriter.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.io.{BufferedOutputStream, File, FileOutputStream} 4 | 5 | import com.landoop.avro.codec.CodecFactory 6 | import org.apache.avro.Schema 7 | import org.apache.avro.file.DataFileWriter 8 | import org.apache.avro.generic.GenericRecord 9 | 10 | object AvroFileWriter { 11 | def fastWrite(file: File, 12 | count: Int, 13 | parallelization: Int, 14 | schema: Schema, 15 | records: IndexedSeq[GenericRecord]) = { 16 | val out = new BufferedOutputStream(new FileOutputStream(file), 4 * 1048576) 17 | 18 | import org.apache.avro.generic.GenericDatumWriter 19 | val datumWriter = new GenericDatumWriter[GenericRecord](schema) 20 | val builder = FastDataFileWriterBuilder(datumWriter, out, schema) 21 | .withCodec(CodecFactory.snappyCodec()) 22 | .withFlushOnEveryBlock(false) 23 | .withParallelization(parallelization) 24 | 25 | builder.encoderFactory.configureBufferSize(4 * 1048576) 26 | builder.encoderFactory.configureBlockSize(4 * 1048576) 27 | 28 | val fileWriter = builder.build() 29 | fileWriter.write(records) 30 | fileWriter.close() 31 | } 32 | 33 | def write(file: File, 34 | count: Int, 35 | schema: Schema, 36 | records: Seq[GenericRecord]) = { 37 | val out = new BufferedOutputStream(new FileOutputStream(file), 4 * 1048576) 38 | 39 | import org.apache.avro.generic.GenericDatumWriter 40 | val datumWriter = new GenericDatumWriter[GenericRecord](schema) 41 | val writer = new DataFileWriter(datumWriter) 42 | .setCodec(org.apache.avro.file.CodecFactory.snappyCodec()) 43 | .create(schema, out) 44 | 45 | writer.setFlushOnEveryBlock(false) 46 | 47 | records.foreach(writer.append) 48 | writer.close() 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/FastDataFileWriterTest.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.io.{BufferedOutputStream, File, FileOutputStream} 4 | import java.util.UUID 5 | 6 | import com.landoop.avro.codec.CodecFactory 7 | import com.sksamuel.avro4s.{RecordFormat, SchemaFor} 8 | import org.apache.avro.Schema 9 | import org.apache.avro.generic.{GenericDatumReader, GenericRecord} 10 | import org.scalatest.{Matchers, WordSpec} 11 | 12 | class FastDataFileWriterTest extends WordSpec with Matchers { 13 | "FastDataFileWriter" should { 14 | "write 50000 Stock Quotes" in { 15 | runTest(50000, 4) 16 | } 17 | 18 | "write 123341 Stock Quotes" in { 19 | runTest(123341, 4) 20 | } 21 | 22 | "write 1000000 Stock Quotes" in { 23 | runTest(1000000, 8) 24 | } 25 | 26 | } 27 | 28 | private def runTest(count: Int, parallelization: Int) = { 29 | val file = new File(UUID.randomUUID().toString + ".avro") 30 | file.deleteOnExit() 31 | try { 32 | 33 | val out = new BufferedOutputStream(new FileOutputStream(file), 4 * 1048576) 34 | val schema = SchemaFor[StockQuote]() 35 | val recordFormat = RecordFormat[StockQuote] 36 | val records = 
StockQuote.generate(count) 37 | import org.apache.avro.generic.GenericDatumWriter 38 | val datumWriter = new GenericDatumWriter[GenericRecord](schema) 39 | val builder = FastDataFileWriterBuilder(datumWriter, out, schema) 40 | .withCodec(CodecFactory.snappyCodec()) 41 | .withFlushOnEveryBlock(false) 42 | .withParallelization(parallelization) 43 | 44 | builder.encoderFactory.configureBufferSize(4 * 1048576) 45 | builder.encoderFactory.configureBlockSize(4 * 1048576) 46 | 47 | val fileWriter = builder.build() 48 | fileWriter.write(records) 49 | fileWriter.close() 50 | 51 | import org.apache.avro.file.{DataFileConstants, DataFileReader} 52 | val datumReader = new GenericDatumReader[GenericRecord]() 53 | val reader = new DataFileReader[GenericRecord](file, datumReader) 54 | 55 | val scheamText = new String(reader.getMeta(DataFileConstants.SCHEMA)) 56 | val actualSchema = new Schema.Parser().parse(scheamText) 57 | actualSchema shouldBe schema 58 | 59 | val codecMeta = reader.getMetaString(DataFileConstants.CODEC) 60 | codecMeta shouldBe CodecFactory.snappyCodec().createInstance().getName 61 | val iter = new Iterator[GenericRecord] { 62 | override def hasNext: Boolean = reader.hasNext 63 | 64 | override def next(): GenericRecord = reader.next() 65 | } 66 | val actualRecordsCount = iter.foldLeft(0) { case (total, r) => 67 | val quote = recordFormat.from(r) 68 | quote.symbol shouldBe StockQuote.SampleQuote.symbol 69 | 70 | total + 1 71 | } 72 | 73 | actualRecordsCount shouldBe count 74 | reader.close() 75 | } 76 | finally { 77 | if (file.exists()) file.delete() 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/FastWriteProgram.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.io.File 4 | import java.util.UUID 5 | 6 | import com.sksamuel.avro4s.{RecordFormat, SchemaFor} 7 | 8 | object FastWriteProgram extends App with Timed { 9 | 10 | val recordsCount = 1000000 11 | val schema = SchemaFor[StockQuote]() 12 | val recordFormat = RecordFormat[StockQuote] 13 | val records = StockQuote.generate(recordsCount) 14 | 15 | val runs = 10 16 | val files = (1 to runs + 1).map(_ => new File(UUID.randomUUID().toString + ".avro")) 17 | .toVector 18 | 19 | files.foreach(_.deleteOnExit()) 20 | AvroFileWriter.fastWrite(files.last, recordsCount, 8, schema, records) 21 | val stats = (1 to runs).map(files).map { f => 22 | withTime { 23 | AvroFileWriter.fastWrite(f, recordsCount, 8, schema, records) 24 | } 25 | } 26 | 27 | stats.zipWithIndex.foreach { case (d, i) => 28 | logger.info(s"Run number $i took ${d.toMillis} ms") 29 | println(s"Run number $i took ${d.toMillis} ms") 30 | } 31 | 32 | logger.info(s"Min run took ${stats.min.toMillis} ms") 33 | println(s"Min run took ${stats.min.toMillis} ms") 34 | logger.info(s"Max run took ${stats.max.toMillis} ms") 35 | println(s"Max run took ${stats.max.toMillis} ms") 36 | logger.info(s"Avg run took ${stats.map(_.toMillis).sum / runs} ms") 37 | println(s"Avg run took ${stats.map(_.toMillis).sum / runs} ms") 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/StandardWriteProgram.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.io.File 4 | import java.util.UUID 5 | 6 | import com.sksamuel.avro4s.{RecordFormat, SchemaFor} 7 | 8 | object 
StandardWriteProgram extends App with Timed { 9 | 10 | val recordsCount = 1000000 11 | val schema = SchemaFor[StockQuote]() 12 | val recordFormat = RecordFormat[StockQuote] 13 | val records = StockQuote.generate(recordsCount) 14 | 15 | val runs = 10 16 | val files = (1 to runs + 1).map(_ => new File(UUID.randomUUID().toString + ".avro")) 17 | .toVector 18 | 19 | files.foreach(_.deleteOnExit()) 20 | AvroFileWriter.write(files.last, recordsCount, schema, records) 21 | val stats = (1 to runs).map(files).map { f => 22 | withTime { 23 | AvroFileWriter.write(f, recordsCount, schema, records) 24 | } 25 | }.toVector 26 | 27 | stats.zipWithIndex.foreach { case (d, i) => 28 | logger.info(s"Run number $i took ${d.toMillis} ms") 29 | println(s"Run number $i took ${d.toMillis} ms") 30 | } 31 | 32 | logger.info(s"Min run took ${stats.min.toMillis} ms") 33 | println(s"Min run took ${stats.min.toMillis} ms") 34 | logger.info(s"Max run took ${stats.max.toMillis} ms") 35 | println(s"Max run took ${stats.max.toMillis} ms") 36 | logger.info(s"Avg run took ${stats.map(_.toMillis).sum / runs} ms") 37 | println(s"Avg run took ${stats.map(_.toMillis).sum / runs} ms") 38 | } 39 | -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/StockQuote.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import com.sksamuel.avro4s.RecordFormat 4 | import org.apache.avro.generic.GenericRecord 5 | 6 | case class StockQuote(symbol: String, 7 | timestamp: Long, 8 | ask: Double, 9 | askSize: Int, 10 | bid: Double, 11 | bidSize: Int, 12 | dayHigh: Double, 13 | dayLow: Double, 14 | lastTradeSize: Int, 15 | lastTradeTime: Long, 16 | open: Double, 17 | previousClose: Double, 18 | price: Double, 19 | priceAvg50: Double, 20 | priceAvg200: Double, 21 | volume: Long, 22 | yearHigh: Double, 23 | yearLow: Double, 24 | f1:String="value", 25 | f2:String="value", 26 | f3:String="value", 27 | f4:String="value", 28 | f5:String="value", 29 | f6:String="value", 30 | f7:String="value", 31 | f8:String="value", 32 | f9:String="value", 33 | f10:String="value", 34 | f11:String="value", 35 | f12:String="value", 36 | f13:String="value", 37 | f14:String="value", 38 | f15:String="value", 39 | f16:String="value", 40 | f17:String="value", 41 | f18:String="value", 42 | f19:String="value", 43 | f20:String="value", 44 | f21:String="value", 45 | f22:String="value", 46 | f23:String="value", 47 | f24:String="value", 48 | f25:String="value", 49 | f26:String="value", 50 | f27:String="value", 51 | f28:String="value", 52 | f29:String="value", 53 | f30:String="value", 54 | f31:String="value", 55 | f32:String="value", 56 | f33:String="value", 57 | f34:String="value", 58 | f35:String="value", 59 | f36:String="value", 60 | f37:String="value", 61 | f38:String="value", 62 | f39:String="value", 63 | f40:String="value", 64 | f41:String="value", 65 | f42:String="value", 66 | f43:String="value", 67 | f44:String="value", 68 | f45:String="value", 69 | f46:String="value", 70 | f47:String="value", 71 | f48:String="value", 72 | f49:String="value", 73 | f50:String="value", 74 | f51:String="value", 75 | f52:String="value", 76 | f53:String="value", 77 | f54:String="value", 78 | f55:String="value", 79 | f56:String="value", 80 | f57:String="value", 81 | f58:String="value", 82 | f59:String="value", 83 | f60:String="value" 84 | ) 85 | 86 | 87 | object StockQuote { 88 | private implicit val format = RecordFormat[StockQuote] 89 | 90 | val SampleQuote = 
StockQuote("MSFT", 91 | System.currentTimeMillis(), 92 | 52.29, 93 | 1000, 94 | 52.21, 95 | 1259, 96 | 52.36, 97 | 51.01, 98 | 100, 99 | System.currentTimeMillis(), 100 | 51.73, 101 | 51.38, 102 | 52.30, 103 | 52.11, 104 | 52.01, 105 | 3000000, 106 | 56.85, 107 | 47.85) 108 | 109 | def generate(count: Int): Vector[GenericRecord] = { 110 | (1 to count) 111 | .foldLeft(Vector.empty[GenericRecord]) { case (col, _) => 112 | val quote = SampleQuote 113 | col :+ format.to(quote) 114 | } 115 | } 116 | } -------------------------------------------------------------------------------- /src/test/scala/com/landoop/avro/Timed.scala: -------------------------------------------------------------------------------- 1 | package com.landoop.avro 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import com.typesafe.scalalogging.StrictLogging 6 | 7 | import scala.concurrent.duration._ 8 | 9 | trait Timed extends StrictLogging { 10 | def withTime[T](message: String)(thunk: => T): T = { 11 | val start = System.nanoTime() 12 | val r = thunk 13 | val end = System.nanoTime() 14 | val duration = Duration.create(end - start, TimeUnit.NANOSECONDS).toMillis 15 | logger.info(s"$message took $duration ms") 16 | r 17 | } 18 | 19 | def withTime(thunk: => Unit): Duration = { 20 | val start = System.nanoTime() 21 | val r = thunk 22 | val end = System.nanoTime() 23 | Duration.create(end - start, TimeUnit.NANOSECONDS) 24 | } 25 | } 26 | --------------------------------------------------------------------------------