├── .gitignore ├── README.md ├── src └── main │ ├── resources │ └── avro │ │ └── example.avdl │ └── scala │ └── com │ └── zenfractal │ ├── SerializableAminoAcid.java │ └── SparkParquetExample.scala ├── pom.xml └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .idea 3 | *.iml 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | spark-parquet-example 2 | ===================== 3 | 4 | Example project to show how to use Spark to read and write Avro/Parquet files 5 | 6 | To run this example, you will need to have Maven installed. Once installed, 7 | you can launch the example by cloning this repo and running, 8 | 9 | $ mvn scala:run -DmainClass=com.zenfractal.SparkParquetExample 10 | -------------------------------------------------------------------------------- /src/main/resources/avro/example.avdl: -------------------------------------------------------------------------------- 1 | @namespace("com.zenfractal") 2 | protocol SparkParquetAvro { 3 | 4 | enum AminoAcidType { 5 | ALIPHATIC, 6 | HYDROXYL, 7 | CYCLIC, 8 | AROMATIC, 9 | BASIC, 10 | ACIDIC 11 | } 12 | 13 | record AminoAcid { 14 | AminoAcidType type; 15 | string fullName; 16 | string abbreviation; 17 | float molecularWeight; 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/com/zenfractal/SerializableAminoAcid.java: -------------------------------------------------------------------------------- 1 | package com.zenfractal; 2 | 3 | import org.apache.avro.io.*; 4 | import org.apache.avro.specific.SpecificDatumReader; 5 | import org.apache.avro.specific.SpecificDatumWriter; 6 | 7 | import java.io.IOException; 8 | import java.io.ObjectStreamException; 9 | import java.io.Serializable; 10 | 11 | /** 12 | * For now, Spark does not support Avro. 
This class is just a quick 13 | * workaround that lets AminoAcid objects pass through Java serialization by (de)serializing them with the Avro binary encoding. 14 | */ 15 | public class SerializableAminoAcid extends AminoAcid implements Serializable { 16 | 17 | private void setValues(AminoAcid acid) { 18 | setAbbreviation(acid.getAbbreviation()); 19 | setFullName(acid.getFullName()); 20 | setMolecularWeight(acid.getMolecularWeight()); 21 | setType(acid.getType()); 22 | } 23 | 24 | public SerializableAminoAcid(AminoAcid acid) { 25 | setValues(acid); 26 | } 27 | 28 | private void writeObject(java.io.ObjectOutputStream out) 29 | throws IOException { 30 | DatumWriter writer = new SpecificDatumWriter(AminoAcid.class); 31 | Encoder encoder = EncoderFactory.get().binaryEncoder(out, null); 32 | writer.write(this, encoder); 33 | encoder.flush(); 34 | } 35 | 36 | private void readObject(java.io.ObjectInputStream in) 37 | throws IOException, ClassNotFoundException { 38 | DatumReader reader = 39 | new SpecificDatumReader(AminoAcid.class); 40 | Decoder decoder = DecoderFactory.get().binaryDecoder(in, null); 41 | setValues(reader.read(null, decoder)); 42 | } 43 | 44 | private void readObjectNoData() 45 | throws ObjectStreamException { 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/zenfractal/SparkParquetExample.scala: -------------------------------------------------------------------------------- 1 | package com.zenfractal 2 | 3 | import parquet.hadoop.{ParquetOutputFormat, ParquetInputFormat} 4 | import spark.SparkContext 5 | import spark.SparkContext._ 6 | import org.apache.hadoop.mapreduce.Job 7 | import parquet.avro.{AvroParquetOutputFormat, AvroWriteSupport, AvroReadSupport} 8 | import parquet.filter.{RecordFilter, UnboundRecordFilter} 9 | import java.lang.Iterable 10 | import parquet.column.ColumnReader 11 | import parquet.filter.ColumnRecordFilter._ 12 | import parquet.filter.ColumnPredicates._ 13 | import com.google.common.io.Files 14 | import java.io.File 
15 | 16 | object SparkParquetExample { 17 | 18 | // This predicate will remove all amino acids that are not basic 19 | class BasicAminoAcidPredicate extends UnboundRecordFilter { 20 | def bind(readers: Iterable[ColumnReader]): RecordFilter = { 21 | column("type", equalTo(AminoAcidType.BASIC)).bind(readers) 22 | } 23 | } 24 | 25 | // Only prints non-null amino acids 26 | private def aminoAcidPrinter(tuple: Tuple2[Void, AminoAcid]) = { 27 | if (tuple._2 != null) println(tuple._2) 28 | } 29 | 30 | def main(args: Array[String]) { 31 | val sc = new SparkContext("local", "ParquetExample") 32 | val job = new Job() 33 | 34 | val tempDir = Files.createTempDir() 35 | val outputDir = new File(tempDir, "output").getAbsolutePath 36 | println(outputDir) 37 | 38 | val essentialAminoAcids = List( 39 | new AminoAcid(AminoAcidType.BASIC, "histidine", "his", 155.16f), 40 | new AminoAcid(AminoAcidType.ALIPHATIC, "isoleucine", "ile", 131.18f), 41 | new AminoAcid(AminoAcidType.ALIPHATIC, "leucine", "leu", 131.18f), 42 | new AminoAcid(AminoAcidType.BASIC, "lysine", "lys", 146.19f), 43 | new AminoAcid(AminoAcidType.HYDROXYL, "methionine", "met", 149.21f), 44 | new AminoAcid(AminoAcidType.AROMATIC, "phenylalanine", "phe", 165.19f), 45 | new AminoAcid(AminoAcidType.HYDROXYL, "threonine", "thr", 119.12f), 46 | new AminoAcid(AminoAcidType.AROMATIC, "tryptophan", "trp", 204.23f), 47 | new AminoAcid(AminoAcidType.ALIPHATIC, "valine", "val", 117.15f)) 48 | 49 | 50 | // Configure the ParquetOutputFormat to use Avro as the serialization format 51 | ParquetOutputFormat.setWriteSupportClass(job, classOf[AvroWriteSupport]) 52 | // You need to pass the schema to AvroParquet when you are writing objects but not when you 53 | // are reading them. The schema is saved in the Parquet file for future readers to use. 
54 | AvroParquetOutputFormat.setSchema(job, AminoAcid.SCHEMA$) 55 | // Create a PairRDD with all keys set to null and wrap each amino acid in serializable objects 56 | val rdd = sc.makeRDD(essentialAminoAcids.map(acid => (null, new SerializableAminoAcid(acid)))) 57 | // Save the RDD to a Parquet file in our temporary output directory 58 | rdd.saveAsNewAPIHadoopFile(outputDir, classOf[Void], classOf[AminoAcid], 59 | classOf[ParquetOutputFormat[AminoAcid]], job.getConfiguration) 60 | 61 | // Read all the amino acids back to show that they were all saved to the Parquet file 62 | ParquetInputFormat.setReadSupportClass(job, classOf[AvroReadSupport[AminoAcid]]) 63 | val file = sc.newAPIHadoopFile(outputDir, classOf[ParquetInputFormat[AminoAcid]], 64 | classOf[Void], classOf[AminoAcid], job.getConfiguration) 65 | file.foreach(aminoAcidPrinter) 66 | 67 | // Set a predicate and Parquet only deserializes amino acids that are basic. 68 | // Non-basic amino acids will be returned as null. 69 | ParquetInputFormat.setUnboundRecordFilter(job, classOf[BasicAminoAcidPredicate]) 70 | val filteredFile = sc.newAPIHadoopFile(outputDir, classOf[ParquetInputFormat[AminoAcid]], 71 | classOf[Void], classOf[AminoAcid], job.getConfiguration) 72 | filteredFile.foreach(aminoAcidPrinter) 73 | } 74 | 75 | 76 | } 77 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | com.zenfractal 6 | spark-parquet-example 7 | 0.1.0-SNAPSHOT 8 | pom 9 | Spark Avro/Parquet Example 10 | 11 | 12 | 2.9.3 13 | 1.7.4 14 | 1.6 15 | 16 | 17 | 18 | target/scala-${scala.version}/classes 19 | target/scala-${scala.version}/test-classes 20 | 21 | 22 | org.apache.maven.plugins 23 | maven-compiler-plugin 24 | 3.1 25 | 26 | ${java.version} 27 | ${java.version} 28 | 29 | 30 | 31 | org.scalatest 32 | scalatest-maven-plugin 33 | 1.0-M4-SNAP1 34 | 35 | 36 | org.apache.avro 37 | 
avro-maven-plugin 38 | ${avro.version} 39 | 40 | 41 | schemas 42 | generate-sources 43 | 44 | schema 45 | protocol 46 | idl-protocol 47 | 48 | 49 | ${project.basedir}/src/main/resources/avro 50 | 51 | 52 | 53 | 54 | 55 | net.alchim31.maven 56 | scala-maven-plugin 57 | 3.1.0 58 | 59 | 60 | scala-compile-first 61 | process-resources 62 | 63 | compile 64 | 65 | 66 | 67 | scala-test-compile-first 68 | process-test-resources 69 | 70 | testCompile 71 | 72 | 73 | 74 | attach-scaladocs 75 | verify 76 | 77 | doc-jar 78 | 79 | 80 | 81 | 82 | ${scala.version} 83 | incremental 84 | 85 | -unchecked 86 | -optimise 87 | 88 | 89 | -Xms64m 90 | -Xmx1024m 91 | 92 | 93 | -source 94 | ${java.version} 95 | -target 96 | ${java.version} 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | org.spark-project 106 | spark-core_${scala.version} 107 | 0.7.3 108 | 109 | 110 | com.twitter 111 | parquet-avro 112 | 1.0.0 113 | 114 | 115 | org.apache.avro 116 | avro 117 | ${avro.version} 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 
17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 
43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 
70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 
111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 
145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. 
We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | --------------------------------------------------------------------------------