├── LICENSE.txt
├── README
├── pom.xml
└── src
    ├── main
    │   └── java
    │       └── org
    │           └── msgpack
    │               └── hadoop
    │                   ├── hive
    │                   │   ├── MessagePackStorageHandler.java
    │                   │   ├── serde2
    │                   │   │   ├── MessagePackSerDe.java
    │                   │   │   └── lazy
    │                   │   │       └── LazyMessagePackRow.java
    │                   │   └── udf
    │                   │       ├── GenericUDTFMessagePackArray.java
    │                   │       └── GenericUDTFMessagePackMap.java
    │                   ├── io
    │                   │   └── MessagePackWritable.java
    │                   ├── mapred
    │                   │   ├── MessagePackInputFormat.java
    │                   │   ├── MessagePackOutputFormat.java
    │                   │   ├── MessagePackRecordReader.java
    │                   │   └── MessagePackRecordWriter.java
    │                   └── mapreduce
    │                       ├── input
    │                       │   ├── MessagePackInputFormat.java
    │                       │   └── MessagePackRecordReader.java
    │                       └── output
    │                           ├── MessagePackOutputFormat.java
    │                           └── MessagePackRecordWriter.java
    └── test
        └── java
            └── org
                └── msgpack
                    └── hadoop
                        ├── hive
                        │   └── serde2
                        │       └── TestMessagePackSerDe.java
                        ├── io
                        │   └── TestMessagePackWritable.java
                        └── mapreduce
                            └── input
                                └── TestMessagePackInputFormat.java
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | MessagePack-Hadoop Integration
2 | ========================================
3 |
4 | This package contains the bridge layer between MessagePack (http://msgpack.org)
5 | and the Hadoop (http://hadoop.apache.org/) family of projects.
6 |
7 | This enables you to run MapReduce jobs over MessagePack-formatted data, and
8 | also to query that data with the Hive query language.
9 |
10 | The MessagePack-Hive adapter enables SQL-based ad-hoc queries, which take *nested*,
11 | *unstructured* data as input (like JSON, but binary-encoded). Of course, the query
12 | is executed on the MapReduce framework!
13 |
14 | Here is a sample MessagePack-Hive query, which counts unique users per URL.
15 |
16 | > CREATE EXTERNAL TABLE IF NOT EXISTS mpbin (v string) \
17 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '@' LINES TERMINATED BY '\n' \
18 | LOCATION '/path/to/hdfs/';
19 |
20 | > SELECT url, COUNT(DISTINCT user) \
21 | FROM mpbin LATERAL VIEW msgpack_map(v, 'user', 'url') m AS user, url \
22 | GROUP BY url;
23 |
24 | Required Setup
25 | ========================================
26 |
27 | Please set up a Hadoop + Hive system first. Either a Local, Pseudo-Distributed,
28 | or Distributed environment is fine.
29 |
30 | Hive Getting Started
31 | ========================================
32 |
33 | 1. locate jars
34 |
35 | Put these jars into the $HIVE_HOME/lib/ directory.
36 |
37 | * msgpack-hadoop-$version.jar
38 | * msgpack-$version.jar
39 | * javassist-$version.jar
40 |
41 | 2. exec hive shell
42 |
43 | Execute the following command.
44 |
45 | $ hive --auxpath $HIVE_HOME/lib/msgpack-hadoop-$version.jar,$HIVE_HOME/lib/msgpack-$version.jar,$HIVE_HOME/lib/javassist-$version.jar
46 |
47 | You can skip the --auxpath option once you modify your hive-site.xml:
48 |
49 | <property>
50 |   <name>hive.aux.jars.path</name>
51 |   <value>$HIVE_HOME/lib/msgpack-hadoop-$version.jar,$HIVE_HOME/lib/msgpack-$version.jar,$HIVE_HOME/lib/javassist-$version.jar</value>
52 | </property>
53 |
54 | 3. add jar and load custom UDTF function
55 |
56 | This step is required once per Hive session.
57 |
58 | hive> add jar $HIVE_HOME/lib/msgpack-hadoop-$version.jar;
59 | hive> add jar $HIVE_HOME/lib/msgpack-$version.jar;
60 | hive> add jar $HIVE_HOME/lib/javassist-$version.jar;
61 | hive> CREATE TEMPORARY FUNCTION msgpack_map AS 'org.msgpack.hadoop.hive.udf.GenericUDTFMessagePackMap';
62 |
63 | 4. create external table
64 |
65 | Create an external table, which points to the data directory.
66 |
67 | hive> CREATE EXTERNAL TABLE IF NOT EXISTS mp_table (v string) \
68 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '@' LINES TERMINATED BY '\n' \
69 | LOCATION '/path/to/hdfs/';
70 |
71 | 5. execute the query
72 |
73 | Finally, execute the SELECT query over the input data.
74 |
75 | The input msgpack data is unstructured, nested data. Therefore, you need to "map"
76 | the MessagePack structure to Hive field names. You can map the fields by
77 | using the msgpack_map() UDTF function, and name them with the "AS" clause.
78 |
79 | hive> SELECT url, COUNT(DISTINCT user) \
80 | FROM mp_table LATERAL VIEW msgpack_map(v, 'user', 'url') m AS user, url \
81 | GROUP BY url;
82 |
83 | Caveats
84 | ========================================
85 |
86 | Currently, MessagePackInputFormat is not splittable. Therefore, you need to
87 | manually *shred* the data into small files.
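For plain MapReduce jobs (without Hive), MessagePackInputFormat can be used
directly. Below is a minimal, untested sketch of a new-API job that counts the
records in MessagePack files; the RecordCount driver and mapper names are
hypothetical, while the input format and writable are the classes shipped in
this package.

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  import org.msgpack.hadoop.io.MessagePackWritable;
  import org.msgpack.hadoop.mapreduce.input.MessagePackInputFormat;

  public class RecordCount {
      // Emits one ("records", 1) pair per deserialized MessagePack object.
      public static class RecordCountMapper
              extends Mapper<LongWritable, MessagePackWritable, Text, LongWritable> {
          private final Text tag = new Text("records");
          private final LongWritable one = new LongWritable(1);
          @Override
          protected void map(LongWritable key, MessagePackWritable value, Context context)
                  throws java.io.IOException, InterruptedException {
              context.write(tag, one);
          }
      }

      public static void main(String[] args) throws Exception {
          Job job = new Job(new Configuration(), "msgpack-record-count");
          job.setJarByClass(RecordCount.class);
          job.setInputFormatClass(MessagePackInputFormat.class); // not splittable: one map per file
          job.setMapperClass(RecordCountMapper.class);
          job.setOutputKeyClass(Text.class);
          job.setOutputValueClass(LongWritable.class);
          FileInputFormat.addInputPath(job, new Path(args[0]));
          FileOutputFormat.setOutputPath(job, new Path(args[1]));
          System.exit(job.waitForCompletion(true) ? 0 : 1);
      }
  }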
88 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.msgpack.hadoop 4 | msgpack-hadoop 5 | 0.1.0 6 | MessagePack-Hadoop Integration 7 | 8 | MessagePack-Hadoop Integration 9 | http://msgpack.org/ 10 | 11 | 12 | 13 | The Apache Software License, Version 2.0 14 | http://www.apache.org/licenses/LICENSE-2.0.txt 15 | repo 16 | 17 | 18 | 19 | 20 | scm:git:git://github.com/msgpack/msgpack-hadoop.git 21 | scm:git:git://github.com/msgpack/msgpack-hadoop.git 22 | 23 | 24 | 25 | 0.5.2-devel 26 | 0.20.2 27 | 0.8.0-SNAPSHOT 28 | 29 | 30 | 31 | 32 | junit 33 | junit 34 | 4.8.1 35 | test 36 | 37 | 38 | org.slf4j 39 | slf4j-api 40 | 1.4.3 41 | 42 | 43 | org.slf4j 44 | slf4j-log4j12 45 | 1.4.3 46 | 47 | 48 | commons-logging 49 | commons-logging 50 | 1.0.4 51 | 52 | 53 | org.msgpack 54 | msgpack 55 | ${msgpack.version} 56 | 57 | 58 | org.apache.hadoop 59 | hadoop-core 60 | ${hadoop.version} 61 | 62 | 63 | hsqldb 64 | hsqldb 65 | 66 | 67 | net.sf.kosmosfs 68 | kfs 69 | 70 | 71 | org.eclipse.jdt 72 | core 73 | 74 | 75 | net.java.dev.jets3t 76 | jets3t 77 | 78 | 79 | oro 80 | oro 81 | 82 | 83 | 84 | 85 | org.apache.hive 86 | common 87 | ${hive.version} 88 | 89 | 90 | org.apache.hive 91 | exec 92 | ${hive.version} 93 | 94 | 95 | org.apache.hive 96 | serde 97 | ${hive.version} 98 | 99 | 100 | 101 | 102 | 103 | 104 | src/main/resources 105 | 106 | 107 | 108 | 109 | src/test/resources 110 | 111 | 112 | 113 | 114 | 115 | maven-compiler-plugin 116 | 117 | 1.5 118 | 1.5 119 | 120 | 121 | 122 | 123 | maven-eclipse-plugin 124 | 2.5.1 125 | 126 | 127 | 128 | maven-release-plugin 129 | 130 | 131 | deploy 132 | scm:git://github.com/msgpack/msgpack-hadoop.git 133 | 134 | 135 | 136 | 137 | org.apache.maven.plugins 138 | maven-source-plugin 139 | 140 | 141 | attach-sources 142 | 143 | jar 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | org.apache.maven.plugins 157 | maven-javadoc-plugin 158 | 159 | ${project.name} ${project.version} API 160 | true 161 | en_US 162 | UTF-8 163 | 164 | 165 | 166 | 167 | org.apache.maven.plugins 168 | maven-jxr-plugin 169 | 170 | 171 | 172 | org.apache.maven.plugins 173 | maven-surefire-report-plugin 174 | 175 | 176 | 177 | 178 | 179 | 180 | apache release 181 | https://repository.apache.org/content/repositories/releases/ 182 | 183 | 184 | msgpack.org 185 | MessagePack Maven2 Repository 186 | http://msgpack.org/maven2/ 187 | 188 | 189 | repository.jboss.org 190 | https://repository.jboss.org/nexus/content/groups/public/ 191 | 192 | false 193 | 194 | 195 | 196 | 197 | apache.snapshots 198 | Apache Development Snapshot Repository 199 | https://repository.apache.org/content/repositories/snapshots/ 200 | 201 | false 202 | 203 | 204 | true 205 | 206 | 207 | 208 | 209 | 210 | 211 | false 212 | msgpack.org 213 | Repository at msgpack.org 214 | file://${project.build.directory}/website/maven2/ 215 | 216 | 217 | true 218 | msgpack.org 219 | Repository at msgpack.org 220 | file://${project.build.directory}/website/maven2/ 221 | 222 | 234 | 235 | 236 | 237 | 238 | release 239 | 240 | 241 | 242 | true 243 | org.apache.maven.plugins 244 | maven-deploy-plugin 245 | 2.4 246 | 247 | true 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | -------------------------------------------------------------------------------- /src/main/java/org/msgpack/hadoop/hive/MessagePackStorageHandler.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * MessagePack-Hadoop Integration 3 | * 4 | * Copyright (C) 2009-2011 MessagePack Project 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.msgpack.hadoop.hive; 20 | 21 | import java.io.IOException; 22 | import java.util.ArrayList; 23 | import java.util.Arrays; 24 | import java.util.HashSet; 25 | import java.util.List; 26 | import java.util.Map; 27 | import java.util.Properties; 28 | import java.util.Set; 29 | 30 | import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler; 31 | import org.apache.hadoop.hive.serde2.SerDe; 32 | import org.apache.hadoop.hive.serde2.SerDeException; 33 | import org.apache.hadoop.mapred.InputFormat; 34 | import org.apache.hadoop.mapred.OutputFormat; 35 | 36 | import org.msgpack.hadoop.hive.serde2.MessagePackSerDe; 37 | import org.msgpack.hadoop.mapred.MessagePackInputFormat; 38 | import org.msgpack.hadoop.mapred.MessagePackOutputFormat; 39 | 40 | class MessagePackStorageHandler extends DefaultStorageHandler { 41 | @Override 42 | public Class getInputFormatClass() { 43 | return MessagePackInputFormat.class; 44 | } 45 | 46 | @Override 47 | public Class getOutputFormatClass() { 48 | return MessagePackOutputFormat.class; 49 | } 50 | 51 | @Override 52 | public Class getSerDeClass() { 53 | return MessagePackSerDe.class; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/org/msgpack/hadoop/hive/serde2/MessagePackSerDe.java: -------------------------------------------------------------------------------- 1 | /* 2 | * MessagePack-Hadoop Integration 3 | * 4 | * Copyright (C) 2009-2011 MessagePack Project 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package org.msgpack.hadoop.hive.serde2; 20 | 21 | import java.util.ArrayList; 22 | import java.util.Arrays; 23 | import java.util.List; 24 | import java.util.Properties; 25 | import java.io.IOException; 26 | import java.io.DataOutputStream; 27 | import java.io.ByteArrayOutputStream; 28 | 29 | import org.apache.commons.codec.binary.Base64; 30 | import org.apache.commons.logging.Log; 31 | import org.apache.commons.logging.LogFactory; 32 | import org.apache.hadoop.conf.Configuration; 33 | import org.apache.hadoop.hive.serde.Constants; 34 | import org.apache.hadoop.hive.serde2.SerDe; 35 | import org.apache.hadoop.hive.serde2.SerDeException; 36 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 37 | import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector; 38 | import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.SerDeParameters; 39 | import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; 40 | import org.apache.hadoop.hive.serde2.lazy.LazyFactory; 41 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 42 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 43 | import org.apache.hadoop.io.Text; 44 | import org.apache.hadoop.io.Writable; 45 | 46 | import org.msgpack.hadoop.io.MessagePackWritable; 47 | import org.msgpack.hadoop.hive.serde2.lazy.LazyMessagePackRow; 48 | 49 | public class MessagePackSerDe implements SerDe { 50 | private static final Log LOG = LogFactory.getLog(MessagePackSerDe.class.getName()); 51 | 52 | private SerDeParameters serdeParams_; 53 | private ObjectInspector cachedObjectInspector_; 54 | private LazyMessagePackRow cachedMessagePackRow_; 55 | 56 | @Override 57 | public void initialize(Configuration conf, Properties tbl) throws SerDeException { 58 | String serdeName = getClass().getName(); 59 | serdeParams_ = LazySimpleSerDe.initSerdeParams(conf, tbl, serdeName); 60 | 61 | cachedObjectInspector_ = LazyFactory.createLazyStructInspector( 62 | serdeParams_.getColumnNames(), 63 | serdeParams_.getColumnTypes(), 64 | serdeParams_.getSeparators(), 65 | serdeParams_.getNullSequence(), 66 | serdeParams_.isLastColumnTakesRest(), 67 | serdeParams_.isEscaped(), 68 | serdeParams_.getEscapeChar()); 69 | 70 | cachedMessagePackRow_ = new LazyMessagePackRow((LazySimpleStructObjectInspector)cachedObjectInspector_); 71 | } 72 | 73 | @Override 74 | public ObjectInspector getObjectInspector() throws SerDeException { 75 | return cachedObjectInspector_; 76 | } 77 | 78 | @Override 79 | public Object deserialize(Writable blob) throws SerDeException { 80 | if (!(blob instanceof MessagePackWritable)) { 81 | throw new SerDeException(getClass().toString() 82 | + ": expects either MessagePackWritable object!"); 83 | } 84 | cachedMessagePackRow_.init((MessagePackWritable)blob); 85 | return cachedMessagePackRow_; 86 | } 87 | 88 | @Override 89 | public Class getSerializedClass() { 90 | return MessagePackWritable.class; 91 | } 92 | 93 | @Override 94 | public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException { 95 | LOG.info(obj.toString()); 96 | LOG.info(objInspector.toString()); 97 | return null; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/org/msgpack/hadoop/hive/serde2/lazy/LazyMessagePackRow.java: -------------------------------------------------------------------------------- 1 | /* 2 | * MessagePack-Hadoop Integration 3 | * 4 | * Copyright (C) 2009-2011 MessagePack 
Project
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package org.msgpack.hadoop.hive.serde2.lazy;
20 |
21 | import java.util.ArrayList;
22 | import java.util.Arrays;
23 | import java.util.List;
24 | import java.util.Properties;
25 | import java.io.IOException;
26 | import java.io.DataOutputStream;
27 | import java.io.ByteArrayOutputStream;
28 |
29 | import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
30 | import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
31 | import org.apache.hadoop.hive.serde2.lazy.LazyObject;
32 | import org.apache.hadoop.hive.serde2.lazy.LazyStruct;
33 | import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazyMapObjectInspector;
34 | import org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector;
35 | import org.apache.hadoop.hive.serde2.objectinspector.StructField;
36 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
37 |
38 | import org.msgpack.hadoop.io.MessagePackWritable;
39 |
40 | public class LazyMessagePackRow extends LazyStruct {
41 | private MessagePackWritable result_;
42 | private ArrayList<Object> cachedList_;
43 |
44 | public LazyMessagePackRow(LazySimpleStructObjectInspector oi) {
45 | super(oi);
46 | }
47 |
48 | public void init(MessagePackWritable r) {
49 | result_ = r;
50 | setParsed(false);
51 | }
52 |
53 | private void parse() {
54 | if (getFields() == null) {
55 | List<? extends StructField> fieldRefs = ((StructObjectInspector)getInspector()).getAllStructFieldRefs();
56 | setFields(new LazyObject[fieldRefs.size()]);
57 | for (int i = 0; i < getFields().length; i++) {
58 | getFields()[i] = LazyFactory.createLazyObject(fieldRefs.get(i).getFieldObjectInspector());
59 | }
60 | setFieldInited(new boolean[getFields().length]);
61 | }
62 | Arrays.fill(getFieldInited(), false);
63 | setParsed(true);
64 | }
65 |
66 | private Object uncheckedGetField(int fieldID) {
67 | if (!getFieldInited()[fieldID]) {
68 | getFieldInited()[fieldID] = true;
69 |
70 | ByteArrayRef ref = new ByteArrayRef();
71 | byte[] raw = result_.getRawBytes();
72 | ref.setData(raw);
73 | getFields()[fieldID].init(ref, 0, ref.getData().length);
74 | }
75 |
76 | return getFields()[fieldID].getObject();
77 | }
78 |
79 | @Override
80 | public Object getField(int fieldID) {
81 | if (!getParsed()) {
82 | parse();
83 | }
84 | return uncheckedGetField(fieldID);
85 | }
86 |
87 | @Override
88 | public ArrayList<Object> getFieldsAsList() {
89 | if (!getParsed()) {
90 | parse();
91 | }
92 | if (cachedList_ == null) {
93 | cachedList_ = new ArrayList<Object>();
94 | } else {
95 | cachedList_.clear();
96 | }
97 | for (int i = 0; i < getFields().length; i++) {
98 | cachedList_.add(uncheckedGetField(i));
99 | }
100 | return cachedList_;
101 | }
102 |
103 | @Override
104 | public Object getObject() {
105 | return this;
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/src/main/java/org/msgpack/hadoop/hive/udf/GenericUDTFMessagePackArray.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MessagePack-Hadoop Integration
3 | *
4 | * Copyright (C) 2009-2011 MessagePack Project
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package org.msgpack.hadoop.hive.udf;
20 |
21 | import java.util.ArrayList;
22 | import java.util.List;
23 |
24 | import org.apache.commons.logging.Log;
25 | import org.apache.commons.logging.LogFactory;
26 | import org.apache.hadoop.hive.ql.exec.Description;
27 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
28 | import org.apache.hadoop.hive.ql.metadata.HiveException;
29 | import org.apache.hadoop.hive.serde.Constants;
30 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
31 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
32 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
33 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
34 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
35 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
36 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
37 | import org.apache.hadoop.io.Writable;
38 | import org.apache.hadoop.io.ArrayWritable;
39 | import org.apache.hadoop.io.BooleanWritable;
40 | import org.apache.hadoop.io.FloatWritable;
41 | import org.apache.hadoop.io.DoubleWritable;
42 | import org.apache.hadoop.io.IntWritable;
43 | import org.apache.hadoop.io.LongWritable;
44 | import org.apache.hadoop.io.MapWritable;
45 | import org.apache.hadoop.io.Text;
46 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF;
47 | import org.apache.commons.codec.binary.Base64;
48 | import org.msgpack.MessagePackObject;
49 | import org.msgpack.MessageTypeException;
50 | import org.msgpack.MessagePack;
51 | import static org.msgpack.Templates.*;
52 |
53 | @Description(name = "msgpack_array",
54 | value = "_FUNC_(msgpackBinary, index1, index2, ..., indexN) - parse MessagePack raw binary into an array. 
" + 55 | "All the input parameters and output column types are string.") 56 | public class GenericUDTFMessagePackArray extends GenericUDTF { 57 | 58 | private static Log LOG = LogFactory.getLog(GenericUDTFMessagePackArray.class.getName()); 59 | 60 | int numCols; // number of output columns 61 | int[] indexes; // array of path expressions, each of which corresponds to a column 62 | Text[] retVals; // array of returned column values 63 | Text[] cols; // object pool of non-null Text, avoid creating objects all the time 64 | Object[] nullVals; // array of null column values 65 | ObjectInspector[] inputOIs; // input ObjectInspectors 66 | boolean pathParsed = false; 67 | boolean seenErrors = false; 68 | 69 | @Override 70 | public void close() throws HiveException { 71 | } 72 | 73 | @Override 74 | public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException { 75 | inputOIs = args; 76 | numCols = args.length - 1; 77 | 78 | if (numCols < 1) { 79 | throw new UDFArgumentException("msgpack_array() takes at least two arguments: " + 80 | "the MessagePack binary a key"); 81 | } 82 | 83 | if (!(args[0] instanceof StringObjectInspector)) { 84 | throw new UDFArgumentException("msgpack_array() takes string type for the first argument"); 85 | } 86 | 87 | for (int i = 1; i < args.length; ++i) { 88 | if (!(args[i] instanceof PrimitiveObjectInspector)) { 89 | throw new UDFArgumentException("msgpack_array()'s arguments have to be int type"); 90 | } 91 | } 92 | 93 | seenErrors = false; 94 | pathParsed = false; 95 | indexes = new int[numCols]; 96 | cols = new Text[numCols]; 97 | retVals = new Text[numCols]; 98 | nullVals = new Object[numCols]; 99 | 100 | for (int i = 0; i < numCols; ++i) { 101 | cols[i] = new Text(); 102 | //retVals[i] = cols[i]; 103 | nullVals[i] = null; 104 | } 105 | 106 | // construct output object inspector 107 | ArrayList fieldNames = new ArrayList(numCols); 108 | ArrayList fieldOIs = new ArrayList(numCols); 109 | for (int i = 0; i < numCols; ++i) { 110 | // column name can be anything since it will be named by UDTF as clause 111 | fieldNames.add("c" + i); 112 | // all returned type will be Text 113 | fieldOIs.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); 114 | } 115 | return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); 116 | } 117 | 118 | @Override 119 | public void process(Object[] o) throws HiveException { 120 | 121 | if (o[0] == null) { 122 | forward(nullVals); 123 | return; 124 | } 125 | // get the path expression for the 1st row only 126 | if (!pathParsed) { 127 | for (int i = 0;i < numCols; ++i) { 128 | indexes[i] = PrimitiveObjectInspectorUtils.getInt(o[i+1], (PrimitiveObjectInspector) inputOIs[i+1]); 129 | } 130 | pathParsed = true; 131 | } 132 | 133 | byte[] binary = ((StringObjectInspector) inputOIs[0]).getPrimitiveWritableObject(o[0]).getBytes(); 134 | if (binary == null) { 135 | forward(nullVals); 136 | return; 137 | } 138 | try { 139 | List array = (List) 140 | MessagePack.unpack(binary, tList(TAny)); 141 | for (int i = 0; i < numCols; ++i) { 142 | MessagePackObject obj = null; 143 | int index = indexes[i]; 144 | if(array.size() > index && index > 0) { 145 | obj = array.get(indexes[i]); 146 | } 147 | if(obj == null) { 148 | retVals[i] = null; 149 | } else { 150 | retVals[i] = setText(cols[i], obj); 151 | } 152 | } 153 | //for (int i = 0; i < numCols; ++i) { 154 | // if (jsonObj.isNull(indexes[i])) { 155 | // retVals[i] = null; 156 | // } else { 157 | // if (retVals[i] == null) { 158 | // 
retVals[i] = cols[i]; // use the object pool rather than creating a new object 159 | // } 160 | // retVals[i].set(jsonObj.getString(indexes[i])); 161 | // } 162 | //} 163 | forward(retVals); 164 | return; 165 | 166 | } catch (MessageTypeException e) { 167 | // type error, object is not an array 168 | if (!seenErrors) { 169 | LOG.error("The input is not an array: " + e + ". Skipping such error messages in the future."); 170 | seenErrors = true; 171 | } 172 | forward(nullVals); 173 | return; 174 | } catch (Exception e) { 175 | // parsing error, invalid MessagePack binary 176 | if (!seenErrors) { 177 | String base64 = new String(Base64.encodeBase64(binary)); 178 | LOG.error("The input is not a valid MessagePack binary: " + base64 + ". Skipping such error messages in the future."); 179 | seenErrors = true; 180 | } 181 | forward(nullVals); 182 | return; 183 | } catch (Throwable e) { 184 | LOG.error("MessagePack parsing/evaluation exception" + e); 185 | forward(nullVals); 186 | return; 187 | } 188 | } 189 | 190 | private Text setText(Text to, MessagePackObject obj) { 191 | if(obj.isBooleanType()) { 192 | if(obj.asBoolean()) { 193 | to.set("1"); 194 | } else { 195 | to.set("0"); 196 | } 197 | return to; 198 | 199 | } else if(obj.isIntegerType()) { 200 | to.set(Long.toString(obj.asLong())); 201 | return to; 202 | 203 | } else if(obj.isFloatType()) { 204 | to.set(Double.toString(obj.asDouble())); 205 | return to; 206 | 207 | } else if(obj.isArrayType()) { 208 | to.set(MessagePack.pack(obj)); 209 | return to; 210 | 211 | } else if(obj.isMapType()) { 212 | to.set(MessagePack.pack(obj)); 213 | return to; 214 | 215 | } else if(obj.isRawType()) { 216 | to.set(obj.asByteArray()); 217 | return to; 218 | 219 | } else { 220 | return null; 221 | } 222 | } 223 | 224 | @Override 225 | public String toString() { 226 | return "msgpack_array"; 227 | } 228 | } 229 | -------------------------------------------------------------------------------- /src/main/java/org/msgpack/hadoop/hive/udf/GenericUDTFMessagePackMap.java: -------------------------------------------------------------------------------- 1 | /* 2 | * MessagePack-Hadoop Integration 3 | * 4 | * Copyright (C) 2009-2011 MessagePack Project 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package org.msgpack.hadoop.hive.udf; 20 | 21 | import java.util.ArrayList; 22 | import java.util.Map; 23 | 24 | import org.apache.commons.logging.Log; 25 | import org.apache.commons.logging.LogFactory; 26 | import org.apache.hadoop.hive.ql.exec.Description; 27 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 28 | import org.apache.hadoop.hive.ql.metadata.HiveException; 29 | import org.apache.hadoop.hive.serde.Constants; 30 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 31 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 32 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 33 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 34 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; 35 | import org.apache.hadoop.io.Writable; 36 | import org.apache.hadoop.io.ArrayWritable; 37 | import org.apache.hadoop.io.BooleanWritable; 38 | import org.apache.hadoop.io.FloatWritable; 39 | import org.apache.hadoop.io.DoubleWritable; 40 | import org.apache.hadoop.io.IntWritable; 41 | import org.apache.hadoop.io.LongWritable; 42 | import org.apache.hadoop.io.MapWritable; 43 | import org.apache.hadoop.io.Text; 44 | import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; 45 | import org.apache.commons.codec.binary.Base64; 46 | import org.msgpack.MessagePackObject; 47 | import org.msgpack.MessageTypeException; 48 | import org.msgpack.MessagePack; 49 | import static org.msgpack.Templates.*; 50 | 51 | @Description(name = "msgpack_map", 52 | value = "_FUNC_(msgpackBinary, col1, col2, ..., colN) - parse MessagePack raw binary into a map. " + 53 | "All the input parameters and output column types are string.") 54 | public class GenericUDTFMessagePackMap extends GenericUDTF { 55 | 56 | private static Log LOG = LogFactory.getLog(GenericUDTFMessagePackMap.class.getName()); 57 | 58 | int numCols; // number of output columns 59 | String[] keys; // array of path expressions, each of which corresponds to a column 60 | Text[] retVals; // array of returned column values 61 | Text[] cols; // object pool of non-null Text, avoid creating objects all the time 62 | Object[] nullVals; // array of null column values 63 | ObjectInspector[] inputOIs; // input ObjectInspectors 64 | boolean pathParsed = false; 65 | boolean seenErrors = false; 66 | 67 | @Override 68 | public void close() throws HiveException { 69 | } 70 | 71 | @Override 72 | public StructObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException { 73 | inputOIs = args; 74 | numCols = args.length - 1; 75 | 76 | if (numCols < 1) { 77 | throw new UDFArgumentException("msgpack_map() takes at least two arguments: " + 78 | "the MessagePack binary a key"); 79 | } 80 | 81 | if (!(args[0] instanceof StringObjectInspector)) { 82 | throw new UDFArgumentException("msgpack_map() takes string type for the first argument"); 83 | } 84 | 85 | for (int i = 1; i < args.length; ++i) { 86 | if (!(args[i] instanceof StringObjectInspector)) { 87 | throw new UDFArgumentException("msgpack_map()'s keys have to be string type"); 88 | } 89 | } 90 | 91 | seenErrors = false; 92 | pathParsed = false; 93 | keys = new String[numCols]; 94 | cols = new Text[numCols]; 95 | retVals = new Text[numCols]; 96 | nullVals = new Object[numCols]; 97 | 98 | for (int i = 0; i < numCols; ++i) { 99 | cols[i] = new Text(); 100 | //retVals[i] = cols[i]; 101 | nullVals[i] = null; 102 | } 103 | 104 | // 
construct output object inspector 105 | ArrayList fieldNames = new ArrayList(numCols); 106 | ArrayList fieldOIs = new ArrayList(numCols); 107 | for (int i = 0; i < numCols; ++i) { 108 | // column name can be anything since it will be named by UDTF as clause 109 | fieldNames.add("c" + i); 110 | // all returned type will be Text 111 | fieldOIs.add(PrimitiveObjectInspectorFactory.writableStringObjectInspector); 112 | } 113 | return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); 114 | } 115 | 116 | @Override 117 | public void process(Object[] o) throws HiveException { 118 | 119 | if (o[0] == null) { 120 | forward(nullVals); 121 | return; 122 | } 123 | // get the path expression for the 1st row only 124 | if (!pathParsed) { 125 | for (int i = 0;i < numCols; ++i) { 126 | keys[i] = ((StringObjectInspector) inputOIs[i+1]).getPrimitiveJavaObject(o[i+1]); 127 | } 128 | pathParsed = true; 129 | } 130 | 131 | byte[] binary = ((StringObjectInspector) inputOIs[0]).getPrimitiveWritableObject(o[0]).getBytes(); 132 | if (binary == null) { 133 | forward(nullVals); 134 | return; 135 | } 136 | try { 137 | Map map = (Map) 138 | MessagePack.unpack(binary, tMap(TString,TAny)); 139 | for (int i = 0; i < numCols; ++i) { 140 | MessagePackObject obj = map.get(keys[i]); 141 | if(obj == null) { 142 | retVals[i] = null; 143 | } else { 144 | retVals[i] = setText(cols[i], obj); 145 | } 146 | } 147 | //for (int i = 0; i < numCols; ++i) { 148 | // if (jsonObj.isNull(keys[i])) { 149 | // retVals[i] = null; 150 | // } else { 151 | // if (retVals[i] == null) { 152 | // retVals[i] = cols[i]; // use the object pool rather than creating a new object 153 | // } 154 | // retVals[i].set(jsonObj.getString(keys[i])); 155 | // } 156 | //} 157 | forward(retVals); 158 | return; 159 | 160 | } catch (MessageTypeException e) { 161 | // type error, object is not a map 162 | if (!seenErrors) { 163 | LOG.error("The input is not a map: " + e + ". Skipping such error messages in the future."); 164 | seenErrors = true; 165 | } 166 | forward(nullVals); 167 | return; 168 | } catch (Exception e) { 169 | // parsing error, invalid MessagePack binary 170 | if (!seenErrors) { 171 | String base64 = new String(Base64.encodeBase64(binary)); 172 | LOG.error("The input is not a valid MessagePack binary: " + base64 + ". 
Skipping such error messages in the future."); 173 | seenErrors = true; 174 | } 175 | forward(nullVals); 176 | return; 177 | } catch (Throwable e) { 178 | LOG.error("MessagePack parsing/evaluation exception" + e); 179 | forward(nullVals); 180 | return; 181 | } 182 | } 183 | 184 | private Text setText(Text to, MessagePackObject obj) { 185 | if(obj.isBooleanType()) { 186 | if(obj.asBoolean()) { 187 | to.set("1"); 188 | } else { 189 | to.set("0"); 190 | } 191 | return to; 192 | 193 | } else if(obj.isIntegerType()) { 194 | to.set(Long.toString(obj.asLong())); 195 | return to; 196 | 197 | } else if(obj.isFloatType()) { 198 | to.set(Double.toString(obj.asDouble())); 199 | return to; 200 | 201 | } else if(obj.isArrayType()) { 202 | to.set(MessagePack.pack(obj)); 203 | return to; 204 | 205 | } else if(obj.isMapType()) { 206 | to.set(MessagePack.pack(obj)); 207 | return to; 208 | 209 | } else if(obj.isRawType()) { 210 | to.set(obj.asByteArray()); 211 | return to; 212 | 213 | } else { 214 | return null; 215 | } 216 | } 217 | 218 | @Override 219 | public String toString() { 220 | return "msgpack_map"; 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/main/java/org/msgpack/hadoop/io/MessagePackWritable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * MessagePack-Hadoop Integration 3 | * 4 | * Copyright (C) 2009-2011 MessagePack Project 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | 19 | package org.msgpack.hadoop.io; 20 | 21 | import java.io.DataInput; 22 | import java.io.DataOutput; 23 | import java.io.IOException; 24 | 25 | import org.apache.hadoop.io.BytesWritable; 26 | import org.apache.hadoop.io.WritableComparable; 27 | 28 | import org.msgpack.MessagePack; 29 | import org.msgpack.MessagePackObject; 30 | 31 | /** 32 | * A Hadoop Writable wrapper for MessagePack (untyped). 33 | */ 34 | public class MessagePackWritable implements WritableComparable { 35 | protected MessagePackObject obj_ = null; 36 | 37 | public MessagePackWritable() {} 38 | 39 | public MessagePackWritable(MessagePackObject obj) { 40 | obj_ = obj; 41 | } 42 | 43 | public void set(MessagePackObject obj) { obj_ = obj; } 44 | 45 | public MessagePackObject get() { return obj_; } 46 | 47 | public byte[] getRawBytes() { 48 | return MessagePack.pack(obj_); 49 | } 50 | 51 | public void write(DataOutput out) throws IOException { 52 | assert(obj_ != null); 53 | byte[] raw = MessagePack.pack(obj_); 54 | if (raw == null) return; 55 | out.writeInt(raw.length); 56 | out.write(raw, 0, raw.length); 57 | } 58 | 59 | @SuppressWarnings("unchecked") 60 | public void readFields(DataInput in) throws IOException { 61 | int size = in.readInt(); 62 | if (size > 0) { 63 | byte[] raw = new byte[size]; 64 | in.readFully(raw, 0, size); 65 | // TODO: 2011/05/07 Kazuki Ohta 66 | // Want to avoid extra allocation here, but MessagePackObject is 67 | // abstract. 
68 | obj_ = MessagePack.unpack(raw);
69 | assert(obj_ != null);
70 | }
71 | }
72 |
73 | @Override
74 | public int compareTo(MessagePackWritable other) {
75 | // TODO: 2010/11/09 Kazuki Ohta
76 | // compare without packing
77 | byte[] raw1 = MessagePack.pack(this.get());
78 | byte[] raw2 = MessagePack.pack(other.get());
79 | return BytesWritable.Comparator.compareBytes(raw1, 0, raw1.length, raw2, 0, raw2.length);
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/src/main/java/org/msgpack/hadoop/mapred/MessagePackInputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MessagePack-Hadoop Integration
3 | *
4 | * Copyright (C) 2009-2011 MessagePack Project
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package org.msgpack.hadoop.mapred;
20 |
21 | import java.io.IOException;
22 |
23 | import org.apache.hadoop.fs.FileSystem;
24 | import org.apache.hadoop.io.LongWritable;
25 | import org.apache.hadoop.fs.Path;
26 | import org.apache.hadoop.mapred.InputSplit;
27 | import org.apache.hadoop.mapred.RecordReader;
28 | import org.apache.hadoop.mapred.JobConf;
29 | import org.apache.hadoop.mapred.Reporter;
30 | import org.apache.hadoop.mapred.FileInputFormat;
31 |
32 | import org.msgpack.hadoop.io.MessagePackWritable;
33 | import org.msgpack.hadoop.mapred.MessagePackRecordReader;
34 |
35 | public class MessagePackInputFormat extends FileInputFormat<LongWritable, MessagePackWritable> {
36 | @Override
37 | protected boolean isSplitable(FileSystem fs, Path filename) {
38 | return false;
39 | }
40 |
41 | @Override
42 | public RecordReader<LongWritable, MessagePackWritable> getRecordReader(InputSplit split,
43 | JobConf conf, Reporter reporter)
44 | throws IOException {
45 | return new MessagePackRecordReader(split, conf);
46 | }
47 | }
48 |
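Usage sketch: with the old ("mapred") API, the input and output formats are
wired through JobConf. The following minimal, untested driver (the MsgpackPassJob
and PassMapper names are hypothetical) copies MessagePack records from one
location to another:

  import java.io.IOException;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.LongWritable;
  import org.apache.hadoop.io.NullWritable;
  import org.apache.hadoop.mapred.FileInputFormat;
  import org.apache.hadoop.mapred.FileOutputFormat;
  import org.apache.hadoop.mapred.JobClient;
  import org.apache.hadoop.mapred.JobConf;
  import org.apache.hadoop.mapred.MapReduceBase;
  import org.apache.hadoop.mapred.Mapper;
  import org.apache.hadoop.mapred.OutputCollector;
  import org.apache.hadoop.mapred.Reporter;
  import org.msgpack.hadoop.io.MessagePackWritable;
  import org.msgpack.hadoop.mapred.MessagePackInputFormat;
  import org.msgpack.hadoop.mapred.MessagePackOutputFormat;

  public class MsgpackPassJob {
      // Drops the byte-offset key so the pairs match MessagePackOutputFormat.
      public static class PassMapper extends MapReduceBase
              implements Mapper<LongWritable, MessagePackWritable, NullWritable, MessagePackWritable> {
          public void map(LongWritable key, MessagePackWritable val,
                  OutputCollector<NullWritable, MessagePackWritable> out, Reporter reporter)
                  throws IOException {
              out.collect(NullWritable.get(), val);
          }
      }

      public static void main(String[] args) throws Exception {
          JobConf conf = new JobConf(MsgpackPassJob.class);
          conf.setJobName("msgpack-pass");
          conf.setInputFormat(MessagePackInputFormat.class);
          conf.setOutputFormat(MessagePackOutputFormat.class);
          conf.setMapperClass(PassMapper.class);
          conf.setNumReduceTasks(0); // map-only: records go straight to the output format
          conf.setOutputKeyClass(NullWritable.class);
          conf.setOutputValueClass(MessagePackWritable.class);
          FileInputFormat.setInputPaths(conf, new Path(args[0]));
          FileOutputFormat.setOutputPath(conf, new Path(args[1]));
          JobClient.runJob(conf);
      }
  }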
--------------------------------------------------------------------------------
/src/main/java/org/msgpack/hadoop/mapred/MessagePackOutputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MessagePack-Hadoop Integration
3 | *
4 | * Copyright (C) 2009-2011 MessagePack Project
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package org.msgpack.hadoop.mapred;
20 |
21 | import java.io.IOException;
22 |
23 | import org.apache.hadoop.fs.FileSystem;
24 | import org.apache.hadoop.io.LongWritable;
25 | import org.apache.hadoop.io.NullWritable;
26 | import org.apache.hadoop.fs.Path;
27 | import org.apache.hadoop.fs.FSDataOutputStream;
28 | import org.apache.hadoop.mapred.InputSplit;
29 | import org.apache.hadoop.mapred.RecordWriter;
30 | import org.apache.hadoop.mapred.JobConf;
31 | import org.apache.hadoop.mapred.Reporter;
32 | import org.apache.hadoop.mapred.FileOutputFormat;
33 | import org.apache.hadoop.util.Progressable;
34 |
35 | import org.msgpack.hadoop.io.MessagePackWritable;
36 | import org.msgpack.hadoop.mapred.MessagePackRecordWriter;
37 |
38 | public class MessagePackOutputFormat extends FileOutputFormat<NullWritable, MessagePackWritable> {
39 | @Override
40 | public RecordWriter<NullWritable, MessagePackWritable> getRecordWriter(FileSystem ignored,
41 | JobConf job, String name, Progressable progress)
42 | throws IOException {
43 | Path file = FileOutputFormat.getTaskOutputPath(job, name);
44 | FileSystem fs = file.getFileSystem(job);
45 | FSDataOutputStream fileOut = fs.create(file, progress);
46 | return new MessagePackRecordWriter(fileOut);
47 | }
48 | }
49 |
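The output side writes each record's raw MessagePack bytes back to back, with
no delimiter, which is exactly the layout MessagePackInputFormat consumes. A
standalone, untested sketch using the record writer defined later in this
package (MessagePack.pack(Object) from the msgpack 0.5-devel API is assumed
for producing a test object):

  import java.io.DataOutputStream;
  import java.io.FileOutputStream;
  import org.apache.hadoop.io.NullWritable;
  import org.msgpack.MessagePack;
  import org.msgpack.hadoop.io.MessagePackWritable;
  import org.msgpack.hadoop.mapred.MessagePackRecordWriter;

  public class WriterDemo {
      public static void main(String[] args) throws Exception {
          DataOutputStream out = new DataOutputStream(new FileOutputStream("records.mp"));
          MessagePackRecordWriter writer = new MessagePackRecordWriter(out);

          MessagePackWritable value = new MessagePackWritable();
          value.set(MessagePack.unpack(MessagePack.pack("record-1"))); // assumed 0.5-devel API
          writer.write(NullWritable.get(), value); // appends the raw bytes, no delimiter

          writer.close(null); // closes the underlying stream
      }
  }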
--------------------------------------------------------------------------------
/src/main/java/org/msgpack/hadoop/mapred/MessagePackRecordReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MessagePack-Hadoop Integration
3 | *
4 | * Copyright (C) 2009-2011 MessagePack Project
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package org.msgpack.hadoop.mapred;
20 |
21 | import java.io.IOException;
22 | import java.io.InputStream;
23 |
24 | import org.apache.hadoop.conf.Configuration;
25 | import org.apache.hadoop.fs.FSDataInputStream;
26 | import org.apache.hadoop.fs.FileSystem;
27 | import org.apache.hadoop.fs.Path;
28 | import org.apache.hadoop.mapred.InputSplit;
29 | import org.apache.hadoop.mapred.RecordReader;
30 | import org.apache.hadoop.mapred.JobConf;
31 | import org.apache.hadoop.mapred.FileSplit;
32 | import org.apache.hadoop.io.LongWritable;
33 |
34 | import org.msgpack.MessagePack;
35 | import org.msgpack.Unpacker;
36 | import org.msgpack.MessagePackObject;
37 | import org.msgpack.hadoop.io.MessagePackWritable;
38 |
39 | public class MessagePackRecordReader implements RecordReader<LongWritable, MessagePackWritable> {
40 | private Unpacker unpacker_;
41 |
42 | protected long start_;
43 | protected long pos_;
44 | protected long end_;
45 | private FSDataInputStream fileIn_;
46 |
47 | public MessagePackRecordReader(InputSplit genericSplit, JobConf conf) throws IOException {
48 | FileSplit split = (FileSplit)genericSplit;
49 | final Path file = split.getPath();
50 |
51 | // Open the file
52 | FileSystem fs = file.getFileSystem(conf);
53 | fileIn_ = fs.open(split.getPath());
54 |
55 | // Create streaming unpacker
56 | unpacker_ = new Unpacker(fileIn_);
57 |
58 | // Seek to the start of the split
59 | start_ = split.getStart();
60 | end_ = start_ + split.getLength();
61 | pos_ = start_;
62 | }
63 |
64 | public float getProgress() {
65 | if (start_ == end_) {
66 | return 0.0f;
67 | } else {
68 | return Math.min(1.0f, (pos_ - start_) / (float) (end_ - start_));
69 | }
70 | }
71 |
72 | public long getPos() {
73 | return pos_;
74 | }
75 |
76 | public synchronized void close() throws IOException {
77 | }
78 |
79 | public LongWritable createKey() {
80 | return new LongWritable();
81 | }
82 |
83 | public MessagePackWritable createValue() {
84 | return new MessagePackWritable();
85 | }
86 |
87 | public boolean next(LongWritable key, MessagePackWritable val)
88 | throws IOException {
89 | for (MessagePackObject obj : unpacker_) {
90 | key.set(fileIn_.getPos());
91 | val.set(obj);
92 | return true;
93 | }
94 | return false;
95 | }
96 | }
97 |
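The reader above just pulls the next object off the streaming Unpacker. The
same pattern in isolation looks like this (a sketch; MessagePack.pack(Object)
is again assumed for producing test bytes):

  import java.io.ByteArrayInputStream;
  import org.msgpack.MessagePack;
  import org.msgpack.MessagePackObject;
  import org.msgpack.Unpacker;

  public class UnpackerDemo {
      public static void main(String[] args) throws Exception {
          // Two records back to back, as they would appear in a MessagePack file.
          byte[] a = MessagePack.pack("hello"); // assumed 0.5-devel API
          byte[] b = MessagePack.pack("world");
          byte[] stream = new byte[a.length + b.length];
          System.arraycopy(a, 0, stream, 0, a.length);
          System.arraycopy(b, 0, stream, a.length, b.length);

          Unpacker unpacker = new Unpacker(new ByteArrayInputStream(stream));
          // Iterating the Unpacker yields one MessagePackObject per record,
          // exactly as MessagePackRecordReader.next() does.
          for (MessagePackObject obj : unpacker) {
              System.out.println(obj);
          }
      }
  }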
--------------------------------------------------------------------------------
/src/main/java/org/msgpack/hadoop/mapred/MessagePackRecordWriter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * MessagePack-Hadoop Integration
3 | *
4 | * Copyright (C) 2009-2011 MessagePack Project
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package org.msgpack.hadoop.mapred;
20 |
21 | import java.io.DataOutputStream;
22 | import java.io.IOException;
23 | import java.io.InputStream;
24 |
25 | import org.apache.hadoop.conf.Configuration;
26 | import org.apache.hadoop.fs.FSDataInputStream;
27 | import org.apache.hadoop.fs.FileSystem;
28 | import org.apache.hadoop.fs.Path;
29 | import org.apache.hadoop.mapred.InputSplit;
30 | import org.apache.hadoop.mapred.RecordWriter;
31 | import org.apache.hadoop.mapred.JobConf;
32 | import org.apache.hadoop.mapred.Reporter;
33 | import org.apache.hadoop.io.NullWritable;
34 |
35 | import org.msgpack.MessagePack;
36 | import org.msgpack.Unpacker;
37 | import org.msgpack.MessagePackObject;
38 | import org.msgpack.hadoop.io.MessagePackWritable;
39 |
40 | public class MessagePackRecordWriter implements RecordWriter<NullWritable, MessagePackWritable> {
41 | protected DataOutputStream out_;
42 |
43 | public MessagePackRecordWriter(DataOutputStream out) throws IOException {
44 | out_ = out;
45 | }
46 |
47 | public synchronized void write(NullWritable key, MessagePackWritable value) throws IOException {
48 | out_.write(value.getRawBytes());
49 | }
50 |
51 | public synchronized void close(Reporter reporter) throws IOException {
52 | out_.close();
53 | }
54 | }
55 |
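Both the mapred and mapreduce packages exchange records through
MessagePackWritable, which serializes as a 4-byte length prefix followed by
the raw MessagePack bytes. A round-trip sketch (same assumption about
MessagePack.pack(Object)):

  import java.io.ByteArrayInputStream;
  import java.io.ByteArrayOutputStream;
  import java.io.DataInputStream;
  import java.io.DataOutputStream;
  import org.msgpack.MessagePack;
  import org.msgpack.hadoop.io.MessagePackWritable;

  public class WritableRoundTrip {
      public static void main(String[] args) throws Exception {
          MessagePackWritable written = new MessagePackWritable();
          written.set(MessagePack.unpack(MessagePack.pack("sample"))); // assumed 0.5-devel API

          // write() emits the length prefix plus the packed bytes.
          ByteArrayOutputStream buf = new ByteArrayOutputStream();
          written.write(new DataOutputStream(buf));

          // readFields() restores an equivalent object on the other side.
          MessagePackWritable read = new MessagePackWritable();
          read.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
          System.out.println(read.compareTo(written) == 0); // prints true
      }
  }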
/src/main/java/org/msgpack/hadoop/mapreduce/input/MessagePackInputFormat.java:
--------------------------------------------------------------------------------
/*
 * MessagePack-Hadoop Integration
 *
 * Copyright (C) 2009-2011 MessagePack Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.msgpack.hadoop.mapreduce.input;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.msgpack.hadoop.io.MessagePackWritable;

/**
 * A new-API (mapreduce) input format for files of concatenated MessagePack
 * objects. Files are never split, because the format has no sync markers
 * from which a reader could recover a record boundary mid-file.
 */
public class MessagePackInputFormat extends FileInputFormat<LongWritable, MessagePackWritable> {
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    @Override
    public RecordReader<LongWritable, MessagePackWritable> createRecordReader(
            InputSplit split, TaskAttemptContext taskAttempt)
            throws IOException, InterruptedException {
        return new MessagePackRecordReader();
    }
}

--------------------------------------------------------------------------------
/src/main/java/org/msgpack/hadoop/mapreduce/input/MessagePackRecordReader.java:
--------------------------------------------------------------------------------
/*
 * MessagePack-Hadoop Integration
 *
 * Copyright (C) 2009-2011 MessagePack Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.msgpack.hadoop.mapreduce.input;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.msgpack.MessagePackObject;
import org.msgpack.Unpacker;
import org.msgpack.hadoop.io.MessagePackWritable;

/**
 * A new-API (mapreduce) record reader that streams MessagePack objects out of
 * a file split. Keys are the byte position of the stream after each record;
 * values are the deserialized objects.
 */
public class MessagePackRecordReader extends RecordReader<LongWritable, MessagePackWritable> {
    private Unpacker unpacker_;

    private final LongWritable key_ = new LongWritable(0);
    private final MessagePackWritable val_;

    protected long start_;
    protected long pos_;
    protected long end_;
    private FSDataInputStream fileIn_;

    public MessagePackRecordReader() {
        val_ = new MessagePackWritable();
    }

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        final Path file = split.getPath();
        Configuration conf = context.getConfiguration();

        // Open the file
        FileSystem fs = file.getFileSystem(conf);
        fileIn_ = fs.open(file);

        // Create a streaming unpacker on top of the file stream
        unpacker_ = new Unpacker(fileIn_);

        // Record the split boundaries. MessagePackInputFormat marks files as
        // non-splittable, so the split always starts at offset 0 and no seek
        // is required here.
        start_ = split.getStart();
        end_ = start_ + split.getLength();
        pos_ = start_;
    }

    @Override
    public float getProgress() {
        if (start_ == end_) {
            return 0.0f;
        } else {
            return Math.min(1.0f, (pos_ - start_) / (float) (end_ - start_));
        }
    }

    @Override
    public synchronized void close() throws IOException {
        if (fileIn_ != null) {
            fileIn_.close();
        }
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key_;
    }

    @Override
    public MessagePackWritable getCurrentValue() throws IOException, InterruptedException {
        return val_;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // The unpacker is iterable over the remaining objects in the stream;
        // take at most one object per call.
        for (MessagePackObject obj : unpacker_) {
            pos_ = fileIn_.getPos();
            key_.set(pos_);
            val_.set(obj);
            return true;
        }
        return false;
    }
}

--------------------------------------------------------------------------------
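The reader above delivers (position, object) pairs to user code. A minimal, hypothetical mapper over those records might look like the following; PassThroughMapper is an illustrative name, and it simply drops the position key so records can be fed straight into the output format that follows:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;

import org.msgpack.hadoop.io.MessagePackWritable;

// Sketch only: re-emits each MessagePack record under a NullWritable key.
public class PassThroughMapper
        extends Mapper<LongWritable, MessagePackWritable, NullWritable, MessagePackWritable> {
    @Override
    protected void map(LongWritable pos, MessagePackWritable record, Context context)
            throws IOException, InterruptedException {
        context.write(NullWritable.get(), record);
    }
}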
/src/main/java/org/msgpack/hadoop/mapreduce/output/MessagePackOutputFormat.java:
--------------------------------------------------------------------------------
/*
 * MessagePack-Hadoop Integration
 *
 * Copyright (C) 2009-2011 MessagePack Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.msgpack.hadoop.mapreduce.output;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.msgpack.hadoop.io.MessagePackWritable;

/**
 * A new-API (mapreduce) output format that writes values as raw MessagePack
 * bytes. Keys are ignored by the record writer.
 */
public class MessagePackOutputFormat extends FileOutputFormat<NullWritable, MessagePackWritable> {
    @Override
    public RecordWriter<NullWritable, MessagePackWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        Path file = getDefaultWorkFile(job, "");
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream fileOut = fs.create(file, false);
        return new MessagePackRecordWriter(fileOut);
    }
}

--------------------------------------------------------------------------------
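Putting the input and output halves together, a hedged sketch of a map-only driver; the paths are placeholders and PassThroughMapper is the sketch shown after the record reader above:

// Sketch only: copy MessagePack records from "in" to "out" untouched.
Job job = new Job(new Configuration(), "msgpack-copy");
job.setJarByClass(PassThroughMapper.class);
job.setMapperClass(PassThroughMapper.class);
job.setInputFormatClass(MessagePackInputFormat.class);    // ...mapreduce.input
job.setOutputFormatClass(MessagePackOutputFormat.class);  // ...mapreduce.output
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(MessagePackWritable.class);
job.setNumReduceTasks(0);
FileInputFormat.setInputPaths(job, new Path("in"));
FileOutputFormat.setOutputPath(job, new Path("out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);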
/src/main/java/org/msgpack/hadoop/mapreduce/output/MessagePackRecordWriter.java:
--------------------------------------------------------------------------------
/*
 * MessagePack-Hadoop Integration
 *
 * Copyright (C) 2009-2011 MessagePack Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.msgpack.hadoop.mapreduce.output;

import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import org.msgpack.hadoop.io.MessagePackWritable;

/**
 * A new-API (mapreduce) record writer that appends the raw MessagePack bytes
 * of each value to the output stream. Keys are ignored.
 */
public class MessagePackRecordWriter extends RecordWriter<NullWritable, MessagePackWritable> {
    protected final DataOutputStream out_;

    public MessagePackRecordWriter(DataOutputStream out) {
        out_ = out;
    }

    @Override
    public void write(NullWritable key, MessagePackWritable val) throws IOException, InterruptedException {
        // MessagePack values are self-delimiting, so records can simply be
        // written back to back.
        out_.write(val.getRawBytes());
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        out_.close();
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/msgpack/hadoop/hive/serde2/TestMessagePackSerDe.java:
--------------------------------------------------------------------------------
/*
 * MessagePack-Hadoop Integration
 *
 * Copyright (C) 2009-2011 MessagePack Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.msgpack.hadoop.hive.serde2;

import java.util.List;
import java.util.Properties;

import junit.framework.TestCase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazyPrimitive;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Text;

import org.msgpack.MessagePack;
import org.msgpack.hadoop.io.MessagePackWritable;

/**
 * Tests the MessagePackSerDe class.
 */
public class TestMessagePackSerDe extends TestCase {
    public void testMessagePackSerDe() throws Exception {
        // Create and initialize the SerDe
        MessagePackSerDe serDe = new MessagePackSerDe();
        Configuration conf = new Configuration();
        Properties tbl = createProperties();
        serDe.initialize(conf, tbl);

        // A single packed integer should come back as the raw bytes of the
        // one string column declared below.
        byte[] raw = MessagePack.pack(10);
        MessagePackWritable r = new MessagePackWritable(MessagePack.unpack(raw));
        Object[] expectedFieldsData = {
            new Text(raw),
        };
        deserializeAndCheck(serDe, r, expectedFieldsData);
    }

    private void deserializeAndCheck(
            MessagePackSerDe serDe, MessagePackWritable r,
            Object[] expectedFieldsData) throws SerDeException {

        // Get the row structure
        StructObjectInspector oi = (StructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();
        assertEquals(1, fieldRefs.size());

        // Deserialize and compare each field with the expected value
        Object row = serDe.deserialize(r);
        for (int i = 0; i < fieldRefs.size(); i++) {
            Object fieldData = oi.getStructFieldData(row, fieldRefs.get(i));
            if (fieldData != null) {
                fieldData = ((LazyPrimitive<?, ?>) fieldData).getWritableObject();
            }
            assertEquals("Field " + i, expectedFieldsData[i], fieldData);
        }
    }

    private Properties createProperties() {
        Properties tbl = new Properties();
        // Table schema: a single string column named "v"
        tbl.setProperty(Constants.SERIALIZATION_FORMAT, "9");
        tbl.setProperty("columns", "v");
        tbl.setProperty("columns.types", "string");
        return tbl;
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/msgpack/hadoop/io/TestMessagePackWritable.java:
--------------------------------------------------------------------------------
/*
 * MessagePack-Hadoop Integration
 *
 * Copyright (C) 2009-2011 MessagePack Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.msgpack.hadoop.io;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

import junit.framework.TestCase;

import org.msgpack.MessagePack;
import org.msgpack.MessagePackObject;
import org.msgpack.Templates;

/**
 * Tests the MessagePackWritable class.
 */
public class TestMessagePackWritable extends TestCase {
    public void testMessagePackWritable() throws Exception {
        int n = 100;

        // Serialize n writables back to back into a byte buffer.
        ByteArrayOutputStream bo = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bo);
        for (int i = 0; i < n; i++) {
            byte[] raw = MessagePack.pack(i);
            MessagePackObject obj = MessagePack.unpack(raw);
            MessagePackWritable r1 = new MessagePackWritable(obj);
            r1.write(out);
        }
        byte[] serialized = bo.toByteArray();

        // Read them back through a single reusable writable and verify that
        // each value round-trips intact.
        MessagePackWritable r2 = new MessagePackWritable();
        ByteArrayInputStream bi = new ByteArrayInputStream(serialized);
        DataInputStream in = new DataInputStream(bi);
        for (int i = 0; i < n; i++) {
            r2.readFields(in);
            assertEquals((long) i, r2.get().convert(Templates.TLong));
        }
    }
}

--------------------------------------------------------------------------------
/src/test/java/org/msgpack/hadoop/mapreduce/input/TestMessagePackInputFormat.java:
--------------------------------------------------------------------------------
/*
 * MessagePack-Hadoop Integration
 *
 * Copyright (C) 2009-2011 MessagePack Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.msgpack.hadoop.mapreduce.input;

import java.io.BufferedOutputStream;
import java.util.List;
import java.util.Random;

import junit.framework.TestCase;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.msgpack.MessagePack;
import org.msgpack.MessagePackObject;
import org.msgpack.hadoop.io.MessagePackWritable;

/**
 * Test cases for MessagePackInputFormat.
 */
public class TestMessagePackInputFormat extends TestCase {
    private static final int MAX_LENGTH = 200;

    private static Configuration defaultConf = new Configuration();
    private static FileSystem localFs = null;

    private static Path workDir =
        new Path(new Path(System.getProperty("test.build.data", "."), "data"),
                 "TestMessagePackInputFormat");

    public void testFormat() throws Exception {
        localFs = FileSystem.getLocal(defaultConf);
        localFs.delete(workDir, true);

        Job job = new Job(new Configuration(defaultConf));
        Path file = new Path(workDir, "test.mp");

        int seed = new Random().nextInt();
        Random random = new Random(seed);

        // Try a variety of file lengths.
        for (int length = 0; length < MAX_LENGTH;
             length += random.nextInt(MAX_LENGTH / 10) + 1) {
            // Create a file with `length` packed entries.
            BufferedOutputStream writer = new BufferedOutputStream(localFs.create(file));
            try {
                for (int i = 0; i < length; i++) {
                    long val = i;
                    byte[] raw = MessagePack.pack(val);
                    writer.write(raw, 0, raw.length);
                }
            } finally {
                writer.close();
            }
            checkFormat(job);
        }
    }

    void checkFormat(Job job) throws Exception {
        TaskAttemptContext attemptContext = new TaskAttemptContext(job.getConfiguration(),
                new TaskAttemptID("123", 0, false, 1, 2));

        MessagePackInputFormat format = new MessagePackInputFormat();
        FileInputFormat.setInputPaths(job, workDir);

        // The format is non-splittable, so a single file yields a single split.
        List<InputSplit> splits = format.getSplits(job);
        assertEquals(1, splits.size());
        for (int j = 0; j < splits.size(); j++) {
            RecordReader<LongWritable, MessagePackWritable> reader =
                format.createRecordReader(splits.get(j), attemptContext);
            reader.initialize(splits.get(j), attemptContext);

            int count = 0;
            try {
                // Each record should come back in order with the value we packed.
                while (reader.nextKeyValue()) {
                    LongWritable key = reader.getCurrentKey();
                    MessagePackWritable val = reader.getCurrentValue();
                    MessagePackObject obj = val.get();
                    assertEquals(count, obj.asLong());
                    count++;
                }
            } finally {
                reader.close();
            }
        }
    }
}

--------------------------------------------------------------------------------