├── .gitignore ├── LICENSE ├── README.md ├── Resources ├── HiveKudu-Handler-0.0.1.jar ├── async-1.3.1.jar ├── kudu-client-0.6.0.jar └── kudu-mapreduce-0.7.1.jar ├── doc ├── DesignDocument.md ├── README.md ├── UserGuide.md └── figures │ ├── StorageHandlerDesign.png │ └── hivekudu-design.png ├── pom.xml └── src └── main └── java └── org ├── apache └── hadoop │ └── hive │ └── kududb │ └── KuduHandler │ ├── HiveKuduBridgeUtils.java │ ├── HiveKuduConstants.java │ ├── HiveKuduSerDe.java │ ├── HiveKuduWritable.java │ └── KuduStorageHandler.java └── kududb └── mapred ├── HiveKuduTableInputFormat.java └── HiveKuduTableOutputFormat.java /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Java class files 3 | *.class 4 | 5 | # Windows thumbnail db 6 | Thumbs.db 7 | 8 | # OSX files 9 | .DS_Store 10 | 11 | # Eclipse project files 12 | .classpath 13 | .project 14 | 15 | # IntelliJ 16 | *.iml 17 | .idea 18 | /sync/app/build/ 19 | 20 | #Maven target 21 | target/ 22 | 23 | # Project 24 | /sync/app/test 25 | 26 | #test data 27 | data/ 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Update April 29th 2016 2 | Hive on Spark is working but there is a connection drop in my InputFormat, which is currently running on a Band-Aid. Please use branch-0.0.2 if you want to use Hive on Spark. 3 | 4 | # HiveKudu-Handler 5 | Hive Kudu Storage Handler, Input & Output format, Writable and SerDe 6 | 7 | This is the first release of Hive on Kudu. 
8 | 9 | I have placed the jars in the Resource folder which you can add in hive and test. 10 | 11 | If you would like to build from source then make install and use "HiveKudu-Handler-0.0.1.jar" to add in hive cli or hiveserver2 lib path. 12 | 13 | ## Working Test case 14 | ### simple_test.sql 15 | ```sql 16 | add jar HiveKudu-Handler-0.0.1.jar; 17 | add jar kudu-client-0.6.0.jar; 18 | add jar async-1.3.1.jar; 19 | 20 | set hive.cli.print.header=true; 21 | 22 | CREATE TABLE if not exists test_drop ( 23 | id INT, 24 | name STRING 25 | ) 26 | stored by 'org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler' 27 | TBLPROPERTIES( 28 | 'kudu.table_name' = 'test_drop', 29 | 'kudu.master_addresses' = 'ip-172-31-56-74.ec2.internal:7051', 30 | 'kudu.key_columns' = 'id' 31 | ); 32 | 33 | describe formatted test_drop; 34 | 35 | insert into test_drop values (1, 'a'), (2, 'b'), (3, 'a'); 36 | 37 | select count(*) from test_drop; 38 | 39 | select id from test_Drop where name = 'a'; 40 | 41 | select name, count(*) from test_drop group by name; 42 | 43 | drop table test_Drop; 44 | ``` 45 | 46 | ### Output of simple test 47 | ``` 48 | 49 | Logging initialized using configuration in jar:file:/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/jars/hive-common-1.1.0-cdh5.5.2.jar!/hive-log4j.properties 50 | add jar HiveKudu-Handler-0.0.1.jar 51 | Added [HiveKudu-Handler-0.0.1.jar] to class path 52 | Added resources: [HiveKudu-Handler-0.0.1.jar] 53 | add jar kudu-client-0.6.0.jar 54 | Added [kudu-client-0.6.0.jar] to class path 55 | Added resources: [kudu-client-0.6.0.jar] 56 | add jar async-1.3.1.jar 57 | Added [async-1.3.1.jar] to class path 58 | Added resources: [async-1.3.1.jar] 59 | set hive.cli.print.header=true 60 | 61 | 62 | CREATE TABLE if not exists test_drop ( 63 | id INT, 64 | name STRING 65 | ) 66 | stored by 'org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler' 67 | TBLPROPERTIES( 68 | 'kudu.table_name' = 'test_drop', 69 | 'kudu.master_addresses' = 'ip-172-31-56-74.ec2.internal:7051', 70 | 'kudu.key_columns' = 'id' 71 | ) 72 | OK 73 | Time taken: 2.924 seconds 74 | 75 | 76 | describe formatted test_drop 77 | OK 78 | col_name data_type comment 79 | # col_name data_type comment 80 | 81 | id int from deserializer 82 | name string from deserializer 83 | 84 | # Detailed Table Information 85 | Database: default 86 | Owner: hdfs 87 | CreateTime: Fri Apr 15 00:45:42 EDT 2016 88 | LastAccessTime: UNKNOWN 89 | Protect Mode: None 90 | Retention: 0 91 | Location: hdfs://ip-172-31-56-74.ec2.internal:8020/user/hive/warehouse/test_drop 92 | Table Type: MANAGED_TABLE 93 | Table Parameters: 94 | kudu.key_columns id 95 | kudu.master_addresses ip-172-31-56-74.ec2.internal:7051 96 | kudu.table_name test_drop 97 | storage_handler org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler 98 | transient_lastDdlTime 1460695542 99 | 100 | # Storage Information 101 | SerDe Library: org.apache.hadoop.hive.kududb.KuduHandler.HiveKuduSerDe 102 | InputFormat: null 103 | OutputFormat: null 104 | Compressed: No 105 | Num Buckets: -1 106 | Bucket Columns: [] 107 | Sort Columns: [] 108 | Storage Desc Params: 109 | serialization.format 1 110 | Time taken: 0.277 seconds, Fetched: 31 row(s) 111 | 112 | 113 | insert into test_drop values (1, 'a'), (2, 'b'), (3, 'a') 114 | Query ID = hdfs_20160415004545_5d94fdd4-d6e1-4fe3-b6ef-29eda4f309e5 115 | Total jobs = 1 116 | Launching Job 1 out of 1 117 | Number of reduce tasks is set to 0 since there's no reduce operator 118 | Starting Job = job_1460484956690_0052, Tracking URL = 
http://ip-172-31-56-74.ec2.internal:8088/proxy/application_1460484956690_0052/ 119 | Kill Command = /opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/bin/hadoop job -kill job_1460484956690_0052 120 | Hadoop job information for Stage-0: number of mappers: 1; number of reducers: 0 121 | 2016-04-15 00:45:53,003 Stage-0 map = 0%, reduce = 0% 122 | 2016-04-15 00:46:00,375 Stage-0 map = 100%, reduce = 0%, Cumulative CPU 1.73 sec 123 | MapReduce Total cumulative CPU time: 1 seconds 730 msec 124 | Ended Job = job_1460484956690_0052 125 | MapReduce Jobs Launched: 126 | Stage-Stage-0: Map: 1 Cumulative CPU: 1.73 sec HDFS Read: 3934 HDFS Write: 0 SUCCESS 127 | Total MapReduce CPU Time Spent: 1 seconds 730 msec 128 | OK 129 | _col0 _col1 130 | Time taken: 18.704 seconds 131 | 132 | 133 | select count(*) from test_drop 134 | Query ID = hdfs_20160415004646_ee73a7b3-1beb-4dc7-b102-aa5ccf322f10 135 | Total jobs = 1 136 | Launching Job 1 out of 1 137 | Number of reduce tasks determined at compile time: 1 138 | In order to change the average load for a reducer (in bytes): 139 | set hive.exec.reducers.bytes.per.reducer= 140 | In order to limit the maximum number of reducers: 141 | set hive.exec.reducers.max= 142 | In order to set a constant number of reducers: 143 | set mapreduce.job.reduces= 144 | Starting Job = job_1460484956690_0053, Tracking URL = http://ip-172-31-56-74.ec2.internal:8088/proxy/application_1460484956690_0053/ 145 | Kill Command = /opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/bin/hadoop job -kill job_1460484956690_0053 146 | Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1 147 | 2016-04-15 00:46:09,350 Stage-1 map = 0%, reduce = 0% 148 | 2016-04-15 00:46:15,773 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.39 sec 149 | 2016-04-15 00:46:24,094 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 2.98 sec 150 | MapReduce Total cumulative CPU time: 2 seconds 980 msec 151 | Ended Job = job_1460484956690_0053 152 | MapReduce Jobs Launched: 153 | Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 2.98 sec HDFS Read: 6865 HDFS Write: 2 SUCCESS 154 | Total MapReduce CPU Time Spent: 2 seconds 980 msec 155 | OK 156 | _c0 157 | 3 158 | Time taken: 23.661 seconds, Fetched: 1 row(s) 159 | 160 | 161 | select id from test_Drop where name = 'a' 162 | Query ID = hdfs_20160415004646_fc52eb30-7464-4ff3-a83f-91b0db8d73df 163 | Total jobs = 1 164 | Launching Job 1 out of 1 165 | Number of reduce tasks is set to 0 since there's no reduce operator 166 | Starting Job = job_1460484956690_0054, Tracking URL = http://ip-172-31-56-74.ec2.internal:8088/proxy/application_1460484956690_0054/ 167 | Kill Command = /opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/bin/hadoop job -kill job_1460484956690_0054 168 | Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0 169 | 2016-04-15 00:46:32,780 Stage-1 map = 0%, reduce = 0% 170 | 2016-04-15 00:46:40,051 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 2.11 sec 171 | MapReduce Total cumulative CPU time: 2 seconds 110 msec 172 | Ended Job = job_1460484956690_0054 173 | MapReduce Jobs Launched: 174 | Stage-Stage-1: Map: 1 Cumulative CPU: 2.11 sec HDFS Read: 3991 HDFS Write: 4 SUCCESS 175 | Total MapReduce CPU Time Spent: 2 seconds 110 msec 176 | OK 177 | id 178 | 1 179 | 3 180 | Time taken: 15.94 seconds, Fetched: 2 row(s) 181 | 182 | 183 | select name, count(*) from test_drop group by name 184 | Query ID = hdfs_20160415004646_15757794-72c2-45a7-84b4-ba7b0a5d4405 185 | Total jobs = 1 186 | 
Launching Job 1 out of 1 187 | Number of reduce tasks not specified. Estimated from input data size: 1 188 | In order to change the average load for a reducer (in bytes): 189 | set hive.exec.reducers.bytes.per.reducer= 190 | In order to limit the maximum number of reducers: 191 | set hive.exec.reducers.max= 192 | In order to set a constant number of reducers: 193 | set mapreduce.job.reduces= 194 | Starting Job = job_1460484956690_0055, Tracking URL = http://ip-172-31-56-74.ec2.internal:8088/proxy/application_1460484956690_0055/ 195 | Kill Command = /opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/bin/hadoop job -kill job_1460484956690_0055 196 | Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1 197 | 2016-04-15 00:46:48,532 Stage-1 map = 0%, reduce = 0% 198 | 2016-04-15 00:46:55,742 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.4 sec 199 | 2016-04-15 00:47:02,987 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 2.84 sec 200 | MapReduce Total cumulative CPU time: 2 seconds 840 msec 201 | Ended Job = job_1460484956690_0055 202 | MapReduce Jobs Launched: 203 | Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 2.84 sec HDFS Read: 7341 HDFS Write: 8 SUCCESS 204 | Total MapReduce CPU Time Spent: 2 seconds 840 msec 205 | OK 206 | name _c1 207 | a 2 208 | b 1 209 | Time taken: 22.899 seconds, Fetched: 2 row(s) 210 | 211 | 212 | drop table test_Drop 213 | OK 214 | Time taken: 0.113 seconds 215 | WARN: The method class org.apache.commons.logging.impl.SLF4JLogFactory#release() was invoked. 216 | WARN: Please see http://www.slf4j.org/codes.html#release for an explanation. 217 | 218 | ``` 219 | -------------------------------------------------------------------------------- /Resources/HiveKudu-Handler-0.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/Resources/HiveKudu-Handler-0.0.1.jar -------------------------------------------------------------------------------- /Resources/async-1.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/Resources/async-1.3.1.jar -------------------------------------------------------------------------------- /Resources/kudu-client-0.6.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/Resources/kudu-client-0.6.0.jar -------------------------------------------------------------------------------- /Resources/kudu-mapreduce-0.7.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/Resources/kudu-mapreduce-0.7.1.jar -------------------------------------------------------------------------------- /doc/DesignDocument.md: -------------------------------------------------------------------------------- 1 | # Hive On Kudu Design Document 2 | ## Design Goals 3 | The primary goal of developing Hive on Kudu is to fully leverage Hive and Kudu's capabilities. The goal is to release the first version of Hive on Kudu with the following features. 4 | 5 | 1. Support for Hive External tables to Kudu 6 | * SerDe to auto-create columns in Hive 7 | 2. 
Support for Hive managed tables on Kudu 8 | 3. Support basic and advanced partitioning capabilities of Kudu 9 | 4. Support for Hive transactions (Updates and Deletes) 10 | 5. Support for predicate push-down 11 | 12 | ## Design 13 | Hive provides a Storage Handler to integrate other storage systems with Hive. The primary example of a storage system that leverages a storage handler is HBase. Below are some useful links and examples for getting familiar with Storage Handlers in Hive. 14 | * [Hive Storage Handler guide](https://cwiki.apache.org/confluence/display/Hive/StorageHandlers) 15 | * [Hive-HBase Storage Handler](https://github.com/BimalTandel/hive/tree/master/hbase-handler) 16 | * [Hive-JDBC Storage Handler](https://github.com/qubole/Hive-JDBC-Storage-Handler) 17 | * [ElasticSearch Storage Handler](https://github.com/elastic/elasticsearch-hadoop/tree/master/hive/src/main/java/org/elasticsearch/hadoop/hive) 18 | 19 | In addition to a Storage Handler, Hive lets you supply a specific Input and Output format to handle reads and writes, and a SerDe to serialize and deserialize data to and from those formats. 20 | 21 | High-level interactions and relationships between the components: 22 | 23 | ![Custom Storage Handler Components](./figures/StorageHandlerDesign.png "Custom Storage Handler Components") 24 | 25 | To complete the integration of Hive and Kudu, the following components have to be developed: 26 | * HiveKudu Storage Handler 27 | * HiveKudu Input Format (with ACID support for Updates and Deletes) 28 | * HiveKudu Output Format (with ACID support for Updates and Deletes) 29 | * HiveKudu SerDe 30 | * HiveKudu Writable 31 | 32 | > Hive's interfaces and classes are all based on the MR1 APIs. I found it challenging to extend KuduTableInputFormat and KuduTableOutputFormat because they are based on the MR2 APIs. The only way to use them successfully would be to convert them and publish a version with MR1 APIs. 33 | 34 | ## Detailed Design (Work in Progress) 35 | ### HiveKudu Storage Handler 36 | Things that need further discussion: 37 | * How should we map Hive DDL to the partitioning options of Kudu tables? (A sketch of both options appears at the end of this document.) 38 | * Option 1: Use Hive's "CLUSTERED BY" and "INTO numbuckets BUCKETS" clauses. 39 | * Option 2: Use TBLPROPERTIES key/value pairs. 40 | * How should we decompose predicates to allow Kudu to filter records during table scans? We can attempt to match what Impala currently supports. 41 | 42 | ### HiveKudu SerDe and HiveKudu Writable 43 | * Review the current design of the Writable object. 44 | * Hive-to-Kudu datatype mappings (Kudu treats timestamps as LONG). 45 | 46 | ### HiveKudu Input & Output format 47 | * Can we leverage the ACID Input/Output formats for Kudu? 48 | * How can we leverage Hive transactions for Kudu?
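To make the partitioning question above concrete, here is a rough sketch of how the two DDL options might look. It is illustrative only: `kudu.table_name`, `kudu.master_addresses`, and `kudu.key_columns` are the properties the handler already understands, the master address is a placeholder, and `kudu.hash_partition_columns` / `kudu.num_buckets` are hypothetical keys that would only exist if Option 2 were adopted.

```sql
-- Option 1 (sketch): reuse Hive's CLUSTERED BY ... INTO n BUCKETS clause
-- to describe Kudu hash partitioning.
CREATE TABLE events_opt1 (
  id BIGINT,
  name STRING
)
CLUSTERED BY (id) INTO 16 BUCKETS
STORED BY 'org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler'
TBLPROPERTIES (
  'kudu.table_name'       = 'events',
  'kudu.master_addresses' = 'kudumaster1:7051',  -- placeholder address
  'kudu.key_columns'      = 'id'
);

-- Option 2 (sketch): describe partitioning purely through TBLPROPERTIES.
-- The two partitioning keys below are hypothetical and not implemented.
CREATE TABLE events_opt2 (
  id BIGINT,
  name STRING
)
STORED BY 'org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler'
TBLPROPERTIES (
  'kudu.table_name'             = 'events',
  'kudu.master_addresses'       = 'kudumaster1:7051',  -- placeholder address
  'kudu.key_columns'            = 'id',
  'kudu.hash_partition_columns' = 'id',
  'kudu.num_buckets'            = '16'
);
```

Option 1 reuses syntax Hive already parses for bucketing, while Option 2 keeps the entire Kudu-specific surface in TBLPROPERTIES at the cost of introducing new property names.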
49 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | ## Documents related to Hive on Kudu 2 | * [Design Document](DesignDocument.MD) 3 | * [User Guide](UserGuide.MD) 4 | 5 | -------------------------------------------------------------------------------- /doc/UserGuide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/doc/UserGuide.md -------------------------------------------------------------------------------- /doc/figures/StorageHandlerDesign.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/doc/figures/StorageHandlerDesign.png -------------------------------------------------------------------------------- /doc/figures/hivekudu-design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/doc/figures/hivekudu-design.png -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.cloudera.ps.HiveKudu 8 | KuduHandler 9 | 0.0.1 10 | 11 | 12 | UTF-8 13 | 14 | 15 | 16 | 17 | cdh.repo 18 | Cloudera Repositories 19 | https://repository.cloudera.com/artifactory/cloudera-repos 20 | 21 | false 22 | 23 | 24 | 25 | 26 | 27 | 28 | org.apache.hive 29 | hive-exec 30 | 1.1.0 31 | compile 32 | 33 | 34 | 35 | junit 36 | junit 37 | 4.10 38 | test 39 | 40 | 41 | org.easymock 42 | easymock 43 | 3.1 44 | test 45 | 46 | 47 | org.apache.hadoop 48 | hadoop-common 49 | 2.6.1 50 | provided 51 | 52 | 53 | org.apache.hadoop 54 | hadoop-mapreduce-client-common 55 | 2.6.1 56 | provided 57 | 58 | 59 | org.kududb 60 | kudu-client 61 | 0.7.1 62 | provided 63 | 64 | 65 | org.kududb 66 | kudu-mapreduce 67 | 0.7.1 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.apache.maven.plugins 76 | maven-compiler-plugin 77 | 2.5.1 78 | 79 | 1.7 80 | 1.7 81 | true 82 | true 83 | true 84 | 85 | 86 | 87 | org.apache.maven.plugins 88 | maven-shade-plugin 89 | 2.3 90 | 91 | 92 | 93 | com.amazonaws:* 94 | com.google.guava:guava 95 | joda-time:joda-time 96 | 97 | 98 | 99 | 100 | com.google 101 | qksh.shaded.com.google 102 | 103 | 104 | com.amazonaws 105 | qksh.shaded.com.amazonaws 106 | 107 | 108 | org.joda 109 | qksh.shaded.org.joda 110 | 111 | 112 | HiveKudu-Handler-${project.version} 113 | 114 | 115 | 116 | package 117 | 118 | shade 119 | 120 | 121 | false 122 | 123 | 124 | 125 | 126 | 127 | org.apache.maven.plugins 128 | maven-assembly-plugin 129 | 130 | 131 | jar-with-dependencies 132 | 133 | 134 | 135 | 136 | assemble-all 137 | package 138 | 139 | single 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/HiveKuduBridgeUtils.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | import org.apache.hadoop.hive.serde2.SerDeException; 20 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 21 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; 22 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 23 | import org.kududb.Type; 24 | 25 | import java.io.DataInput; 26 | import java.io.DataOutput; 27 | import java.io.IOException; 28 | import java.sql.Timestamp; 29 | 30 | 31 | /** 32 | * Created by bimal on 4/12/16. 33 | */ 34 | public class HiveKuduBridgeUtils { 35 | 36 | public static Type hiveTypeToKuduType(String hiveType) throws SerDeException { 37 | final String lchiveType = hiveType.toLowerCase(); 38 | switch(lchiveType) { 39 | case "string": 40 | case "varchar": 41 | case "char": 42 | return Type.STRING; 43 | 44 | case "tinyint": 45 | return Type.INT8; 46 | case "smallint": 47 | return Type.INT16; 48 | case "int": 49 | return Type.INT32; 50 | case "bigint": 51 | return Type.INT64; 52 | case "float": 53 | return Type.FLOAT; 54 | case "double": 55 | return Type.DOUBLE; 56 | 57 | case "timestamp": 58 | return Type.TIMESTAMP; 59 | 60 | case "boolean": 61 | return Type.BOOL; 62 | 63 | case "binary": 64 | return Type.BINARY; 65 | default: 66 | throw new SerDeException("Unrecognized column type: " + hiveType + " not supported in Kudu"); 67 | } 68 | } 69 | 70 | public static ObjectInspector getObjectInspector(Type kuduType, 71 | String hiveType) throws SerDeException { 72 | switch (kuduType) { 73 | case STRING: 74 | return PrimitiveObjectInspectorFactory.javaStringObjectInspector; 75 | case FLOAT: 76 | return PrimitiveObjectInspectorFactory.javaFloatObjectInspector; 77 | case DOUBLE: 78 | return PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; 79 | case BOOL: 80 | return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; 81 | case INT8: 82 | return PrimitiveObjectInspectorFactory.javaByteObjectInspector; 83 | case INT16: 84 | return PrimitiveObjectInspectorFactory.javaShortObjectInspector; 85 | case INT32: 86 | return PrimitiveObjectInspectorFactory.javaIntObjectInspector; 87 | case INT64: 88 | return PrimitiveObjectInspectorFactory.javaLongObjectInspector; 89 | case TIMESTAMP: 90 | return PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; 91 | case BINARY: 92 | return PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector; 93 | default: 94 | throw new SerDeException("Cannot find getObjectInspector for: " 95 | + hiveType); 96 | } 97 | } 98 | 99 | public static Object deparseObject(Object field, ObjectInspector fieldOI) 100 | throws SerDeException { 101 | switch (fieldOI.getCategory()) { 102 | case PRIMITIVE: { 103 | PrimitiveObjectInspector oi = (PrimitiveObjectInspector) fieldOI; 104 | return oi.getPrimitiveJavaObject(field); 105 | } 106 | 107 | //Kudu doesnt support LIST or MAP based data types 108 | 109 | default: 110 | throw new SerDeException("Unexpected fieldOI: " + fieldOI); 111 | } 112 | } 113 | 114 | 115 | public static Object 
readObject(DataInput in, Type kuduType) 116 | throws IOException { 117 | switch (kuduType) { 118 | case STRING: 119 | return in.readUTF(); 120 | case FLOAT: 121 | return Float.valueOf(in.readFloat()); 122 | case DOUBLE: 123 | return Double.valueOf(in.readDouble()); 124 | case BOOL: 125 | return Boolean.valueOf(in.readBoolean()); 126 | case INT8: 127 | return Byte.valueOf(in.readByte()); 128 | case INT16: 129 | return Short.valueOf(in.readShort()); 130 | case INT32: 131 | return Integer.valueOf(in.readInt()); 132 | case INT64: 133 | return Long.valueOf(in.readLong()); 134 | case TIMESTAMP: { 135 | long time = in.readLong(); 136 | return new Timestamp(time); 137 | } 138 | case BINARY: { 139 | int size = in.readInt(); 140 | byte[] b = new byte[size]; 141 | in.readFully(b); 142 | return b; 143 | } 144 | default: 145 | throw new IOException("Cannot read Object for type: " + kuduType.name()); 146 | } 147 | } 148 | 149 | public static void writeObject(Object obj, Type kuduType, DataOutput out) 150 | throws IOException { 151 | switch (kuduType) { 152 | case STRING: { 153 | String s = obj.toString(); 154 | out.writeUTF(s); 155 | return; 156 | } 157 | case FLOAT: { 158 | Float f = (Float) obj; 159 | out.writeFloat(f); 160 | return; 161 | } 162 | case DOUBLE: { 163 | Double d = (Double) obj; 164 | out.writeDouble(d); 165 | return; 166 | } 167 | case BOOL: { 168 | Boolean b = (Boolean) obj; 169 | out.writeBoolean(b); 170 | return; 171 | } 172 | case INT8: { 173 | Byte b = (Byte) obj; 174 | out.writeByte(b.intValue()); 175 | return; 176 | } 177 | case INT16: { 178 | Short s = (Short) obj; 179 | out.writeShort(s.shortValue()); 180 | return; 181 | } 182 | case INT32: { 183 | Integer i = (Integer) obj; 184 | out.writeInt(i.intValue()); 185 | return; 186 | } 187 | case INT64: { 188 | Long l = (Long) obj; 189 | out.writeLong(l.longValue()); 190 | return; 191 | } 192 | case TIMESTAMP: { 193 | Timestamp time = (Timestamp) obj; 194 | out.writeLong(time.getTime()); 195 | return; 196 | } 197 | case BINARY: { 198 | byte[] b = (byte[]) obj; 199 | out.writeInt(b.length); 200 | out.write(b); 201 | return; 202 | } 203 | default: 204 | throw new IOException("Cannot write Object '" 205 | + obj.getClass().getSimpleName() + "' as type: " + kuduType.name()); 206 | } 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/HiveKuduConstants.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | /** 20 | * Created by bimal on 4/11/16. 
21 | */ 22 | 23 | public final class HiveKuduConstants { 24 | 25 | //Table Properties 26 | public static final String LIST_COLUMNS = "columns"; 27 | public static final String LIST_COLUMN_TYPES = "columns.types"; 28 | public static final String MASTER_ADDRESS_NAME = "kudu.master_addresses"; 29 | public static final String TABLE_NAME = "kudu.table_name"; 30 | public static final String KEY_COLUMNS = "kudu.key_columns"; 31 | 32 | //SerDe Properties 33 | 34 | //MapReduce Properties 35 | public static final String MR_INPUT_TABLE_NAME = "kudu.mapreduce.input.table"; 36 | public static final String MR_OUTPUT_TABLE_NAME = "kudu.mapreduce.output.table"; 37 | public static final String MR_MASTER_ADDRESS_NAME = "kudu.mapreduce.master.addresses"; 38 | public static final String MR_PROPERTY_PREFIX = "kudu.mapreduce."; 39 | //DEFAULT VALUES & Getters for Default values 40 | 41 | 42 | private HiveKuduConstants() { 43 | } 44 | } -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/HiveKuduSerDe.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | import org.apache.commons.logging.Log; 20 | import org.apache.commons.logging.LogFactory; 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.hive.serde2.SerDe; 23 | import org.apache.hadoop.hive.serde2.SerDeException; 24 | import org.apache.hadoop.hive.serde2.SerDeStats; 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 27 | import org.apache.hadoop.hive.serde2.objectinspector.StructField; 28 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 29 | import org.apache.hadoop.io.Writable; 30 | import org.kududb.Type; 31 | 32 | import java.util.ArrayList; 33 | import java.util.Arrays; 34 | import java.util.List; 35 | import java.util.Properties; 36 | 37 | 38 | /** 39 | * Created by bimal on 4/12/16. 40 | */ 41 | 42 | public class HiveKuduSerDe implements SerDe { 43 | 44 | private static final Log LOG = LogFactory.getLog(HiveKuduSerDe.class); 45 | 46 | private HiveKuduWritable cachedWritable; //Currently Update/Delete not supported from Hive. 
47 | 48 | private int fieldCount; 49 | 50 | private StructObjectInspector objectInspector; 51 | private List deserializeCache; 52 | 53 | public HiveKuduSerDe() { 54 | } 55 | 56 | @Override 57 | public void initialize(Configuration sysConf, Properties tblProps) 58 | throws SerDeException { 59 | 60 | LOG.debug("tblProps: " + tblProps); 61 | 62 | String columnNameProperty = tblProps 63 | .getProperty(HiveKuduConstants.LIST_COLUMNS); 64 | String columnTypeProperty = tblProps 65 | .getProperty(HiveKuduConstants.LIST_COLUMN_TYPES); 66 | 67 | if (columnNameProperty.length() == 0 68 | && columnTypeProperty.length() == 0) { 69 | //This is where we will implement option to connect to Kudu and get the column list using Serde. 70 | } 71 | 72 | List columnNames = Arrays.asList(columnNameProperty.split(",")); 73 | 74 | String[] columnTypes = columnTypeProperty.split(":"); 75 | 76 | if (columnNames.size() != columnTypes.length) { 77 | throw new SerDeException("Splitting column and types failed." + "columnNames: " 78 | + columnNames + ", columnTypes: " 79 | + Arrays.toString(columnTypes)); 80 | } 81 | 82 | final Type[] types = new Type[columnTypes.length]; 83 | 84 | for (int i = 0; i < types.length; i++) { 85 | types[i] = HiveKuduBridgeUtils.hiveTypeToKuduType(columnTypes[i]); 86 | } 87 | 88 | this.cachedWritable = new HiveKuduWritable(types); 89 | 90 | this.fieldCount = types.length; 91 | 92 | final List fieldOIs = new ArrayList<>(columnTypes.length); 93 | 94 | for (int i = 0; i < types.length; i++) { 95 | ObjectInspector oi = HiveKuduBridgeUtils.getObjectInspector(types[i], columnTypes[i]); 96 | fieldOIs.add(oi); 97 | } 98 | 99 | this.objectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, fieldOIs); 100 | 101 | this.deserializeCache = new ArrayList<>(columnTypes.length); 102 | 103 | } 104 | 105 | @Override 106 | public ObjectInspector getObjectInspector() throws SerDeException { 107 | return objectInspector; 108 | } 109 | 110 | @Override 111 | public Class getSerializedClass() { 112 | return HiveKuduWritable.class; 113 | } 114 | 115 | @Override 116 | public HiveKuduWritable serialize(Object row, ObjectInspector inspector) 117 | throws SerDeException { 118 | 119 | final StructObjectInspector structInspector = (StructObjectInspector) inspector; 120 | final List fields = structInspector.getAllStructFieldRefs(); 121 | if (fields.size() != fieldCount) { 122 | throw new SerDeException(String.format( 123 | "Required %d columns, received %d.", fieldCount, 124 | fields.size())); 125 | } 126 | 127 | cachedWritable.clear(); 128 | 129 | for (int i = 0; i < fieldCount; i++) { 130 | StructField structField = fields.get(i); 131 | if (structField != null) { 132 | Object field = structInspector.getStructFieldData(row, 133 | structField); 134 | ObjectInspector fieldOI = structField.getFieldObjectInspector(); 135 | 136 | Object javaObject = HiveKuduBridgeUtils.deparseObject(field, 137 | fieldOI); 138 | LOG.warn("Column value of " + i + " is " + javaObject.toString()); 139 | cachedWritable.set(i, javaObject); 140 | } 141 | } 142 | return cachedWritable; 143 | } 144 | 145 | @Override 146 | public Object deserialize(Writable record) throws SerDeException { 147 | if (!(record instanceof HiveKuduWritable)) { 148 | throw new SerDeException("Expected HiveKuduWritable, received " 149 | + record.getClass().getName()); 150 | } 151 | HiveKuduWritable tuple = (HiveKuduWritable) record; 152 | deserializeCache.clear(); 153 | for (int i = 0; i < fieldCount; i++) { 154 | Object o = tuple.get(i); 155 | 
deserializeCache.add(o); 156 | } 157 | return deserializeCache; 158 | } 159 | 160 | @Override 161 | public SerDeStats getSerDeStats() { 162 | // TODO How to implement this? 163 | return null; 164 | } 165 | } 166 | 167 | 168 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/HiveKuduWritable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | import org.apache.hadoop.io.Writable; 20 | import org.apache.hadoop.io.WritableUtils; 21 | import org.kududb.Type; 22 | 23 | import java.io.DataInput; 24 | import java.io.DataOutput; 25 | import java.io.IOException; 26 | import java.util.Arrays; 27 | 28 | /** 29 | * Created by bimal on 4/12/16. 30 | */ 31 | public class HiveKuduWritable implements Writable { 32 | 33 | 34 | private Object[] columnValues; 35 | private Type[] columnTypes; 36 | 37 | public HiveKuduWritable() { 38 | 39 | } 40 | 41 | public HiveKuduWritable(Type[] types) { 42 | this.columnValues = new Object[types.length]; 43 | this.columnTypes = types; 44 | } 45 | 46 | public void clear() { 47 | Arrays.fill(columnValues, null); 48 | } 49 | 50 | public void set(int i, Object javaObject) { 51 | columnValues[i] = javaObject; 52 | } 53 | 54 | public Object get(int i) { 55 | return columnValues[i]; 56 | } 57 | 58 | public Type getType(int i) { return columnTypes[i]; } 59 | 60 | public int getColCount() { 61 | return this.columnTypes.length; 62 | } 63 | 64 | @Override 65 | public void readFields(DataInput in) throws IOException { 66 | int size = in.readInt(); 67 | if (size == -1) { 68 | return; 69 | } 70 | if (columnValues == null) { 71 | this.columnValues = new Object[size]; 72 | this.columnTypes = new Type[size]; 73 | } else { 74 | clear(); 75 | } 76 | for (int i = 0; i < size; i++) { 77 | Type kuduType = WritableUtils.readEnum(in, Type.class); 78 | columnTypes[i] = kuduType; 79 | Object v = HiveKuduBridgeUtils.readObject(in, kuduType); 80 | columnValues[i] = v; 81 | } 82 | } 83 | @Override 84 | public void write(DataOutput out) throws IOException { 85 | if (columnValues == null) { 86 | out.writeInt(-1); 87 | return; 88 | } 89 | if (columnTypes == null) { 90 | out.writeInt(-1); 91 | return; 92 | } 93 | 94 | final Object[] values = this.columnValues; 95 | final Type[] types = this.columnTypes; 96 | 97 | out.writeInt(values.length); 98 | 99 | for (int i = 0; i < values.length; i++) { 100 | HiveKuduBridgeUtils.writeObject(values[i], types[i], out); 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/KuduStorageHandler.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache 
License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | import org.apache.commons.logging.Log; 20 | import org.apache.commons.logging.LogFactory; 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.hive.metastore.HiveMetaHook; 23 | import org.apache.hadoop.hive.metastore.MetaStoreUtils; 24 | import org.apache.hadoop.hive.metastore.api.FieldSchema; 25 | import org.apache.hadoop.hive.metastore.api.MetaException; 26 | import org.apache.hadoop.hive.metastore.api.Table; 27 | import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler; 28 | import org.apache.hadoop.hive.ql.metadata.HiveException; 29 | import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler; 30 | import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; 31 | import org.apache.hadoop.hive.ql.plan.TableDesc; 32 | import org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider; 33 | import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider; 34 | import org.apache.hadoop.hive.serde2.Deserializer; 35 | import org.apache.hadoop.hive.serde2.SerDe; 36 | import org.apache.hadoop.mapred.JobConf; 37 | import org.apache.hadoop.mapred.InputFormat; 38 | import org.apache.hadoop.mapred.OutputFormat; 39 | import org.kududb.ColumnSchema; 40 | import org.kududb.Schema; 41 | import org.kududb.client.KuduClient; 42 | import org.kududb.client.CreateTableOptions; 43 | import org.kududb.mapred.HiveKuduTableInputFormat; 44 | import org.kududb.mapred.HiveKuduTableOutputFormat; 45 | 46 | import java.io.IOException; 47 | import java.util.*; 48 | 49 | /** 50 | * Created by bimal on 4/11/16. 51 | */ 52 | 53 | @SuppressWarnings({ "deprecation", "rawtypes" }) 54 | public class KuduStorageHandler extends DefaultStorageHandler 55 | implements HiveMetaHook, HiveStoragePredicateHandler { 56 | 57 | private static final Log LOG = LogFactory.getLog(KuduStorageHandler.class); 58 | 59 | private Configuration conf; 60 | 61 | private String kuduMaster; 62 | private String kuduTableName; 63 | 64 | @Override 65 | public Class getInputFormatClass() { 66 | return HiveKuduTableInputFormat.class; 67 | } 68 | 69 | @Override 70 | public Class getOutputFormatClass() { 71 | return HiveKuduTableOutputFormat.class; 72 | } 73 | 74 | @Override 75 | public Class getSerDeClass() { 76 | return HiveKuduSerDe.class; 77 | } 78 | 79 | private KuduClient getKuduClient(String master) throws MetaException { 80 | try { 81 | 82 | return new KuduClient.KuduClientBuilder(master).build(); 83 | } catch (Exception ioe){ 84 | throw new MetaException("Error creating Kudu Client: " + ioe); 85 | } 86 | } 87 | 88 | public KuduStorageHandler() { 89 | // TODO: Empty initializer?? 
90 | } 91 | 92 | @Override 93 | public Configuration getConf() { 94 | return conf; 95 | } 96 | 97 | @Override 98 | public void setConf(Configuration conf) { 99 | this.conf = conf; 100 | } 101 | 102 | @Override 103 | public HiveMetaHook getMetaHook() { 104 | return this; 105 | } 106 | 107 | @Override 108 | public void configureInputJobProperties(TableDesc tableDesc, 109 | Map jobProperties) { 110 | configureJobProperties(tableDesc, jobProperties); 111 | } 112 | 113 | @Override 114 | public void configureOutputJobProperties(TableDesc tableDesc, 115 | Map jobProperties) { 116 | configureJobProperties(tableDesc, jobProperties); 117 | } 118 | 119 | @Override 120 | public void configureTableJobProperties(TableDesc tableDesc, 121 | Map jobProperties) { 122 | configureJobProperties(tableDesc, jobProperties); 123 | } 124 | 125 | private void configureJobProperties(TableDesc tableDesc, 126 | Map jobProperties) { 127 | 128 | //This will always have the DB Name qualifier of Hive. Dont use this to set Kudu Tablename. 129 | String tblName = tableDesc.getTableName(); 130 | LOG.debug("Hive Table Name:" + tblName); 131 | Properties tblProps = tableDesc.getProperties(); 132 | String columnNames = tblProps.getProperty(HiveKuduConstants.LIST_COLUMNS); 133 | String columnTypes = tblProps.getProperty(HiveKuduConstants.LIST_COLUMN_TYPES); 134 | LOG.debug("Columns names:" + columnNames); 135 | LOG.debug("Column types:" + columnTypes); 136 | 137 | if (columnNames.length() == 0) { 138 | //TODO: Place keeper to insert SerDeHelper code to connect to Kudu to extract column names. 139 | LOG.warn("SerDe currently doesn't support column names and types. Please provide it explicitly"); 140 | } 141 | 142 | //set map reduce properties. 143 | jobProperties.put(HiveKuduConstants.MR_INPUT_TABLE_NAME, 144 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 145 | jobProperties.put(HiveKuduConstants.MR_OUTPUT_TABLE_NAME, 146 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 147 | jobProperties.put(HiveKuduConstants.MR_MASTER_ADDRESS_NAME, 148 | tblProps.getProperty(HiveKuduConstants.MASTER_ADDRESS_NAME)); 149 | 150 | LOG.debug("Kudu Table Name: " + tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 151 | LOG.debug("Kudu Master Addresses: " + tblProps.getProperty(HiveKuduConstants.MASTER_ADDRESS_NAME)); 152 | 153 | 154 | //set configuration property 155 | conf.set(HiveKuduConstants.MR_INPUT_TABLE_NAME, 156 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 157 | conf.set(HiveKuduConstants.MR_OUTPUT_TABLE_NAME, 158 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 159 | conf.set(HiveKuduConstants.MR_MASTER_ADDRESS_NAME, 160 | tblProps.getProperty(HiveKuduConstants.MASTER_ADDRESS_NAME)); 161 | 162 | conf.set(HiveKuduConstants.TABLE_NAME, 163 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 164 | conf.set(HiveKuduConstants.MASTER_ADDRESS_NAME, 165 | tblProps.getProperty(HiveKuduConstants.MASTER_ADDRESS_NAME)); 166 | 167 | //set class variables 168 | kuduMaster = conf.get(HiveKuduConstants.MASTER_ADDRESS_NAME); 169 | kuduTableName = conf.get(HiveKuduConstants.TABLE_NAME); 170 | 171 | for (String key : tblProps.stringPropertyNames()) { 172 | if (key.startsWith(HiveKuduConstants.MR_PROPERTY_PREFIX)) { 173 | String value = tblProps.getProperty(key); 174 | jobProperties.put(key, value); 175 | //Also set configuration for Non Map Reduce Hive calls to the Handler 176 | conf.set(key, value); 177 | } 178 | } 179 | } 180 | 181 | @Override 182 | public HiveAuthorizationProvider getAuthorizationProvider() 183 | throws 
HiveException { 184 | return new DefaultHiveAuthorizationProvider(); 185 | } 186 | 187 | @Override 188 | public DecomposedPredicate decomposePredicate(JobConf jobConf, 189 | Deserializer deserializer, ExprNodeDesc predicate) { 190 | // TODO: Implement push down to Kudu here. 191 | DecomposedPredicate decomposedPredicate = new DecomposedPredicate(); 192 | return decomposedPredicate; 193 | } 194 | 195 | private String getKuduTableName(Table tbl) { 196 | 197 | String tableName = conf.get(HiveKuduConstants.TABLE_NAME); 198 | if (tableName == null) { 199 | LOG.warn("Kudu Table name was not provided in table properties."); 200 | LOG.warn("Attempting to use Hive Table name"); 201 | tableName = tbl.getTableName().replaceAll(".*\\.", ""); 202 | LOG.warn("Kudu Table name will be: " + tableName); 203 | 204 | } 205 | return tableName; 206 | } 207 | 208 | private void printSchema(Schema schema) { 209 | if (schema == null) { 210 | return; 211 | } 212 | 213 | LOG.debug("Printing schema for Kudu table.."); 214 | for (ColumnSchema sch : schema.getColumns()) { 215 | LOG.debug("Column Name: " + sch.getName() 216 | + " [" + sch.getType().getName() + "]" 217 | + " key column: [" + sch.isKey() + "]" 218 | ); 219 | } 220 | } 221 | 222 | 223 | @Override 224 | public void preCreateTable(Table tbl) 225 | throws MetaException { 226 | KuduClient client = getKuduClient(tbl.getParameters().get(HiveKuduConstants.MASTER_ADDRESS_NAME)); 227 | 228 | boolean isExternal = MetaStoreUtils.isExternalTable(tbl); 229 | 230 | if (isExternal) { 231 | //TODO: Check if Kudu table exists to allow external table. 232 | //TODO: Check if column and types are compatible with existing Kudu table. 233 | throw new MetaException("External Table to Kudu not yet supported."); 234 | } 235 | if (tbl.getSd().getLocation() != null) { 236 | throw new MetaException("LOCATION may not be specified for Kudu"); 237 | } 238 | 239 | String tablename = getKuduTableName(tbl); 240 | 241 | try { 242 | List keyColumns = Arrays.asList(tbl.getParameters().get(HiveKuduConstants.KEY_COLUMNS).split("\\s*,\\s*")); 243 | 244 | List tabColumns = tbl.getSd().getCols(); 245 | 246 | int numberOfCols = tabColumns.size(); 247 | List columns = new ArrayList<>(numberOfCols); 248 | 249 | for (FieldSchema fields : tabColumns) { 250 | 251 | ColumnSchema columnSchema = new ColumnSchema 252 | .ColumnSchemaBuilder(fields.getName(), HiveKuduBridgeUtils.hiveTypeToKuduType(fields.getType())) 253 | .key(keyColumns.contains(fields.getName())) 254 | .nullable(!keyColumns.contains(fields.getName())) 255 | .build(); 256 | 257 | columns.add(columnSchema); 258 | } 259 | 260 | Schema schema = new Schema(columns); 261 | 262 | printSchema(schema); 263 | 264 | CreateTableOptions createTableOptions = new CreateTableOptions(); 265 | 266 | //TODO : add support for partition and buckets 267 | client.createTable(tablename, schema, createTableOptions); 268 | 269 | } catch (Exception se) { 270 | throw new MetaException("Error creating Kudu table: " + tablename + ":" + se); 271 | } finally { 272 | try { 273 | client.shutdown(); 274 | } catch (Exception e) { 275 | e.printStackTrace(); 276 | } 277 | } 278 | } 279 | 280 | @Override 281 | public void commitCreateTable(Table tbl) throws MetaException { 282 | // Nothing to do 283 | } 284 | 285 | @Override 286 | public void preDropTable(Table tbl) throws MetaException { 287 | // Nothing to do 288 | 289 | } 290 | 291 | @Override 292 | public void commitDropTable(Table tbl, boolean deleteData) 293 | throws MetaException { 294 | KuduClient client = 
getKuduClient(tbl.getParameters().get(HiveKuduConstants.MASTER_ADDRESS_NAME)); 295 | String tablename = getKuduTableName(tbl); 296 | boolean isExternal = MetaStoreUtils.isExternalTable(tbl); 297 | try { 298 | if (deleteData && !isExternal) { 299 | client.deleteTable(tablename); 300 | } 301 | } catch (Exception ioe) { 302 | throw new MetaException("Error dropping table:" +tablename); 303 | } finally { 304 | try { 305 | client.shutdown(); 306 | } catch (Exception e) { 307 | e.printStackTrace(); 308 | } 309 | } 310 | } 311 | 312 | @Override 313 | public void rollbackCreateTable(Table tbl) throws MetaException { 314 | KuduClient client = getKuduClient(tbl.getParameters().get(HiveKuduConstants.MASTER_ADDRESS_NAME)); 315 | String tablename = getKuduTableName(tbl); 316 | boolean isExternal = MetaStoreUtils.isExternalTable(tbl); 317 | try { 318 | if ( client.tableExists(tablename) && !isExternal) { 319 | client.deleteTable(tablename); 320 | } 321 | } catch (Exception ioe) { 322 | throw new MetaException("Error dropping table while rollback of create table:" +tablename); 323 | } finally { 324 | try { 325 | client.shutdown(); 326 | } catch (Exception e) { 327 | e.printStackTrace(); 328 | } 329 | } 330 | } 331 | 332 | @Override 333 | public void rollbackDropTable(Table tbl) throws MetaException { 334 | // Nothing to do 335 | } 336 | 337 | } 338 | -------------------------------------------------------------------------------- /src/main/java/org/kududb/mapred/HiveKuduTableInputFormat.java: -------------------------------------------------------------------------------- 1 | package org.kududb.mapred; 2 | 3 | /** 4 | * Created by bimal on 4/13/16. 5 | */ 6 | import org.apache.hadoop.hive.kududb.KuduHandler.HiveKuduWritable; 7 | import com.google.common.base.Objects; 8 | import com.google.common.base.Splitter; 9 | import com.google.common.collect.Lists; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.Text; 12 | import org.kududb.Type; 13 | import org.apache.commons.net.util.Base64; 14 | import org.apache.hadoop.mapred.*; 15 | import org.apache.hadoop.util.StringUtils; 16 | import org.kududb.Schema; 17 | import org.kududb.annotations.InterfaceAudience; 18 | import org.kududb.annotations.InterfaceStability; 19 | import org.kududb.client.*; 20 | import org.apache.commons.logging.Log; 21 | import org.apache.commons.logging.LogFactory; 22 | import org.apache.hadoop.conf.Configurable; 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.apache.hadoop.io.NullWritable; 25 | import org.apache.hadoop.io.Writable; 26 | import org.apache.hadoop.net.DNS; 27 | 28 | import javax.naming.NamingException; 29 | import java.io.DataInput; 30 | import java.io.DataOutput; 31 | import java.io.IOException; 32 | import java.net.InetAddress; 33 | import java.net.InetSocketAddress; 34 | import java.util.ArrayList; 35 | import java.util.Arrays; 36 | import java.util.HashMap; 37 | import java.util.List; 38 | import java.util.Map; 39 | 40 | /** 41 | *
42 | * This input format generates one split per tablet and the only location for each split is that 43 | * tablet's leader. 44 | * 45 | * 46 | * 47 | * Hadoop doesn't have the concept of "closing" the input format, so in order to release the 48 | * resources we assume that once either {@link #getSplits(org.apache.hadoop.mapred.JobConf, int)} 49 | * or {@link HiveKuduTableInputFormat.TableRecordReader#close()} has been called, 50 | * the object won't be used again and the KuduClient is shut down. 51 | *
52 | */ 53 | @InterfaceAudience.Public 54 | @InterfaceStability.Evolving 55 | public class HiveKuduTableInputFormat implements InputFormat, Configurable { 56 | 57 | private static final Log LOG = LogFactory.getLog(HiveKuduTableInputFormat.class); 58 | 59 | private static final long SLEEP_TIME_FOR_RETRIES_MS = 1000; 60 | 61 | /** Job parameter that specifies the input table. */ 62 | static final String INPUT_TABLE_KEY = "kudu.mapreduce.input.table"; 63 | 64 | /** Job parameter that specifies if the scanner should cache blocks or not (default: false). */ 65 | static final String SCAN_CACHE_BLOCKS = "kudu.mapreduce.input.scan.cache.blocks"; 66 | 67 | /** Job parameter that specifies where the masters are. */ 68 | static final String MASTER_ADDRESSES_KEY = "kudu.mapreduce.master.addresses"; 69 | 70 | /** Job parameter that specifies how long we wait for operations to complete (default: 10s). */ 71 | static final String OPERATION_TIMEOUT_MS_KEY = "kudu.mapreduce.operation.timeout.ms"; 72 | 73 | /** Job parameter that specifies the address for the name server. */ 74 | static final String NAME_SERVER_KEY = "kudu.mapreduce.name.server"; 75 | 76 | /** Job parameter that specifies the encoded column range predicates (may be empty). */ 77 | static final String ENCODED_COLUMN_RANGE_PREDICATES_KEY = 78 | "kudu.mapreduce.encoded.column.range.predicates"; 79 | 80 | /** 81 | * Job parameter that specifies the column projection as a comma-separated list of column names. 82 | * 83 | * Not specifying this at all (i.e. setting to null) or setting to the special string 84 | * '*' means to project all columns. 85 | * 86 | * Specifying the empty string means to project no columns (i.e just count the rows). 87 | */ 88 | static final String COLUMN_PROJECTION_KEY = "kudu.mapreduce.column.projection"; 89 | 90 | /** 91 | * The reverse DNS lookup cache mapping: address from Kudu => hostname for Hadoop. This cache is 92 | * used in order to not do DNS lookups multiple times for each tablet server. 93 | */ 94 | private final Map reverseDNSCacheMap = new HashMap(); 95 | 96 | private Configuration conf; 97 | private KuduClient client; 98 | private KuduTable table; 99 | private long operationTimeoutMs; 100 | private String nameServer; 101 | private boolean cacheBlocks; 102 | private List projectedCols; 103 | private byte[] rawPredicates; 104 | 105 | static class KuduHiveSplit extends FileSplit { 106 | InputSplit delegate; 107 | private Path path; 108 | 109 | KuduHiveSplit() { 110 | this(new TableSplit(), null); 111 | } 112 | 113 | KuduHiveSplit(InputSplit delegate, Path path) { 114 | super(path, 0, 0, (String[]) null); 115 | this.delegate = delegate; 116 | this.path = path; 117 | } 118 | 119 | public long getLength() { 120 | // TODO: can this be delegated? 
121 | return 1L; 122 | } 123 | 124 | public String[] getLocations() throws IOException { 125 | return delegate.getLocations(); 126 | } 127 | 128 | public void write(DataOutput out) throws IOException { 129 | Text.writeString(out, path.toString()); 130 | delegate.write(out); 131 | } 132 | 133 | public void readFields(DataInput in) throws IOException { 134 | path = new Path(Text.readString(in)); 135 | delegate.readFields(in); 136 | } 137 | 138 | @Override 139 | public String toString() { 140 | return delegate.toString(); 141 | } 142 | 143 | @Override 144 | public Path getPath() { 145 | return path; 146 | } 147 | } 148 | @Override 149 | public FileSplit[] getSplits(JobConf jobConf, int i) 150 | throws IOException { 151 | LOG.warn("I was called : getSplits"); 152 | try { 153 | if (table == null) { 154 | throw new IOException("No table was provided"); 155 | } 156 | InputSplit[] splits; 157 | DeadlineTracker deadline = new DeadlineTracker(); 158 | deadline.setDeadline(operationTimeoutMs); 159 | // If the job is started while a leader election is running, we might not be able to find a 160 | // leader right away. We'll wait as long as the user is willing to wait with the operation 161 | // timeout, and once we've waited long enough we just start picking the first replica we see 162 | // for those tablets that don't have a leader. The client will later try to find the leader 163 | // and it might fail, in which case the task will get retried. 164 | retryloop: 165 | while (true) { 166 | List locations; 167 | try { 168 | locations = table.getTabletsLocations(operationTimeoutMs); 169 | } catch (Exception e) { 170 | throw new IOException("Could not get the tablets locations", e); 171 | } 172 | 173 | if (locations.isEmpty()) { 174 | throw new IOException("The requested table has 0 tablets, cannot continue"); 175 | } 176 | 177 | // For the moment we only pass the leader since that's who we read from. 178 | // If we've been trying to get a leader for each tablet for too long, we stop looping 179 | // and just finish with what we have. 180 | splits = new InputSplit[locations.size()]; 181 | int count = 0; 182 | for (LocatedTablet locatedTablet : locations) { 183 | List addresses = Lists.newArrayList(); 184 | LocatedTablet.Replica replica = locatedTablet.getLeaderReplica(); 185 | if (replica == null) { 186 | if (deadline.wouldSleepingTimeout(SLEEP_TIME_FOR_RETRIES_MS)) { 187 | LOG.debug("We ran out of retries, picking a non-leader replica for this tablet: " + 188 | locatedTablet.toString()); 189 | // We already checked it's not empty. 
190 | replica = locatedTablet.getReplicas().get(0); 191 | } else { 192 | LOG.debug("Retrying creating the splits because this tablet is missing a leader: " + 193 | locatedTablet.toString()); 194 | try { 195 | Thread.sleep(SLEEP_TIME_FOR_RETRIES_MS); 196 | } catch (InterruptedException ioe) { 197 | throw new IOException(StringUtils.stringifyException(ioe)); 198 | } 199 | 200 | continue retryloop; 201 | } 202 | } 203 | addresses.add(reverseDNS(replica.getRpcHost(), replica.getRpcPort())); 204 | String[] addressesArray = addresses.toArray(new String[addresses.size()]); 205 | Partition partition = locatedTablet.getPartition(); 206 | TableSplit split = new TableSplit(partition.getPartitionKeyStart(), 207 | partition.getPartitionKeyEnd(), 208 | addressesArray); 209 | splits[count] = split; 210 | count++; 211 | } 212 | FileSplit[] wrappers = new FileSplit[splits.length]; 213 | Path path = new Path(jobConf.get("location")); 214 | for (int counter = 0; counter < wrappers.length; counter++) { 215 | wrappers[counter] = new KuduHiveSplit(splits[counter], path); 216 | } 217 | return wrappers; 218 | } 219 | } finally { 220 | //shutdownClient(); 221 | LOG.warn("This is a Bug. No need to shutdown client."); 222 | } 223 | } 224 | 225 | private void shutdownClient() throws IOException { 226 | LOG.warn("I was called : shutdownClient"); 227 | try { 228 | client.shutdown(); 229 | } catch (Exception e) { 230 | LOG.error("Error shutting down Kudu Client" + e); 231 | } 232 | } 233 | 234 | /** 235 | * This method might seem alien, but we do this in order to resolve the hostnames the same way 236 | * Hadoop does. This ensures we get locality if Kudu is running along MR/YARN. 237 | * @param host hostname we got from the master 238 | * @param port port we got from the master 239 | * @return reverse DNS'd address 240 | */ 241 | private String reverseDNS(String host, Integer port) { 242 | LOG.warn("I was called : reverseDNS"); 243 | String location = this.reverseDNSCacheMap.get(host); 244 | if (location != null) { 245 | return location; 246 | } 247 | // The below InetSocketAddress creation does a name resolution. 
248 | InetSocketAddress isa = new InetSocketAddress(host, port); 249 | if (isa.isUnresolved()) { 250 | LOG.warn("Failed address resolve for: " + isa); 251 | } 252 | InetAddress tabletInetAddress = isa.getAddress(); 253 | try { 254 | location = domainNamePointerToHostName( 255 | DNS.reverseDns(tabletInetAddress, this.nameServer)); 256 | this.reverseDNSCacheMap.put(host, location); 257 | } catch (NamingException e) { 258 | LOG.warn("Cannot resolve the host name for " + tabletInetAddress + " because of " + e); 259 | location = host; 260 | } 261 | return location; 262 | } 263 | 264 | @Override 265 | public RecordReader getRecordReader(InputSplit inputSplit, 266 | final JobConf jobConf, final Reporter reporter) 267 | throws IOException { 268 | InputSplit delegate = ((KuduHiveSplit) inputSplit).delegate; 269 | LOG.warn("I was called : getRecordReader"); 270 | try { 271 | return new TableRecordReader(delegate); 272 | } catch (InterruptedException e) 273 | { 274 | throw new IOException(e); 275 | } 276 | } 277 | 278 | @Override 279 | public void setConf(Configuration entries) { 280 | LOG.warn("I was called : setConf"); 281 | this.conf = new Configuration(entries); 282 | 283 | String tableName = conf.get(INPUT_TABLE_KEY); 284 | String masterAddresses = conf.get(MASTER_ADDRESSES_KEY); 285 | this.operationTimeoutMs = conf.getLong(OPERATION_TIMEOUT_MS_KEY, 286 | AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS); 287 | this.nameServer = conf.get(NAME_SERVER_KEY); 288 | this.cacheBlocks = conf.getBoolean(SCAN_CACHE_BLOCKS, false); 289 | 290 | LOG.warn(" the master address here is " + masterAddresses); 291 | 292 | this.client = new KuduClient.KuduClientBuilder(masterAddresses) 293 | .defaultOperationTimeoutMs(operationTimeoutMs) 294 | .build(); 295 | try { 296 | this.table = client.openTable(tableName); 297 | } catch (Exception ex) { 298 | throw new RuntimeException("Could not obtain the table from the master, " + 299 | "is the master running and is this table created? tablename=" + tableName + " and " + 300 | "master address= " + masterAddresses, ex); 301 | } 302 | 303 | //String projectionConfig = conf.get(COLUMN_PROJECTION_KEY); 304 | String projectionConfig = "id,name"; 305 | if (projectionConfig == null || projectionConfig.equals("*")) { 306 | this.projectedCols = null; // project the whole table 307 | } else if ("".equals(projectionConfig)) { 308 | this.projectedCols = new ArrayList<>(); 309 | } else { 310 | this.projectedCols = Lists.newArrayList(Splitter.on(',').split(projectionConfig)); 311 | 312 | // Verify that the column names are valid -- better to fail with an exception 313 | // before we submit the job. 314 | Schema tableSchema = table.getSchema(); 315 | for (String columnName : projectedCols) { 316 | if (tableSchema.getColumn(columnName) == null) { 317 | throw new IllegalArgumentException("Unknown column " + columnName); 318 | } 319 | } 320 | } 321 | 322 | String encodedPredicates = conf.get(ENCODED_COLUMN_RANGE_PREDICATES_KEY, ""); 323 | rawPredicates = Base64.decodeBase64(encodedPredicates); 324 | } 325 | 326 | /** 327 | * Given a PTR string generated via reverse DNS lookup, return everything 328 | * except the trailing period. Example for host.example.com., return 329 | * host.example.com 330 | * @param dnPtr a domain name pointer (PTR) string. 331 | * @return Sanitized hostname with last period stripped off. 
332 | * 333 | */ 334 | private static String domainNamePointerToHostName(String dnPtr) { 335 | LOG.warn("I was called : domainNamePointerToHostName"); 336 | if (dnPtr == null) 337 | return null; 338 | String r = dnPtr.endsWith(".") ? dnPtr.substring(0, dnPtr.length() - 1) : dnPtr; 339 | LOG.warn(r); 340 | return r; 341 | } 342 | 343 | @Override 344 | public Configuration getConf() { 345 | return conf; 346 | } 347 | 348 | static class TableSplit implements InputSplit, Writable, Comparable { 349 | 350 | private byte[] startPartitionKey; 351 | private byte[] endPartitionKey; 352 | private String[] locations; 353 | 354 | public TableSplit() { } // Writable 355 | 356 | public TableSplit(byte[] startPartitionKey, byte[] endPartitionKey, String[] locations) { 357 | LOG.warn("I was called : TableSplit"); 358 | this.startPartitionKey = startPartitionKey; 359 | this.endPartitionKey = endPartitionKey; 360 | this.locations = locations; 361 | } 362 | 363 | @Override 364 | public long getLength() throws IOException { 365 | // TODO Guesstimate a size 366 | return 0; 367 | } 368 | 369 | @Override 370 | public String[] getLocations() throws IOException { 371 | LOG.warn("I was called : getLocations"); 372 | return locations; 373 | } 374 | 375 | public byte[] getStartPartitionKey() { 376 | return startPartitionKey; 377 | } 378 | 379 | public byte[] getEndPartitionKey() { 380 | return endPartitionKey; 381 | } 382 | 383 | @Override 384 | public int compareTo(TableSplit tableSplit) { 385 | LOG.warn("I was called : compareTo"); 386 | return Bytes.memcmp(startPartitionKey, tableSplit.getStartPartitionKey()); 387 | } 388 | 389 | @Override 390 | public void write(DataOutput dataOutput) throws IOException { 391 | LOG.warn("I was called : write"); 392 | Bytes.writeByteArray(dataOutput, startPartitionKey); 393 | Bytes.writeByteArray(dataOutput, endPartitionKey); 394 | dataOutput.writeInt(locations.length); 395 | for (String location : locations) { 396 | byte[] str = Bytes.fromString(location); 397 | Bytes.writeByteArray(dataOutput, str); 398 | } 399 | } 400 | 401 | @Override 402 | public void readFields(DataInput dataInput) throws IOException { 403 | LOG.warn("I was called : readFields"); 404 | startPartitionKey = Bytes.readByteArray(dataInput); 405 | endPartitionKey = Bytes.readByteArray(dataInput); 406 | locations = new String[dataInput.readInt()]; 407 | LOG.warn("readFields " + locations.length); 408 | for (int i = 0; i < locations.length; i++) { 409 | byte[] str = Bytes.readByteArray(dataInput); 410 | locations[i] = Bytes.getString(str); 411 | LOG.warn("readFields " + locations[i]); 412 | } 413 | } 414 | 415 | @Override 416 | public int hashCode() { 417 | LOG.warn("I was called : hashCode"); 418 | // We currently just care about the row key since we're within the same table 419 | return Arrays.hashCode(startPartitionKey); 420 | } 421 | 422 | @Override 423 | public boolean equals(Object o) { 424 | LOG.warn("I was called : equals"); 425 | if (this == o) return true; 426 | if (o == null || getClass() != o.getClass()) return false; 427 | 428 | TableSplit that = (TableSplit) o; 429 | 430 | return this.compareTo(that) == 0; 431 | } 432 | 433 | @Override 434 | public String toString() { 435 | LOG.warn("I was called : toString"); 436 | return Objects.toStringHelper(this) 437 | .add("startPartitionKey", Bytes.pretty(startPartitionKey)) 438 | .add("endPartitionKey", Bytes.pretty(endPartitionKey)) 439 | .add("locations", Arrays.toString(locations)) 440 | .toString(); 441 | } 442 | } 443 | 444 | class TableRecordReader 
implements RecordReader { 445 | 446 | private final NullWritable currentKey = NullWritable.get(); 447 | private RowResult currentValue; 448 | private RowResultIterator iterator; 449 | private KuduScanner scanner; 450 | private TableSplit split; 451 | private Type[] types; 452 | private boolean first = true; 453 | 454 | public TableRecordReader(InputSplit inputSplit) throws IOException, InterruptedException { 455 | LOG.warn("I was called : TableRecordReader"); 456 | if (!(inputSplit instanceof TableSplit)) { 457 | throw new IllegalArgumentException("TableSplit is the only accepted input split"); 458 | } 459 | 460 | //Create another client 461 | //setConf(getConf()); 462 | 463 | split = (TableSplit) inputSplit; 464 | scanner = client.newScannerBuilder(table) 465 | .setProjectedColumnNames(projectedCols) 466 | .lowerBoundPartitionKeyRaw(split.getStartPartitionKey()) 467 | .exclusiveUpperBoundPartitionKeyRaw(split.getEndPartitionKey()) 468 | .cacheBlocks(cacheBlocks) 469 | .addColumnRangePredicatesRaw(rawPredicates) 470 | .build(); 471 | 472 | LOG.warn("table name: " +table.getName()); 473 | LOG.warn("projectedCols name: " + projectedCols.size()); 474 | LOG.warn("getStartPartitionKey: " + split.getStartPartitionKey().toString()); 475 | LOG.warn("getEndPartitionKey " + split.getEndPartitionKey().toString()); 476 | LOG.warn("cacheBlocks " + cacheBlocks); 477 | LOG.warn("rawPredicates " + rawPredicates.length); 478 | 479 | 480 | Schema schema = table.getSchema(); 481 | types = new Type[schema.getColumnCount()]; 482 | for (int i = 0; i < types.length; i++) { 483 | types[i] = schema.getColumnByIndex(i).getType(); 484 | LOG.warn("Setting types array "+ i + " to " + types[i].name()); 485 | } 486 | // Calling this now to set iterator. 487 | tryRefreshIterator(); 488 | } 489 | 490 | @Override 491 | public boolean next(NullWritable o, HiveKuduWritable o2) throws IOException { 492 | LOG.warn("I was called : next"); 493 | /* 494 | if (first) { 495 | //tryRefreshIterator(); 496 | List projectColumns = new ArrayList<>(2); 497 | projectColumns.add("id"); 498 | projectColumns.add("name"); 499 | KuduScanner scanner = client.newScannerBuilder(table) 500 | .setProjectedColumnNames(projectColumns) 501 | .build(); 502 | try { 503 | iterator = scanner.nextRows(); 504 | } catch (Exception e) { 505 | throw new IOException("Couldn't get scan data", e); 506 | } 507 | first = false; 508 | } else { 509 | return false; 510 | } 511 | */ 512 | if (!iterator.hasNext()) { 513 | tryRefreshIterator(); 514 | if (!iterator.hasNext()) { 515 | // Means we still have the same iterator, we're done 516 | return false; 517 | } 518 | } 519 | 520 | currentValue = iterator.next(); 521 | o = currentKey; 522 | o2.clear(); 523 | for (int i = 0; i < types.length; i++) { 524 | switch(types[i]) { 525 | case STRING: { 526 | o2.set(i, currentValue.getString(i)); 527 | break; 528 | } 529 | case FLOAT: { 530 | o2.set(i, currentValue.getFloat(i)); 531 | break; 532 | } 533 | case DOUBLE: { 534 | o2.set(i, currentValue.getDouble(i)); 535 | break; 536 | } 537 | case BOOL: { 538 | o2.set(i, currentValue.getBoolean(i)); 539 | break; 540 | } 541 | case INT8: { 542 | o2.set(i, currentValue.getByte(i)); 543 | break; 544 | } 545 | case INT16: { 546 | o2.set(i, currentValue.getShort(i)); 547 | break; 548 | } 549 | case INT32: { 550 | o2.set(i, currentValue.getInt(i)); 551 | break; 552 | } 553 | case INT64: { 554 | o2.set(i, currentValue.getLong(i)); 555 | break; 556 | } 557 | case TIMESTAMP: { 558 | o2.set(i, currentValue.getLong(i)); 559 | break; 560 | } 561 
| case BINARY: { 562 | o2.set(i, currentValue.getBinaryCopy(i)); 563 | break; 564 | } 565 | default: 566 | throw new IOException("Cannot write Object '" 567 | + currentValue.getColumnType(i).name() + "' as type: " + types[i].name()); 568 | } 569 | LOG.warn("Value returned " + o2.get(i)); 570 | } 571 | return true; 572 | } 573 | 574 | @Override 575 | public NullWritable createKey() { 576 | LOG.warn("I was called : createKey"); 577 | return NullWritable.get(); 578 | } 579 | 580 | @Override 581 | public HiveKuduWritable createValue() { 582 | LOG.warn("I was called : createValue"); 583 | return new HiveKuduWritable(types); 584 | } 585 | 586 | @Override 587 | public long getPos() throws IOException { 588 | LOG.warn("I was called : getPos"); 589 | return 0; 590 | //TODO: Get progress 591 | } 592 | /* 593 | //mapreduce code for reference. 594 | @Override 595 | public boolean nextKeyValue() throws IOException, InterruptedException { 596 | if (!iterator.hasNext()) { 597 | tryRefreshIterator(); 598 | if (!iterator.hasNext()) { 599 | // Means we still have the same iterator, we're done 600 | return false; 601 | } 602 | } 603 | currentValue = iterator.next(); 604 | return true; 605 | } 606 | */ 607 | /** 608 | * If the scanner has more rows, get a new iterator else don't do anything. 609 | * @throws IOException 610 | */ 611 | private void tryRefreshIterator() throws IOException { 612 | LOG.warn("I was called : tryRefreshIterator"); 613 | if (!scanner.hasMoreRows()) { 614 | return; 615 | } 616 | try { 617 | iterator = scanner.nextRows(); 618 | } catch (Exception e) { 619 | throw new IOException("Couldn't get scan data", e); 620 | } 621 | } 622 | 623 | /* 624 | Mapreduce code for reference 625 | 626 | @Override 627 | public NullWritable getCurrentKey() throws IOException, InterruptedException { 628 | return currentKey; 629 | } 630 | 631 | @Override 632 | public RowResult getCurrentValue() throws IOException, InterruptedException { 633 | return currentValue; 634 | } 635 | */ 636 | 637 | @Override 638 | public float getProgress() throws IOException { 639 | LOG.warn("I was called : getProgress"); 640 | // TODO Guesstimate progress 641 | return 0; 642 | } 643 | 644 | 645 | @Override 646 | public void close() throws IOException { 647 | LOG.warn("I was called : close"); 648 | try { 649 | scanner.close(); 650 | } catch (NullPointerException npe) { 651 | LOG.warn("The scanner is supposed to be open but its not. TODO: Fix me."); 652 | } 653 | catch (Exception e) { 654 | throw new IOException(e); 655 | } 656 | shutdownClient(); 657 | } 658 | } 659 | } -------------------------------------------------------------------------------- /src/main/java/org/kududb/mapred/HiveKuduTableOutputFormat.java: -------------------------------------------------------------------------------- 1 | package org.kududb.mapred; 2 | 3 | /** 4 | * Created by bimal on 4/13/16. 
5 | */ 6 | import org.apache.hadoop.hive.kududb.KuduHandler.HiveKuduWritable; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.mapred.JobConf; 9 | import org.apache.hadoop.mapred.RecordWriter; 10 | import org.apache.hadoop.mapred.Reporter; 11 | import org.apache.hadoop.util.Progressable; 12 | import org.kududb.Schema; 13 | import org.kududb.annotations.InterfaceAudience; 14 | import org.kududb.annotations.InterfaceStability; 15 | import org.kududb.client.*; 16 | import org.apache.hadoop.conf.Configurable; 17 | import org.apache.hadoop.conf.Configuration; 18 | import org.apache.hadoop.io.NullWritable; 19 | import org.apache.hadoop.mapred.OutputFormat; 20 | import org.slf4j.Logger; 21 | import org.slf4j.LoggerFactory; 22 | 23 | import java.io.IOException; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import java.util.concurrent.ConcurrentHashMap; 27 | import java.util.concurrent.atomic.AtomicLong; 28 | 29 | 30 | @InterfaceAudience.Public 31 | @InterfaceStability.Evolving 32 | public class HiveKuduTableOutputFormat implements OutputFormat, Configurable { 33 | 34 | private static final Logger LOG = LoggerFactory.getLogger(HiveKuduTableOutputFormat.class); 35 | 36 | /** Job parameter that specifies the output table. */ 37 | static final String OUTPUT_TABLE_KEY = "kudu.mapreduce.output.table"; 38 | 39 | /** Job parameter that specifies where the masters are */ 40 | static final String MASTER_ADDRESSES_KEY = "kudu.mapreduce.master.addresses"; 41 | 42 | /** Job parameter that specifies how long we wait for operations to complete */ 43 | static final String OPERATION_TIMEOUT_MS_KEY = "kudu.mapreduce.operation.timeout.ms"; 44 | 45 | /** Number of rows that are buffered before flushing to the tablet server */ 46 | static final String BUFFER_ROW_COUNT_KEY = "kudu.mapreduce.buffer.row.count"; 47 | 48 | /** 49 | * Job parameter that specifies which key is to be used to reach the HiveKuduTableOutputFormat 50 | * belonging to the caller 51 | */ 52 | static final String MULTITON_KEY = "kudu.mapreduce.multitonkey"; 53 | 54 | /** 55 | * This multiton is used so that the tasks using this output format/record writer can find 56 | * their KuduTable without having a direct dependency on this class, 57 | * with the additional complexity that the output format cannot be shared between threads. 58 | */ 59 | private static final ConcurrentHashMap MULTITON = new 60 | ConcurrentHashMap(); 61 | 62 | /** 63 | * This counter helps indicate which task log to look at since rows that weren't applied will 64 | * increment this counter. 
65 | */ 66 | public enum Counters { ROWS_WITH_ERRORS } 67 | 68 | private Configuration conf = null; 69 | 70 | private KuduClient client; 71 | private KuduTable table; 72 | private KuduSession session; 73 | private long operationTimeoutMs; 74 | 75 | @Override 76 | public void setConf(Configuration entries) { 77 | LOG.warn("I was called : setConf"); 78 | this.conf = new Configuration(entries); 79 | 80 | String masterAddress = this.conf.get(MASTER_ADDRESSES_KEY); 81 | String tableName = this.conf.get(OUTPUT_TABLE_KEY); 82 | this.operationTimeoutMs = this.conf.getLong(OPERATION_TIMEOUT_MS_KEY, 83 | AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS); 84 | int bufferSpace = this.conf.getInt(BUFFER_ROW_COUNT_KEY, 1000); 85 | 86 | LOG.warn(" the master address here is " + masterAddress); 87 | 88 | this.client = new KuduClient.KuduClientBuilder(masterAddress) 89 | .defaultOperationTimeoutMs(operationTimeoutMs) 90 | .build(); 91 | try { 92 | this.table = client.openTable(tableName); 93 | } catch (Exception ex) { 94 | throw new RuntimeException("Could not obtain the table from the master, " + 95 | "is the master running and is this table created? tablename=" + tableName + " and " + 96 | "master address= " + masterAddress, ex); 97 | } 98 | this.session = client.newSession(); 99 | this.session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_BACKGROUND); 100 | this.session.setMutationBufferSpace(bufferSpace); 101 | this.session.setIgnoreAllDuplicateRows(true); 102 | String multitonKey = String.valueOf(Thread.currentThread().getId()); 103 | assert(MULTITON.get(multitonKey) == null); 104 | MULTITON.put(multitonKey, this); 105 | entries.set(MULTITON_KEY, multitonKey); 106 | } 107 | 108 | private void shutdownClient() throws IOException { 109 | LOG.warn("I was called : shutdownClient"); 110 | try { 111 | client.shutdown(); 112 | } catch (Exception e) { 113 | throw new IOException(e); 114 | } 115 | } 116 | 117 | public static KuduTable getKuduTable(String multitonKey) { 118 | LOG.warn("I was called : getKuduTable"); 119 | return MULTITON.get(multitonKey).getKuduTable(); 120 | } 121 | 122 | private KuduTable getKuduTable() { 123 | LOG.warn("I was called : getKuduTable"); 124 | return this.table; 125 | } 126 | 127 | @Override 128 | public Configuration getConf() { 129 | LOG.warn("I was called : getConf"); 130 | return conf; 131 | } 132 | 133 | 134 | @Override 135 | public RecordWriter getRecordWriter(FileSystem fileSystem, JobConf jobConf, String s, Progressable progressable) 136 | throws IOException { 137 | LOG.warn("I was called : getRecordWriter"); 138 | return new TableRecordWriter(this.session); 139 | } 140 | 141 | @Override 142 | public void checkOutputSpecs(FileSystem fileSystem, JobConf jobConf) throws IOException { 143 | LOG.warn("I was called : checkOutputSpecs"); 144 | shutdownClient(); 145 | } 146 | 147 | /* 148 | @Override 149 | public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) throws 150 | IOException, InterruptedException { 151 | return new KuduTableOutputCommitter(); 152 | } 153 | */ 154 | 155 | protected class TableRecordWriter implements RecordWriter { 156 | 157 | private final AtomicLong rowsWithErrors = new AtomicLong(); 158 | private final KuduSession session; 159 | 160 | public TableRecordWriter(KuduSession session) { 161 | LOG.warn("I was called : TableRecordWriter"); 162 | this.session = session; 163 | } 164 | 165 | private Operation getOperation(HiveKuduWritable hiveKuduWritable) 166 | throws IOException{ 167 | LOG.warn("I was called : getOperation"); 168 | 
int recCount = hiveKuduWritable.getColCount(); 169 | Schema schema = table.getSchema(); 170 | int colCount = schema.getColumnCount(); 171 | if (recCount != colCount) { 172 | throw new IOException("Kudu table column count of " + colCount + " does not match " 173 | + "with Serialized object record count of " + recCount); 174 | } 175 | //TODO: Find out if the record needs to be updated or deleted. 176 | //Assume only insert 177 | 178 | Insert insert = table.newInsert(); 179 | PartialRow row = insert.getRow(); 180 | 181 | for (int i = 0; i < recCount; i++) { 182 | Object obj = hiveKuduWritable.get(i); 183 | LOG.warn("From Writable Column value of " + i + " is " + obj.toString() + " and type is " + hiveKuduWritable.getType(i).name()); 184 | LOG.warn("From Schema Column name of " + i + " is " + schema.getColumnByIndex(i).getName()); 185 | switch(hiveKuduWritable.getType(i)) { 186 | case STRING: { 187 | LOG.warn("I was called : STRING"); 188 | String s = obj.toString(); 189 | row.addString(i, s); 190 | break; 191 | } 192 | case FLOAT: { 193 | LOG.warn("I was called : FLOAT"); 194 | Float f = (Float) obj; 195 | row.addFloat(i, f); 196 | break; 197 | } 198 | case DOUBLE: { 199 | LOG.warn("I was called : DOUBLE"); 200 | Double d = (Double) obj; 201 | row.addDouble(i, d); 202 | break; 203 | } 204 | case BOOL: { 205 | LOG.warn("I was called : BOOL"); 206 | Boolean b = (Boolean) obj; 207 | row.addBoolean(i, b); 208 | break; 209 | } 210 | case INT8: { 211 | LOG.warn("I was called : INT8"); 212 | Byte b = (Byte) obj; 213 | row.addByte(i, b); 214 | break; 215 | } 216 | case INT16: { 217 | LOG.warn("I was called : INT16"); 218 | Short s = (Short) obj; 219 | row.addShort(i, s); 220 | break; 221 | } 222 | case INT32: { 223 | LOG.warn("I was called : INT32"); 224 | Integer x = (Integer) obj; 225 | row.addInt(i, x); 226 | break; 227 | } 228 | case INT64: { 229 | LOG.warn("I was called : INT64"); 230 | Long l = (Long) obj; 231 | row.addLong(i, l); 232 | break; 233 | } 234 | case TIMESTAMP: { 235 | LOG.warn("I was called : TIMESTAMP"); 236 | Long time = (Long) obj; 237 | row.addLong(i, time); 238 | break; 239 | } 240 | case BINARY: { 241 | LOG.warn("I was called : BINARY"); 242 | byte[] b = (byte[]) obj; 243 | row.addBinary(i, b); 244 | break; 245 | } 246 | default: 247 | throw new IOException("Cannot write Object '" 248 | + obj.getClass().getSimpleName() + "' as type: " + hiveKuduWritable.getType(i).name()); 249 | } 250 | } 251 | 252 | return insert; 253 | } 254 | @Override 255 | public void write(NullWritable key, HiveKuduWritable kw) 256 | throws IOException { 257 | try { 258 | LOG.warn("I was called : write"); 259 | Operation operation = getOperation(kw); 260 | session.apply(operation); 261 | 262 | //read from Kudu if the insert was successful 263 | List projectColumns = new ArrayList<>(2); 264 | projectColumns.add("id"); 265 | projectColumns.add("name"); 266 | KuduScanner scanner = client.newScannerBuilder(table) 267 | .setProjectedColumnNames(projectColumns) 268 | .build(); 269 | 270 | while (scanner.hasMoreRows()) { 271 | RowResultIterator results = scanner.nextRows(); 272 | while (results.hasNext()) { 273 | RowResult result = results.next(); 274 | LOG.warn("Returned from kudu" + result.getInt(0) + ":" +result.getString(1)); 275 | } 276 | } 277 | 278 | LOG.warn("applying operation"); 279 | 280 | } catch (Exception e) { 281 | throw new IOException("Encountered an error while writing", e); 282 | } 283 | } 284 | 285 | @Override 286 | public void close(Reporter reporter) throws IOException { 287 | try { 288 | 
LOG.warn("I was called : close"); 289 | processRowErrors(session.close()); 290 | shutdownClient(); 291 | } catch (Exception e) { 292 | throw new IOException("Encountered an error while closing this task", e); 293 | } finally { 294 | if (reporter != null) { 295 | // This is the only place where we have access to the context in the record writer, 296 | // so set the counter here. 297 | reporter.getCounter(Counters.ROWS_WITH_ERRORS).setValue(rowsWithErrors.get()); 298 | } 299 | } 300 | } 301 | 302 | private void processRowErrors(List responses) { 303 | LOG.warn("I was called : processRowErrors"); 304 | List errors = OperationResponse.collectErrors(responses); 305 | if (!errors.isEmpty()) { 306 | int rowErrorsCount = errors.size(); 307 | rowsWithErrors.addAndGet(rowErrorsCount); 308 | LOG.warn("Got per errors for " + rowErrorsCount + " rows, " + 309 | "the first one being " + errors.get(0).getStatus()); 310 | } 311 | } 312 | } 313 | } --------------------------------------------------------------------------------