├── .gitignore ├── LICENSE ├── README.md ├── Resources ├── HiveKudu-Handler-0.0.1.jar ├── async-1.3.1.jar ├── kudu-client-0.6.0.jar └── kudu-mapreduce-0.7.1.jar ├── doc ├── DesignDocument.md ├── README.md ├── UserGuide.md └── figures │ ├── StorageHandlerDesign.png │ └── hivekudu-design.png ├── pom.xml └── src └── main └── java └── org ├── apache └── hadoop │ └── hive │ └── kududb │ └── KuduHandler │ ├── HiveKuduBridgeUtils.java │ ├── HiveKuduConstants.java │ ├── HiveKuduSerDe.java │ ├── HiveKuduWritable.java │ └── KuduStorageHandler.java └── kududb └── mapred ├── HiveKuduTableInputFormat.java └── HiveKuduTableOutputFormat.java /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Java class files 3 | *.class 4 | 5 | # Windows thumbnail db 6 | Thumbs.db 7 | 8 | # OSX files 9 | .DS_Store 10 | 11 | # Eclipse project files 12 | .classpath 13 | .project 14 | 15 | # IntelliJ 16 | *.iml 17 | .idea 18 | /sync/app/build/ 19 | 20 | #Maven target 21 | target/ 22 | 23 | # Project 24 | /sync/app/test 25 | 26 | #test data 27 | data/ 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #Update April 29th 2016 2 | Hive on Spark is working but there is a connection drop in my InputFormat, which is currently running on a Band-Aid. Please use branch-0.0.2 if you want to use Hive on Spark. 3 | 4 | # HiveKudu-Handler 5 | Hive Kudu Storage Handler, Input & Output format, Writable and SerDe 6 | 7 | This is the first release of Hive on Kudu. 
8 | 9 | I have placed the jars in the Resource folder which you can add in hive and test. 10 | 11 | If you would like to build from source then make install and use "HiveKudu-Handler-0.0.1.jar" to add in hive cli or hiveserver2 lib path. 12 | 13 | ## Working Test case 14 | ### simple_test.sql 15 | ```sql 16 | add jar HiveKudu-Handler-0.0.1.jar; 17 | add jar kudu-client-0.6.0.jar; 18 | add jar async-1.3.1.jar; 19 | 20 | set hive.cli.print.header=true; 21 | 22 | CREATE TABLE if not exists test_drop ( 23 | id INT, 24 | name STRING 25 | ) 26 | stored by 'org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler' 27 | TBLPROPERTIES( 28 | 'kudu.table_name' = 'test_drop', 29 | 'kudu.master_addresses' = 'ip-172-31-56-74.ec2.internal:7051', 30 | 'kudu.key_columns' = 'id' 31 | ); 32 | 33 | describe formatted test_drop; 34 | 35 | insert into test_drop values (1, 'a'), (2, 'b'), (3, 'a'); 36 | 37 | select count(*) from test_drop; 38 | 39 | select id from test_Drop where name = 'a'; 40 | 41 | select name, count(*) from test_drop group by name; 42 | 43 | drop table test_Drop; 44 | ``` 45 | 46 | ### Output of simple test 47 | ``` 48 | 49 | Logging initialized using configuration in jar:file:/opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/jars/hive-common-1.1.0-cdh5.5.2.jar!/hive-log4j.properties 50 | add jar HiveKudu-Handler-0.0.1.jar 51 | Added [HiveKudu-Handler-0.0.1.jar] to class path 52 | Added resources: [HiveKudu-Handler-0.0.1.jar] 53 | add jar kudu-client-0.6.0.jar 54 | Added [kudu-client-0.6.0.jar] to class path 55 | Added resources: [kudu-client-0.6.0.jar] 56 | add jar async-1.3.1.jar 57 | Added [async-1.3.1.jar] to class path 58 | Added resources: [async-1.3.1.jar] 59 | set hive.cli.print.header=true 60 | 61 | 62 | CREATE TABLE if not exists test_drop ( 63 | id INT, 64 | name STRING 65 | ) 66 | stored by 'org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler' 67 | TBLPROPERTIES( 68 | 'kudu.table_name' = 'test_drop', 69 | 'kudu.master_addresses' = 'ip-172-31-56-74.ec2.internal:7051', 70 | 'kudu.key_columns' = 'id' 71 | ) 72 | OK 73 | Time taken: 2.924 seconds 74 | 75 | 76 | describe formatted test_drop 77 | OK 78 | col_name data_type comment 79 | # col_name data_type comment 80 | 81 | id int from deserializer 82 | name string from deserializer 83 | 84 | # Detailed Table Information 85 | Database: default 86 | Owner: hdfs 87 | CreateTime: Fri Apr 15 00:45:42 EDT 2016 88 | LastAccessTime: UNKNOWN 89 | Protect Mode: None 90 | Retention: 0 91 | Location: hdfs://ip-172-31-56-74.ec2.internal:8020/user/hive/warehouse/test_drop 92 | Table Type: MANAGED_TABLE 93 | Table Parameters: 94 | kudu.key_columns id 95 | kudu.master_addresses ip-172-31-56-74.ec2.internal:7051 96 | kudu.table_name test_drop 97 | storage_handler org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler 98 | transient_lastDdlTime 1460695542 99 | 100 | # Storage Information 101 | SerDe Library: org.apache.hadoop.hive.kududb.KuduHandler.HiveKuduSerDe 102 | InputFormat: null 103 | OutputFormat: null 104 | Compressed: No 105 | Num Buckets: -1 106 | Bucket Columns: [] 107 | Sort Columns: [] 108 | Storage Desc Params: 109 | serialization.format 1 110 | Time taken: 0.277 seconds, Fetched: 31 row(s) 111 | 112 | 113 | insert into test_drop values (1, 'a'), (2, 'b'), (3, 'a') 114 | Query ID = hdfs_20160415004545_5d94fdd4-d6e1-4fe3-b6ef-29eda4f309e5 115 | Total jobs = 1 116 | Launching Job 1 out of 1 117 | Number of reduce tasks is set to 0 since there's no reduce operator 118 | Starting Job = job_1460484956690_0052, Tracking URL = 
http://ip-172-31-56-74.ec2.internal:8088/proxy/application_1460484956690_0052/ 119 | Kill Command = /opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/bin/hadoop job -kill job_1460484956690_0052 120 | Hadoop job information for Stage-0: number of mappers: 1; number of reducers: 0 121 | 2016-04-15 00:45:53,003 Stage-0 map = 0%, reduce = 0% 122 | 2016-04-15 00:46:00,375 Stage-0 map = 100%, reduce = 0%, Cumulative CPU 1.73 sec 123 | MapReduce Total cumulative CPU time: 1 seconds 730 msec 124 | Ended Job = job_1460484956690_0052 125 | MapReduce Jobs Launched: 126 | Stage-Stage-0: Map: 1 Cumulative CPU: 1.73 sec HDFS Read: 3934 HDFS Write: 0 SUCCESS 127 | Total MapReduce CPU Time Spent: 1 seconds 730 msec 128 | OK 129 | _col0 _col1 130 | Time taken: 18.704 seconds 131 | 132 | 133 | select count(*) from test_drop 134 | Query ID = hdfs_20160415004646_ee73a7b3-1beb-4dc7-b102-aa5ccf322f10 135 | Total jobs = 1 136 | Launching Job 1 out of 1 137 | Number of reduce tasks determined at compile time: 1 138 | In order to change the average load for a reducer (in bytes): 139 | set hive.exec.reducers.bytes.per.reducer= 140 | In order to limit the maximum number of reducers: 141 | set hive.exec.reducers.max= 142 | In order to set a constant number of reducers: 143 | set mapreduce.job.reduces= 144 | Starting Job = job_1460484956690_0053, Tracking URL = http://ip-172-31-56-74.ec2.internal:8088/proxy/application_1460484956690_0053/ 145 | Kill Command = /opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/bin/hadoop job -kill job_1460484956690_0053 146 | Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1 147 | 2016-04-15 00:46:09,350 Stage-1 map = 0%, reduce = 0% 148 | 2016-04-15 00:46:15,773 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.39 sec 149 | 2016-04-15 00:46:24,094 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 2.98 sec 150 | MapReduce Total cumulative CPU time: 2 seconds 980 msec 151 | Ended Job = job_1460484956690_0053 152 | MapReduce Jobs Launched: 153 | Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 2.98 sec HDFS Read: 6865 HDFS Write: 2 SUCCESS 154 | Total MapReduce CPU Time Spent: 2 seconds 980 msec 155 | OK 156 | _c0 157 | 3 158 | Time taken: 23.661 seconds, Fetched: 1 row(s) 159 | 160 | 161 | select id from test_Drop where name = 'a' 162 | Query ID = hdfs_20160415004646_fc52eb30-7464-4ff3-a83f-91b0db8d73df 163 | Total jobs = 1 164 | Launching Job 1 out of 1 165 | Number of reduce tasks is set to 0 since there's no reduce operator 166 | Starting Job = job_1460484956690_0054, Tracking URL = http://ip-172-31-56-74.ec2.internal:8088/proxy/application_1460484956690_0054/ 167 | Kill Command = /opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/bin/hadoop job -kill job_1460484956690_0054 168 | Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 0 169 | 2016-04-15 00:46:32,780 Stage-1 map = 0%, reduce = 0% 170 | 2016-04-15 00:46:40,051 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 2.11 sec 171 | MapReduce Total cumulative CPU time: 2 seconds 110 msec 172 | Ended Job = job_1460484956690_0054 173 | MapReduce Jobs Launched: 174 | Stage-Stage-1: Map: 1 Cumulative CPU: 2.11 sec HDFS Read: 3991 HDFS Write: 4 SUCCESS 175 | Total MapReduce CPU Time Spent: 2 seconds 110 msec 176 | OK 177 | id 178 | 1 179 | 3 180 | Time taken: 15.94 seconds, Fetched: 2 row(s) 181 | 182 | 183 | select name, count(*) from test_drop group by name 184 | Query ID = hdfs_20160415004646_15757794-72c2-45a7-84b4-ba7b0a5d4405 185 | Total jobs = 1 186 | 
Launching Job 1 out of 1 187 | Number of reduce tasks not specified. Estimated from input data size: 1 188 | In order to change the average load for a reducer (in bytes): 189 | set hive.exec.reducers.bytes.per.reducer= 190 | In order to limit the maximum number of reducers: 191 | set hive.exec.reducers.max= 192 | In order to set a constant number of reducers: 193 | set mapreduce.job.reduces= 194 | Starting Job = job_1460484956690_0055, Tracking URL = http://ip-172-31-56-74.ec2.internal:8088/proxy/application_1460484956690_0055/ 195 | Kill Command = /opt/cloudera/parcels/CDH-5.5.2-1.cdh5.5.2.p0.4/lib/hadoop/bin/hadoop job -kill job_1460484956690_0055 196 | Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1 197 | 2016-04-15 00:46:48,532 Stage-1 map = 0%, reduce = 0% 198 | 2016-04-15 00:46:55,742 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 1.4 sec 199 | 2016-04-15 00:47:02,987 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 2.84 sec 200 | MapReduce Total cumulative CPU time: 2 seconds 840 msec 201 | Ended Job = job_1460484956690_0055 202 | MapReduce Jobs Launched: 203 | Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 2.84 sec HDFS Read: 7341 HDFS Write: 8 SUCCESS 204 | Total MapReduce CPU Time Spent: 2 seconds 840 msec 205 | OK 206 | name _c1 207 | a 2 208 | b 1 209 | Time taken: 22.899 seconds, Fetched: 2 row(s) 210 | 211 | 212 | drop table test_Drop 213 | OK 214 | Time taken: 0.113 seconds 215 | WARN: The method class org.apache.commons.logging.impl.SLF4JLogFactory#release() was invoked. 216 | WARN: Please see http://www.slf4j.org/codes.html#release for an explanation. 217 | 218 | ``` 219 | -------------------------------------------------------------------------------- /Resources/HiveKudu-Handler-0.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/Resources/HiveKudu-Handler-0.0.1.jar -------------------------------------------------------------------------------- /Resources/async-1.3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/Resources/async-1.3.1.jar -------------------------------------------------------------------------------- /Resources/kudu-client-0.6.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/Resources/kudu-client-0.6.0.jar -------------------------------------------------------------------------------- /Resources/kudu-mapreduce-0.7.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/Resources/kudu-mapreduce-0.7.1.jar -------------------------------------------------------------------------------- /doc/DesignDocument.md: -------------------------------------------------------------------------------- 1 | # Hive On Kudu Design Document 2 | ## Design Goals 3 | The primary goal of developing Hive on Kudu is to fully leverage Hive and Kudu's capabilities. The goal is to release the first version of Hive on Kudu with the following features. 4 | 5 | 1. Support for Hive External tables to Kudu 6 | * SerDe to auto-create columns in Hive 7 | 2. 
Support for Hive managed tables on Kudu 8 | 3. Support basic and advanced partitioning capabilities of Kudu 9 | 4. Support for Hive transactions (Updates and Deletes) 10 | 5. Support for predicate push-down 11 | 12 | ## Design 13 | Hive provides a Storage Handler to integrate other storage systems with Hive. The primary example of a storage system that leverages a storage handler is HBase. Below are some useful links and examples for getting familiar with Storage Handlers in Hive. 14 | * [Hive Storage Handler guide](https://cwiki.apache.org/confluence/display/Hive/StorageHandlers) 15 | * [Hive-HBase Storage Handler](https://github.com/BimalTandel/hive/tree/master/hbase-handler) 16 | * [Hive-JDBC Storage Handler](https://github.com/qubole/Hive-JDBC-Storage-Handler) 17 | * [ElasticSearch Storage Handler](https://github.com/elastic/elasticsearch-hadoop/tree/master/hive/src/main/java/org/elasticsearch/hadoop/hive) 18 | 19 | In addition to a Storage Handler, Hive lets you supply a specific Input and Output format to handle reads and writes, and a SerDe to serialize and deserialize data to and from those formats. 20 | 21 | High-level interactions and relationships between the components: 22 | 23 | ![Custom Storage Handler Components](./figures/StorageHandlerDesign.png "Custom Storage Handler Components") 24 | 25 | To complete the integration of Hive and Kudu, the following components have to be developed: 26 | * HiveKudu Storage Handler 27 | * HiveKudu Input Format (with ACID support for Updates and Deletes) 28 | * HiveKudu Output Format (with ACID support for Updates and Deletes) 29 | * HiveKudu SerDe 30 | * HiveKudu Writable 31 | 32 | > Hive's interfaces and classes are all based on the MR1 APIs. I found it challenging to extend KuduTableInputFormat and KuduTableOutputFormat because they are based on the MR2 APIs. The only way to use them successfully would be to convert them and publish a version with MR1 APIs. 33 | 34 | ## Detailed Design (Work in Progress) 35 | ### HiveKudu Storage Handler 36 | Things that need further discussion: 37 | * How should we map Hive DDL to the partitioning options of Kudu tables? (A sketch of both options appears at the end of this document.) 38 | * Option 1: Use Hive's "CLUSTERED BY" and "INTO numbuckets BUCKETS" clauses. 39 | * Option 2: Use TBLPROPERTIES key/value pairs. 40 | * How should we decompose predicates to allow Kudu to filter records during table scans? We can attempt to match what Impala currently supports. 41 | 42 | ### HiveKudu SerDe and HiveKudu Writable 43 | * Review the current design of the Writable object. 44 | * Hive-to-Kudu datatype mappings (Kudu treats timestamps as LONG). 45 | 46 | ### HiveKudu Input & Output format 47 | * Can we leverage the ACID Input/Output formats for Kudu? 48 | * How can we leverage Hive transactions for Kudu?
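To make the partitioning question above concrete, here is a rough sketch of how the two DDL options might look. It is illustrative only: `kudu.table_name`, `kudu.master_addresses`, and `kudu.key_columns` are the properties the handler already understands, the master address is a placeholder, and `kudu.hash_partition_columns` / `kudu.num_buckets` are hypothetical keys that would only exist if Option 2 were adopted.

```sql
-- Option 1 (sketch): reuse Hive's CLUSTERED BY ... INTO n BUCKETS clause
-- to describe Kudu hash partitioning.
CREATE TABLE events_opt1 (
  id BIGINT,
  name STRING
)
CLUSTERED BY (id) INTO 16 BUCKETS
STORED BY 'org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler'
TBLPROPERTIES (
  'kudu.table_name'       = 'events',
  'kudu.master_addresses' = 'kudumaster1:7051',  -- placeholder address
  'kudu.key_columns'      = 'id'
);

-- Option 2 (sketch): describe partitioning purely through TBLPROPERTIES.
-- The two partitioning keys below are hypothetical and not implemented.
CREATE TABLE events_opt2 (
  id BIGINT,
  name STRING
)
STORED BY 'org.apache.hadoop.hive.kududb.KuduHandler.KuduStorageHandler'
TBLPROPERTIES (
  'kudu.table_name'             = 'events',
  'kudu.master_addresses'       = 'kudumaster1:7051',  -- placeholder address
  'kudu.key_columns'            = 'id',
  'kudu.hash_partition_columns' = 'id',
  'kudu.num_buckets'            = '16'
);
```

Option 1 reuses syntax Hive already parses for bucketing, while Option 2 keeps the entire Kudu-specific surface in TBLPROPERTIES at the cost of introducing new property names.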
49 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | ## Documents related to Hive on Kudu 2 | * [Design Document](DesignDocument.MD) 3 | * [User Guide](UserGuide.MD) 4 | 5 | -------------------------------------------------------------------------------- /doc/UserGuide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/doc/UserGuide.md -------------------------------------------------------------------------------- /doc/figures/StorageHandlerDesign.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/doc/figures/StorageHandlerDesign.png -------------------------------------------------------------------------------- /doc/figures/hivekudu-design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BimalTandel/HiveKudu-Handler/70ff341ca642fe9de74048a5f70833ee0b6b3baa/doc/figures/hivekudu-design.png -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.cloudera.ps.HiveKudu 8 | KuduHandler 9 | 0.0.1 10 | 11 | 12 | UTF-8 13 | 14 | 15 | 16 | 17 | cdh.repo 18 | Cloudera Repositories 19 | https://repository.cloudera.com/artifactory/cloudera-repos 20 | 21 | false 22 | 23 | 24 | 25 | 26 | 27 | 28 | org.apache.hive 29 | hive-exec 30 | 1.1.0 31 | compile 32 | 33 | 34 | 35 | junit 36 | junit 37 | 4.10 38 | test 39 | 40 | 41 | org.easymock 42 | easymock 43 | 3.1 44 | test 45 | 46 | 47 | org.apache.hadoop 48 | hadoop-common 49 | 2.6.1 50 | provided 51 | 52 | 53 | org.apache.hadoop 54 | hadoop-mapreduce-client-common 55 | 2.6.1 56 | provided 57 | 58 | 59 | org.kududb 60 | kudu-client 61 | 0.7.1 62 | provided 63 | 64 | 65 | org.kududb 66 | kudu-mapreduce 67 | 0.7.1 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | org.apache.maven.plugins 76 | maven-compiler-plugin 77 | 2.5.1 78 | 79 | 1.7 80 | 1.7 81 | true 82 | true 83 | true 84 | 85 | 86 | 87 | org.apache.maven.plugins 88 | maven-shade-plugin 89 | 2.3 90 | 91 | 92 | 93 | com.amazonaws:* 94 | com.google.guava:guava 95 | joda-time:joda-time 96 | 97 | 98 | 99 | 100 | com.google 101 | qksh.shaded.com.google 102 | 103 | 104 | com.amazonaws 105 | qksh.shaded.com.amazonaws 106 | 107 | 108 | org.joda 109 | qksh.shaded.org.joda 110 | 111 | 112 | HiveKudu-Handler-${project.version} 113 | 114 | 115 | 116 | package 117 | 118 | shade 119 | 120 | 121 | false 122 | 123 | 124 | 125 | 126 | 127 | org.apache.maven.plugins 128 | maven-assembly-plugin 129 | 130 | 131 | jar-with-dependencies 132 | 133 | 134 | 135 | 136 | assemble-all 137 | package 138 | 139 | single 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/HiveKuduBridgeUtils.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 
6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | import org.apache.hadoop.hive.serde2.SerDeException; 20 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 21 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; 22 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 23 | import org.kududb.Type; 24 | 25 | import java.io.DataInput; 26 | import java.io.DataOutput; 27 | import java.io.IOException; 28 | import java.sql.Timestamp; 29 | 30 | 31 | /** 32 | * Created by bimal on 4/12/16. 33 | */ 34 | public class HiveKuduBridgeUtils { 35 | 36 | public static Type hiveTypeToKuduType(String hiveType) throws SerDeException { 37 | final String lchiveType = hiveType.toLowerCase(); 38 | switch(lchiveType) { 39 | case "string": 40 | case "varchar": 41 | case "char": 42 | return Type.STRING; 43 | 44 | case "tinyint": 45 | return Type.INT8; 46 | case "smallint": 47 | return Type.INT16; 48 | case "int": 49 | return Type.INT32; 50 | case "bigint": 51 | return Type.INT64; 52 | case "float": 53 | return Type.FLOAT; 54 | case "double": 55 | return Type.DOUBLE; 56 | 57 | case "timestamp": 58 | return Type.TIMESTAMP; 59 | 60 | case "boolean": 61 | return Type.BOOL; 62 | 63 | case "binary": 64 | return Type.BINARY; 65 | default: 66 | throw new SerDeException("Unrecognized column type: " + hiveType + " not supported in Kudu"); 67 | } 68 | } 69 | 70 | public static ObjectInspector getObjectInspector(Type kuduType, 71 | String hiveType) throws SerDeException { 72 | switch (kuduType) { 73 | case STRING: 74 | return PrimitiveObjectInspectorFactory.javaStringObjectInspector; 75 | case FLOAT: 76 | return PrimitiveObjectInspectorFactory.javaFloatObjectInspector; 77 | case DOUBLE: 78 | return PrimitiveObjectInspectorFactory.javaDoubleObjectInspector; 79 | case BOOL: 80 | return PrimitiveObjectInspectorFactory.javaBooleanObjectInspector; 81 | case INT8: 82 | return PrimitiveObjectInspectorFactory.javaByteObjectInspector; 83 | case INT16: 84 | return PrimitiveObjectInspectorFactory.javaShortObjectInspector; 85 | case INT32: 86 | return PrimitiveObjectInspectorFactory.javaIntObjectInspector; 87 | case INT64: 88 | return PrimitiveObjectInspectorFactory.javaLongObjectInspector; 89 | case TIMESTAMP: 90 | return PrimitiveObjectInspectorFactory.javaTimestampObjectInspector; 91 | case BINARY: 92 | return PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector; 93 | default: 94 | throw new SerDeException("Cannot find getObjectInspector for: " 95 | + hiveType); 96 | } 97 | } 98 | 99 | public static Object deparseObject(Object field, ObjectInspector fieldOI) 100 | throws SerDeException { 101 | switch (fieldOI.getCategory()) { 102 | case PRIMITIVE: { 103 | PrimitiveObjectInspector oi = (PrimitiveObjectInspector) fieldOI; 104 | return oi.getPrimitiveJavaObject(field); 105 | } 106 | 107 | //Kudu doesnt support LIST or MAP based data types 108 | 109 | default: 110 | throw new SerDeException("Unexpected fieldOI: " + fieldOI); 111 | } 112 | } 113 | 114 | 115 | public static Object 
readObject(DataInput in, Type kuduType) 116 | throws IOException { 117 | switch (kuduType) { 118 | case STRING: 119 | return in.readUTF(); 120 | case FLOAT: 121 | return Float.valueOf(in.readFloat()); 122 | case DOUBLE: 123 | return Double.valueOf(in.readDouble()); 124 | case BOOL: 125 | return Boolean.valueOf(in.readBoolean()); 126 | case INT8: 127 | return Byte.valueOf(in.readByte()); 128 | case INT16: 129 | return Short.valueOf(in.readShort()); 130 | case INT32: 131 | return Integer.valueOf(in.readInt()); 132 | case INT64: 133 | return Long.valueOf(in.readLong()); 134 | case TIMESTAMP: { 135 | long time = in.readLong(); 136 | return new Timestamp(time); 137 | } 138 | case BINARY: { 139 | int size = in.readInt(); 140 | byte[] b = new byte[size]; 141 | in.readFully(b); 142 | return b; 143 | } 144 | default: 145 | throw new IOException("Cannot read Object for type: " + kuduType.name()); 146 | } 147 | } 148 | 149 | public static void writeObject(Object obj, Type kuduType, DataOutput out) 150 | throws IOException { 151 | switch (kuduType) { 152 | case STRING: { 153 | String s = obj.toString(); 154 | out.writeUTF(s); 155 | return; 156 | } 157 | case FLOAT: { 158 | Float f = (Float) obj; 159 | out.writeFloat(f); 160 | return; 161 | } 162 | case DOUBLE: { 163 | Double d = (Double) obj; 164 | out.writeDouble(d); 165 | return; 166 | } 167 | case BOOL: { 168 | Boolean b = (Boolean) obj; 169 | out.writeBoolean(b); 170 | return; 171 | } 172 | case INT8: { 173 | Byte b = (Byte) obj; 174 | out.writeByte(b.intValue()); 175 | return; 176 | } 177 | case INT16: { 178 | Short s = (Short) obj; 179 | out.writeShort(s.shortValue()); 180 | return; 181 | } 182 | case INT32: { 183 | Integer i = (Integer) obj; 184 | out.writeInt(i.intValue()); 185 | return; 186 | } 187 | case INT64: { 188 | Long l = (Long) obj; 189 | out.writeLong(l.longValue()); 190 | return; 191 | } 192 | case TIMESTAMP: { 193 | Timestamp time = (Timestamp) obj; 194 | out.writeLong(time.getTime()); 195 | return; 196 | } 197 | case BINARY: { 198 | byte[] b = (byte[]) obj; 199 | out.writeInt(b.length); 200 | out.write(b); 201 | return; 202 | } 203 | default: 204 | throw new IOException("Cannot write Object '" 205 | + obj.getClass().getSimpleName() + "' as type: " + kuduType.name()); 206 | } 207 | } 208 | } 209 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/HiveKuduConstants.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | /** 20 | * Created by bimal on 4/11/16. 
21 | */ 22 | 23 | public final class HiveKuduConstants { 24 | 25 | //Table Properties 26 | public static final String LIST_COLUMNS = "columns"; 27 | public static final String LIST_COLUMN_TYPES = "columns.types"; 28 | public static final String MASTER_ADDRESS_NAME = "kudu.master_addresses"; 29 | public static final String TABLE_NAME = "kudu.table_name"; 30 | public static final String KEY_COLUMNS = "kudu.key_columns"; 31 | 32 | //SerDe Properties 33 | 34 | //MapReduce Properties 35 | public static final String MR_INPUT_TABLE_NAME = "kudu.mapreduce.input.table"; 36 | public static final String MR_OUTPUT_TABLE_NAME = "kudu.mapreduce.output.table"; 37 | public static final String MR_MASTER_ADDRESS_NAME = "kudu.mapreduce.master.addresses"; 38 | public static final String MR_PROPERTY_PREFIX = "kudu.mapreduce."; 39 | //DEFAULT VALUES & Getters for Default values 40 | 41 | 42 | private HiveKuduConstants() { 43 | } 44 | } -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/HiveKuduSerDe.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | import org.apache.commons.logging.Log; 20 | import org.apache.commons.logging.LogFactory; 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.hive.serde2.SerDe; 23 | import org.apache.hadoop.hive.serde2.SerDeException; 24 | import org.apache.hadoop.hive.serde2.SerDeStats; 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 26 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 27 | import org.apache.hadoop.hive.serde2.objectinspector.StructField; 28 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 29 | import org.apache.hadoop.io.Writable; 30 | import org.kududb.Type; 31 | 32 | import java.util.ArrayList; 33 | import java.util.Arrays; 34 | import java.util.List; 35 | import java.util.Properties; 36 | 37 | 38 | /** 39 | * Created by bimal on 4/12/16. 40 | */ 41 | 42 | public class HiveKuduSerDe implements SerDe { 43 | 44 | private static final Log LOG = LogFactory.getLog(HiveKuduSerDe.class); 45 | 46 | private HiveKuduWritable cachedWritable; //Currently Update/Delete not supported from Hive. 
47 | 48 | private int fieldCount; 49 | 50 | private StructObjectInspector objectInspector; 51 | private List deserializeCache; 52 | 53 | public HiveKuduSerDe() { 54 | } 55 | 56 | @Override 57 | public void initialize(Configuration sysConf, Properties tblProps) 58 | throws SerDeException { 59 | 60 | LOG.debug("tblProps: " + tblProps); 61 | 62 | String columnNameProperty = tblProps 63 | .getProperty(HiveKuduConstants.LIST_COLUMNS); 64 | String columnTypeProperty = tblProps 65 | .getProperty(HiveKuduConstants.LIST_COLUMN_TYPES); 66 | 67 | if (columnNameProperty.length() == 0 68 | && columnTypeProperty.length() == 0) { 69 | //This is where we will implement option to connect to Kudu and get the column list using Serde. 70 | } 71 | 72 | List columnNames = Arrays.asList(columnNameProperty.split(",")); 73 | 74 | String[] columnTypes = columnTypeProperty.split(":"); 75 | 76 | if (columnNames.size() != columnTypes.length) { 77 | throw new SerDeException("Splitting column and types failed." + "columnNames: " 78 | + columnNames + ", columnTypes: " 79 | + Arrays.toString(columnTypes)); 80 | } 81 | 82 | final Type[] types = new Type[columnTypes.length]; 83 | 84 | for (int i = 0; i < types.length; i++) { 85 | types[i] = HiveKuduBridgeUtils.hiveTypeToKuduType(columnTypes[i]); 86 | } 87 | 88 | this.cachedWritable = new HiveKuduWritable(types); 89 | 90 | this.fieldCount = types.length; 91 | 92 | final List fieldOIs = new ArrayList<>(columnTypes.length); 93 | 94 | for (int i = 0; i < types.length; i++) { 95 | ObjectInspector oi = HiveKuduBridgeUtils.getObjectInspector(types[i], columnTypes[i]); 96 | fieldOIs.add(oi); 97 | } 98 | 99 | this.objectInspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, fieldOIs); 100 | 101 | this.deserializeCache = new ArrayList<>(columnTypes.length); 102 | 103 | } 104 | 105 | @Override 106 | public ObjectInspector getObjectInspector() throws SerDeException { 107 | return objectInspector; 108 | } 109 | 110 | @Override 111 | public Class getSerializedClass() { 112 | return HiveKuduWritable.class; 113 | } 114 | 115 | @Override 116 | public HiveKuduWritable serialize(Object row, ObjectInspector inspector) 117 | throws SerDeException { 118 | 119 | final StructObjectInspector structInspector = (StructObjectInspector) inspector; 120 | final List fields = structInspector.getAllStructFieldRefs(); 121 | if (fields.size() != fieldCount) { 122 | throw new SerDeException(String.format( 123 | "Required %d columns, received %d.", fieldCount, 124 | fields.size())); 125 | } 126 | 127 | cachedWritable.clear(); 128 | 129 | for (int i = 0; i < fieldCount; i++) { 130 | StructField structField = fields.get(i); 131 | if (structField != null) { 132 | Object field = structInspector.getStructFieldData(row, 133 | structField); 134 | ObjectInspector fieldOI = structField.getFieldObjectInspector(); 135 | 136 | Object javaObject = HiveKuduBridgeUtils.deparseObject(field, 137 | fieldOI); 138 | LOG.warn("Column value of " + i + " is " + javaObject.toString()); 139 | cachedWritable.set(i, javaObject); 140 | } 141 | } 142 | return cachedWritable; 143 | } 144 | 145 | @Override 146 | public Object deserialize(Writable record) throws SerDeException { 147 | if (!(record instanceof HiveKuduWritable)) { 148 | throw new SerDeException("Expected HiveKuduWritable, received " 149 | + record.getClass().getName()); 150 | } 151 | HiveKuduWritable tuple = (HiveKuduWritable) record; 152 | deserializeCache.clear(); 153 | for (int i = 0; i < fieldCount; i++) { 154 | Object o = tuple.get(i); 155 | 
deserializeCache.add(o); 156 | } 157 | return deserializeCache; 158 | } 159 | 160 | @Override 161 | public SerDeStats getSerDeStats() { 162 | // TODO How to implement this? 163 | return null; 164 | } 165 | } 166 | 167 | 168 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/HiveKuduWritable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | import org.apache.hadoop.io.Writable; 20 | import org.apache.hadoop.io.WritableUtils; 21 | import org.kududb.Type; 22 | 23 | import java.io.DataInput; 24 | import java.io.DataOutput; 25 | import java.io.IOException; 26 | import java.util.Arrays; 27 | 28 | /** 29 | * Created by bimal on 4/12/16. 30 | */ 31 | public class HiveKuduWritable implements Writable { 32 | 33 | 34 | private Object[] columnValues; 35 | private Type[] columnTypes; 36 | 37 | public HiveKuduWritable() { 38 | 39 | } 40 | 41 | public HiveKuduWritable(Type[] types) { 42 | this.columnValues = new Object[types.length]; 43 | this.columnTypes = types; 44 | } 45 | 46 | public void clear() { 47 | Arrays.fill(columnValues, null); 48 | } 49 | 50 | public void set(int i, Object javaObject) { 51 | columnValues[i] = javaObject; 52 | } 53 | 54 | public Object get(int i) { 55 | return columnValues[i]; 56 | } 57 | 58 | public Type getType(int i) { return columnTypes[i]; } 59 | 60 | public int getColCount() { 61 | return this.columnTypes.length; 62 | } 63 | 64 | @Override 65 | public void readFields(DataInput in) throws IOException { 66 | int size = in.readInt(); 67 | if (size == -1) { 68 | return; 69 | } 70 | if (columnValues == null) { 71 | this.columnValues = new Object[size]; 72 | this.columnTypes = new Type[size]; 73 | } else { 74 | clear(); 75 | } 76 | for (int i = 0; i < size; i++) { 77 | Type kuduType = WritableUtils.readEnum(in, Type.class); 78 | columnTypes[i] = kuduType; 79 | Object v = HiveKuduBridgeUtils.readObject(in, kuduType); 80 | columnValues[i] = v; 81 | } 82 | } 83 | @Override 84 | public void write(DataOutput out) throws IOException { 85 | if (columnValues == null) { 86 | out.writeInt(-1); 87 | return; 88 | } 89 | if (columnTypes == null) { 90 | out.writeInt(-1); 91 | return; 92 | } 93 | 94 | final Object[] values = this.columnValues; 95 | final Type[] types = this.columnTypes; 96 | 97 | out.writeInt(values.length); 98 | 99 | for (int i = 0; i < values.length; i++) { 100 | HiveKuduBridgeUtils.writeObject(values[i], types[i], out); 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/kududb/KuduHandler/KuduStorageHandler.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2016 Bimal Tandel 3 | 4 | Licensed under the Apache 
License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | package org.apache.hadoop.hive.kududb.KuduHandler; 18 | 19 | import org.apache.commons.logging.Log; 20 | import org.apache.commons.logging.LogFactory; 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.hive.metastore.HiveMetaHook; 23 | import org.apache.hadoop.hive.metastore.MetaStoreUtils; 24 | import org.apache.hadoop.hive.metastore.api.FieldSchema; 25 | import org.apache.hadoop.hive.metastore.api.MetaException; 26 | import org.apache.hadoop.hive.metastore.api.Table; 27 | import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler; 28 | import org.apache.hadoop.hive.ql.metadata.HiveException; 29 | import org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler; 30 | import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; 31 | import org.apache.hadoop.hive.ql.plan.TableDesc; 32 | import org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider; 33 | import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider; 34 | import org.apache.hadoop.hive.serde2.Deserializer; 35 | import org.apache.hadoop.hive.serde2.SerDe; 36 | import org.apache.hadoop.mapred.JobConf; 37 | import org.apache.hadoop.mapred.InputFormat; 38 | import org.apache.hadoop.mapred.OutputFormat; 39 | import org.kududb.ColumnSchema; 40 | import org.kududb.Schema; 41 | import org.kududb.client.KuduClient; 42 | import org.kududb.client.CreateTableOptions; 43 | import org.kududb.mapred.HiveKuduTableInputFormat; 44 | import org.kududb.mapred.HiveKuduTableOutputFormat; 45 | 46 | import java.io.IOException; 47 | import java.util.*; 48 | 49 | /** 50 | * Created by bimal on 4/11/16. 51 | */ 52 | 53 | @SuppressWarnings({ "deprecation", "rawtypes" }) 54 | public class KuduStorageHandler extends DefaultStorageHandler 55 | implements HiveMetaHook, HiveStoragePredicateHandler { 56 | 57 | private static final Log LOG = LogFactory.getLog(KuduStorageHandler.class); 58 | 59 | private Configuration conf; 60 | 61 | private String kuduMaster; 62 | private String kuduTableName; 63 | 64 | @Override 65 | public Class getInputFormatClass() { 66 | return HiveKuduTableInputFormat.class; 67 | } 68 | 69 | @Override 70 | public Class getOutputFormatClass() { 71 | return HiveKuduTableOutputFormat.class; 72 | } 73 | 74 | @Override 75 | public Class getSerDeClass() { 76 | return HiveKuduSerDe.class; 77 | } 78 | 79 | private KuduClient getKuduClient(String master) throws MetaException { 80 | try { 81 | 82 | return new KuduClient.KuduClientBuilder(master).build(); 83 | } catch (Exception ioe){ 84 | throw new MetaException("Error creating Kudu Client: " + ioe); 85 | } 86 | } 87 | 88 | public KuduStorageHandler() { 89 | // TODO: Empty initializer?? 
90 | } 91 | 92 | @Override 93 | public Configuration getConf() { 94 | return conf; 95 | } 96 | 97 | @Override 98 | public void setConf(Configuration conf) { 99 | this.conf = conf; 100 | } 101 | 102 | @Override 103 | public HiveMetaHook getMetaHook() { 104 | return this; 105 | } 106 | 107 | @Override 108 | public void configureInputJobProperties(TableDesc tableDesc, 109 | Map jobProperties) { 110 | configureJobProperties(tableDesc, jobProperties); 111 | } 112 | 113 | @Override 114 | public void configureOutputJobProperties(TableDesc tableDesc, 115 | Map jobProperties) { 116 | configureJobProperties(tableDesc, jobProperties); 117 | } 118 | 119 | @Override 120 | public void configureTableJobProperties(TableDesc tableDesc, 121 | Map jobProperties) { 122 | configureJobProperties(tableDesc, jobProperties); 123 | } 124 | 125 | private void configureJobProperties(TableDesc tableDesc, 126 | Map jobProperties) { 127 | 128 | //This will always have the DB Name qualifier of Hive. Dont use this to set Kudu Tablename. 129 | String tblName = tableDesc.getTableName(); 130 | LOG.debug("Hive Table Name:" + tblName); 131 | Properties tblProps = tableDesc.getProperties(); 132 | String columnNames = tblProps.getProperty(HiveKuduConstants.LIST_COLUMNS); 133 | String columnTypes = tblProps.getProperty(HiveKuduConstants.LIST_COLUMN_TYPES); 134 | LOG.debug("Columns names:" + columnNames); 135 | LOG.debug("Column types:" + columnTypes); 136 | 137 | if (columnNames.length() == 0) { 138 | //TODO: Place keeper to insert SerDeHelper code to connect to Kudu to extract column names. 139 | LOG.warn("SerDe currently doesn't support column names and types. Please provide it explicitly"); 140 | } 141 | 142 | //set map reduce properties. 143 | jobProperties.put(HiveKuduConstants.MR_INPUT_TABLE_NAME, 144 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 145 | jobProperties.put(HiveKuduConstants.MR_OUTPUT_TABLE_NAME, 146 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 147 | jobProperties.put(HiveKuduConstants.MR_MASTER_ADDRESS_NAME, 148 | tblProps.getProperty(HiveKuduConstants.MASTER_ADDRESS_NAME)); 149 | 150 | LOG.debug("Kudu Table Name: " + tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 151 | LOG.debug("Kudu Master Addresses: " + tblProps.getProperty(HiveKuduConstants.MASTER_ADDRESS_NAME)); 152 | 153 | 154 | //set configuration property 155 | conf.set(HiveKuduConstants.MR_INPUT_TABLE_NAME, 156 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 157 | conf.set(HiveKuduConstants.MR_OUTPUT_TABLE_NAME, 158 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 159 | conf.set(HiveKuduConstants.MR_MASTER_ADDRESS_NAME, 160 | tblProps.getProperty(HiveKuduConstants.MASTER_ADDRESS_NAME)); 161 | 162 | conf.set(HiveKuduConstants.TABLE_NAME, 163 | tblProps.getProperty(HiveKuduConstants.TABLE_NAME)); 164 | conf.set(HiveKuduConstants.MASTER_ADDRESS_NAME, 165 | tblProps.getProperty(HiveKuduConstants.MASTER_ADDRESS_NAME)); 166 | 167 | //set class variables 168 | kuduMaster = conf.get(HiveKuduConstants.MASTER_ADDRESS_NAME); 169 | kuduTableName = conf.get(HiveKuduConstants.TABLE_NAME); 170 | 171 | for (String key : tblProps.stringPropertyNames()) { 172 | if (key.startsWith(HiveKuduConstants.MR_PROPERTY_PREFIX)) { 173 | String value = tblProps.getProperty(key); 174 | jobProperties.put(key, value); 175 | //Also set configuration for Non Map Reduce Hive calls to the Handler 176 | conf.set(key, value); 177 | } 178 | } 179 | } 180 | 181 | @Override 182 | public HiveAuthorizationProvider getAuthorizationProvider() 183 | throws 
HiveException { 184 | return new DefaultHiveAuthorizationProvider(); 185 | } 186 | 187 | @Override 188 | public DecomposedPredicate decomposePredicate(JobConf jobConf, 189 | Deserializer deserializer, ExprNodeDesc predicate) { 190 | // TODO: Implement push down to Kudu here. 191 | DecomposedPredicate decomposedPredicate = new DecomposedPredicate(); 192 | return decomposedPredicate; 193 | } 194 | 195 | private String getKuduTableName(Table tbl) { 196 | 197 | String tableName = conf.get(HiveKuduConstants.TABLE_NAME); 198 | if (tableName == null) { 199 | LOG.warn("Kudu Table name was not provided in table properties."); 200 | LOG.warn("Attempting to use Hive Table name"); 201 | tableName = tbl.getTableName().replaceAll(".*\\.", ""); 202 | LOG.warn("Kudu Table name will be: " + tableName); 203 | 204 | } 205 | return tableName; 206 | } 207 | 208 | private void printSchema(Schema schema) { 209 | if (schema == null) { 210 | return; 211 | } 212 | 213 | LOG.debug("Printing schema for Kudu table.."); 214 | for (ColumnSchema sch : schema.getColumns()) { 215 | LOG.debug("Column Name: " + sch.getName() 216 | + " [" + sch.getType().getName() + "]" 217 | + " key column: [" + sch.isKey() + "]" 218 | ); 219 | } 220 | } 221 | 222 | 223 | @Override 224 | public void preCreateTable(Table tbl) 225 | throws MetaException { 226 | KuduClient client = getKuduClient(tbl.getParameters().get(HiveKuduConstants.MASTER_ADDRESS_NAME)); 227 | 228 | boolean isExternal = MetaStoreUtils.isExternalTable(tbl); 229 | 230 | if (isExternal) { 231 | //TODO: Check if Kudu table exists to allow external table. 232 | //TODO: Check if column and types are compatible with existing Kudu table. 233 | throw new MetaException("External Table to Kudu not yet supported."); 234 | } 235 | if (tbl.getSd().getLocation() != null) { 236 | throw new MetaException("LOCATION may not be specified for Kudu"); 237 | } 238 | 239 | String tablename = getKuduTableName(tbl); 240 | 241 | try { 242 | List keyColumns = Arrays.asList(tbl.getParameters().get(HiveKuduConstants.KEY_COLUMNS).split("\\s*,\\s*")); 243 | 244 | List tabColumns = tbl.getSd().getCols(); 245 | 246 | int numberOfCols = tabColumns.size(); 247 | List columns = new ArrayList<>(numberOfCols); 248 | 249 | for (FieldSchema fields : tabColumns) { 250 | 251 | ColumnSchema columnSchema = new ColumnSchema 252 | .ColumnSchemaBuilder(fields.getName(), HiveKuduBridgeUtils.hiveTypeToKuduType(fields.getType())) 253 | .key(keyColumns.contains(fields.getName())) 254 | .nullable(!keyColumns.contains(fields.getName())) 255 | .build(); 256 | 257 | columns.add(columnSchema); 258 | } 259 | 260 | Schema schema = new Schema(columns); 261 | 262 | printSchema(schema); 263 | 264 | CreateTableOptions createTableOptions = new CreateTableOptions(); 265 | 266 | //TODO : add support for partition and buckets 267 | client.createTable(tablename, schema, createTableOptions); 268 | 269 | } catch (Exception se) { 270 | throw new MetaException("Error creating Kudu table: " + tablename + ":" + se); 271 | } finally { 272 | try { 273 | client.shutdown(); 274 | } catch (Exception e) { 275 | e.printStackTrace(); 276 | } 277 | } 278 | } 279 | 280 | @Override 281 | public void commitCreateTable(Table tbl) throws MetaException { 282 | // Nothing to do 283 | } 284 | 285 | @Override 286 | public void preDropTable(Table tbl) throws MetaException { 287 | // Nothing to do 288 | 289 | } 290 | 291 | @Override 292 | public void commitDropTable(Table tbl, boolean deleteData) 293 | throws MetaException { 294 | KuduClient client = 
getKuduClient(tbl.getParameters().get(HiveKuduConstants.MASTER_ADDRESS_NAME)); 295 | String tablename = getKuduTableName(tbl); 296 | boolean isExternal = MetaStoreUtils.isExternalTable(tbl); 297 | try { 298 | if (deleteData && !isExternal) { 299 | client.deleteTable(tablename); 300 | } 301 | } catch (Exception ioe) { 302 | throw new MetaException("Error dropping table:" +tablename); 303 | } finally { 304 | try { 305 | client.shutdown(); 306 | } catch (Exception e) { 307 | e.printStackTrace(); 308 | } 309 | } 310 | } 311 | 312 | @Override 313 | public void rollbackCreateTable(Table tbl) throws MetaException { 314 | KuduClient client = getKuduClient(tbl.getParameters().get(HiveKuduConstants.MASTER_ADDRESS_NAME)); 315 | String tablename = getKuduTableName(tbl); 316 | boolean isExternal = MetaStoreUtils.isExternalTable(tbl); 317 | try { 318 | if ( client.tableExists(tablename) && !isExternal) { 319 | client.deleteTable(tablename); 320 | } 321 | } catch (Exception ioe) { 322 | throw new MetaException("Error dropping table while rollback of create table:" +tablename); 323 | } finally { 324 | try { 325 | client.shutdown(); 326 | } catch (Exception e) { 327 | e.printStackTrace(); 328 | } 329 | } 330 | } 331 | 332 | @Override 333 | public void rollbackDropTable(Table tbl) throws MetaException { 334 | // Nothing to do 335 | } 336 | 337 | } 338 | -------------------------------------------------------------------------------- /src/main/java/org/kududb/mapred/HiveKuduTableInputFormat.java: -------------------------------------------------------------------------------- 1 | package org.kududb.mapred; 2 | 3 | /** 4 | * Created by bimal on 4/13/16. 5 | */ 6 | import org.apache.hadoop.hive.kududb.KuduHandler.HiveKuduWritable; 7 | import com.google.common.base.Objects; 8 | import com.google.common.base.Splitter; 9 | import com.google.common.collect.Lists; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.Text; 12 | import org.kududb.Type; 13 | import org.apache.commons.net.util.Base64; 14 | import org.apache.hadoop.mapred.*; 15 | import org.apache.hadoop.util.StringUtils; 16 | import org.kududb.Schema; 17 | import org.kududb.annotations.InterfaceAudience; 18 | import org.kududb.annotations.InterfaceStability; 19 | import org.kududb.client.*; 20 | import org.apache.commons.logging.Log; 21 | import org.apache.commons.logging.LogFactory; 22 | import org.apache.hadoop.conf.Configurable; 23 | import org.apache.hadoop.conf.Configuration; 24 | import org.apache.hadoop.io.NullWritable; 25 | import org.apache.hadoop.io.Writable; 26 | import org.apache.hadoop.net.DNS; 27 | 28 | import javax.naming.NamingException; 29 | import java.io.DataInput; 30 | import java.io.DataOutput; 31 | import java.io.IOException; 32 | import java.net.InetAddress; 33 | import java.net.InetSocketAddress; 34 | import java.util.ArrayList; 35 | import java.util.Arrays; 36 | import java.util.HashMap; 37 | import java.util.List; 38 | import java.util.Map; 39 | 40 | /** 41 | *
42 | * This input format generates one split per tablet and the only location for each split is that 43 | * tablet's leader. 44 | * 45 | * 46 | * 47 | * Hadoop doesn't have the concept of "closing" the input format, so in order to release the 48 | * resources we assume that once either {@link #getSplits(org.apache.hadoop.mapred.JobConf, int)} 49 | * or {@link HiveKuduTableInputFormat.TableRecordReader#close()} has been called, 50 | * the object won't be used again and the KuduClient is shut down. 51 | *
52 | */ 53 | @InterfaceAudience.Public 54 | @InterfaceStability.Evolving 55 | public class HiveKuduTableInputFormat implements InputFormat, Configurable { 56 | 57 | private static final Log LOG = LogFactory.getLog(HiveKuduTableInputFormat.class); 58 | 59 | private static final long SLEEP_TIME_FOR_RETRIES_MS = 1000; 60 | 61 | /** Job parameter that specifies the input table. */ 62 | static final String INPUT_TABLE_KEY = "kudu.mapreduce.input.table"; 63 | 64 | /** Job parameter that specifies if the scanner should cache blocks or not (default: false). */ 65 | static final String SCAN_CACHE_BLOCKS = "kudu.mapreduce.input.scan.cache.blocks"; 66 | 67 | /** Job parameter that specifies where the masters are. */ 68 | static final String MASTER_ADDRESSES_KEY = "kudu.mapreduce.master.addresses"; 69 | 70 | /** Job parameter that specifies how long we wait for operations to complete (default: 10s). */ 71 | static final String OPERATION_TIMEOUT_MS_KEY = "kudu.mapreduce.operation.timeout.ms"; 72 | 73 | /** Job parameter that specifies the address for the name server. */ 74 | static final String NAME_SERVER_KEY = "kudu.mapreduce.name.server"; 75 | 76 | /** Job parameter that specifies the encoded column range predicates (may be empty). */ 77 | static final String ENCODED_COLUMN_RANGE_PREDICATES_KEY = 78 | "kudu.mapreduce.encoded.column.range.predicates"; 79 | 80 | /** 81 | * Job parameter that specifies the column projection as a comma-separated list of column names. 82 | * 83 | * Not specifying this at all (i.e. setting to null) or setting to the special string 84 | * '*' means to project all columns. 85 | * 86 | * Specifying the empty string means to project no columns (i.e just count the rows). 87 | */ 88 | static final String COLUMN_PROJECTION_KEY = "kudu.mapreduce.column.projection"; 89 | 90 | /** 91 | * The reverse DNS lookup cache mapping: address from Kudu => hostname for Hadoop. This cache is 92 | * used in order to not do DNS lookups multiple times for each tablet server. 93 | */ 94 | private final Map reverseDNSCacheMap = new HashMap(); 95 | 96 | private Configuration conf; 97 | private KuduClient client; 98 | private KuduTable table; 99 | private long operationTimeoutMs; 100 | private String nameServer; 101 | private boolean cacheBlocks; 102 | private List projectedCols; 103 | private byte[] rawPredicates; 104 | 105 | static class KuduHiveSplit extends FileSplit { 106 | InputSplit delegate; 107 | private Path path; 108 | 109 | KuduHiveSplit() { 110 | this(new TableSplit(), null); 111 | } 112 | 113 | KuduHiveSplit(InputSplit delegate, Path path) { 114 | super(path, 0, 0, (String[]) null); 115 | this.delegate = delegate; 116 | this.path = path; 117 | } 118 | 119 | public long getLength() { 120 | // TODO: can this be delegated? 
121 | return 1L; 122 | } 123 | 124 | public String[] getLocations() throws IOException { 125 | return delegate.getLocations(); 126 | } 127 | 128 | public void write(DataOutput out) throws IOException { 129 | Text.writeString(out, path.toString()); 130 | delegate.write(out); 131 | } 132 | 133 | public void readFields(DataInput in) throws IOException { 134 | path = new Path(Text.readString(in)); 135 | delegate.readFields(in); 136 | } 137 | 138 | @Override 139 | public String toString() { 140 | return delegate.toString(); 141 | } 142 | 143 | @Override 144 | public Path getPath() { 145 | return path; 146 | } 147 | } 148 | @Override 149 | public FileSplit[] getSplits(JobConf jobConf, int i) 150 | throws IOException { 151 | LOG.warn("I was called : getSplits"); 152 | try { 153 | if (table == null) { 154 | throw new IOException("No table was provided"); 155 | } 156 | InputSplit[] splits; 157 | DeadlineTracker deadline = new DeadlineTracker(); 158 | deadline.setDeadline(operationTimeoutMs); 159 | // If the job is started while a leader election is running, we might not be able to find a 160 | // leader right away. We'll wait as long as the user is willing to wait with the operation 161 | // timeout, and once we've waited long enough we just start picking the first replica we see 162 | // for those tablets that don't have a leader. The client will later try to find the leader 163 | // and it might fail, in which case the task will get retried. 164 | retryloop: 165 | while (true) { 166 | List locations; 167 | try { 168 | locations = table.getTabletsLocations(operationTimeoutMs); 169 | } catch (Exception e) { 170 | throw new IOException("Could not get the tablets locations", e); 171 | } 172 | 173 | if (locations.isEmpty()) { 174 | throw new IOException("The requested table has 0 tablets, cannot continue"); 175 | } 176 | 177 | // For the moment we only pass the leader since that's who we read from. 178 | // If we've been trying to get a leader for each tablet for too long, we stop looping 179 | // and just finish with what we have. 180 | splits = new InputSplit[locations.size()]; 181 | int count = 0; 182 | for (LocatedTablet locatedTablet : locations) { 183 | List addresses = Lists.newArrayList(); 184 | LocatedTablet.Replica replica = locatedTablet.getLeaderReplica(); 185 | if (replica == null) { 186 | if (deadline.wouldSleepingTimeout(SLEEP_TIME_FOR_RETRIES_MS)) { 187 | LOG.debug("We ran out of retries, picking a non-leader replica for this tablet: " + 188 | locatedTablet.toString()); 189 | // We already checked it's not empty. 
190 | replica = locatedTablet.getReplicas().get(0); 191 | } else { 192 | LOG.debug("Retrying creating the splits because this tablet is missing a leader: " + 193 | locatedTablet.toString()); 194 | try { 195 | Thread.sleep(SLEEP_TIME_FOR_RETRIES_MS); 196 | } catch (InterruptedException ioe) { 197 | throw new IOException(StringUtils.stringifyException(ioe)); 198 | } 199 | 200 | continue retryloop; 201 | } 202 | } 203 | addresses.add(reverseDNS(replica.getRpcHost(), replica.getRpcPort())); 204 | String[] addressesArray = addresses.toArray(new String[addresses.size()]); 205 | Partition partition = locatedTablet.getPartition(); 206 | TableSplit split = new TableSplit(partition.getPartitionKeyStart(), 207 | partition.getPartitionKeyEnd(), 208 | addressesArray); 209 | splits[count] = split; 210 | count++; 211 | } 212 | FileSplit[] wrappers = new FileSplit[splits.length]; 213 | Path path = new Path(jobConf.get("location")); 214 | for (int counter = 0; counter < wrappers.length; counter++) { 215 | wrappers[counter] = new KuduHiveSplit(splits[counter], path); 216 | } 217 | return wrappers; 218 | } 219 | } finally { 220 | //shutdownClient(); 221 | LOG.warn("This is a Bug. No need to shutdown client."); 222 | } 223 | } 224 | 225 | private void shutdownClient() throws IOException { 226 | LOG.warn("I was called : shutdownClient"); 227 | try { 228 | client.shutdown(); 229 | } catch (Exception e) { 230 | LOG.error("Error shutting down Kudu Client" + e); 231 | } 232 | } 233 | 234 | /** 235 | * This method might seem alien, but we do this in order to resolve the hostnames the same way 236 | * Hadoop does. This ensures we get locality if Kudu is running along MR/YARN. 237 | * @param host hostname we got from the master 238 | * @param port port we got from the master 239 | * @return reverse DNS'd address 240 | */ 241 | private String reverseDNS(String host, Integer port) { 242 | LOG.warn("I was called : reverseDNS"); 243 | String location = this.reverseDNSCacheMap.get(host); 244 | if (location != null) { 245 | return location; 246 | } 247 | // The below InetSocketAddress creation does a name resolution. 
248 | InetSocketAddress isa = new InetSocketAddress(host, port); 249 | if (isa.isUnresolved()) { 250 | LOG.warn("Failed address resolve for: " + isa); 251 | } 252 | InetAddress tabletInetAddress = isa.getAddress(); 253 | try { 254 | location = domainNamePointerToHostName( 255 | DNS.reverseDns(tabletInetAddress, this.nameServer)); 256 | this.reverseDNSCacheMap.put(host, location); 257 | } catch (NamingException e) { 258 | LOG.warn("Cannot resolve the host name for " + tabletInetAddress + " because of " + e); 259 | location = host; 260 | } 261 | return location; 262 | } 263 | 264 | @Override 265 | public RecordReader getRecordReader(InputSplit inputSplit, 266 | final JobConf jobConf, final Reporter reporter) 267 | throws IOException { 268 | InputSplit delegate = ((KuduHiveSplit) inputSplit).delegate; 269 | LOG.warn("I was called : getRecordReader"); 270 | try { 271 | return new TableRecordReader(delegate); 272 | } catch (InterruptedException e) 273 | { 274 | throw new IOException(e); 275 | } 276 | } 277 | 278 | @Override 279 | public void setConf(Configuration entries) { 280 | LOG.warn("I was called : setConf"); 281 | this.conf = new Configuration(entries); 282 | 283 | String tableName = conf.get(INPUT_TABLE_KEY); 284 | String masterAddresses = conf.get(MASTER_ADDRESSES_KEY); 285 | this.operationTimeoutMs = conf.getLong(OPERATION_TIMEOUT_MS_KEY, 286 | AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS); 287 | this.nameServer = conf.get(NAME_SERVER_KEY); 288 | this.cacheBlocks = conf.getBoolean(SCAN_CACHE_BLOCKS, false); 289 | 290 | LOG.warn(" the master address here is " + masterAddresses); 291 | 292 | this.client = new KuduClient.KuduClientBuilder(masterAddresses) 293 | .defaultOperationTimeoutMs(operationTimeoutMs) 294 | .build(); 295 | try { 296 | this.table = client.openTable(tableName); 297 | } catch (Exception ex) { 298 | throw new RuntimeException("Could not obtain the table from the master, " + 299 | "is the master running and is this table created? tablename=" + tableName + " and " + 300 | "master address= " + masterAddresses, ex); 301 | } 302 | 303 | //String projectionConfig = conf.get(COLUMN_PROJECTION_KEY); 304 | String projectionConfig = "id,name"; 305 | if (projectionConfig == null || projectionConfig.equals("*")) { 306 | this.projectedCols = null; // project the whole table 307 | } else if ("".equals(projectionConfig)) { 308 | this.projectedCols = new ArrayList<>(); 309 | } else { 310 | this.projectedCols = Lists.newArrayList(Splitter.on(',').split(projectionConfig)); 311 | 312 | // Verify that the column names are valid -- better to fail with an exception 313 | // before we submit the job. 314 | Schema tableSchema = table.getSchema(); 315 | for (String columnName : projectedCols) { 316 | if (tableSchema.getColumn(columnName) == null) { 317 | throw new IllegalArgumentException("Unknown column " + columnName); 318 | } 319 | } 320 | } 321 | 322 | String encodedPredicates = conf.get(ENCODED_COLUMN_RANGE_PREDICATES_KEY, ""); 323 | rawPredicates = Base64.decodeBase64(encodedPredicates); 324 | } 325 | 326 | /** 327 | * Given a PTR string generated via reverse DNS lookup, return everything 328 | * except the trailing period. Example for host.example.com., return 329 | * host.example.com 330 | * @param dnPtr a domain name pointer (PTR) string. 331 | * @return Sanitized hostname with last period stripped off. 
332 | * 333 | */ 334 | private static String domainNamePointerToHostName(String dnPtr) { 335 | LOG.warn("I was called : domainNamePointerToHostName"); 336 | if (dnPtr == null) 337 | return null; 338 | String r = dnPtr.endsWith(".") ? dnPtr.substring(0, dnPtr.length() - 1) : dnPtr; 339 | LOG.warn(r); 340 | return r; 341 | } 342 | 343 | @Override 344 | public Configuration getConf() { 345 | return conf; 346 | } 347 | 348 | static class TableSplit implements InputSplit, Writable, Comparable { 349 | 350 | private byte[] startPartitionKey; 351 | private byte[] endPartitionKey; 352 | private String[] locations; 353 | 354 | public TableSplit() { } // Writable 355 | 356 | public TableSplit(byte[] startPartitionKey, byte[] endPartitionKey, String[] locations) { 357 | LOG.warn("I was called : TableSplit"); 358 | this.startPartitionKey = startPartitionKey; 359 | this.endPartitionKey = endPartitionKey; 360 | this.locations = locations; 361 | } 362 | 363 | @Override 364 | public long getLength() throws IOException { 365 | // TODO Guesstimate a size 366 | return 0; 367 | } 368 | 369 | @Override 370 | public String[] getLocations() throws IOException { 371 | LOG.warn("I was called : getLocations"); 372 | return locations; 373 | } 374 | 375 | public byte[] getStartPartitionKey() { 376 | return startPartitionKey; 377 | } 378 | 379 | public byte[] getEndPartitionKey() { 380 | return endPartitionKey; 381 | } 382 | 383 | @Override 384 | public int compareTo(TableSplit tableSplit) { 385 | LOG.warn("I was called : compareTo"); 386 | return Bytes.memcmp(startPartitionKey, tableSplit.getStartPartitionKey()); 387 | } 388 | 389 | @Override 390 | public void write(DataOutput dataOutput) throws IOException { 391 | LOG.warn("I was called : write"); 392 | Bytes.writeByteArray(dataOutput, startPartitionKey); 393 | Bytes.writeByteArray(dataOutput, endPartitionKey); 394 | dataOutput.writeInt(locations.length); 395 | for (String location : locations) { 396 | byte[] str = Bytes.fromString(location); 397 | Bytes.writeByteArray(dataOutput, str); 398 | } 399 | } 400 | 401 | @Override 402 | public void readFields(DataInput dataInput) throws IOException { 403 | LOG.warn("I was called : readFields"); 404 | startPartitionKey = Bytes.readByteArray(dataInput); 405 | endPartitionKey = Bytes.readByteArray(dataInput); 406 | locations = new String[dataInput.readInt()]; 407 | LOG.warn("readFields " + locations.length); 408 | for (int i = 0; i < locations.length; i++) { 409 | byte[] str = Bytes.readByteArray(dataInput); 410 | locations[i] = Bytes.getString(str); 411 | LOG.warn("readFields " + locations[i]); 412 | } 413 | } 414 | 415 | @Override 416 | public int hashCode() { 417 | LOG.warn("I was called : hashCode"); 418 | // We currently just care about the row key since we're within the same table 419 | return Arrays.hashCode(startPartitionKey); 420 | } 421 | 422 | @Override 423 | public boolean equals(Object o) { 424 | LOG.warn("I was called : equals"); 425 | if (this == o) return true; 426 | if (o == null || getClass() != o.getClass()) return false; 427 | 428 | TableSplit that = (TableSplit) o; 429 | 430 | return this.compareTo(that) == 0; 431 | } 432 | 433 | @Override 434 | public String toString() { 435 | LOG.warn("I was called : toString"); 436 | return Objects.toStringHelper(this) 437 | .add("startPartitionKey", Bytes.pretty(startPartitionKey)) 438 | .add("endPartitionKey", Bytes.pretty(endPartitionKey)) 439 | .add("locations", Arrays.toString(locations)) 440 | .toString(); 441 | } 442 | } 443 | 444 | class TableRecordReader 
implements RecordReader { 445 | 446 | private final NullWritable currentKey = NullWritable.get(); 447 | private RowResult currentValue; 448 | private RowResultIterator iterator; 449 | private KuduScanner scanner; 450 | private TableSplit split; 451 | private Type[] types; 452 | private boolean first = true; 453 | 454 | public TableRecordReader(InputSplit inputSplit) throws IOException, InterruptedException { 455 | LOG.warn("I was called : TableRecordReader"); 456 | if (!(inputSplit instanceof TableSplit)) { 457 | throw new IllegalArgumentException("TableSplit is the only accepted input split"); 458 | } 459 | 460 | //Create another client 461 | //setConf(getConf()); 462 | 463 | split = (TableSplit) inputSplit; 464 | scanner = client.newScannerBuilder(table) 465 | .setProjectedColumnNames(projectedCols) 466 | .lowerBoundPartitionKeyRaw(split.getStartPartitionKey()) 467 | .exclusiveUpperBoundPartitionKeyRaw(split.getEndPartitionKey()) 468 | .cacheBlocks(cacheBlocks) 469 | .addColumnRangePredicatesRaw(rawPredicates) 470 | .build(); 471 | 472 | LOG.warn("table name: " +table.getName()); 473 | LOG.warn("projectedCols name: " + projectedCols.size()); 474 | LOG.warn("getStartPartitionKey: " + split.getStartPartitionKey().toString()); 475 | LOG.warn("getEndPartitionKey " + split.getEndPartitionKey().toString()); 476 | LOG.warn("cacheBlocks " + cacheBlocks); 477 | LOG.warn("rawPredicates " + rawPredicates.length); 478 | 479 | 480 | Schema schema = table.getSchema(); 481 | types = new Type[schema.getColumnCount()]; 482 | for (int i = 0; i < types.length; i++) { 483 | types[i] = schema.getColumnByIndex(i).getType(); 484 | LOG.warn("Setting types array "+ i + " to " + types[i].name()); 485 | } 486 | // Calling this now to set iterator. 487 | tryRefreshIterator(); 488 | } 489 | 490 | @Override 491 | public boolean next(NullWritable o, HiveKuduWritable o2) throws IOException { 492 | LOG.warn("I was called : next"); 493 | /* 494 | if (first) { 495 | //tryRefreshIterator(); 496 | List projectColumns = new ArrayList<>(2); 497 | projectColumns.add("id"); 498 | projectColumns.add("name"); 499 | KuduScanner scanner = client.newScannerBuilder(table) 500 | .setProjectedColumnNames(projectColumns) 501 | .build(); 502 | try { 503 | iterator = scanner.nextRows(); 504 | } catch (Exception e) { 505 | throw new IOException("Couldn't get scan data", e); 506 | } 507 | first = false; 508 | } else { 509 | return false; 510 | } 511 | */ 512 | if (!iterator.hasNext()) { 513 | tryRefreshIterator(); 514 | if (!iterator.hasNext()) { 515 | // Means we still have the same iterator, we're done 516 | return false; 517 | } 518 | } 519 | 520 | currentValue = iterator.next(); 521 | o = currentKey; 522 | o2.clear(); 523 | for (int i = 0; i < types.length; i++) { 524 | switch(types[i]) { 525 | case STRING: { 526 | o2.set(i, currentValue.getString(i)); 527 | break; 528 | } 529 | case FLOAT: { 530 | o2.set(i, currentValue.getFloat(i)); 531 | break; 532 | } 533 | case DOUBLE: { 534 | o2.set(i, currentValue.getDouble(i)); 535 | break; 536 | } 537 | case BOOL: { 538 | o2.set(i, currentValue.getBoolean(i)); 539 | break; 540 | } 541 | case INT8: { 542 | o2.set(i, currentValue.getByte(i)); 543 | break; 544 | } 545 | case INT16: { 546 | o2.set(i, currentValue.getShort(i)); 547 | break; 548 | } 549 | case INT32: { 550 | o2.set(i, currentValue.getInt(i)); 551 | break; 552 | } 553 | case INT64: { 554 | o2.set(i, currentValue.getLong(i)); 555 | break; 556 | } 557 | case TIMESTAMP: { 558 | o2.set(i, currentValue.getLong(i)); 559 | break; 560 | } 561 
| case BINARY: { 562 | o2.set(i, currentValue.getBinaryCopy(i)); 563 | break; 564 | } 565 | default: 566 | throw new IOException("Cannot write Object '" 567 | + currentValue.getColumnType(i).name() + "' as type: " + types[i].name()); 568 | } 569 | LOG.warn("Value returned " + o2.get(i)); 570 | } 571 | return true; 572 | } 573 | 574 | @Override 575 | public NullWritable createKey() { 576 | LOG.warn("I was called : createKey"); 577 | return NullWritable.get(); 578 | } 579 | 580 | @Override 581 | public HiveKuduWritable createValue() { 582 | LOG.warn("I was called : createValue"); 583 | return new HiveKuduWritable(types); 584 | } 585 | 586 | @Override 587 | public long getPos() throws IOException { 588 | LOG.warn("I was called : getPos"); 589 | return 0; 590 | //TODO: Get progress 591 | } 592 | /* 593 | //mapreduce code for reference. 594 | @Override 595 | public boolean nextKeyValue() throws IOException, InterruptedException { 596 | if (!iterator.hasNext()) { 597 | tryRefreshIterator(); 598 | if (!iterator.hasNext()) { 599 | // Means we still have the same iterator, we're done 600 | return false; 601 | } 602 | } 603 | currentValue = iterator.next(); 604 | return true; 605 | } 606 | */ 607 | /** 608 | * If the scanner has more rows, get a new iterator else don't do anything. 609 | * @throws IOException 610 | */ 611 | private void tryRefreshIterator() throws IOException { 612 | LOG.warn("I was called : tryRefreshIterator"); 613 | if (!scanner.hasMoreRows()) { 614 | return; 615 | } 616 | try { 617 | iterator = scanner.nextRows(); 618 | } catch (Exception e) { 619 | throw new IOException("Couldn't get scan data", e); 620 | } 621 | } 622 | 623 | /* 624 | Mapreduce code for reference 625 | 626 | @Override 627 | public NullWritable getCurrentKey() throws IOException, InterruptedException { 628 | return currentKey; 629 | } 630 | 631 | @Override 632 | public RowResult getCurrentValue() throws IOException, InterruptedException { 633 | return currentValue; 634 | } 635 | */ 636 | 637 | @Override 638 | public float getProgress() throws IOException { 639 | LOG.warn("I was called : getProgress"); 640 | // TODO Guesstimate progress 641 | return 0; 642 | } 643 | 644 | 645 | @Override 646 | public void close() throws IOException { 647 | LOG.warn("I was called : close"); 648 | try { 649 | scanner.close(); 650 | } catch (NullPointerException npe) { 651 | LOG.warn("The scanner is supposed to be open but its not. TODO: Fix me."); 652 | } 653 | catch (Exception e) { 654 | throw new IOException(e); 655 | } 656 | shutdownClient(); 657 | } 658 | } 659 | } -------------------------------------------------------------------------------- /src/main/java/org/kududb/mapred/HiveKuduTableOutputFormat.java: -------------------------------------------------------------------------------- 1 | package org.kududb.mapred; 2 | 3 | /** 4 | * Created by bimal on 4/13/16. 
5 | */ 6 | import org.apache.hadoop.hive.kududb.KuduHandler.HiveKuduWritable; 7 | import org.apache.hadoop.fs.FileSystem; 8 | import org.apache.hadoop.mapred.JobConf; 9 | import org.apache.hadoop.mapred.RecordWriter; 10 | import org.apache.hadoop.mapred.Reporter; 11 | import org.apache.hadoop.util.Progressable; 12 | import org.kududb.Schema; 13 | import org.kududb.annotations.InterfaceAudience; 14 | import org.kududb.annotations.InterfaceStability; 15 | import org.kududb.client.*; 16 | import org.apache.hadoop.conf.Configurable; 17 | import org.apache.hadoop.conf.Configuration; 18 | import org.apache.hadoop.io.NullWritable; 19 | import org.apache.hadoop.mapred.OutputFormat; 20 | import org.slf4j.Logger; 21 | import org.slf4j.LoggerFactory; 22 | 23 | import java.io.IOException; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import java.util.concurrent.ConcurrentHashMap; 27 | import java.util.concurrent.atomic.AtomicLong; 28 | 29 | 30 | @InterfaceAudience.Public 31 | @InterfaceStability.Evolving 32 | public class HiveKuduTableOutputFormat implements OutputFormat, Configurable { 33 | 34 | private static final Logger LOG = LoggerFactory.getLogger(HiveKuduTableOutputFormat.class); 35 | 36 | /** Job parameter that specifies the output table. */ 37 | static final String OUTPUT_TABLE_KEY = "kudu.mapreduce.output.table"; 38 | 39 | /** Job parameter that specifies where the masters are */ 40 | static final String MASTER_ADDRESSES_KEY = "kudu.mapreduce.master.addresses"; 41 | 42 | /** Job parameter that specifies how long we wait for operations to complete */ 43 | static final String OPERATION_TIMEOUT_MS_KEY = "kudu.mapreduce.operation.timeout.ms"; 44 | 45 | /** Number of rows that are buffered before flushing to the tablet server */ 46 | static final String BUFFER_ROW_COUNT_KEY = "kudu.mapreduce.buffer.row.count"; 47 | 48 | /** 49 | * Job parameter that specifies which key is to be used to reach the HiveKuduTableOutputFormat 50 | * belonging to the caller 51 | */ 52 | static final String MULTITON_KEY = "kudu.mapreduce.multitonkey"; 53 | 54 | /** 55 | * This multiton is used so that the tasks using this output format/record writer can find 56 | * their KuduTable without having a direct dependency on this class, 57 | * with the additional complexity that the output format cannot be shared between threads. 58 | */ 59 | private static final ConcurrentHashMap MULTITON = new 60 | ConcurrentHashMap(); 61 | 62 | /** 63 | * This counter helps indicate which task log to look at since rows that weren't applied will 64 | * increment this counter. 
65 | */ 66 | public enum Counters { ROWS_WITH_ERRORS } 67 | 68 | private Configuration conf = null; 69 | 70 | private KuduClient client; 71 | private KuduTable table; 72 | private KuduSession session; 73 | private long operationTimeoutMs; 74 | 75 | @Override 76 | public void setConf(Configuration entries) { 77 | LOG.warn("I was called : setConf"); 78 | this.conf = new Configuration(entries); 79 | 80 | String masterAddress = this.conf.get(MASTER_ADDRESSES_KEY); 81 | String tableName = this.conf.get(OUTPUT_TABLE_KEY); 82 | this.operationTimeoutMs = this.conf.getLong(OPERATION_TIMEOUT_MS_KEY, 83 | AsyncKuduClient.DEFAULT_OPERATION_TIMEOUT_MS); 84 | int bufferSpace = this.conf.getInt(BUFFER_ROW_COUNT_KEY, 1000); 85 | 86 | LOG.warn(" the master address here is " + masterAddress); 87 | 88 | this.client = new KuduClient.KuduClientBuilder(masterAddress) 89 | .defaultOperationTimeoutMs(operationTimeoutMs) 90 | .build(); 91 | try { 92 | this.table = client.openTable(tableName); 93 | } catch (Exception ex) { 94 | throw new RuntimeException("Could not obtain the table from the master, " + 95 | "is the master running and is this table created? tablename=" + tableName + " and " + 96 | "master address= " + masterAddress, ex); 97 | } 98 | this.session = client.newSession(); 99 | this.session.setFlushMode(AsyncKuduSession.FlushMode.AUTO_FLUSH_BACKGROUND); 100 | this.session.setMutationBufferSpace(bufferSpace); 101 | this.session.setIgnoreAllDuplicateRows(true); 102 | String multitonKey = String.valueOf(Thread.currentThread().getId()); 103 | assert(MULTITON.get(multitonKey) == null); 104 | MULTITON.put(multitonKey, this); 105 | entries.set(MULTITON_KEY, multitonKey); 106 | } 107 | 108 | private void shutdownClient() throws IOException { 109 | LOG.warn("I was called : shutdownClient"); 110 | try { 111 | client.shutdown(); 112 | } catch (Exception e) { 113 | throw new IOException(e); 114 | } 115 | } 116 | 117 | public static KuduTable getKuduTable(String multitonKey) { 118 | LOG.warn("I was called : getKuduTable"); 119 | return MULTITON.get(multitonKey).getKuduTable(); 120 | } 121 | 122 | private KuduTable getKuduTable() { 123 | LOG.warn("I was called : getKuduTable"); 124 | return this.table; 125 | } 126 | 127 | @Override 128 | public Configuration getConf() { 129 | LOG.warn("I was called : getConf"); 130 | return conf; 131 | } 132 | 133 | 134 | @Override 135 | public RecordWriter getRecordWriter(FileSystem fileSystem, JobConf jobConf, String s, Progressable progressable) 136 | throws IOException { 137 | LOG.warn("I was called : getRecordWriter"); 138 | return new TableRecordWriter(this.session); 139 | } 140 | 141 | @Override 142 | public void checkOutputSpecs(FileSystem fileSystem, JobConf jobConf) throws IOException { 143 | LOG.warn("I was called : checkOutputSpecs"); 144 | shutdownClient(); 145 | } 146 | 147 | /* 148 | @Override 149 | public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) throws 150 | IOException, InterruptedException { 151 | return new KuduTableOutputCommitter(); 152 | } 153 | */ 154 | 155 | protected class TableRecordWriter implements RecordWriter { 156 | 157 | private final AtomicLong rowsWithErrors = new AtomicLong(); 158 | private final KuduSession session; 159 | 160 | public TableRecordWriter(KuduSession session) { 161 | LOG.warn("I was called : TableRecordWriter"); 162 | this.session = session; 163 | } 164 | 165 | private Operation getOperation(HiveKuduWritable hiveKuduWritable) 166 | throws IOException{ 167 | LOG.warn("I was called : getOperation"); 168 | 
int recCount = hiveKuduWritable.getColCount(); 169 | Schema schema = table.getSchema(); 170 | int colCount = schema.getColumnCount(); 171 | if (recCount != colCount) { 172 | throw new IOException("Kudu table column count of " + colCount + " does not match " 173 | + "with Serialized object record count of " + recCount); 174 | } 175 | //TODO: Find out if the record needs to be updated or deleted. 176 | //Assume only insert 177 | 178 | Insert insert = table.newInsert(); 179 | PartialRow row = insert.getRow(); 180 | 181 | for (int i = 0; i < recCount; i++) { 182 | Object obj = hiveKuduWritable.get(i); 183 | LOG.warn("From Writable Column value of " + i + " is " + obj.toString() + " and type is " + hiveKuduWritable.getType(i).name()); 184 | LOG.warn("From Schema Column name of " + i + " is " + schema.getColumnByIndex(i).getName()); 185 | switch(hiveKuduWritable.getType(i)) { 186 | case STRING: { 187 | LOG.warn("I was called : STRING"); 188 | String s = obj.toString(); 189 | row.addString(i, s); 190 | break; 191 | } 192 | case FLOAT: { 193 | LOG.warn("I was called : FLOAT"); 194 | Float f = (Float) obj; 195 | row.addFloat(i, f); 196 | break; 197 | } 198 | case DOUBLE: { 199 | LOG.warn("I was called : DOUBLE"); 200 | Double d = (Double) obj; 201 | row.addDouble(i, d); 202 | break; 203 | } 204 | case BOOL: { 205 | LOG.warn("I was called : BOOL"); 206 | Boolean b = (Boolean) obj; 207 | row.addBoolean(i, b); 208 | break; 209 | } 210 | case INT8: { 211 | LOG.warn("I was called : INT8"); 212 | Byte b = (Byte) obj; 213 | row.addByte(i, b); 214 | break; 215 | } 216 | case INT16: { 217 | LOG.warn("I was called : INT16"); 218 | Short s = (Short) obj; 219 | row.addShort(i, s); 220 | break; 221 | } 222 | case INT32: { 223 | LOG.warn("I was called : INT32"); 224 | Integer x = (Integer) obj; 225 | row.addInt(i, x); 226 | break; 227 | } 228 | case INT64: { 229 | LOG.warn("I was called : INT64"); 230 | Long l = (Long) obj; 231 | row.addLong(i, l); 232 | break; 233 | } 234 | case TIMESTAMP: { 235 | LOG.warn("I was called : TIMESTAMP"); 236 | Long time = (Long) obj; 237 | row.addLong(i, time); 238 | break; 239 | } 240 | case BINARY: { 241 | LOG.warn("I was called : BINARY"); 242 | byte[] b = (byte[]) obj; 243 | row.addBinary(i, b); 244 | break; 245 | } 246 | default: 247 | throw new IOException("Cannot write Object '" 248 | + obj.getClass().getSimpleName() + "' as type: " + hiveKuduWritable.getType(i).name()); 249 | } 250 | } 251 | 252 | return insert; 253 | } 254 | @Override 255 | public void write(NullWritable key, HiveKuduWritable kw) 256 | throws IOException { 257 | try { 258 | LOG.warn("I was called : write"); 259 | Operation operation = getOperation(kw); 260 | session.apply(operation); 261 | 262 | //read from Kudu if the insert was successful 263 | List projectColumns = new ArrayList<>(2); 264 | projectColumns.add("id"); 265 | projectColumns.add("name"); 266 | KuduScanner scanner = client.newScannerBuilder(table) 267 | .setProjectedColumnNames(projectColumns) 268 | .build(); 269 | 270 | while (scanner.hasMoreRows()) { 271 | RowResultIterator results = scanner.nextRows(); 272 | while (results.hasNext()) { 273 | RowResult result = results.next(); 274 | LOG.warn("Returned from kudu" + result.getInt(0) + ":" +result.getString(1)); 275 | } 276 | } 277 | 278 | LOG.warn("applying operation"); 279 | 280 | } catch (Exception e) { 281 | throw new IOException("Encountered an error while writing", e); 282 | } 283 | } 284 | 285 | @Override 286 | public void close(Reporter reporter) throws IOException { 287 | try { 288 | 
LOG.warn("I was called : close"); 289 | processRowErrors(session.close()); 290 | shutdownClient(); 291 | } catch (Exception e) { 292 | throw new IOException("Encountered an error while closing this task", e); 293 | } finally { 294 | if (reporter != null) { 295 | // This is the only place where we have access to the context in the record writer, 296 | // so set the counter here. 297 | reporter.getCounter(Counters.ROWS_WITH_ERRORS).setValue(rowsWithErrors.get()); 298 | } 299 | } 300 | } 301 | 302 | private void processRowErrors(List responses) { 303 | LOG.warn("I was called : processRowErrors"); 304 | List errors = OperationResponse.collectErrors(responses); 305 | if (!errors.isEmpty()) { 306 | int rowErrorsCount = errors.size(); 307 | rowsWithErrors.addAndGet(rowErrorsCount); 308 | LOG.warn("Got per errors for " + rowErrorsCount + " rows, " + 309 | "the first one being " + errors.get(0).getStatus()); 310 | } 311 | } 312 | } 313 | } --------------------------------------------------------------------------------