├── .idea
│   ├── libraries
│   │   ├── Maven__com_amazonaws_aws_java_sdk_core_1_10_6.xml
│   │   ├── Maven__com_amazonaws_aws_java_sdk_kms_1_10_6.xml
│   │   ├── Maven__com_amazonaws_aws_java_sdk_s3_1_10_6.xml
│   │   ├── Maven__com_sun_jersey_jersey_client_1_9.xml
│   │   ├── Maven__javax_activation_activation_1_1.xml
│   │   ├── Maven__javax_xml_bind_jaxb_api_2_2_2.xml
│   │   ├── Maven__javax_xml_stream_stax_api_1_0_2.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_annotations_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_aws_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_client_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_common_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_hdfs_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_app_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_common_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_core_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_jobclient_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_shuffle_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_api_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_client_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_common_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_server_common_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_htrace_htrace_core4_4_0_1_incubating.xml
│   │   ├── Maven__xerces_xercesImpl_2_9_1.xml
│   │   └── Maven__xml_apis_xml_apis_1_3_04.xml
│   └── uiDesigner.xml
├── GamerSetup.md
├── LICENSE
├── README.md
├── kudu-spark.iml
├── notes.txt
├── pom.xml
└── src
    └── main
        └── scala
            └── org
                └── kududb
                    └── spark
                        ├── DefaultSource.scala
                        ├── KuduContext.scala
                        ├── KuduDStreamFunctions.scala
                        ├── KuduRDDFunctions.scala
                        └── demo
                            ├── basic
                            │   ├── AddSingleRecord.scala
                            │   ├── BasicExample.scala
                            │   ├── BasicSparkSQLExamples.scala
                            │   ├── InitialDataPopulation.scala
                            │   ├── ModifySingleRecord.scala
                            │   ├── NameGenerator.scala
                            │   └── ScanTable.scala
                            └── gamer
                                ├── GamerEvent.scala
                                ├── aggregates
                                │   ├── CreateKuduTable.scala
                                │   ├── DirectDataInjector.scala
                                │   ├── GamerAggergatesSparkStreaming.scala
                                │   ├── GamerDataGenerator.scala
                                │   ├── GamerSparkSQLExample.scala
                                │   ├── KafkaProducerInjector.scala
                                │   └── SparkSQLCmd.scala
                                └── cdc
                                    ├── ApplyNewRecordRunnable.scala
                                    ├── CreateKuduTable.scala
                                    ├── DirectDataInjector.scala
                                    └── DirectDataMultiThreadedInjector.scala

/GamerSetup.md:
--------------------------------------------------------------------------------

ssh root@mriggs-strata-1.vpc.cloudera.com

scp -i "tedm2.pem" KuduSpark.jar ec2_user@ec2-52-36-220-83.us-west-2.compute.amazonaws.com:./
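##What the Spark Streaming job does (sketch)
The commands below stand up a Kafka topic, populate it, and launch org.kududb.spark.demo.gamer.GamerAggergatesSparkStreaming. For orientation, this sketch shows the general Kafka-to-Kudu pattern that job follows, built on this repo's KuduContext and KuduDStreamFunctions. It is a minimal, hypothetical version rather than the demo class itself: the broker, topic, Kudu master, table name, the single STRING key column, and the 10-second batch interval are placeholder assumptions, not the real gamer schema.

import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.kududb.spark.KuduContext
import org.kududb.spark.KuduDStreamFunctions._

object KafkaToKuduSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder endpoints; replace with the broker, topic, Kudu master and table set up below
    val brokers = "BrokerNode:9092"
    val topic = "gamer"
    val kuduMaster = "KuduMasterNode"
    val tableName = "gamer"

    val sc = new SparkContext(new SparkConf().setAppName("KafkaToKuduSketch"))
    val ssc = new StreamingContext(sc, Seconds(10))
    val kuduContext = new KuduContext(sc, kuduMaster)

    // Direct Kafka stream of (key, message) pairs; only the message body is used here
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, Map("metadata.broker.list" -> brokers), Set(topic))

    // For each partition of each micro-batch, write the records to Kudu through the
    // already-connected client that KuduContext supplies
    messages.map(_._2).kuduForeachPartition(kuduContext, (it, kuduClient, asyncKuduClient) => {
      val table = kuduClient.openTable(tableName)
      val session = kuduClient.newSession()
      it.foreach(line => {
        val insert = table.newInsert()
        // Assumes column 0 is a STRING key; a real table would populate its other columns too
        insert.getRow.addString(0, line)
        session.apply(insert)
      })
      session.flush()
      session.close()
    })

    ssc.start()
    ssc.awaitTermination()
  }
}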
--Setting up Kafka
kafka-topics --zookeeper mriggs-strata-1.vpc.cloudera.com:2181 --create --topic gamer --partitions 1 --replication-factor 1
kafka-topics --zookeeper mriggs-strata-1.vpc.cloudera.com:2181 --list
kafka-console-producer --broker-list mriggs-strata-1.vpc.cloudera.com:9092 --topic test
kafka-console-consumer --zookeeper mriggs-strata-1.vpc.cloudera.com:2181 --topic gamer --from-beginning

vi .bash_profile
export PATH=/usr/java/jdk1.7.0_67-cloudera/bin/:$PATH
export JAVA_HOME=/usr/java/jdk1.7.0_67-cloudera/

##Populating Kafka
java -cp KuduSpark.jar org.kududb.spark.demo.gamer.KafkaProducerGenerator mriggs-strata-1.vpc.cloudera.com:9092 gamer 10000 300 1000

##create Table
java -cp KuduSpark.jar org.kududb.spark.demo.gamer.CreateKuduTable ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer 3

##Run Spark Streaming
spark-submit \
--master yarn --deploy-mode client \
--executor-memory 2G \
--num-executors 2 \
--jars kudu-mapreduce-0.1.0-20150903.033037-21-jar-with-dependencies.jar \
--class org.kududb.spark.demo.gamer.GamerAggergatesSparkStreaming KuduSpark.jar \
mriggs-strata-1.vpc.cloudera.com:9092 gamer mriggs-strata-1.vpc.cloudera.com gamer C

##Run SparkSQL
spark-submit \
--master yarn --deploy-mode client \
--executor-memory 2G \
--num-executors 2 \
--class org.kududb.spark.demo.gamer.GamerSparkSQLExample \
KuduSpark.jar ec2-52-36-220-83.us-west-2.compute.amazonaws.com l

##Run direct insert
java -cp KuduSpark.jar org.kududb.spark.demo.gamer.DirectDataGenerator ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer 3

##Impala
impala-shell
connect ec2-52-11-171-85.us-west-2.compute.amazonaws.com:21007;

java -cp KuduSpark.jar org.kududb.spark.demo.gamer.cdc.CreateGamerCDCKuduTable ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer_cdc 3

java -cp KuduSpark.jar org.kududb.spark.demo.gamer.cdc.DirectDataMultiThreadedInjector ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer_cdc 10 5 1000

java -cp KuduSpark.jar org.kududb.spark.demo.gamer.cdc.DirectDataMultiThreadedInjector ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer_cdc 100 5 5

java -cp KuduSpark.jar org.kududb.spark.demo.gamer.DropTable ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer_cdc
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# SparkOnKudu

## Overview
This is a simple, reusable library for working with Kudu from Spark.

## Functionality
Current functionality supports the following:

* RDD foreachPartition with iterator and Kudu client
* RDD mapPartition with iterator and Kudu client
* DStream foreachPartition with iterator and Kudu client
* DStream mapPartition with iterator and Kudu client
* Spark SQL integration with Kudu (basic; no filter push down yet)

## Examples
* Basic example
  * Creating Kudu tables
  * Connecting with SparkSQL
  * Converting values from Kudu to SparkSQL to MLlib
* Gamer example
  * Creating Kudu Gamer table
  * Generating Gamer data and pushing it to Kafka
  * Reading Gamer data from Kafka with Spark Streaming
  * Aggregating Gamer data in Spark Streaming then pushing mutations to Kudu
  * Running Impala SQL on Kudu Gamer table
  * Running SparkSQL on Kudu Gamer table
  * Converting SparkSQL results to Vectors so we can do KMeans

## Near Future
* Key SQL predicate push down
* Need to update POM file with public repo
* Need to work with Kudu project to integrate into Kudu

## Build
mvn clean package

## Setup for Gamer Example

### Setting up Kafka
kafka-topics --zookeeper ZooKeeperNode:2181 --create --topic gamer --partitions 1 --replication-factor 1
kafka-topics --zookeeper ZooKeeperNode:2181 --list

### Basic Testing with Kafka
kafka-console-producer --broker-list BrokerNode:9092 --topic test
kafka-console-consumer --zookeeper ZooKeeperNode:2181 --topic gamer --from-beginning

### Populating Kafka
java -cp KuduSpark.jar org.kududb.spark.demo.gamer.KafkaProducerGenerator mriggs-strata-1.vpc.cloudera.com:9092 gamer 10000 300 1000

### Create Table
java -cp KuduSpark.jar org.kududb.spark.demo.gamer.CreateKuduTable mriggs-strata-1.vpc.cloudera.com gamer

### Run Spark Streaming
spark-submit \
--master yarn --deploy-mode client \
--executor-memory 2G \
--num-executors 2 \
--jars kudu-mapreduce-0.1.0-20150903.033037-21-jar-with-dependencies.jar \
--class org.kududb.spark.demo.gamer.GamerAggergatesSparkStreaming KuduSpark.jar \
BrokerNode:9092 gamer KuduMaster gamer C
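## Using the Spark SQL integration from code (sketch)
For orientation, the sketch below shows roughly what the Spark SQL integration looks like from user code, along the lines of BasicExample.scala and BasicSparkSQLExamples.scala in this repo. It is a minimal, hypothetical example: the Kudu master host, table name, and query are placeholders to substitute with your own, and it assumes the table already exists.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SparkSQLOnKuduSketch {
  def main(args: Array[String]): Unit = {
    // Placeholder Kudu master and table name; substitute your own
    val kuduMaster = "KuduMasterNode"
    val tableName = "gamer"

    val sc = new SparkContext(new SparkConf().setAppName("SparkSQLOnKuduSketch"))
    val sqlContext = new SQLContext(sc)

    // The DefaultSource in org.kududb.spark exposes a Kudu table as a DataFrame
    val df = sqlContext.load("org.kududb.spark",
      Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster))

    // Register it as a temporary table and query it with plain SQL
    df.registerTempTable(tableName)
    sqlContext.sql("SELECT * FROM " + tableName + " LIMIT 10").collect().foreach(println)

    sc.stop()
  }
}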
--------------------------------------------------------------------------------

/notes.txt:
--------------------------------------------------------------------------------
1. Add impala-kudu parcel
2. Search for the *Impala Service Environment Advanced Configuration Snippet (Safety Valve)* configuration item. Add the following to the text field and save your changes: `IMPALA_NEXT=1`
3. Talk to Martin Grund

scp target/KuduSpark.jar root@mriggs-strata-1.vpc.cloudera.com:./

scp kudu-mapreduce-0.1.0-20150903.033037-21-jar-with-dependencies.jar root@mriggs-strata-1.vpc.cloudera.com:./

spark-shell --jars kudu-mapreduce-0.1.0-20150903.033037-21-jar-with-dependencies.jar,KuduSpark.jar --class org.kududb.spark.demo.SimpleGroupByExample --executor-cores 2 --num-executors 3 --executor-memory 2g --master yarn --deploy-mode client mriggs-strata-1.vpc.cloudera.com foo y
--------------------------------------------------------------------------------

/pom.xml:
--------------------------------------------------------------------------------
1 | 2 | 3 | 23 | 24 | 28 | 4.0.0 29 | 30 | org.apache 31 | apache 32 | 14 33 | 34 | com.cloudera.kudu 35 | kudu-spark 36 | Kudu - Spark 37 | jar 38 | 39 | 40 | 1.5.0-cdh5.5.0-SNAPSHOT 41 | 2.10.4 42 | 0.8.2.0-kafka-1.4.0-SNAPSHOT 43 | 2.10 44 | 0.7.1 45 | 2.6.0-cdh5.7.0 46 | ${project.basedir}/.. 47 | 48 | 49 | 50 | 51 | 52 | 53 | org.apache.maven.plugins 54 | maven-compiler-plugin 55 | 3.3 56 | 57 | 1.8 58 | 1.8 59 | 60 | 61 | 62 | 63 | net.alchim31.maven 64 | scala-maven-plugin 65 | 3.2.0 66 | 67 | UTF-8 68 | ${scala.version} 69 | 70 | 71 | 72 | scala-compile-first 73 | process-resources 74 | 75 | add-source 76 | compile 77 | 78 | 79 | 80 | scala-test-compile 81 | process-test-resources 82 | 83 | testCompile 84 | 85 | 86 | 87 | 88 | 89 | 90 | org.scalatest 91 | scalatest-maven-plugin 92 | 1.0 93 | 94 | ${project.build.directory}/surefire-reports 95 | .
96 | WDF TestSuite.txt 97 | false 98 | 99 | 100 | 101 | test 102 | test 103 | 104 | test 105 | 106 | 107 | true 108 | 109 | 110 | 111 | integration-test 112 | integration-test 113 | 114 | test 115 | 116 | 117 | Integration-Test 118 | 119 | -Xmx1536m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m 120 | 121 | false 122 | 123 | 124 | 125 | 126 | 127 | org.apache.maven.plugins 128 | maven-shade-plugin 129 | 2.2 130 | 131 | false 132 | target/KuduSpark.jar 133 | 134 | 135 | *:* 136 | 137 | 138 | 139 | 140 | *:* 141 | 142 | META-INF/*.SF 143 | META-INF/*.DSA 144 | META-INF/*.RSA 145 | 146 | 147 | 148 | 149 | 150 | 151 | package 152 | 153 | shade 154 | 155 | 156 | 157 | 159 | 161 | reference.conf 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | cloudera-repo 175 | Cloudera Repository 176 | https://repository.cloudera.com/artifactory/cloudera-repos 177 | 178 | 179 | 180 | 207 | 208 | 209 | 210 | 211 | javax.servlet 212 | javax.servlet-api 213 | 3.0.1 214 | test 215 | 216 | 217 | 218 | 219 | org.scala-lang 220 | scala-library 221 | ${scala.version} 222 | 223 | 224 | org.apache.spark 225 | spark-core_${scala.binary.version} 226 | ${spark.version} 227 | 228 | 229 | 230 | 231 | org.scala-lang 232 | scala-library 233 | 234 | 235 | 236 | org.scala-lang 237 | scalap 238 | 239 | 240 | 241 | 242 | org.apache.spark 243 | spark-sql_${scala.binary.version} 244 | ${spark.version} 245 | 246 | 247 | 248 | org.apache.spark 249 | spark-mllib_${scala.binary.version} 250 | ${spark.version} 251 | 252 | 253 | 254 | org.apache.spark 255 | spark-streaming_${scala.binary.version} 256 | ${spark.version} 257 | 258 | 259 | org.apache.spark 260 | spark-streaming-kafka_${scala.binary.version} 261 | ${spark.version} 262 | 263 | 264 | org.apache.spark 265 | spark-streaming_${scala.binary.version} 266 | ${spark.version} 267 | test-jar 268 | tests 269 | test 270 | 271 | 272 | org.apache.kafka 273 | kafka-clients 274 | ${kafka.version} 275 | 276 | 277 | junit 278 | junit 279 | 4.12 280 | test 281 | 282 | 283 | 284 | org.scalatest 285 | scalatest_${scala.binary.version} 286 | 2.2.1 287 | test 288 | 289 | 290 | 291 | org.kududb 292 | kudu-client 293 | ${kudu.version} 294 | 295 | 296 | org.kududb 297 | kudu-mapreduce 298 | ${kudu.version} 299 | 300 | 301 | org.apache.hadoop 302 | hadoop-client 303 | ${hadoop.version} 304 | 305 | 306 | log4j 307 | log4j 308 | 309 | 310 | javax.servlet 311 | servlet-api 312 | 313 | 314 | javax.servlet.jsp 315 | jsp-api 316 | 317 | 318 | org.jruby 319 | jruby-complete 320 | 321 | 322 | org.jboss.netty 323 | netty 324 | 325 | 326 | io.netty 327 | netty 328 | 329 | 330 | 331 | 332 | 333 | 334 | 335 | 336 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark 2 | 3 | import org.apache.spark.Logging 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.{Row, SQLContext} 6 | import org.apache.spark.sql.sources._ 7 | import org.apache.spark.sql.types._ 8 | import org.kududb.client.{RowResult} 9 | import org.kududb.{Schema, ColumnSchema, Type} 10 | 11 | import scala.collection.mutable 12 | 13 | /** 14 | * DefaultSource for integration with Spark's dataframe datasources. 
15 | * This class with produce a relationProvider based on input give to it from spark 16 | * 17 | * In all this DefaultSource support the following datasource functionality 18 | * - Scan range pruning through filter push down logic based on rowKeys 19 | * - Filter push down logic on columns that are not rowKey columns 20 | * - Qualifier filtering based on columns used in the SparkSQL statement 21 | * - Type conversions of basic SQL types 22 | */ 23 | class DefaultSource extends RelationProvider { 24 | 25 | val TABLE_KEY:String = "kudu.table" 26 | val KUDU_MASTER:String = "kudu.master" 27 | 28 | /** 29 | * Is given input from SparkSQL to construct a BaseRelation 30 | * @param sqlContext SparkSQL context 31 | * @param parameters Parameters given to us from SparkSQL 32 | * @return A BaseRelation Object 33 | */ 34 | override def createRelation(sqlContext: SQLContext, 35 | parameters: Map[String, String]): 36 | BaseRelation = { 37 | 38 | 39 | val tableName = parameters.get(TABLE_KEY) 40 | if (tableName.isEmpty) 41 | new Throwable("Invalid value for " + TABLE_KEY +" '" + tableName + "'") 42 | 43 | val kuduMaster = parameters.getOrElse(KUDU_MASTER, "") 44 | 45 | new KuduRelation(tableName.get, kuduMaster)(sqlContext) 46 | } 47 | } 48 | 49 | /** 50 | * Implementation of Spark BaseRelation that will build up our scan logic 51 | * , do the scan pruning, filter push down, and value conversions 52 | * 53 | * @param tableName Kudu table that we plan to read from 54 | * @param kuduMaster Kudu master definition 55 | * @param sqlContext SparkSQL context 56 | */ 57 | class KuduRelation (val tableName:String, 58 | val kuduMaster: String) ( 59 | @transient val sqlContext:SQLContext) 60 | extends BaseRelation with PrunedFilteredScan with Logging with Serializable { 61 | 62 | //create or get latest HBaseContext 63 | @transient var kuduContext = new KuduContext(sqlContext.sparkContext, kuduMaster) 64 | @transient var kuduClient = KuduClientCache.getKuduClient(kuduMaster) 65 | @transient var kuduTable = kuduClient.openTable(tableName) 66 | @transient var kuduSchema = kuduTable.getSchema 67 | @transient var kuduSchemaColumnMap = buildKuduSchemaColumnMap(kuduSchema) 68 | 69 | def getKuduSchemaColumnMap(): mutable.HashMap[String, ColumnSchema] = { 70 | if (kuduSchemaColumnMap == null) { 71 | kuduClient = KuduClientCache.getKuduClient(kuduMaster) 72 | kuduTable = kuduClient.openTable(tableName) 73 | kuduSchema = kuduTable.getSchema 74 | kuduSchemaColumnMap = buildKuduSchemaColumnMap(kuduSchema) 75 | } 76 | kuduSchemaColumnMap 77 | } 78 | 79 | def buildKuduSchemaColumnMap(kuduSchema:Schema): mutable.HashMap[String, ColumnSchema] = { 80 | 81 | var kuduSchemaColumnMap = new mutable.HashMap[String, ColumnSchema]() 82 | 83 | val columnIt = kuduSchema.getColumns.iterator() 84 | while (columnIt.hasNext) { 85 | val c = columnIt.next() 86 | kuduSchemaColumnMap.+=((c.getName, c)) 87 | } 88 | kuduSchemaColumnMap 89 | } 90 | 91 | /** 92 | * Generates a Spark SQL schema object so Spark SQL knows what is being 93 | * provided by this BaseRelation 94 | * 95 | * @return schema generated from the SCHEMA_COLUMNS_MAPPING_KEY value 96 | */ 97 | override def schema: StructType = { 98 | 99 | val metadataBuilder = new MetadataBuilder() 100 | 101 | val structFieldArray = new Array[StructField](kuduSchema.getColumnCount) 102 | 103 | val columnIt = kuduSchema.getColumns.iterator() 104 | var indexCounter = 0 105 | while (columnIt.hasNext) { 106 | val c = columnIt.next() 107 | 108 | val columnSparkSqlType = if (c.getType.equals(Type.BOOL)) 
BooleanType 109 | else if (c.getType.equals(Type.INT16)) IntegerType 110 | else if (c.getType.equals(Type.INT32)) IntegerType 111 | else if (c.getType.equals(Type.INT64)) LongType 112 | else if (c.getType.equals(Type.FLOAT)) FloatType 113 | else if (c.getType.equals(Type.DOUBLE)) DoubleType 114 | else if (c.getType.equals(Type.STRING)) StringType 115 | else if (c.getType.equals(Type.TIMESTAMP)) TimestampType 116 | else if (c.getType.equals(Type.BINARY)) BinaryType 117 | else throw new Throwable("Unsupported column type :" + c.getType) 118 | 119 | val metadata = metadataBuilder.putString("name", c.getName).build() 120 | val struckField = 121 | new StructField(c.getName, columnSparkSqlType, nullable = true, metadata) 122 | 123 | structFieldArray(indexCounter) = struckField 124 | indexCounter += 1 125 | } 126 | 127 | val result = new StructType(structFieldArray) 128 | result 129 | } 130 | 131 | /** 132 | * Here we are building the functionality to populate the resulting RDD[Row] 133 | * Here is where we will do the following: 134 | * - Filter push down 135 | * - Scan or GetList pruning 136 | * - Executing our scan(s) or/and GetList to generate result 137 | * 138 | * @param requiredColumns The columns that are being requested by the requesting query 139 | * @param filters The filters that are being applied by the requesting query 140 | * @return RDD will all the results from HBase needed for SparkSQL to 141 | * execute the query on 142 | */ 143 | override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { 144 | 145 | //retain the information for unit testing checks 146 | var resultRDD: RDD[Row] = null 147 | 148 | if (resultRDD == null) { 149 | 150 | val strBuilder = new StringBuilder() 151 | var isFirst = true 152 | requiredColumns.foreach( c => { 153 | if (isFirst) isFirst = false 154 | else strBuilder.append(",") 155 | strBuilder.append(c) 156 | }) 157 | 158 | val rdd = kuduContext.kuduRDD(tableName, strBuilder.toString()).map(r => { 159 | 160 | val rowResults = r._2 161 | Row.fromSeq(requiredColumns.map(c => 162 | getKuduValue(c, rowResults))) 163 | }) 164 | 165 | resultRDD=rdd 166 | } 167 | resultRDD 168 | } 169 | 170 | def getKuduValue(columnName:String, row:RowResult): Any = { 171 | 172 | val columnSchema = getKuduSchemaColumnMap.getOrElse(columnName, null) 173 | 174 | val columnType = row.getColumnType(columnName) 175 | 176 | if (columnType == Type.BINARY) row.getBinary(columnName) 177 | else if (columnType == Type.BOOL) row.getBoolean(columnName) 178 | else if (columnType == Type.DOUBLE) row.getDouble(columnName) 179 | else if (columnType == Type.FLOAT) row.getFloat(columnName) 180 | else if (columnType == Type.INT16) row.getShort(columnName) 181 | else if (columnType == Type.INT32) row.getInt(columnName) 182 | else if (columnType == Type.INT64) row.getLong(columnName) 183 | else if (columnType == Type.INT8) row.getByte(columnName) 184 | else if (columnType == Type.TIMESTAMP) row.getLong(columnName) 185 | else if (columnType == Type.STRING) row.getString(columnName) 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/KuduContext.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.kududb.spark 19 | 20 | import org.apache.hadoop.conf.Configuration 21 | import org.apache.hadoop.io.NullWritable 22 | import org.apache.spark.rdd.RDD 23 | import org.kududb.client._ 24 | import org.kududb.mapreduce.KuduTableInputFormat 25 | import scala.collection.mutable 26 | import scala.reflect.ClassTag 27 | import org.apache.spark.{Logging, SparkContext} 28 | import org.apache.spark.streaming.dstream.DStream 29 | import java.io._ 30 | 31 | /** 32 | * HBaseContext is a façade for HBase operations 33 | * like bulk put, get, increment, delete, and scan 34 | * 35 | * HBaseContext will take the responsibilities 36 | * of disseminating the configuration information 37 | * to the working and managing the life cycle of HConnections. 38 | */ 39 | class KuduContext(@transient sc: SparkContext, 40 | @transient kuduMaster: String) 41 | extends Serializable with Logging { 42 | 43 | val broadcastedKuduMaster = sc.broadcast(kuduMaster) 44 | 45 | LatestKuduContextCache.latest = this 46 | 47 | /** 48 | * A simple enrichment of the traditional Spark RDD foreachPartition. 49 | * This function differs from the original in that it offers the 50 | * developer access to a already connected HConnection object 51 | * 52 | * Note: Do not close the HConnection object. All HConnection 53 | * management is handled outside this method 54 | * 55 | * @param rdd Original RDD with data to iterate over 56 | * @param f Function to be given a iterator to iterate through 57 | * the RDD values and a HConnection object to interact 58 | * with HBase 59 | */ 60 | def foreachPartition[T](rdd: RDD[T], 61 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit):Unit = { 62 | rdd.foreachPartition( 63 | it => kuduForeachPartition(it, f)) 64 | } 65 | 66 | /** 67 | * A simple enrichment of the traditional Spark Streaming dStream foreach 68 | * This function differs from the original in that it offers the 69 | * developer access to a already connected HConnection object 70 | * 71 | * Note: Do not close the HConnection object. All HConnection 72 | * management is handled outside this method 73 | * 74 | * @param dstream Original DStream with data to iterate over 75 | * @param f Function to be given a iterator to iterate through 76 | * the DStream values and a HConnection object to 77 | * interact with HBase 78 | */ 79 | def foreachPartition[T](dstream: DStream[T], 80 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit):Unit = { 81 | dstream.foreachRDD((rdd, time) => { 82 | foreachPartition(rdd, f) 83 | }) 84 | } 85 | 86 | /** 87 | * A simple enrichment of the traditional Spark RDD mapPartition. 88 | * This function differs from the original in that it offers the 89 | * developer access to a already connected HConnection object 90 | * 91 | * Note: Do not close the HConnection object. 
All HConnection 92 | * management is handled outside this method 93 | * 94 | * @param rdd Original RDD with data to iterate over 95 | * @param mp Function to be given a iterator to iterate through 96 | * the RDD values and a HConnection object to interact 97 | * with HBase 98 | * @return Returns a new RDD generated by the user definition 99 | * function just like normal mapPartition 100 | */ 101 | def mapPartitions[T, R: ClassTag](rdd: RDD[T], 102 | mp: (Iterator[T], KuduClient, AsyncKuduClient) => Iterator[R]): RDD[R] = { 103 | 104 | rdd.mapPartitions[R](it => kuduMapPartition[T, R](it, mp)) 105 | 106 | } 107 | 108 | /** 109 | * A simple enrichment of the traditional Spark Streaming DStream 110 | * foreachPartition. 111 | * 112 | * This function differs from the original in that it offers the 113 | * developer access to a already connected HConnection object 114 | * 115 | * Note: Do not close the HConnection object. All HConnection 116 | * management is handled outside this method 117 | * 118 | * Note: Make sure to partition correctly to avoid memory issue when 119 | * getting data from HBase 120 | * 121 | * @param dstream Original DStream with data to iterate over 122 | * @param f Function to be given a iterator to iterate through 123 | * the DStream values and a HConnection object to 124 | * interact with HBase 125 | * @return Returns a new DStream generated by the user 126 | * definition function just like normal mapPartition 127 | */ 128 | def streamForeachPartition[T](dstream: DStream[T], 129 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit): Unit = { 130 | 131 | dstream.foreachRDD(rdd => this.foreachPartition(rdd, f)) 132 | } 133 | 134 | /** 135 | * A simple enrichment of the traditional Spark Streaming DStream 136 | * mapPartition. 137 | * 138 | * This function differs from the original in that it offers the 139 | * developer access to a already connected HConnection object 140 | * 141 | * Note: Do not close the HConnection object. 
All HConnection 142 | * management is handled outside this method 143 | * 144 | * Note: Make sure to partition correctly to avoid memory issue when 145 | * getting data from HBase 146 | * 147 | * @param dstream Original DStream with data to iterate over 148 | * @param f Function to be given a iterator to iterate through 149 | * the DStream values and a HConnection object to 150 | * interact with HBase 151 | * @return Returns a new DStream generated by the user 152 | * definition function just like normal mapPartition 153 | */ 154 | def streamMapPartitions[T, U: ClassTag](dstream: DStream[T], 155 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Iterator[U]): 156 | DStream[U] = { 157 | dstream.mapPartitions(it => kuduMapPartition[T, U]( 158 | it, 159 | f)) 160 | } 161 | 162 | 163 | 164 | 165 | def kuduRDD(tableName: String, columnProjection: String = null): 166 | RDD[(NullWritable, RowResult)] = { 167 | 168 | val conf = new Configuration 169 | conf.set("kudu.mapreduce.master.address",kuduMaster) 170 | conf.set("kudu.mapreduce.input.table", tableName) 171 | if (columnProjection != null) { 172 | conf.set("kudu.mapreduce.column.projection", columnProjection) 173 | } 174 | 175 | val rdd = sc.newAPIHadoopRDD(conf, classOf[KuduTableInputFormat], classOf[NullWritable], classOf[RowResult]) 176 | 177 | rdd 178 | } 179 | 180 | 181 | /** 182 | * underlining wrapper all foreach functions in HBaseContext 183 | */ 184 | private def kuduForeachPartition[T](it: Iterator[T], 185 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit) = { 186 | f(it, KuduClientCache.getKuduClient(broadcastedKuduMaster.value), 187 | KuduClientCache.getAsyncKuduClient(broadcastedKuduMaster.value)) 188 | } 189 | 190 | /** 191 | * underlining wrapper all mapPartition functions in HBaseContext 192 | * 193 | */ 194 | private def kuduMapPartition[K, U](it: Iterator[K], 195 | mp: (Iterator[K], KuduClient, AsyncKuduClient) => 196 | Iterator[U]): Iterator[U] = { 197 | 198 | 199 | val res = mp(it, 200 | KuduClientCache.getKuduClient(broadcastedKuduMaster.value), 201 | KuduClientCache.getAsyncKuduClient(broadcastedKuduMaster.value)) 202 | 203 | res 204 | 205 | } 206 | 207 | /** 208 | * underlining wrapper all get mapPartition functions in HBaseContext 209 | */ 210 | private class ScannerMapPartition[T, U](batchSize: Integer, 211 | makeScanner: (T, KuduClient, AsyncKuduClient) => KuduScanner, 212 | convertResult: (RowResultIterator) => U) 213 | extends Serializable { 214 | 215 | def run(iterator: Iterator[T], kuduClient: KuduClient, asyncKuduClient: AsyncKuduClient): Iterator[U] = { 216 | 217 | 218 | iterator.flatMap( t => { 219 | val resultList = new mutable.MutableList[U] 220 | val scanner = makeScanner(t, kuduClient, asyncKuduClient) 221 | 222 | while (scanner.hasMoreRows) { 223 | resultList.+=(convertResult(scanner.nextRows())) 224 | } 225 | resultList.iterator 226 | }) 227 | } 228 | } 229 | 230 | /** 231 | * Produces a ClassTag[T], which is actually just a casted ClassTag[AnyRef]. 232 | * 233 | * This method is used to keep ClassTags out of the external Java API, as 234 | * the Java compiler cannot produce them automatically. While this 235 | * ClassTag-faking does please the compiler, it can cause problems at runtime 236 | * if the Scala API relies on ClassTags for correctness. 237 | * 238 | * Often, though, a ClassTag[AnyRef] will not lead to incorrect behavior, 239 | * just worse performance or security issues. 240 | * For instance, an Array of AnyRef can hold any type T, but may lose primitive 241 | * specialization. 
242 | */ 243 | private[spark] 244 | def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]] 245 | } 246 | 247 | object LatestKuduContextCache { 248 | var latest:KuduContext = null 249 | } 250 | 251 | object KuduClientCache { 252 | var kuduClient: KuduClient = null 253 | var asyncKuduClient: AsyncKuduClient = null 254 | 255 | def getKuduClient(kuduMaster: String): KuduClient = { 256 | this.synchronized { 257 | if (kuduClient == null) { 258 | kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 259 | } 260 | } 261 | kuduClient 262 | } 263 | 264 | def getAsyncKuduClient(kuduMaster: String): AsyncKuduClient = { 265 | this.synchronized { 266 | if (asyncKuduClient == null) { 267 | asyncKuduClient = new AsyncKuduClient.AsyncKuduClientBuilder(kuduMaster).build() 268 | } 269 | } 270 | asyncKuduClient 271 | } 272 | 273 | } 274 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/KuduDStreamFunctions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.kududb.spark 18 | 19 | import org.apache.spark.streaming.dstream.DStream 20 | import org.kududb.client._ 21 | 22 | import scala.reflect.ClassTag 23 | 24 | /** 25 | * HBaseDStreamFunctions contains a set of implicit functions that can be 26 | * applied to a Spark DStream so that we can easily interact with HBase 27 | */ 28 | object KuduDStreamFunctions { 29 | 30 | /** 31 | * These are implicit methods for a DStream that contains any type of 32 | * data. 33 | * 34 | * @param dStream This is for dStreams of any type 35 | * @tparam T Type T 36 | */ 37 | implicit class GenericKuduDStreamFunctions[T](val dStream: DStream[T]) { 38 | 39 | 40 | /** 41 | * Implicit method that gives easy access to HBaseContext's 42 | * foreachPartition method. This will ack very much like a normal DStream 43 | * foreach method but for the fact that you will now have a HBase connection 44 | * while iterating through the values. 45 | * 46 | * @param kc The kuduContext object to identify which HBase 47 | * cluster connection to use 48 | * @param f This function will get an iterator for a Partition of an 49 | * DStream along with a connection object to HBase 50 | */ 51 | def kuduForeachPartition(kc: KuduContext, 52 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit): Unit = { 53 | kc.streamForeachPartition(dStream, f) 54 | } 55 | 56 | /** 57 | * Implicit method that gives easy access to HBaseContext's 58 | * mapPartitions method. 
This will ask very much like a normal DStream 59 | * map partitions method but for the fact that you will now have a 60 | * HBase connection while iterating through the values 61 | * 62 | * @param kc The kuduContext object to identify which HBase 63 | * cluster connection to use 64 | * @param f This function will get an iterator for a Partition of an 65 | * DStream along with a connection object to HBase 66 | * @tparam R This is the type of objects that will go into the resulting 67 | * DStream 68 | * @return A resulting DStream of type R 69 | */ 70 | def kuduMapPartitions[R: ClassTag](kc: KuduContext, 71 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Iterator[R]): 72 | DStream[R] = { 73 | kc.streamMapPartitions(dStream, f) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/KuduRDDFunctions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.kududb.spark 19 | 20 | import org.apache.spark.rdd.RDD 21 | import org.kududb.client._ 22 | 23 | import scala.reflect.ClassTag 24 | 25 | /** 26 | * HBaseRDDFunctions contains a set of implicit functions that can be 27 | * applied to a Spark RDD so that we can easily interact with HBase 28 | */ 29 | object KuduRDDFunctions 30 | { 31 | 32 | /** 33 | * These are implicit methods for a RDD that contains any type of 34 | * data. 35 | * 36 | * @param rdd This is for rdd of any type 37 | * @tparam T This is any type 38 | */ 39 | implicit class GenericHBaseRDDFunctions[T](val rdd: RDD[T]) { 40 | 41 | 42 | /** 43 | * Implicit method that gives easy access to HBaseContext's 44 | * foreachPartition method. This will ack very much like a normal RDD 45 | * foreach method but for the fact that you will now have a HBase connection 46 | * while iterating through the values. 47 | * 48 | * @param kc The hbaseContext object to identify which HBase 49 | * cluster connection to use 50 | * @param f This function will get an iterator for a Partition of an 51 | * RDD along with a connection object to HBase 52 | */ 53 | def hbaseForeachPartition(kc: KuduContext, 54 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit): Unit = { 55 | kc.foreachPartition(rdd, f) 56 | } 57 | 58 | /** 59 | * Implicit method that gives easy access to HBaseContext's 60 | * mapPartitions method. 
This will ask very much like a normal RDD 61 | * map partitions method but for the fact that you will now have a 62 | * HBase connection while iterating through the values 63 | * 64 | * @param kc The kuduContext object to identify which HBase 65 | * cluster connection to use 66 | * @param f This function will get an iterator for a Partition of an 67 | * RDD along with a connection object to HBase 68 | * @tparam R This is the type of objects that will go into the resulting 69 | * RDD 70 | * @return A resulting RDD of type R 71 | */ 72 | def hbaseMapPartitions[R: ClassTag](kc: KuduContext, 73 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Iterator[R]): 74 | RDD[R] = { 75 | kc.mapPartitions[T,R](rdd, f) 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/basic/AddSingleRecord.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.basic 2 | 3 | import java.util.Random 4 | 5 | import org.kududb.client.{PartialRow, KuduClient} 6 | 7 | object AddSingleRecord { 8 | def main(args:Array[String]): Unit = { 9 | if (args.length == 0) { 10 | println(" ") 11 | return 12 | } 13 | 14 | val kuduMaster = args(0) 15 | val tableName = args(1) 16 | val rowKey = args(2) 17 | 18 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 19 | val table = kuduClient.openTable(tableName) 20 | val session = kuduClient.newSession() 21 | 22 | val lowerBound = new PartialRow(table.getSchema) 23 | lowerBound.addString(0, rowKey) 24 | val upperBound = new PartialRow(table.getSchema) 25 | upperBound.addString(0, rowKey + "_") 26 | 27 | var startTime = System.currentTimeMillis() 28 | val random = new Random() 29 | 30 | startTime = System.currentTimeMillis() 31 | val update = table.newInsert() 32 | val row = update.getRow 33 | row.addString(0, rowKey) 34 | val columns = table.getSchema.getColumns 35 | for (c <- 1 until columns.size()) { 36 | println(columns.get(c).getName + " " + columns.get(c).getType) 37 | row.addInt(columns.get(c).getName, random.nextInt(100000)) 38 | } 39 | session.apply(update) 40 | println("new key: " + rowKey) 41 | println(" new key time spent: " + (System.currentTimeMillis() - startTime)) 42 | 43 | startTime = System.currentTimeMillis() 44 | val scanner2 = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build() 45 | 46 | while (scanner2.hasMoreRows) { 47 | val rows = scanner2.nextRows() 48 | while (rows.hasNext) { 49 | val row = rows.next() 50 | println("NewValue: " + rowKey + " " + row.rowToString()) 51 | } 52 | } 53 | scanner2.close() 54 | println(" scan time spent: " + (System.currentTimeMillis() - startTime)) 55 | 56 | val scannerX = kuduClient.newScannerBuilder(table).build() 57 | while (scannerX.hasMoreRows) { 58 | val rows = scannerX.nextRows() 59 | while (rows.hasNext) { 60 | val row = rows.next() 61 | println("Full Scan: " + row.rowToString()) 62 | } 63 | } 64 | println("done") 65 | kuduClient.shutdown() 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/basic/BasicExample.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.basic 2 | 3 | import java.util 4 | import java.util.Random 5 | 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import 
org.kududb.ColumnSchema.ColumnSchemaBuilder 9 | import org.kududb.client.KuduClient 10 | import org.kududb.{ColumnSchema, Schema, Type} 11 | 12 | object BasicExample { 13 | def main(args: Array[String]): Unit = { 14 | 15 | val kuduMaster = "quickstart.cloudera" 16 | 17 | println(" -- Starting ") 18 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 19 | try { 20 | println(" -- ") 21 | 22 | val columnList = new util.ArrayList[ColumnSchema]() 23 | columnList.add(new ColumnSchemaBuilder("KEY_ID", Type.STRING).key(true).build()) 24 | columnList.add(new ColumnSchemaBuilder("COL_A", Type.STRING).key(false).build()) 25 | columnList.add(new ColumnSchemaBuilder("COL_B", Type.STRING).key(false).build()) 26 | columnList.add(new ColumnSchemaBuilder("COL_C", Type.STRING).key(false).build()) 27 | val schema = new Schema(columnList) 28 | 29 | if (kuduClient.tableExists("foobar")) { 30 | kuduClient.deleteTable("foobar") 31 | } 32 | kuduClient.createTable("foobar", schema) 33 | 34 | val session = kuduClient.newSession() 35 | val table = kuduClient.openTable("foobar") 36 | 37 | try { 38 | val random = new Random() 39 | for (i <- 0 until 10) { 40 | val insert = table.newInsert() 41 | val row = insert.getRow() 42 | row.addString(0, i.toString) 43 | row.addString(1, "value " + i) 44 | row.addString(2, "42:" + i) 45 | row.addString(3, "Cat" + random.nextGaussian()) 46 | session.apply(insert) 47 | } 48 | session.flush() 49 | } finally { 50 | session.close() 51 | } 52 | 53 | val tableList = kuduClient.getTablesList.getTablesList 54 | for (i <- 0 until tableList.size()) { 55 | println("Table " + i + ":" + tableList.get(i)) 56 | } 57 | 58 | val sparkConfig = new SparkConf() 59 | sparkConfig.set("spark.broadcast.compress", "false") 60 | sparkConfig.set("spark.shuffle.compress", "false") 61 | sparkConfig.set("spark.shuffle.spill.compress", "false") 62 | val sc = new SparkContext("local[2]", "SparkSQL on Kudu", sparkConfig) 63 | 64 | val sqlContext = new SQLContext(sc) 65 | 66 | val df = sqlContext.load("org.kududb.spark", 67 | Map("kudu.table" -> "foobar", "kudu.master" -> kuduMaster)) 68 | 69 | df.registerTempTable("foobar") 70 | 71 | sqlContext.sql("SELECT * FROM foobar").foreach(r => { 72 | println("Row: " + r) 73 | }) 74 | } finally { 75 | kuduClient.shutdown() 76 | } 77 | println("-- finished") 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/basic/BasicSparkSQLExamples.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.basic 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.sql.SQLContext 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import org.apache.spark.mllib.clustering.KMeans 7 | import org.apache.spark.mllib.linalg.Vectors 8 | 9 | object BasicSparkSQLExamples { 10 | def main(args:Array[String]): Unit = { 11 | if (args.length == 0) { 12 | println(" ") 13 | } 14 | 15 | Logger.getRootLogger.setLevel(Level.ERROR) 16 | 17 | val kuduMaster = args(0) 18 | val tableName = args(1) 19 | val runLocal = args(2).equals("l") 20 | 21 | println("starting") 22 | var sc:SparkContext = null 23 | if (runLocal) { 24 | val sparkConfig = new SparkConf() 25 | sparkConfig.set("spark.broadcast.compress", "false") 26 | sparkConfig.set("spark.shuffle.compress", "false") 27 | sparkConfig.set("spark.shuffle.spill.compress", "false") 28 | sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) 29 | } else 
{ 30 | val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") 31 | sc = new SparkContext(sparkConfig) 32 | } 33 | 34 | try { 35 | println("Setting up Tables") 36 | val sqlContext = new SQLContext(sc) 37 | sqlContext.load("org.kududb.spark", 38 | Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName) 39 | 40 | println("Query 1: SELECT count(*) FROM " + tableName) 41 | val startTimeQ1 = System.currentTimeMillis() 42 | sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => { 43 | println(" - (" + r + ")") 44 | }) 45 | println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) 46 | 47 | println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100") 48 | val startTimeQ2 = System.currentTimeMillis() 49 | sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => { 50 | println(" - (" + r + ")") 51 | }) 52 | println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) 53 | 54 | val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)" 55 | println("Query 3: " + q3) 56 | val startTimeQ3 = System.currentTimeMillis() 57 | sqlContext.sql(q3).take(100).foreach(r => { 58 | println(" - (" + r + ")") 59 | }) 60 | println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) 61 | /* 62 | val q4 = "select host, metric, avg(value), count(*) from metrics group by host, metric" 63 | println("Query 4: " + q4) 64 | val startTimeQ4 = System.currentTimeMillis() 65 | sqlContext.sql(q4).take(100).foreach(r => { 66 | println(" - (" + r + ")") 67 | }) 68 | println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4)) 69 | 70 | */ 71 | 72 | println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName ) 73 | val startTimeQ5 = System.currentTimeMillis() 74 | val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000") 75 | 76 | val parsedData = resultDf.map(r => { 77 | val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble) 78 | Vectors.dense(array) 79 | }) 80 | val clusters = KMeans.train(parsedData, 3, 4) 81 | clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) 82 | 83 | //TODO add Mllib here 84 | println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) 85 | 86 | } finally { 87 | sc.stop() 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/basic/InitialDataPopulation.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.basic 2 | 3 | import java.util 4 | import java.util.Random 5 | 6 | import org.kududb.{Schema, Type, ColumnSchema} 7 | import org.kududb.ColumnSchema.ColumnSchemaBuilder 8 | import org.kududb.client.{AsyncKuduClient, KuduClient} 9 | 10 | 11 | object InitialDataPopulation { 12 | def main(args:Array[String]): Unit = { 13 | if (args.length == 0) { 14 | println(" ") 15 | 16 | //"quickstart.cloudera" 17 | 18 | return 19 | } 20 | val kuduMaster = args(0) 21 | val tableName = args(1) 22 | val numOfColumns = args(2).toInt 23 | val numOfRows = args(3).toInt 24 | 25 | val kuduClient = new AsyncKuduClient.AsyncKuduClientBuilder(kuduMaster).build() 26 | try { 27 | //Delete table if exist 28 | if (kuduClient.tableExists(tableName).join()) { 29 | kuduClient.deleteTable(tableName).join() 30 | } 31 | 32 | //Create Schema 33 | val 
columnList = new util.ArrayList[ColumnSchema]() 34 | columnList.add(new ColumnSchemaBuilder("key_id", Type.STRING).key(true).build()) 35 | for (c <- 0 until numOfColumns) { 36 | columnList.add(new ColumnSchemaBuilder("col_" + c, Type.INT32).key(false).build()) 37 | } 38 | val schema = new Schema(columnList) 39 | 40 | //Create table 41 | kuduClient.createTable(tableName, schema).join() 42 | 43 | //Populate table 44 | val random = new Random 45 | val table = kuduClient.openTable(tableName).join() 46 | val asyncSession = kuduClient.newSession() 47 | 48 | for (r <- 0 until numOfRows) { 49 | val insert = table.newInsert() 50 | val row = insert.getRow() 51 | row.addString(0, NameGenerator.getName()) 52 | val columns = table.getSchema.getColumns 53 | for (c <- 1 until columns.size()) { 54 | row.addInt(columns.get(c).getName, random.nextInt(100000)) 55 | } 56 | asyncSession.apply(insert) 57 | 58 | if (r % 1000 == 0) { 59 | println("Inserted: " + r) 60 | } 61 | } 62 | asyncSession.flush() 63 | 64 | val scannerX = kuduClient.newScannerBuilder(table).build() 65 | while (scannerX.hasMoreRows) { 66 | val rows = scannerX.nextRows().join() 67 | while (rows.hasNext) { 68 | val row = rows.next() 69 | println(" - " + row.rowToString()) 70 | } 71 | } 72 | 73 | asyncSession.close() 74 | 75 | } finally { 76 | kuduClient.shutdown() 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/basic/ModifySingleRecord.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.basic 2 | 3 | import org.kududb.client.{PartialRow, KuduClient} 4 | 5 | object ModifySingleRecord { 6 | def main(args:Array[String]): Unit = { 7 | if (args.length == 0) { 8 | println(" ") 9 | return 10 | } 11 | 12 | val kuduMaster = args(0) 13 | val tableName = args(1) 14 | val rowKey = args(2) 15 | val columnIndexToChange = args(3).toInt 16 | val newValue = args(4).toInt 17 | 18 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 19 | val table = kuduClient.openTable(tableName) 20 | val session = kuduClient.newSession() 21 | 22 | val lowerBound = new PartialRow(table.getSchema) 23 | lowerBound.addString(0, rowKey) 24 | val upperBound = new PartialRow(table.getSchema) 25 | upperBound.addString(0, rowKey + "_") 26 | 27 | var startTime = System.currentTimeMillis() 28 | val scanner = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build() 29 | 30 | while (scanner.hasMoreRows) { 31 | val rows = scanner.nextRows() 32 | while (rows.hasNext) { 33 | val row = rows.next() 34 | println("InitialValue: " + rowKey + " " + row.rowToString()) 35 | } 36 | } 37 | println(" scan time spent: " + (System.currentTimeMillis() - startTime)) 38 | scanner.close() 39 | 40 | startTime = System.currentTimeMillis() 41 | val update = table.newUpdate() 42 | val row = update.getRow 43 | row.addString(0, rowKey) 44 | row.addInt(columnIndexToChange, newValue) 45 | session.apply(update) 46 | println("Update: " + rowKey) 47 | println(" update time spent: " + (System.currentTimeMillis() - startTime)) 48 | 49 | startTime = System.currentTimeMillis() 50 | val scanner2 = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build() 51 | 52 | while (scanner2.hasMoreRows) { 53 | val rows = scanner.nextRows() 54 | while (rows.hasNext) { 55 | val row = rows.next() 56 | println("NewValue: " + rowKey + " " + row.rowToString()) 57 | } 58 | } 59 
| scanner2.close() 60 | println(" scan time spent: " + (System.currentTimeMillis() - startTime)) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/basic/NameGenerator.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.basic 2 | 3 | import java.util.Random 4 | 5 | import scala.collection.mutable 6 | 7 | object NameGenerator { 8 | 9 | val random = new Random() 10 | val listOfNames = new mutable.MutableList[NameAndCounter] 11 | listOfNames += new NameAndCounter("Katlyn") 12 | listOfNames += new NameAndCounter("Laurena") 13 | listOfNames += new NameAndCounter("Jenise") 14 | listOfNames += new NameAndCounter("Vida") 15 | listOfNames += new NameAndCounter("Delphine") 16 | listOfNames += new NameAndCounter("Tiffanie") 17 | listOfNames += new NameAndCounter("Carroll") 18 | listOfNames += new NameAndCounter("Steve") 19 | listOfNames += new NameAndCounter("Nu") 20 | listOfNames += new NameAndCounter("Robbin") 21 | listOfNames += new NameAndCounter("Mahalia") 22 | listOfNames += new NameAndCounter("Norah") 23 | listOfNames += new NameAndCounter("Selina") 24 | listOfNames += new NameAndCounter("Cornelius") 25 | listOfNames += new NameAndCounter("Bennie") 26 | listOfNames += new NameAndCounter("Kemberly") 27 | listOfNames += new NameAndCounter("Johnie") 28 | listOfNames += new NameAndCounter("Jenee") 29 | listOfNames += new NameAndCounter("Napoleon") 30 | listOfNames += new NameAndCounter("Brenton") 31 | listOfNames += new NameAndCounter("Roxana") 32 | listOfNames += new NameAndCounter("Kalyn") 33 | listOfNames += new NameAndCounter("Jeana") 34 | listOfNames += new NameAndCounter("Tennie") 35 | listOfNames += new NameAndCounter("Tasia") 36 | listOfNames += new NameAndCounter("Ashely") 37 | listOfNames += new NameAndCounter("Hester") 38 | listOfNames += new NameAndCounter("Zita") 39 | listOfNames += new NameAndCounter("Evalyn") 40 | listOfNames += new NameAndCounter("Anderson") 41 | listOfNames += new NameAndCounter("Elaina") 42 | listOfNames += new NameAndCounter("Benny") 43 | listOfNames += new NameAndCounter("Heidi") 44 | listOfNames += new NameAndCounter("Mammie") 45 | listOfNames += new NameAndCounter("Alisa") 46 | listOfNames += new NameAndCounter("Billie") 47 | listOfNames += new NameAndCounter("Wan") 48 | listOfNames += new NameAndCounter("Dionna") 49 | listOfNames += new NameAndCounter("Julene") 50 | listOfNames += new NameAndCounter("Chasidy") 51 | listOfNames += new NameAndCounter("Vennie") 52 | listOfNames += new NameAndCounter("Cara") 53 | listOfNames += new NameAndCounter("Charissa") 54 | listOfNames += new NameAndCounter("Russell") 55 | listOfNames += new NameAndCounter("Daniela") 56 | listOfNames += new NameAndCounter("Kindra") 57 | listOfNames += new NameAndCounter("Eduardo") 58 | listOfNames += new NameAndCounter("Marci") 59 | listOfNames += new NameAndCounter("Gustavo") 60 | listOfNames += new NameAndCounter("Dianna ") 61 | 62 | def getName(): String = { 63 | val nameAndCounter = listOfNames.get(random.nextInt(listOfNames.length - 1)).get 64 | nameAndCounter.counter += 1 65 | nameAndCounter.name + "_" + nameAndCounter.counter 66 | } 67 | } 68 | 69 | class NameAndCounter(val name:String = "N/A", var counter:Int = 0) { 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/basic/ScanTable.scala: 
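A note on the scan demos: ScanTable below walks every row and every column of the target table. A minimal key-range variant is sketched here under a few assumptions: the master address "quickstart.cloudera" and the table name "basic_table" are placeholders, and the string keys are assumed to look like the NameGenerator output used by InitialDataPopulation. It reuses only the PartialRow bound pattern already shown in AddSingleRecord and ModifySingleRecord.

import org.kududb.client.{KuduClient, PartialRow}

object ScanKeyRange {
  def main(args: Array[String]): Unit = {
    val kuduClient = new KuduClient.KuduClientBuilder("quickstart.cloudera").build() // placeholder master
    val table = kuduClient.openTable("basic_table")                                  // placeholder table name

    // Bound the scan to keys in ["Steve_", "Steve_~"), i.e. rows generated for the name "Steve".
    val lower = new PartialRow(table.getSchema)
    lower.addString(0, "Steve_")
    val upper = new PartialRow(table.getSchema)
    upper.addString(0, "Steve_~")

    val scanner = kuduClient.newScannerBuilder(table).lowerBound(lower).exclusiveUpperBound(upper).build()
    while (scanner.hasMoreRows) {
      val rows = scanner.nextRows()
      while (rows.hasNext) {
        println(" - " + rows.next().rowToString())
      }
    }
    scanner.close()
    kuduClient.shutdown()
  }
}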
-------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.basic 2 | 3 | import org.kududb.client.KuduClient 4 | 5 | object ScanTable { 6 | def main(args:Array[String]): Unit = { 7 | if (args.length == 0) { 8 | println(" ") 9 | return 10 | } 11 | val kuduMaster = args(0) 12 | val tableName = args(1) 13 | val limit = args(2).toInt 14 | 15 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 16 | val table = kuduClient.openTable(tableName) 17 | println("starting scan") 18 | val scannerX = kuduClient.newScannerBuilder(table).build() 19 | while (scannerX.hasMoreRows) { 20 | val rows = scannerX.nextRows() 21 | while (rows.hasNext) { 22 | val row = rows.next() 23 | println(" - " + row.rowToString()) 24 | } 25 | } 26 | println("finished scan") 27 | kuduClient.shutdown() 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/GamerEvent.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer 2 | 3 | class GamerEvent(var gamerId:String = "", 4 | var lastTimePlayed:Long = 0, 5 | var gamesPlayed:Int = 1, 6 | var gamesWon:Int = 0, 7 | var oks:Int = 0, 8 | var deaths:Int = 0, 9 | var damageGiven:Int = 0, 10 | var damageTaken:Int = 0, 11 | var isInsert:Boolean = false, 12 | var maxOksInOneGame:Int = 0, 13 | var maxDeathsInOneGame:Int = 0, 14 | var hasChanged:Boolean = false) extends Serializable { 15 | 16 | override def toString():String = { 17 | gamerId + "," + 18 | lastTimePlayed + "," + 19 | gamesPlayed + "," + 20 | gamesWon + "," + 21 | oks + "," + 22 | deaths + "," + 23 | damageGiven + "," + 24 | damageTaken + "," + 25 | isInsert + "," + 26 | maxOksInOneGame + "," + 27 | maxDeathsInOneGame 28 | } 29 | 30 | def += (gamerEvent: GamerEvent): Unit = { 31 | gamerId = gamerEvent.gamerId 32 | lastTimePlayed = gamerEvent.lastTimePlayed 33 | gamesPlayed += gamerEvent.gamesPlayed 34 | gamesWon += gamerEvent.gamesWon 35 | oks += gamerEvent.oks 36 | deaths += gamerEvent.deaths 37 | damageGiven += gamerEvent.damageGiven 38 | damageTaken += gamerEvent.damageTaken 39 | if (oks > maxOksInOneGame) maxOksInOneGame = oks 40 | if (deaths > maxDeathsInOneGame) maxDeathsInOneGame = deaths 41 | isInsert = isInsert && gamerEvent.isInsert 42 | } 43 | } 44 | 45 | object GamerEventBuilder extends Serializable { 46 | def build(input:String):GamerEvent = { 47 | val parts = input.split(",") 48 | 49 | if (parts(0).startsWith("14")) println("input:" + input) 50 | 51 | new GamerEvent(parts(0), 52 | parts(1).toLong, 53 | parts(2).toInt, 54 | parts(3).toInt, 55 | parts(4).toInt, 56 | parts(5).toInt, 57 | parts(6).toInt, 58 | parts(7).toInt, 59 | parts(8).equals("true"), 60 | parts(9).toInt, 61 | parts(10).toInt) 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/aggregates/CreateKuduTable.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.aggregates 2 | 3 | import java.util 4 | import java.util.ArrayList 5 | 6 | import org.kududb.ColumnSchema.ColumnSchemaBuilder 7 | import org.kududb.client.{CreateTableOptions, KuduClient} 8 | import org.kududb.{ColumnSchema, Schema, Type} 9 | 10 | object CreateGamerAggregatesKuduTable { 11 | def main(args:Array[String]): Unit = { 12 | if (args.length == 0) { 13 | println("{kuduMaster} {tableName}") 14 
| return 15 | } 16 | 17 | val kuduMaster = args(0) 18 | val tableName = args(1) 19 | val numberOfBuckets = args(2).toInt 20 | 21 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 22 | val columnList = new util.ArrayList[ColumnSchema]() 23 | 24 | columnList.add(new ColumnSchemaBuilder("gamer_id", Type.STRING).key(true).build()) 25 | columnList.add(new ColumnSchemaBuilder("last_time_played", Type.INT64).key(false).build()) 26 | columnList.add(new ColumnSchemaBuilder("games_played", Type.INT32).key(false).build()) 27 | columnList.add(new ColumnSchemaBuilder("games_won", Type.INT32).key(false).build()) 28 | columnList.add(new ColumnSchemaBuilder("oks", Type.INT32).key(false).build()) 29 | columnList.add(new ColumnSchemaBuilder("deaths", Type.INT32).key(false).build()) 30 | columnList.add(new ColumnSchemaBuilder("damage_given", Type.INT32).key(false).build()) 31 | columnList.add(new ColumnSchemaBuilder("damage_taken", Type.INT32).key(false).build()) 32 | columnList.add(new ColumnSchemaBuilder("max_oks_in_one_game", Type.INT32).key(false).build()) 33 | columnList.add(new ColumnSchemaBuilder("max_deaths_in_one_game", Type.INT32).key(false).build()) 34 | val schema = new Schema(columnList) 35 | 36 | if (kuduClient.tableExists(tableName)) { 37 | println("Deleting Table") 38 | kuduClient.deleteTable(tableName) 39 | } 40 | val builder = new CreateTableOptions() 41 | 42 | val hashColumnList = new ArrayList[String] 43 | hashColumnList.add("gamer_id") 44 | 45 | builder.addHashPartitions(hashColumnList, numberOfBuckets) 46 | 47 | println("Creating Table") 48 | kuduClient.createTable(tableName, schema, builder) 49 | println("Created Table") 50 | kuduClient.shutdown() 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/aggregates/DirectDataInjector.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.aggregates 2 | 3 | import java.util.Random 4 | 5 | import org.kududb.client.KuduClient 6 | 7 | object DirectDataInjector { 8 | 9 | val random = new Random 10 | def main(args:Array[String]): Unit = { 11 | 12 | if (args.length == 0) { 13 | println(" ") 14 | return 15 | } 16 | 17 | val kuduMaster = args(0) 18 | val tableName = args(1) 19 | val numberOfRecords = args(2).toInt 20 | 21 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 22 | val table = kuduClient.openTable(tableName) 23 | val session = kuduClient.newSession() 24 | 25 | table.newInsert() 26 | 27 | for (i <- 0 to numberOfRecords) { 28 | val record = GamerDataGenerator.makeNewGamerRecord(100000) 29 | val op = table.newInsert() 30 | 31 | val row = op.getRow 32 | row.addString("gamer_id", record.gamerId) 33 | row.addLong("last_time_played", record.lastTimePlayed) 34 | row.addInt("games_played", record.gamesPlayed) 35 | row.addInt("games_won", record.gamesWon) 36 | row.addInt("oks", record.oks) 37 | row.addInt("deaths", record.deaths) 38 | row.addInt("damage_given", record.damageGiven) 39 | row.addInt("damage_taken", record.damageTaken) 40 | row.addInt("max_oks_in_one_game", record.maxOksInOneGame) 41 | row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame) 42 | 43 | session.apply(op) 44 | } 45 | session.flush() 46 | 47 | kuduClient.close() 48 | 49 | 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/aggregates/GamerAggergatesSparkStreaming.scala: 
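The streaming job below (GamerAggergatesSparkStreaming) leans on the implicit GenericKuduDStreamFunctions from KuduDStreamFunctions.scala. A minimal sketch of that write path follows; it assumes a StreamingContext named ssc and a DStream[String] named lines already exist, and the master address and the "gamer" table name are placeholders borrowed from the other demos, so treat it as an outline rather than the job's actual logic.

import org.kududb.client.SessionConfiguration.FlushMode
import org.kududb.spark.KuduContext
import org.kududb.spark.KuduDStreamFunctions.GenericKuduDStreamFunctions

val kuduContext = new KuduContext(ssc.sparkContext, "quickstart.cloudera") // placeholder master

lines.kuduForeachPartition(kuduContext, (it, kuduClient, asyncKuduClient) => {
  // One Kudu connection per partition, which is exactly what the implicit provides.
  val table = kuduClient.openTable("gamer") // placeholder table name
  val session = kuduClient.newSession()
  session.setFlushMode(FlushMode.AUTO_FLUSH_BACKGROUND)
  it.foreach(line => {
    // Parse the line and build an Insert or Update here, then session.apply(op),
    // as the full job below does with GamerEventBuilder.
  })
  session.close()
})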
-------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.aggregates 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.streaming.dstream.DStream 6 | import org.apache.spark.streaming.kafka.KafkaUtils 7 | import org.apache.spark.streaming.{Seconds, StreamingContext} 8 | import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} 9 | import org.kududb.client.Operation 10 | import org.kududb.client.SessionConfiguration.FlushMode 11 | import org.kududb.spark.KuduContext 12 | import org.kududb.spark.KuduDStreamFunctions.GenericKuduDStreamFunctions 13 | import org.kududb.spark.demo.gamer.{GamerEvent, GamerEventBuilder} 14 | 15 | object GamerAggergatesSparkStreaming { 16 | 17 | def main(args:Array[String]): Unit = { 18 | if (args.length == 0) { 19 | println("{brokerList} {topics} {kuduMaster} {tableName} {local}") 20 | } 21 | val brokerList = args(0) 22 | val topics = args(1) 23 | val kuduMaster = args(2) 24 | val tableName = args(3) 25 | val runLocal = args(4).equals("L") 26 | 27 | val sparkConf = new SparkConf().setAppName("GamerAggergatesSparkStreaming") 28 | var ssc:StreamingContext = null 29 | if (runLocal) { 30 | println("Running Local") 31 | val sparkConfig = new SparkConf() 32 | sparkConfig.set("spark.broadcast.compress", "false") 33 | sparkConfig.set("spark.shuffle.compress", "false") 34 | sparkConfig.set("spark.shuffle.spill.compress", "false") 35 | sparkConfig.set("spark.io.compression.codec", "lzf") 36 | val sc = new SparkContext("local[4]", "SparkSQL on Kudu", sparkConfig) 37 | ssc = new StreamingContext(sc, Seconds(2)) 38 | } else { 39 | println("Running Cluster") 40 | ssc = new StreamingContext(sparkConf, Seconds(2)) 41 | } 42 | 43 | val kuduContext = new KuduContext(ssc.sparkContext, kuduMaster) 44 | 45 | //Get original values from Kudu 46 | val originalKuduDStream = loadOriginalKuduData(tableName, kuduContext, ssc) 47 | 48 | //Connect to Kafka 49 | val newKafkaMessageDStream = loadDataFromKafka(topics, brokerList, ssc) 50 | 51 | val currentStateDStream = newKafkaMessageDStream.updateStateByKey[GamerEvent]( 52 | (a:Seq[String], b:Option[GamerEvent]) => { 53 | val it = a.iterator 54 | if (!it.hasNext) { 55 | if (!b.isEmpty) { 56 | val existing = b.get 57 | existing.hasChanged = false 58 | Some(existing) 59 | } else { 60 | None 61 | } 62 | } else { 63 | val resultingValue = new GamerEvent() 64 | 65 | //Add up all the values in this micro batch 66 | while (it.hasNext) { 67 | val newPart = it.next() 68 | resultingValue += GamerEventBuilder.build(newPart) 69 | } 70 | 71 | if (b.isEmpty) { 72 | resultingValue.isInsert = true 73 | resultingValue.hasChanged = true 74 | Some(resultingValue) 75 | } else { 76 | val existing = b.get 77 | existing += resultingValue 78 | existing.isInsert = false 79 | existing.hasChanged = true 80 | Some(existing) 81 | } 82 | } 83 | }, new HashPartitioner (ssc.sparkContext.defaultParallelism), originalKuduDStream) 84 | 85 | currentStateDStream.kuduForeachPartition(kuduContext, (it, kuduClient, asyncKuduClient) => { 86 | val table = kuduClient.openTable(tableName) 87 | 88 | //This can be made to be faster 89 | val session = kuduClient.newSession() 90 | session.setFlushMode(FlushMode.AUTO_FLUSH_BACKGROUND) 91 | 92 | var operation: Operation = null 93 | 94 | var upserts = 0 95 | while (it.hasNext) { 96 | val gamerEventTuple = it.next() 97 | 98 | if (gamerEventTuple._2.hasChanged == true) { 99 | if (gamerEventTuple._2.isInsert) { 100 | 
operation = table.newInsert() 101 | } else { 102 | operation = table.newUpdate() 103 | } 104 | 105 | val row = operation.getRow 106 | row.addString("gamer_id", gamerEventTuple._2.gamerId.toString) 107 | row.addLong("last_time_played", gamerEventTuple._2.lastTimePlayed) 108 | row.addInt("games_played", gamerEventTuple._2.gamesPlayed) 109 | row.addInt("games_won", gamerEventTuple._2.gamesWon) 110 | row.addInt("oks", gamerEventTuple._2.oks) 111 | row.addInt("deaths", gamerEventTuple._2.deaths) 112 | row.addInt("damage_given", gamerEventTuple._2.damageGiven) 113 | row.addInt("damage_taken", gamerEventTuple._2.damageTaken) 114 | row.addInt("max_oks_in_one_game", gamerEventTuple._2.maxOksInOneGame) 115 | row.addInt("max_deaths_in_one_game", gamerEventTuple._2.maxDeathsInOneGame) 116 | 117 | session.apply(operation) 118 | 119 | upserts += 1 120 | } 121 | } 122 | session.close() 123 | 124 | println("upserts: " + upserts) 125 | }) 126 | ssc.checkpoint("./checkpoint") 127 | ssc.start() 128 | ssc.awaitTermination() 129 | } 130 | 131 | def loadDataFromKafka(topics:String, 132 | brokerList:String, 133 | ssc:StreamingContext): DStream[(String, String)] = { 134 | val topicsSet = topics.split(",").toSet 135 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokerList) 136 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder]( 137 | ssc, kafkaParams, topicsSet) 138 | 139 | messages.map(r => { 140 | (r._1, r._2) 141 | }) 142 | } 143 | 144 | def loadOriginalKuduData(tableName:String, 145 | kuduContext:KuduContext, 146 | ssc:StreamingContext):RDD[(String, GamerEvent)] = { 147 | val kuduOriginalRdd = kuduContext.kuduRDD(tableName, 148 | "gamer_id,last_time_played,games_played,games_won,oks,deaths,damage_given,damage_taken,max_oks_in_one_game,max_deaths_in_one_game"). 
149 | map(r => { 150 | val row = r._2 151 | 152 | val gamerId = row.getString(0) 153 | val lastTimePlayed = row.getLong(1) 154 | val gamesPlayed = row.getInt(2) 155 | val gamesWon = row.getInt(3) 156 | val oks = row.getInt(4) 157 | val deaths = row.getInt(5) 158 | val damageGiven = row.getInt(6) 159 | val damageTaken = row.getInt(7) 160 | val maxOksInOneGame = row.getInt(8) 161 | val maxDeathsInOneGame = row.getInt(9) 162 | 163 | val initialGamerEvent = new GamerEvent(gamerId,lastTimePlayed, 164 | gamesPlayed, 165 | gamesWon, 166 | oks, 167 | deaths, 168 | damageGiven, 169 | damageTaken, 170 | false, 171 | maxOksInOneGame, 172 | maxDeathsInOneGame) 173 | 174 | (row.getString(0),initialGamerEvent) 175 | }) 176 | 177 | kuduOriginalRdd 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/aggregates/GamerDataGenerator.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.aggregates 2 | 3 | import java.util.{Date, Random} 4 | 5 | import org.kududb.spark.demo.gamer.GamerEvent 6 | 7 | object GamerDataGenerator { 8 | 9 | val random = new Random() 10 | val averagePlayerPercentage = 40 11 | val advancedPlayerPercentage = 80 12 | val superStarPlayerPercentage = 100 13 | var date = System.currentTimeMillis() 14 | 15 | def makeNewGamerRecord(numOfGamers:Int): GamerEvent = { 16 | println("date" + new Date(date)) 17 | date += 60000 * 60 * 6 18 | val playerSelection = random.nextInt(100) 19 | if (playerSelection < averagePlayerPercentage) { 20 | 21 | val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection 22 | 23 | new GamerEvent(gamerId.toString, 24 | date, 25 | 1, 26 | if (random.nextInt(10) > 7) 1 else 0, 27 | random.nextInt(10), 28 | random.nextInt(20), 29 | random.nextInt(1000), 30 | random.nextInt(2000)) 31 | } else if (playerSelection < advancedPlayerPercentage) { 32 | val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection 33 | 34 | new GamerEvent(gamerId.toString, 35 | date, 36 | 1, 37 | if (random.nextInt(10) > 5) 1 else 0, 38 | random.nextInt(20), 39 | random.nextInt(18), 40 | random.nextInt(2000), 41 | random.nextInt(2000)) 42 | } else { 43 | val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection 44 | 45 | new GamerEvent(gamerId.toString, 46 | date, 47 | 1, 48 | if (random.nextInt(10) > 3) 1 else 0, 49 | random.nextInt(20), 50 | random.nextInt(10), 51 | random.nextInt(4000), 52 | random.nextInt(1500)) 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/aggregates/GamerSparkSQLExample.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.aggregates 2 | 3 | import org.apache.log4j.{Level, Logger} 4 | import org.apache.spark.mllib.clustering.KMeans 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | 9 | object GamerSparkSQLExample { 10 | def main(args:Array[String]): Unit = { 11 | if (args.length == 0) { 12 | println("{kudumaster} {runLocal}") 13 | return 14 | } 15 | 16 | Logger.getRootLogger.setLevel(Level.ERROR) 17 | 18 | val kuduMaster = args(0) 19 | val runLocal = args(1).equals("l") 20 | 21 | println("Loading Spark Context") 22 | var sc:SparkContext = null 23 | 24 | if (runLocal) { 25 | val sparkConfig = new 
SparkConf() 26 | sparkConfig.set("spark.broadcast.compress", "false") 27 | sparkConfig.set("spark.shuffle.compress", "false") 28 | sparkConfig.set("spark.shuffle.spill.compress", "false") 29 | sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) 30 | } else { 31 | val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") 32 | sc = new SparkContext(sparkConfig) 33 | } 34 | println("Loading Spark Context: Finished") 35 | 36 | println("Setting up Tables") 37 | val sqlContext = new SQLContext(sc) 38 | sqlContext.load("org.kududb.spark", 39 | Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer") 40 | 41 | println("Query 1: SELECT count(*) FROM gamer") 42 | val startTimeQ1 = System.currentTimeMillis() 43 | sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => { 44 | println(" - (" + r + ")") 45 | }) 46 | println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) 47 | 48 | println("Query 2: SELECT * FROM gamer limit 100") 49 | val startTimeQ2 = System.currentTimeMillis() 50 | sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => { 51 | println(" - (" + r + ")") 52 | }) 53 | println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) 54 | 55 | println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100") 56 | val startTimeQ3 = System.currentTimeMillis() 57 | sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => { 58 | println(" - (" + r + ")") 59 | }) 60 | println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) 61 | 62 | println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer") 63 | val startTimeQ4 = System.currentTimeMillis() 64 | sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => { 65 | println(" - (" + r + ")") 66 | }) 67 | println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4)) 68 | 69 | println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" ) 70 | val startTimeQ5 = System.currentTimeMillis() 71 | val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer") 72 | 73 | val parsedData = resultDf.map(r => { 74 | val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble) 75 | Vectors.dense(array) 76 | }) 77 | 78 | val dataCount = parsedData.count() 79 | 80 | if (dataCount > 0) { 81 | val clusters = KMeans.train(parsedData, 3, 5) 82 | clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) 83 | 84 | } 85 | //TODO add Mllib here 86 | println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) 87 | 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/aggregates/KafkaProducerInjector.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.aggregates 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | 7 | 8 | object KafkaProducerInjector { 9 | 10 | 11 | def main(args:Array[String]): Unit = { 12 | if (args.length == 0) { 13 | println("{brokerList} {topic} {#OfRecords} {sleepTimeEvery10Records} {#OfGamers}") 14 | return 15 | } 16 | 17 | val brokerList = args(0) 18 | val topic = args(1) 19 | val numOfRecords = args(2).toInt 20 | val sleepTimeEvery10Records = 
args(3).toInt 21 | val numOfGamers = args(4).toInt 22 | 23 | val producer = getNewProducer(brokerList) 24 | 25 | for (i <- 0 until numOfRecords) { 26 | 27 | val gamerRecord = GamerDataGenerator.makeNewGamerRecord(numOfGamers) 28 | 29 | val message = new ProducerRecord[String, String](topic, gamerRecord.gamerId.toString, gamerRecord.toString()) 30 | 31 | producer.send(message) 32 | 33 | if (i % 10 == 0) { 34 | Thread.sleep(sleepTimeEvery10Records) 35 | print(".") 36 | } 37 | if (i % 2000 == 0) { 38 | println() 39 | println("Records Sent:" + i) 40 | println() 41 | } 42 | } 43 | } 44 | 45 | def getNewProducer(brokerList:String): KafkaProducer[String, String] = { 46 | val kafkaProps = new Properties 47 | kafkaProps.put("bootstrap.servers", brokerList) 48 | kafkaProps.put("metadata.broker.list", brokerList) 49 | 50 | // This is mandatory, even though we don't send keys 51 | kafkaProps.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer") 52 | kafkaProps.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer") 53 | kafkaProps.put("acks", "0") 54 | 55 | // how many times to retry when produce request fails? 56 | kafkaProps.put("retries", "3") 57 | kafkaProps.put("linger.ms", "2") 58 | kafkaProps.put("batch.size", "1000") 59 | kafkaProps.put("queue.time", "2") 60 | 61 | new KafkaProducer[String, String](kafkaProps) 62 | } 63 | 64 | 65 | } 66 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/aggregates/SparkSQLCmd.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.aggregates 2 | 3 | import org.apache.spark.sql.SQLContext 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | 6 | object SparkSQLCmd { 7 | def main(args:Array[String]): Unit = { 8 | if (args.length == 0) { 9 | println("{kuduMaster} {L for Local}") 10 | } 11 | 12 | val kuduMaster = args(0) 13 | val runLocal = args(1).equals("L") 14 | 15 | var sc:SparkContext = null 16 | 17 | if (runLocal) { 18 | val sparkConfig = new SparkConf() 19 | sparkConfig.set("spark.broadcast.compress", "false") 20 | sparkConfig.set("spark.shuffle.compress", "false") 21 | sparkConfig.set("spark.shuffle.spill.compress", "false") 22 | sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) 23 | } else { 24 | val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") 25 | sc = new SparkContext(sparkConfig) 26 | } 27 | 28 | val sqlContext = new SQLContext(sc) 29 | println("Loading 'gamer' table") 30 | sqlContext.load("org.kududb.spark", 31 | Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer") 32 | println("Successfully loaded 'gamer' table") 33 | 34 | val doContinue = true 35 | 36 | while (doContinue) { 37 | val input = readLine("SparkSQL> ") 38 | 39 | try { 40 | 41 | 42 | val startTime = System.currentTimeMillis() 43 | val startTimeQ1 = System.currentTimeMillis() 44 | sqlContext.sql(input).take(1000).foreach(r => { 45 | println(" > " + r) 46 | }) 47 | println(" Finished in " + (System.currentTimeMillis() - startTime)) 48 | } catch { 49 | case e: Throwable => { 50 | println(" > Query '" + input + "' failed.") 51 | e.printStackTrace() 52 | } 53 | } 54 | } 55 | 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/cdc/ApplyNewRecordRunnable.scala: 
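ApplyNewRecordRunnable below is the heart of the cdc demos: the table key is (gamer_id, eff_to), the current version of a gamer keeps an empty eff_to, and each change first rewrites the old version with eff_to stamped to the change date before applying the new version. Here is a hedged sketch of reading only the current versions with the same SQLContext loading pattern used by the SQL demos; the SparkContext sc, the kuduMaster value, and the registered table name "gamer_cdc" are assumptions, not something these sources define.

import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)
sqlContext.load("org.kududb.spark",
  Map("kudu.table" -> "gamer_cdc", "kudu.master" -> kuduMaster)).registerTempTable("gamer_cdc")

// Only the open-ended (current) version of each gamer has an empty eff_to.
sqlContext.sql("SELECT gamer_id, games_played, oks, deaths FROM gamer_cdc WHERE eff_to = ''")
  .take(100)
  .foreach(r => println(" - (" + r + ")"))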
-------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.cdc 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.concurrent.atomic.AtomicInteger 5 | 6 | import org.kududb.client.{Operation, PartialRow, KuduClient} 7 | import org.kududb.spark.demo.gamer.GamerEvent 8 | 9 | class ApplyNewRecordRunnable(val gameEvent: GamerEvent, 10 | val kuduClient: KuduClient, 11 | val tableName: String, 12 | val leftToRun:AtomicInteger) extends Runnable{ 13 | override def run(): Unit = { 14 | val table = kuduClient.openTable(tableName) 15 | val session = kuduClient.newSession() 16 | val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy") 17 | 18 | val record = gameEvent 19 | 20 | val pr = new PartialRow(table.getSchema) 21 | pr.addString(0, record.gamerId) 22 | pr.addString(1, "") 23 | val scannerRows = kuduClient.newScannerBuilder(table).lowerBound(pr).limit(1).build().nextRows() 24 | val op:Operation = if (scannerRows.hasNext) { 25 | println(" >> had next") 26 | val oldRow = scannerRows.next() 27 | 28 | val oldRecordUpdateOp = table.newInsert() 29 | 30 | val row = oldRecordUpdateOp.getRow 31 | row.addString("gamer_id", oldRow.getString("gamer_id")) 32 | row.addString("eff_to", simpleDateFormat.format(record.lastTimePlayed)) 33 | row.addString("eff_from", oldRow.getString("eff_from")) 34 | row.addLong("last_time_played", oldRow.getLong("last_time_played")) 35 | row.addInt("games_played", oldRow.getInt("games_played")) 36 | row.addInt("games_won", oldRow.getInt("games_won")) 37 | row.addInt("oks", oldRow.getInt("oks")) 38 | row.addInt("deaths", oldRow.getInt("deaths")) 39 | row.addInt("damage_given", oldRow.getInt("damage_given")) 40 | row.addInt("damage_taken", oldRow.getInt("damage_taken")) 41 | row.addInt("max_oks_in_one_game", oldRow.getInt("max_oks_in_one_game")) 42 | row.addInt("max_deaths_in_one_game", oldRow.getInt("max_deaths_in_one_game")) 43 | 44 | session.apply(oldRecordUpdateOp) 45 | table.newUpdate() 46 | } else { 47 | table.newInsert() 48 | } 49 | 50 | val row = op.getRow 51 | row.addString("gamer_id", record.gamerId) 52 | row.addString("eff_to", "") 53 | row.addString("eff_from", simpleDateFormat.format(record.lastTimePlayed)) 54 | row.addLong("last_time_played", record.lastTimePlayed) 55 | row.addInt("games_played", record.gamesPlayed) 56 | row.addInt("games_won", record.gamesWon) 57 | row.addInt("oks", record.oks) 58 | row.addInt("deaths", record.deaths) 59 | row.addInt("damage_given", record.damageGiven) 60 | row.addInt("damage_taken", record.damageTaken) 61 | row.addInt("max_oks_in_one_game", record.maxOksInOneGame) 62 | row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame) 63 | 64 | session.apply(op) 65 | 66 | session.flush() 67 | leftToRun.decrementAndGet() 68 | println(" >> finished Submit") 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/cdc/CreateKuduTable.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.cdc 2 | 3 | import java.util 4 | import java.util.ArrayList 5 | 6 | import org.kududb.{Schema, Type, ColumnSchema} 7 | import org.kududb.ColumnSchema.ColumnSchemaBuilder 8 | import org.kududb.client.{CreateTableOptions, KuduClient} 9 | 10 | object CreateGamerCDCKuduTable { 11 | def main(args:Array[String]): Unit = { 12 | if (args.length == 0) { 13 | println("{kuduMaster} {tableName}") 14 | return 15 | } 16 | 17 | val 
kuduMaster = args(0) 18 | val tableName = args(1) 19 | val numberOfBuckets = args(2).toInt 20 | 21 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 22 | val columnList = new util.ArrayList[ColumnSchema]() 23 | 24 | columnList.add(new ColumnSchemaBuilder("gamer_id", Type.STRING).key(true).build()) 25 | columnList.add(new ColumnSchemaBuilder("eff_to", Type.STRING).key(true).build()) 26 | columnList.add(new ColumnSchemaBuilder("eff_from", Type.STRING).key(false).build()) 27 | columnList.add(new ColumnSchemaBuilder("last_time_played", Type.INT64).key(false).build()) 28 | columnList.add(new ColumnSchemaBuilder("games_played", Type.INT32).key(false).build()) 29 | columnList.add(new ColumnSchemaBuilder("games_won", Type.INT32).key(false).build()) 30 | columnList.add(new ColumnSchemaBuilder("oks", Type.INT32).key(false).build()) 31 | columnList.add(new ColumnSchemaBuilder("deaths", Type.INT32).key(false).build()) 32 | columnList.add(new ColumnSchemaBuilder("damage_given", Type.INT32).key(false).build()) 33 | columnList.add(new ColumnSchemaBuilder("damage_taken", Type.INT32).key(false).build()) 34 | columnList.add(new ColumnSchemaBuilder("max_oks_in_one_game", Type.INT32).key(false).build()) 35 | columnList.add(new ColumnSchemaBuilder("max_deaths_in_one_game", Type.INT32).key(false).build()) 36 | val schema = new Schema(columnList) 37 | 38 | if (kuduClient.tableExists(tableName)) { 39 | println("Deleting Table") 40 | kuduClient.deleteTable(tableName) 41 | } 42 | 43 | val builder = new CreateTableOptions() 44 | 45 | val hashColumnList = new ArrayList[String] 46 | hashColumnList.add("gamer_id") 47 | 48 | builder.addHashPartitions(hashColumnList, numberOfBuckets) 49 | 50 | println("Creating Table") 51 | kuduClient.createTable(tableName, schema, builder) 52 | println("Created Table") 53 | kuduClient.shutdown() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/cdc/DirectDataInjector.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.cdc 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.Random 5 | 6 | import org.kududb.client.{PartialRow, Operation, KuduClient} 7 | import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator 8 | 9 | 10 | object DirectDataInjector { 11 | val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy") 12 | val random = new Random 13 | def main(args:Array[String]): Unit = { 14 | 15 | if (args.length == 0) { 16 | println("{kuduMaster} {tableName} {numberOfRecords}") 17 | return 18 | } 19 | 20 | val kuduMaster = args(0) 21 | val tableName = args(1) 22 | val numberOfRecords = args(2).toInt 23 | 24 | 25 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 26 | val table = kuduClient.openTable(tableName) 27 | val session = kuduClient.newSession() 28 | 29 | for (i <- 0 to numberOfRecords) { 30 | val record = GamerDataGenerator.makeNewGamerRecord(100000) 31 | 32 | 33 | 34 | 35 | val pr = new PartialRow(table.getSchema) 36 | pr.addString(0, record.gamerId) 37 | pr.addString(1, "") 38 | val scannerRows = kuduClient.newScannerBuilder(table).lowerBound(pr).limit(1).build().nextRows() 39 | val op:Operation = if (scannerRows.hasNext) { 40 | val oldRow = scannerRows.next() 41 | 42 | val oldRecordUpdateOp = table.newInsert() 43 | 44 | val row = oldRecordUpdateOp.getRow 45 | row.addString("gamer_id", oldRow.getString("gamer_id")) 46 | row.addString("eff_to",
simpleDateFormat.format(System.currentTimeMillis())) 47 | row.addString("eff_from", oldRow.getString("eff_from")) 48 | row.addLong("last_time_played", oldRow.getLong("last_time_played")) 49 | row.addInt("games_played", oldRow.getInt("games_played")) 50 | row.addInt("games_won", oldRow.getInt("games_won")) 51 | row.addInt("oks", oldRow.getInt("oks")) 52 | row.addInt("deaths", oldRow.getInt("deaths")) 53 | row.addInt("damage_given", oldRow.getInt("damage_given")) 54 | row.addInt("damage_taken", oldRow.getInt("damage_taken")) 55 | row.addInt("max_oks_in_one_game", oldRow.getInt("max_oks_in_one_game")) 56 | row.addInt("max_deaths_in_one_game", oldRow.getInt("max_deaths_in_one_game")) 57 | 58 | session.apply(oldRecordUpdateOp) 59 | table.newUpdate() 60 | } else { 61 | table.newInsert() 62 | } 63 | 64 | val row = op.getRow 65 | row.addString("gamer_id", record.gamerId) 66 | row.addString("eff_to", "") 67 | row.addString("eff_from", simpleDateFormat.format(System.currentTimeMillis())) 68 | row.addLong("last_time_played", record.lastTimePlayed) 69 | row.addInt("games_played", record.gamesPlayed) 70 | row.addInt("games_won", record.gamesWon) 71 | row.addInt("oks", record.oks) 72 | row.addInt("deaths", record.deaths) 73 | row.addInt("damage_given", record.damageGiven) 74 | row.addInt("damage_taken", record.damageTaken) 75 | row.addInt("max_oks_in_one_game", record.maxOksInOneGame) 76 | row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame) 77 | 78 | session.apply(op) 79 | } 80 | session.flush() 81 | 82 | kuduClient.close() 83 | 84 | 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/scala/org/kududb/spark/demo/gamer/cdc/DirectDataMultiThreadedInjector.scala: -------------------------------------------------------------------------------- 1 | package org.kududb.spark.demo.gamer.cdc 2 | 3 | import java.text.SimpleDateFormat 4 | import java.util.Random 5 | import java.util.concurrent.atomic.AtomicInteger 6 | import java.util.concurrent.{TimeUnit, Executors} 7 | 8 | import org.kududb.client.{Operation, PartialRow, KuduClient} 9 | import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator 10 | 11 | object DirectDataMultiThreadedInjector { 12 | val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy") 13 | val random = new Random 14 | def main(args:Array[String]): Unit = { 15 | 16 | if (args.length == 0) { 17 | println("{kuduMaster} {tableName} {numberOfRecords} {numberOfThreads} {numberOfGamers} {sleepTime}") 18 | return 19 | } 20 | 21 | val kuduMaster = args(0) 22 | val tableName = args(1) 23 | val numberOfRecords = args(2).toInt 24 | val executor = Executors.newFixedThreadPool(args(3).toInt) 25 | val numberOfGamers = args(4).toInt 26 | val sleepTime = args(5).toInt 27 | 28 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build() 29 | val leftToRun = new AtomicInteger() 30 | 31 | for (i <- 0 to numberOfRecords) { 32 | leftToRun.incrementAndGet() 33 | executor.execute(new ApplyNewRecordRunnable(GamerDataGenerator.makeNewGamerRecord(numberOfGamers), 34 | kuduClient, tableName, leftToRun)) 35 | println("Submitted:" + i) 36 | 37 | Thread.sleep(sleepTime) 38 | } 39 | 40 | executor.shutdown() // without shutdown(), awaitTermination below would never return true 41 | val startTime = System.currentTimeMillis() 42 | while (!executor.awaitTermination(10000, TimeUnit.SECONDS)) { 43 | val newTime = System.currentTimeMillis() 44 | println("> Still Waiting: {Time:" + (newTime - startTime) + ", LeftToRun:" + leftToRun + "}" ) 45 | } 46 | 47 | 48 | kuduClient.close() 49 | 50 | 51 | } 52 | } 53 | --------------------------------------------------------------------------------
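After running either cdc injector, a quick sanity check is a plain client scan with a row count, reusing only the scanner calls already shown in ScanTable; the master address and the "gamer_cdc" table name are placeholders for whatever was passed to CreateGamerCDCKuduTable.

import org.kududb.client.KuduClient

object CountCdcRows {
  def main(args: Array[String]): Unit = {
    val kuduClient = new KuduClient.KuduClientBuilder("quickstart.cloudera").build() // placeholder master
    val table = kuduClient.openTable("gamer_cdc")                                    // placeholder table name
    val scanner = kuduClient.newScannerBuilder(table).build()
    var count = 0
    while (scanner.hasMoreRows) {
      val rows = scanner.nextRows()
      while (rows.hasNext) {
        rows.next()
        count += 1
      }
    }
    scanner.close()
    println("rows scanned: " + count)
    kuduClient.shutdown()
  }
}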