├── .idea
│   ├── libraries
│   │   ├── Maven__com_amazonaws_aws_java_sdk_core_1_10_6.xml
│   │   ├── Maven__com_amazonaws_aws_java_sdk_kms_1_10_6.xml
│   │   ├── Maven__com_amazonaws_aws_java_sdk_s3_1_10_6.xml
│   │   ├── Maven__com_sun_jersey_jersey_client_1_9.xml
│   │   ├── Maven__javax_activation_activation_1_1.xml
│   │   ├── Maven__javax_xml_bind_jaxb_api_2_2_2.xml
│   │   ├── Maven__javax_xml_stream_stax_api_1_0_2.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_annotations_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_aws_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_client_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_common_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_hdfs_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_app_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_common_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_core_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_jobclient_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_shuffle_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_api_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_client_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_common_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_server_common_2_6_0_cdh5_7_0.xml
│   │   ├── Maven__org_apache_htrace_htrace_core4_4_0_1_incubating.xml
│   │   ├── Maven__xerces_xercesImpl_2_9_1.xml
│   │   └── Maven__xml_apis_xml_apis_1_3_04.xml
│   └── uiDesigner.xml
├── GamerSetup.md
├── LICENSE
├── README.md
├── kudu-spark.iml
├── notes.txt
├── pom.xml
└── src
    └── main
        └── scala
            └── org
                └── kududb
                    └── spark
                        ├── DefaultSource.scala
                        ├── KuduContext.scala
                        ├── KuduDStreamFunctions.scala
                        ├── KuduRDDFunctions.scala
                        └── demo
                            ├── basic
                            │   ├── AddSingleRecord.scala
                            │   ├── BasicExample.scala
                            │   ├── BasicSparkSQLExamples.scala
                            │   ├── InitialDataPopulation.scala
                            │   ├── ModifySingleRecord.scala
                            │   ├── NameGenerator.scala
                            │   └── ScanTable.scala
                            └── gamer
                                ├── GamerEvent.scala
                                ├── aggregates
                                │   ├── CreateKuduTable.scala
                                │   ├── DirectDataInjector.scala
                                │   ├── GamerAggergatesSparkStreaming.scala
                                │   ├── GamerDataGenerator.scala
                                │   ├── GamerSparkSQLExample.scala
                                │   ├── KafkaProducerInjector.scala
                                │   └── SparkSQLCmd.scala
                                └── cdc
                                    ├── ApplyNewRecordRunnable.scala
                                    ├── CreateKuduTable.scala
                                    ├── DirectDataInjector.scala
                                    └── DirectDataMultiThreadedInjector.scala
/GamerSetup.md:
--------------------------------------------------------------------------------
1 |
2 | ssh root@mriggs-strata-1.vpc.cloudera.com
3 |
4 | scp -i "tedm2.pem" KuduSpark.jar ec2_user@ec2-52-36-220-83.us-west-2.compute.amazonaws.com:./
5 |
6 | ##Setting up Kafka
7 | kafka-topics --zookeeper mriggs-strata-1.vpc.cloudera.com:2181 --create --topic gamer --partitions 1 --replication-factor 1
8 | kafka-topics --zookeeper mriggs-strata-1.vpc.cloudera.com:2181 --list
9 | kafka-console-producer --broker-list mriggs-strata-1.vpc.cloudera.com:9092 --topic test
10 | kafka-console-consumer --zookeeper mriggs-strata-1.vpc.cloudera.com:2181 --topic gamer --from-beginning
11 |
12 | vi .bash_profile
13 | export PATH=/usr/java/jdk1.7.0_67-cloudera/bin/:$PATH
14 | export JAVA_HOME=/usr/java/jdk1.7.0_67-cloudera/
15 |
16 | ##Populating Kafka
17 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.KafkaProducerGenerator mriggs-strata-1.vpc.cloudera.com:9092 gamer 10000 300 1000
18 |
19 | ##Create Table
20 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.CreateKuduTable ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer 3
21 |
22 | ##Run Spark Streaming
23 | spark-submit \
24 | --master yarn --deploy-mode client \
25 | --executor-memory 2G \
26 | --num-executors 2 \
27 | --jars kudu-mapreduce-0.1.0-20150903.033037-21-jar-with-dependencies.jar \
28 | --class org.kududb.spark.demo.gamer.GamerAggergatesSparkStreaming KuduSpark.jar \
29 | mriggs-strata-1.vpc.cloudera.com:9092 gamer mriggs-strata-1.vpc.cloudera.com gamer C
30 |
31 | ##Run SparkSQL
32 | spark-submit \
33 | --master yarn --deploy-mode client \
34 | --executor-memory 2G \
35 | --num-executors 2 \
36 | --class org.kududb.spark.demo.gamer.GamerSparkSQLExample \
37 | KuduSpark.jar ec2-52-36-220-83.us-west-2.compute.amazonaws.com l
38 |
39 | ##Run direct insert
40 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.DirectDataGenerator ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer 3
41 |
42 | ##Impala
43 | impala-shell
44 | connect ec2-52-11-171-85.us-west-2.compute.amazonaws.com:21007;
45 |
46 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.cdc.CreateGamerCDCKuduTable ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer_cdc 3
47 |
48 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.cdc.DirectDataMultiThreadedInjector ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer_cdc 10 5 1000
49 |
50 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.cdc.DirectDataMultiThreadedInjector ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer_cdc 100 5 5
51 |
52 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.DropTable ec2-52-36-220-83.us-west-2.compute.amazonaws.com gamer_cdc
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SparkOnKudu
2 | ## Overview
3 | This is a simple, reusable library for working with Kudu from Spark.
4 |
5 |
6 | ##Functionality
7 | The library currently supports the following:
8 |
9 | * RDD foreachPartition with iterator and kudu client
10 | * RDD mapPartition with iterator and kudu client
11 | * DStream foreachPartition with iterator and kudu client
12 | * DStream mapPartition with iterator and kudu client
13 | * Spark SQL integration with Kudu (basic; no filter push down yet); see the usage sketch below
14 |
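Below is a minimal, untested usage sketch based on the classes in this repo (`KuduContext`, `KuduRDDFunctions`, and the Spark SQL `DefaultSource`). The master address, table name, and sample data are placeholders, and the implicit method keeps its original `hbase*` prefix from the code.

```scala
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.kududb.spark.KuduContext
import org.kududb.spark.KuduRDDFunctions._

val sc = new SparkContext(new SparkConf().setAppName("KuduSparkSketch"))
val kuduContext = new KuduContext(sc, "kudu-master-host")

// RDD foreachPartition with a Kudu client available per partition
sc.parallelize(Seq("rowKey1", "rowKey2")).hbaseForeachPartition(kuduContext,
  (it, syncClient, asyncClient) => {
    val table = syncClient.openTable("gamer")
    val session = syncClient.newSession()
    it.foreach(key => {
      val insert = table.newInsert()
      insert.getRow.addString(0, key)
      session.apply(insert)
    })
    session.flush()
  })

// Spark SQL integration through DefaultSource ("kudu.table" / "kudu.master" options)
val sqlContext = new SQLContext(sc)
val df = sqlContext.read
  .format("org.kududb.spark")
  .option("kudu.table", "gamer")
  .option("kudu.master", "kudu-master-host")
  .load()
df.registerTempTable("gamer")
sqlContext.sql("SELECT * FROM gamer LIMIT 10").show()
```
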
15 | ##Examples
16 | * Basic example
17 |   * Creating Kudu tables
18 |   * Connecting with SparkSQL
19 |   * Converting values from Kudu to SparkSQL to MLlib
20 | * Gamer example
21 |   * Creating Kudu Gamer table
22 |   * Generating Gamer data and pushing it to Kafka
23 |   * Reading Gamer data from Kafka with Spark Streaming
24 |   * Aggregating Gamer data in Spark Streaming then pushing mutations to Kudu
25 |   * Running Impala SQL on Kudu Gamer table
26 |   * Running SparkSQL on Kudu Gamer table
27 |   * Converting SparkSQL results to Vectors so we can do KMeans
28 |
29 | ##Near Future
30 | * Key SQL predicate push down
31 | * Need to update POM file with public repo
32 | * Need to work with Kudu project to integrate into Kudu
33 |
34 | ##Build
35 | mvn clean package
36 |
37 | ##Setup for Gamer Example
38 |
39 | ###Setting up Kafka
40 | kafka-topics --zookeeper ZooKeeperNode:2181 --create --topic gamer --partitions 1 --replication-factor 1
41 | kafka-topics --zookeeper ZooKeeperNode:2181 --list
42 |
43 | ###Basic Testing with Kafka
44 | kafka-console-producer --broker-list BrokerNode:9092 --topic test
45 | kafka-console-consumer --zookeeper ZooKeeperNode:2181 --topic gamer --from-beginning
46 |
47 |
48 | ###Populating Kafka
49 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.KafkaProducerGenerator mriggs-strata-1.vpc.cloudera.com:9092 gamer 10000 300 1000
50 |
51 | ###Create Table
52 | java -cp KuduSpark.jar org.kududb.spark.demo.gamer.CreateKuduTable mriggs-strata-1.vpc.cloudera.com gamer
53 |
54 | ###Run Spark Streaming
55 | spark-submit \
56 | --master yarn --deploy-mode client \
57 | --executor-memory 2G \
58 | --num-executors 2 \
59 | --jars kudu-mapreduce-0.1.0-20150903.033037-21-jar-with-dependencies.jar \
60 | --class org.kududb.spark.demo.gamer.GamerAggergatesSparkStreaming KuduSpark.jar \
61 | BrokerNode:9092 gamer KuduMaster gamer C
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/notes.txt:
--------------------------------------------------------------------------------
1 | 1. Add impala-kudu parcel
2 | 2. Search for the *Impala Service Environment Advanced Configuration Snippet (Safety
3 |    Valve)* configuration item. Add the following to the text field and save your changes:
4 |    `IMPALA_NEXT=1`
5 | 3. Talk to Martin Grund
6 |
7 | scp target/KuduSpark.jar root@mriggs-strata-1.vpc.cloudera.com:./
8 |
9 | scp kudu-mapreduce-0.1.0-20150903.033037-21-jar-with-dependencies.jar root@mriggs-strata-1.vpc.cloudera.com:./
10 |
11 | spark-shell --jars kudu-mapreduce-0.1.0-20150903.033037-21-jar-with-dependencies.jar,KuduSpark.jar --class org.kududb.spark.demo.SimpleGroupByExample --executor-cores 2 --num-executors 3 --executor-memory 2g --master yarn --deploy-mode client mriggs-strata-1.vpc.cloudera.com foo y
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
23 |
24 |
28 | 4.0.0
29 |
30 | org.apache
31 | apache
32 | 14
33 |
34 | com.cloudera.kudu
35 | kudu-spark
36 | Kudu - Spark
37 | jar
38 |
39 |
40 | 1.5.0-cdh5.5.0-SNAPSHOT
41 | 2.10.4
42 | 0.8.2.0-kafka-1.4.0-SNAPSHOT
43 | 2.10
44 | 0.7.1
45 | 2.6.0-cdh5.7.0
46 | ${project.basedir}/..
47 |
48 |
49 |
50 |
51 |
52 |
53 | org.apache.maven.plugins
54 | maven-compiler-plugin
55 | 3.3
56 |
57 | 1.8
58 | 1.8
59 |
60 |
61 |
62 |
63 | net.alchim31.maven
64 | scala-maven-plugin
65 | 3.2.0
66 |
67 | UTF-8
68 | ${scala.version}
69 |
70 |
71 |
72 | scala-compile-first
73 | process-resources
74 |
75 | add-source
76 | compile
77 |
78 |
79 |
80 | scala-test-compile
81 | process-test-resources
82 |
83 | testCompile
84 |
85 |
86 |
87 |
88 |
89 |
90 | org.scalatest
91 | scalatest-maven-plugin
92 | 1.0
93 |
94 | ${project.build.directory}/surefire-reports
95 | .
96 | WDF TestSuite.txt
97 | false
98 |
99 |
100 |
101 | test
102 | test
103 |
104 | test
105 |
106 |
107 | true
108 |
109 |
110 |
111 | integration-test
112 | integration-test
113 |
114 | test
115 |
116 |
117 | Integration-Test
118 |
119 | -Xmx1536m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m
120 |
121 | false
122 |
123 |
124 |
125 |
126 |
127 | org.apache.maven.plugins
128 | maven-shade-plugin
129 | 2.2
130 |
131 | false
132 | target/KuduSpark.jar
133 |
134 |
135 | *:*
136 |
137 |
138 |
139 |
140 | *:*
141 |
142 | META-INF/*.SF
143 | META-INF/*.DSA
144 | META-INF/*.RSA
145 |
146 |
147 |
148 |
149 |
150 |
151 | package
152 |
153 | shade
154 |
155 |
156 |
157 |
159 |
161 | reference.conf
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 | cloudera-repo
175 | Cloudera Repository
176 | https://repository.cloudera.com/artifactory/cloudera-repos
177 |
178 |
179 |
180 |
207 |
208 |
209 |
210 |
211 | javax.servlet
212 | javax.servlet-api
213 | 3.0.1
214 | test
215 |
216 |
217 |
218 |
219 | org.scala-lang
220 | scala-library
221 | ${scala.version}
222 |
223 |
224 | org.apache.spark
225 | spark-core_${scala.binary.version}
226 | ${spark.version}
227 |
228 |
229 |
230 |
231 | org.scala-lang
232 | scala-library
233 |
234 |
235 |
236 | org.scala-lang
237 | scalap
238 |
239 |
240 |
241 |
242 | org.apache.spark
243 | spark-sql_${scala.binary.version}
244 | ${spark.version}
245 |
246 |
247 |
248 | org.apache.spark
249 | spark-mllib_${scala.binary.version}
250 | ${spark.version}
251 |
252 |
253 |
254 | org.apache.spark
255 | spark-streaming_${scala.binary.version}
256 | ${spark.version}
257 |
258 |
259 | org.apache.spark
260 | spark-streaming-kafka_${scala.binary.version}
261 | ${spark.version}
262 |
263 |
264 | org.apache.spark
265 | spark-streaming_${scala.binary.version}
266 | ${spark.version}
267 | test-jar
268 | tests
269 | test
270 |
271 |
272 | org.apache.kafka
273 | kafka-clients
274 | ${kafka.version}
275 |
276 |
277 | junit
278 | junit
279 | 4.12
280 | test
281 |
282 |
283 |
284 | org.scalatest
285 | scalatest_${scala.binary.version}
286 | 2.2.1
287 | test
288 |
289 |
290 |
291 | org.kududb
292 | kudu-client
293 | ${kudu.version}
294 |
295 |
296 | org.kududb
297 | kudu-mapreduce
298 | ${kudu.version}
299 |
300 |
301 | org.apache.hadoop
302 | hadoop-client
303 | ${hadoop.version}
304 |
305 |
306 | log4j
307 | log4j
308 |
309 |
310 | javax.servlet
311 | servlet-api
312 |
313 |
314 | javax.servlet.jsp
315 | jsp-api
316 |
317 |
318 | org.jruby
319 | jruby-complete
320 |
321 |
322 | org.jboss.netty
323 | netty
324 |
325 |
326 | io.netty
327 | netty
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/DefaultSource.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark
2 |
3 | import org.apache.spark.Logging
4 | import org.apache.spark.rdd.RDD
5 | import org.apache.spark.sql.{Row, SQLContext}
6 | import org.apache.spark.sql.sources._
7 | import org.apache.spark.sql.types._
8 | import org.kududb.client.{RowResult}
9 | import org.kududb.{Schema, ColumnSchema, Type}
10 |
11 | import scala.collection.mutable
12 |
13 | /**
14 | * DefaultSource for integration with Spark's dataframe datasources.
15 | * This class will produce a RelationProvider based on input given to it from Spark.
16 | *
17 | * In all, this DefaultSource supports the following datasource functionality:
18 | * - Scan range pruning through filter push down logic based on rowKeys
19 | * - Filter push down logic on columns that are not rowKey columns
20 | * - Qualifier filtering based on columns used in the SparkSQL statement
21 | * - Type conversions of basic SQL types
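 * 
 * Example (sketch only; the table name and master address are placeholders):
 *   sqlContext.read.format("org.kududb.spark")
 *     .option("kudu.table", "gamer")
 *     .option("kudu.master", "quickstart.cloudera")
 *     .load()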
22 | */
23 | class DefaultSource extends RelationProvider {
24 |
25 | val TABLE_KEY:String = "kudu.table"
26 | val KUDU_MASTER:String = "kudu.master"
27 |
28 | /**
29 | * Is given input from SparkSQL to construct a BaseRelation
30 | * @param sqlContext SparkSQL context
31 | * @param parameters Parameters given to us from SparkSQL
32 | * @return A BaseRelation Object
33 | */
34 | override def createRelation(sqlContext: SQLContext,
35 | parameters: Map[String, String]):
36 | BaseRelation = {
37 |
38 |
39 | val tableName = parameters.get(TABLE_KEY)
40 | if (tableName.isEmpty)
41 | throw new Throwable("Invalid value for " + TABLE_KEY + " '" + tableName + "'")
42 |
43 | val kuduMaster = parameters.getOrElse(KUDU_MASTER, "")
44 |
45 | new KuduRelation(tableName.get, kuduMaster)(sqlContext)
46 | }
47 | }
48 |
49 | /**
50 | * Implementation of Spark BaseRelation that will build up our scan logic,
51 | * do the scan pruning, filter push down, and value conversions
52 | *
53 | * @param tableName Kudu table that we plan to read from
54 | * @param kuduMaster Kudu master definition
55 | * @param sqlContext SparkSQL context
56 | */
57 | class KuduRelation (val tableName:String,
58 | val kuduMaster: String) (
59 | @transient val sqlContext:SQLContext)
60 | extends BaseRelation with PrunedFilteredScan with Logging with Serializable {
61 |
62 | //create or get the latest KuduContext
63 | @transient var kuduContext = new KuduContext(sqlContext.sparkContext, kuduMaster)
64 | @transient var kuduClient = KuduClientCache.getKuduClient(kuduMaster)
65 | @transient var kuduTable = kuduClient.openTable(tableName)
66 | @transient var kuduSchema = kuduTable.getSchema
67 | @transient var kuduSchemaColumnMap = buildKuduSchemaColumnMap(kuduSchema)
68 |
69 | def getKuduSchemaColumnMap(): mutable.HashMap[String, ColumnSchema] = {
70 | if (kuduSchemaColumnMap == null) {
71 | kuduClient = KuduClientCache.getKuduClient(kuduMaster)
72 | kuduTable = kuduClient.openTable(tableName)
73 | kuduSchema = kuduTable.getSchema
74 | kuduSchemaColumnMap = buildKuduSchemaColumnMap(kuduSchema)
75 | }
76 | kuduSchemaColumnMap
77 | }
78 |
79 | def buildKuduSchemaColumnMap(kuduSchema:Schema): mutable.HashMap[String, ColumnSchema] = {
80 |
81 | var kuduSchemaColumnMap = new mutable.HashMap[String, ColumnSchema]()
82 |
83 | val columnIt = kuduSchema.getColumns.iterator()
84 | while (columnIt.hasNext) {
85 | val c = columnIt.next()
86 | kuduSchemaColumnMap.+=((c.getName, c))
87 | }
88 | kuduSchemaColumnMap
89 | }
90 |
91 | /**
92 | * Generates a Spark SQL schema object so Spark SQL knows what is being
93 | * provided by this BaseRelation
94 | *
95 | * @return schema generated from the Kudu table schema
96 | */
97 | override def schema: StructType = {
98 |
99 | val metadataBuilder = new MetadataBuilder()
100 |
101 | val structFieldArray = new Array[StructField](kuduSchema.getColumnCount)
102 |
103 | val columnIt = kuduSchema.getColumns.iterator()
104 | var indexCounter = 0
105 | while (columnIt.hasNext) {
106 | val c = columnIt.next()
107 |
108 | val columnSparkSqlType = if (c.getType.equals(Type.BOOL)) BooleanType
109 | else if (c.getType.equals(Type.INT16)) IntegerType
110 | else if (c.getType.equals(Type.INT32)) IntegerType
111 | else if (c.getType.equals(Type.INT64)) LongType
112 | else if (c.getType.equals(Type.FLOAT)) FloatType
113 | else if (c.getType.equals(Type.DOUBLE)) DoubleType
114 | else if (c.getType.equals(Type.STRING)) StringType
115 | else if (c.getType.equals(Type.TIMESTAMP)) TimestampType
116 | else if (c.getType.equals(Type.BINARY)) BinaryType
117 | else throw new Throwable("Unsupported column type :" + c.getType)
118 |
119 | val metadata = metadataBuilder.putString("name", c.getName).build()
120 | val structField =
121 | new StructField(c.getName, columnSparkSqlType, nullable = true, metadata)
122 |
123 | structFieldArray(indexCounter) = structField
124 | indexCounter += 1
125 | }
126 |
127 | val result = new StructType(structFieldArray)
128 | result
129 | }
130 |
131 | /**
132 | * Here we are building the functionality to populate the resulting RDD[Row]
133 | * Here is where we will do the following:
134 | * - Filter push down
135 | * - Scan or GetList pruning
136 | * - Executing our scan(s) and/or GetList to generate result
137 | *
138 | * @param requiredColumns The columns that are being requested by the requesting query
139 | * @param filters The filters that are being applied by the requesting query
140 | * @return RDD with all the results from Kudu needed for SparkSQL to
141 | * execute the query on
142 | */
143 | override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
144 |
145 | //retain the information for unit testing checks
146 | var resultRDD: RDD[Row] = null
147 |
148 | if (resultRDD == null) {
149 |
150 | val strBuilder = new StringBuilder()
151 | var isFirst = true
152 | requiredColumns.foreach( c => {
153 | if (isFirst) isFirst = false
154 | else strBuilder.append(",")
155 | strBuilder.append(c)
156 | })
157 |
158 | val rdd = kuduContext.kuduRDD(tableName, strBuilder.toString()).map(r => {
159 |
160 | val rowResults = r._2
161 | Row.fromSeq(requiredColumns.map(c =>
162 | getKuduValue(c, rowResults)))
163 | })
164 |
165 | resultRDD=rdd
166 | }
167 | resultRDD
168 | }
169 |
170 | def getKuduValue(columnName:String, row:RowResult): Any = {
171 |
172 | val columnSchema = getKuduSchemaColumnMap.getOrElse(columnName, null)
173 |
174 | val columnType = row.getColumnType(columnName)
175 |
176 | if (columnType == Type.BINARY) row.getBinary(columnName)
177 | else if (columnType == Type.BOOL) row.getBoolean(columnName)
178 | else if (columnType == Type.DOUBLE) row.getDouble(columnName)
179 | else if (columnType == Type.FLOAT) row.getFloat(columnName)
180 | else if (columnType == Type.INT16) row.getShort(columnName)
181 | else if (columnType == Type.INT32) row.getInt(columnName)
182 | else if (columnType == Type.INT64) row.getLong(columnName)
183 | else if (columnType == Type.INT8) row.getByte(columnName)
184 | else if (columnType == Type.TIMESTAMP) row.getLong(columnName)
185 | else if (columnType == Type.STRING) row.getString(columnName)
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/KuduContext.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.kududb.spark
19 |
20 | import org.apache.hadoop.conf.Configuration
21 | import org.apache.hadoop.io.NullWritable
22 | import org.apache.spark.rdd.RDD
23 | import org.kududb.client._
24 | import org.kududb.mapreduce.KuduTableInputFormat
25 | import scala.collection.mutable
26 | import scala.reflect.ClassTag
27 | import org.apache.spark.{Logging, SparkContext}
28 | import org.apache.spark.streaming.dstream.DStream
29 | import java.io._
30 |
31 | /**
32 | * KuduContext is a façade for Kudu operations
33 | * such as bulk inserts, updates, deletes, and scans from Spark
34 | *
35 | * KuduContext will take the responsibility
36 | * of disseminating the configuration information
37 | * to the workers and managing the life cycle of Kudu client connections.
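 *
 * Example (sketch only; the master address, table, and column projection are placeholders):
 *   val kuduContext = new KuduContext(sc, "kudu-master-host")
 *   val rdd = kuduContext.kuduRDD("gamer", "col_a,col_b")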
38 | */
39 | class KuduContext(@transient sc: SparkContext,
40 | @transient kuduMaster: String)
41 | extends Serializable with Logging {
42 |
43 | val broadcastedKuduMaster = sc.broadcast(kuduMaster)
44 |
45 | LatestKuduContextCache.latest = this
46 |
47 | /**
48 | * A simple enrichment of the traditional Spark RDD foreachPartition.
49 | * This function differs from the original in that it offers the
50 | * developer access to an already connected KuduClient object
51 | *
52 | * Note: Do not close the client objects. All client
53 | * management is handled outside this method
54 | *
55 | * @param rdd Original RDD with data to iterate over
56 | * @param f Function to be given an iterator to iterate through
57 | * the RDD values and KuduClient objects to interact
58 | * with Kudu
59 | */
60 | def foreachPartition[T](rdd: RDD[T],
61 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit):Unit = {
62 | rdd.foreachPartition(
63 | it => kuduForeachPartition(it, f))
64 | }
65 |
66 | /**
67 | * A simple enrichment of the traditional Spark Streaming dStream foreach
68 | * This function differs from the original in that it offers the
69 | * developer access to an already connected KuduClient object
70 | *
71 | * Note: Do not close the client objects. All client
72 | * management is handled outside this method
73 | *
74 | * @param dstream Original DStream with data to iterate over
75 | * @param f Function to be given an iterator to iterate through
76 | * the DStream values and KuduClient objects to
77 | * interact with Kudu
78 | */
79 | def foreachPartition[T](dstream: DStream[T],
80 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit):Unit = {
81 | dstream.foreachRDD((rdd, time) => {
82 | foreachPartition(rdd, f)
83 | })
84 | }
85 |
86 | /**
87 | * A simple enrichment of the traditional Spark RDD mapPartition.
88 | * This function differs from the original in that it offers the
89 | * developer access to an already connected KuduClient object
90 | *
91 | * Note: Do not close the client objects. All client
92 | * management is handled outside this method
93 | *
94 | * @param rdd Original RDD with data to iterate over
95 | * @param mp Function to be given an iterator to iterate through
96 | * the RDD values and KuduClient objects to interact
97 | * with Kudu
98 | * @return Returns a new RDD generated by the user-defined
99 | * function just like normal mapPartitions
100 | */
101 | def mapPartitions[T, R: ClassTag](rdd: RDD[T],
102 | mp: (Iterator[T], KuduClient, AsyncKuduClient) => Iterator[R]): RDD[R] = {
103 |
104 | rdd.mapPartitions[R](it => kuduMapPartition[T, R](it, mp))
105 |
106 | }
107 |
108 | /**
109 | * A simple enrichment of the traditional Spark Streaming DStream
110 | * foreachPartition.
111 | *
112 | * This function differs from the original in that it offers the
113 | * developer access to an already connected KuduClient object
114 | *
115 | * Note: Do not close the client objects. All client
116 | * management is handled outside this method
117 | *
118 | * Note: Make sure to partition correctly to avoid memory issues when
119 | * getting data from Kudu
120 | *
121 | * @param dstream Original DStream with data to iterate over
122 | * @param f Function to be given an iterator to iterate through
123 | * the DStream values and KuduClient objects to
124 | * interact with Kudu
127 | */
128 | def streamForeachPartition[T](dstream: DStream[T],
129 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit): Unit = {
130 |
131 | dstream.foreachRDD(rdd => this.foreachPartition(rdd, f))
132 | }
133 |
134 | /**
135 | * A simple enrichment of the traditional Spark Streaming DStream
136 | * mapPartition.
137 | *
138 | * This function differs from the original in that it offers the
139 | * developer access to an already connected KuduClient object
140 | *
141 | * Note: Do not close the client objects. All client
142 | * management is handled outside this method
143 | *
144 | * Note: Make sure to partition correctly to avoid memory issues when
145 | * getting data from Kudu
146 | *
147 | * @param dstream Original DStream with data to iterate over
148 | * @param f Function to be given an iterator to iterate through
149 | * the DStream values and KuduClient objects to
150 | * interact with Kudu
151 | * @return Returns a new DStream generated by the user-defined
152 | * function just like normal mapPartitions
153 | */
154 | def streamMapPartitions[T, U: ClassTag](dstream: DStream[T],
155 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Iterator[U]):
156 | DStream[U] = {
157 | dstream.mapPartitions(it => kuduMapPartition[T, U](
158 | it,
159 | f))
160 | }
161 |
162 |
163 |
164 |
165 | def kuduRDD(tableName: String, columnProjection: String = null):
166 | RDD[(NullWritable, RowResult)] = {
167 |
168 | val conf = new Configuration
169 | conf.set("kudu.mapreduce.master.address",kuduMaster)
170 | conf.set("kudu.mapreduce.input.table", tableName)
171 | if (columnProjection != null) {
172 | conf.set("kudu.mapreduce.column.projection", columnProjection)
173 | }
174 |
175 | val rdd = sc.newAPIHadoopRDD(conf, classOf[KuduTableInputFormat], classOf[NullWritable], classOf[RowResult])
176 |
177 | rdd
178 | }
179 |
180 |
181 | /**
182 | * underlying wrapper for all foreach functions in KuduContext
183 | */
184 | private def kuduForeachPartition[T](it: Iterator[T],
185 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit) = {
186 | f(it, KuduClientCache.getKuduClient(broadcastedKuduMaster.value),
187 | KuduClientCache.getAsyncKuduClient(broadcastedKuduMaster.value))
188 | }
189 |
190 | /**
191 | * underlying wrapper for all mapPartition functions in KuduContext
192 | *
193 | */
194 | private def kuduMapPartition[K, U](it: Iterator[K],
195 | mp: (Iterator[K], KuduClient, AsyncKuduClient) =>
196 | Iterator[U]): Iterator[U] = {
197 |
198 |
199 | val res = mp(it,
200 | KuduClientCache.getKuduClient(broadcastedKuduMaster.value),
201 | KuduClientCache.getAsyncKuduClient(broadcastedKuduMaster.value))
202 |
203 | res
204 |
205 | }
206 |
207 | /**
208 | * underlying wrapper for all get mapPartition functions in KuduContext
209 | */
210 | private class ScannerMapPartition[T, U](batchSize: Integer,
211 | makeScanner: (T, KuduClient, AsyncKuduClient) => KuduScanner,
212 | convertResult: (RowResultIterator) => U)
213 | extends Serializable {
214 |
215 | def run(iterator: Iterator[T], kuduClient: KuduClient, asyncKuduClient: AsyncKuduClient): Iterator[U] = {
216 |
217 |
218 | iterator.flatMap( t => {
219 | val resultList = new mutable.MutableList[U]
220 | val scanner = makeScanner(t, kuduClient, asyncKuduClient)
221 |
222 | while (scanner.hasMoreRows) {
223 | resultList.+=(convertResult(scanner.nextRows()))
224 | }
225 | resultList.iterator
226 | })
227 | }
228 | }
229 |
230 | /**
231 | * Produces a ClassTag[T], which is actually just a casted ClassTag[AnyRef].
232 | *
233 | * This method is used to keep ClassTags out of the external Java API, as
234 | * the Java compiler cannot produce them automatically. While this
235 | * ClassTag-faking does please the compiler, it can cause problems at runtime
236 | * if the Scala API relies on ClassTags for correctness.
237 | *
238 | * Often, though, a ClassTag[AnyRef] will not lead to incorrect behavior,
239 | * just worse performance or security issues.
240 | * For instance, an Array of AnyRef can hold any type T, but may lose primitive
241 | * specialization.
242 | */
243 | private[spark]
244 | def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]]
245 | }
246 |
247 | object LatestKuduContextCache {
248 | var latest:KuduContext = null
249 | }
250 |
251 | object KuduClientCache {
252 | var kuduClient: KuduClient = null
253 | var asyncKuduClient: AsyncKuduClient = null
254 |
255 | def getKuduClient(kuduMaster: String): KuduClient = {
256 | this.synchronized {
257 | if (kuduClient == null) {
258 | kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
259 | }
260 | }
261 | kuduClient
262 | }
263 |
264 | def getAsyncKuduClient(kuduMaster: String): AsyncKuduClient = {
265 | this.synchronized {
266 | if (asyncKuduClient == null) {
267 | asyncKuduClient = new AsyncKuduClient.AsyncKuduClientBuilder(kuduMaster).build()
268 | }
269 | }
270 | asyncKuduClient
271 | }
272 |
273 | }
274 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/KuduDStreamFunctions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 | package org.kududb.spark
18 |
19 | import org.apache.spark.streaming.dstream.DStream
20 | import org.kududb.client._
21 |
22 | import scala.reflect.ClassTag
23 |
24 | /**
25 | * KuduDStreamFunctions contains a set of implicit functions that can be
26 | * applied to a Spark DStream so that we can easily interact with Kudu
27 | */
28 | object KuduDStreamFunctions {
29 |
30 | /**
31 | * These are implicit methods for a DStream that contains any type of
32 | * data.
33 | *
34 | * @param dStream This is for dStreams of any type
35 | * @tparam T Type T
36 | */
37 | implicit class GenericKuduDStreamFunctions[T](val dStream: DStream[T]) {
38 |
39 |
40 | /**
41 | * Implicit method that gives easy access to KuduContext's
42 | * foreachPartition method. This will act very much like a normal DStream
43 | * foreach method except that you will now have a Kudu connection
44 | * while iterating through the values.
45 | *
46 | * @param kc The KuduContext object to identify which Kudu
47 | * cluster connection to use
48 | * @param f This function will get an iterator for a partition of a
49 | * DStream along with KuduClient objects to interact with Kudu
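 *
 * Example (sketch only; assumes a KuduContext named kc already exists):
 *   import org.kududb.spark.KuduDStreamFunctions._
 *   dstream.kuduForeachPartition(kc, (it, client, asyncClient) => it.foreach(println))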
50 | */
51 | def kuduForeachPartition(kc: KuduContext,
52 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit): Unit = {
53 | kc.streamForeachPartition(dStream, f)
54 | }
55 |
56 | /**
57 | * Implicit method that gives easy access to KuduContext's
58 | * mapPartitions method. This will act very much like a normal DStream
59 | * mapPartitions method except that you will now have a
60 | * Kudu connection while iterating through the values
61 | *
62 | * @param kc The KuduContext object to identify which Kudu
63 | * cluster connection to use
64 | * @param f This function will get an iterator for a partition of a
65 | * DStream along with KuduClient objects to interact with Kudu
66 | * @tparam R This is the type of objects that will go into the resulting
67 | * DStream
68 | * @return A resulting DStream of type R
69 | */
70 | def kuduMapPartitions[R: ClassTag](kc: KuduContext,
71 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Iterator[R]):
72 | DStream[R] = {
73 | kc.streamMapPartitions(dStream, f)
74 | }
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/KuduRDDFunctions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.kududb.spark
19 |
20 | import org.apache.spark.rdd.RDD
21 | import org.kududb.client._
22 |
23 | import scala.reflect.ClassTag
24 |
25 | /**
26 | * KuduRDDFunctions contains a set of implicit functions that can be
27 | * applied to a Spark RDD so that we can easily interact with Kudu
28 | */
29 | object KuduRDDFunctions
30 | {
31 |
32 | /**
33 | * These are implicit methods for a RDD that contains any type of
34 | * data.
35 | *
36 | * @param rdd This is for rdd of any type
37 | * @tparam T This is any type
38 | */
39 | implicit class GenericHBaseRDDFunctions[T](val rdd: RDD[T]) {
40 |
41 |
42 | /**
43 | * Implicit method that gives easy access to KuduContext's
44 | * foreachPartition method. This will act very much like a normal RDD
45 | * foreach method except that you will now have a Kudu connection
46 | * while iterating through the values.
47 | *
48 | * @param kc The KuduContext object to identify which Kudu
49 | * cluster connection to use
50 | * @param f This function will get an iterator for a partition of an
51 | * RDD along with KuduClient objects to interact with Kudu
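 *
 * Example (sketch only; assumes a KuduContext named kc already exists):
 *   import org.kududb.spark.KuduRDDFunctions._
 *   rdd.hbaseForeachPartition(kc, (it, client, asyncClient) => it.foreach(println))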
52 | */
53 | def hbaseForeachPartition(kc: KuduContext,
54 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Unit): Unit = {
55 | kc.foreachPartition(rdd, f)
56 | }
57 |
58 | /**
59 | * Implicit method that gives easy access to KuduContext's
60 | * mapPartitions method. This will act very much like a normal RDD
61 | * mapPartitions method except that you will now have a
62 | * Kudu connection while iterating through the values
63 | *
64 | * @param kc The KuduContext object to identify which Kudu
65 | * cluster connection to use
66 | * @param f This function will get an iterator for a partition of an
67 | * RDD along with KuduClient objects to interact with Kudu
68 | * @tparam R This is the type of objects that will go into the resulting
69 | * RDD
70 | * @return A resulting RDD of type R
71 | */
72 | def hbaseMapPartitions[R: ClassTag](kc: KuduContext,
73 | f: (Iterator[T], KuduClient, AsyncKuduClient) => Iterator[R]):
74 | RDD[R] = {
75 | kc.mapPartitions[T,R](rdd, f)
76 | }
77 | }
78 | }
79 |
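80 | // Usage sketch (illustrative only; `sc`, `kuduMaster`, and `rdd` are assumed): the implicit
81 | // class above (which keeps its original HBase-derived name) gives an RDD per-partition access
82 | // to the Kudu clients managed by a KuduContext:
83 | //
84 | //   import org.kududb.spark.KuduRDDFunctions.GenericHBaseRDDFunctions
85 | //   val kc = new KuduContext(sc, kuduMaster)
86 | //   rdd.hbaseForeachPartition(kc, (it, kuduClient, asyncKuduClient) => {
87 | //     // open tables / create a session once per partition, then iterate `it`
88 | //   })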
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/basic/AddSingleRecord.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.basic
2 |
3 | import java.util.Random
4 |
5 | import org.kududb.client.{PartialRow, KuduClient}
6 |
7 | object AddSingleRecord {
8 | def main(args:Array[String]): Unit = {
9 | if (args.length == 0) {
10 |       println("{kuduMaster} {tableName} {rowKey}")
11 | return
12 | }
13 |
14 | val kuduMaster = args(0)
15 | val tableName = args(1)
16 | val rowKey = args(2)
17 |
18 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
19 | val table = kuduClient.openTable(tableName)
20 | val session = kuduClient.newSession()
21 |
22 | val lowerBound = new PartialRow(table.getSchema)
23 | lowerBound.addString(0, rowKey)
24 | val upperBound = new PartialRow(table.getSchema)
25 | upperBound.addString(0, rowKey + "_")
26 |
27 | var startTime = System.currentTimeMillis()
28 | val random = new Random()
29 |
30 | startTime = System.currentTimeMillis()
31 | val update = table.newInsert()
32 | val row = update.getRow
33 | row.addString(0, rowKey)
34 | val columns = table.getSchema.getColumns
35 | for (c <- 1 until columns.size()) {
36 | println(columns.get(c).getName + " " + columns.get(c).getType)
37 | row.addInt(columns.get(c).getName, random.nextInt(100000))
38 | }
39 | session.apply(update)
40 | println("new key: " + rowKey)
41 | println(" new key time spent: " + (System.currentTimeMillis() - startTime))
42 |
43 | startTime = System.currentTimeMillis()
44 | val scanner2 = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build()
45 |
46 | while (scanner2.hasMoreRows) {
47 | val rows = scanner2.nextRows()
48 | while (rows.hasNext) {
49 | val row = rows.next()
50 | println("NewValue: " + rowKey + " " + row.rowToString())
51 | }
52 | }
53 | scanner2.close()
54 | println(" scan time spent: " + (System.currentTimeMillis() - startTime))
55 |
56 | val scannerX = kuduClient.newScannerBuilder(table).build()
57 | while (scannerX.hasMoreRows) {
58 | val rows = scannerX.nextRows()
59 | while (rows.hasNext) {
60 | val row = rows.next()
61 | println("Full Scan: " + row.rowToString())
62 | }
63 | }
64 | println("done")
65 | kuduClient.shutdown()
66 | }
67 | }
68 |
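69 | // Note on the bounds used above (explanatory comment, not original code): Kudu scan upper
70 | // bounds are exclusive, so scanning from `rowKey` (inclusive) to `rowKey + "_"` (exclusive)
71 | // narrows the scan to the key that was just inserted instead of reading the whole table,
72 | // which is why the bounded scan is timed separately from the full table scan.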
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/basic/BasicExample.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.basic
2 |
3 | import java.util
4 | import java.util.Random
5 |
6 | import org.apache.spark.sql.SQLContext
7 | import org.apache.spark.{SparkConf, SparkContext}
8 | import org.kududb.ColumnSchema.ColumnSchemaBuilder
9 | import org.kududb.client.KuduClient
10 | import org.kududb.{ColumnSchema, Schema, Type}
11 |
12 | object BasicExample {
13 | def main(args: Array[String]): Unit = {
14 |
15 | val kuduMaster = "quickstart.cloudera"
16 |
17 | println(" -- Starting ")
18 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
19 | try {
20 | println(" -- ")
21 |
22 | val columnList = new util.ArrayList[ColumnSchema]()
23 | columnList.add(new ColumnSchemaBuilder("KEY_ID", Type.STRING).key(true).build())
24 | columnList.add(new ColumnSchemaBuilder("COL_A", Type.STRING).key(false).build())
25 | columnList.add(new ColumnSchemaBuilder("COL_B", Type.STRING).key(false).build())
26 | columnList.add(new ColumnSchemaBuilder("COL_C", Type.STRING).key(false).build())
27 | val schema = new Schema(columnList)
28 |
29 | if (kuduClient.tableExists("foobar")) {
30 | kuduClient.deleteTable("foobar")
31 | }
32 | kuduClient.createTable("foobar", schema)
33 |
34 | val session = kuduClient.newSession()
35 | val table = kuduClient.openTable("foobar")
36 |
37 | try {
38 | val random = new Random()
39 | for (i <- 0 until 10) {
40 | val insert = table.newInsert()
41 | val row = insert.getRow()
42 | row.addString(0, i.toString)
43 | row.addString(1, "value " + i)
44 | row.addString(2, "42:" + i)
45 | row.addString(3, "Cat" + random.nextGaussian())
46 | session.apply(insert)
47 | }
48 | session.flush()
49 | } finally {
50 | session.close()
51 | }
52 |
53 | val tableList = kuduClient.getTablesList.getTablesList
54 | for (i <- 0 until tableList.size()) {
55 | println("Table " + i + ":" + tableList.get(i))
56 | }
57 |
58 | val sparkConfig = new SparkConf()
59 | sparkConfig.set("spark.broadcast.compress", "false")
60 | sparkConfig.set("spark.shuffle.compress", "false")
61 | sparkConfig.set("spark.shuffle.spill.compress", "false")
62 | val sc = new SparkContext("local[2]", "SparkSQL on Kudu", sparkConfig)
63 |
64 | val sqlContext = new SQLContext(sc)
65 |
66 | val df = sqlContext.load("org.kududb.spark",
67 | Map("kudu.table" -> "foobar", "kudu.master" -> kuduMaster))
68 |
69 | df.registerTempTable("foobar")
70 |
71 | sqlContext.sql("SELECT * FROM foobar").foreach(r => {
72 | println("Row: " + r)
73 | })
74 | } finally {
75 | kuduClient.shutdown()
76 | }
77 | println("-- finished")
78 | }
79 | }
80 |
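81 | // Side note (sketch, not part of the original example): on Spark 1.4+ the same DataFrame can be
82 | // obtained through the DataFrameReader API instead of the deprecated sqlContext.load, e.g.:
83 | //
84 | //   val df2 = sqlContext.read
85 | //     .format("org.kududb.spark")
86 | //     .options(Map("kudu.table" -> "foobar", "kudu.master" -> kuduMaster))
87 | //     .load()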
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/basic/BasicSparkSQLExamples.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.basic
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.sql.SQLContext
5 | import org.apache.spark.{SparkConf, SparkContext}
6 | import org.apache.spark.mllib.clustering.KMeans
7 | import org.apache.spark.mllib.linalg.Vectors
8 |
9 | object BasicSparkSQLExamples {
10 | def main(args:Array[String]): Unit = {
11 | if (args.length == 0) {
12 |       println("{kuduMaster} {tableName} {l for local}"); return
13 | }
14 |
15 | Logger.getRootLogger.setLevel(Level.ERROR)
16 |
17 | val kuduMaster = args(0)
18 | val tableName = args(1)
19 | val runLocal = args(2).equals("l")
20 |
21 | println("starting")
22 | var sc:SparkContext = null
23 | if (runLocal) {
24 | val sparkConfig = new SparkConf()
25 | sparkConfig.set("spark.broadcast.compress", "false")
26 | sparkConfig.set("spark.shuffle.compress", "false")
27 | sparkConfig.set("spark.shuffle.spill.compress", "false")
28 | sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
29 | } else {
30 | val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
31 | sc = new SparkContext(sparkConfig)
32 | }
33 |
34 | try {
35 | println("Setting up Tables")
36 | val sqlContext = new SQLContext(sc)
37 | sqlContext.load("org.kududb.spark",
38 | Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName)
39 |
40 | println("Query 1: SELECT count(*) FROM " + tableName)
41 | val startTimeQ1 = System.currentTimeMillis()
42 | sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => {
43 | println(" - (" + r + ")")
44 | })
45 | println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1))
46 |
47 | println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100")
48 | val startTimeQ2 = System.currentTimeMillis()
49 | sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => {
50 | println(" - (" + r + ")")
51 | })
52 | println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2))
53 |
54 | val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)"
55 | println("Query 3: " + q3)
56 | val startTimeQ3 = System.currentTimeMillis()
57 | sqlContext.sql(q3).take(100).foreach(r => {
58 | println(" - (" + r + ")")
59 | })
60 | println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3))
61 | /*
62 | val q4 = "select host, metric, avg(value), count(*) from metrics group by host, metric"
63 | println("Query 4: " + q4)
64 | val startTimeQ4 = System.currentTimeMillis()
65 | sqlContext.sql(q4).take(100).foreach(r => {
66 | println(" - (" + r + ")")
67 | })
68 | println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4))
69 |
70 | */
71 |
72 | println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName )
73 | val startTimeQ5 = System.currentTimeMillis()
74 | val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000")
75 |
76 | val parsedData = resultDf.map(r => {
77 | val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble)
78 | Vectors.dense(array)
79 | })
80 | val clusters = KMeans.train(parsedData, 3, 4)
81 | clusters.clusterCenters.foreach(v => println(" Vector Center:" + v))
82 |
83 | //TODO add Mllib here
84 | println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5))
85 |
86 | } finally {
87 | sc.stop()
88 | }
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/basic/InitialDataPopulation.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.basic
2 |
3 | import java.util
4 | import java.util.Random
5 |
6 | import org.kududb.{Schema, Type, ColumnSchema}
7 | import org.kududb.ColumnSchema.ColumnSchemaBuilder
8 | import org.kududb.client.{AsyncKuduClient, KuduClient}
9 |
10 |
11 | object InitialDataPopulation {
12 | def main(args:Array[String]): Unit = {
13 | if (args.length == 0) {
14 |       println("{kuduMaster} {tableName} {numOfColumns} {numOfRows}")
15 |
16 | //"quickstart.cloudera"
17 |
18 | return
19 | }
20 | val kuduMaster = args(0)
21 | val tableName = args(1)
22 | val numOfColumns = args(2).toInt
23 | val numOfRows = args(3).toInt
24 |
25 | val kuduClient = new AsyncKuduClient.AsyncKuduClientBuilder(kuduMaster).build()
26 | try {
27 | //Delete table if exist
28 | if (kuduClient.tableExists(tableName).join()) {
29 | kuduClient.deleteTable(tableName).join()
30 | }
31 |
32 | //Create Schema
33 | val columnList = new util.ArrayList[ColumnSchema]()
34 | columnList.add(new ColumnSchemaBuilder("key_id", Type.STRING).key(true).build())
35 | for (c <- 0 until numOfColumns) {
36 | columnList.add(new ColumnSchemaBuilder("col_" + c, Type.INT32).key(false).build())
37 | }
38 | val schema = new Schema(columnList)
39 |
40 | //Create table
41 | kuduClient.createTable(tableName, schema).join()
42 |
43 | //Populate table
44 | val random = new Random
45 | val table = kuduClient.openTable(tableName).join()
46 | val asyncSession = kuduClient.newSession()
47 |
48 | for (r <- 0 until numOfRows) {
49 | val insert = table.newInsert()
50 | val row = insert.getRow()
51 | row.addString(0, NameGenerator.getName())
52 | val columns = table.getSchema.getColumns
53 | for (c <- 1 until columns.size()) {
54 | row.addInt(columns.get(c).getName, random.nextInt(100000))
55 | }
56 | asyncSession.apply(insert)
57 |
58 | if (r % 1000 == 0) {
59 | println("Inserted: " + r)
60 | }
61 | }
62 |       asyncSession.flush().join()
63 |
64 | val scannerX = kuduClient.newScannerBuilder(table).build()
65 | while (scannerX.hasMoreRows) {
66 | val rows = scannerX.nextRows().join()
67 | while (rows.hasNext) {
68 | val row = rows.next()
69 | println(" - " + row.rowToString())
70 | }
71 | }
72 |
73 | asyncSession.close()
74 |
75 | } finally {
76 | kuduClient.shutdown()
77 | }
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/basic/ModifySingleRecord.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.basic
2 |
3 | import org.kududb.client.{PartialRow, KuduClient}
4 |
5 | object ModifySingleRecord {
6 | def main(args:Array[String]): Unit = {
7 | if (args.length == 0) {
8 |       println("{kuduMaster} {tableName} {rowKey} {columnIndexToChange} {newValue}")
9 | return
10 | }
11 |
12 | val kuduMaster = args(0)
13 | val tableName = args(1)
14 | val rowKey = args(2)
15 | val columnIndexToChange = args(3).toInt
16 | val newValue = args(4).toInt
17 |
18 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
19 | val table = kuduClient.openTable(tableName)
20 | val session = kuduClient.newSession()
21 |
22 | val lowerBound = new PartialRow(table.getSchema)
23 | lowerBound.addString(0, rowKey)
24 | val upperBound = new PartialRow(table.getSchema)
25 | upperBound.addString(0, rowKey + "_")
26 |
27 | var startTime = System.currentTimeMillis()
28 | val scanner = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build()
29 |
30 | while (scanner.hasMoreRows) {
31 | val rows = scanner.nextRows()
32 | while (rows.hasNext) {
33 | val row = rows.next()
34 | println("InitialValue: " + rowKey + " " + row.rowToString())
35 | }
36 | }
37 | println(" scan time spent: " + (System.currentTimeMillis() - startTime))
38 | scanner.close()
39 |
40 | startTime = System.currentTimeMillis()
41 | val update = table.newUpdate()
42 | val row = update.getRow
43 | row.addString(0, rowKey)
44 | row.addInt(columnIndexToChange, newValue)
45 | session.apply(update)
46 | println("Update: " + rowKey)
47 | println(" update time spent: " + (System.currentTimeMillis() - startTime))
48 |
49 | startTime = System.currentTimeMillis()
50 | val scanner2 = kuduClient.newScannerBuilder(table).lowerBound(lowerBound).exclusiveUpperBound(upperBound).build()
51 |
52 | while (scanner2.hasMoreRows) {
53 |       val rows = scanner2.nextRows()
54 | while (rows.hasNext) {
55 | val row = rows.next()
56 | println("NewValue: " + rowKey + " " + row.rowToString())
57 | }
58 | }
59 | scanner2.close()
60 | println(" scan time spent: " + (System.currentTimeMillis() - startTime))
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/basic/NameGenerator.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.basic
2 |
3 | import java.util.Random
4 |
5 | import scala.collection.mutable
6 |
7 | object NameGenerator {
8 |
9 | val random = new Random()
10 | val listOfNames = new mutable.MutableList[NameAndCounter]
11 | listOfNames += new NameAndCounter("Katlyn")
12 | listOfNames += new NameAndCounter("Laurena")
13 | listOfNames += new NameAndCounter("Jenise")
14 | listOfNames += new NameAndCounter("Vida")
15 | listOfNames += new NameAndCounter("Delphine")
16 | listOfNames += new NameAndCounter("Tiffanie")
17 | listOfNames += new NameAndCounter("Carroll")
18 | listOfNames += new NameAndCounter("Steve")
19 | listOfNames += new NameAndCounter("Nu")
20 | listOfNames += new NameAndCounter("Robbin")
21 | listOfNames += new NameAndCounter("Mahalia")
22 | listOfNames += new NameAndCounter("Norah")
23 | listOfNames += new NameAndCounter("Selina")
24 | listOfNames += new NameAndCounter("Cornelius")
25 | listOfNames += new NameAndCounter("Bennie")
26 | listOfNames += new NameAndCounter("Kemberly")
27 | listOfNames += new NameAndCounter("Johnie")
28 | listOfNames += new NameAndCounter("Jenee")
29 | listOfNames += new NameAndCounter("Napoleon")
30 | listOfNames += new NameAndCounter("Brenton")
31 | listOfNames += new NameAndCounter("Roxana")
32 | listOfNames += new NameAndCounter("Kalyn")
33 | listOfNames += new NameAndCounter("Jeana")
34 | listOfNames += new NameAndCounter("Tennie")
35 | listOfNames += new NameAndCounter("Tasia")
36 | listOfNames += new NameAndCounter("Ashely")
37 | listOfNames += new NameAndCounter("Hester")
38 | listOfNames += new NameAndCounter("Zita")
39 | listOfNames += new NameAndCounter("Evalyn")
40 | listOfNames += new NameAndCounter("Anderson")
41 | listOfNames += new NameAndCounter("Elaina")
42 | listOfNames += new NameAndCounter("Benny")
43 | listOfNames += new NameAndCounter("Heidi")
44 | listOfNames += new NameAndCounter("Mammie")
45 | listOfNames += new NameAndCounter("Alisa")
46 | listOfNames += new NameAndCounter("Billie")
47 | listOfNames += new NameAndCounter("Wan")
48 | listOfNames += new NameAndCounter("Dionna")
49 | listOfNames += new NameAndCounter("Julene")
50 | listOfNames += new NameAndCounter("Chasidy")
51 | listOfNames += new NameAndCounter("Vennie")
52 | listOfNames += new NameAndCounter("Cara")
53 | listOfNames += new NameAndCounter("Charissa")
54 | listOfNames += new NameAndCounter("Russell")
55 | listOfNames += new NameAndCounter("Daniela")
56 | listOfNames += new NameAndCounter("Kindra")
57 | listOfNames += new NameAndCounter("Eduardo")
58 | listOfNames += new NameAndCounter("Marci")
59 | listOfNames += new NameAndCounter("Gustavo")
60 | listOfNames += new NameAndCounter("Dianna ")
61 |
62 | def getName(): String = {
63 |     val nameAndCounter = listOfNames.get(random.nextInt(listOfNames.length)).get
64 | nameAndCounter.counter += 1
65 | nameAndCounter.name + "_" + nameAndCounter.counter
66 | }
67 | }
68 |
69 | class NameAndCounter(val name:String = "N/A", var counter:Int = 0) {
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/basic/ScanTable.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.basic
2 |
3 | import org.kududb.client.KuduClient
4 |
5 | object ScanTable {
6 | def main(args:Array[String]): Unit = {
7 | if (args.length == 0) {
8 |       println("{kuduMaster} {tableName} {limit}")
9 | return
10 | }
11 | val kuduMaster = args(0)
12 | val tableName = args(1)
13 | val limit = args(2).toInt
14 |
15 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
16 | val table = kuduClient.openTable(tableName)
17 | println("starting scan")
18 |     val scannerX = kuduClient.newScannerBuilder(table).limit(limit).build()
19 | while (scannerX.hasMoreRows) {
20 | val rows = scannerX.nextRows()
21 | while (rows.hasNext) {
22 | val row = rows.next()
23 | println(" - " + row.rowToString())
24 | }
25 | }
26 | println("finished scan")
27 | kuduClient.shutdown()
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/GamerEvent.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer
2 |
3 | class GamerEvent(var gamerId:String = "",
4 | var lastTimePlayed:Long = 0,
5 | var gamesPlayed:Int = 1,
6 | var gamesWon:Int = 0,
7 | var oks:Int = 0,
8 | var deaths:Int = 0,
9 | var damageGiven:Int = 0,
10 | var damageTaken:Int = 0,
11 | var isInsert:Boolean = false,
12 | var maxOksInOneGame:Int = 0,
13 | var maxDeathsInOneGame:Int = 0,
14 | var hasChanged:Boolean = false) extends Serializable {
15 |
16 | override def toString():String = {
17 | gamerId + "," +
18 | lastTimePlayed + "," +
19 | gamesPlayed + "," +
20 | gamesWon + "," +
21 | oks + "," +
22 | deaths + "," +
23 | damageGiven + "," +
24 | damageTaken + "," +
25 | isInsert + "," +
26 | maxOksInOneGame + "," +
27 | maxDeathsInOneGame
28 | }
29 |
30 | def += (gamerEvent: GamerEvent): Unit = {
31 | gamerId = gamerEvent.gamerId
32 | lastTimePlayed = gamerEvent.lastTimePlayed
33 | gamesPlayed += gamerEvent.gamesPlayed
34 | gamesWon += gamerEvent.gamesWon
35 | oks += gamerEvent.oks
36 | deaths += gamerEvent.deaths
37 | damageGiven += gamerEvent.damageGiven
38 | damageTaken += gamerEvent.damageTaken
39 | if (oks > maxOksInOneGame) maxOksInOneGame = oks
40 | if (deaths > maxDeathsInOneGame) maxDeathsInOneGame = deaths
41 | isInsert = isInsert && gamerEvent.isInsert
42 | }
43 | }
44 |
45 | object GamerEventBuilder extends Serializable {
46 | def build(input:String):GamerEvent = {
47 | val parts = input.split(",")
48 |
49 | if (parts(0).startsWith("14")) println("input:" + input)
50 |
51 | new GamerEvent(parts(0),
52 | parts(1).toLong,
53 | parts(2).toInt,
54 | parts(3).toInt,
55 | parts(4).toInt,
56 | parts(5).toInt,
57 | parts(6).toInt,
58 | parts(7).toInt,
59 | parts(8).equals("true"),
60 | parts(9).toInt,
61 | parts(10).toInt)
62 | }
63 | }
64 |
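65 | // Round-trip sketch (illustrative): toString() above emits the same 11 comma-separated fields
66 | // that GamerEventBuilder.build() parses, so events can travel as plain strings (e.g. as Kafka
67 | // message values in KafkaProducerInjector / GamerAggergatesSparkStreaming):
68 | //
69 | //   val original = new GamerEvent("42", System.currentTimeMillis(), 1, 1, 5, 3, 900, 450, true, 5, 3)
70 | //   val copy = GamerEventBuilder.build(original.toString())   // copy carries the same field values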
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/aggregates/CreateKuduTable.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.aggregates
2 |
3 | import java.util
4 | import java.util.ArrayList
5 |
6 | import org.kududb.ColumnSchema.ColumnSchemaBuilder
7 | import org.kududb.client.{CreateTableOptions, KuduClient}
8 | import org.kududb.{ColumnSchema, Schema, Type}
9 |
10 | object CreateGamerAggregatesKuduTable {
11 | def main(args:Array[String]): Unit = {
12 | if (args.length == 0) {
13 | println("{kuduMaster} {tableName}")
14 | return
15 | }
16 |
17 | val kuduMaster = args(0)
18 | val tableName = args(1)
19 | val numberOfBuckets = args(2).toInt
20 |
21 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
22 | val columnList = new util.ArrayList[ColumnSchema]()
23 |
24 | columnList.add(new ColumnSchemaBuilder("gamer_id", Type.STRING).key(true).build())
25 | columnList.add(new ColumnSchemaBuilder("last_time_played", Type.INT64).key(false).build())
26 | columnList.add(new ColumnSchemaBuilder("games_played", Type.INT32).key(false).build())
27 | columnList.add(new ColumnSchemaBuilder("games_won", Type.INT32).key(false).build())
28 | columnList.add(new ColumnSchemaBuilder("oks", Type.INT32).key(false).build())
29 | columnList.add(new ColumnSchemaBuilder("deaths", Type.INT32).key(false).build())
30 | columnList.add(new ColumnSchemaBuilder("damage_given", Type.INT32).key(false).build())
31 | columnList.add(new ColumnSchemaBuilder("damage_taken", Type.INT32).key(false).build())
32 | columnList.add(new ColumnSchemaBuilder("max_oks_in_one_game", Type.INT32).key(false).build())
33 | columnList.add(new ColumnSchemaBuilder("max_deaths_in_one_game", Type.INT32).key(false).build())
34 | val schema = new Schema(columnList)
35 |
36 | if (kuduClient.tableExists(tableName)) {
37 | println("Deleting Table")
38 | kuduClient.deleteTable(tableName)
39 | }
40 | val builder = new CreateTableOptions()
41 |
42 | val hashColumnList = new ArrayList[String]
43 | hashColumnList.add("gamer_id")
44 |
45 | builder.addHashPartitions(hashColumnList, numberOfBuckets)
46 |
47 | println("Creating Table")
48 | kuduClient.createTable(tableName, schema, builder)
49 | println("Created Table")
50 | kuduClient.shutdown()
51 | }
52 | }
53 |
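54 | // Partitioning note: the table is hash-partitioned on gamer_id into `numberOfBuckets` tablets,
55 | // which spreads the randomly generated gamer ids from the injectors (and the Spark Streaming
56 | // upserts) roughly evenly across tablet servers.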
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/aggregates/DirectDataInjector.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.aggregates
2 |
3 | import java.util.Random
4 |
5 | import org.kududb.client.KuduClient
6 |
7 | object DirectDataInjector {
8 |
9 | val random = new Random
10 | def main(args:Array[String]): Unit = {
11 |
12 | if (args.length == 0) {
13 |       println("{kuduMaster} {tableName} {numberOfRecords}")
14 | return
15 | }
16 |
17 | val kuduMaster = args(0)
18 | val tableName = args(1)
19 | val numberOfRecords = args(2).toInt
20 |
21 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
22 | val table = kuduClient.openTable(tableName)
23 | val session = kuduClient.newSession()
24 |
25 |
26 |
27 |     for (i <- 0 until numberOfRecords) {
28 | val record = GamerDataGenerator.makeNewGamerRecord(100000)
29 | val op = table.newInsert()
30 |
31 | val row = op.getRow
32 | row.addString("gamer_id", record.gamerId)
33 | row.addLong("last_time_played", record.lastTimePlayed)
34 | row.addInt("games_played", record.gamesPlayed)
35 | row.addInt("games_won", record.gamesWon)
36 | row.addInt("oks", record.oks)
37 | row.addInt("deaths", record.deaths)
38 | row.addInt("damage_given", record.damageGiven)
39 | row.addInt("damage_taken", record.damageTaken)
40 | row.addInt("max_oks_in_one_game", record.maxOksInOneGame)
41 | row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame)
42 |
43 | session.apply(op)
44 | }
45 | session.flush()
46 |
47 | kuduClient.close()
48 |
49 |
50 | }
51 | }
52 |
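53 | // Performance note (sketch, assuming the synchronous client's defaults): the session above uses
54 | // AUTO_FLUSH_SYNC, so every apply() waits on a round trip to Kudu. For a bulk load, the streaming
55 | // job in this repo switches to background flushing, and the same two lines would apply here:
56 | //
57 | //   import org.kududb.client.SessionConfiguration.FlushMode
58 | //   session.setFlushMode(FlushMode.AUTO_FLUSH_BACKGROUND)   // buffer writes, flush asynchronously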
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/aggregates/GamerAggergatesSparkStreaming.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.aggregates
2 |
3 | import kafka.serializer.StringDecoder
4 | import org.apache.spark.rdd.RDD
5 | import org.apache.spark.streaming.dstream.DStream
6 | import org.apache.spark.streaming.kafka.KafkaUtils
7 | import org.apache.spark.streaming.{Seconds, StreamingContext}
8 | import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
9 | import org.kududb.client.Operation
10 | import org.kududb.client.SessionConfiguration.FlushMode
11 | import org.kududb.spark.KuduContext
12 | import org.kududb.spark.KuduDStreamFunctions.GenericKuduDStreamFunctions
13 | import org.kududb.spark.demo.gamer.{GamerEvent, GamerEventBuilder}
14 |
15 | object GamerAggergatesSparkStreaming {
16 |
17 | def main(args:Array[String]): Unit = {
18 | if (args.length == 0) {
19 | println("{brokerList} {topics} {kuduMaster} {tableName} {local}")
20 | }
21 | val brokerList = args(0)
22 | val topics = args(1)
23 | val kuduMaster = args(2)
24 | val tableName = args(3)
25 | val runLocal = args(4).equals("L")
26 |
27 | val sparkConf = new SparkConf().setAppName("GamerAggergatesSparkStreaming")
28 | var ssc:StreamingContext = null
29 | if (runLocal) {
30 | println("Running Local")
31 | val sparkConfig = new SparkConf()
32 | sparkConfig.set("spark.broadcast.compress", "false")
33 | sparkConfig.set("spark.shuffle.compress", "false")
34 | sparkConfig.set("spark.shuffle.spill.compress", "false")
35 | sparkConfig.set("spark.io.compression.codec", "lzf")
36 | val sc = new SparkContext("local[4]", "SparkSQL on Kudu", sparkConfig)
37 | ssc = new StreamingContext(sc, Seconds(2))
38 | } else {
39 | println("Running Cluster")
40 | ssc = new StreamingContext(sparkConf, Seconds(2))
41 | }
42 |
43 | val kuduContext = new KuduContext(ssc.sparkContext, kuduMaster)
44 |
45 | //Get original values from Kudu
46 | val originalKuduDStream = loadOriginalKuduData(tableName, kuduContext, ssc)
47 |
48 | //Connect to Kafka
49 | val newKafkaMessageDStream = loadDataFromKafka(topics, brokerList, ssc)
50 |
51 | val currentStateDStream = newKafkaMessageDStream.updateStateByKey[GamerEvent](
52 | (a:Seq[String], b:Option[GamerEvent]) => {
53 | val it = a.iterator
54 | if (!it.hasNext) {
55 | if (!b.isEmpty) {
56 | val existing = b.get
57 | existing.hasChanged = false
58 | Some(existing)
59 | } else {
60 | None
61 | }
62 | } else {
63 | val resultingValue = new GamerEvent()
64 |
65 | //Add up all the values in this micro batch
66 | while (it.hasNext) {
67 | val newPart = it.next()
68 | resultingValue += GamerEventBuilder.build(newPart)
69 | }
70 |
71 | if (b.isEmpty) {
72 | resultingValue.isInsert = true
73 | resultingValue.hasChanged = true
74 | Some(resultingValue)
75 | } else {
76 | val existing = b.get
77 | existing += resultingValue
78 | existing.isInsert = false
79 | existing.hasChanged = true
80 | Some(existing)
81 | }
82 | }
83 | }, new HashPartitioner (ssc.sparkContext.defaultParallelism), originalKuduDStream)
84 |
85 | currentStateDStream.kuduForeachPartition(kuduContext, (it, kuduClient, asyncKuduClient) => {
86 | val table = kuduClient.openTable(tableName)
87 |
88 | //This can be made to be faster
89 | val session = kuduClient.newSession()
90 | session.setFlushMode(FlushMode.AUTO_FLUSH_BACKGROUND)
91 |
92 | var operation: Operation = null
93 |
94 | var upserts = 0
95 | while (it.hasNext) {
96 | val gamerEventTuple = it.next()
97 |
98 | if (gamerEventTuple._2.hasChanged == true) {
99 | if (gamerEventTuple._2.isInsert) {
100 | operation = table.newInsert()
101 | } else {
102 | operation = table.newUpdate()
103 | }
104 |
105 | val row = operation.getRow
106 | row.addString("gamer_id", gamerEventTuple._2.gamerId.toString)
107 | row.addLong("last_time_played", gamerEventTuple._2.lastTimePlayed)
108 | row.addInt("games_played", gamerEventTuple._2.gamesPlayed)
109 | row.addInt("games_won", gamerEventTuple._2.gamesWon)
110 | row.addInt("oks", gamerEventTuple._2.oks)
111 | row.addInt("deaths", gamerEventTuple._2.deaths)
112 | row.addInt("damage_given", gamerEventTuple._2.damageGiven)
113 | row.addInt("damage_taken", gamerEventTuple._2.damageTaken)
114 | row.addInt("max_oks_in_one_game", gamerEventTuple._2.maxOksInOneGame)
115 | row.addInt("max_deaths_in_one_game", gamerEventTuple._2.maxDeathsInOneGame)
116 |
117 | session.apply(operation)
118 |
119 | upserts += 1
120 | }
121 | }
122 | session.close()
123 |
124 | println("upserts: " + upserts)
125 | })
126 | ssc.checkpoint("./checkpoint")
127 | ssc.start()
128 | ssc.awaitTermination()
129 | }
130 |
131 | def loadDataFromKafka(topics:String,
132 | brokerList:String,
133 | ssc:StreamingContext): DStream[(String, String)] = {
134 | val topicsSet = topics.split(",").toSet
135 | val kafkaParams = Map[String, String]("metadata.broker.list" -> brokerList)
136 | val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
137 | ssc, kafkaParams, topicsSet)
138 |
139 | messages.map(r => {
140 | (r._1, r._2)
141 | })
142 | }
143 |
144 | def loadOriginalKuduData(tableName:String,
145 | kuduContext:KuduContext,
146 | ssc:StreamingContext):RDD[(String, GamerEvent)] = {
147 | val kuduOriginalRdd = kuduContext.kuduRDD(tableName,
148 | "gamer_id,last_time_played,games_played,games_won,oks,deaths,damage_given,damage_taken,max_oks_in_one_game,max_deaths_in_one_game").
149 | map(r => {
150 | val row = r._2
151 |
152 | val gamerId = row.getString(0)
153 | val lastTimePlayed = row.getLong(1)
154 | val gamesPlayed = row.getInt(2)
155 | val gamesWon = row.getInt(3)
156 | val oks = row.getInt(4)
157 | val deaths = row.getInt(5)
158 | val damageGiven = row.getInt(6)
159 | val damageTaken = row.getInt(7)
160 | val maxOksInOneGame = row.getInt(8)
161 | val maxDeathsInOneGame = row.getInt(9)
162 |
163 | val initialGamerEvent = new GamerEvent(gamerId,lastTimePlayed,
164 | gamesPlayed,
165 | gamesWon,
166 | oks,
167 | deaths,
168 | damageGiven,
169 | damageTaken,
170 | false,
171 | maxOksInOneGame,
172 | maxDeathsInOneGame)
173 |
174 | (row.getString(0),initialGamerEvent)
175 | })
176 |
177 | kuduOriginalRdd
178 | }
179 | }
180 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/aggregates/GamerDataGenerator.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.aggregates
2 |
3 | import java.util.{Date, Random}
4 |
5 | import org.kududb.spark.demo.gamer.GamerEvent
6 |
7 | object GamerDataGenerator {
8 |
9 | val random = new Random()
10 | val averagePlayerPercentage = 40
11 | val advancedPlayerPercentage = 80
12 | val superStarPlayerPercentage = 100
13 | var date = System.currentTimeMillis()
14 |
15 | def makeNewGamerRecord(numOfGamers:Int): GamerEvent = {
16 | println("date" + new Date(date))
17 | date += 60000 * 60 * 6
18 | val playerSelection = random.nextInt(100)
19 | if (playerSelection < averagePlayerPercentage) {
20 |
21 | val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection
22 |
23 | new GamerEvent(gamerId.toString,
24 | date,
25 | 1,
26 | if (random.nextInt(10) > 7) 1 else 0,
27 | random.nextInt(10),
28 | random.nextInt(20),
29 | random.nextInt(1000),
30 | random.nextInt(2000))
31 | } else if (playerSelection < advancedPlayerPercentage) {
32 | val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection
33 |
34 | new GamerEvent(gamerId.toString,
35 | date,
36 | 1,
37 | if (random.nextInt(10) > 5) 1 else 0,
38 | random.nextInt(20),
39 | random.nextInt(18),
40 | random.nextInt(2000),
41 | random.nextInt(2000))
42 | } else {
43 | val gamerId = random.nextInt(numOfGamers/100) * 100 + playerSelection
44 |
45 | new GamerEvent(gamerId.toString,
46 | date,
47 | 1,
48 | if (random.nextInt(10) > 3) 1 else 0,
49 | random.nextInt(20),
50 | random.nextInt(10),
51 | random.nextInt(4000),
52 | random.nextInt(1500))
53 | }
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/aggregates/GamerSparkSQLExample.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.aggregates
2 |
3 | import org.apache.log4j.{Level, Logger}
4 | import org.apache.spark.mllib.clustering.KMeans
5 | import org.apache.spark.mllib.linalg.Vectors
6 | import org.apache.spark.sql.SQLContext
7 | import org.apache.spark.{SparkConf, SparkContext}
8 |
9 | object GamerSparkSQLExample {
10 | def main(args:Array[String]): Unit = {
11 | if (args.length == 0) {
12 | println("{kudumaster} {runLocal}")
13 | return
14 | }
15 |
16 | Logger.getRootLogger.setLevel(Level.ERROR)
17 |
18 | val kuduMaster = args(0)
19 | val runLocal = args(1).equals("l")
20 |
21 | println("Loading Spark Context")
22 | var sc:SparkContext = null
23 |
24 | if (runLocal) {
25 | val sparkConfig = new SparkConf()
26 | sparkConfig.set("spark.broadcast.compress", "false")
27 | sparkConfig.set("spark.shuffle.compress", "false")
28 | sparkConfig.set("spark.shuffle.spill.compress", "false")
29 | sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
30 | } else {
31 | val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
32 | sc = new SparkContext(sparkConfig)
33 | }
34 | println("Loading Spark Context: Finished")
35 |
36 | println("Setting up Tables")
37 | val sqlContext = new SQLContext(sc)
38 | sqlContext.load("org.kududb.spark",
39 | Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer")
40 |
41 | println("Query 1: SELECT count(*) FROM gamer")
42 | val startTimeQ1 = System.currentTimeMillis()
43 | sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => {
44 | println(" - (" + r + ")")
45 | })
46 | println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1))
47 |
48 | println("Query 2: SELECT * FROM gamer limit 100")
49 | val startTimeQ2 = System.currentTimeMillis()
50 | sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => {
51 | println(" - (" + r + ")")
52 | })
53 | println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2))
54 |
55 | println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100")
56 | val startTimeQ3 = System.currentTimeMillis()
57 | sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => {
58 | println(" - (" + r + ")")
59 | })
60 | println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3))
61 |
62 | println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer")
63 | val startTimeQ4 = System.currentTimeMillis()
64 | sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => {
65 | println(" - (" + r + ")")
66 | })
67 | println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4))
68 |
69 | println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" )
70 | val startTimeQ5 = System.currentTimeMillis()
71 | val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer")
72 |
73 | val parsedData = resultDf.map(r => {
74 | val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble)
75 | Vectors.dense(array)
76 | })
77 |
78 | val dataCount = parsedData.count()
79 |
80 | if (dataCount > 0) {
81 | val clusters = KMeans.train(parsedData, 3, 5)
82 | clusters.clusterCenters.foreach(v => println(" Vector Center:" + v))
83 |
84 | }
85 | //TODO add Mllib here
86 | println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5))
87 |     sc.stop()
88 |   }
89 | }
90 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/aggregates/KafkaProducerInjector.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.aggregates
2 |
3 | import java.util.Properties
4 |
5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
6 |
7 |
8 | object KafkaProducerInjector {
9 |
10 |
11 | def main(args:Array[String]): Unit = {
12 | if (args.length == 0) {
13 | println("{brokerList} {topic} {#OfRecords} {sleepTimeEvery10Records} {#OfGamers}")
14 | return
15 | }
16 |
17 | val brokerList = args(0)
18 | val topic = args(1)
19 | val numOfRecords = args(2).toInt
20 | val sleepTimeEvery10Records = args(3).toInt
21 | val numOfGamers = args(4).toInt
22 |
23 | val producer = getNewProducer(brokerList)
24 |
25 | for (i <- 0 until numOfRecords) {
26 |
27 | val gamerRecord = GamerDataGenerator.makeNewGamerRecord(numOfGamers)
28 |
29 | val message = new ProducerRecord[String, String](topic, gamerRecord.gamerId.toString, gamerRecord.toString())
30 |
31 | producer.send(message)
32 |
33 | if (i % 10 == 0) {
34 | Thread.sleep(sleepTimeEvery10Records)
35 | print(".")
36 | }
37 | if (i % 2000 == 0) {
38 | println()
39 | println("Records Sent:" + i)
40 | println()
41 | }
42 |     }
43 |     producer.close()
44 |   }
45 | def getNewProducer(brokerList:String): KafkaProducer[String, String] = {
46 | val kafkaProps = new Properties
47 | kafkaProps.put("bootstrap.servers", brokerList)
48 | kafkaProps.put("metadata.broker.list", brokerList)
49 |
50 |     // Serializer classes are mandatory; the gamer id is sent as the message key
51 | kafkaProps.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
52 | kafkaProps.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
53 | kafkaProps.put("acks", "0")
54 |
55 | // how many times to retry when produce request fails?
56 | kafkaProps.put("retries", "3")
57 | kafkaProps.put("linger.ms", "2")
58 | kafkaProps.put("batch.size", "1000")
59 | kafkaProps.put("queue.time", "2")
60 |
61 | new KafkaProducer[String, String](kafkaProps)
62 | }
63 |
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/aggregates/SparkSQLCmd.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.aggregates
2 |
3 | import org.apache.spark.sql.SQLContext
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | object SparkSQLCmd {
7 | def main(args:Array[String]): Unit = {
8 | if (args.length == 0) {
9 |       println("{kuduMaster} {L for Local}"); return
10 | }
11 |
12 | val kuduMaster = args(0)
13 | val runLocal = args(1).equals("L")
14 |
15 | var sc:SparkContext = null
16 |
17 | if (runLocal) {
18 | val sparkConfig = new SparkConf()
19 | sparkConfig.set("spark.broadcast.compress", "false")
20 | sparkConfig.set("spark.shuffle.compress", "false")
21 | sparkConfig.set("spark.shuffle.spill.compress", "false")
22 | sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
23 | } else {
24 | val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
25 | sc = new SparkContext(sparkConfig)
26 | }
27 |
28 | val sqlContext = new SQLContext(sc)
29 | println("Loading 'gamer' table")
30 | sqlContext.load("org.kududb.spark",
31 | Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer")
32 | println("Successfully loaded 'gamer' table")
33 |
34 | val doContinue = true
35 |
36 | while (doContinue) {
37 | val input = readLine("SparkSQL> ")
38 |
39 | try {
40 |
41 |
42 | val startTime = System.currentTimeMillis()
43 | val startTimeQ1 = System.currentTimeMillis()
44 | sqlContext.sql(input).take(1000).foreach(r => {
45 | println(" > " + r)
46 | })
47 | println(" Finished in " + (System.currentTimeMillis() - startTime))
48 | } catch {
49 | case e: Throwable => {
50 | println(" > Query '" + input + "' failed.")
51 | e.printStackTrace()
52 | }
53 | }
54 | }
55 |
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/cdc/ApplyNewRecordRunnable.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.cdc
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util.concurrent.atomic.AtomicInteger
5 |
6 | import org.kududb.client.{Operation, PartialRow, KuduClient}
7 | import org.kududb.spark.demo.gamer.GamerEvent
8 |
9 | class ApplyNewRecordRunnable(val gameEvent: GamerEvent,
10 | val kuduClient: KuduClient,
11 | val tableName: String,
12 | val leftToRun:AtomicInteger) extends Runnable{
13 | override def run(): Unit = {
14 | val table = kuduClient.openTable(tableName)
15 | val session = kuduClient.newSession()
16 | val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy")
17 |
18 | val record = gameEvent
19 |
20 | val pr = new PartialRow(table.getSchema)
21 | pr.addString(0, record.gamerId)
22 | pr.addString(1, "")
23 | val scannerRows = kuduClient.newScannerBuilder(table).lowerBound(pr).limit(1).build().nextRows()
24 | val op:Operation = if (scannerRows.hasNext) {
25 | println(" >> had next")
26 | val oldRow = scannerRows.next()
27 |
28 | val oldRecordUpdateOp = table.newInsert()
29 |
30 | val row = oldRecordUpdateOp.getRow
31 | row.addString("gamer_id", oldRow.getString("gamer_id"))
32 | row.addString("eff_to", simpleDateFormat.format(record.lastTimePlayed))
33 | row.addString("eff_from", oldRow.getString("eff_from"))
34 | row.addLong("last_time_played", oldRow.getLong("last_time_played"))
35 | row.addInt("games_played", oldRow.getInt("games_played"))
36 | row.addInt("games_won", oldRow.getInt("games_won"))
37 | row.addInt("oks", oldRow.getInt("oks"))
38 | row.addInt("deaths", oldRow.getInt("deaths"))
39 | row.addInt("damage_given", oldRow.getInt("damage_given"))
40 | row.addInt("damage_taken", oldRow.getInt("damage_taken"))
41 | row.addInt("max_oks_in_one_game", oldRow.getInt("max_oks_in_one_game"))
42 | row.addInt("max_deaths_in_one_game", oldRow.getInt("max_deaths_in_one_game"))
43 |
44 | session.apply(oldRecordUpdateOp)
45 | table.newUpdate()
46 | } else {
47 | table.newInsert()
48 | }
49 |
50 | val row = op.getRow
51 | row.addString("gamer_id", record.gamerId)
52 | row.addString("eff_to", "")
53 | row.addString("eff_from", simpleDateFormat.format(record.lastTimePlayed))
54 | row.addLong("last_time_played", record.lastTimePlayed)
55 | row.addInt("games_played", record.gamesPlayed)
56 | row.addInt("games_won", record.gamesWon)
57 | row.addInt("oks", record.oks)
58 | row.addInt("deaths", record.deaths)
59 | row.addInt("damage_given", record.damageGiven)
60 | row.addInt("damage_taken", record.damageTaken)
61 | row.addInt("max_oks_in_one_game", record.maxOksInOneGame)
62 | row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame)
63 |
64 | session.apply(op)
65 |     session.flush()
66 |     session.close()
67 | leftToRun.decrementAndGet()
68 | println(" >> finished Submit")
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/cdc/CreateKuduTable.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.cdc
2 |
3 | import java.util
4 | import java.util.ArrayList
5 |
6 | import org.kududb.{Schema, Type, ColumnSchema}
7 | import org.kududb.ColumnSchema.ColumnSchemaBuilder
8 | import org.kududb.client.{CreateTableOptions, KuduClient}
9 |
10 | object CreateGamerCDCKuduTable {
11 | def main(args:Array[String]): Unit = {
12 | if (args.length == 0) {
13 |       println("{kuduMaster} {tableName} {numberOfBuckets}")
14 | return
15 | }
16 |
17 | val kuduMaster = args(0)
18 | val tableName = args(1)
19 | val numberOfBuckets = args(2).toInt
20 |
21 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
22 | val columnList = new util.ArrayList[ColumnSchema]()
23 |
24 | columnList.add(new ColumnSchemaBuilder("gamer_id", Type.STRING).key(true).build())
25 | columnList.add(new ColumnSchemaBuilder("eff_to", Type.STRING).key(true).build())
26 | columnList.add(new ColumnSchemaBuilder("eff_from", Type.STRING).key(false).build())
27 | columnList.add(new ColumnSchemaBuilder("last_time_played", Type.INT64).key(false).build())
28 | columnList.add(new ColumnSchemaBuilder("games_played", Type.INT32).key(false).build())
29 | columnList.add(new ColumnSchemaBuilder("games_won", Type.INT32).key(false).build())
30 | columnList.add(new ColumnSchemaBuilder("oks", Type.INT32).key(false).build())
31 | columnList.add(new ColumnSchemaBuilder("deaths", Type.INT32).key(false).build())
32 | columnList.add(new ColumnSchemaBuilder("damage_given", Type.INT32).key(false).build())
33 | columnList.add(new ColumnSchemaBuilder("damage_taken", Type.INT32).key(false).build())
34 | columnList.add(new ColumnSchemaBuilder("max_oks_in_one_game", Type.INT32).key(false).build())
35 | columnList.add(new ColumnSchemaBuilder("max_deaths_in_one_game", Type.INT32).key(false).build())
36 | val schema = new Schema(columnList)
37 |
38 | if (kuduClient.tableExists(tableName)) {
39 | println("Deleting Table")
40 | kuduClient.deleteTable(tableName)
41 | }
42 |
43 | val builder = new CreateTableOptions()
44 |
45 | val hashColumnList = new ArrayList[String]
46 | hashColumnList.add("gamer_id")
47 |
48 | builder.addHashPartitions(hashColumnList, numberOfBuckets)
49 |
50 | println("Creating Table")
51 | kuduClient.createTable(tableName, schema, builder)
52 | println("Created Table")
53 | kuduClient.shutdown()
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/cdc/DirectDataInjector.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.cdc
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util.Random
5 |
6 | import org.kududb.client.{PartialRow, Operation, KuduClient}
7 | import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator
8 |
9 |
10 | object DirectDataInjector {
11 | val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy")
12 | val random = new Random
13 | def main(args:Array[String]): Unit = {
14 |
15 | if (args.length == 0) {
16 |       println("{kuduMaster} {tableName} {numberOfRecords}")
17 | return
18 | }
19 |
20 | val kuduMaster = args(0)
21 | val tableName = args(1)
22 | val numberOfRecords = args(2).toInt
23 |
24 |
25 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
26 | val table = kuduClient.openTable(tableName)
27 | val session = kuduClient.newSession()
28 |
29 |     for (i <- 0 until numberOfRecords) {
30 | val record = GamerDataGenerator.makeNewGamerRecord(100000)
31 |
32 |
33 |
34 |
35 | val pr = new PartialRow(table.getSchema)
36 | pr.addString(0, "record.gamerId")
37 | pr.addString(1, "")
38 | val scannerRows = kuduClient.newScannerBuilder(table).lowerBound(null).limit(1).build().nextRows()
39 | val op:Operation = if (scannerRows.hasNext) {
40 | val oldRow = scannerRows.next()
41 |
42 | val oldRecordUpdateOp = table.newInsert()
43 |
44 | val row = oldRecordUpdateOp.getRow
45 | row.addString("gamer_id", oldRow.getString("gamer_id"))
46 | row.addString("eff_to", simpleDateFormat.format(System.currentTimeMillis()))
47 | row.addString("eff_from", oldRow.getString("eff_from"))
48 | row.addLong("last_time_played", oldRow.getLong("last_time_played"))
49 | row.addInt("games_played", oldRow.getInt("games_played"))
50 | row.addInt("games_won", oldRow.getInt("games_won"))
51 | row.addInt("oks", oldRow.getInt("oks"))
52 | row.addInt("deaths", oldRow.getInt("deaths"))
53 | row.addInt("damage_given", oldRow.getInt("damage_given"))
54 | row.addInt("damage_taken", oldRow.getInt("damage_taken"))
55 | row.addInt("max_oks_in_one_game", oldRow.getInt("max_oks_in_one_game"))
56 | row.addInt("max_deaths_in_one_game", oldRow.getInt("max_deaths_in_one_game"))
57 |
58 | session.apply(oldRecordUpdateOp)
59 | table.newUpdate()
60 | } else {
61 | table.newInsert()
62 | }
63 |
64 | val row = op.getRow
65 | row.addString("gamer_id", record.gamerId)
66 | row.addString("eff_to", "")
67 | row.addString("eff_from", simpleDateFormat.format(System.currentTimeMillis()))
68 | row.addLong("last_time_played", record.lastTimePlayed)
69 | row.addInt("games_played", record.gamesPlayed)
70 | row.addInt("games_won", record.gamesWon)
71 | row.addInt("oks", record.oks)
72 | row.addInt("deaths", record.deaths)
73 | row.addInt("damage_given", record.damageGiven)
74 | row.addInt("damage_taken", record.damageTaken)
75 | row.addInt("max_oks_in_one_game", record.maxOksInOneGame)
76 | row.addInt("max_deaths_in_one_game", record.maxDeathsInOneGame)
77 |
78 | session.apply(op)
79 | }
80 | session.flush()
81 |
82 | kuduClient.close()
83 |
84 |
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/main/scala/org/kududb/spark/demo/gamer/cdc/DirectDataMultiThreadedInjector.scala:
--------------------------------------------------------------------------------
1 | package org.kududb.spark.demo.gamer.cdc
2 |
3 | import java.text.SimpleDateFormat
4 | import java.util.Random
5 | import java.util.concurrent.atomic.AtomicInteger
6 | import java.util.concurrent.{TimeUnit, Executors}
7 |
8 | import org.kududb.client.{Operation, PartialRow, KuduClient}
9 | import org.kududb.spark.demo.gamer.aggregates.GamerDataGenerator
10 |
11 | object DirectDataMultiThreadedInjector {
12 | val simpleDateFormat = new SimpleDateFormat("MM,dd,yyyy")
13 | val random = new Random
14 | def main(args:Array[String]): Unit = {
15 |
16 | if (args.length == 0) {
17 |       println("{kuduMaster} {tableName} {numberOfRecords} {numberOfThreads} {numberOfGamers} {sleepTime}")
18 | return
19 | }
20 |
21 | val kuduMaster = args(0)
22 | val tableName = args(1)
23 | val numberOfRecords = args(2).toInt
24 | val executor = Executors.newFixedThreadPool(args(3).toInt)
25 | val numberOfGamers = args(4).toInt
26 | val sleepTime = args(5).toInt
27 |
28 | val kuduClient = new KuduClient.KuduClientBuilder(kuduMaster).build()
29 | val leftToRun = new AtomicInteger()
30 |
31 |     for (i <- 0 until numberOfRecords) {
32 | leftToRun.incrementAndGet()
33 | executor.execute(new ApplyNewRecordRunnable(GamerDataGenerator.makeNewGamerRecord(numberOfGamers),
34 | kuduClient, tableName, leftToRun))
35 | println("Summited:" + i)
36 |
37 | Thread.sleep(sleepTime)
38 | }
39 |     executor.shutdown()
40 |
41 | val startTime = System.currentTimeMillis()
42 | while (!executor.awaitTermination(10000, TimeUnit.SECONDS)) {
43 | val newTime = System.currentTimeMillis()
44 | println("> Still Waiting: {Time:" + (newTime - startTime) + ", LeftToRun:" + leftToRun + "}" )
45 | }
46 |
47 |
48 | kuduClient.close()
49 |
50 |
51 | }
52 | }
53 |
--------------------------------------------------------------------------------