├── LICENSE
├── ReadMe.md
├── derby.log
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       └── com
    │           └── cloudera
    │               └── sa
    │                   └── spark
    │                       └── cardgenerator
    │                           ├── CardDataGenerator.scala
    │                           └── CardDataNester.scala
    └── test
        └── scala
            ├── com
            │   └── cloudera
            │       └── sa
            │           └── spark
            │               └── unittest
            │                   ├── core
            │                   │   └── CoreUnitTest.scala
            │                   ├── sql
            │                   │   ├── MakingNestedTableTest.scala
            │                   │   ├── NestedTableTest.scala
            │                   │   └── SqlUnitTest.scala
            │                   └── streaming
            │                       └── StreamingUnitTest.scala
            └── org
                └── apache
                    └── spark
                        └── streaming
                            └── TestableQueueInputDStream.scala
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Spark Unit Test Examples
2 |
3 | This repository contains unit test examples for Spark Core, Spark SQL, and Spark Streaming.
4 |
5 | This is by no means the only way to unit test Spark; it is simply meant as a guide for training.
6 |
7 | ## Running
8 | Simply load this project into your IDE and execute the test classes.
9 |
10 | Be sure to add the following JVM parameters:
11 | -Xmx1536m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m
12 |
13 | Or simply run `mvn test`.
--------------------------------------------------------------------------------
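
Every suite in this repository follows the same setup/teardown pattern: a `local[2]` SparkContext is built once in `beforeAll` (with compression disabled so the tests need no native codecs) and stopped in `afterAll`. A minimal sketch of that shared pattern, with a hypothetical suite name and test body:

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, FunSuite}

// Hypothetical suite illustrating the pattern the tests below share.
class ExampleSparkSuite extends FunSuite with BeforeAndAfterAll {

  @transient var sc: SparkContext = null

  override def beforeAll(): Unit = {
    val conf = new SparkConf()
      .set("spark.broadcast.compress", "false")
      .set("spark.shuffle.compress", "false")
      .set("spark.shuffle.spill.compress", "false")
      .set("spark.io.compression.codec", "lzf") // pure-Java codec, no native libraries needed
    sc = new SparkContext("local[2]", "unit test", conf)
  }

  override def afterAll(): Unit = sc.stop()

  test("count a small RDD") {
    assert(sc.parallelize(1 to 10).count() == 10)
  }
}
```
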
/derby.log:
--------------------------------------------------------------------------------
1 | ----------------------------------------------------------------
2 | Wed Jul 20 09:37:05 EDT 2016:
3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.11.1.1 - (1616546): instance a816c00e-0156-0886-2944-00000c39e680
4 | on database directory /private/var/folders/qt/cn_zyr3d4t75mpmf2rtp11600000gp/T/spark-12e2b97e-e05a-4406-80e6-6122732ab376/metastore with class loader sun.misc.Launcher$AppClassLoader@58d25a40
5 | Loaded from file:/Users/ted.malaska/.m2/repository/org/apache/derby/derby/10.11.1.1/derby-10.11.1.1.jar
6 | java.vendor=Oracle Corporation
7 | java.runtime.version=1.8.0_91-b14
8 | user.dir=/Users/ted.malaska/Documents/workspace/github/SparkUnitTestingExamples
9 | os.name=Mac OS X
10 | os.arch=x86_64
11 | os.version=10.11.3
12 | derby.system.home=null
13 | Database Class Loader started - derby.database.classpath=''
14 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.cloudera.sa
8 | SparkUnitTestExamples
9 | 1.0-SNAPSHOT
10 |
11 |
12 | 1.5.0-cdh5.5.0-SNAPSHOT
13 | 2.10.4
14 | 2.10
15 | 4.12
16 | ${project.basedir}/..
17 |
18 |
19 |
20 |
21 |
22 | cloudera-repo
23 | Cloudera Repository
24 | https://repository.cloudera.com/artifactory/cloudera-repos
25 |
26 |
27 |
28 |
29 |
30 | org.scala-lang
31 | scala-library
32 | ${scala.version}
33 |
34 |
35 | org.apache.spark
36 | spark-core_${scala.binary.version}
37 | ${spark.version}
38 |
39 |
40 |
41 |
42 | org.scala-lang
43 | scala-library
44 |
45 |
46 |
47 | org.scala-lang
48 | scalap
49 |
50 |
51 |
52 |
53 | org.apache.spark
54 | spark-sql_${scala.binary.version}
55 | ${spark.version}
56 |
57 |
58 |
59 | org.apache.spark
60 | spark-hive_${scala.binary.version}
61 | ${spark.version}
62 |
63 |
64 |
65 | org.apache.spark
66 | spark-mllib_${scala.binary.version}
67 | ${spark.version}
68 |
69 |
70 |
71 | org.apache.spark
72 | spark-streaming_${scala.binary.version}
73 | ${spark.version}
74 |
75 |
76 | org.apache.spark
77 | spark-streaming_${scala.binary.version}
78 | ${spark.version}
79 | test-jar
80 | tests
81 | test
82 |
83 |
84 | junit
85 | junit
86 | ${junit.version}
87 | test
88 |
89 |
90 | org.scalatest
91 | scalatest_${scala.binary.version}
92 | 2.2.4
93 | test
94 |
95 |
96 |
97 |
98 |
99 |
100 | org.apache.maven.plugins
101 | maven-compiler-plugin
102 | 3.3
103 |
104 | 1.8
105 | 1.8
106 |
107 |
108 |
109 |
110 | net.alchim31.maven
111 | scala-maven-plugin
112 | 3.2.0
113 |
114 | UTF-8
115 | ${scala.version}
116 |
117 |
118 |
119 | scala-compile-first
120 | process-resources
121 |
122 | add-source
123 | compile
124 |
125 |
126 |
127 | scala-test-compile
128 | process-test-resources
129 |
130 | testCompile
131 |
132 |
133 |
134 |
135 |
136 |
137 | org.scalatest
138 | scalatest-maven-plugin
139 | 1.0
140 |
141 | ${project.build.directory}/surefire-reports
142 | .
143 | WDF TestSuite.txt
144 | false
145 |
146 |
147 |
148 | test
149 | test
150 |
151 | test
152 |
153 |
154 | true
155 |
156 |
157 |
158 | integration-test
159 | integration-test
160 |
161 | test
162 |
163 |
164 | Integration-Test
165 |
166 | -Xmx1536m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m
167 |
168 | false
169 |
170 |
171 |
172 |
173 |
174 | org.apache.maven.plugins
175 | maven-shade-plugin
176 | 2.2
177 |
178 | false
179 | target/KuduSpark.jar
180 |
181 |
182 | *:*
183 |
184 |
185 |
186 |
187 | *:*
188 |
189 | META-INF/*.SF
190 | META-INF/*.DSA
191 | META-INF/*.RSA
192 |
193 |
194 |
195 |
196 |
197 |
198 | package
199 |
200 | shade
201 |
202 |
203 |
204 |
206 |
208 | reference.conf
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
--------------------------------------------------------------------------------
/src/main/scala/com/cloudera/sa/spark/cardgenerator/CardDataGenerator.scala:
--------------------------------------------------------------------------------
1 | package com.cloudera.sa.spark.cardgenerator
2 |
3 | import java.util.Random
4 |
5 | import org.apache.spark.rdd.RDD
6 | import org.apache.spark.sql.Row
7 | import org.apache.spark.sql.hive.HiveContext
8 | import org.apache.spark.{SparkConf, SparkContext}
9 |
10 | import scala.collection.mutable
11 |
12 |
13 | object CardDataGenerator {
14 | def main(args:Array[String]): Unit = {
15 |
16 | if (args.length < 8) {
17 | println("Usage: CardDataGenerator <runLocal> " +
18 | "<accountTable> " +
19 | "<cardTable> " +
20 | "<transTable> " +
21 | "<numOfAccounts> " +
22 | "<numOfCards> " +
23 | "<numOfTrans> " +
24 | "<numOfPartitionWriters>")
25 | return
26 | }
27 |
28 | val runLocal = args(0).equalsIgnoreCase("l")
29 | val accountTable = args(1)
30 | val cardTable = args(2)
31 | val transTable = args(3)
32 | val numOfAccounts = args(4).toInt
33 | val numOfCards = args(5).toInt
34 | val numOfTrans = args(6).toInt
35 | val numOfPartitionWriters = args(7).toInt
36 |
37 | val sc: SparkContext = if (runLocal) {
38 | val sparkConfig = new SparkConf()
39 | sparkConfig.set("spark.broadcast.compress", "false")
40 | sparkConfig.set("spark.shuffle.compress", "false")
41 | sparkConfig.set("spark.shuffle.spill.compress", "false")
42 | new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
43 | } else {
44 | val sparkConf = new SparkConf().setAppName("Spark Data Generator")
45 | new SparkContext(sparkConf)
46 | }
47 |
48 | val hiveContext = new HiveContext(sc)
49 |
50 | println("-----------------------------")
51 | println("Generate Account Data")
52 | println("-----------------------------")
53 | val numOfAccountWriters = (numOfPartitionWriters * (numOfAccounts.toDouble/numOfTrans.toDouble)).toInt + 1
54 | val accountPartitions = sc.parallelize((1 to numOfAccountWriters).toSeq, numOfAccountWriters)
55 | generateAccountData(accountTable, numOfAccounts, numOfAccountWriters, hiveContext, accountPartitions)
56 |
57 | println("-----------------------------")
58 | println("Generate Card Data")
59 | println("-----------------------------")
60 | val numOfCardWriters = (numOfPartitionWriters * (numOfCards.toDouble/numOfTrans.toDouble)).toInt + 1
61 | val cardPartitions = sc.parallelize((1 to numOfCardWriters).toSeq, numOfCardWriters)
62 | generateCardData(cardTable, numOfAccounts, numOfCards, numOfCardWriters, hiveContext, cardPartitions)
63 |
64 | println("-----------------------------")
65 | println("Generate Tran Data")
66 | println("-----------------------------")
67 | val tranPartitions = sc.parallelize((1 to numOfPartitionWriters).toSeq, numOfPartitionWriters)
68 | generateTranData(transTable, numOfCards, numOfTrans, numOfPartitionWriters, hiveContext, tranPartitions)
69 |
70 | }
71 |
72 | def generateAccountData(accountTable: String, numOfAccounts: Int, numOfPartitionWriters: Int, hiveContext: HiveContext, partitions: RDD[Int]): Unit = {
73 | val accountRDD = partitions.flatMap(r => {
74 | val mutableList = new mutable.MutableList[Row]
75 | val loops = numOfAccounts / numOfPartitionWriters
76 | val random = new Random()
77 | for (i <- 0 until loops) {
78 | mutableList += Row(i.toLong + r.toLong * loops, haiku(random), haiku(random), random.nextInt(120))
79 | }
80 | mutableList.toSeq
81 | })
82 |
83 | hiveContext.sql("create table " + accountTable + " (" +
84 | "account_id BIGINT," +
85 | "first_name STRING," +
86 | "last_name STRING," +
87 | "age INT)" +
88 | "stored as parquet ")
89 |
90 | val emptyAccountDF = hiveContext.sql("select * from " + accountTable + " limit 0")
91 | hiveContext.createDataFrame(accountRDD, emptyAccountDF.schema).registerTempTable("accountTmp")
92 | hiveContext.sql("insert into " + accountTable + " select * from accountTmp")
93 |
94 | hiveContext.sql("select * from " + accountTable + " limit 100").take(100).foreach(println)
95 | }
96 |
97 | def generateCardData(cardTable: String, numOfAccounts:Int, numOfCards: Int, numOfPartitionWriters: Int, hiveContext: HiveContext, partitions: RDD[Int]): Unit = {
98 | val accountRDD = partitions.flatMap(r => {
99 | val mutableList = new mutable.MutableList[Row]
100 | val loops = numOfCards / numOfPartitionWriters
101 | val random = new Random()
102 | for (i <- 0 until loops) {
103 | mutableList += Row(i.toLong + r.toLong * loops, random.nextInt(numOfAccounts).toLong, 2000 + random.nextInt(20), random.nextInt(12))
104 | }
105 | mutableList.toSeq
106 | })
107 |
108 | hiveContext.sql("create table " + cardTable + " (" +
109 | "card_id BIGINT, " +
110 | "account_id BIGINT, " +
111 | "exp_year INT, " +
112 | "exp_month INT)" +
113 | "stored as parquet ")
114 |
115 | val emptyAccountDF = hiveContext.sql("select * from " + cardTable + " limit 0")
116 | hiveContext.createDataFrame(accountRDD, emptyAccountDF.schema).registerTempTable("cardTmp")
117 | hiveContext.sql("insert into " + cardTable + " select * from cardTmp")
118 |
119 | hiveContext.sql("select * from " + cardTable + " limit 100").take(100).foreach(println)
120 | }
121 |
122 | def generateTranData(transTable: String, numOfCards: Int, numOfTrans:Int, numOfPartitionWriters: Int, hiveContext: HiveContext, partitions: RDD[Int]): Unit = {
123 | val accountRDD = partitions.flatMap(r => {
124 | val mutableList = new mutable.MutableList[Row]
125 | val loops = numOfTrans / numOfPartitionWriters
126 |
127 | val now = System.currentTimeMillis()
128 | val random = new Random()
129 | for (i <- 0 until loops) {
130 |
131 | mutableList += Row(i.toLong + r.toLong * loops, random.nextInt(numOfCards).toLong, now + i * 60000l + random.nextInt(1000), random.nextInt(1000), random.nextInt(100000).toLong)
132 | }
133 | mutableList.toSeq
134 | })
135 |
136 | hiveContext.sql("create table " + transTable + " (" +
137 | "tran_id BIGINT, " +
138 | "card_id BIGINT, " +
139 | "time_stamp BIGINT," +
140 | "amount INT," +
141 | "merchant_id BIGINT)" +
142 | "stored as parquet ")
143 |
144 | val emptyAccountDF = hiveContext.sql("select * from " + transTable + " limit 0")
145 | hiveContext.createDataFrame(accountRDD, emptyAccountDF.schema).registerTempTable("transTmp")
146 | hiveContext.sql("insert into " + transTable + " select * from transTmp")
147 |
148 | hiveContext.sql("select * from " + transTable + " limit 100").take(100).foreach(println)
149 | }
150 |
151 | val adjs = List("autumn", "hidden", "bitter", "misty", "silent",
152 | "reckless", "daunting", "short", "rising", "strong", "timber", "tumbling",
153 | "silver", "dusty", "celestial", "cosmic", "crescent", "double", "far",
154 | "terrestrial", "huge", "deep", "epic", "titanic", "mighty", "powerful")
155 |
156 | val nouns = List("waterfall", "river", "breeze", "moon", "rain",
157 | "wind", "sea", "morning", "snow", "lake", "sunset", "pine", "shadow", "leaf",
158 | "sequoia", "cedar", "wrath", "blessing", "spirit", "nova", "storm", "burst",
159 | "giant", "elemental", "throne", "game", "weed", "stone", "apogee", "bang")
160 |
161 | def getRandElt[A](xs: List[A], random:Random): A = xs.apply(random.nextInt(xs.size))
162 |
163 | def getRandNumber(ra: Range, random:Random): String = {
164 | (ra.head + random.nextInt(ra.end - ra.head)).toString
165 | }
166 |
167 | def haiku(random: Random): String = {
168 | val xs = getRandNumber(1000 to 9999, random) :: List(nouns, adjs).map(l => getRandElt(l, random))
169 | xs.reverse.mkString("-")
170 | }
171 | }
172 |
--------------------------------------------------------------------------------
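
CardDataGenerator expects eight positional arguments, read in the order shown at the top of `main`. A hypothetical local invocation (table names and row counts here are illustrative, not from the repository):

```scala
// Illustrative only: runs the generator locally against a Hive metastore.
CardDataGenerator.main(Array(
  "l",        // runLocal ("l" = local mode)
  "account",  // accountTable
  "card",     // cardTable
  "trans",    // transTable
  "1000",     // numOfAccounts
  "2000",     // numOfCards
  "10000",    // numOfTrans
  "2"))       // numOfPartitionWriters
```
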
/src/main/scala/com/cloudera/sa/spark/cardgenerator/CardDataNester.scala:
--------------------------------------------------------------------------------
1 | package com.cloudera.sa.spark.cardgenerator
2 |
3 | import org.apache.spark.sql.Row
4 | import org.apache.spark.sql.hive.HiveContext
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | object CardDataNester {
8 | def main(args:Array[String]): Unit = {
9 | if (args.length < 5) {
10 | println("Usage: CardDataNester <runLocal> " +
11 | "<accountTable> " +
12 | "<cardTable> " +
13 | "<transTable> " +
14 | "<nestedTableName>")
15 | return
16 | }
17 |
18 | val runLocal = args(0).equalsIgnoreCase("l")
19 | val accountTable = args(1)
20 | val cardTable = args(2)
21 | val transTable = args(3)
22 | val nestedTableName = args(4)
23 |
24 | val sc: SparkContext = if (runLocal) {
25 | val sparkConfig = new SparkConf()
26 | sparkConfig.set("spark.broadcast.compress", "false")
27 | sparkConfig.set("spark.shuffle.compress", "false")
28 | sparkConfig.set("spark.shuffle.spill.compress", "false")
29 | new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig)
30 | } else {
31 | val sparkConf = new SparkConf().setAppName("Spark Data Generator")
32 | new SparkContext(sparkConf)
33 | }
34 |
35 | val hc = new HiveContext(sc)
36 |
37 | val transTableDF = hc.sql("select * from " + transTable)
38 |
39 | val transGroupByRDD = transTableDF.map(r => {
40 | (r.getLong(r.fieldIndex("card_id")), r)
41 | }).groupByKey()
42 |
43 | val cardTableDF = hc.sql("select * from " + cardTable)
44 |
45 | val nestedCardRDD = cardTableDF.map(r => {
46 | (r.getLong(r.fieldIndex("card_id")), r)
47 | }).join(transGroupByRDD).map(r => {
48 | val card = r._2._1
49 | val trans = r._2._2.map(t => {
50 | Row(
51 | t.getLong(t.fieldIndex("tran_id")),
52 | t.getLong(t.fieldIndex("time_stamp")),
53 | t.getInt(t.fieldIndex("amount")),
54 | t.getLong(t.fieldIndex("merchant_id")))
55 | })
56 |
57 | (card.getLong(card.fieldIndex("account_id")),
58 | Row(
59 | card.getLong(card.fieldIndex("card_id")),
60 | card.getInt(card.fieldIndex("exp_year")),
61 | card.getInt(card.fieldIndex("exp_month")),
62 | trans))
63 | }).groupByKey()
64 |
65 | val accountTableDF = hc.sql("select * from " + accountTable)
66 |
67 | val nestedAccountRdd = accountTableDF.map(r => {
68 | (r.getLong(r.fieldIndex("account_id")), r)
69 | }).join(nestedCardRDD).map(r => {
70 | val account = r._2._1
71 | Row(
72 | account.getLong(account.fieldIndex("account_id")),
73 | account.getString(account.fieldIndex("first_name")),
74 | account.getString(account.fieldIndex("last_name")),
75 | account.getInt(account.fieldIndex("age")),
76 | r._2._2.toSeq
77 | )
78 | })
79 |
80 | hc.sql("create table " + nestedTableName + "(" +
81 | " account_id BIGINT," +
82 | " first_name STRING," +
83 | " last_name STRING," +
84 | " age INT," +
85 | " card ARRAY>" +
95 | " >>" +
96 | ") stored as parquet")
97 |
98 | val emptyNestedDf = hc.sql("select * from " + nestedTableName + " limit 0")
99 |
100 | hc.createDataFrame(nestedAccountRdd, emptyNestedDf.schema).registerTempTable("nestedTmp")
101 |
102 | hc.sql("insert into " + nestedTableName + " select * from nestedTmp")
103 |
104 | sc.stop()
105 | }
106 | }
107 |
--------------------------------------------------------------------------------
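
Once the nested table exists, one way to read it back is to flatten the card array with HiveQL's LATERAL VIEW explode. A sketch, assuming the nested table was created as `nested_accounts` and `hc` is the HiveContext from above:

```scala
// Flatten one level of nesting: one output row per (account, card) pair.
val flattened = hc.sql(
  "select account_id, first_name, c.card_id, c.exp_year, c.exp_month " +
  "from nested_accounts LATERAL VIEW explode(card) cards AS c")
flattened.show()
```
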
/src/test/scala/com/cloudera/sa/spark/unittest/core/CoreUnitTest.scala:
--------------------------------------------------------------------------------
1 | package com.cloudera.sa.spark.unittest.core
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
5 |
6 | import scala.collection.mutable
7 |
8 | class CoreUnitTest extends FunSuite with
9 | BeforeAndAfterEach with BeforeAndAfterAll{
10 |
11 | @transient var sc: SparkContext = null
12 |
13 | override def beforeAll(): Unit = {
14 |
15 | val envMap = Map[String,String](("Xmx", "512m"))
16 |
17 | val sparkConfig = new SparkConf()
18 | sparkConfig.set("spark.broadcast.compress", "false")
19 | sparkConfig.set("spark.shuffle.compress", "false")
20 | sparkConfig.set("spark.shuffle.spill.compress", "false")
21 | sparkConfig.set("spark.io.compression.codec", "lzf")
22 | sc = new SparkContext("local[2]", "unit test", sparkConfig)
23 | }
24 |
25 | override def afterAll(): Unit = {
26 | sc.stop()
27 | }
28 |
29 | test("Test word count") {
30 | val quotesRDD = sc.parallelize(Seq("Courage is not simply one of the virtues, but the form of every virtue at the testing point",
31 | "We have a very active testing community which people don't often think about when you have open source",
32 | "Program testing can be used to show the presence of bugs, but never to show their absence",
33 | "Simple systems are not feasible because they require infinite testing",
34 | "Testing leads to failure, and failure leads to understanding"))
35 |
36 | val wordCountRDD = quotesRDD.flatMap(r => r.split(' ')).
37 | map(r => (r.toLowerCase, 1)).
38 | reduceByKey((a,b) => a + b)
39 |
40 | val wordMap = new mutable.HashMap[String, Int]()
41 | wordCountRDD.take(100).
42 | foreach{case(word, count) => wordMap.put(word, count)}
43 | //Note this is better than foreach(r => wordMap.put(r._1, r._2))
44 |
45 | assert(wordMap.get("to").get == 4, "The word count for 'to' should have been 4 but it was " + wordMap.get("to").get)
46 | assert(wordMap.get("testing").get == 5, "The word count for 'testing' should have been 5 but it was " + wordMap.get("testing").get)
47 | assert(wordMap.get("is").get == 1, "The word count for 'is' should have been 1 but it was " + wordMap.get("is").get)
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
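
The hand-built HashMap in the word-count test can also be replaced by `collectAsMap()` on the pair RDD, as the comment in the test hints. A sketch of the same assertions written that way, assuming the `wordCountRDD` from the test above:

```scala
// Sketch: collectAsMap() pulls the (word, count) pairs straight into a local Map.
val counts: scala.collection.Map[String, Int] = wordCountRDD.collectAsMap()
assert(counts("testing") == 5)
assert(counts("to") == 4)
assert(counts("is") == 1)
```
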
/src/test/scala/com/cloudera/sa/spark/unittest/sql/MakingNestedTableTest.scala:
--------------------------------------------------------------------------------
1 | package com.cloudera.sa.spark.unittest.sql
2 |
3 | import org.apache.spark.sql.Row
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.sql.hive.HiveContext
6 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
7 |
8 | class MakingNestedTableTest extends FunSuite with
9 | BeforeAndAfterEach with BeforeAndAfterAll {
10 |
11 | @transient var sc: SparkContext = null
12 | @transient var hiveContext: HiveContext = null
13 |
14 | override def beforeAll(): Unit = {
15 |
16 | val envMap = Map[String, String](("Xmx", "512m"))
17 |
18 | val sparkConfig = new SparkConf()
19 | sparkConfig.set("spark.broadcast.compress", "false")
20 | sparkConfig.set("spark.shuffle.compress", "false")
21 | sparkConfig.set("spark.shuffle.spill.compress", "false")
22 | sparkConfig.set("spark.io.compression.codec", "lzf")
23 | sc = new SparkContext("local[2]", "unit test", sparkConfig)
24 | hiveContext = new HiveContext(sc)
25 | }
26 |
27 | override def afterAll(): Unit = {
28 | sc.stop()
29 | }
30 |
31 | test("Test table creation and summing of counts") {
32 |
33 | val loanRDD = sc.parallelize(Seq(Row("100", "100000000"),
34 | Row("101", "100000000"),
35 | Row("102", "100000000")))
36 |
37 | val partiesRDD = sc.parallelize(Seq(Row("100", "ted", "42"),
38 | Row("101", "bob", "42"),
39 | Row("101", "cat", "42"),
40 | Row("102", "Jen", "42"),
41 | Row("102", "Jenny", "42"),
42 | Row("102", "Ed", "42")))
43 |
44 | //loan
45 | hiveContext.sql("create table loan (id string, amount string) as parquet")
46 | val emptyLoanDF = hiveContext.sql("select * from loan limit 0;")
47 | val loanDF = hiveContext.createDataFrame(loanRDD, emptyLoanDF.schema)
48 | loanDF.registerTempTable("loanTmp")
49 | hiveContext.sql("insert into loan select * from loanTmp")
50 |
51 | //parties
52 | hiveContext.sql("create table party (loan_id string, name string, age string) as parquet")
53 | val emptyPartyDF = hiveContext.sql("select * from party limit 0;")
54 | val partyDF = hiveContext.createDataFrame(partiesRDD, emptyPartyDF.schema)
55 | partyDF.registerTempTable("partyTmp")
56 | hiveContext.sql("insert into party select * from partyTmp")
57 |
58 | val keyValueParty = hiveContext.sql("select * from party").map(r => {
59 | //Key Value
60 | (r.getString(r.fieldIndex("loan_id")), Seq(r))
61 | }).reduceByKey((a, b) => {
62 | a ++ b
63 | })
64 |
65 | val keyValueLoan = hiveContext.sql("select * from loan").map(r => {
66 | //Key Value
67 | (r.getString(r.fieldIndex("id")), r.getString(r.fieldIndex("amount")))
68 | })
69 |
70 | val nestedRDD = keyValueLoan.join(keyValueParty).map(r => {
71 | val loanId = r._1
72 | val loanAmount = r._2._1
73 | val seqOfParties = r._2._2.map(r => {
74 | Row(r.getString(r.fieldIndex("name")),
75 | r.getString(r.fieldIndex("age")))
76 | })
77 |
78 | Row(loanId, loanAmount, seqOfParties)
79 | })
80 |
81 | hiveContext.sql("create table nested (" +
82 | "loan_id string, " +
83 | "amount string, " +
84 | "party >" +
87 | ") as parquet")
88 |
89 | val emptyNestedDF = hiveContext.sql("select * from nested limit 0")
90 | val nestedDF = hiveContext.createDataFrame(nestedRDD, emptyNestedDF.schema)
91 | nestedDF.registerTempTable("nestedTmp")
92 | hiveContext.sql("insert into nested select * from nestedTmp")
93 |
94 |
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
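
The test above builds the nested table but stops short of asserting anything about it. A sketch of one possible check, grounded in the sample data (loan 102 has three parties):

```scala
// Sketch: read the nested table back and check the party array sizes.
val rows = hiveContext.sql("select * from nested").collect()
assert(rows.length == 3)
val loan102 = rows.find(_.getString(0) == "102").get
assert(loan102.getSeq[Row](2).size == 3) // Jen, Jenny, Ed
```
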
/src/test/scala/com/cloudera/sa/spark/unittest/sql/NestedTableTest.scala:
--------------------------------------------------------------------------------
1 | package com.cloudera.sa.spark.unittest.sql
2 |
3 | import org.apache.spark.sql.Row
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.sql.hive.HiveContext
6 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
7 |
8 | class NestedTableTest extends FunSuite with
9 | BeforeAndAfterEach with BeforeAndAfterAll {
10 |
11 | @transient var sc: SparkContext = null
12 | @transient var hiveContext: HiveContext = null
13 |
14 | override def beforeAll(): Unit = {
15 |
16 | val envMap = Map[String, String](("Xmx", "512m"))
17 |
18 | val sparkConfig = new SparkConf()
19 | sparkConfig.set("spark.broadcast.compress", "false")
20 | sparkConfig.set("spark.shuffle.compress", "false")
21 | sparkConfig.set("spark.shuffle.spill.compress", "false")
22 | sparkConfig.set("spark.io.compression.codec", "lzf")
23 | sc = new SparkContext("local[2]", "unit test", sparkConfig)
24 | hiveContext = new HiveContext(sc)
25 | }
26 |
27 | override def afterAll(): Unit = {
28 | sc.stop()
29 | }
30 |
31 | test("Test table creation and summing of counts") {
32 | /*
33 | {
34 | "id": "0001",
35 | "type": "donut",
36 | "name": "Cake",
37 | "ppu": 0.55,
38 | "batters":
39 | {
40 | "batter":
41 | [
42 | { "id": "1001", "type": "Regular" },
43 | { "id": "1002", "type": "Chocolate" },
44 | { "id": "1003", "type": "Blueberry" },
45 | { "id": "1004", "type": "Devil's Food" }
46 | ]
47 | },
48 | "topping":
49 | [
50 | { "id": "5001", "type": "None" },
51 | { "id": "5002", "type": "Glazed" },
52 | { "id": "5005", "type": "Sugar" },
53 | { "id": "5007", "type": "Powdered Sugar" },
54 | { "id": "5006", "type": "Chocolate with Sprinkles" },
55 | { "id": "5003", "type": "Chocolate" },
56 | { "id": "5004", "type": "Maple" }
57 | ]
58 | }
59 | */
60 |
61 | val jsonRDD = sc.parallelize(Seq("{\"id\": \"0001\",\"type\": \"donut\",\"name\": \"Cake\",\"ppu\": 0.55,\"batters\":{\"batter\":[{ \"id\": \"1001\", \"type\": \"Regular\" },{ \"id\": \"1002\", \"type\": \"Chocolate\" },{ \"id\": \"1003\", \"type\": \"Blueberry\" },{ \"id\": \"1004\", \"type\": \"Devil's Food\" }]},\"topping\":[{ \"id\": \"5001\", \"type\": \"None\" },{ \"id\": \"5002\", \"type\": \"Glazed\" },{ \"id\": \"5005\", \"type\": \"Sugar\" },{ \"id\": \"5007\", \"type\": \"Powdered Sugar\" },{ \"id\": \"5006\", \"type\": \"Chocolate with Sprinkles\" },{ \"id\": \"5003\", \"type\": \"Chocolate\" },{ \"id\": \"5004\", \"type\": \"Maple\" }]}"))
62 |
63 | val jsonDF = hiveContext.read.json(jsonRDD)
64 |
65 | jsonDF.foreach(row => {
66 | println(row)
67 | })
68 |
69 | jsonDF.write.parquet("./parquet")
70 |
71 | hiveContext.createExternalTable("jsonNestedTable", "./parquet")
72 |
73 | println(jsonDF.schema)
74 |
75 | hiveContext.sql("select * from jsonNestedTable").foreach(row => {
76 | println(row)
77 | })
78 |
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
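
Beyond `select *`, the nested JSON fields in `jsonNestedTable` can be addressed with dot and index notation in the SQL itself; a small sketch of such a query:

```scala
// Sketch: pull individual nested fields out of the JSON-derived table.
hiveContext.sql(
  "select name, ppu, batters.batter[0].type as first_batter, topping[1].type as second_topping " +
  "from jsonNestedTable").collect().foreach(println)
```
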
/src/test/scala/com/cloudera/sa/spark/unittest/sql/SqlUnitTest.scala:
--------------------------------------------------------------------------------
1 | package com.cloudera.sa.spark.unittest.sql
2 |
3 | import org.apache.spark.sql.Row
4 | import org.apache.spark.sql.hive.HiveContext
5 | import org.apache.spark.{SparkConf, SparkContext}
6 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
7 |
8 | import scala.collection.mutable
9 |
10 | class SqlUnitTest extends FunSuite with
11 | BeforeAndAfterEach with BeforeAndAfterAll{
12 |
13 | @transient var sc: SparkContext = null
14 | @transient var hiveContext: HiveContext = null
15 |
16 | override def beforeAll(): Unit = {
17 |
18 | val envMap = Map[String,String](("Xmx", "512m"))
19 |
20 | val sparkConfig = new SparkConf()
21 | sparkConfig.set("spark.broadcast.compress", "false")
22 | sparkConfig.set("spark.shuffle.compress", "false")
23 | sparkConfig.set("spark.shuffle.spill.compress", "false")
24 | sparkConfig.set("spark.io.compression.codec", "lzf")
25 | sc = new SparkContext("local[2]", "unit test", sparkConfig)
26 | hiveContext = new HiveContext(sc)
27 | }
28 |
29 | override def afterAll(): Unit = {
30 | sc.stop()
31 | }
32 |
33 | test("Test table creation and summing of counts") {
34 | val personRDD = sc.parallelize(Seq(Row("ted", 42, "blue"),
35 | Row("tj", 11, "green"),
36 | Row("andrew", 9, "green")))
37 |
38 | hiveContext.sql("create table person (name string, age int, color string)")
39 |
40 | val emptyDataFrame = hiveContext.sql("select * from person limit 0")
41 |
42 | val personDataFrame = hiveContext.createDataFrame(personRDD, emptyDataFrame.schema)
43 | personDataFrame.registerTempTable("tempPerson")
44 |
45 | val ageSumDataFrame = hiveContext.sql("select sum(age) from tempPerson")
46 |
47 | val localAgeSum = ageSumDataFrame.take(10)
48 |
49 | assert(localAgeSum(0).get(0) == 62, "The sum of age should equal 62 but it equaled " + localAgeSum(0).get(0))
50 | }
51 | }
--------------------------------------------------------------------------------
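
The same temp table supports richer assertions; a sketch of a group-by check against the three sample rows:

```scala
// Sketch: group the sample people by color and assert on the counts.
val colorCounts = hiveContext.sql("select color, count(*) from tempPerson group by color")
  .collect()
  .map(r => r.getString(0) -> r.getLong(1))
  .toMap
assert(colorCounts("green") == 2)
assert(colorCounts("blue") == 1)
```
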
/src/test/scala/com/cloudera/sa/spark/unittest/streaming/StreamingUnitTest.scala:
--------------------------------------------------------------------------------
1 | package com.cloudera.sa.spark.unittest.streaming
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.streaming._
5 | import org.apache.spark.streaming.dstream.DStream
6 | import org.apache.spark.{SparkConf, SparkContext}
7 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
8 |
9 | import scala.collection.mutable.Queue
10 |
11 | class StreamingUnitTest extends FunSuite with
12 | BeforeAndAfterEach with BeforeAndAfterAll{
13 |
14 | @transient var sc: SparkContext = null
15 | @transient var ssc: StreamingContext = null
16 |
17 | override def beforeAll(): Unit = {
18 |
19 | val envMap = Map[String,String](("Xmx", "512m"))
20 |
21 | val sparkConfig = new SparkConf()
22 | sparkConfig.set("spark.broadcast.compress", "false")
23 | sparkConfig.set("spark.shuffle.compress", "false")
24 | sparkConfig.set("spark.shuffle.spill.compress", "false")
25 | sparkConfig.set("spark.io.compression.codec", "lzf")
26 | sc = new SparkContext("local[2]", "unit test", sparkConfig)
27 | ssc = new StreamingContext(sc, Milliseconds(200))
28 | }
29 |
30 | override def afterAll(): Unit = {
31 | sc.stop()
32 | }
33 |
34 | test("Streaming word count") {
35 |
36 | val firstBatchRDD = sc.parallelize(Seq("a", "b", "c"))
37 | val secondBatchRDD = sc.parallelize(Seq("a", "e"))
38 | val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f"))
39 | val fourthBatchRDD = sc.parallelize(Seq("a", "e"))
40 |
41 | val queue = new Queue[RDD[String]]
42 |
43 | queue.+=(firstBatchRDD)
44 | queue.+=(secondBatchRDD)
45 | queue.+=(thirdBatchRDD)
46 | queue.+=(fourthBatchRDD)
47 |
48 | println(queue)
49 |
50 | val startTime = System.currentTimeMillis()
51 |
52 | val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1))
53 | //ssc.queueStream(queue)
54 |
55 | dstream.checkpoint(Seconds(100))
56 |
57 | val batchTotals:DStream[(String, Int)] = dstream.map(r => (r, 1)).reduceByKey(_ + _)
58 |
59 | val streamTotals = batchTotals.updateStateByKey(
60 | (seq:Seq[Int], opt:Option[Int]) => {
61 | if (!seq.isEmpty) {
62 | val totalCountForNew = seq.reduce(_ + _)
63 | if (opt.isEmpty) {
64 | Option(totalCountForNew)
65 | } else {
66 | Option(opt.get + totalCountForNew)
67 | }
68 | } else {
69 | opt
70 | }
71 | })
72 |
73 | streamTotals.foreachRDD(rdd => {
74 |
75 | })
76 |
77 | ssc.checkpoint("./tmp")
78 | ssc.start()
79 | ssc.awaitTerminationOrTimeout(2000)
80 |
81 | val endTime = System.currentTimeMillis()
82 |
83 | val rddList = streamTotals.slice(new Time(startTime), new Time(endTime))
84 |
85 | rddList(0).collect().foreach(println)
86 | assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1)
87 | rddList(1).collect().foreach(println)
88 | assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2 == 2)
89 | rddList(2).collect().foreach(println)
90 | assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2 == 2)
91 | rddList(3).collect().foreach(println)
92 | assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2 == 3)
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
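
TestableQueueInputDStream exists because the stock `ssc.queueStream` returns a QueueInputDStream whose serialization hooks throw, so it cannot be combined with checkpointing, which `updateStateByKey` requires. When no state or checkpointing is involved, the stock API is sufficient; a sketch using the same `queue` and `ssc`:

```scala
// Sketch: per-batch word counts only, so plain queueStream works and no
// checkpoint directory is needed.
val dstream = ssc.queueStream(queue, oneAtATime = true)
val batchTotals = dstream.map(r => (r, 1)).reduceByKey(_ + _)
batchTotals.foreachRDD(rdd => rdd.collect().foreach(println))
ssc.start()
ssc.awaitTerminationOrTimeout(2000)
```
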
/src/test/scala/org/apache/spark/streaming/TestableQueueInputDStream.scala:
--------------------------------------------------------------------------------
1 | package org.apache.spark.streaming
2 |
3 | import java.io.{ObjectInputStream, ObjectOutputStream}
4 |
5 | import org.apache.spark.rdd.{RDD, UnionRDD}
6 | import org.apache.spark.streaming.dstream.InputDStream
7 |
8 | import scala.collection.mutable.{ArrayBuffer, Queue}
9 | import scala.reflect.ClassTag
10 |
11 | class TestableQueueInputDStream[T: ClassTag](
12 | ssc: StreamingContext,
13 | val queue: Queue[RDD[T]],
14 | oneAtATime: Boolean,
15 | defaultRDD: RDD[T]
16 | ) extends InputDStream[T](ssc) {
17 |
18 | override def start() { }
19 |
20 | override def stop() { }
21 |
22 | private def readObject(in: ObjectInputStream): Unit = {
23 | logWarning("queueStream doesn't support checkpointing")
24 | }
25 |
26 | private def writeObject(oos: ObjectOutputStream): Unit = {
27 | logWarning("queueStream doesn't support checkpointing")
28 | }
29 |
30 | override def compute(validTime: Time): Option[RDD[T]] = {
31 | val buffer = new ArrayBuffer[RDD[T]]()
32 | queue.synchronized {
33 | if (oneAtATime && queue.nonEmpty) {
34 | buffer += queue.dequeue()
35 | } else {
36 | buffer ++= queue
37 | queue.clear()
38 | }
39 | }
40 | if (buffer.nonEmpty) {
41 | if (oneAtATime) {
42 | Some(buffer.head)
43 | } else {
44 | Some(new UnionRDD(context.sc, buffer.toSeq))
45 | }
46 | } else if (defaultRDD != null) {
47 | Some(defaultRDD)
48 | } else {
49 | Some(ssc.sparkContext.emptyRDD)
50 | }
51 | }
52 |
53 | }
--------------------------------------------------------------------------------