├── LICENSE ├── ReadMe.md ├── derby.log ├── pom.xml └── src ├── main └── scala │ └── com │ └── cloudera │ └── sa │ └── spark │ └── cardgenerator │ ├── CardDataGenerator.scala │ └── CardDataNester.scala └── test └── scala ├── com └── cloudera │ └── sa │ └── spark │ └── unittest │ ├── core │ └── CoreUnitTest.scala │ ├── sql │ ├── MakingNestedTableTest.scala │ ├── NestedTableTest.scala │ └── SqlUnitTest.scala │ └── streaming │ └── StreamingUnitTest.scala └── org └── apache └── spark └── streaming └── TestableQueueInputDStream.scala /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # Spark Unit Test Examples 2 | 3 | In this github you will find examples for Spark Core, Spark SQL, and Spark Streaming unit test. 4 | 5 | This is by no means the only way to unit test Spark, it is just to be used as a guide for training 6 | 7 | ## Running 8 | Simply load this project into your IDE and execute the test classes. 
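Each test class in `src/test/scala` follows the same shape: a local SparkContext is built in `beforeAll`, the assertions run against small in-memory RDDs, and the context is stopped in `afterAll`, so no cluster is needed. A minimal sketch of that pattern is shown below (the class name and the word-count assertion are illustrative, not part of this project):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, FunSuite}

class ExampleUnitTest extends FunSuite with BeforeAndAfterAll {

  @transient var sc: SparkContext = null

  override def beforeAll(): Unit = {
    // Same local[2] setup the suites in this project use.
    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.io.compression.codec", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
  }

  override def afterAll(): Unit = {
    sc.stop()
  }

  test("count words in a small in-memory RDD") {
    val counts = sc.parallelize(Seq("spark", "test", "spark"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collectAsMap()
    assert(counts("spark") == 2)
  }
}
```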
9 | 10 | Back sure to add the following JVM parameters 11 | -Xmx1536m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m 12 | 13 | Or just use mvn test -------------------------------------------------------------------------------- /derby.log: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------- 2 | Wed Jul 20 09:37:05 EDT 2016: 3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.11.1.1 - (1616546): instance a816c00e-0156-0886-2944-00000c39e680 4 | on database directory /private/var/folders/qt/cn_zyr3d4t75mpmf2rtp11600000gp/T/spark-12e2b97e-e05a-4406-80e6-6122732ab376/metastore with class loader sun.misc.Launcher$AppClassLoader@58d25a40 5 | Loaded from file:/Users/ted.malaska/.m2/repository/org/apache/derby/derby/10.11.1.1/derby-10.11.1.1.jar 6 | java.vendor=Oracle Corporation 7 | java.runtime.version=1.8.0_91-b14 8 | user.dir=/Users/ted.malaska/Documents/workspace/github/SparkUnitTestingExamples 9 | os.name=Mac OS X 10 | os.arch=x86_64 11 | os.version=10.11.3 12 | derby.system.home=null 13 | Database Class Loader started - derby.database.classpath='' 14 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.cloudera.sa 8 | SparkUnitTestExamples 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 1.5.0-cdh5.5.0-SNAPSHOT 13 | 2.10.4 14 | 2.10 15 | 4.12 16 | ${project.basedir}/.. 17 | 18 | 19 | 20 | 21 | 22 | cloudera-repo 23 | Cloudera Repository 24 | https://repository.cloudera.com/artifactory/cloudera-repos 25 | 26 | 27 | 28 | 29 | 30 | org.scala-lang 31 | scala-library 32 | ${scala.version} 33 | 34 | 35 | org.apache.spark 36 | spark-core_${scala.binary.version} 37 | ${spark.version} 38 | 39 | 40 | 41 | 42 | org.scala-lang 43 | scala-library 44 | 45 | 46 | 47 | org.scala-lang 48 | scalap 49 | 50 | 51 | 52 | 53 | org.apache.spark 54 | spark-sql_${scala.binary.version} 55 | ${spark.version} 56 | 57 | 58 | 59 | org.apache.spark 60 | spark-hive_${scala.binary.version} 61 | ${spark.version} 62 | 63 | 64 | 65 | org.apache.spark 66 | spark-mllib_${scala.binary.version} 67 | ${spark.version} 68 | 69 | 70 | 71 | org.apache.spark 72 | spark-streaming_${scala.binary.version} 73 | ${spark.version} 74 | 75 | 76 | org.apache.spark 77 | spark-streaming_${scala.binary.version} 78 | ${spark.version} 79 | test-jar 80 | tests 81 | test 82 | 83 | 84 | junit 85 | junit 86 | ${junit.version} 87 | test 88 | 89 | 90 | org.scalatest 91 | scalatest_${scala.binary.version} 92 | 2.2.4 93 | test 94 | 95 | 96 | 97 | 98 | 99 | 100 | org.apache.maven.plugins 101 | maven-compiler-plugin 102 | 3.3 103 | 104 | 1.8 105 | 1.8 106 | 107 | 108 | 109 | 110 | net.alchim31.maven 111 | scala-maven-plugin 112 | 3.2.0 113 | 114 | UTF-8 115 | ${scala.version} 116 | 117 | 118 | 119 | scala-compile-first 120 | process-resources 121 | 122 | add-source 123 | compile 124 | 125 | 126 | 127 | scala-test-compile 128 | process-test-resources 129 | 130 | testCompile 131 | 132 | 133 | 134 | 135 | 136 | 137 | org.scalatest 138 | scalatest-maven-plugin 139 | 1.0 140 | 141 | ${project.build.directory}/surefire-reports 142 | . 
143 | WDF TestSuite.txt 144 | false 145 | 146 | 147 | 148 | test 149 | test 150 | 151 | test 152 | 153 | 154 | true 155 | 156 | 157 | 158 | integration-test 159 | integration-test 160 | 161 | test 162 | 163 | 164 | Integration-Test 165 | 166 | -Xmx1536m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=512m 167 | 168 | false 169 | 170 | 171 | 172 | 173 | 174 | org.apache.maven.plugins 175 | maven-shade-plugin 176 | 2.2 177 | 178 | false 179 | target/KuduSpark.jar 180 | 181 | 182 | *:* 183 | 184 | 185 | 186 | 187 | *:* 188 | 189 | META-INF/*.SF 190 | META-INF/*.DSA 191 | META-INF/*.RSA 192 | 193 | 194 | 195 | 196 | 197 | 198 | package 199 | 200 | shade 201 | 202 | 203 | 204 | 206 | 208 | reference.conf 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | -------------------------------------------------------------------------------- /src/main/scala/com/cloudera/sa/spark/cardgenerator/CardDataGenerator.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.sa.spark.cardgenerator 2 | 3 | import java.util.Random 4 | 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.Row 7 | import org.apache.spark.sql.hive.HiveContext 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | 10 | import scala.collection.mutable 11 | 12 | 13 | object CardDataGenerator { 14 | def main(args:Array[String]): Unit = { 15 | 16 | if (args.length == 0) { 17 | println(" " + 18 | " " + 19 | " " + 20 | " " + 21 | " " + 22 | " " + 23 | " " + 24 | " ") 25 | return 26 | } 27 | 28 | val runLocal = args(0).equalsIgnoreCase("l") 29 | val accountTable = args(1) 30 | val cardTable = args(2) 31 | val transTable = args(3) 32 | val numOfAccounts = args(4).toInt 33 | val numOfCards = args(5).toInt 34 | val numOfTrans = args(6).toInt 35 | val numOfPartitionWriters = args(7).toInt 36 | 37 | val sc: SparkContext = if (runLocal) { 38 | val sparkConfig = new SparkConf() 39 | sparkConfig.set("spark.broadcast.compress", "false") 40 | sparkConfig.set("spark.shuffle.compress", "false") 41 | sparkConfig.set("spark.shuffle.spill.compress", "false") 42 | new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig) 43 | } else { 44 | val sparkConf = new SparkConf().setAppName("Spark Data Generator") 45 | new SparkContext(sparkConf) 46 | } 47 | 48 | val hiveContext = new HiveContext(sc) 49 | 50 | println("-----------------------------") 51 | println("Generate Account Data") 52 | println("-----------------------------") 53 | val numOfAccountWriters = (numOfPartitionWriters * (numOfCards.toDouble/numOfTrans.toDouble)).toInt + 1 54 | val accountPartitions = sc.parallelize((1 to numOfAccountWriters).toSeq, numOfAccountWriters) 55 | generateAccountData(accountTable, numOfAccounts, numOfAccountWriters, hiveContext, accountPartitions) 56 | 57 | println("-----------------------------") 58 | println("Generate Card Data") 59 | println("-----------------------------") 60 | val numOfCardWriters = (numOfPartitionWriters * (numOfCards.toDouble/numOfTrans.toDouble)).toInt + 1 61 | val cardPartitions = sc.parallelize((1 to numOfCardWriters).toSeq, numOfCardWriters) 62 | generateCardData(cardTable, numOfAccounts, numOfCards, numOfCardWriters, hiveContext, cardPartitions) 63 | 64 | println("-----------------------------") 65 | println("Generate Tran Data") 66 | println("-----------------------------") 67 | val tranPartitions = sc.parallelize((1 to numOfPartitionWriters).toSeq, numOfPartitionWriters) 68 | generateTranData(transTable, numOfCards, numOfTrans, 
numOfPartitionWriters, hiveContext, tranPartitions) 69 | 70 | } 71 | 72 | def generateAccountData(accountTable: String, numOfAccounts: Int, numOfPartitionWriters: Int, hiveContext: HiveContext, partitions: RDD[Int]): Unit = { 73 | val accountRDD = partitions.flatMap(r => { 74 | val mutableList = new mutable.MutableList[Row] 75 | val loops = numOfAccounts / numOfPartitionWriters 76 | val random = new Random() 77 | for (i <- 0 until loops) { 78 | mutableList += Row(i.toLong + r.toLong * loops, haiku(random), haiku(random), random.nextInt(120)) 79 | } 80 | mutableList.toSeq 81 | }) 82 | 83 | hiveContext.sql("create table " + accountTable + " (" + 84 | "account_id BIGINT," + 85 | "first_name STRING," + 86 | "last_name STRING," + 87 | "age INT)" + 88 | "stored as parquet ") 89 | 90 | val emptyAccountDF = hiveContext.sql("select * from " + accountTable + " limit 0") 91 | hiveContext.createDataFrame(accountRDD, emptyAccountDF.schema).registerTempTable("accountTmp") 92 | hiveContext.sql("insert into " + accountTable + " select * from accountTmp") 93 | 94 | hiveContext.sql("select * from " + accountTable + " limit 100").take(100).foreach(println) 95 | } 96 | 97 | def generateCardData(cardTable: String, numOfAccounts:Int, numOfCards: Int, numOfPartitionWriters: Int, hiveContext: HiveContext, partitions: RDD[Int]): Unit = { 98 | val accountRDD = partitions.flatMap(r => { 99 | val mutableList = new mutable.MutableList[Row] 100 | val loops = numOfCards / numOfPartitionWriters 101 | val random = new Random() 102 | for (i <- 0 until loops) { 103 | mutableList += Row(i.toLong + r.toLong * loops, random.nextInt(numOfAccounts).toLong, 2000 + random.nextInt(20), random.nextInt(12)) 104 | } 105 | mutableList.toSeq 106 | }) 107 | 108 | hiveContext.sql("create table " + cardTable + " (" + 109 | "card_id BIGINT, " + 110 | "account_id BIGINT, " + 111 | "exp_year INT, " + 112 | "exp_month INT)" + 113 | "stored as parquet ") 114 | 115 | val emptyAccountDF = hiveContext.sql("select * from " + cardTable + " limit 0") 116 | hiveContext.createDataFrame(accountRDD, emptyAccountDF.schema).registerTempTable("cardTmp") 117 | hiveContext.sql("insert into " + cardTable + " select * from cardTmp") 118 | 119 | hiveContext.sql("select * from " + cardTable + " limit 100").take(100).foreach(println) 120 | } 121 | 122 | def generateTranData(transTable: String, numOfCards: Int, numOfTrans:Int, numOfPartitionWriters: Int, hiveContext: HiveContext, partitions: RDD[Int]): Unit = { 123 | val accountRDD = partitions.flatMap(r => { 124 | val mutableList = new mutable.MutableList[Row] 125 | val loops = numOfTrans / numOfPartitionWriters 126 | 127 | val now = System.currentTimeMillis() 128 | val random = new Random() 129 | for (i <- 0 until loops) { 130 | 131 | mutableList += Row(i.toLong + r.toLong * loops, random.nextInt(numOfCards).toLong, now + i * 60000l + random.nextInt(1000), random.nextInt(1000), random.nextInt(100000).toLong) 132 | } 133 | mutableList.toSeq 134 | }) 135 | 136 | hiveContext.sql("create table " + transTable + " (" + 137 | "tran_id BIGINT, " + 138 | "card_id BIGINT, " + 139 | "time_stamp BIGINT," + 140 | "amount INT," + 141 | "merchant_id BIGINT)" + 142 | "stored as parquet ") 143 | 144 | val emptyAccountDF = hiveContext.sql("select * from " + transTable + " limit 0") 145 | hiveContext.createDataFrame(accountRDD, emptyAccountDF.schema).registerTempTable("transTmp") 146 | hiveContext.sql("insert into " + transTable + " select * from transTmp") 147 | 148 | hiveContext.sql("select * from " + transTable + " limit 
100").take(100).foreach(println) 149 | } 150 | 151 | val adjs = List("autumn", "hidden", "bitter", "misty", "silent", 152 | "reckless", "daunting", "short", "rising", "strong", "timber", "tumbling", 153 | "silver", "dusty", "celestial", "cosmic", "crescent", "double", "far", 154 | "terrestrial", "huge", "deep", "epic", "titanic", "mighty", "powerful") 155 | 156 | val nouns = List("waterfall", "river", "breeze", "moon", "rain", 157 | "wind", "sea", "morning", "snow", "lake", "sunset", "pine", "shadow", "leaf", 158 | "sequoia", "cedar", "wrath", "blessing", "spirit", "nova", "storm", "burst", 159 | "giant", "elemental", "throne", "game", "weed", "stone", "apogee", "bang") 160 | 161 | def getRandElt[A](xs: List[A], random:Random): A = xs.apply(random.nextInt(xs.size)) 162 | 163 | def getRandNumber(ra: Range, random:Random): String = { 164 | (ra.head + random.nextInt(ra.end - ra.head)).toString 165 | } 166 | 167 | def haiku(random: Random): String = { 168 | val xs = getRandNumber(1000 to 9999, random) :: List(nouns, adjs).map(l => getRandElt(l, random)) 169 | xs.reverse.mkString("-") 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/main/scala/com/cloudera/sa/spark/cardgenerator/CardDataNester.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.sa.spark.cardgenerator 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.hive.HiveContext 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | object CardDataNester { 8 | def main(args:Array[String]): Unit = { 9 | if (args.length == 0) { 10 | println(" " + 11 | " " + 12 | " " + 13 | " " + 14 | " ") 15 | return 16 | } 17 | 18 | val runLocal = args(0).equalsIgnoreCase("l") 19 | val accountTable = args(1) 20 | val cardTable = args(2) 21 | val transTable = args(3) 22 | val nestedTableName = args(4) 23 | 24 | val sc: SparkContext = if (runLocal) { 25 | val sparkConfig = new SparkConf() 26 | sparkConfig.set("spark.broadcast.compress", "false") 27 | sparkConfig.set("spark.shuffle.compress", "false") 28 | sparkConfig.set("spark.shuffle.spill.compress", "false") 29 | new SparkContext("local[2]", "TableStatsSinglePathMain", sparkConfig) 30 | } else { 31 | val sparkConf = new SparkConf().setAppName("Spark Data Generator") 32 | new SparkContext(sparkConf) 33 | } 34 | 35 | val hc = new HiveContext(sc) 36 | 37 | val transTableDF = hc.sql("select * from " + transTable) 38 | 39 | val transGroupByRDD = transTableDF.map(r => { 40 | (r.getLong(r.fieldIndex("card_id")), r) 41 | }).groupByKey() 42 | 43 | val cardTableDF = hc.sql("select * from " + cardTable) 44 | 45 | val nestedCardRDD = cardTableDF.map(r => { 46 | (r.getLong(r.fieldIndex("card_id")), r) 47 | }).join(transGroupByRDD).map(r => { 48 | val card = r._2._1 49 | val trans = r._2._2.map(t => { 50 | Row( 51 | t.getLong(t.fieldIndex("tran_id")), 52 | t.getLong(t.fieldIndex("time_stamp")), 53 | t.getInt(t.fieldIndex("amount")), 54 | t.getLong(t.fieldIndex("merchant_id"))) 55 | }) 56 | 57 | (card.getLong(card.fieldIndex("account_id")), 58 | Row( 59 | card.getLong(card.fieldIndex("card_id")), 60 | card.getInt(card.fieldIndex("exp_year")), 61 | card.getInt(card.fieldIndex("exp_month")), 62 | trans)) 63 | }).groupByKey() 64 | 65 | val accountTableDF = hc.sql("select * from " + accountTable) 66 | 67 | val nestedAccountRdd = accountTableDF.map(r => { 68 | (r.getLong(r.fieldIndex("account_id")), r) 69 | }).join(nestedCardRDD).map(r => { 70 | val account = r._2._1 71 | Row( 72 
| account.getLong(account.fieldIndex("account_id")), 73 | account.getString(account.fieldIndex("first_name")), 74 | account.getString(account.fieldIndex("last_name")), 75 | account.getInt(account.fieldIndex("age")), 76 | r._2._2.toSeq 77 | ) 78 | }) 79 | 80 | hc.sql("create table " + nestedTableName + "(" + 81 | " account_id BIGINT," + 82 | " first_name STRING," + 83 | " last_name STRING," + 84 | " age INT," + 85 | " card ARRAY>" + 95 | " >>" + 96 | ") stored as parquet") 97 | 98 | val emptyNestedDf = hc.sql("select * from " + nestedTableName + " limit 0") 99 | 100 | hc.createDataFrame(nestedAccountRdd, emptyNestedDf.schema).registerTempTable("nestedTmp") 101 | 102 | hc.sql("insert into " + nestedTableName + " select * from nestedTmp") 103 | 104 | sc.stop() 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/test/scala/com/cloudera/sa/spark/unittest/core/CoreUnitTest.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.sa.spark.unittest.core 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} 5 | 6 | import scala.collection.mutable 7 | 8 | class CoreUnitTest extends FunSuite with 9 | BeforeAndAfterEach with BeforeAndAfterAll{ 10 | 11 | @transient var sc: SparkContext = null 12 | 13 | override def beforeAll(): Unit = { 14 | 15 | val envMap = Map[String,String](("Xmx", "512m")) 16 | 17 | val sparkConfig = new SparkConf() 18 | sparkConfig.set("spark.broadcast.compress", "false") 19 | sparkConfig.set("spark.shuffle.compress", "false") 20 | sparkConfig.set("spark.shuffle.spill.compress", "false") 21 | sparkConfig.set("spark.io.compression.codec", "lzf") 22 | sc = new SparkContext("local[2]", "unit test", sparkConfig) 23 | } 24 | 25 | override def afterAll(): Unit = { 26 | sc.stop() 27 | } 28 | 29 | test("Test word count") { 30 | val quotesRDD = sc.parallelize(Seq("Courage is not simply one of the virtues, but the form of every virtue at the testing point", 31 | "We have a very active testing community which people don't often think about when you have open source", 32 | "Program testing can be used to show the presence of bugs, but never to show their absence", 33 | "Simple systems are not feasible because they require infinite testing", 34 | "Testing leads to failure, and failure leads to understanding")) 35 | 36 | val wordCountRDD = quotesRDD.flatMap(r => r.split(' ')). 37 | map(r => (r.toLowerCase, 1)). 38 | reduceByKey((a,b) => a + b) 39 | 40 | val wordMap = new mutable.HashMap[String, Int]() 41 | wordCountRDD.take(100). 
42 | foreach{case(word, count) => wordMap.put(word, count)} 43 | //Note this is better then foreach(r => wordMap.put(r._1, r._2) 44 | 45 | assert(wordMap.get("to").get == 4, "The word count for 'to' should had been 4 but it was " + wordMap.get("to").get) 46 | assert(wordMap.get("testing").get == 5, "The word count for 'testing' should had been 5 but it was " + wordMap.get("testing").get) 47 | assert(wordMap.get("is").get == 1, "The word count for 'is' should had been 1 but it was " + wordMap.get("is").get) 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/test/scala/com/cloudera/sa/spark/unittest/sql/MakingNestedTableTest.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.sa.spark.unittest.sql 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.sql.hive.HiveContext 6 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} 7 | 8 | object MakingNestedTableTest extends FunSuite with 9 | BeforeAndAfterEach with BeforeAndAfterAll { 10 | 11 | @transient var sc: SparkContext = null 12 | @transient var hiveContext: HiveContext = null 13 | 14 | override def beforeAll(): Unit = { 15 | 16 | val envMap = Map[String, String](("Xmx", "512m")) 17 | 18 | val sparkConfig = new SparkConf() 19 | sparkConfig.set("spark.broadcast.compress", "false") 20 | sparkConfig.set("spark.shuffle.compress", "false") 21 | sparkConfig.set("spark.shuffle.spill.compress", "false") 22 | sparkConfig.set("spark.io.compression.codec", "lzf") 23 | sc = new SparkContext("local[2]", "unit test", sparkConfig) 24 | hiveContext = new HiveContext(sc) 25 | } 26 | 27 | override def afterAll(): Unit = { 28 | sc.stop() 29 | } 30 | 31 | test("Test table creation and summing of counts") { 32 | 33 | val loanRDD = sc.parallelize(Seq(Row("100", "100000000"), 34 | Row("101", "100000000"), 35 | Row("102", "100000000"))) 36 | 37 | val partiesRDD = sc.parallelize(Seq(Row("100", "ted"), 38 | Row("101", "bob", "42"), 39 | Row("101", "cat", "42"), 40 | Row("102", "Jen", "42"), 41 | Row("102", "Jenny", "42"), 42 | Row("102", "Ed", "42"))) 43 | 44 | //loan 45 | hiveContext.sql("create table loan (id string, amount string) as parquet") 46 | val emptyLoanDF = hiveContext.sql("select * from loan limit 0;") 47 | val loanDF = hiveContext.createDataFrame(loanRDD, emptyLoanDF.schema) 48 | loanDF.registerTempTable("loanTmp") 49 | hiveContext.sql("insert into loan select * from loanTmp") 50 | 51 | //parties 52 | hiveContext.sql("create table party (loan_id string, name string, age string) as parquet") 53 | val emptyPartyDF = hiveContext.sql("select * from party limit 0;") 54 | val partyDF = hiveContext.createDataFrame(partiesRDD, emptyPartyDF.schema) 55 | partyDF.registerTempTable("partyTmp") 56 | hiveContext.sql("insert into party select * from partyTmp") 57 | 58 | val keyValueParty = hiveContext.sql("select * from party").map(r => { 59 | //Key Value 60 | (r.getString(r.fieldIndex("loan_id")), Seq(r)) 61 | }).reduceByKey((a, b) => { 62 | a ++ b 63 | }) 64 | 65 | val keyValueLoan = hiveContext.sql("select * from loan").map(r => { 66 | //Key Value 67 | (r.getString(r.fieldIndex("id")), r.getString(r.fieldIndex("amount"))) 68 | }) 69 | 70 | val nestedRDD = keyValueLoan.join(keyValueParty).map(r => { 71 | val loanId = r._1 72 | val loanAmount = r._2._1 73 | val seqOfParties = r._2._2.map(r => { 74 | Row(r.getString(r.fieldIndex("name")), 75 | 
r.getString(r.fieldIndex("age"))) 76 | }) 77 | 78 | Row(loanId, loanAmount, seqOfParties) 79 | }) 80 | 81 | hiveContext.sql("create table nested (" + 82 | "loan_id string, " + 83 | "amount string, " + 84 | "party >" + 87 | ") as parquet") 88 | 89 | val emptyNestedDF = hiveContext.sql("select * from nested limit 0;") 90 | val nestedDF = hiveContext.createDataFrame(nestedRDD, emptyNestedDF.schema) 91 | nestedDF.registerTempTable("nestedTmp") 92 | hiveContext.sql("insert into nested select * from nestedTmp") 93 | 94 | 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/test/scala/com/cloudera/sa/spark/unittest/sql/NestedTableTest.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.sa.spark.unittest.sql 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.sql.hive.HiveContext 6 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} 7 | 8 | class NestedTableTest extends FunSuite with 9 | BeforeAndAfterEach with BeforeAndAfterAll { 10 | 11 | @transient var sc: SparkContext = null 12 | @transient var hiveContext: HiveContext = null 13 | 14 | override def beforeAll(): Unit = { 15 | 16 | val envMap = Map[String, String](("Xmx", "512m")) 17 | 18 | val sparkConfig = new SparkConf() 19 | sparkConfig.set("spark.broadcast.compress", "false") 20 | sparkConfig.set("spark.shuffle.compress", "false") 21 | sparkConfig.set("spark.shuffle.spill.compress", "false") 22 | sparkConfig.set("spark.io.compression.codec", "lzf") 23 | sc = new SparkContext("local[2]", "unit test", sparkConfig) 24 | hiveContext = new HiveContext(sc) 25 | } 26 | 27 | override def afterAll(): Unit = { 28 | sc.stop() 29 | } 30 | 31 | test("Test table creation and summing of counts") { 32 | /* 33 | { 34 | "id": "0001", 35 | "type": "donut", 36 | "name": "Cake", 37 | "ppu": 0.55, 38 | "batters": 39 | { 40 | "batter": 41 | [ 42 | { "id": "1001", "type": "Regular" }, 43 | { "id": "1002", "type": "Chocolate" }, 44 | { "id": "1003", "type": "Blueberry" }, 45 | { "id": "1004", "type": "Devil's Food" } 46 | ] 47 | }, 48 | "topping": 49 | [ 50 | { "id": "5001", "type": "None" }, 51 | { "id": "5002", "type": "Glazed" }, 52 | { "id": "5005", "type": "Sugar" }, 53 | { "id": "5007", "type": "Powdered Sugar" }, 54 | { "id": "5006", "type": "Chocolate with Sprinkles" }, 55 | { "id": "5003", "type": "Chocolate" }, 56 | { "id": "5004", "type": "Maple" } 57 | ] 58 | } 59 | */ 60 | 61 | val jsonRDD = sc.parallelize(Seq("{\"id\": \"0001\",\"type\": \"donut\",\"name\": \"Cake\",\"ppu\": 0.55,\"batters\":{\"batter\":[{ \"id\": \"1001\", \"type\": \"Regular\" },{ \"id\": \"1002\", \"type\": \"Chocolate\" },{ \"id\": \"1003\", \"type\": \"Blueberry\" },{ \"id\": \"1004\", \"type\": \"Devil's Food\" }]},\"topping\":[{ \"id\": \"5001\", \"type\": \"None\" },{ \"id\": \"5002\", \"type\": \"Glazed\" },{ \"id\": \"5005\", \"type\": \"Sugar\" },{ \"id\": \"5007\", \"type\": \"Powdered Sugar\" },{ \"id\": \"5006\", \"type\": \"Chocolate with Sprinkles\" },{ \"id\": \"5003\", \"type\": \"Chocolate\" },{ \"id\": \"5004\", \"type\": \"Maple\" }]}")) 62 | 63 | val jsonDF = hiveContext.read.json(jsonRDD) 64 | 65 | jsonDF.foreach(row => { 66 | println(row) 67 | }) 68 | 69 | jsonDF.write.parquet("./parquet") 70 | 71 | hiveContext.createExternalTable("jsonNestedTable", "./parquet") 72 | 73 | println(jsonDF.schema) 74 | 75 | hiveContext.sql("select * from 
jsonNestedTable").foreach(row => { 76 | println(row) 77 | }) 78 | 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/test/scala/com/cloudera/sa/spark/unittest/sql/SqlUnitTest.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.sa.spark.unittest.sql 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.hive.HiveContext 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} 7 | 8 | import scala.collection.mutable 9 | 10 | class SqlUnitTest extends FunSuite with 11 | BeforeAndAfterEach with BeforeAndAfterAll{ 12 | 13 | @transient var sc: SparkContext = null 14 | @transient var hiveContext: HiveContext = null 15 | 16 | override def beforeAll(): Unit = { 17 | 18 | val envMap = Map[String,String](("Xmx", "512m")) 19 | 20 | val sparkConfig = new SparkConf() 21 | sparkConfig.set("spark.broadcast.compress", "false") 22 | sparkConfig.set("spark.shuffle.compress", "false") 23 | sparkConfig.set("spark.shuffle.spill.compress", "false") 24 | sparkConfig.set("spark.io.compression.codec", "lzf") 25 | sc = new SparkContext("local[2]", "unit test", sparkConfig) 26 | hiveContext = new HiveContext(sc) 27 | } 28 | 29 | override def afterAll(): Unit = { 30 | sc.stop() 31 | } 32 | 33 | test("Test table creation and summing of counts") { 34 | val personRDD = sc.parallelize(Seq(Row("ted", 42, "blue"), 35 | Row("tj", 11, "green"), 36 | Row("andrew", 9, "green"))) 37 | 38 | hiveContext.sql("create table person (name string, age int, color string)") 39 | 40 | val emptyDataFrame = hiveContext.sql("select * from person limit 0") 41 | 42 | val personDataFrame = hiveContext.createDataFrame(personRDD, emptyDataFrame.schema) 43 | personDataFrame.registerTempTable("tempPerson") 44 | 45 | val ageSumDataFrame = hiveContext.sql("select sum(age) from tempPerson") 46 | 47 | val localAgeSum = ageSumDataFrame.take(10) 48 | 49 | assert(localAgeSum(0).get(0) == 62, "The sum of age should equal 62 but it equaled " + localAgeSum(0).get(0)) 50 | } 51 | } -------------------------------------------------------------------------------- /src/test/scala/com/cloudera/sa/spark/unittest/streaming/StreamingUnitTest.scala: -------------------------------------------------------------------------------- 1 | package com.cloudera.sa.spark.unittest.streaming 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.streaming._ 5 | import org.apache.spark.streaming.dstream.DStream 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} 8 | 9 | import scala.collection.mutable.Queue 10 | 11 | class StreamingUnitTest extends FunSuite with 12 | BeforeAndAfterEach with BeforeAndAfterAll{ 13 | 14 | @transient var sc: SparkContext = null 15 | @transient var ssc: StreamingContext = null 16 | 17 | override def beforeAll(): Unit = { 18 | 19 | val envMap = Map[String,String](("Xmx", "512m")) 20 | 21 | val sparkConfig = new SparkConf() 22 | sparkConfig.set("spark.broadcast.compress", "false") 23 | sparkConfig.set("spark.shuffle.compress", "false") 24 | sparkConfig.set("spark.shuffle.spill.compress", "false") 25 | sparkConfig.set("spark.io.compression.codec", "lzf") 26 | sc = new SparkContext("local[2]", "unit test", sparkConfig) 27 | ssc = new StreamingContext(sc, Milliseconds(200)) 28 | } 29 | 30 | override def afterAll(): Unit = { 31 | sc.stop() 32 | } 33 | 34 | 
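  // The test below drives Spark Streaming deterministically from a queue of
  // pre-built RDDs: TestableQueueInputDStream dequeues one RDD per 200 ms
  // batch, reduceByKey produces the per-batch word counts, and
  // updateStateByKey keeps a running total across batches (which is why a
  // checkpoint directory is set before the context starts). Once
  // awaitTerminationOrTimeout returns, slice(startTime, endTime) pulls the
  // generated state RDDs back out so each batch's running totals can be
  // asserted on.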
test("Streaming word count") { 35 | 36 | val firstBatchRDD = sc.parallelize(Seq("a", "b", "c")) 37 | val secondBatchRDD = sc.parallelize(Seq("a", "e")) 38 | val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f")) 39 | val forthBatchRDD = sc.parallelize(Seq("a", "e")) 40 | 41 | val queue = new Queue[RDD[String]] 42 | 43 | queue.+=(firstBatchRDD) 44 | queue.+=(secondBatchRDD) 45 | queue.+=(thirdBatchRDD) 46 | queue.+=(forthBatchRDD) 47 | 48 | println(queue) 49 | 50 | val startTime = System.currentTimeMillis() 51 | 52 | val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1)) 53 | //ssc.queueStream(queue) 54 | 55 | dstream.checkpoint(Seconds(100)) 56 | 57 | val batchTotals:DStream[(String, Int)] = dstream.map(r => (r, 1)).reduceByKey(_ + _) 58 | 59 | val streamTotals = batchTotals.updateStateByKey( 60 | (seq:Seq[Int], opt:Option[Int]) => { 61 | if (!seq.isEmpty) { 62 | val totalCountForNew = seq.reduce(_ + _) 63 | if (opt.isEmpty) { 64 | Option(totalCountForNew) 65 | } else { 66 | Option(opt.get + totalCountForNew) 67 | } 68 | } else { 69 | opt 70 | } 71 | }) 72 | 73 | streamTotals.foreachRDD(rdd => { 74 | 75 | }) 76 | 77 | ssc.checkpoint("./tmp") 78 | ssc.start() 79 | ssc.awaitTerminationOrTimeout(2000) 80 | 81 | val endTime = System.currentTimeMillis() 82 | 83 | val rddList = streamTotals.slice(new Time(startTime), new Time(endTime)) 84 | 85 | rddList(0).collect().foreach(println) 86 | assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1) 87 | rddList(1).collect().foreach(println) 88 | assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2 == 2) 89 | rddList(2).collect().foreach(println) 90 | assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2 == 2) 91 | rddList(3).collect().foreach(println) 92 | assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2 == 3) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/test/scala/org/apache/spark/streaming/TestableQueueInputDStream.scala: -------------------------------------------------------------------------------- 1 | package org.apache.spark.streaming 2 | 3 | import java.io.{ObjectInputStream, ObjectOutputStream} 4 | 5 | import org.apache.spark.rdd.{RDD, UnionRDD} 6 | import org.apache.spark.streaming.dstream.InputDStream 7 | 8 | import scala.collection.mutable.{ArrayBuffer, Queue} 9 | import scala.reflect.ClassTag 10 | 11 | class TestableQueueInputDStream[T: ClassTag]( 12 | ssc: StreamingContext, 13 | val queue: Queue[RDD[T]], 14 | oneAtATime: Boolean, 15 | defaultRDD: RDD[T] 16 | ) extends InputDStream[T](ssc) { 17 | 18 | override def start() { } 19 | 20 | override def stop() { } 21 | 22 | private def readObject(in: ObjectInputStream): Unit = { 23 | logWarning("queueStream doesn't support checkpointing") 24 | } 25 | 26 | private def writeObject(oos: ObjectOutputStream): Unit = { 27 | logWarning("queueStream doesn't support checkpointing") 28 | } 29 | 30 | override def compute(validTime: Time): Option[RDD[T]] = { 31 | val buffer = new ArrayBuffer[RDD[T]]() 32 | queue.synchronized { 33 | if (oneAtATime && queue.nonEmpty) { 34 | buffer += queue.dequeue() 35 | } else { 36 | buffer ++= queue 37 | queue.clear() 38 | } 39 | } 40 | if (buffer.nonEmpty) { 41 | if (oneAtATime) { 42 | Some(buffer.head) 43 | } else { 44 | Some(new UnionRDD(context.sc, buffer.toSeq)) 45 | } 46 | } else if (defaultRDD != null) { 47 | Some(defaultRDD) 48 | } else { 49 | Some(ssc.sparkContext.emptyRDD) 50 | } 51 | } 52 | 53 | } 
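// Why this adapted copy exists: StreamingUnitTest checkpoints the stream
// (updateStateByKey requires a checkpoint directory), and the stock
// QueueInputDStream behind ssc.queueStream() does not allow itself to be
// serialized for checkpointing in this Spark version. This variant keeps the
// same dequeue-per-batch behaviour but merely logs a warning in
// readObject/writeObject, so checkpoint-dependent operations can be exercised
// in unit tests. It is declared in the org.apache.spark.streaming package to
// mirror the original class it adapts.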
--------------------------------------------------------------------------------