├── .idea ├── .name ├── compiler.xml ├── copyright │ └── profiles_settings.xml ├── dictionaries │ └── abose.xml ├── libraries │ └── datanucleus_api_jdo_3_2_6.xml ├── misc.xml ├── modules.xml ├── uiDesigner.xml ├── vcs.xml └── workspace.xml ├── README.md ├── bin └── run_spark_example.sh ├── spark-unit-testing.iml └── src ├── main └── scala │ └── ml │ └── dolphin │ └── testing │ ├── DistanceFromCentroid.scala │ ├── EuclideanVectorSpace.scala │ ├── SampleRdd.scala │ └── VectorSpace.scala └── test └── scala └── ml └── dolphin └── testing ├── DistanceFromCentroidTests.scala ├── EuclideanVectorSpaceTests.scala └── SparkTestingBaseExample.scala /.idea/.name: -------------------------------------------------------------------------------- 1 | spark-unit-testing -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/dictionaries/abose.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | llib 5 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/libraries/datanucleus_api_jdo_3_2_6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 15 | 16 
| 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | General 25 | 26 | 27 | XPath 28 | 29 | 30 | XSLT 31 | 32 | 33 | 34 | 35 | DefaultFileTemplate 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 15 | 16 | 17 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 
79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 105 | 106 | 107 | 121 | 122 | 123 | 124 | 125 | 126 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 161 | 162 | 163 | 164 | 167 | 168 | 171 | 172 | 173 | 174 | 177 | 178 | 181 | 182 | 185 | 186 | 189 | 190 | 193 | 194 | 195 | 196 | 199 | 200 | 203 | 204 | 207 | 208 | 211 | 212 | 213 | 214 | 217 | 218 | 221 | 222 | 225 | 226 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 269 | 270 | 271 | 289 | 290 | 291 | 310 | 311 | 317 | 318 | 319 | 332 | 333 | 334 | 341 | 344 | 346 | 347 | 348 | 349 | 350 | 351 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 391 | 392 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 452 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 491 | 492 | 510 | 511 | 531 | 532 | 553 | 554 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | localhost 645 | 5050 646 | 647 | 648 | 649 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 1452630774539 661 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 
682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 700 | 703 | 704 | 705 | 707 | 708 | 709 | 711 | 712 | 713 | 714 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | 747 | 748 | 749 | 750 | 751 | 752 | 753 | 754 | 755 | 756 | 757 | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 771 | 772 | 773 | 774 | 775 | 776 | 777 | 778 | 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | 787 | 788 | 789 | 790 | 791 | 792 | 793 | 794 | 795 | 796 | 797 | 798 | 799 | 800 | 801 | 802 | 803 | 804 | 805 | 806 | 807 | 808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 819 | 820 | 821 | 822 | 823 | 824 | 825 | 826 | 827 | 828 | 829 | 830 | 831 | 832 | 833 | 834 | 835 | 836 | 837 | 838 | 839 | 840 | 841 | 842 | 843 | 844 | 845 | 846 | 847 | 848 | 849 | 850 | 851 | 852 | 853 | 854 | 855 | 856 | 857 | 858 | 859 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | 867 | 868 | 869 | 870 | 871 | 872 | 873 | 874 | 875 | 876 | 877 | 878 | 879 | 880 | 881 | 882 | 883 | 884 | 885 | 886 | 887 | 888 | 889 | 890 | 891 | 892 | 893 | 894 | 895 | 896 | 897 | 898 | 899 | 900 | 901 | 902 | 903 | 904 | 905 | 906 | 907 | 908 | 909 | 910 | 911 | 912 | 913 | 914 | 915 | 916 | 917 | 918 | 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | 927 | 928 | 929 | 930 | 931 | 932 | 933 | 934 | 935 | 936 | 937 | 938 | 939 | 940 | 941 | 942 | 943 | 944 | 945 | 946 | 947 | 948 | 949 | 950 | 951 | 952 | 953 | 954 | 955 | 956 | 957 | 958 | 959 | 960 | 961 | 962 | 963 | 964 | 965 | 966 | 967 | 968 | 969 | 970 | 971 | 972 | 973 | 974 | 975 | 976 | 977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 988 | 989 | 990 | 991 | 992 | 993 | 994 | 995 | 996 | 997 | 998 | 999 | 1000 | 1001 | 1002 | 1003 | 1004 | 1005 | 1006 | 1007 | 1008 | 1009 | 1010 | 1011 | 1012 | 1013 | 1014 | 1015 | 1016 | 1017 | 1018 | 1019 | 
1020 | 1021 | 1022 | 1023 | 1024 | 1025 | 1026 | 1027 | 1028 | 1029 | 1030 | 1031 | 1032 | 1033 | 1034 | 1035 | 1036 | 1037 | 1038 | 1039 | 1040 | 1041 | 1042 | 1043 | 1044 | 1045 | 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | 1055 | 1056 | 1057 | 1058 | 1059 | 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | 1066 | 1067 | 1068 | 1069 | 1070 | 1071 | 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | 1078 | 1079 | 1080 | 1081 | 1082 | 1083 | 1084 | 1085 | 1086 | 1087 | 1088 | 1089 | 1090 | 1091 | 1092 | 1093 | 1094 | 1099 | 1100 | 1101 | 1102 | 1103 | 1104 | 1105 | 1110 | 1111 | 1112 | 1113 | 1114 | 1115 | No facets are configured 1116 | 1117 | 1122 | 1123 | 1124 | 1125 | 1126 | 1127 | Python 2.7.1 (/usr/bin/python) interpreter library 1128 | 1129 | 1134 | 1135 | 1136 | 1137 | 1138 | 1139 | 1.6 1140 | 1141 | 1146 | 1147 | 1148 | 1149 | 1150 | 1151 | spark-unit-testing 1152 | 1153 | 1158 | 1159 | 1160 | 1161 | 1162 | 1163 | 1.6 1164 | 1165 | 1170 | 1171 | 1172 | 1173 | 1174 | 1175 | datanucleus-api-jdo-3.2.6 1176 | 1177 | 1182 | 1183 | 1184 | 1185 | 1186 | 1187 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Testing in Apache Spark - A Tutorial 2 | 3 | A tutorial on how to write unit tests and do performance testing of Apache Spark code in 4 | Scala. 5 | 6 | My New Year's resolution: write more tests! May be, this is the year when I finally move over to 7 | TDD (Test Driven Development) i.e. start any new work by writing tests first! Writing tests is a 8 | very good idea :) when you plan to use your code for making real-world decisions, e.g. which 9 | ads to show to which user, or extend how much credit to which customers, etc. We have 10 | been using Spark and MLlib increasingly for these types of problems at work. 
Unfortunately, 11 | pointers on best practices of testing Spark code are few and scattered, so I wrote this tutorial 12 | to have a single place for all Spark and MLlib related testing, show example code and relevant 13 | URLs. I will continue to add to it as I find new material. 14 | 15 | ## Example: Vector Operations in Euclidean Space 16 | 17 | As an example project, I chose to write some of the basic vector operations in Euclidean space, 18 | such as distance and cosine between two vectors, centroid of an array of vectors, etc. With these 19 | basic operations defined, I then wrote a method that takes an RDD of vectors and a centroid as 20 | input, and calculates the sum of distances between each vector and the centroid. You will 21 | recognize this as very similar to the convergence criteria calculation step in k-means clustering 22 | and the cosine operation as the similarity calculation step in a typical collaborative filtering 23 | algorithm. We will be writing unit tests and performance tests for these in this tutorial. 24 | 25 | The basic vector operations are defined in: 26 | **src/main/scala/ml/dolphin/testing/EuclideanVectorSpace.scala** 27 | 28 | Nothing special here. It defines two 2-vector operations: distance and cosine, and two N-vector 29 | operations: centroid and closest. Notice that these are not defined over RDDs. Rather, we will be 30 | using these within a partition of the RDD, e.g. using mapPartitions. 31 | 32 | Next, let's look at: 33 | **src/main/scala/ml/dolphin/testing/DistanceFromCentroid.scala** 34 | 35 | It has only one method: calcDistance(sc: SparkContext, vPoints: RDD[Vector], centroid: Vector) 36 | 37 | ``` 38 | def calcDistance(sc: SparkContext, vPoints: RDD[Vector], centroid: Vector): Double = { 39 | 40 | // 1. Broadcast centroid to all partitions 41 | val bcCentroid = sc.broadcast(centroid) 42 | 43 | // 2. For each partition, calculate the sum of distances from centroid to each of 44 | // the points in that partition. 
Then, sum up the partial sums from all the partitions. 45 | 46 | val accmDistance = vPoints.mapPartitions{ points => { 47 | var sum = 0.0 48 | points.foreach { point => { 49 | sum += EuclideanVectorSpace.distance(point, bcCentroid.value) 50 | }} 51 | Iterator(sum) 52 | }}.reduce(_ + _) // 3. Sum up all the partial sums from the partitions 53 | accmDistance 54 | } 55 | ``` 56 | 57 | 1. Broadcast the centroid to all partitions of the input RDD (i.e. vPoints) so that we can 58 | calculate the distance from this centroid to all the vectors within each partition **locally**. 59 | 2. Inside mapPartitions, we calculate all the distances using EuclideanVectorSpace.distance() and 60 | generate a local "sum". Note that we wrap the local "sum" with an Iterator from each partition 61 | since mapPartitions returns an Iterator. 62 | 3. Finally, we sum up all the local contributions using a "reduce". 63 | 64 | We will now write some unit tests for these. Reminder to self: next time, start with the tests! 65 | 66 | ## Apache Spark Unit Testing 67 | 68 | There is no native library for unit testing in Spark as of yet. After researching this topic 69 | for a while, I feel that the best option is to use two libraries: 70 | 71 | - [ScalaTest](http://www.scalatest.org/) 72 | - [Spark-Testing-Base](https://github.com/holdenk/spark-testing-base) 73 | 74 | A little bit about **ScalaTest**. For Scala users, this is the most familiar unit testing 75 | framework (you can also use it for testing Java code and soon for JavaScript). 76 | 77 | it supports a number of different testing styles, each designed to support a specific type of 78 | testing need. For details, see [ScalaTest User Guide](http://www.scalatest.org/user_guide/selecting_a_style). 
79 | 80 | Although ScalaTest supports many styles, I find that the quickest way to get started is to use 81 | the following ScalaTest traits and write the tests in the 82 | [TDD style (Test Driven Development)](https://en.wikipedia.org/wiki/Test-driven_development): 83 | 84 | 1. [*FunSuite*](http://doc.scalatest.org/1.8/org/scalatest/FunSuite.html) 85 | 2. [*Assertions*](http://www.scalatest.org/user_guide/using_assertions) 86 | 3. [*BeforeAndAfter*](http://doc.scalatest.org/1.8/org/scalatest/BeforeAndAfter.html) 87 | 88 | Feel free to browse the above URLs to learn more about these traits - that will make the rest of this 89 | tutorial go smoothly. 90 | 91 | FunSuite tests are function values and each test is specified with a string denoting its name. 92 | Let's go over the syntax of a basic test: 93 | **src/test/scala/ml/dolphin/testing/EuclideanVectorSpaceTests.scala** 94 | 95 | Notice that there are multiple tests defined in this file (corresponding to the methods defined in 96 | EuclideanVectorSpace.scala and described earlier). We define a "test fixture" consisting of the 97 | objects and other artifacts such as files, database connections, connections to services, etc. 98 | that the test will utilize to do its work. If you have multiple tests in a test suite that share 99 | some of the same test fixtures and you need to set them up to the same values before each test 100 | begins or you need to clean them up after each test finishes, it is best to use the BeforeAndAfter 101 | trait. If an exception occurs while the before or after method executes, all tests in the suite are 102 | abandoned. Our class "EuclideanVectorSpaceTests" defines 4 tests in the suite and two of them 103 | use the same fixtures: x and y. We initialize them to the same values every time in the method 104 | "before": 105 | 106 | ``` 107 | class EuclideanVectorSpaceTests extends FunSuite with BeforeAndAfter { 108 | 109 | // 1. 
Define the instance variables 110 | var x: linalg.Vector = _ 111 | var y: linalg.Vector = _ 112 | 113 | // 2. Set the values at the beginning of each test 114 | before { 115 | x = Vectors.dense(1.0, 2.0, 3.0, 4.0) 116 | y = Vectors.dense(2.0, 3.0, 4.0, 5.0) 117 | } 118 | 119 | // 3. Write the actual test 120 | test("L2 distance between 2 Vector's") { 121 | assert(EuclideanVectorSpace.distance(x, y) === 2.0) 122 | } 123 | 124 | test("Cosine between 2 Vector's") { 125 | // expected value = 40.0 / (sqrt(54) * sqrt(30)) 126 | assert(EuclideanVectorSpace.cosine(x, y) === 40.0 / (sqrt(54) * sqrt(30))) 127 | } 128 | 129 | test("Vectors of 0's will have a zero distance") { 130 | assertResult(0.0) { 131 | EuclideanVectorSpace.distance(Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.0, 0.0, 0.0)) 132 | } 133 | } 134 | 135 | // 4. Use "pending" to write tests later. 136 | test ("Centroid of a set of vectors") (pending) 137 | 138 | } 139 | ``` 140 | 141 | 1. Define the instances of Vectors x and y that will be used in the tests. 142 | 2. Initialize x and y before each test 143 | 3. Write the actual test. It uses "assert" to enforce whether the expected value (right hand side) 144 | is same as the value returned by "distance" method in EuclideanVectorSpace. Note the "===" 145 | (three "=" signs) in the assert statement. This syntax provides more detailed error messages. 146 | I prefer to use this for all my tests. 147 | 4. Note the "pending" next to the last test with an empty body. This tells ScalaTest (and reminds us) 148 | that we will be writing future tests. For example, we have the method centroid defined in 149 | EuclideanVectorSpace but we are not testing it yet. (It is always a good idea to write all the tests 150 | in one go though...I may not come back to this again...) 
151 | 152 | ### Making life easier with Spark-Testing-Base 153 | 154 | As I mentioned earlier, after trying a few different things, I find spark-testing-base to be the 155 | easiest and most functional unit testing framework for Spark so far. It surfaces some of the same test suites that the Spark 156 | committers use when testing internal Spark code. What you get out of the box: 157 | 158 | 1. There are often multiple tests in a test suite, and we use "before" and "after" to set up and reset the 159 | test artifacts. That is fine for regular Scala code. When you are trying to do this for Spark code, each test needs to 160 | set up and stop a [SparkContext](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.SparkContext), 161 | leading to a lot of boilerplate code replication. Also, in between SparkContext stops and starts, one has to clear the 162 | spark.driver.port variable. When you use spark-testing-base, this is automatically taken care of so you can focus on 163 | writing the test itself (i.e. the stuff in the "test() {}"). If you are writing lots of tests, this makes a 164 | big difference. More specifically, spark-testing-base allows you to share the same SparkContext for all the tests defined 165 | in the same test suite. We dig deeper into this below. 166 | 2. If you are testing Spark Streaming or Spark DataFrames (i.e. not just basic RDD methods), there are other things to 167 | worry about, e.g. how long you have to wait for the test to finish. I will be adding more examples on these, especially 168 | DataFrames since there is not much info on testing currently. For now, you can read [**this excellent article**](http://blog.cloudera.com/blog/2015/09/making-apache-spark-testing-easy-with-spark-testing-base/) by the creator 169 | of spark-testing-base (Holden Karau). 170 | 171 | Now let's look at a concrete example. 
172 | **src/test/scala/ml/dolphin/testing/DistanceFromCentroidTests.scala** 173 | 174 | ``` 175 | class DistanceFromCentroidTests extends FunSuite with BeforeAndAfter with SharedSparkContext { 176 | 177 | var vPoints: Array[Vector] = _ 178 | var centroid: Vector = _ 179 | var vPointsRdd: RDD[Vector] = _ 180 | 181 | // 1. Instantiate the variables for each test 182 | before { 183 | vPoints = Array(Vectors.dense(1.0, 2.0, 3.0, 4.0), Vectors.dense(2.0, 3.0, 4.0, 5.0), 184 | Vectors.dense(3.0, 9.0, 1.0, 7.0), Vectors.dense(1.0, 5.0, 6.0, 8.0)) 185 | centroid = Vectors.dense(1.0, 1.0, 1.0, 1.0) 186 | vPointsRdd = sc.parallelize(vPoints, 3) 187 | } 188 | 189 | // 2. an actual test 190 | test("Testing calcDistance using a shared Spark Context") { 191 | val sum = DistanceFromCentroid.calcDistance(sc, vPointsRdd, centroid) 192 | val expected = sqrt(14.0) + sqrt(30.0) + sqrt(104.0) + sqrt(90.0) 193 | assert(sum === expected) 194 | } 195 | } 196 | ``` 197 | 198 | 1. Note something special here. We are using a SparkContext ("sc") without instantiating it anywhere in this class: 199 | ``` 200 | vPointsRdd = sc.parallelize(vPoints, 3) 201 | ``` 202 | This is done for you within spark-testing-base when you extend your class definition with "SharedSparkContext" trait 203 | (you need to import "com.holdenkarau.spark.testing.SharedSparkContext" in the file where you define your test suite). 204 | 205 | To see how it's handled, take a look at the internal of spark-testing-base, specifically [**SharedSparkContext.scala**](https://github.com/holdenk/spark-testing-base/blob/ef199dc9e93cc80376d7289f7504824d0ffa0870/src/main/1.3/scala/com/holdenkarau/spark/testing/SharedSparkContext.scala) 206 | 207 | ``` 208 | 209 | /** Shares a local `SparkContext` between all tests in a suite and closes it at the end. */ 210 | trait SharedSparkContext extends BeforeAndAfterAll with SparkContextProvider { 211 | self: Suite => 212 | 213 | @transient private var _sc: SparkContext = _ 214 | 215 | // 1.1. 
SparkContext definition 216 | override def sc: SparkContext = _sc 217 | 218 | val appID = new Date().toString + math.floor(math.random * 10E4).toLong.toString 219 | 220 | override val conf = new SparkConf(). 221 | setMaster("local[*]"). 222 | setAppName("test"). 223 | set("spark.ui.enabled", "false"). 224 | set("spark.app.id", appID) 225 | 226 | // 1.2. Instantiate new SparkContext and set logging level 227 | override def beforeAll() { 228 | _sc = new SparkContext(conf) 229 | _sc.setLogLevel(org.apache.log4j.Level.WARN.toString) 230 | super.beforeAll() 231 | } 232 | 233 | // 1.3. stop SparkContext 234 | override def afterAll() { 235 | try { 236 | LocalSparkContext.stop(_sc) 237 | _sc = null 238 | } finally { 239 | super.afterAll() 240 | } 241 | } 242 | } 243 | ``` 244 | 245 | - 1.1 sc is the SparkContext that you will use for your tests within the same suite. Internally, the trait uses a private 246 | variable "_sc" to manage the actual Spark Context so that you cannot (accidentally) modify it. 247 | - 1.2 "_sc" is instantiated within a "beforeAll()" method. The difference between "before()" that you have seen before and 248 | "beforeAll()" is that the latter is executed before executing the suite (difference here before the suite vs before a test in 249 | the suite). You can also have nested suites - slightly more advanced but a very handy approach when testing a large platform. 250 | You can read more about beforeAll and afterAll here: http://doc.scalatest.org/1.0/org/scalatest/BeforeAndAfterAll.html 251 | - 1.3 Similarly, Spark Context is stopped at the end of the test suite via a call to afterAll(). 252 | 253 | 2. Coming back to our test suite: DistanceFromCentroidTests, let's look at the actual test: 254 | ``` 255 | // 2. 
an actual test 256 | test("Testing calcDistance using a shared Spark Context") { 257 | val sum = DistanceFromCentroid.calcDistance(sc, vPointsRdd, centroid) 258 | val expected = sqrt(14.0) + sqrt(30.0) + sqrt(104.0) + sqrt(90.0) 259 | assert(sum === expected) 260 | } 261 | ``` 262 | This is very similar to regular ScalaTest syntax. The only difference is that we are invoking a test on an RDD using the local 263 | shared SparkContext supplied to us by spark-testing-base. We are testing the DistanceFromCentroid.calcDistance(...) method that 264 | takes an RDD of points and a centroid, and calculates the sum of individual distances from the centroid. Notice how we expressed 265 | the expected value. Since there were 4 points defined in vPointsRdd, we put in sqrt(each point's squared distance from the centroid) for 266 | each point. This is just an individual choice - it helps with code readability by another person as to how the expected value is 267 | calculated. 268 | 269 | That's it. Happy testing! 270 | 271 | ## Spark Performance Testing with spark-perf 272 | Coming 273 | 274 | 275 | # References 276 | 277 | Spark testing: 278 | https://spark-summit.org/2014/wp-content/uploads/2014/06/Testing-Spark-Best-Practices-Anupama-Shetty-Neil-Marshall.pdf 279 | 280 | Scala Test 281 | http://www.scalatest.org/user_guide/using_assertions 282 | 283 | Spark Testing Base 284 | http://blog.cloudera.com/blog/2015/09/making-apache-spark-testing-easy-with-spark-testing-base/ 285 | -------------------------------------------------------------------------------- /bin/run_spark_example.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # run an example Spark job 4 | # 5 | # History: 6 | # 09/25/2014 abose "created" 7 | 8 | . 
/**
 * Various measures of distances of a set of points from a centroid.
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */
object DistanceFromCentroid {

  /**
   * Calculates the sum of the Euclidean distances between every point of an RDD and a centroid.
   *
   * The centroid is broadcast once. Each partition then folds its own points into a single
   * partial sum, and the partial sums are combined into the global sum. Addition is
   * associative and commutative, so the per-partition work can run in parallel.
   *
   * @param sc       SparkContext used to broadcast the centroid
   * @param vPoints  a collection of points as Vector
   * @param centroid a centroid point as Vector
   * @return accumulated distance as Double; 0.0 when vPoints contains no points
   *         (including an RDD that has no partitions at all)
   */
  def calcDistance(sc: SparkContext, vPoints: RDD[Vector], centroid: Vector): Double = {

    // Broadcast centroid to all partitions
    val bcCentroid = sc.broadcast(centroid)

    try {
      vPoints.mapPartitions { points =>
        // Resolve the broadcast value once per partition rather than once per point.
        val center = bcCentroid.value
        var partialSum = 0.0
        points.foreach { point =>
          partialSum += EuclideanVectorSpace.distance(point, center)
        }
        // mapPartitions must return an Iterator, so wrap the single partial sum.
        Iterator.single(partialSum)
      }.fold(0.0)(_ + _) // fold (unlike reduce) is defined for an RDD without partitions
    } finally {
      // The action above has completed (or failed); release the executor-side copies.
      bcCentroid.unpersist(blocking = false)
    }
  }
}
/**
 * Euclidean Vector Space extended from VectorSpace.
 *
 * @note Methods have no side effects
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */
object EuclideanVectorSpace extends VectorSpace[Vector] {

  /**
   * Euclidean (L2) distance between two vectors x and y.
   *
   * @param x Input Vector x
   * @param y Input Vector y
   * @return the L2 distance as Double
   * @throws IllegalArgumentException if x and y have different sizes
   */
  override def distance(x: Vector, y: Vector): Double = {
    requireSameSize(x, y)
    val xs = x.toArray
    val ys = y.toArray
    var sumOfSquares = 0.0
    var i = 0
    while (i < xs.length) {
      val diff = xs(i) - ys(i)
      sumOfSquares += diff * diff
      i += 1
    }
    math.sqrt(sumOfSquares)
  }

  /**
   * Centroid (component-wise mean) of a finite, non-empty set of points.
   *
   * @param points Input set of points; all points must have the same size
   * @return Vector with the centroid
   * @throws IllegalArgumentException if points is empty or the points differ in size
   */
  override def centroid(points: Seq[Vector]): Vector = {
    require(points.nonEmpty, "Cannot compute the centroid of an empty set of points")
    val numCols = points.head.size
    val sums = new Array[Double](numCols)
    var count = 0
    // Single pass: accumulate in place and count, so a linear Seq is traversed only once.
    points.foreach { point =>
      require(point.size == numCols,
        s"All points must have size $numCols but found a point of size ${point.size}")
      val values = point.toArray
      var i = 0
      while (i < numCols) {
        sums(i) += values(i)
        i += 1
      }
      count += 1
    }
    Vectors.dense(sums.map(_ / count))
  }

  /**
   * Cosine similarity measure between two Vector's x and y.
   *
   * @param x Input Vector x
   * @param y Input Vector y
   * @return cosine of the angle between x and y; NaN when either vector has zero norm,
   *         because the result is then 0.0 / 0.0
   * @throws IllegalArgumentException if x and y have different sizes
   */
  override def cosine(x: Vector, y: Vector): Double = {
    requireSameSize(x, y)
    val xs = x.toArray
    val ys = y.toArray
    var inner = 0.0
    var squaredNormX = 0.0
    var squaredNormY = 0.0
    var i = 0
    while (i < xs.length) {
      inner += xs(i) * ys(i)
      squaredNormX += xs(i) * xs(i)
      squaredNormY += ys(i) * ys(i)
      i += 1
    }
    inner / (math.sqrt(squaredNormX) * math.sqrt(squaredNormY))
  }

  /**
   * Finds closest point and shortest distance between a given array of points x, and a given
   * point y. Uses brute-force L2-distance pairwise calculation.
   *
   * @todo Use better algorithm such as triangle inequality to find shortest distance
   * @param x Given Array of points, e.g. centroids in K-means clustering
   * @param y Given point from which distance needs to be calculated
   * @return (index in x, distance) of the closest point to y; the first such point wins ties.
   *         For an empty x the result is (0, Double.PositiveInfinity).
   */
  override def closest(x: Array[Vector], y: Vector): (Int, Double) = {
    var shortestDistance = Double.PositiveInfinity
    var closestIndex = 0
    var i = 0
    while (i < x.length) {
      val thisDistance = distance(x(i), y)
      // Strict "<" keeps the earliest index when several points are equally close.
      if (thisDistance < shortestDistance) {
        shortestDistance = thisDistance
        closestIndex = i
      }
      i += 1
    }
    (closestIndex, shortestDistance)
  }

  /**
   * Converts Array[(Vector, hashcode)] data structure of centers and points to an array of Vectors
   * only. Mostly used as a precursor to closest and other operations.
   *
   * @param x Array of (Vector, Int)
   * @return Array[Vector] holding the first element of every pair, in the same order
   */
  def toVector(x: Array[(Vector, Int)]): Array[Vector] = {
    x.map(_._1)
  }

  // Fails fast on mismatched dimensions instead of silently ignoring the extra components.
  private def requireSameSize(x: Vector, y: Vector): Unit = {
    require(x.size == y.size, s"Vector sizes do not match: ${x.size} vs ${y.size}")
  }

}
/**
 * Common algebraic operations in vector space. Define the functions for a class
 * mixed in with this trait that will be appropriate for a specific type of vector space.
 * @example EuclideanVectorSpace which extends this trait.
 *
 * @tparam A type used to represent a point of the vector space
 *
 * @author Abhijit Bose
 * @version 1.0 06/24/2015
 * @since 1.0 06/24/2015
 */
trait VectorSpace[A] {

  /**
   * Distance between two points x and y.
   *
   * @param x first point
   * @param y second point
   * @return the distance as Double
   */
  def distance(x: A, y: A): Double

  /**
   * Cosine similarity measure between two points x and y.
   *
   * @param x first point
   * @param y second point
   * @return the cosine similarity as Double
   */
  def cosine(x: A, y: A): Double

  /**
   * Centroid of a set of points.
   *
   * @param points the points to summarize
   * @return the centroid, expressed in the same representation as the points
   */
  def centroid(points: Seq[A]): A

  /**
   * Index and distance of the point in Array x that is closest to a given point y.
   *
   * @param x candidate points
   * @param y reference point
   * @return (index in x, distance) of the closest candidate
   */
  def closest(x: Array[A], y: A): (Int, Double)
}
{ 32 | val sum = DistanceFromCentroid.calcDistance(sc, vPointsRdd, centroid) 33 | val expected = sqrt(14.0) + sqrt(30.0) + sqrt(104.0) + sqrt(90.0) 34 | assert(sum === expected) 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/ml/dolphin/testing/EuclideanVectorSpaceTests.scala: -------------------------------------------------------------------------------- 1 | package ml.dolphin.testing 2 | 3 | import breeze.numerics.sqrt 4 | import org.apache.spark.mllib.linalg 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.scalatest.{BeforeAndAfter, FunSuite} 7 | 8 | /** 9 | * Tests for EuclideanVectorSpace methods 10 | * 11 | * @author Abhijit Bose 12 | * @version 1.0 06/24/2015 13 | * @since 1.0 06/24/2015 14 | */ 15 | 16 | class EuclideanVectorSpaceTests extends FunSuite with BeforeAndAfter { 17 | 18 | var x: linalg.Vector = _ 19 | var y: linalg.Vector = _ 20 | 21 | before { 22 | x = Vectors.dense(1.0, 2.0, 3.0, 4.0) 23 | y = Vectors.dense(2.0, 3.0, 4.0, 5.0) 24 | } 25 | 26 | test("L2 distance between 2 Vector's") { 27 | assert(EuclideanVectorSpace.distance(x, y) === 2.0) 28 | } 29 | 30 | test("Cosine between 2 Vector's") { 31 | // expected value = 40.0 / (sqrt(54) * sqrt(30)) 32 | assert(EuclideanVectorSpace.cosine(x, y) === 40.0 / (sqrt(54) * sqrt(30))) 33 | } 34 | 35 | test("Vectors of 0's will have a zero distance") { 36 | assertResult(0.0) { 37 | EuclideanVectorSpace.distance(Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.0, 0.0, 0.0)) 38 | } 39 | } 40 | 41 | test ("Centroid of a set of vectors") (pending) 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/test/scala/ml/dolphin/testing/SparkTestingBaseExample.scala: -------------------------------------------------------------------------------- 1 | package ml.dolphin.testing 2 | 3 | import com.holdenkarau.spark.testing.SharedSparkContext 4 | import org.scalatest.FunSuite 5 | 6 | /** 
7 |  * Example usage of the spark-testing-base library written by Holden Karau
8 |  *
9 |  * @author Abhijit Bose
10 |  * @version 1.0 11/24/2015
11 |  * @since 1.0 11/24/2015
12 |  */
13 |
14 | class SampleRddTest extends FunSuite with SharedSparkContext {
15 |
16 |   test("Testing RDD transformations using a shared Spark Context") {
17 |     // One sentence per RDD element; `sc` is the context supplied by SharedSparkContext.
18 |     val sentences = List("Testing", "RDD transformations", "using a shared", "Spark Context")
19 |     val sentencesRdd = sc.parallelize(sentences)
20 |
21 |     // Expected result: every sentence broken into its space-separated words, in input order.
22 |     val wordsPerSentence = Array(
23 |       Array("Testing"),
24 |       Array("RDD", "transformations"),
25 |       Array("using", "a", "shared"),
26 |       Array("Spark", "Context"))
27 |
28 |     val tokenized = SampleRdd.tokenize(sentencesRdd)
29 |     assert(tokenized === wordsPerSentence)
30 |   }
31 |
32 | }
33 |
--------------------------------------------------------------------------------