├── .cache ├── .classpath ├── .gitignore ├── .project ├── .settings ├── org.eclipse.jdt.core.prefs └── org.scala-ide.sdt.core.prefs └── src ├── .DS_Store ├── api └── examples │ ├── Cartesian.scala │ ├── Checkpoint.scala │ ├── Coalesce.scala │ ├── Cogroup.scala │ ├── Collect.scala │ ├── CollectAsMap.scala │ ├── CombineByKey.scala │ ├── CountApproxDistinct.scala │ ├── GroupByKeyPair.scala │ ├── IntersectionTest.scala │ ├── Sample.scala │ └── Utils.scala ├── internals ├── IntersectionTest.scala ├── RepartitionTest2.scala ├── broadcastTest.scala ├── cartesianTest.scala ├── coalesceTest.scala ├── cogroupTest.scala ├── complexStages.scala ├── distinctTest.scala ├── groupByKeyTest.scala ├── hashjoinTest.scala ├── joinTest.scala ├── pipeTest.scala ├── reduceByKeyTest.scala ├── repartitionTest.scala └── sortByKeyTest.scala ├── local └── examples │ ├── Aggregate.scala │ ├── AggregateOrder.scala │ ├── Cartesian.scala │ ├── CollectAsMap.scala │ ├── FlatMap.scala │ ├── GenerateRandomText.scala │ ├── GroupByAction.scala │ ├── GroupByKey.scala │ ├── GroupByTest.scala │ ├── GroupWith.scala │ ├── JoinAction.scala │ ├── LocalWordCount.scala │ ├── LookUpTest.scala │ ├── MapPartitionsRDDTest.scala │ ├── MapValuesTest.scala │ ├── PipedRDDTest.scala │ ├── ReduceByKeyActionTest.scala │ ├── ReduceByKeyToDriverTest.scala │ ├── SparkLR.scala │ ├── TakeActionTest.scala │ ├── UnionTest.scala │ ├── partitionByTest.scala │ ├── reduceActionTest.scala │ └── sortByKeyTest.scala ├── org ├── .DS_Store └── apache │ ├── .DS_Store │ └── spark │ ├── .DS_Store │ └── examples │ ├── BroadcastTest.scala │ ├── CassandraCQLTest.scala │ ├── CassandraTest.scala │ ├── DriverSubmissionTest.scala │ ├── ExceptionHandlingTest.scala │ ├── GroupByTest.scala │ ├── HBaseTest.scala │ ├── HdfsTest.scala │ ├── LocalALS.scala │ ├── LocalFileLR.scala │ ├── LocalKMeans.scala │ ├── LocalLR.scala │ ├── LocalPi.scala │ ├── LogQuery.scala │ ├── MultiBroadcastTest.scala │ ├── SimpleSkewedGroupByTest.scala │ ├── SkewedGroupByTest.scala │ ├── SparkALS.scala │ ├── SparkHdfsLR.scala │ ├── SparkKMeans.scala │ ├── SparkLR.scala │ ├── SparkPageRank.scala │ ├── SparkPi.scala │ ├── SparkTC.scala │ ├── SparkTachyonHdfsLR.scala │ └── SparkTachyonPi.scala └── pretty └── examples ├── Aggregate.scala ├── Coalesce.scala ├── CogroupPair.scala └── GroupByKeyPair.scala /.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/.cache -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | SparkLearning 4 | 5 | 6 | 7 | 8 | 9 | org.scala-ide.sdt.core.scalabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.scala-ide.sdt.core.scalanature 16 | org.eclipse.jdt.core.javanature 17 | 18 | 19 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | 
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /.settings/org.scala-ide.sdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | organizeimports.expandcollapse=expand 3 | organizeimports.groups=java$scala$org$com 4 | organizeimports.scalapackage=false 5 | organizeimports.wildcards=scalaz$scalaz.Scalaz 6 | -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/.DS_Store -------------------------------------------------------------------------------- /src/api/examples/Cartesian.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Cartesian { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Cartesian Test") 8 | 9 | val x = sc.parallelize(List(1, 2, 3, 4, 5)) 10 | val y = sc.parallelize(List(6, 7, 8, 9, 10)) 11 | 12 | println(x ++ y ++ x) 13 | val result = x.cartesian(y) 14 | //result.collect 15 | result.foreach(println) 16 | } 17 | } -------------------------------------------------------------------------------- /src/api/examples/Checkpoint.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Checkpoint { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Checkpoint Test") 8 | 9 | sc.setCheckpointDir("/Users/xulijie/Documents/data/checkpoint") 10 | val a = sc.parallelize(1 to 4, 2) 11 | a.checkpoint 12 | a.count 13 | } 14 | } -------------------------------------------------------------------------------- /src/api/examples/Coalesce.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Coalesce { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Coalesce Test") 8 | 9 | val y = sc.parallelize(1 to 10, 10) 10 | 11 | y.foreach(println) 12 | 13 | val z = y.coalesce(2, true) 14 | 15 | z.foreach(println) 16 | } 17 | } -------------------------------------------------------------------------------- /src/api/examples/Cogroup.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object Cogroup { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "Cogroup Test") 9 | 10 | val a = sc.parallelize(List(1, 2, 1, 3), 2) 11 | val b = sc.parallelize(List(1, 2, 3, 4, 5, 6), 3) 12 | val d 
= a.map((_, "b")) 13 | //b.foreach(println) 14 | // output: 15 | // (1,b) 16 | // (2,b) 17 | // (1,b) 18 | // (3,b) 19 | val e = b.map((_, "c")) 20 | //c.foreach(println) 21 | // output: 22 | // (1,c) 23 | // (2,c) 24 | // (1,c) 25 | // (3,c) 26 | 27 | //val result = b.cogroup(c) 28 | val result = d.cogroup(e, 4) 29 | result.foreach(println) 30 | println(result.toDebugString) 31 | // output: 32 | // (1,(ArrayBuffer(b, b),ArrayBuffer(c, c))) 33 | // (3,(ArrayBuffer(b),ArrayBuffer(c))) 34 | // (2,(ArrayBuffer(b),ArrayBuffer(c))) 35 | 36 | /* 37 | * MappedValuesRDD[5] at cogroup at Cogroup.scala:28 (3 partitions) 38 | * CoGroupedRDD[4] at cogroup at Cogroup.scala:28 (3 partitions) 39 | * MappedRDD[2] at map at Cogroup.scala:12 (2 partitions) 40 | * ParallelCollectionRDD[0] at parallelize at Cogroup.scala:10 (2 partitions) 41 | * MappedRDD[3] at map at Cogroup.scala:19 (3 partitions) 42 | * ParallelCollectionRDD[1] at parallelize at Cogroup.scala:11 (3 partitions) 43 | * 44 | */ 45 | } 46 | } -------------------------------------------------------------------------------- /src/api/examples/Collect.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Collect { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Collect Test") 8 | 9 | val c = sc.parallelize(List("Gnu", "cat", "Rat", "Dog", "Gnu", "Rat"), 2) 10 | 11 | val result = c.collect 12 | result.foreach(println) 13 | } 14 | } -------------------------------------------------------------------------------- /src/api/examples/CollectAsMap.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object CollectAsMap { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "CollectAsMap Test") 9 | 10 | val a = sc.parallelize(List(1, 2, 1, 3), 1) 11 | val b = a.zip(a) 12 | 13 | val result = b.collectAsMap 14 | 15 | result.foreach(println) 16 | 17 | // output: 18 | // (2,2) 19 | // (1,1) 20 | // (3,3) 21 | } 22 | } -------------------------------------------------------------------------------- /src/api/examples/CombineByKey.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object CombineByKey { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "CombineByKey Test") 9 | 10 | val a = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3) 11 | val b = sc.parallelize(List(1, 1, 2, 2, 2, 1, 2, 2, 2), 3) 12 | val c = b.zip(a) 13 | 14 | val d = c.combineByKey(List(_), (x:List[String], y:String) 15 | => y :: x, (x:List[String], y:List[String]) => x ::: y) 16 | 17 | val result = d.collect 18 | result.foreach(println) 19 | println("RDD graph:\n" + d.toDebugString) 20 | } 21 | } -------------------------------------------------------------------------------- /src/api/examples/CountApproxDistinct.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object CountApproxDistinct { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "CountApproxDistinct 
Test") 9 | 10 | val a = sc.parallelize(1 to 10000, 20) 11 | val b = a++a++a++a++a 12 | 13 | val result = b.countApproxDistinct(0.1) 14 | println(result) 15 | //println(b.countApproxDistinct(0.05)) 16 | //println(b.countApproxDistinct(0.01)) 17 | //println(b.countApproxDistinct(0.001)) 18 | 19 | } 20 | } -------------------------------------------------------------------------------- /src/api/examples/GroupByKeyPair.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object GroupByKeyPair { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "GroupByKeyPair Test") 12 | val d = sc.parallelize(1 to 100, 10) 13 | 14 | val pairs = d.keyBy(x => x % 10) 15 | 16 | val result1 = pairs.groupByKey() 17 | //val result2 = pairs.groupByKey(3) 18 | //val result3 = pairs.groupByKey(new RangePartitioner(3, pairs)) 19 | 20 | println("Result 1:") 21 | result1.foreach(println) 22 | 23 | //println("Result 2:") 24 | //result2.foreach(println) 25 | 26 | //println("Result 3:") 27 | //result3.foreach(println) 28 | 29 | } 30 | } -------------------------------------------------------------------------------- /src/api/examples/IntersectionTest.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object IntersectionTest { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "Intersection Test") 12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3) 13 | val b = sc.parallelize(List(1, 2, 5, 6), 2) 14 | //val c = sc.parallelize(List(1, 2, 3), 1) 15 | 16 | val r = a.intersection(b) 17 | //r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 18 | 19 | println(r.toDebugString) 20 | // [PartitionIndex 1] 1 21 | // [PartitionIndex 2] 5 22 | // [PartitionIndex 2] 2 23 | } 24 | } -------------------------------------------------------------------------------- /src/api/examples/Sample.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | 6 | object Sample { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext("local", "Sample Test") 11 | val d = sc.parallelize(1 to 100, 10) 12 | 13 | val result1 = d.sample(false, 0.1, 0) 14 | val result2 = d.sample(true, 0.1, 0) 15 | 16 | println(result1.toDebugString) 17 | 18 | println("result 1:") 19 | result1.collect.foreach(x => print(x + " ")) 20 | println("\nresutl 2:") 21 | result2.collect.foreach(x => print(x + " ")) 22 | //result1.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 23 | //result2.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 24 | } 25 | } -------------------------------------------------------------------------------- /src/api/examples/Utils.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | class Utils { 4 | 5 | //def print(rdd: RDD[T]) 6 | } -------------------------------------------------------------------------------- /src/internals/IntersectionTest.scala: 
-------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object IntersectionTest { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "Intersection Test") 12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3) 13 | val b = sc.parallelize(List(1, 2, 5, 6), 2) 14 | 15 | 16 | val r = a.intersection(b) 17 | 18 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x)) 19 | b.foreachWith(i => i)((x, i) => println("[bIndex " + i + "] " + x)) 20 | r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 21 | 22 | println(r.toDebugString) 23 | 24 | /* 25 | [aIndex 0] 1 26 | [aIndex 0] 2 27 | 28 | [aIndex 1] 3 29 | [aIndex 1] 3 30 | 31 | [aIndex 2] 4 32 | [aIndex 2] 5 33 | 34 | [bIndex 0] 1 35 | [bIndex 0] 2 36 | 37 | [bIndex 1] 5 38 | [bIndex 1] 6 39 | 40 | [PartitionIndex 1] 1 41 | 42 | [PartitionIndex 2] 5 43 | [PartitionIndex 2] 2 44 | 45 | MappedRDD[7] at intersection at IntersectionTest.scala:16 (3 partitions) 46 | FilteredRDD[6] at intersection at IntersectionTest.scala:16 (3 partitions) 47 | MappedValuesRDD[5] at intersection at IntersectionTest.scala:16 (3 partitions) 48 | CoGroupedRDD[4] at intersection at IntersectionTest.scala:16 (3 partitions) 49 | MappedRDD[2] at intersection at IntersectionTest.scala:16 (3 partitions) 50 | ParallelCollectionRDD[0] at parallelize at IntersectionTest.scala:12 (3 partitions) 51 | MappedRDD[3] at intersection at IntersectionTest.scala:16 (2 partitions) 52 | ParallelCollectionRDD[1] at parallelize at IntersectionTest.scala:13 (2 partitions) 53 | */ 54 | } 55 | } -------------------------------------------------------------------------------- /src/internals/RepartitionTest2.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.HashPartitioner 6 | 7 | object RepartitionTest2 { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext("local", "repartition Test") 11 | val data = Array[(Int, Char)]((3, 'a'), (2, 'b'), 12 | (1, 'c'), (4, 'd')) 13 | val pairs1 = sc.parallelize(data, 3).partitionBy(new HashPartitioner(2)) 14 | 15 | pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x)) 16 | } 17 | } 18 | /* 19 | [pairs1-Index 0] (3,a) 20 | [pairs1-Index 0] (2,b) 21 | [pairs1-Index 0] (1,c) 22 | 23 | [pairs1-Index 1] (4,d) 24 | */ -------------------------------------------------------------------------------- /src/internals/broadcastTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | 6 | object broadcast { 7 | def main(args: Array[String]) { 8 | 9 | val bcName = "Http" 10 | val blockSize = "4096" 11 | 12 | System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." 
+ bcName + 13 | "BroadcastFactory") 14 | System.setProperty("spark.broadcast.blockSize", blockSize) 15 | val sparkConf = new SparkConf().setAppName("Broadcast Test").setMaster("local") 16 | 17 | val sc = new SparkContext(sparkConf) 18 | 19 | val slices = 2 20 | val num = 100 21 | 22 | val arr1 = new Array[Int](num) 23 | 24 | for (i <- 0 until arr1.length) { 25 | arr1(i) = i 26 | } 27 | 28 | val data = sc.makeRDD(List(1, 2, 3, 4, 5, 6), 2) 29 | 30 | val barr1 = sc.broadcast(arr1) 31 | val observedSizes = sc.parallelize(1 to 4, slices).map(_ => barr1.value.size) 32 | // Collect the small RDD so we can print the observed sizes locally. 33 | observedSizes.collect().foreach(i => println(i)) 34 | 35 | //println(barr1.value.size) 36 | //barr1.value.collect 37 | } 38 | } -------------------------------------------------------------------------------- /src/internals/cartesianTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object cartesianTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "cartesian Test") 10 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'), 11 | (3, 'c'), (4, 'd')) 12 | val pairs1 = sc.parallelize(data1, 2) 13 | 14 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B')) 15 | val pairs2 = sc.parallelize(data2, 2) 16 | 17 | val result = pairs1.cartesian(pairs2) 18 | 19 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x)) 20 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x)) 21 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 22 | 23 | //println(result.toDebugString) 24 | } 25 | } 26 | /* 27 | [pairs1-Index 0] (1,a) 28 | [pairs1-Index 0] (2,b) 29 | 30 | [pairs1-Index 1] (3,c) 31 | [pairs1-Index 1] (4,d) 32 | 33 | [pairs2-Index 0] (1,A) 34 | [pairs2-Index 1] (2,B) 35 | 36 | [PartitionIndex 0] ((1,a),(1,A)) 37 | [PartitionIndex 0] ((2,b),(1,A)) 38 | 39 | [PartitionIndex 1] ((1,a),(2,B)) 40 | [PartitionIndex 1] ((2,b),(2,B)) 41 | 42 | [PartitionIndex 2] ((3,c),(1,A)) 43 | [PartitionIndex 2] ((4,d),(1,A)) 44 | 45 | [PartitionIndex 3] ((3,c),(2,B)) 46 | [PartitionIndex 3] ((4,d),(2,B)) 47 | 48 | 49 | 50 | CartesianRDD[2] at cartesian at cartesianTest.scala:17 (4 partitions) 51 | ParallelCollectionRDD[0] at parallelize at cartesianTest.scala:12 (2 partitions) 52 | ParallelCollectionRDD[1] at parallelize at cartesianTest.scala:15 (2 partitions) 53 | 54 | */ 55 | 56 | -------------------------------------------------------------------------------- /src/internals/coalesceTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object coalesceTest { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Coalesce Test") 8 | 9 | //val y = sc.parallelize(1 to 10, 5) 10 | val y = sc.parallelize(List(1, 2, 3, 4, 5, 2, 5, 8, 3, 10), 5) 11 | // y.foreachWith(i => i)((x, i) => println("[yPartitionIndex " + i + "] " + x)) 12 | 13 | val z = y.coalesce(10, false) 14 | 15 | y.foreachWith(i => i)((x, i) => println("[yPartitionIndex " + i + "] " + x)) 16 | z.foreachWith(i => i)((x, i) => println("[zPartitionIndex " + i + "] " + x)) 17 | 18 | println(z.toDebugString) 19 | } 20 | } 21 | 22 | /* 23 | [yPartitionIndex 0] 1 24 | [yPartitionIndex 0] 2 25 | 26 | [yPartitionIndex 1] 3 27 | 
[yPartitionIndex 1] 4 28 | 29 | [yPartitionIndex 2] 5 30 | [yPartitionIndex 2] 6 31 | 32 | [yPartitionIndex 3] 7 33 | [yPartitionIndex 3] 8 34 | 35 | [yPartitionIndex 4] 9 36 | [yPartitionIndex 4] 10 37 | 38 | [zPartitionIndex 0] 1 39 | [zPartitionIndex 0] 2 40 | 41 | [zPartitionIndex 1] 3 42 | [zPartitionIndex 1] 4 43 | [zPartitionIndex 1] 5 44 | [zPartitionIndex 1] 6 45 | 46 | [zPartitionIndex 2] 7 47 | [zPartitionIndex 2] 8 48 | [zPartitionIndex 2] 9 49 | [zPartitionIndex 2] 10 50 | 51 | 52 | CoalescedRDD[1] at coalesce at coalesceTest.scala:13 (3 partitions) 53 | ParallelCollectionRDD[0] at parallelize at coalesceTest.scala:9 (5 partitions) 54 | 55 | 56 | [zPartitionIndex 0] 6 57 | [zPartitionIndex 0] 7 58 | [zPartitionIndex 0] 9 59 | 60 | [zPartitionIndex 1] 1 61 | [zPartitionIndex 1] 3 62 | [zPartitionIndex 1] 8 63 | [zPartitionIndex 1] 10 64 | 65 | [zPartitionIndex 2] 2 66 | [zPartitionIndex 2] 4 67 | [zPartitionIndex 2] 5 68 | 69 | 70 | 71 | 72 | MappedRDD[4] at coalesce at coalesceTest.scala:13 (3 partitions) 73 | CoalescedRDD[3] at coalesce at coalesceTest.scala:13 (3 partitions) 74 | ShuffledRDD[2] at coalesce at coalesceTest.scala:13 (3 partitions) 75 | MapPartitionsRDD[1] at coalesce at coalesceTest.scala:13 (5 partitions) 76 | ParallelCollectionRDD[0] at parallelize at coalesceTest.scala:9 (5 partitions) 77 | 78 | 79 | */ -------------------------------------------------------------------------------- /src/internals/cogroupTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object cogroupTest { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "cogroup Test") 12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3).map(x => (x, 'a')) 13 | val b = sc.parallelize(List(1, 2, 5, 6), 2).map(y => (y, 'b')) 14 | 15 | 16 | val r = a.cogroup(b) 17 | 18 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x)) 19 | b.foreachWith(i => i)((x, i) => println("[bIndex " + i + "] " + x)) 20 | r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 21 | 22 | println(r.toDebugString) 23 | 24 | /* 25 | [aIndex 0] (1,a) 26 | [aIndex 0] (2,a) 27 | 28 | [aIndex 1] (3,a) 29 | [aIndex 1] (3,a) 30 | 31 | [aIndex 2] (4,a) 32 | [aIndex 2] (5,a) 33 | 34 | [bIndex 0] (1,b) 35 | [bIndex 0] (2,b) 36 | 37 | [bIndex 1] (5,b) 38 | [bIndex 1] (6,b) 39 | 40 | [PartitionIndex 0] (6,(ArrayBuffer(),ArrayBuffer(b))) 41 | [PartitionIndex 0] (3,(ArrayBuffer(a, a),ArrayBuffer())) 42 | 43 | [PartitionIndex 1] (4,(ArrayBuffer(a),ArrayBuffer())) 44 | [PartitionIndex 1] (1,(ArrayBuffer(a),ArrayBuffer(b))) 45 | 46 | 47 | [PartitionIndex 2] (5,(ArrayBuffer(a),ArrayBuffer(b))) 48 | [PartitionIndex 2] (2,(ArrayBuffer(a),ArrayBuffer(b))) 49 | 50 | MappedValuesRDD[5] at cogroup at cogroupTest.scala:16 (3 partitions) 51 | CoGroupedRDD[4] at cogroup at cogroupTest.scala:16 (3 partitions) 52 | MappedRDD[1] at map at cogroupTest.scala:12 (3 partitions) 53 | ParallelCollectionRDD[0] at parallelize at cogroupTest.scala:12 (3 partitions) 54 | MappedRDD[3] at map at cogroupTest.scala:13 (2 partitions) 55 | ParallelCollectionRDD[2] at parallelize at cogroupTest.scala:13 (2 partitions) 56 | */ 57 | } 58 | } -------------------------------------------------------------------------------- /src/internals/complexStages.scala: 
-------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.HashPartitioner 6 | 7 | 8 | object complexStagesTest { 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "complexStages Test") 12 | 13 | 14 | val data1 = Array[(Int, Char)]( 15 | (1, 'a'), (2, 'b'), 16 | (3, 'c'), (4, 'd'), 17 | (5, 'e'), (3, 'f'), 18 | (2, 'g'), (1, 'h')) 19 | val rangePairs1 = sc.parallelize(data1, 3) 20 | 21 | val hashPairs1 = rangePairs1.partitionBy(new HashPartitioner(3)) 22 | 23 | 24 | val data2 = Array[(Int, String)]((1, "A"), (2, "B"), 25 | (3, "C"), (4, "D")) 26 | 27 | val pairs2 = sc.parallelize(data2, 2) 28 | val rangePairs2 = pairs2.map(x => (x._1, x._2.charAt(0))) 29 | 30 | 31 | val data3 = Array[(Int, Char)]((1, 'X'), (2, 'Y')) 32 | val rangePairs3 = sc.parallelize(data3, 2) 33 | 34 | 35 | val rangePairs = rangePairs2.union(rangePairs3) 36 | 37 | 38 | val result = hashPairs1.join(rangePairs) 39 | 40 | result.foreachWith(i => i)((x, i) => println("[result " + i + "] " + x)) 41 | 42 | println(result.toDebugString) 43 | } 44 | } -------------------------------------------------------------------------------- /src/internals/distinctTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object distinctTest { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "distinct test") 9 | 10 | val pairs = sc.parallelize(List(1, 2, 2, 3, 2, 1, 4, 5), 3) 11 | 12 | val result = pairs.distinct(2) 13 | 14 | // output 15 | // [PartitionIndex 0] 1 16 | // [PartitionIndex 0] 2 17 | 18 | // [PartitionIndex 1] 2 19 | // [PartitionIndex 1] 3 20 | // [PartitionIndex 1] 2 21 | 22 | // [PartitionIndex 2] 1 23 | // [PartitionIndex 2] 4 24 | // [PartitionIndex 2] 5 25 | 26 | pairs.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 27 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 28 | 29 | // output 30 | // [PartitionIndex 0] 4 31 | // [PartitionIndex 0] 2 32 | 33 | // [PartitionIndex 1] 1 34 | // [PartitionIndex 1] 3 35 | // [PartitionIndex 1] 5 36 | 37 | println(result.toDebugString) 38 | } 39 | 40 | /* 41 | MappedRDD[5] at distinct at distinctTest.scala:12 (2 partitions) 42 | MapPartitionsRDD[4] at distinct at distinctTest.scala:12 (2 partitions) 43 | ShuffledRDD[3] at distinct at distinctTest.scala:12 (2 partitions) 44 | MapPartitionsRDD[2] at distinct at distinctTest.scala:12 (3 partitions) 45 | MappedRDD[1] at distinct at distinctTest.scala:12 (3 partitions) 46 | ParallelCollectionRDD[0] at parallelize at distinctTest.scala:10 (3 partitions) 47 | * 48 | */ 49 | } -------------------------------------------------------------------------------- /src/internals/groupByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | 7 | object groupByKeyTest { 8 | 9 | def main(args: Array[String]) { 10 | val conf = new SparkConf().setAppName("GroupByKey").setMaster("local") 11 | val sc = new SparkContext(conf) 12 | sc.setCheckpointDir("/Users/xulijie/Documents/data/checkpoint") 13 | 14 | val data = Array[(Int, Char)]((1, 'a'), 
(2, 'b'), 15 | (3, 'c'), (4, 'd'), 16 | (5, 'e'), (3, 'f'), 17 | (2, 'g'), (1, 'h') 18 | 19 | ) 20 | val pairs = sc.parallelize(data, 3) 21 | 22 | pairs.checkpoint 23 | pairs.count 24 | 25 | val result = pairs.groupByKey(2) 26 | 27 | // output: 28 | //pairs.foreachWith(i => i)((x, i) => println("[dataPartitionIndex " + i + "] " + x)) 29 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 30 | 31 | println(result.toDebugString) 32 | 33 | /* 34 | [dataPartitionIndex 0] (1,a) 35 | [dataPartitionIndex 0] (2,b) 36 | 37 | [dataPartitionIndex 1] (3,c) 38 | [dataPartitionIndex 1] (4,d) 39 | [dataPartitionIndex 1] (5,e) 40 | 41 | [dataPartitionIndex 2] (3,f) 42 | [dataPartitionIndex 2] (2,g) 43 | [dataPartitionIndex 2] (1,h) 44 | 45 | [PartitionIndex 0] (4,ArrayBuffer(d)) 46 | [PartitionIndex 0] (2,ArrayBuffer(b, g)) 47 | 48 | [PartitionIndex 1] (1,ArrayBuffer(a, h)) 49 | [PartitionIndex 1] (3,ArrayBuffer(c, f)) 50 | [PartitionIndex 1] (5,ArrayBuffer(e)) 51 | 52 | MappedValuesRDD[3] at groupByKey at groupByKeyTest.scala:19 (2 partitions) 53 | MapPartitionsRDD[2] at groupByKey at groupByKeyTest.scala:19 (2 partitions) 54 | ShuffledRDD[1] at groupByKey at groupByKeyTest.scala:19 (2 partitions) 55 | ParallelCollectionRDD[0] at parallelize at groupByKeyTest.scala:17 (3 partitions) 56 | */ 57 | } 58 | } -------------------------------------------------------------------------------- /src/internals/hashjoinTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.HashPartitioner 6 | 7 | object hashjoinTest { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext("local", "hashjoin Test") 11 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'), 12 | (3, 'c'), (4, 'd'), 13 | (5, 'e'), (3, 'f'), 14 | (2, 'g'), (1, 'h')) 15 | val pairs1 = sc.parallelize(data1, 3).partitionBy(new HashPartitioner(3)) 16 | 17 | 18 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B'), 19 | (3, 'C'), (4, 'D')) 20 | val pairs2 = sc.parallelize(data2, 2) 21 | 22 | val result = pairs1.join(pairs2) 23 | 24 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x)) 25 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x)) 26 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 27 | 28 | println(result.toDebugString) 29 | /* 30 | [pairs1-Index 0] (1,a) 31 | [pairs1-Index 0] (2,b) 32 | 33 | [pairs1-Index 1] (3,c) 34 | [pairs1-Index 1] (4,d) 35 | [pairs1-Index 1] (5,e) 36 | 37 | [pairs1-Index 2] (3,f) 38 | [pairs1-Index 2] (2,g) 39 | [pairs1-Index 2] (1,h) 40 | 41 | [pairs2-Index 0] (1,A) 42 | [pairs2-Index 0] (2,B) 43 | 44 | [pairs2-Index 1] (3,C) 45 | [pairs2-Index 1] (4,D) 46 | 47 | [PartitionIndex 0] (3,(c,C)) 48 | [PartitionIndex 0] (3,(f,C)) 49 | 50 | [PartitionIndex 1] (4,(d,D)) 51 | [PartitionIndex 1] (1,(a,A)) 52 | [PartitionIndex 1] (1,(h,A)) 53 | 54 | [PartitionIndex 2] (2,(b,B)) 55 | [PartitionIndex 2] (2,(g,B)) 56 | 57 | FlatMappedValuesRDD[4] at join at joinTest.scala:20 (3 partitions) 58 | MappedValuesRDD[3] at join at joinTest.scala:20 (3 partitions) 59 | CoGroupedRDD[2] at join at joinTest.scala:20 (3 partitions) 60 | ParallelCollectionRDD[0] at parallelize at joinTest.scala:14 (3 partitions) 61 | ParallelCollectionRDD[1] at parallelize at joinTest.scala:18 (2 partitions) 62 | 63 | */ 64 | } 65 | } 
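// Editor's sketch (not part of the original repository): hashjoinTest above pre-partitions
// pairs1 with a HashPartitioner(3) before the join, while joinTest below joins two
// unpartitioned RDDs. The minimal example here, assuming only the standard Spark RDD API
// already used throughout these files, shows how to check whether join() reuses an existing
// partitioner, in which case the pre-partitioned side is not shuffled a second time.
object partitionerCheckSketch {
  import org.apache.spark.{HashPartitioner, SparkContext}
  import org.apache.spark.SparkContext._

  def main(args: Array[String]) {
    val sc = new SparkContext("local", "partitioner check sketch")

    val left = sc.parallelize(Seq((1, 'a'), (2, 'b'), (3, 'c')), 3)
      .partitionBy(new HashPartitioner(3)) // shuffled once, here
    val right = sc.parallelize(Seq((1, 'A'), (2, 'B')), 2)

    println(left.partitioner)  // Some(HashPartitioner) - already hash-partitioned
    println(right.partitioner) // None - will be shuffled by the join

    val joined = left.join(right) // picks up left's HashPartitioner(3)
    println(joined.partitioner)   // Some(HashPartitioner) covering 3 partitions
    println(joined.toDebugString) // lineage of the join, as in the examples above
  }
}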
-------------------------------------------------------------------------------- /src/internals/joinTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.HashPartitioner 6 | 7 | object joinTest { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext("local", "join Test") 11 | 12 | 13 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'), 14 | (3, 'c'), (4, 'd'), 15 | (5, 'e'), (3, 'f'), 16 | (2, 'g'), (1, 'h')) 17 | val pairs1 = sc.parallelize(data1, 3) 18 | 19 | 20 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B'), 21 | (3, 'C'), (4, 'D')) 22 | val pairs2 = sc.parallelize(data2, 2) 23 | 24 | 25 | val result = pairs1.join(pairs2) 26 | 27 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x)) 28 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x)) 29 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 30 | 31 | println(result.toDebugString) 32 | 33 | /* 34 | [pairs1-Index 0] (1,a) 35 | [pairs1-Index 0] (2,b) 36 | 37 | [pairs1-Index 1] (3,c) 38 | [pairs1-Index 1] (4,d) 39 | [pairs1-Index 1] (5,e) 40 | 41 | [pairs1-Index 2] (3,f) 42 | [pairs1-Index 2] (2,g) 43 | [pairs1-Index 2] (1,h) 44 | 45 | [pairs2-Index 0] (1,A) 46 | [pairs2-Index 0] (2,B) 47 | 48 | [pairs2-Index 1] (3,C) 49 | [pairs2-Index 1] (4,D) 50 | 51 | [PartitionIndex 0] (3,(c,C)) 52 | [PartitionIndex 0] (3,(f,C)) 53 | 54 | [PartitionIndex 1] (4,(d,D)) 55 | [PartitionIndex 1] (1,(a,A)) 56 | [PartitionIndex 1] (1,(h,A)) 57 | 58 | [PartitionIndex 2] (2,(b,B)) 59 | [PartitionIndex 2] (2,(g,B)) 60 | 61 | FlatMappedValuesRDD[4] at join at joinTest.scala:20 (3 partitions) 62 | MappedValuesRDD[3] at join at joinTest.scala:20 (3 partitions) 63 | CoGroupedRDD[2] at join at joinTest.scala:20 (3 partitions) 64 | ParallelCollectionRDD[0] at parallelize at joinTest.scala:14 (3 partitions) 65 | ParallelCollectionRDD[1] at parallelize at joinTest.scala:18 (2 partitions) 66 | 67 | */ 68 | } 69 | } -------------------------------------------------------------------------------- /src/internals/pipeTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object pipeTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "cartesian Test") 10 | 11 | val a = sc.parallelize(1 to 9, 3) 12 | val result = a.pipe("head -n 2") 13 | 14 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x)) 15 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 16 | 17 | println(result.toDebugString) 18 | } 19 | } 20 | /* 21 | [PartitionIndex 0] 1 22 | [PartitionIndex 0] 2 23 | 24 | [PartitionIndex 1] 4 25 | [PartitionIndex 1] 5 26 | 27 | [PartitionIndex 2] 7 28 | [PartitionIndex 2] 8 29 | 30 | 31 | 32 | PipedRDD[1] at pipe at pipeTest.scala:12 (3 partitions) 33 | ParallelCollectionRDD[0] at parallelize at pipeTest.scala:11 (3 partitions) 34 | 35 | */ 36 | 37 | -------------------------------------------------------------------------------- /src/internals/reduceByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | 
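// Editor's sketch (not part of the original file): reduceByKeyTest below turns
// ("A",1)-style pairs into per-key counts with reduceByKey(_ + _, 2). For comparison,
// and assuming only the imports above, the same result can be written with
// groupByKey + mapValues; reduceByKey is preferable because it combines values on the
// map side before the shuffle, while groupByKey ships every individual pair.
object reduceByKeyVsGroupByKeySketch {
  def run(sc: SparkContext) {
    val pairs = sc.parallelize(Seq(("A", 1), ("B", 1), ("C", 1), ("A", 1)), 3)

    val reduced = pairs.reduceByKey(_ + _, 2)           // map-side combine, then shuffle
    val grouped = pairs.groupByKey(2).mapValues(_.sum)  // shuffles every (key, 1) pair

    reduced.collect.foreach(println) // e.g. (A,2) (B,1) (C,1)
    grouped.collect.foreach(println) // same counts, but more data moved in the shuffle
  }
}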
object reduceByKeyTest { 7 | 8 | def main(args: Array[String]) { 9 | val sc = new SparkContext("local", "ReduceByKey Test") 10 | val data1 = Array[(String, Int)](("A", 1), ("B", 1), 11 | ("C", 1), ("B", 1), 12 | ("C", 1), ("D", 1), 13 | ("C", 1), ("A", 1)) 14 | val pairs = sc.parallelize(data1, 3) 15 | 16 | // pairs.foreachWith(i => i)((x, i) => println("[pPartitionIndex " + i + "] " + x)) 17 | 18 | // [pPartitionIndex 0] (A,1) 19 | // [pPartitionIndex 0] (B,1) 20 | 21 | // [pPartitionIndex 1] (C,1) 22 | // [pPartitionIndex 1] (B,1) 23 | // [pPartitionIndex 1] (C,1) 24 | 25 | // [pPartitionIndex 2] (D,1) 26 | // [pPartitionIndex 2] (C,1) 27 | // [pPartitionIndex 2] (A,1) 28 | 29 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 30 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 31 | val result = pairs.reduceByKey(_ + _, 2) 32 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 33 | 34 | println(result.toDebugString) 35 | 36 | // output 37 | // [PartitionIndex 0] (B,2) 38 | // [PartitionIndex 0] (D,1) 39 | // [PartitionIndex 1] (A,2) 40 | // [PartitionIndex 1] (C,3) 41 | 42 | /* 43 | MapPartitionsRDD[3] at reduceByKey at reduceByKeyTest.scala:17 (2 partitions) 44 | ShuffledRDD[2] at reduceByKey at reduceByKeyTest.scala:17 (2 partitions) 45 | MapPartitionsRDD[1] at reduceByKey at reduceByKeyTest.scala:17 (3 partitions) 46 | ParallelCollectionRDD[0] at parallelize at reduceByKeyTest.scala:14 (3 partitions) 47 | */ 48 | } 49 | } -------------------------------------------------------------------------------- /src/internals/repartitionTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object repartitionTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "Coalesce Test") 10 | val y = sc.parallelize(1 to 100, 5) 11 | 12 | //y.foreach(println) 13 | 14 | val z = y.repartition(2) 15 | 16 | val r = z.takeOrdered(7) 17 | z.foreach(println) 18 | } 19 | } -------------------------------------------------------------------------------- /src/internals/sortByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object sortByKeyTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "sortByKey Test") 10 | val data1 = Array[(Char, Int)](('A', 5), ('B', 4), 11 | ('C', 3), ('B', 2), 12 | ('C', 1), ('D', 2), 13 | ('C', 3), ('A', 4)) 14 | val pairs = sc.parallelize(data1, 3) 15 | 16 | val result = pairs.sortByKey(true, 2) 17 | pairs.foreachWith(i => i)((x, i) => println("[pairsPartitionIndex " + i + "] " + x)) 18 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 19 | 20 | println(result.toDebugString) 21 | } 22 | 23 | } 24 | 25 | /* 26 | [pairsPartitionIndex 0] (A,5) 27 | [pairsPartitionIndex 0] (B,4) 28 | 29 | [pairsPartitionIndex 1] (C,3) 30 | [pairsPartitionIndex 1] (B,2) 31 | [pairsPartitionIndex 1] (C,1) 32 | 33 | [pairsPartitionIndex 2] (D,2) 34 | [pairsPartitionIndex 2] (C,3) 35 | [pairsPartitionIndex 2] (A,4) 36 | 37 | [PartitionIndex 0] (A,5) 38 | [PartitionIndex 0] (A,4) 39 | [PartitionIndex 0] (B,4) 40 | [PartitionIndex 0] (B,2) 41 | 42 | [PartitionIndex 1] (C,3) 43 | [PartitionIndex 1] (C,1) 44 | 
[PartitionIndex 1] (C,3) 45 | [PartitionIndex 1] (D,2) 46 | 47 | MapPartitionsRDD[4] at sortByKey at sortByKeyTest.scala:16 (2 partitions) 48 | ShuffledRDD[3] at sortByKey at sortByKeyTest.scala:16 (2 partitions) 49 | ParallelCollectionRDD[0] at parallelize at sortByKeyTest.scala:14 (3 partitions) 50 | */ 51 | -------------------------------------------------------------------------------- /src/local/examples/Aggregate.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Aggregate { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "AggregateAction Test") 10 | val data = Array[(String, Int)](("A1", 1), ("A2", 2), 11 | ("B1", 3), ("B2", 4), 12 | ("C1", 5), ("C2", 6)) 13 | 14 | val pairs = sc.parallelize(data, 3) 15 | 16 | // output: 17 | // (A1,1)(A2,2) 18 | // (B1,3)(B2,4) 19 | // (C1,5)(C2,6) 20 | pairs.foreach(print) 21 | 22 | val result = pairs.aggregate(("", 0))((U, T) => (U._1 + T._1, U._2 + T._2), (U, T) => 23 | ("[" + U._1 + T._1 + "] ", U._2 + T._2)) 24 | 25 | // output ([[[A1A2] B1B2] C1C2] ,21) 26 | println(result) 27 | } 28 | } -------------------------------------------------------------------------------- /src/local/examples/AggregateOrder.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object AggregateOrder { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "AggregateOrder Test") 10 | val data = List("12", "23", "345", "4567") 11 | 12 | val pairs = sc.parallelize(data, 2) 13 | pairs.foreach(x => println(x.length)) 14 | 15 | //val result = pairs.aggregate("")((x,y) => math.min(x.length, y.length).toString, (x,y) => x + y) 16 | 17 | val result2 = pairs.aggregate("")((x,y) => "[" + x.length + "," + y.length + "] ", (x,y) => x + y) 18 | 19 | result2.foreach(println) 20 | println(result2) 21 | 22 | } 23 | } -------------------------------------------------------------------------------- /src/local/examples/Cartesian.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Cartesian { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Cartesian Test") 8 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2), 9 | ("B1", 3), ("B2", 4), 10 | ("C1", 5), ("C1", 6)) 11 | 12 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8), 13 | ("B1", 9), ("C1", 0)) 14 | val pairs1 = sc.parallelize(data1, 3) 15 | val pairs2 = sc.parallelize(data2, 2) 16 | 17 | val resultRDD = pairs1.cartesian(pairs2) 18 | 19 | resultRDD.foreach(println) 20 | 21 | /* 22 | * Output of task1: 23 | * ((A1,1),(A1,7)) 24 | * ((A1,1),(A2,8)) 25 | * ((A2,2),(A1,7)) 26 | * ((A2,2),(A2,8)) 27 | * Output of task2: 28 | * ((A1,1),(B1,9)) 29 | * ((A1,1),(C1,0)) 30 | * ((A2,2),(B1,9)) 31 | * ((A2,2),(C1,0)) 32 | * Output of task3: 33 | * ((B1,3),(A1,7)) 34 | * ((B1,3),(A2,8)) 35 | * ((B2,4),(A1,7)) 36 | * ((B2,4),(A2,8)) 37 | * Output of task4: 38 | * ((B1,3),(B1,9)) 39 | * ((B1,3),(C1,0)) 40 | * ((B2,4),(B1,9)) 41 | * ((B2,4),(C1,0)) 42 | * Output of task5: 43 | * ((C1,5),(A1,7)) 44 | * ((C1,5),(A2,8)) 45 | * ((C1,6),(A1,7)) 46 | * ((C1,6),(A2,8)) 47 | * Output of task6: 48 | * ((C1,5),(B1,9)) 49 | * ((C1,5),(C1,0)) 50 | * ((C1,6),(B1,9)) 51 | * ((C1,6),(C1,0)) 52 | */ 53 | 54 | } 
55 | } -------------------------------------------------------------------------------- /src/local/examples/CollectAsMap.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object CollectAsMap { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "CollectAsMap Test") 10 | val data = Array[(String, Int)](("A", 1), ("B", 2), 11 | ("B", 3), ("C", 4), 12 | ("C", 5), ("C", 6)) 13 | 14 | // as same as "val pairs = sc.parallelize(data, 3)" 15 | val pairs = sc.makeRDD(data, 3) 16 | 17 | val result = pairs.collectAsMap 18 | 19 | // output Map(A -> 1, C -> 6, B -> 3) 20 | print(result) 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /src/local/examples/FlatMap.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object FlatMap { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "FlatMap Test") 9 | val data = Array[(String, Int)](("A", 1), ("B", 2), 10 | ("B", 3), ("C", 4), 11 | ("C", 5), ("C", 6) 12 | ) 13 | val pairs = sc.makeRDD(data, 3) 14 | 15 | val result = pairs.flatMap(T => (T._1 + T._2)) 16 | 17 | result.foreach(println) 18 | 19 | } 20 | } -------------------------------------------------------------------------------- /src/local/examples/GenerateRandomText.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import java.io.File 4 | import java.io.FileWriter 5 | import java.util.Random 6 | 7 | object GenerateRandomText { 8 | 9 | def main(args: Array[String]) { 10 | val outputPath = new File("/Users/xulijie/Documents/data/RandomText/randomText-10MB.txt") 11 | if(!outputPath.getParentFile().exists()) 12 | outputPath.getParentFile.mkdirs() 13 | 14 | val writer = new FileWriter(outputPath) 15 | val ranGen = new Random 16 | 17 | 18 | while (outputPath.length() < 10 * 1024 * 1024) { 19 | var index = Math.abs(ranGen.nextGaussian() * 1000) % 1000 20 | writer.write(words.apply(index.toInt)) 21 | writer.write(" ") 22 | index = Math.abs(ranGen.nextGaussian()) % 1000 23 | writer.write(words.apply(index.toInt)) 24 | writer.write("\n") 25 | } 26 | writer.close() 27 | } 28 | 29 | def printToFile(f: java.io.File)(op: java.io.PrintWriter => Unit) { 30 | val p = new java.io.PrintWriter(f) 31 | try { op(p) } finally { p.close() } 32 | } 33 | 34 | /** 35 | * A random list of 100 words from /usr/share/dict/words 36 | */ 37 | private val words = Array[String]( 38 | "diurnalness", "Homoiousian", 39 | "spiranthic", "tetragynian", 40 | "silverhead", "ungreat", 41 | "lithograph", "exploiter", 42 | "physiologian", "by", 43 | "hellbender", "Filipendula", 44 | "undeterring", "antiscolic", 45 | "pentagamist", "hypoid", 46 | "cacuminal", "sertularian", 47 | "schoolmasterism", "nonuple", 48 | "gallybeggar", "phytonic", 49 | "swearingly", "nebular", 50 | "Confervales", "thermochemically", 51 | "characinoid", "cocksuredom", 52 | "fallacious", "feasibleness", 53 | "debromination", "playfellowship", 54 | "tramplike", "testa", 55 | "participatingly", "unaccessible", 56 | "bromate", "experientialist", 57 | "roughcast", "docimastical", 58 | "choralcelo", "blightbird", 59 | "peptonate", "sombreroed", 60 | "unschematized", "antiabolitionist", 61 | "besagne", "mastication", 62 | "bromic", 
"sviatonosite", 63 | "cattimandoo", "metaphrastical", 64 | "endotheliomyoma", "hysterolysis", 65 | "unfulminated", "Hester", 66 | "oblongly", "blurredness", 67 | "authorling", "chasmy", 68 | "Scorpaenidae", "toxihaemia", 69 | "Dictograph", "Quakerishly", 70 | "deaf", "timbermonger", 71 | "strammel", "Thraupidae", 72 | "seditious", "plerome", 73 | "Arneb", "eristically", 74 | "serpentinic", "glaumrie", 75 | "socioromantic", "apocalypst", 76 | "tartrous", "Bassaris", 77 | "angiolymphoma", "horsefly", 78 | "kenno", "astronomize", 79 | "euphemious", "arsenide", 80 | "untongued", "parabolicness", 81 | "uvanite", "helpless", 82 | "gemmeous", "stormy", 83 | "templar", "erythrodextrin", 84 | "comism", "interfraternal", 85 | "preparative", "parastas", 86 | "frontoorbital", "Ophiosaurus", 87 | "diopside", "serosanguineous", 88 | "ununiformly", "karyological", 89 | "collegian", "allotropic", 90 | "depravity", "amylogenesis", 91 | "reformatory", "epidymides", 92 | "pleurotropous", "trillium", 93 | "dastardliness", "coadvice", 94 | "embryotic", "benthonic", 95 | "pomiferous", "figureheadship", 96 | "Megaluridae", "Harpa", 97 | "frenal", "commotion", 98 | "abthainry", "cobeliever", 99 | "manilla", "spiciferous", 100 | "nativeness", "obispo", 101 | "monilioid", "biopsic", 102 | "valvula", "enterostomy", 103 | "planosubulate", "pterostigma", 104 | "lifter", "triradiated", 105 | "venialness", "tum", 106 | "archistome", "tautness", 107 | "unswanlike", "antivenin", 108 | "Lentibulariaceae", "Triphora", 109 | "angiopathy", "anta", 110 | "Dawsonia", "becomma", 111 | "Yannigan", "winterproof", 112 | "antalgol", "harr", 113 | "underogating", "ineunt", 114 | "cornberry", "flippantness", 115 | "scyphostoma", "approbation", 116 | "Ghent", "Macraucheniidae", 117 | "scabbiness", "unanatomized", 118 | "photoelasticity", "eurythermal", 119 | "enation", "prepavement", 120 | "flushgate", "subsequentially", 121 | "Edo", "antihero", 122 | "Isokontae", "unforkedness", 123 | "porriginous", "daytime", 124 | "nonexecutive", "trisilicic", 125 | "morphiomania", "paranephros", 126 | "botchedly", "impugnation", 127 | "Dodecatheon", "obolus", 128 | "unburnt", "provedore", 129 | "Aktistetae", "superindifference", 130 | "Alethea", "Joachimite", 131 | "cyanophilous", "chorograph", 132 | "brooky", "figured", 133 | "periclitation", "quintette", 134 | "hondo", "ornithodelphous", 135 | "unefficient", "pondside", 136 | "bogydom", "laurinoxylon", 137 | "Shiah", "unharmed", 138 | "cartful", "noncrystallized", 139 | "abusiveness", "cromlech", 140 | "japanned", "rizzomed", 141 | "underskin", "adscendent", 142 | "allectory", "gelatinousness", 143 | "volcano", "uncompromisingly", 144 | "cubit", "idiotize", 145 | "unfurbelowed", "undinted", 146 | "magnetooptics", "Savitar", 147 | "diwata", "ramosopalmate", 148 | "Pishquow", "tomorn", 149 | "apopenptic", "Haversian", 150 | "Hysterocarpus", "ten", 151 | "outhue", "Bertat", 152 | "mechanist", "asparaginic", 153 | "velaric", "tonsure", 154 | "bubble", "Pyrales", 155 | "regardful", "glyphography", 156 | "calabazilla", "shellworker", 157 | "stradametrical", "havoc", 158 | "theologicopolitical", "sawdust", 159 | "diatomaceous", "jajman", 160 | "temporomastoid", "Serrifera", 161 | "Ochnaceae", "aspersor", 162 | "trailmaking", "Bishareen", 163 | "digitule", "octogynous", 164 | "epididymitis", "smokefarthings", 165 | "bacillite", "overcrown", 166 | "mangonism", "sirrah", 167 | "undecorated", "psychofugal", 168 | "bismuthiferous", "rechar", 169 | "Lemuridae", "frameable", 170 | "thiodiazole", "Scanic", 171 
| "sportswomanship", "interruptedness", 172 | "admissory", "osteopaedion", 173 | "tingly", "tomorrowness", 174 | "ethnocracy", "trabecular", 175 | "vitally", "fossilism", 176 | "adz", "metopon", 177 | "prefatorial", "expiscate", 178 | "diathermacy", "chronist", 179 | "nigh", "generalizable", 180 | "hysterogen", "aurothiosulphuric", 181 | "whitlowwort", "downthrust", 182 | "Protestantize", "monander", 183 | "Itea", "chronographic", 184 | "silicize", "Dunlop", 185 | "eer", "componental", 186 | "spot", "pamphlet", 187 | "antineuritic", "paradisean", 188 | "interruptor", "debellator", 189 | "overcultured", "Florissant", 190 | "hyocholic", "pneumatotherapy", 191 | "tailoress", "rave", 192 | "unpeople", "Sebastian", 193 | "thermanesthesia", "Coniferae", 194 | "swacking", "posterishness", 195 | "ethmopalatal", "whittle", 196 | "analgize", "scabbardless", 197 | "naught", "symbiogenetically", 198 | "trip", "parodist", 199 | "columniform", "trunnel", 200 | "yawler", "goodwill", 201 | "pseudohalogen", "swangy", 202 | "cervisial", "mediateness", 203 | "genii", "imprescribable", 204 | "pony", "consumptional", 205 | "carposporangial", "poleax", 206 | "bestill", "subfebrile", 207 | "sapphiric", "arrowworm", 208 | "qualminess", "ultraobscure", 209 | "thorite", "Fouquieria", 210 | "Bermudian", "prescriber", 211 | "elemicin", "warlike", 212 | "semiangle", "rotular", 213 | "misthread", "returnability", 214 | "seraphism", "precostal", 215 | "quarried", "Babylonism", 216 | "sangaree", "seelful", 217 | "placatory", "pachydermous", 218 | "bozal", "galbulus", 219 | "spermaphyte", "cumbrousness", 220 | "pope", "signifier", 221 | "Endomycetaceae", "shallowish", 222 | "sequacity", "periarthritis", 223 | "bathysphere", "pentosuria", 224 | "Dadaism", "spookdom", 225 | "Consolamentum", "afterpressure", 226 | "mutter", "louse", 227 | "ovoviviparous", "corbel", 228 | "metastoma", "biventer", 229 | "Hydrangea", "hogmace", 230 | "seizing", "nonsuppressed", 231 | "oratorize", "uncarefully", 232 | "benzothiofuran", "penult", 233 | "balanocele", "macropterous", 234 | "dishpan", "marten", 235 | "absvolt", "jirble", 236 | "parmelioid", "airfreighter", 237 | "acocotl", "archesporial", 238 | "hypoplastral", "preoral", 239 | "quailberry", "cinque", 240 | "terrestrially", "stroking", 241 | "limpet", "moodishness", 242 | "canicule", "archididascalian", 243 | "pompiloid", "overstaid", 244 | "introducer", "Italical", 245 | "Christianopaganism", "prescriptible", 246 | "subofficer", "danseuse", 247 | "cloy", "saguran", 248 | "frictionlessly", "deindividualization", 249 | "Bulanda", "ventricous", 250 | "subfoliar", "basto", 251 | "scapuloradial", "suspend", 252 | "stiffish", "Sphenodontidae", 253 | "eternal", "verbid", 254 | "mammonish", "upcushion", 255 | "barkometer", "concretion", 256 | "preagitate", "incomprehensible", 257 | "tristich", "visceral", 258 | "hemimelus", "patroller", 259 | "stentorophonic", "pinulus", 260 | "kerykeion", "brutism", 261 | "monstership", "merciful", 262 | "overinstruct", "defensibly", 263 | "bettermost", "splenauxe", 264 | "Mormyrus", "unreprimanded", 265 | "taver", "ell", 266 | "proacquittal", "infestation", 267 | "overwoven", "Lincolnlike", 268 | "chacona", "Tamil", 269 | "classificational", "lebensraum", 270 | "reeveland", "intuition", 271 | "Whilkut", "focaloid", 272 | "Eleusinian", "micromembrane", 273 | "byroad", "nonrepetition", 274 | "bacterioblast", "brag", 275 | "ribaldrous", "phytoma", 276 | "counteralliance", "pelvimetry", 277 | "pelf", "relaster", 278 | "thermoresistant", "aneurism", 279 | 
"molossic", "euphonym", 280 | "upswell", "ladhood", 281 | "phallaceous", "inertly", 282 | "gunshop", "stereotypography", 283 | "laryngic", "refasten", 284 | "twinling", "oflete", 285 | "hepatorrhaphy", "electrotechnics", 286 | "cockal", "guitarist", 287 | "topsail", "Cimmerianism", 288 | "larklike", "Llandovery", 289 | "pyrocatechol", "immatchable", 290 | "chooser", "metrocratic", 291 | "craglike", "quadrennial", 292 | "nonpoisonous", "undercolored", 293 | "knob", "ultratense", 294 | "balladmonger", "slait", 295 | "sialadenitis", "bucketer", 296 | "magnificently", "unstipulated", 297 | "unscourged", "unsupercilious", 298 | "packsack", "pansophism", 299 | "soorkee", "percent", 300 | "subirrigate", "champer", 301 | "metapolitics", "spherulitic", 302 | "involatile", "metaphonical", 303 | "stachyuraceous", "speckedness", 304 | "bespin", "proboscidiform", 305 | "gul", "squit", 306 | "yeelaman", "peristeropode", 307 | "opacousness", "shibuichi", 308 | "retinize", "yote", 309 | "misexposition", "devilwise", 310 | "pumpkinification", "vinny", 311 | "bonze", "glossing", 312 | "decardinalize", "transcortical", 313 | "serphoid", "deepmost", 314 | "guanajuatite", "wemless", 315 | "arval", "lammy", 316 | "Effie", "Saponaria", 317 | "tetrahedral", "prolificy", 318 | "excerpt", "dunkadoo", 319 | "Spencerism", "insatiately", 320 | "Gilaki", "oratorship", 321 | "arduousness", "unbashfulness", 322 | "Pithecolobium", "unisexuality", 323 | "veterinarian", "detractive", 324 | "liquidity", "acidophile", 325 | "proauction", "sural", 326 | "totaquina", "Vichyite", 327 | "uninhabitedness", "allegedly", 328 | "Gothish", "manny", 329 | "Inger", "flutist", 330 | "ticktick", "Ludgatian", 331 | "homotransplant", "orthopedical", 332 | "diminutively", "monogoneutic", 333 | "Kenipsim", "sarcologist", 334 | "drome", "stronghearted", 335 | "Fameuse", "Swaziland", 336 | "alen", "chilblain", 337 | "beatable", "agglomeratic", 338 | "constitutor", "tendomucoid", 339 | "porencephalous", "arteriasis", 340 | "boser", "tantivy", 341 | "rede", "lineamental", 342 | "uncontradictableness", "homeotypical", 343 | "masa", "folious", 344 | "dosseret", "neurodegenerative", 345 | "subtransverse", "Chiasmodontidae", 346 | "palaeotheriodont", "unstressedly", 347 | "chalcites", "piquantness", 348 | "lampyrine", "Aplacentalia", 349 | "projecting", "elastivity", 350 | "isopelletierin", "bladderwort", 351 | "strander", "almud", 352 | "iniquitously", "theologal", 353 | "bugre", "chargeably", 354 | "imperceptivity", "meriquinoidal", 355 | "mesophyte", "divinator", 356 | "perfunctory", "counterappellant", 357 | "synovial", "charioteer", 358 | "crystallographical", "comprovincial", 359 | "infrastapedial", "pleasurehood", 360 | "inventurous", "ultrasystematic", 361 | "subangulated", "supraoesophageal", 362 | "Vaishnavism", "transude", 363 | "chrysochrous", "ungrave", 364 | "reconciliable", "uninterpleaded", 365 | "erlking", "wherefrom", 366 | "aprosopia", "antiadiaphorist", 367 | "metoxazine", "incalculable", 368 | "umbellic", "predebit", 369 | "foursquare", "unimmortal", 370 | "nonmanufacture", "slangy", 371 | "predisputant", "familist", 372 | "preaffiliate", "friarhood", 373 | "corelysis", "zoonitic", 374 | "halloo", "paunchy", 375 | "neuromimesis", "aconitine", 376 | "hackneyed", "unfeeble", 377 | "cubby", "autoschediastical", 378 | "naprapath", "lyrebird", 379 | "inexistency", "leucophoenicite", 380 | "ferrogoslarite", "reperuse", 381 | "uncombable", "tambo", 382 | "propodiale", "diplomatize", 383 | "Russifier", "clanned", 384 | "corona", 
"michigan", 385 | "nonutilitarian", "transcorporeal", 386 | "bought", "Cercosporella", 387 | "stapedius", "glandularly", 388 | "pictorially", "weism", 389 | "disilane", "rainproof", 390 | "Caphtor", "scrubbed", 391 | "oinomancy", "pseudoxanthine", 392 | "nonlustrous", "redesertion", 393 | "Oryzorictinae", "gala", 394 | "Mycogone", "reappreciate", 395 | "cyanoguanidine", "seeingness", 396 | "breadwinner", "noreast", 397 | "furacious", "epauliere", 398 | "omniscribent", "Passiflorales", 399 | "uninductive", "inductivity", 400 | "Orbitolina", "Semecarpus", 401 | "migrainoid", "steprelationship", 402 | "phlogisticate", "mesymnion", 403 | "sloped", "edificator", 404 | "beneficent", "culm", 405 | "paleornithology", "unurban", 406 | "throbless", "amplexifoliate", 407 | "sesquiquintile", "sapience", 408 | "astucious", "dithery", 409 | "boor", "ambitus", 410 | "scotching", "uloid", 411 | "uncompromisingness", "hoove", 412 | "waird", "marshiness", 413 | "Jerusalem", "mericarp", 414 | "unevoked", "benzoperoxide", 415 | "outguess", "pyxie", 416 | "hymnic", "euphemize", 417 | "mendacity", "erythremia", 418 | "rosaniline", "unchatteled", 419 | "lienteria", "Bushongo", 420 | "dialoguer", "unrepealably", 421 | "rivethead", "antideflation", 422 | "vinegarish", "manganosiderite", 423 | "doubtingness", "ovopyriform", 424 | "Cephalodiscus", "Muscicapa", 425 | "Animalivora", "angina", 426 | "planispheric", "ipomoein", 427 | "cuproiodargyrite", "sandbox", 428 | "scrat", "Munnopsidae", 429 | "shola", "pentafid", 430 | "overstudiousness", "times", 431 | "nonprofession", "appetible", 432 | "valvulotomy", "goladar", 433 | "uniarticular", "oxyterpene", 434 | "unlapsing", "omega", 435 | "trophonema", "seminonflammable", 436 | "circumzenithal", "starer", 437 | "depthwise", "liberatress", 438 | "unleavened", "unrevolting", 439 | "groundneedle", "topline", 440 | "wandoo", "umangite", 441 | "ordinant", "unachievable", 442 | "oversand", "snare", 443 | "avengeful", "unexplicit", 444 | "mustafina", "sonable", 445 | "rehabilitative", "eulogization", 446 | "papery", "technopsychology", 447 | "impressor", "cresylite", 448 | "entame", "transudatory", 449 | "scotale", "pachydermatoid", 450 | "imaginary", "yeat", 451 | "slipped", "stewardship", 452 | "adatom", "cockstone", 453 | "skyshine", "heavenful", 454 | "comparability", "exprobratory", 455 | "dermorhynchous", "parquet", 456 | "cretaceous", "vesperal", 457 | "raphis", "undangered", 458 | "Glecoma", "engrain", 459 | "counteractively", "Zuludom", 460 | "orchiocatabasis", "Auriculariales", 461 | "warriorwise", "extraorganismal", 462 | "overbuilt", "alveolite", 463 | "tetchy", "terrificness", 464 | "widdle", "unpremonished", 465 | "rebilling", "sequestrum", 466 | "equiconvex", "heliocentricism", 467 | "catabaptist", "okonite", 468 | "propheticism", "helminthagogic", 469 | "calycular", "giantly", 470 | "wingable", "golem", 471 | "unprovided", "commandingness", 472 | "greave", "haply", 473 | "doina", "depressingly", 474 | "subdentate", "impairment", 475 | "decidable", "neurotrophic", 476 | "unpredict", "bicorporeal", 477 | "pendulant", "flatman", 478 | "intrabred", "toplike", 479 | "Prosobranchiata", "farrantly", 480 | "toxoplasmosis", "gorilloid", 481 | "dipsomaniacal", "aquiline", 482 | "atlantite", "ascitic", 483 | "perculsive", "prospectiveness", 484 | "saponaceous", "centrifugalization", 485 | "dinical", "infravaginal", 486 | "beadroll", "affaite", 487 | "Helvidian", "tickleproof", 488 | "abstractionism", "enhedge", 489 | "outwealth", "overcontribute", 490 | "coldfinch", 
"gymnastic", 491 | "Pincian", "Munychian", 492 | "codisjunct", "quad", 493 | "coracomandibular", "phoenicochroite", 494 | "amender", "selectivity", 495 | "putative", "semantician", 496 | "lophotrichic", "Spatangoidea", 497 | "saccharogenic", "inferent", 498 | "Triconodonta", "arrendation", 499 | "sheepskin", "taurocolla", 500 | "bunghole", "Machiavel", 501 | "triakistetrahedral", "dehairer", 502 | "prezygapophysial", "cylindric", 503 | "pneumonalgia", "sleigher", 504 | "emir", "Socraticism", 505 | "licitness", "massedly", 506 | "instructiveness", "sturdied", 507 | "redecrease", "starosta", 508 | "evictor", "orgiastic", 509 | "squdge", "meloplasty", 510 | "Tsonecan", "repealableness", 511 | "swoony", "myesthesia", 512 | "molecule", "autobiographist", 513 | "reciprocation", "refective", 514 | "unobservantness", "tricae", 515 | "ungouged", "floatability", 516 | "Mesua", "fetlocked", 517 | "chordacentrum", "sedentariness", 518 | "various", "laubanite", 519 | "nectopod", "zenick", 520 | "sequentially", "analgic", 521 | "biodynamics", "posttraumatic", 522 | "nummi", "pyroacetic", 523 | "bot", "redescend", 524 | "dispermy", "undiffusive", 525 | "circular", "trillion", 526 | "Uraniidae", "ploration", 527 | "discipular", "potentness", 528 | "sud", "Hu", 529 | "Eryon", "plugger", 530 | "subdrainage", "jharal", 531 | "abscission", "supermarket", 532 | "countergabion", "glacierist", 533 | "lithotresis", "minniebush", 534 | "zanyism", "eucalypteol", 535 | "sterilely", "unrealize", 536 | "unpatched", "hypochondriacism", 537 | "critically", "cheesecutter") 538 | } -------------------------------------------------------------------------------- /src/local/examples/GroupByAction.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.RangePartitioner 5 | 6 | object GroupByAction { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "GroupByAction Test") 10 | 11 | val data = Array[(String, Int)](("A1", 1), ("A2", 2), 12 | ("B1", 6), ("A2", 4), 13 | ("B1", 3), ("B1", 5)) 14 | 15 | val pairs = sc.parallelize(data, 3) 16 | 17 | // output: 18 | // (A1,1) 19 | // (A2,2) 20 | // 21 | // (B1,6) 22 | // (A2,4) 23 | // 24 | // (B1,3) 25 | // (B1,5) 26 | pairs.foreach(println) 27 | 28 | val result1 = pairs.groupBy(K => K._1) 29 | val result2 = pairs.groupBy((K : (String, Int)) => K._1, 1) 30 | val result3 = pairs.groupBy((K : (String, Int)) => K._1, new RangePartitioner(3, pairs)) 31 | 32 | // output of result1: 33 | // (A1,ArrayBuffer((A1,1))) 34 | // 35 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5))) 36 | // (A2,ArrayBuffer((A2,2), (A2,4))) 37 | result1.foreach(println) 38 | 39 | // output of result2: 40 | // (A1,ArrayBuffer((A1,1))) 41 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5))) 42 | // (A2,ArrayBuffer((A2,2), (A2,4))) 43 | result2.foreach(println) 44 | 45 | // output of result3: 46 | // (A1,ArrayBuffer((A1,1))) 47 | // (A2,ArrayBuffer((A2,2), (A2,4))) 48 | // 49 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5))) 50 | result3.foreach(println) 51 | 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /src/local/examples/GroupByKey.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object GroupByKey { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val sc = new 
SparkContext("local", "GroupByKey Test") 11 | val data = Array[(Int, Char)]((1, 'a'), (2, 'b'), 12 | (3, 'c'), (4, 'd'), 13 | (5, 'e'), (3, 'f'), 14 | (2, 'g'), (1, 'h') 15 | 16 | ) 17 | val pairs = sc.parallelize(data, 3) 18 | 19 | val result = pairs.groupByKey(2) 20 | 21 | // output: 22 | // (B,ArrayBuffer(2, 3)) 23 | // 24 | // (A,ArrayBuffer(1)) 25 | // (C,ArrayBuffer(4, 5, 6)) 26 | //result.foreach(println) 27 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 28 | println(result.toDebugString) 29 | } 30 | } -------------------------------------------------------------------------------- /src/local/examples/GroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package local.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object GroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[2]") 31 | var numMappers = 10 32 | var numKVPairs = 100 33 | var valSize = 100 34 | var numReducers = 3 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 41 | for (i <- 0 until numKVPairs) { 42 | val byteArr = new Array[Byte](valSize) 43 | ranGen.nextBytes(byteArr) 44 | arr1(i) = (ranGen.nextInt(10), byteArr) 45 | } 46 | arr1 47 | }.cache 48 | // Enforce that everything has been calculated and in cache 49 | pairs1.count 50 | 51 | val result = pairs1.groupByKey(numReducers) 52 | println(result.count) 53 | println(result.toDebugString) 54 | 55 | sc.stop() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/local/examples/GroupWith.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object GroupWith { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local[2]", "GroupWith Test") 10 | 11 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2), 12 | ("B1", 3), ("B2", 4), 13 | ("C1", 5), ("C1", 6) 14 | ) 15 | 16 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8), 17 | ("B1", 9), ("C1", 0) 18 | ) 19 | val pairs1 = sc.parallelize(data1, 3) 20 | val pairs2 = sc.parallelize(data2, 2) 21 | 22 | val result = 
pairs1.groupWith(pairs2) 23 | result.foreach(println) 24 | 25 | // output: 26 | // (B1,(ArrayBuffer(3),ArrayBuffer(9))) 27 | // (A1,(ArrayBuffer(1),ArrayBuffer(7))) 28 | // (A2,(ArrayBuffer(2),ArrayBuffer(8))) 29 | // 30 | // (C1,(ArrayBuffer(5, 6),ArrayBuffer(0))) 31 | // (B2,(ArrayBuffer(4),ArrayBuffer())) 32 | 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /src/local/examples/JoinAction.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object JoinAction { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local[2]", "JoinAction Test") 10 | 11 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2), 12 | ("B1", 3), ("B2", 4), 13 | ("C1", 5), ("C1", 6) 14 | ) 15 | 16 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8), 17 | ("B1", 9), ("C1", 0) 18 | ) 19 | val pairs1 = sc.parallelize(data1, 3) 20 | val pairs2 = sc.parallelize(data2, 2) 21 | 22 | 23 | val result = pairs1.join(pairs2) 24 | 25 | // output: 26 | // (A1,(1,7)) 27 | // (B1,(3,9)) 28 | // (A2,(2,8)) 29 | // 30 | // (C1,(5,0)) 31 | // (C1,(6,0)) 32 | result.foreach(println) 33 | } 34 | 35 | } -------------------------------------------------------------------------------- /src/local/examples/LocalWordCount.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object LocalWordCount { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local[4]", "LocalWordCount") 10 | val myFile = sc.textFile("/Users/xulijie/Documents/data/RandomText/randomText-10MB.txt") 11 | /* 12 | val counts = myFile.map( l => l.split(" ")(2) ) 13 | .map( url => (url, 1) ) 14 | .reduceByKey( _+_ ) 15 | .map{ case(url, count) => (count, url) } 16 | .sortByKey( ascending=false ) 17 | .map{ case(count, url) => (url, count) } 18 | 19 | */ 20 | val wordAndCount = myFile.flatMap(s => s.split(" ")) 21 | .map(w => (w, 1)) 22 | 23 | val result = wordAndCount.reduceByKey(_ + _) 24 | result.foreach(println) 25 | 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /src/local/examples/LookUpTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object LookUpTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "LookUp Test") 10 | 11 | val data = Array[(String, Int)](("A", 1), ("B", 2), 12 | ("B", 3), ("C", 4), 13 | ("C", 5), ("C", 6)) 14 | 15 | val pairs = sc.parallelize(data, 3) 16 | 17 | val finalRDD = pairs.lookup("B") 18 | 19 | finalRDD.foreach(println) 20 | // output: 21 | // 2 22 | // 3 23 | } 24 | } -------------------------------------------------------------------------------- /src/local/examples/MapPartitionsRDDTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object MapPartitionsRDDTest { 6 | 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "MapPartitionsRDD Test") 9 | val data = Array[(String, Int)](("A1", 1), ("A2", 2), 10 | ("B1", 1), ("B2", 4), 11 | ("C1", 3), ("C2", 4) 
12 | ) 13 | val pairs = sc.parallelize(data, 3) 14 | 15 | val finalRDD = pairs.mapPartitions(iter => iter.filter(_._2 >= 2)) 16 | // val finalRDD2 = pairs.mapPartitionsWithIndex(f, preservesPartitioning) 17 | 18 | finalRDD.toArray().foreach(println) 19 | 20 | } 21 | } -------------------------------------------------------------------------------- /src/local/examples/MapValuesTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object MapValuesTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 9 | val data1 = Array[(String, Int)](("K", 1), ("T", 2), 10 | ("T", 3), ("W", 4), 11 | ("W", 5), ("W", 6) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true)) 17 | //val result = pairs.mapValues(V => 10 * V) 18 | //result.foreach(println) 19 | } 20 | } -------------------------------------------------------------------------------- /src/local/examples/PipedRDDTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object PipedRDDTest { 6 | 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "Cartesian Test") 9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 10 | ("U1", 3), ("U2", 4), 11 | ("W1", 3), ("W2", 4) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | 15 | val finalRDD = pairs.pipe("grep 2") 16 | 17 | finalRDD.foreach(println) 18 | 19 | } 20 | } -------------------------------------------------------------------------------- /src/local/examples/ReduceByKeyActionTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object ReduceByKeyActionTest { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 10 | val data1 = Array[(String, Int)](("K", 1), ("U", 2), 11 | ("U", 3), ("W", 4), 12 | ("W", 5), ("W", 6)) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | //val result = pairs.reduceByKey(_ + _, 2) 17 | //result.foreach(println) 18 | } 19 | 20 | } -------------------------------------------------------------------------------- /src/local/examples/ReduceByKeyToDriverTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object ReduceByKeyToDriverTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local[3]", "ReduceByKeyToDriver Test") 9 | val data1 = Array[(String, Int)](("K", 1), ("U", 2), 10 | ("U", 3), ("W", 4), 11 | ("W", 5), ("W", 6) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | //val result = pairs.reduceByKeyToDriver(_ + _) 17 | //result.foreach(println) 18 | } 19 | } 
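The reduceByKey, fold and mapValues calls in the three tests above (MapValuesTest, ReduceByKeyActionTest, ReduceByKeyToDriverTest) are commented out, so those programs run without printing anything. The short sketch below is not part of the repository: the object name ReduceByKeySketch is invented, and the expected output is an assumption. It simply uncomments the reduceByKey and mapValues steps on the same data shape, using the Spark 1.x style API (new SparkContext(master, appName) plus the SparkContext._ pair-RDD implicits) that the surrounding examples already rely on.

package local.examples

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._   // pair-RDD implicits, as in the other examples

// Hypothetical, uncommented variant of the reduceByKey tests above; not part of the repository.
object ReduceByKeySketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local[3]", "ReduceByKey Sketch")

    // Same key/value shape the surrounding tests use, spread over 3 partitions.
    val data = Array[(String, Int)](("K", 1), ("U", 2),
                                    ("U", 3), ("W", 4),
                                    ("W", 5), ("W", 6))
    val pairs = sc.parallelize(data, 3)

    // Sum the values per key into 2 result partitions.
    val summed = pairs.reduceByKey(_ + _, 2)

    // Scale each value without touching the keys (keeps the hash partitioning).
    val scaled = summed.mapValues(v => 10 * v)

    // Expected pairs (ordering may vary): (K,10), (U,50), (W,150)
    scaled.foreach(println)

    sc.stop()
  }
}

Running it with a local master prints one summed, scaled pair per key; keeping the aggregation in reduceByKey rather than a driver-side fold is what lets the work stay distributed across the two reduce partitions.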
-------------------------------------------------------------------------------- /src/local/examples/SparkLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package local.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | 28 | /** 29 | * Logistic regression based classification. 30 | * Usage: SparkLR [slices] 31 | */ 32 | object SparkLR { 33 | val N = 10000 // Number of data points 34 | val D = 10 // Numer of dimensions 35 | val R = 0.7 // Scaling factor 36 | val ITERATIONS = 5 37 | val rand = new Random(42) 38 | 39 | case class DataPoint(x: Vector[Double], y: Double) 40 | 41 | def generateData = { 42 | def generatePoint(i: Int) = { 43 | val y = if(i % 2 == 0) -1 else 1 44 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 45 | println(x.toString() + " " + y) 46 | DataPoint(x, y) 47 | 48 | } 49 | Array.tabulate(N)(generatePoint) 50 | } 51 | 52 | def main(args: Array[String]) { 53 | val sparkConf = new SparkConf().setAppName("SparkLR") 54 | val sc = new SparkContext(sparkConf) 55 | val numSlices = if (args.length > 0) args(0).toInt else 2 56 | val points = sc.parallelize(generateData, numSlices).cache() 57 | 58 | // Initialize w to a random value 59 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 60 | println("Initial w: " + w) 61 | 62 | for (i <- 1 to ITERATIONS) { 63 | println("On iteration " + i) 64 | val gradient = points.map { p => 65 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 66 | }.reduce(_ + _) 67 | w -= gradient 68 | } 69 | 70 | println("Final w: " + w) 71 | sc.stop() 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/local/examples/TakeActionTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object TakeActionTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "TakeAction Test") 9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 10 | ("U1", 3), ("U2", 4), 11 | ("W1", 3), ("W2", 4) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | val result = pairs.take(5) 17 | result.foreach(println) 18 | } 19 | } -------------------------------------------------------------------------------- /src/local/examples/UnionTest.scala: 
-------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object UnionTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 9 | 10 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 11 | ("U1", 3), ("U2", 4), 12 | ("W1", 5), ("W1", 6) 13 | ) 14 | 15 | val data2 = Array[(String, Int)](("K1", 7), ("K2", 8), 16 | ("U1", 9), ("W1", 0) 17 | ) 18 | val pairs1 = sc.parallelize(data1, 3) 19 | val pairs2 = sc.parallelize(data2, 2) 20 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 21 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 22 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true)) 23 | val result = pairs1.union(pairs2) 24 | result.foreach(println) 25 | //result.saveAsTextFile("E:\\Spark\\output\\join") 26 | } 27 | } -------------------------------------------------------------------------------- /src/local/examples/partitionByTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object partitionByTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 9 | val data1 = Array[(String, Int)](("K", 1), ("T", 2), 10 | ("T", 3), ("W", 4), 11 | ("W", 5), ("W", 6) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true)) 17 | //val result = pairs.partitionBy(new HashPartitioner(2)) 18 | //result.foreach(println) 19 | } 20 | } -------------------------------------------------------------------------------- /src/local/examples/reduceActionTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object reduceActionTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "MapPartitionsRDD Test") 9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 10 | ("U1", 3), ("U2", 4), 11 | ("W1", 3), ("W2", 4) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | println(result) 17 | } 18 | 19 | } -------------------------------------------------------------------------------- /src/local/examples/sortByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object sortByKeyTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 9 | 10 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 11 | ("U1", 3), ("U2", 4), 12 | ("W1", 5), ("W1", 6) 13 | ) 14 | val pairs1 = sc.parallelize(data1, 3) 15 | 16 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 17 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 18 | 19 | //val result = pairs1.sortByKey() 20 | //result.foreach(println) 21 | //result.saveAsTextFile("E:\\Spark\\output\\sortByKey") 22 
| } 23 | 24 | } -------------------------------------------------------------------------------- /src/org/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/.DS_Store -------------------------------------------------------------------------------- /src/org/apache/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/apache/.DS_Store -------------------------------------------------------------------------------- /src/org/apache/spark/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/apache/spark/.DS_Store -------------------------------------------------------------------------------- /src/org/apache/spark/examples/BroadcastTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | /** 23 | * Usage: BroadcastTest [slices] [numElem] [broadcastAlgo] [blockSize] 24 | */ 25 | object BroadcastTest { 26 | def main(args: Array[String]) { 27 | 28 | val bcName = if (args.length > 2) args(2) else "Http" 29 | val blockSize = if (args.length > 3) args(3) else "4096" 30 | 31 | System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." + bcName + 32 | "BroadcastFactory") 33 | System.setProperty("spark.broadcast.blockSize", blockSize) 34 | val sparkConf = new SparkConf().setAppName("Broadcast Test") 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val slices = if (args.length > 0) args(0).toInt else 2 39 | val num = if (args.length > 1) args(1).toInt else 1000000 40 | 41 | val arr1 = new Array[Int](num) 42 | for (i <- 0 until arr1.length) { 43 | arr1(i) = i 44 | } 45 | 46 | for (i <- 0 until 3) { 47 | println("Iteration " + i) 48 | println("===========") 49 | val startTime = System.nanoTime 50 | val barr1 = sc.broadcast(arr1) 51 | val observedSizes = sc.parallelize(1 to 10, slices).map(_ => barr1.value.size) 52 | // Collect the small RDD so we can print the observed sizes locally. 
53 | observedSizes.collect().foreach(i => println(i)) 54 | println("Iteration %d took %.0f milliseconds".format(i, (System.nanoTime - startTime) / 1E6)) 55 | } 56 | 57 | sc.stop() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/CassandraCQLTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.nio.ByteBuffer 21 | 22 | import scala.collection.JavaConversions._ 23 | import scala.collection.mutable.ListBuffer 24 | import scala.collection.immutable.Map 25 | 26 | import org.apache.cassandra.hadoop.ConfigHelper 27 | import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat 28 | import org.apache.cassandra.hadoop.cql3.CqlConfigHelper 29 | import org.apache.cassandra.hadoop.cql3.CqlOutputFormat 30 | import org.apache.cassandra.utils.ByteBufferUtil 31 | import org.apache.hadoop.mapreduce.Job 32 | 33 | import org.apache.spark.{SparkConf, SparkContext} 34 | import org.apache.spark.SparkContext._ 35 | 36 | /* 37 | Need to create following keyspace and column family in cassandra before running this example 38 | Start CQL shell using ./bin/cqlsh and execute following commands 39 | CREATE KEYSPACE retail WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}; 40 | use retail; 41 | CREATE TABLE salecount (prod_id text, sale_count int, PRIMARY KEY (prod_id)); 42 | CREATE TABLE ordercf (user_id text, 43 | time timestamp, 44 | prod_id text, 45 | quantity int, 46 | PRIMARY KEY (user_id, time)); 47 | INSERT INTO ordercf (user_id, 48 | time, 49 | prod_id, 50 | quantity) VALUES ('bob', 1385983646000, 'iphone', 1); 51 | INSERT INTO ordercf (user_id, 52 | time, 53 | prod_id, 54 | quantity) VALUES ('tom', 1385983647000, 'samsung', 4); 55 | INSERT INTO ordercf (user_id, 56 | time, 57 | prod_id, 58 | quantity) VALUES ('dora', 1385983648000, 'nokia', 2); 59 | INSERT INTO ordercf (user_id, 60 | time, 61 | prod_id, 62 | quantity) VALUES ('charlie', 1385983649000, 'iphone', 2); 63 | */ 64 | 65 | /** 66 | * This example demonstrates how to read and write to cassandra column family created using CQL3 67 | * using Spark. 
68 | * Parameters : 69 | * Usage: ./bin/spark-submit examples.jar \ 70 | * --class org.apache.spark.examples.CassandraCQLTest localhost 9160 71 | */ 72 | object CassandraCQLTest { 73 | 74 | def main(args: Array[String]) { 75 | val sparkConf = new SparkConf().setAppName("CQLTestApp") 76 | 77 | val sc = new SparkContext(sparkConf) 78 | val cHost: String = args(0) 79 | val cPort: String = args(1) 80 | val KeySpace = "retail" 81 | val InputColumnFamily = "ordercf" 82 | val OutputColumnFamily = "salecount" 83 | 84 | val job = new Job() 85 | job.setInputFormatClass(classOf[CqlPagingInputFormat]) 86 | ConfigHelper.setInputInitialAddress(job.getConfiguration(), cHost) 87 | ConfigHelper.setInputRpcPort(job.getConfiguration(), cPort) 88 | ConfigHelper.setInputColumnFamily(job.getConfiguration(), KeySpace, InputColumnFamily) 89 | ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner") 90 | CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3") 91 | 92 | /** CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "user_id='bob'") */ 93 | 94 | /** An UPDATE writes one or more columns to a record in a Cassandra column family */ 95 | val query = "UPDATE " + KeySpace + "." + OutputColumnFamily + " SET sale_count = ? " 96 | CqlConfigHelper.setOutputCql(job.getConfiguration(), query) 97 | 98 | job.setOutputFormatClass(classOf[CqlOutputFormat]) 99 | ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KeySpace, OutputColumnFamily) 100 | ConfigHelper.setOutputInitialAddress(job.getConfiguration(), cHost) 101 | ConfigHelper.setOutputRpcPort(job.getConfiguration(), cPort) 102 | ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner") 103 | 104 | val casRdd = sc.newAPIHadoopRDD(job.getConfiguration(), 105 | classOf[CqlPagingInputFormat], 106 | classOf[java.util.Map[String,ByteBuffer]], 107 | classOf[java.util.Map[String,ByteBuffer]]) 108 | 109 | println("Count: " + casRdd.count) 110 | val productSaleRDD = casRdd.map { 111 | case (key, value) => { 112 | (ByteBufferUtil.string(value.get("prod_id")), ByteBufferUtil.toInt(value.get("quantity"))) 113 | } 114 | } 115 | val aggregatedRDD = productSaleRDD.reduceByKey(_ + _) 116 | aggregatedRDD.collect().foreach { 117 | case (productId, saleCount) => println(productId + ":" + saleCount) 118 | } 119 | 120 | val casoutputCF = aggregatedRDD.map { 121 | case (productId, saleCount) => { 122 | val outColFamKey = Map("prod_id" -> ByteBufferUtil.bytes(productId)) 123 | val outKey: java.util.Map[String, ByteBuffer] = outColFamKey 124 | var outColFamVal = new ListBuffer[ByteBuffer] 125 | outColFamVal += ByteBufferUtil.bytes(saleCount) 126 | val outVal: java.util.List[ByteBuffer] = outColFamVal 127 | (outKey, outVal) 128 | } 129 | } 130 | 131 | casoutputCF.saveAsNewAPIHadoopFile( 132 | KeySpace, 133 | classOf[java.util.Map[String, ByteBuffer]], 134 | classOf[java.util.List[ByteBuffer]], 135 | classOf[CqlOutputFormat], 136 | job.getConfiguration() 137 | ) 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/CassandraTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.nio.ByteBuffer 21 | import java.util.SortedMap 22 | 23 | import scala.collection.JavaConversions._ 24 | 25 | import org.apache.cassandra.db.IColumn 26 | import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat 27 | import org.apache.cassandra.hadoop.ConfigHelper 28 | import org.apache.cassandra.hadoop.ColumnFamilyInputFormat 29 | import org.apache.cassandra.thrift._ 30 | import org.apache.cassandra.utils.ByteBufferUtil 31 | import org.apache.hadoop.mapreduce.Job 32 | 33 | import org.apache.spark.{SparkConf, SparkContext} 34 | import org.apache.spark.SparkContext._ 35 | 36 | /* 37 | * This example demonstrates using Spark with Cassandra with the New Hadoop API and Cassandra 38 | * support for Hadoop. 39 | * 40 | * To run this example, run this file with the following command params - 41 | * 42 | * 43 | * So if you want to run this on localhost this will be, 44 | * localhost 9160 45 | * 46 | * The example makes some assumptions: 47 | * 1. You have already created a keyspace called casDemo and it has a column family named Words 48 | * 2. There are column family has a column named "para" which has test content. 49 | * 50 | * You can create the content by running the following script at the bottom of this file with 51 | * cassandra-cli. 
52 | * 53 | */ 54 | object CassandraTest { 55 | 56 | def main(args: Array[String]) { 57 | val sparkConf = new SparkConf().setAppName("casDemo") 58 | // Get a SparkContext 59 | val sc = new SparkContext(sparkConf) 60 | 61 | // Build the job configuration with ConfigHelper provided by Cassandra 62 | val job = new Job() 63 | job.setInputFormatClass(classOf[ColumnFamilyInputFormat]) 64 | 65 | val host: String = args(1) 66 | val port: String = args(2) 67 | 68 | ConfigHelper.setInputInitialAddress(job.getConfiguration(), host) 69 | ConfigHelper.setInputRpcPort(job.getConfiguration(), port) 70 | ConfigHelper.setOutputInitialAddress(job.getConfiguration(), host) 71 | ConfigHelper.setOutputRpcPort(job.getConfiguration(), port) 72 | ConfigHelper.setInputColumnFamily(job.getConfiguration(), "casDemo", "Words") 73 | ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "casDemo", "WordCount") 74 | 75 | val predicate = new SlicePredicate() 76 | val sliceRange = new SliceRange() 77 | sliceRange.setStart(Array.empty[Byte]) 78 | sliceRange.setFinish(Array.empty[Byte]) 79 | predicate.setSlice_range(sliceRange) 80 | ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate) 81 | 82 | ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner") 83 | ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner") 84 | 85 | // Make a new Hadoop RDD 86 | val casRdd = sc.newAPIHadoopRDD( 87 | job.getConfiguration(), 88 | classOf[ColumnFamilyInputFormat], 89 | classOf[ByteBuffer], 90 | classOf[SortedMap[ByteBuffer, IColumn]]) 91 | 92 | // Let us first get all the paragraphs from the retrieved rows 93 | val paraRdd = casRdd.map { 94 | case (key, value) => { 95 | ByteBufferUtil.string(value.get(ByteBufferUtil.bytes("para")).value()) 96 | } 97 | } 98 | 99 | // Lets get the word count in paras 100 | val counts = paraRdd.flatMap(p => p.split(" ")).map(word => (word, 1)).reduceByKey(_ + _) 101 | 102 | counts.collect().foreach { 103 | case (word, count) => println(word + ":" + count) 104 | } 105 | 106 | counts.map { 107 | case (word, count) => { 108 | val colWord = new org.apache.cassandra.thrift.Column() 109 | colWord.setName(ByteBufferUtil.bytes("word")) 110 | colWord.setValue(ByteBufferUtil.bytes(word)) 111 | colWord.setTimestamp(System.currentTimeMillis) 112 | 113 | val colCount = new org.apache.cassandra.thrift.Column() 114 | colCount.setName(ByteBufferUtil.bytes("wcount")) 115 | colCount.setValue(ByteBufferUtil.bytes(count.toLong)) 116 | colCount.setTimestamp(System.currentTimeMillis) 117 | 118 | val outputkey = ByteBufferUtil.bytes(word + "-COUNT-" + System.currentTimeMillis) 119 | 120 | val mutations: java.util.List[Mutation] = new Mutation() :: new Mutation() :: Nil 121 | mutations.get(0).setColumn_or_supercolumn(new ColumnOrSuperColumn()) 122 | mutations.get(0).column_or_supercolumn.setColumn(colWord) 123 | mutations.get(1).setColumn_or_supercolumn(new ColumnOrSuperColumn()) 124 | mutations.get(1).column_or_supercolumn.setColumn(colCount) 125 | (outputkey, mutations) 126 | } 127 | }.saveAsNewAPIHadoopFile("casDemo", classOf[ByteBuffer], classOf[List[Mutation]], 128 | classOf[ColumnFamilyOutputFormat], job.getConfiguration) 129 | } 130 | } 131 | 132 | /* 133 | create keyspace casDemo; 134 | use casDemo; 135 | 136 | create column family WordCount with comparator = UTF8Type; 137 | update column family WordCount with column_metadata = 138 | [{column_name: word, validation_class: UTF8Type}, 139 | {column_name: wcount, validation_class: LongType}]; 140 | 141 | 
create column family Words with comparator = UTF8Type; 142 | update column family Words with column_metadata = 143 | [{column_name: book, validation_class: UTF8Type}, 144 | {column_name: para, validation_class: UTF8Type}]; 145 | 146 | assume Words keys as utf8; 147 | 148 | set Words['3musk001']['book'] = 'The Three Musketeers'; 149 | set Words['3musk001']['para'] = 'On the first Monday of the month of April, 1625, the market 150 | town of Meung, in which the author of ROMANCE OF THE ROSE was born, appeared to 151 | be in as perfect a state of revolution as if the Huguenots had just made 152 | a second La Rochelle of it. Many citizens, seeing the women flying 153 | toward the High Street, leaving their children crying at the open doors, 154 | hastened to don the cuirass, and supporting their somewhat uncertain 155 | courage with a musket or a partisan, directed their steps toward the 156 | hostelry of the Jolly Miller, before which was gathered, increasing 157 | every minute, a compact group, vociferous and full of curiosity.'; 158 | 159 | set Words['3musk002']['book'] = 'The Three Musketeers'; 160 | set Words['3musk002']['para'] = 'In those times panics were common, and few days passed without 161 | some city or other registering in its archives an event of this kind. There were 162 | nobles, who made war against each other; there was the king, who made 163 | war against the cardinal; there was Spain, which made war against the 164 | king. Then, in addition to these concealed or public, secret or open 165 | wars, there were robbers, mendicants, Huguenots, wolves, and scoundrels, 166 | who made war upon everybody. The citizens always took up arms readily 167 | against thieves, wolves or scoundrels, often against nobles or 168 | Huguenots, sometimes against the king, but never against cardinal or 169 | Spain. It resulted, then, from this habit that on the said first Monday 170 | of April, 1625, the citizens, on hearing the clamor, and seeing neither 171 | the red-and-yellow standard nor the livery of the Duc de Richelieu, 172 | rushed toward the hostel of the Jolly Miller. When arrived there, the 173 | cause of the hubbub was apparent to all'; 174 | 175 | set Words['3musk003']['book'] = 'The Three Musketeers'; 176 | set Words['3musk003']['para'] = 'You ought, I say, then, to husband the means you have, however 177 | large the sum may be; but you ought also to endeavor to perfect yourself in 178 | the exercises becoming a gentleman. I will write a letter today to the 179 | Director of the Royal Academy, and tomorrow he will admit you without 180 | any expense to yourself. Do not refuse this little service. Our 181 | best-born and richest gentlemen sometimes solicit it without being able 182 | to obtain it. You will learn horsemanship, swordsmanship in all its 183 | branches, and dancing. You will make some desirable acquaintances; and 184 | from time to time you can call upon me, just to tell me how you are 185 | getting on, and to say whether I can be of further service to you.'; 186 | 187 | 188 | set Words['thelostworld001']['book'] = 'The Lost World'; 189 | set Words['thelostworld001']['para'] = 'She sat with that proud, delicate profile of hers outlined 190 | against the red curtain. How beautiful she was! And yet how aloof! We had been 191 | friends, quite good friends; but never could I get beyond the same 192 | comradeship which I might have established with one of my 193 | fellow-reporters upon the Gazette,--perfectly frank, perfectly kindly, 194 | and perfectly unsexual. 
My instincts are all against a woman being too 195 | frank and at her ease with me. It is no compliment to a man. Where 196 | the real sex feeling begins, timidity and distrust are its companions, 197 | heritage from old wicked days when love and violence went often hand in 198 | hand. The bent head, the averted eye, the faltering voice, the wincing 199 | figure--these, and not the unshrinking gaze and frank reply, are the 200 | true signals of passion. Even in my short life I had learned as much 201 | as that--or had inherited it in that race memory which we call instinct.'; 202 | 203 | set Words['thelostworld002']['book'] = 'The Lost World'; 204 | set Words['thelostworld002']['para'] = 'I always liked McArdle, the crabbed, old, round-backed, 205 | red-headed news editor, and I rather hoped that he liked me. Of course, Beaumont was 206 | the real boss; but he lived in the rarefied atmosphere of some Olympian 207 | height from which he could distinguish nothing smaller than an 208 | international crisis or a split in the Cabinet. Sometimes we saw him 209 | passing in lonely majesty to his inner sanctum, with his eyes staring 210 | vaguely and his mind hovering over the Balkans or the Persian Gulf. He 211 | was above and beyond us. But McArdle was his first lieutenant, and it 212 | was he that we knew. The old man nodded as I entered the room, and he 213 | pushed his spectacles far up on his bald forehead.'; 214 | 215 | */ 216 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/DriverSubmissionTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.collection.JavaConversions._ 21 | 22 | /** Prints out environmental information, sleeps, and then exits. Made to 23 | * test driver submission in the standalone scheduler. 
*/ 24 | object DriverSubmissionTest { 25 | def main(args: Array[String]) { 26 | if (args.size < 1) { 27 | println("Usage: DriverSubmissionTest ") 28 | System.exit(0) 29 | } 30 | val numSecondsToSleep = args(0).toInt 31 | 32 | val env = System.getenv() 33 | val properties = System.getProperties() 34 | 35 | println("Environment variables containing SPARK_TEST:") 36 | env.filter{case (k, v) => k.contains("SPARK_TEST")}.foreach(println) 37 | 38 | println("System properties containing spark.test:") 39 | properties.filter{case (k, v) => k.toString.contains("spark.test")}.foreach(println) 40 | 41 | for (i <- 1 until numSecondsToSleep) { 42 | println(s"Alive for $i out of $numSecondsToSleep seconds") 43 | Thread.sleep(1000) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/ExceptionHandlingTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | object ExceptionHandlingTest { 23 | def main(args: Array[String]) { 24 | val sparkConf = new SparkConf().setAppName("ExceptionHandlingTest") 25 | val sc = new SparkContext(sparkConf) 26 | sc.parallelize(0 until sc.defaultParallelism).foreach { i => 27 | if (math.random > 0.75) { 28 | throw new Exception("Testing exception handling") 29 | } 30 | } 31 | 32 | sc.stop() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/GroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object GroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test") 31 | var numMappers = if (args.length > 0) args(0).toInt else 2 32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 33 | var valSize = if (args.length > 2) args(2).toInt else 1000 34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 41 | for (i <- 0 until numKVPairs) { 42 | val byteArr = new Array[Byte](valSize) 43 | ranGen.nextBytes(byteArr) 44 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) 45 | } 46 | arr1 47 | }.cache 48 | // Enforce that everything has been calculated and in cache 49 | pairs1.count 50 | 51 | println(pairs1.groupByKey(numReducers).count) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/HBaseTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.hadoop.hbase.client.HBaseAdmin 21 | import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor} 22 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 23 | 24 | import org.apache.spark._ 25 | import org.apache.spark.rdd.NewHadoopRDD 26 | 27 | object HBaseTest { 28 | def main(args: Array[String]) { 29 | val sparkConf = new SparkConf().setAppName("HBaseTest") 30 | val sc = new SparkContext(sparkConf) 31 | val conf = HBaseConfiguration.create() 32 | // Other options for configuring scan behavior are available. 
More information available at 33 | // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html 34 | conf.set(TableInputFormat.INPUT_TABLE, args(1)) 35 | 36 | // Initialize hBase table if necessary 37 | val admin = new HBaseAdmin(conf) 38 | if(!admin.isTableAvailable(args(1))) { 39 | val tableDesc = new HTableDescriptor(args(1)) 40 | admin.createTable(tableDesc) 41 | } 42 | 43 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], 44 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 45 | classOf[org.apache.hadoop.hbase.client.Result]) 46 | 47 | hBaseRDD.count() 48 | 49 | sc.stop() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/HdfsTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark._ 21 | 22 | object HdfsTest { 23 | def main(args: Array[String]) { 24 | val sparkConf = new SparkConf().setAppName("HdfsTest") 25 | val sc = new SparkContext(sparkConf) 26 | val file = sc.textFile(args(1)) 27 | val mapped = file.map(s => s.length).cache() 28 | for (iter <- 1 to 10) { 29 | val start = System.currentTimeMillis() 30 | for (x <- mapped) { x + 2 } 31 | // println("Processing: " + x) 32 | val end = System.currentTimeMillis() 33 | println("Iteration " + iter + " took " + (end-start) + " ms") 34 | } 35 | sc.stop() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalALS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.sqrt 21 | 22 | import cern.colt.matrix._ 23 | import cern.colt.matrix.linalg._ 24 | import cern.jet.math._ 25 | 26 | /** 27 | * Alternating least squares matrix factorization. 28 | */ 29 | object LocalALS { 30 | // Parameters set through command line arguments 31 | var M = 0 // Number of movies 32 | var U = 0 // Number of users 33 | var F = 0 // Number of features 34 | var ITERATIONS = 0 35 | 36 | val LAMBDA = 0.01 // Regularization coefficient 37 | 38 | // Some COLT objects 39 | val factory2D = DoubleFactory2D.dense 40 | val factory1D = DoubleFactory1D.dense 41 | val algebra = Algebra.DEFAULT 42 | val blas = SeqBlas.seqBlas 43 | 44 | def generateR(): DoubleMatrix2D = { 45 | val mh = factory2D.random(M, F) 46 | val uh = factory2D.random(U, F) 47 | algebra.mult(mh, algebra.transpose(uh)) 48 | } 49 | 50 | def rmse(targetR: DoubleMatrix2D, ms: Array[DoubleMatrix1D], 51 | us: Array[DoubleMatrix1D]): Double = 52 | { 53 | val r = factory2D.make(M, U) 54 | for (i <- 0 until M; j <- 0 until U) { 55 | r.set(i, j, blas.ddot(ms(i), us(j))) 56 | } 57 | blas.daxpy(-1, targetR, r) 58 | val sumSqs = r.aggregate(Functions.plus, Functions.square) 59 | sqrt(sumSqs / (M * U)) 60 | } 61 | 62 | def updateMovie(i: Int, m: DoubleMatrix1D, us: Array[DoubleMatrix1D], 63 | R: DoubleMatrix2D) : DoubleMatrix1D = 64 | { 65 | val XtX = factory2D.make(F, F) 66 | val Xty = factory1D.make(F) 67 | // For each user that rated the movie 68 | for (j <- 0 until U) { 69 | val u = us(j) 70 | // Add u * u^t to XtX 71 | blas.dger(1, u, u, XtX) 72 | // Add u * rating to Xty 73 | blas.daxpy(R.get(i, j), u, Xty) 74 | } 75 | // Add regularization coefs to diagonal terms 76 | for (d <- 0 until F) { 77 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * U) 78 | } 79 | // Solve it with Cholesky 80 | val ch = new CholeskyDecomposition(XtX) 81 | val Xty2D = factory2D.make(Xty.toArray, F) 82 | val solved2D = ch.solve(Xty2D) 83 | solved2D.viewColumn(0) 84 | } 85 | 86 | def updateUser(j: Int, u: DoubleMatrix1D, ms: Array[DoubleMatrix1D], 87 | R: DoubleMatrix2D) : DoubleMatrix1D = 88 | { 89 | val XtX = factory2D.make(F, F) 90 | val Xty = factory1D.make(F) 91 | // For each movie that the user rated 92 | for (i <- 0 until M) { 93 | val m = ms(i) 94 | // Add m * m^t to XtX 95 | blas.dger(1, m, m, XtX) 96 | // Add m * rating to Xty 97 | blas.daxpy(R.get(i, j), m, Xty) 98 | } 99 | // Add regularization coefs to diagonal terms 100 | for (d <- 0 until F) { 101 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * M) 102 | } 103 | // Solve it with Cholesky 104 | val ch = new CholeskyDecomposition(XtX) 105 | val Xty2D = factory2D.make(Xty.toArray, F) 106 | val solved2D = ch.solve(Xty2D) 107 | solved2D.viewColumn(0) 108 | } 109 | 110 | def main(args: Array[String]) { 111 | args match { 112 | case Array(m, u, f, iters) => { 113 | M = m.toInt 114 | U = u.toInt 115 | F = f.toInt 116 | ITERATIONS = iters.toInt 117 | } 118 | case _ => { 119 | System.err.println("Usage: LocalALS ") 120 | System.exit(1) 121 | } 122 | } 123 | printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS) 124 | 125 | val R = generateR() 126 | 127 | // Initialize m and u randomly 128 | var ms = Array.fill(M)(factory1D.random(F)) 129 | var us = Array.fill(U)(factory1D.random(F)) 130 | 131 | // Iteratively update movies then users 132 | for (iter <- 1 to ITERATIONS) { 133 | println("Iteration " + iter + ":") 134 | ms = (0 until M).map(i => updateMovie(i, ms(i), us, R)).toArray 135 | us = (0 until U).map(j => 
updateUser(j, us(j), ms, R)).toArray 136 | println("RMSE = " + rmse(R, ms, us)) 137 | println() 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalFileLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector} 23 | 24 | object LocalFileLR { 25 | val D = 10 // Numer of dimensions 26 | val rand = new Random(42) 27 | 28 | case class DataPoint(x: Vector[Double], y: Double) 29 | 30 | def parsePoint(line: String): DataPoint = { 31 | val nums = line.split(' ').map(_.toDouble) 32 | DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) 33 | } 34 | 35 | def main(args: Array[String]) { 36 | val lines = scala.io.Source.fromFile(args(0)).getLines().toArray 37 | val points = lines.map(parsePoint _) 38 | val ITERATIONS = args(1).toInt 39 | 40 | // Initialize w to a random value 41 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 42 | println("Initial w: " + w) 43 | 44 | for (i <- 1 to ITERATIONS) { 45 | println("On iteration " + i) 46 | var gradient = DenseVector.zeros[Double](D) 47 | for (p <- points) { 48 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y 49 | gradient += p.x * scale 50 | } 51 | w -= gradient 52 | } 53 | 54 | println("Final w: " + w) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalKMeans.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.collection.mutable.HashMap 23 | import scala.collection.mutable.HashSet 24 | 25 | import breeze.linalg.{Vector, DenseVector, squaredDistance} 26 | 27 | import org.apache.spark.SparkContext._ 28 | 29 | /** 30 | * K-means clustering. 31 | */ 32 | object LocalKMeans { 33 | val N = 1000 34 | val R = 1000 // Scaling factor 35 | val D = 10 36 | val K = 10 37 | val convergeDist = 0.001 38 | val rand = new Random(42) 39 | 40 | def generateData = { 41 | def generatePoint(i: Int) = { 42 | DenseVector.fill(D){rand.nextDouble * R} 43 | } 44 | Array.tabulate(N)(generatePoint) 45 | } 46 | 47 | def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { 48 | var index = 0 49 | var bestIndex = 0 50 | var closest = Double.PositiveInfinity 51 | 52 | for (i <- 1 to centers.size) { 53 | val vCurr = centers.get(i).get 54 | val tempDist = squaredDistance(p, vCurr) 55 | if (tempDist < closest) { 56 | closest = tempDist 57 | bestIndex = i 58 | } 59 | } 60 | 61 | bestIndex 62 | } 63 | 64 | def main(args: Array[String]) { 65 | val data = generateData 66 | var points = new HashSet[Vector[Double]] 67 | var kPoints = new HashMap[Int, Vector[Double]] 68 | var tempDist = 1.0 69 | 70 | while (points.size < K) { 71 | points.add(data(rand.nextInt(N))) 72 | } 73 | 74 | val iter = points.iterator 75 | for (i <- 1 to points.size) { 76 | kPoints.put(i, iter.next()) 77 | } 78 | 79 | println("Initial centers: " + kPoints) 80 | 81 | while(tempDist > convergeDist) { 82 | var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) 83 | 84 | var mappings = closest.groupBy[Int] (x => x._1) 85 | 86 | var pointStats = mappings.map { pair => 87 | pair._2.reduceLeft [(Int, (Vector[Double], Int))] { 88 | case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) 89 | } 90 | } 91 | 92 | var newPoints = pointStats.map {mapping => 93 | (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} 94 | 95 | tempDist = 0.0 96 | for (mapping <- newPoints) { 97 | tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) 98 | } 99 | 100 | for (newP <- newPoints) { 101 | kPoints.put(newP._1, newP._2) 102 | } 103 | } 104 | 105 | println("Final centers: " + kPoints) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector} 23 | 24 | /** 25 | * Logistic regression based classification. 26 | */ 27 | object LocalLR { 28 | val N = 10000 // Number of data points 29 | val D = 10 // Number of dimensions 30 | val R = 0.7 // Scaling factor 31 | val ITERATIONS = 5 32 | val rand = new Random(42) 33 | 34 | case class DataPoint(x: Vector[Double], y: Double) 35 | 36 | def generateData = { 37 | def generatePoint(i: Int) = { 38 | val y = if(i % 2 == 0) -1 else 1 39 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 40 | DataPoint(x, y) 41 | } 42 | Array.tabulate(N)(generatePoint) 43 | } 44 | 45 | def main(args: Array[String]) { 46 | val data = generateData 47 | 48 | // Initialize w to a random value 49 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 50 | println("Initial w: " + w) 51 | 52 | for (i <- 1 to ITERATIONS) { 53 | println("On iteration " + i) 54 | var gradient = DenseVector.zeros[Double](D) 55 | for (p <- data) { 56 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y 57 | gradient += p.x * scale 58 | } 59 | w -= gradient 60 | } 61 | 62 | println("Final w: " + w) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.SparkContext._ 24 | 25 | object LocalPi { 26 | def main(args: Array[String]) { 27 | var count = 0 28 | for (i <- 1 to 100000) { 29 | val x = random * 2 - 1 30 | val y = random * 2 - 1 31 | if (x*x + y*y < 1) count += 1 32 | } 33 | println("Pi is roughly " + 4 * count / 100000.0) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LogQuery.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.SparkContext._ 22 | 23 | /** 24 | * Executes a roll up-style query against Apache logs. 25 | * 26 | * Usage: LogQuery [logFile] 27 | */ 28 | object LogQuery { 29 | val exampleApacheLogs = List( 30 | """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://images.com/2013/Generic.jpg 31 | | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 32 | | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR 33 | | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR 34 | | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.350 "-" - "" 265 923 934 "" 35 | | 62.24.11.25 images.com 1358492167 - Whatup""".stripMargin.lines.mkString, 36 | """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://images.com/2013/Generic.jpg 37 | | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 38 | | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR 39 | | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR 40 | | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.352 "-" - "" 256 977 988 "" 41 | | 0 73.23.2.15 images.com 1358492557 - Whatup""".stripMargin.lines.mkString 42 | ) 43 | 44 | def main(args: Array[String]) { 45 | 46 | val sparkConf = new SparkConf().setAppName("Log Query") 47 | val sc = new SparkContext(sparkConf) 48 | 49 | val dataSet = 50 | if (args.length == 1) sc.textFile(args(0)) else sc.parallelize(exampleApacheLogs) 51 | // scalastyle:off 52 | val apacheLogRegex = 53 | """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r 54 | // scalastyle:on 55 | /** Tracks the total query count and number of aggregate bytes for a particular group. 
*/ 56 | class Stats(val count: Int, val numBytes: Int) extends Serializable { 57 | def merge(other: Stats) = new Stats(count + other.count, numBytes + other.numBytes) 58 | override def toString = "bytes=%s\tn=%s".format(numBytes, count) 59 | } 60 | 61 | def extractKey(line: String): (String, String, String) = { 62 | apacheLogRegex.findFirstIn(line) match { 63 | case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => 64 | if (user != "\"-\"") (ip, user, query) 65 | else (null, null, null) 66 | case _ => (null, null, null) 67 | } 68 | } 69 | 70 | def extractStats(line: String): Stats = { 71 | apacheLogRegex.findFirstIn(line) match { 72 | case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => 73 | new Stats(1, bytes.toInt) 74 | case _ => new Stats(1, 0) 75 | } 76 | } 77 | 78 | dataSet.map(line => (extractKey(line), extractStats(line))) 79 | .reduceByKey((a, b) => a.merge(b)) 80 | .collect().foreach{ 81 | case (user, query) => println("%s\t%s".format(user, query))} 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/MultiBroadcastTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.rdd.RDD 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | /** 24 | * Usage: MultiBroadcastTest [slices] [numElem] 25 | */ 26 | object MultiBroadcastTest { 27 | def main(args: Array[String]) { 28 | 29 | val sparkConf = new SparkConf().setAppName("Multi-Broadcast Test") 30 | val sc = new SparkContext(sparkConf) 31 | 32 | val slices = if (args.length > 0) args(0).toInt else 2 33 | val num = if (args.length > 1) args(1).toInt else 1000000 34 | 35 | val arr1 = new Array[Int](num) 36 | for (i <- 0 until arr1.length) { 37 | arr1(i) = i 38 | } 39 | 40 | val arr2 = new Array[Int](num) 41 | for (i <- 0 until arr2.length) { 42 | arr2(i) = i 43 | } 44 | 45 | val barr1 = sc.broadcast(arr1) 46 | val barr2 = sc.broadcast(arr2) 47 | val observedSizes: RDD[(Int, Int)] = sc.parallelize(1 to 10, slices).map { _ => 48 | (barr1.value.size, barr2.value.size) 49 | } 50 | // Collect the small RDD so we can print the observed sizes locally. 
51 | observedSizes.collect().foreach(i => println(i)) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SimpleSkewedGroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: SimpleSkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers] [ratio] 27 | */ 28 | object SimpleSkewedGroupByTest { 29 | def main(args: Array[String]) { 30 | 31 | val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest") 32 | var numMappers = if (args.length > 0) args(0).toInt else 2 33 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 34 | var valSize = if (args.length > 2) args(2).toInt else 1000 35 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 36 | var ratio = if (args.length > 4) args(4).toInt else 5.0 37 | 38 | val sc = new SparkContext(sparkConf) 39 | 40 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 41 | val ranGen = new Random 42 | var result = new Array[(Int, Array[Byte])](numKVPairs) 43 | for (i <- 0 until numKVPairs) { 44 | val byteArr = new Array[Byte](valSize) 45 | ranGen.nextBytes(byteArr) 46 | val offset = ranGen.nextInt(1000) * numReducers 47 | if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { 48 | // give ratio times higher chance of generating key 0 (for reducer 0) 49 | result(i) = (offset, byteArr) 50 | } else { 51 | // generate a key for one of the other reducers 52 | val key = 1 + ranGen.nextInt(numReducers-1) + offset 53 | result(i) = (key, byteArr) 54 | } 55 | } 56 | result 57 | }.cache 58 | // Enforce that everything has been calculated and in cache 59 | pairs1.count 60 | 61 | println("RESULT: " + pairs1.groupByKey(numReducers).count) 62 | // Print how many keys each reducer got (for debugging) 63 | // println("RESULT: " + pairs1.groupByKey(numReducers) 64 | // .map{case (k,v) => (k, v.size)} 65 | // .collectAsMap) 66 | 67 | sc.stop() 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SkewedGroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object SkewedGroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test") 31 | var numMappers = if (args.length > 0) args(0).toInt else 2 32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 33 | var valSize = if (args.length > 2) args(2).toInt else 1000 34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | 41 | // map output sizes lineraly increase from the 1st to the last 42 | numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt 43 | 44 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 45 | for (i <- 0 until numKVPairs) { 46 | val byteArr = new Array[Byte](valSize) 47 | ranGen.nextBytes(byteArr) 48 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) 49 | } 50 | arr1 51 | }.cache() 52 | // Enforce that everything has been calculated and in cache 53 | pairs1.count() 54 | 55 | println(pairs1.groupByKey(numReducers).count()) 56 | 57 | sc.stop() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkALS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.sqrt 21 | 22 | import cern.colt.matrix._ 23 | import cern.colt.matrix.linalg._ 24 | import cern.jet.math._ 25 | 26 | import org.apache.spark._ 27 | 28 | /** 29 | * Alternating least squares matrix factorization. 
30 | */ 31 | object SparkALS { 32 | // Parameters set through command line arguments 33 | var M = 0 // Number of movies 34 | var U = 0 // Number of users 35 | var F = 0 // Number of features 36 | var ITERATIONS = 0 37 | 38 | val LAMBDA = 0.01 // Regularization coefficient 39 | 40 | // Some COLT objects 41 | val factory2D = DoubleFactory2D.dense 42 | val factory1D = DoubleFactory1D.dense 43 | val algebra = Algebra.DEFAULT 44 | val blas = SeqBlas.seqBlas 45 | 46 | def generateR(): DoubleMatrix2D = { 47 | val mh = factory2D.random(M, F) 48 | val uh = factory2D.random(U, F) 49 | algebra.mult(mh, algebra.transpose(uh)) 50 | } 51 | 52 | def rmse(targetR: DoubleMatrix2D, ms: Array[DoubleMatrix1D], 53 | us: Array[DoubleMatrix1D]): Double = 54 | { 55 | val r = factory2D.make(M, U) 56 | for (i <- 0 until M; j <- 0 until U) { 57 | r.set(i, j, blas.ddot(ms(i), us(j))) 58 | } 59 | blas.daxpy(-1, targetR, r) 60 | val sumSqs = r.aggregate(Functions.plus, Functions.square) 61 | sqrt(sumSqs / (M * U)) 62 | } 63 | 64 | def update(i: Int, m: DoubleMatrix1D, us: Array[DoubleMatrix1D], 65 | R: DoubleMatrix2D) : DoubleMatrix1D = 66 | { 67 | val U = us.size 68 | val F = us(0).size 69 | val XtX = factory2D.make(F, F) 70 | val Xty = factory1D.make(F) 71 | // For each user that rated the movie 72 | for (j <- 0 until U) { 73 | val u = us(j) 74 | // Add u * u^t to XtX 75 | blas.dger(1, u, u, XtX) 76 | // Add u * rating to Xty 77 | blas.daxpy(R.get(i, j), u, Xty) 78 | } 79 | // Add regularization coefs to diagonal terms 80 | for (d <- 0 until F) { 81 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * U) 82 | } 83 | // Solve it with Cholesky 84 | val ch = new CholeskyDecomposition(XtX) 85 | val Xty2D = factory2D.make(Xty.toArray, F) 86 | val solved2D = ch.solve(Xty2D) 87 | solved2D.viewColumn(0) 88 | } 89 | 90 | def main(args: Array[String]) { 91 | var slices = 0 92 | 93 | val options = (0 to 4).map(i => if (i < args.length) Some(args(i)) else None) 94 | 95 | options.toArray match { 96 | case Array(m, u, f, iters, slices_) => 97 | M = m.getOrElse("100").toInt 98 | U = u.getOrElse("500").toInt 99 | F = f.getOrElse("10").toInt 100 | ITERATIONS = iters.getOrElse("5").toInt 101 | slices = slices_.getOrElse("2").toInt 102 | case _ => 103 | System.err.println("Usage: SparkALS [M] [U] [F] [iters] [slices]") 104 | System.exit(1) 105 | } 106 | printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS) 107 | val sparkConf = new SparkConf().setAppName("SparkALS") 108 | val sc = new SparkContext(sparkConf) 109 | 110 | val R = generateR() 111 | 112 | // Initialize m and u randomly 113 | var ms = Array.fill(M)(factory1D.random(F)) 114 | var us = Array.fill(U)(factory1D.random(F)) 115 | 116 | // Iteratively update movies then users 117 | val Rc = sc.broadcast(R) 118 | var msb = sc.broadcast(ms) 119 | var usb = sc.broadcast(us) 120 | for (iter <- 1 to ITERATIONS) { 121 | println("Iteration " + iter + ":") 122 | ms = sc.parallelize(0 until M, slices) 123 | .map(i => update(i, msb.value(i), usb.value, Rc.value)) 124 | .collect() 125 | msb = sc.broadcast(ms) // Re-broadcast ms because it was updated 126 | us = sc.parallelize(0 until U, slices) 127 | .map(i => update(i, usb.value(i), msb.value, algebra.transpose(Rc.value))) 128 | .collect() 129 | usb = sc.broadcast(us) // Re-broadcast us because it was updated 130 | println("RMSE = " + rmse(R, ms, us)) 131 | println() 132 | } 133 | 134 | sc.stop() 135 | } 136 | } 137 | -------------------------------------------------------------------------------- 
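LocalALS and SparkALS above reduce each factor update to the same regularized least-squares problem: accumulate XtX as the sum of u * u^T over the ratings (plus LAMBDA * n on the diagonal) and Xty as the sum of rating * u, then solve XtX * w = Xty with COLT's CholeskyDecomposition. The following is a minimal sketch of that assembly using plain Scala arrays; normalEquations is a hypothetical helper, not a file in this repository, and it stops short of the Cholesky solve that the COLT code performs.

    // Assemble (XtX, Xty) for one movie, given the user factors `us`
    // (each of length f) and that movie's ratings (one entry per user).
    def normalEquations(us: Array[Array[Double]], ratings: Array[Double],
                        f: Int, lambda: Double): (Array[Array[Double]], Array[Double]) = {
      val xtx = Array.fill(f, f)(0.0)   // running sum of u * u^T
      val xty = Array.fill(f)(0.0)      // running sum of rating * u
      for (j <- us.indices) {
        val u = us(j)
        for (a <- 0 until f; b <- 0 until f) xtx(a)(b) += u(a) * u(b)
        for (a <- 0 until f) xty(a) += ratings(j) * u(a)
      }
      // Ridge term: mirrors XtX.set(d, d, XtX.get(d, d) + LAMBDA * U) above.
      for (d <- 0 until f) xtx(d)(d) += lambda * us.length
      (xtx, xty)
    }

SparkALS distributes exactly this per-row work with sc.parallelize(0 until M, slices).map(...), broadcasting R, ms and us so each task reads them without re-shipping the full matrices on every iteration.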
/src/org/apache/spark/examples/SparkHdfsLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | import org.apache.spark.deploy.SparkHadoopUtil 28 | import org.apache.spark.scheduler.InputFormatInfo 29 | 30 | 31 | /** 32 | * Logistic regression based classification. 33 | */ 34 | object SparkHdfsLR { 35 | val D = 10 // Numer of dimensions 36 | val rand = new Random(42) 37 | 38 | case class DataPoint(x: Vector[Double], y: Double) 39 | 40 | def parsePoint(line: String): DataPoint = { 41 | val tok = new java.util.StringTokenizer(line, " ") 42 | var y = tok.nextToken.toDouble 43 | var x = new Array[Double](D) 44 | var i = 0 45 | while (i < D) { 46 | x(i) = tok.nextToken.toDouble; i += 1 47 | } 48 | DataPoint(new DenseVector(x), y) 49 | } 50 | 51 | def main(args: Array[String]) { 52 | if (args.length < 2) { 53 | System.err.println("Usage: SparkHdfsLR ") 54 | System.exit(1) 55 | } 56 | 57 | val sparkConf = new SparkConf().setAppName("SparkHdfsLR") 58 | val inputPath = args(0) 59 | val conf = SparkHadoopUtil.get.newConfiguration() 60 | val sc = new SparkContext(sparkConf, 61 | InputFormatInfo.computePreferredLocations( 62 | Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) 63 | )) 64 | val lines = sc.textFile(inputPath) 65 | val points = lines.map(parsePoint _).cache() 66 | val ITERATIONS = args(1).toInt 67 | 68 | // Initialize w to a random value 69 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 70 | println("Initial w: " + w) 71 | 72 | for (i <- 1 to ITERATIONS) { 73 | println("On iteration " + i) 74 | val gradient = points.map { p => 75 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 76 | }.reduce(_ + _) 77 | w -= gradient 78 | } 79 | 80 | println("Final w: " + w) 81 | sc.stop() 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkKMeans.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector, squaredDistance} 23 | 24 | import org.apache.spark.{SparkConf, SparkContext} 25 | import org.apache.spark.SparkContext._ 26 | 27 | /** 28 | * K-means clustering. 29 | */ 30 | object SparkKMeans { 31 | val R = 1000 // Scaling factor 32 | val rand = new Random(42) 33 | 34 | def parseVector(line: String): Vector[Double] = { 35 | DenseVector(line.split(' ').map(_.toDouble)) 36 | } 37 | 38 | def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { 39 | var index = 0 40 | var bestIndex = 0 41 | var closest = Double.PositiveInfinity 42 | 43 | for (i <- 0 until centers.length) { 44 | val tempDist = squaredDistance(p, centers(i)) 45 | if (tempDist < closest) { 46 | closest = tempDist 47 | bestIndex = i 48 | } 49 | } 50 | 51 | bestIndex 52 | } 53 | 54 | def main(args: Array[String]) { 55 | if (args.length < 3) { 56 | System.err.println("Usage: SparkKMeans ") 57 | System.exit(1) 58 | } 59 | val sparkConf = new SparkConf().setAppName("SparkKMeans") 60 | val sc = new SparkContext(sparkConf) 61 | val lines = sc.textFile(args(0)) 62 | val data = lines.map(parseVector _).cache() 63 | val K = args(1).toInt 64 | val convergeDist = args(2).toDouble 65 | 66 | val kPoints = data.takeSample(withReplacement = false, K, 42).toArray 67 | var tempDist = 1.0 68 | 69 | while(tempDist > convergeDist) { 70 | val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) 71 | 72 | val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)} 73 | 74 | val newPoints = pointStats.map {pair => 75 | (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() 76 | 77 | tempDist = 0.0 78 | for (i <- 0 until K) { 79 | tempDist += squaredDistance(kPoints(i), newPoints(i)) 80 | } 81 | 82 | for (newP <- newPoints) { 83 | kPoints(newP._1) = newP._2 84 | } 85 | println("Finished iteration (delta = " + tempDist + ")") 86 | } 87 | 88 | println("Final centers:") 89 | kPoints.foreach(println) 90 | sc.stop() 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | 28 | /** 29 | * Logistic regression based classification. 30 | * Usage: SparkLR [slices] 31 | */ 32 | object SparkLR { 33 | val N = 10000 // Number of data points 34 | val D = 10 // Numer of dimensions 35 | val R = 0.7 // Scaling factor 36 | val ITERATIONS = 5 37 | val rand = new Random(42) 38 | 39 | case class DataPoint(x: Vector[Double], y: Double) 40 | 41 | def generateData = { 42 | def generatePoint(i: Int) = { 43 | val y = if(i % 2 == 0) -1 else 1 44 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 45 | DataPoint(x, y) 46 | } 47 | Array.tabulate(N)(generatePoint) 48 | } 49 | 50 | def main(args: Array[String]) { 51 | val sparkConf = new SparkConf().setAppName("SparkLR") 52 | val sc = new SparkContext(sparkConf) 53 | val numSlices = if (args.length > 0) args(0).toInt else 2 54 | val points = sc.parallelize(generateData, numSlices).cache() 55 | 56 | // Initialize w to a random value 57 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 58 | println("Initial w: " + w) 59 | 60 | for (i <- 1 to ITERATIONS) { 61 | println("On iteration " + i) 62 | val gradient = points.map { p => 63 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 64 | }.reduce(_ + _) 65 | w -= gradient 66 | } 67 | 68 | println("Final w: " + w) 69 | sc.stop() 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkPageRank.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.SparkContext._ 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | /** 24 | * Computes the PageRank of URLs from an input file. Input file should 25 | * be in format of: 26 | * URL neighbor URL 27 | * URL neighbor URL 28 | * URL neighbor URL 29 | * ... 30 | * where URL and their neighbors are separated by space(s). 
31 | */ 32 | object SparkPageRank { 33 | def main(args: Array[String]) { 34 | val sparkConf = new SparkConf().setAppName("PageRank") 35 | var iters = args(1).toInt 36 | val ctx = new SparkContext(sparkConf) 37 | val lines = ctx.textFile(args(0), 1) 38 | val links = lines.map{ s => 39 | val parts = s.split("\\s+") 40 | (parts(0), parts(1)) 41 | }.distinct().groupByKey().cache() 42 | var ranks = links.mapValues(v => 1.0) 43 | 44 | for (i <- 1 to iters) { 45 | val contribs = links.join(ranks).values.flatMap{ case (urls, rank) => 46 | val size = urls.size 47 | urls.map(url => (url, rank / size)) 48 | } 49 | ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _) 50 | } 51 | 52 | val output = ranks.collect() 53 | output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + ".")) 54 | 55 | ctx.stop() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | 24 | /** Computes an approximation to pi */ 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val conf = new SparkConf().setAppName("Spark Pi") 28 | val spark = new SparkContext(conf) 29 | val slices = if (args.length > 0) args(0).toInt else 2 30 | val n = 100000 * slices 31 | val count = spark.parallelize(1 to n, slices).map { i => 32 | val x = random * 2 - 1 33 | val y = random * 2 - 1 34 | if (x*x + y*y < 1) 1 else 0 35 | }.reduce(_ + _) 36 | println("Pi is roughly " + 4.0 * count / n) 37 | spark.stop() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkTC.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.util.Random 21 | import scala.collection.mutable 22 | 23 | import org.apache.spark.{SparkConf, SparkContext} 24 | import org.apache.spark.SparkContext._ 25 | 26 | /** 27 | * Transitive closure on a graph. 28 | */ 29 | object SparkTC { 30 | val numEdges = 200 31 | val numVertices = 100 32 | val rand = new Random(42) 33 | 34 | def generateGraph = { 35 | val edges: mutable.Set[(Int, Int)] = mutable.Set.empty 36 | while (edges.size < numEdges) { 37 | val from = rand.nextInt(numVertices) 38 | val to = rand.nextInt(numVertices) 39 | if (from != to) edges.+=((from, to)) 40 | } 41 | edges.toSeq 42 | } 43 | 44 | def main(args: Array[String]) { 45 | val sparkConf = new SparkConf().setAppName("SparkTC") 46 | val spark = new SparkContext(sparkConf) 47 | val slices = if (args.length > 0) args(0).toInt else 2 48 | var tc = spark.parallelize(generateGraph, slices).cache() 49 | 50 | // Linear transitive closure: each round grows paths by one edge, 51 | // by joining the graph's edges with the already-discovered paths. 52 | // e.g. join the path (y, z) from the TC with the edge (x, y) from 53 | // the graph to obtain the path (x, z). 54 | 55 | // Because join() joins on keys, the edges are stored in reversed order. 56 | val edges = tc.map(x => (x._2, x._1)) 57 | 58 | // This join is iterated until a fixed point is reached. 59 | var oldCount = 0L 60 | var nextCount = tc.count() 61 | do { 62 | oldCount = nextCount 63 | // Perform the join, obtaining an RDD of (y, (z, x)) pairs, 64 | // then project the result to obtain the new (x, z) paths. 65 | tc = tc.union(tc.join(edges).map(x => (x._2._2, x._2._1))).distinct().cache() 66 | nextCount = tc.count() 67 | } while (nextCount != oldCount) 68 | 69 | println("TC has " + tc.count() + " edges.") 70 | spark.stop() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkTachyonHdfsLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | import org.apache.spark.deploy.SparkHadoopUtil 28 | import org.apache.spark.scheduler.InputFormatInfo 29 | import org.apache.spark.storage.StorageLevel 30 | 31 | 32 | /** 33 | * Logistic regression based classification. 34 | * This example uses Tachyon to persist rdds during computation. 
35 | */ 36 | object SparkTachyonHdfsLR { 37 | val D = 10 // Numer of dimensions 38 | val rand = new Random(42) 39 | 40 | case class DataPoint(x: Vector[Double], y: Double) 41 | 42 | def parsePoint(line: String): DataPoint = { 43 | val tok = new java.util.StringTokenizer(line, " ") 44 | var y = tok.nextToken.toDouble 45 | var x = new Array[Double](D) 46 | var i = 0 47 | while (i < D) { 48 | x(i) = tok.nextToken.toDouble; i += 1 49 | } 50 | DataPoint(new DenseVector(x), y) 51 | } 52 | 53 | def main(args: Array[String]) { 54 | val inputPath = args(0) 55 | val conf = SparkHadoopUtil.get.newConfiguration() 56 | val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") 57 | val sc = new SparkContext(sparkConf, 58 | InputFormatInfo.computePreferredLocations( 59 | Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) 60 | )) 61 | val lines = sc.textFile(inputPath) 62 | val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) 63 | val ITERATIONS = args(1).toInt 64 | 65 | // Initialize w to a random value 66 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 67 | println("Initial w: " + w) 68 | 69 | for (i <- 1 to ITERATIONS) { 70 | println("On iteration " + i) 71 | val gradient = points.map { p => 72 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 73 | }.reduce(_ + _) 74 | w -= gradient 75 | } 76 | 77 | println("Final w: " + w) 78 | sc.stop() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkTachyonPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.storage.StorageLevel 24 | 25 | /** 26 | * Computes an approximation to pi 27 | * This example uses Tachyon to persist rdds during computation. 
28 | */ 29 | object SparkTachyonPi { 30 | def main(args: Array[String]) { 31 | val sparkConf = new SparkConf().setAppName("SparkTachyonPi") 32 | val spark = new SparkContext(sparkConf) 33 | 34 | val slices = if (args.length > 0) args(0).toInt else 2 35 | val n = 100000 * slices 36 | 37 | val rdd = spark.parallelize(1 to n, slices) 38 | rdd.persist(StorageLevel.OFF_HEAP) 39 | val count = rdd.map { i => 40 | val x = random * 2 - 1 41 | val y = random * 2 - 1 42 | if (x * x + y * y < 1) 1 else 0 43 | }.reduce(_ + _) 44 | println("Pi is roughly " + 4.0 * count / n) 45 | 46 | spark.stop() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/pretty/examples/Aggregate.scala: -------------------------------------------------------------------------------- 1 | package pretty.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Aggregate { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "Aggregate Test") 10 | val d = List("0", "1", "2", "3", "4", "5", "6", "7", "8", "9") 11 | 12 | val data = sc.parallelize(d, 2) 13 | 14 | val result = data.aggregate("a")((x,y) => "[" + x + "," + y + "]", 15 | (x,y) => x + y) 16 | 17 | println(result) 18 | // output: 19 | // a[[[[[a,0],1],2],3],4][[[[[a,5],6],7],8],9] 20 | } 21 | } -------------------------------------------------------------------------------- /src/pretty/examples/Coalesce.scala: -------------------------------------------------------------------------------- 1 | package pretty.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Coalesce { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Coalesce Test") 8 | 9 | val data = sc.parallelize(1 to 20, 10) 10 | 11 | val result = data.coalesce(2) 12 | result.foreach(x => print(x + " ")) 13 | 14 | // equals "repartition(2)" 15 | val resultWithHashPartition = data.coalesce(2, true) 16 | resultWithHashPartition.foreach(x => print(x + " ")) 17 | } 18 | } -------------------------------------------------------------------------------- /src/pretty/examples/CogroupPair.scala: -------------------------------------------------------------------------------- 1 | package pretty.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object CogroupPair { 8 | def main(args: Array[String]) { 9 | val sc = new SparkContext("local", "Cogroup Test") 10 | 11 | val data1 = Array[(String, Int)](("A", 1), ("A", 2), 12 | ("B", 3), ("B", 4), 13 | ("C", 5), ("C", 6)) 14 | 15 | val data2 = Array[(String, Int)](("A", 7), ("A", 8), 16 | ("B", 9), ("C", 0)) 17 | 18 | val data3 = Array[(String, Int)](("A", 10), ("B", 11)) 19 | 20 | val pairs1 = sc.parallelize(data1, 3) 21 | val pairs2 = sc.parallelize(data2, 2) 22 | val pairs3 = sc.parallelize(data3, 3) 23 | 24 | val result1 = pairs1.cogroup(pairs2) 25 | result1.foreach(println) 26 | 27 | // val result2 = pairs1.cogroup(pairs2, pairs3) 28 | // result2.foreach(println) 29 | // 30 | // val result3 = pairs1.cogroup(pairs2, 1) 31 | // result3.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 32 | // 33 | // val result4 = pairs1.cogroup(pairs2, new RangePartitioner(2, pairs1)) 34 | // result4.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 35 | } 36 | } -------------------------------------------------------------------------------- /src/pretty/examples/GroupByKeyPair.scala: 
-------------------------------------------------------------------------------- 1 | package pretty.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object GroupByKeyPair { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "GroupByKeyPair Test") 12 | val d = sc.parallelize(1 to 100, 10) 13 | 14 | val pairs = d.keyBy(x => x % 10) 15 | 16 | val result1 = pairs.groupByKey() 17 | val result2 = pairs.groupByKey(3) 18 | val result3 = pairs.groupByKey(new RangePartitioner(3, pairs)) 19 | 20 | println("Result 1:") 21 | result1.foreach(println) 22 | 23 | println("Result 2:") 24 | result2.foreach(println) 25 | 26 | println("Result 3:") 27 | result3.foreach(println) 28 | 29 | } 30 | } --------------------------------------------------------------------------------
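GroupByKeyPair above shuffles every individual value so that the full per-key Iterable can be materialized. When only a per-key aggregate is needed, reduceByKey combines values on the map side before the shuffle, so far less data crosses the network. The sketch below is a hypothetical companion, not a file in this repository, written in the same local-mode style as the other pretty.examples programs.

    package pretty.examples

    import org.apache.spark.SparkContext
    import org.apache.spark.SparkContext._

    object ReduceByKeyPairSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "ReduceByKeyPair Sketch")
        val d = sc.parallelize(1 to 100, 10)

        // Same keying as GroupByKeyPair: key = x % 10.
        val pairs = d.keyBy(x => x % 10)

        // Per-key sums; partial sums are combined within each partition
        // before the shuffle, unlike groupByKey, which transfers every value.
        val sums = pairs.reduceByKey(_ + _)

        sums.foreach(println) // e.g. (0,550), (5,500), ...
      }
    }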