├── .cache ├── .classpath ├── .gitignore ├── .project ├── .settings ├── org.eclipse.jdt.core.prefs └── org.scala-ide.sdt.core.prefs └── src ├── .DS_Store ├── api └── examples │ ├── Cartesian.scala │ ├── Checkpoint.scala │ ├── Coalesce.scala │ ├── Cogroup.scala │ ├── Collect.scala │ ├── CollectAsMap.scala │ ├── CombineByKey.scala │ ├── CountApproxDistinct.scala │ ├── GroupByKeyPair.scala │ ├── IntersectionTest.scala │ ├── Sample.scala │ └── Utils.scala ├── internals ├── IntersectionTest.scala ├── RepartitionTest2.scala ├── broadcastTest.scala ├── cartesianTest.scala ├── coalesceTest.scala ├── cogroupTest.scala ├── complexStages.scala ├── distinctTest.scala ├── groupByKeyTest.scala ├── hashjoinTest.scala ├── joinTest.scala ├── pipeTest.scala ├── reduceByKeyTest.scala ├── repartitionTest.scala └── sortByKeyTest.scala ├── local └── examples │ ├── Aggregate.scala │ ├── AggregateOrder.scala │ ├── Cartesian.scala │ ├── CollectAsMap.scala │ ├── FlatMap.scala │ ├── GenerateRandomText.scala │ ├── GroupByAction.scala │ ├── GroupByKey.scala │ ├── GroupByTest.scala │ ├── GroupWith.scala │ ├── JoinAction.scala │ ├── LocalWordCount.scala │ ├── LookUpTest.scala │ ├── MapPartitionsRDDTest.scala │ ├── MapValuesTest.scala │ ├── PipedRDDTest.scala │ ├── ReduceByKeyActionTest.scala │ ├── ReduceByKeyToDriverTest.scala │ ├── SparkLR.scala │ ├── TakeActionTest.scala │ ├── UnionTest.scala │ ├── partitionByTest.scala │ ├── reduceActionTest.scala │ └── sortByKeyTest.scala ├── org ├── .DS_Store └── apache │ ├── .DS_Store │ └── spark │ ├── .DS_Store │ └── examples │ ├── BroadcastTest.scala │ ├── CassandraCQLTest.scala │ ├── CassandraTest.scala │ ├── DriverSubmissionTest.scala │ ├── ExceptionHandlingTest.scala │ ├── GroupByTest.scala │ ├── HBaseTest.scala │ ├── HdfsTest.scala │ ├── LocalALS.scala │ ├── LocalFileLR.scala │ ├── LocalKMeans.scala │ ├── LocalLR.scala │ ├── LocalPi.scala │ ├── LogQuery.scala │ ├── MultiBroadcastTest.scala │ ├── SimpleSkewedGroupByTest.scala │ ├── SkewedGroupByTest.scala │ ├── SparkALS.scala │ ├── SparkHdfsLR.scala │ ├── SparkKMeans.scala │ ├── SparkLR.scala │ ├── SparkPageRank.scala │ ├── SparkPi.scala │ ├── SparkTC.scala │ ├── SparkTachyonHdfsLR.scala │ └── SparkTachyonPi.scala └── pretty └── examples ├── Aggregate.scala ├── Coalesce.scala ├── CogroupPair.scala └── GroupByKeyPair.scala /.cache: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/.cache -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | SparkLearning 4 | 5 | 6 | 7 | 8 | 9 | org.scala-ide.sdt.core.scalabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.scala-ide.sdt.core.scalanature 16 | org.eclipse.jdt.core.javanature 17 | 18 | 19 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | 
org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /.settings/org.scala-ide.sdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | organizeimports.expandcollapse=expand 3 | organizeimports.groups=java$scala$org$com 4 | organizeimports.scalapackage=false 5 | organizeimports.wildcards=scalaz$scalaz.Scalaz 6 | -------------------------------------------------------------------------------- /src/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/.DS_Store -------------------------------------------------------------------------------- /src/api/examples/Cartesian.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Cartesian { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Cartesian Test") 8 | 9 | val x = sc.parallelize(List(1, 2, 3, 4, 5)) 10 | val y = sc.parallelize(List(6, 7, 8, 9, 10)) 11 | 12 | println(x ++ y ++ x) 13 | val result = x.cartesian(y) 14 | //result.collect 15 | result.foreach(println) 16 | } 17 | } -------------------------------------------------------------------------------- /src/api/examples/Checkpoint.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Checkpoint { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Checkpoint Test") 8 | 9 | sc.setCheckpointDir("/Users/xulijie/Documents/data/checkpoint") 10 | val a = sc.parallelize(1 to 4, 2) 11 | a.checkpoint 12 | a.count 13 | } 14 | } -------------------------------------------------------------------------------- /src/api/examples/Coalesce.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Coalesce { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Coalesce Test") 8 | 9 | val y = sc.parallelize(1 to 10, 10) 10 | 11 | y.foreach(println) 12 | 13 | val z = y.coalesce(2, true) 14 | 15 | z.foreach(println) 16 | } 17 | } -------------------------------------------------------------------------------- /src/api/examples/Cogroup.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object Cogroup { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "Cogroup Test") 9 | 10 | val a = sc.parallelize(List(1, 2, 1, 3), 2) 11 | val b = sc.parallelize(List(1, 2, 3, 4, 5, 6), 3) 12 | val d 
= a.map((_, "b")) 13 | //b.foreach(println) 14 | // output: 15 | // (1,b) 16 | // (2,b) 17 | // (1,b) 18 | // (3,b) 19 | val e = b.map((_, "c")) 20 | //c.foreach(println) 21 | // output: 22 | // (1,c) 23 | // (2,c) 24 | // (1,c) 25 | // (3,c) 26 | 27 | //val result = b.cogroup(c) 28 | val result = d.cogroup(e, 4) 29 | result.foreach(println) 30 | println(result.toDebugString) 31 | // output: 32 | // (1,(ArrayBuffer(b, b),ArrayBuffer(c, c))) 33 | // (3,(ArrayBuffer(b),ArrayBuffer(c))) 34 | // (2,(ArrayBuffer(b),ArrayBuffer(c))) 35 | 36 | /* 37 | * MappedValuesRDD[5] at cogroup at Cogroup.scala:28 (3 partitions) 38 | * CoGroupedRDD[4] at cogroup at Cogroup.scala:28 (3 partitions) 39 | * MappedRDD[2] at map at Cogroup.scala:12 (2 partitions) 40 | * ParallelCollectionRDD[0] at parallelize at Cogroup.scala:10 (2 partitions) 41 | * MappedRDD[3] at map at Cogroup.scala:19 (3 partitions) 42 | * ParallelCollectionRDD[1] at parallelize at Cogroup.scala:11 (3 partitions) 43 | * 44 | */ 45 | } 46 | } -------------------------------------------------------------------------------- /src/api/examples/Collect.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Collect { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Collect Test") 8 | 9 | val c = sc.parallelize(List("Gnu", "cat", "Rat", "Dog", "Gnu", "Rat"), 2) 10 | 11 | val result = c.collect 12 | result.foreach(println) 13 | } 14 | } -------------------------------------------------------------------------------- /src/api/examples/CollectAsMap.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object CollectAsMap { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "CollectAsMap Test") 9 | 10 | val a = sc.parallelize(List(1, 2, 1, 3), 1) 11 | val b = a.zip(a) 12 | 13 | val result = b.collectAsMap 14 | 15 | result.foreach(println) 16 | 17 | // output: 18 | // (2,2) 19 | // (1,1) 20 | // (3,3) 21 | } 22 | } -------------------------------------------------------------------------------- /src/api/examples/CombineByKey.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object CombineByKey { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "CombineByKey Test") 9 | 10 | val a = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3) 11 | val b = sc.parallelize(List(1, 1, 2, 2, 2, 1, 2, 2, 2), 3) 12 | val c = b.zip(a) 13 | 14 | val d = c.combineByKey(List(_), (x:List[String], y:String) 15 | => y :: x, (x:List[String], y:List[String]) => x ::: y) 16 | 17 | val result = d.collect 18 | result.foreach(println) 19 | println("RDD graph:\n" + d.toDebugString) 20 | } 21 | } -------------------------------------------------------------------------------- /src/api/examples/CountApproxDistinct.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object CountApproxDistinct { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "CountApproxDistinct 
Test") 9 | 10 | val a = sc.parallelize(1 to 10000, 20) 11 | val b = a++a++a++a++a 12 | 13 | val result = b.countApproxDistinct(0.1) 14 | println(result) 15 | //println(b.countApproxDistinct(0.05)) 16 | //println(b.countApproxDistinct(0.01)) 17 | //println(b.countApproxDistinct(0.001)) 18 | 19 | } 20 | } -------------------------------------------------------------------------------- /src/api/examples/GroupByKeyPair.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object GroupByKeyPair { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "GroupByKeyPair Test") 12 | val d = sc.parallelize(1 to 100, 10) 13 | 14 | val pairs = d.keyBy(x => x % 10) 15 | 16 | val result1 = pairs.groupByKey() 17 | //val result2 = pairs.groupByKey(3) 18 | //val result3 = pairs.groupByKey(new RangePartitioner(3, pairs)) 19 | 20 | println("Result 1:") 21 | result1.foreach(println) 22 | 23 | //println("Result 2:") 24 | //result2.foreach(println) 25 | 26 | //println("Result 3:") 27 | //result3.foreach(println) 28 | 29 | } 30 | } -------------------------------------------------------------------------------- /src/api/examples/IntersectionTest.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object IntersectionTest { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "Intersection Test") 12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3) 13 | val b = sc.parallelize(List(1, 2, 5, 6), 2) 14 | //val c = sc.parallelize(List(1, 2, 3), 1) 15 | 16 | val r = a.intersection(b) 17 | //r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 18 | 19 | println(r.toDebugString) 20 | // [PartitionIndex 1] 1 21 | // [PartitionIndex 2] 5 22 | // [PartitionIndex 2] 2 23 | } 24 | } -------------------------------------------------------------------------------- /src/api/examples/Sample.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | 6 | object Sample { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext("local", "Sample Test") 11 | val d = sc.parallelize(1 to 100, 10) 12 | 13 | val result1 = d.sample(false, 0.1, 0) 14 | val result2 = d.sample(true, 0.1, 0) 15 | 16 | println(result1.toDebugString) 17 | 18 | println("result 1:") 19 | result1.collect.foreach(x => print(x + " ")) 20 | println("\nresutl 2:") 21 | result2.collect.foreach(x => print(x + " ")) 22 | //result1.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 23 | //result2.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 24 | } 25 | } -------------------------------------------------------------------------------- /src/api/examples/Utils.scala: -------------------------------------------------------------------------------- 1 | package api.examples 2 | 3 | class Utils { 4 | 5 | //def print(rdd: RDD[T]) 6 | } -------------------------------------------------------------------------------- /src/internals/IntersectionTest.scala: 
-------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object IntersectionTest { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "Intersection Test") 12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3) 13 | val b = sc.parallelize(List(1, 2, 5, 6), 2) 14 | 15 | 16 | val r = a.intersection(b) 17 | 18 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x)) 19 | b.foreachWith(i => i)((x, i) => println("[bIndex " + i + "] " + x)) 20 | r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 21 | 22 | println(r.toDebugString) 23 | 24 | /* 25 | [aIndex 0] 1 26 | [aIndex 0] 2 27 | 28 | [aIndex 1] 3 29 | [aIndex 1] 3 30 | 31 | [aIndex 2] 4 32 | [aIndex 2] 5 33 | 34 | [bIndex 0] 1 35 | [bIndex 0] 2 36 | 37 | [bIndex 1] 5 38 | [bIndex 1] 6 39 | 40 | [PartitionIndex 1] 1 41 | 42 | [PartitionIndex 2] 5 43 | [PartitionIndex 2] 2 44 | 45 | MappedRDD[7] at intersection at IntersectionTest.scala:16 (3 partitions) 46 | FilteredRDD[6] at intersection at IntersectionTest.scala:16 (3 partitions) 47 | MappedValuesRDD[5] at intersection at IntersectionTest.scala:16 (3 partitions) 48 | CoGroupedRDD[4] at intersection at IntersectionTest.scala:16 (3 partitions) 49 | MappedRDD[2] at intersection at IntersectionTest.scala:16 (3 partitions) 50 | ParallelCollectionRDD[0] at parallelize at IntersectionTest.scala:12 (3 partitions) 51 | MappedRDD[3] at intersection at IntersectionTest.scala:16 (2 partitions) 52 | ParallelCollectionRDD[1] at parallelize at IntersectionTest.scala:13 (2 partitions) 53 | */ 54 | } 55 | } -------------------------------------------------------------------------------- /src/internals/RepartitionTest2.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.HashPartitioner 6 | 7 | object RepartitionTest2 { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext("local", "repartition Test") 11 | val data = Array[(Int, Char)]((3, 'a'), (2, 'b'), 12 | (1, 'c'), (4, 'd')) 13 | val pairs1 = sc.parallelize(data, 3).partitionBy(new HashPartitioner(2)) 14 | 15 | pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x)) 16 | } 17 | } 18 | /* 19 | [pairs1-Index 0] (3,a) 20 | [pairs1-Index 0] (2,b) 21 | [pairs1-Index 0] (1,c) 22 | 23 | [pairs1-Index 1] (4,d) 24 | */ -------------------------------------------------------------------------------- /src/internals/broadcastTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.SparkContext 5 | 6 | object broadcast { 7 | def main(args: Array[String]) { 8 | 9 | val bcName = "Http" 10 | val blockSize = "4096" 11 | 12 | System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." 
+ bcName + 13 | "BroadcastFactory") 14 | System.setProperty("spark.broadcast.blockSize", blockSize) 15 | val sparkConf = new SparkConf().setAppName("Broadcast Test").setMaster("local") 16 | 17 | val sc = new SparkContext(sparkConf) 18 | 19 | val slices = 2 20 | val num = 100 21 | 22 | val arr1 = new Array[Int](num) 23 | 24 | for (i <- 0 until arr1.length) { 25 | arr1(i) = i 26 | } 27 | 28 | val data = sc.makeRDD(List(1, 2, 3, 4, 5, 6), 2) 29 | 30 | val barr1 = sc.broadcast(arr1) 31 | val observedSizes = sc.parallelize(1 to 4, slices).map(_ => barr1.value.size) 32 | // Collect the small RDD so we can print the observed sizes locally. 33 | observedSizes.collect().foreach(i => println(i)) 34 | 35 | //println(barr1.value.size) 36 | //barr1.value.collect 37 | } 38 | } -------------------------------------------------------------------------------- /src/internals/cartesianTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object cartesianTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "cartesian Test") 10 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'), 11 | (3, 'c'), (4, 'd')) 12 | val pairs1 = sc.parallelize(data1, 2) 13 | 14 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B')) 15 | val pairs2 = sc.parallelize(data2, 2) 16 | 17 | val result = pairs1.cartesian(pairs2) 18 | 19 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x)) 20 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x)) 21 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 22 | 23 | //println(result.toDebugString) 24 | } 25 | } 26 | /* 27 | [pairs1-Index 0] (1,a) 28 | [pairs1-Index 0] (2,b) 29 | 30 | [pairs1-Index 1] (3,c) 31 | [pairs1-Index 1] (4,d) 32 | 33 | [pairs2-Index 0] (1,A) 34 | [pairs2-Index 1] (2,B) 35 | 36 | [PartitionIndex 0] ((1,a),(1,A)) 37 | [PartitionIndex 0] ((2,b),(1,A)) 38 | 39 | [PartitionIndex 1] ((1,a),(2,B)) 40 | [PartitionIndex 1] ((2,b),(2,B)) 41 | 42 | [PartitionIndex 2] ((3,c),(1,A)) 43 | [PartitionIndex 2] ((4,d),(1,A)) 44 | 45 | [PartitionIndex 3] ((3,c),(2,B)) 46 | [PartitionIndex 3] ((4,d),(2,B)) 47 | 48 | 49 | 50 | CartesianRDD[2] at cartesian at cartesianTest.scala:17 (4 partitions) 51 | ParallelCollectionRDD[0] at parallelize at cartesianTest.scala:12 (2 partitions) 52 | ParallelCollectionRDD[1] at parallelize at cartesianTest.scala:15 (2 partitions) 53 | 54 | */ 55 | 56 | -------------------------------------------------------------------------------- /src/internals/coalesceTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object coalesceTest { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Coalesce Test") 8 | 9 | //val y = sc.parallelize(1 to 10, 5) 10 | val y = sc.parallelize(List(1, 2, 3, 4, 5, 2, 5, 8, 3, 10), 5) 11 | // y.foreachWith(i => i)((x, i) => println("[yPartitionIndex " + i + "] " + x)) 12 | 13 | val z = y.coalesce(10, false) 14 | 15 | y.foreachWith(i => i)((x, i) => println("[yPartitionIndex " + i + "] " + x)) 16 | z.foreachWith(i => i)((x, i) => println("[zPartitionIndex " + i + "] " + x)) 17 | 18 | println(z.toDebugString) 19 | } 20 | } 21 | 22 | /* 23 | [yPartitionIndex 0] 1 24 | [yPartitionIndex 0] 2 25 | 26 | [yPartitionIndex 1] 3 27 | 
[yPartitionIndex 1] 4 28 | 29 | [yPartitionIndex 2] 5 30 | [yPartitionIndex 2] 6 31 | 32 | [yPartitionIndex 3] 7 33 | [yPartitionIndex 3] 8 34 | 35 | [yPartitionIndex 4] 9 36 | [yPartitionIndex 4] 10 37 | 38 | [zPartitionIndex 0] 1 39 | [zPartitionIndex 0] 2 40 | 41 | [zPartitionIndex 1] 3 42 | [zPartitionIndex 1] 4 43 | [zPartitionIndex 1] 5 44 | [zPartitionIndex 1] 6 45 | 46 | [zPartitionIndex 2] 7 47 | [zPartitionIndex 2] 8 48 | [zPartitionIndex 2] 9 49 | [zPartitionIndex 2] 10 50 | 51 | 52 | CoalescedRDD[1] at coalesce at coalesceTest.scala:13 (3 partitions) 53 | ParallelCollectionRDD[0] at parallelize at coalesceTest.scala:9 (5 partitions) 54 | 55 | 56 | [zPartitionIndex 0] 6 57 | [zPartitionIndex 0] 7 58 | [zPartitionIndex 0] 9 59 | 60 | [zPartitionIndex 1] 1 61 | [zPartitionIndex 1] 3 62 | [zPartitionIndex 1] 8 63 | [zPartitionIndex 1] 10 64 | 65 | [zPartitionIndex 2] 2 66 | [zPartitionIndex 2] 4 67 | [zPartitionIndex 2] 5 68 | 69 | 70 | 71 | 72 | MappedRDD[4] at coalesce at coalesceTest.scala:13 (3 partitions) 73 | CoalescedRDD[3] at coalesce at coalesceTest.scala:13 (3 partitions) 74 | ShuffledRDD[2] at coalesce at coalesceTest.scala:13 (3 partitions) 75 | MapPartitionsRDD[1] at coalesce at coalesceTest.scala:13 (5 partitions) 76 | ParallelCollectionRDD[0] at parallelize at coalesceTest.scala:9 (5 partitions) 77 | 78 | 79 | */ -------------------------------------------------------------------------------- /src/internals/cogroupTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object cogroupTest { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "cogroup Test") 12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3).map(x => (x, 'a')) 13 | val b = sc.parallelize(List(1, 2, 5, 6), 2).map(y => (y, 'b')) 14 | 15 | 16 | val r = a.cogroup(b) 17 | 18 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x)) 19 | b.foreachWith(i => i)((x, i) => println("[bIndex " + i + "] " + x)) 20 | r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 21 | 22 | println(r.toDebugString) 23 | 24 | /* 25 | [aIndex 0] (1,a) 26 | [aIndex 0] (2,a) 27 | 28 | [aIndex 1] (3,a) 29 | [aIndex 1] (3,a) 30 | 31 | [aIndex 2] (4,a) 32 | [aIndex 2] (5,a) 33 | 34 | [bIndex 0] (1,b) 35 | [bIndex 0] (2,b) 36 | 37 | [bIndex 1] (5,b) 38 | [bIndex 1] (6,b) 39 | 40 | [PartitionIndex 0] (6,(ArrayBuffer(),ArrayBuffer(b))) 41 | [PartitionIndex 0] (3,(ArrayBuffer(a, a),ArrayBuffer())) 42 | 43 | [PartitionIndex 1] (4,(ArrayBuffer(a),ArrayBuffer())) 44 | [PartitionIndex 1] (1,(ArrayBuffer(a),ArrayBuffer(b))) 45 | 46 | 47 | [PartitionIndex 2] (5,(ArrayBuffer(a),ArrayBuffer(b))) 48 | [PartitionIndex 2] (2,(ArrayBuffer(a),ArrayBuffer(b))) 49 | 50 | MappedValuesRDD[5] at cogroup at cogroupTest.scala:16 (3 partitions) 51 | CoGroupedRDD[4] at cogroup at cogroupTest.scala:16 (3 partitions) 52 | MappedRDD[1] at map at cogroupTest.scala:12 (3 partitions) 53 | ParallelCollectionRDD[0] at parallelize at cogroupTest.scala:12 (3 partitions) 54 | MappedRDD[3] at map at cogroupTest.scala:13 (2 partitions) 55 | ParallelCollectionRDD[2] at parallelize at cogroupTest.scala:13 (2 partitions) 56 | */ 57 | } 58 | } -------------------------------------------------------------------------------- /src/internals/complexStages.scala: 
-------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.HashPartitioner 6 | 7 | 8 | object complexStagesTest { 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "complexStages Test") 12 | 13 | 14 | val data1 = Array[(Int, Char)]( 15 | (1, 'a'), (2, 'b'), 16 | (3, 'c'), (4, 'd'), 17 | (5, 'e'), (3, 'f'), 18 | (2, 'g'), (1, 'h')) 19 | val rangePairs1 = sc.parallelize(data1, 3) 20 | 21 | val hashPairs1 = rangePairs1.partitionBy(new HashPartitioner(3)) 22 | 23 | 24 | val data2 = Array[(Int, String)]((1, "A"), (2, "B"), 25 | (3, "C"), (4, "D")) 26 | 27 | val pairs2 = sc.parallelize(data2, 2) 28 | val rangePairs2 = pairs2.map(x => (x._1, x._2.charAt(0))) 29 | 30 | 31 | val data3 = Array[(Int, Char)]((1, 'X'), (2, 'Y')) 32 | val rangePairs3 = sc.parallelize(data3, 2) 33 | 34 | 35 | val rangePairs = rangePairs2.union(rangePairs3) 36 | 37 | 38 | val result = hashPairs1.join(rangePairs) 39 | 40 | result.foreachWith(i => i)((x, i) => println("[result " + i + "] " + x)) 41 | 42 | println(result.toDebugString) 43 | } 44 | } -------------------------------------------------------------------------------- /src/internals/distinctTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object distinctTest { 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "distinct test") 9 | 10 | val pairs = sc.parallelize(List(1, 2, 2, 3, 2, 1, 4, 5), 3) 11 | 12 | val result = pairs.distinct(2) 13 | 14 | // output 15 | // [PartitionIndex 0] 1 16 | // [PartitionIndex 0] 2 17 | 18 | // [PartitionIndex 1] 2 19 | // [PartitionIndex 1] 3 20 | // [PartitionIndex 1] 2 21 | 22 | // [PartitionIndex 2] 1 23 | // [PartitionIndex 2] 4 24 | // [PartitionIndex 2] 5 25 | 26 | pairs.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 27 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 28 | 29 | // output 30 | // [PartitionIndex 0] 4 31 | // [PartitionIndex 0] 2 32 | 33 | // [PartitionIndex 1] 1 34 | // [PartitionIndex 1] 3 35 | // [PartitionIndex 1] 5 36 | 37 | println(result.toDebugString) 38 | } 39 | 40 | /* 41 | MappedRDD[5] at distinct at distinctTest.scala:12 (2 partitions) 42 | MapPartitionsRDD[4] at distinct at distinctTest.scala:12 (2 partitions) 43 | ShuffledRDD[3] at distinct at distinctTest.scala:12 (2 partitions) 44 | MapPartitionsRDD[2] at distinct at distinctTest.scala:12 (3 partitions) 45 | MappedRDD[1] at distinct at distinctTest.scala:12 (3 partitions) 46 | ParallelCollectionRDD[0] at parallelize at distinctTest.scala:10 (3 partitions) 47 | * 48 | */ 49 | } -------------------------------------------------------------------------------- /src/internals/groupByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | 7 | object groupByKeyTest { 8 | 9 | def main(args: Array[String]) { 10 | val conf = new SparkConf().setAppName("GroupByKey").setMaster("local") 11 | val sc = new SparkContext(conf) 12 | sc.setCheckpointDir("/Users/xulijie/Documents/data/checkpoint") 13 | 14 | val data = Array[(Int, Char)]((1, 'a'), 
(2, 'b'), 15 | (3, 'c'), (4, 'd'), 16 | (5, 'e'), (3, 'f'), 17 | (2, 'g'), (1, 'h') 18 | 19 | ) 20 | val pairs = sc.parallelize(data, 3) 21 | 22 | pairs.checkpoint 23 | pairs.count 24 | 25 | val result = pairs.groupByKey(2) 26 | 27 | // output: 28 | //pairs.foreachWith(i => i)((x, i) => println("[dataPartitionIndex " + i + "] " + x)) 29 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 30 | 31 | println(result.toDebugString) 32 | 33 | /* 34 | [dataPartitionIndex 0] (1,a) 35 | [dataPartitionIndex 0] (2,b) 36 | 37 | [dataPartitionIndex 1] (3,c) 38 | [dataPartitionIndex 1] (4,d) 39 | [dataPartitionIndex 1] (5,e) 40 | 41 | [dataPartitionIndex 2] (3,f) 42 | [dataPartitionIndex 2] (2,g) 43 | [dataPartitionIndex 2] (1,h) 44 | 45 | [PartitionIndex 0] (4,ArrayBuffer(d)) 46 | [PartitionIndex 0] (2,ArrayBuffer(b, g)) 47 | 48 | [PartitionIndex 1] (1,ArrayBuffer(a, h)) 49 | [PartitionIndex 1] (3,ArrayBuffer(c, f)) 50 | [PartitionIndex 1] (5,ArrayBuffer(e)) 51 | 52 | MappedValuesRDD[3] at groupByKey at groupByKeyTest.scala:19 (2 partitions) 53 | MapPartitionsRDD[2] at groupByKey at groupByKeyTest.scala:19 (2 partitions) 54 | ShuffledRDD[1] at groupByKey at groupByKeyTest.scala:19 (2 partitions) 55 | ParallelCollectionRDD[0] at parallelize at groupByKeyTest.scala:17 (3 partitions) 56 | */ 57 | } 58 | } -------------------------------------------------------------------------------- /src/internals/hashjoinTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.HashPartitioner 6 | 7 | object hashjoinTest { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext("local", "hashjoin Test") 11 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'), 12 | (3, 'c'), (4, 'd'), 13 | (5, 'e'), (3, 'f'), 14 | (2, 'g'), (1, 'h')) 15 | val pairs1 = sc.parallelize(data1, 3).partitionBy(new HashPartitioner(3)) 16 | 17 | 18 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B'), 19 | (3, 'C'), (4, 'D')) 20 | val pairs2 = sc.parallelize(data2, 2) 21 | 22 | val result = pairs1.join(pairs2) 23 | 24 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x)) 25 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x)) 26 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 27 | 28 | println(result.toDebugString) 29 | /* 30 | [pairs1-Index 0] (1,a) 31 | [pairs1-Index 0] (2,b) 32 | 33 | [pairs1-Index 1] (3,c) 34 | [pairs1-Index 1] (4,d) 35 | [pairs1-Index 1] (5,e) 36 | 37 | [pairs1-Index 2] (3,f) 38 | [pairs1-Index 2] (2,g) 39 | [pairs1-Index 2] (1,h) 40 | 41 | [pairs2-Index 0] (1,A) 42 | [pairs2-Index 0] (2,B) 43 | 44 | [pairs2-Index 1] (3,C) 45 | [pairs2-Index 1] (4,D) 46 | 47 | [PartitionIndex 0] (3,(c,C)) 48 | [PartitionIndex 0] (3,(f,C)) 49 | 50 | [PartitionIndex 1] (4,(d,D)) 51 | [PartitionIndex 1] (1,(a,A)) 52 | [PartitionIndex 1] (1,(h,A)) 53 | 54 | [PartitionIndex 2] (2,(b,B)) 55 | [PartitionIndex 2] (2,(g,B)) 56 | 57 | FlatMappedValuesRDD[4] at join at joinTest.scala:20 (3 partitions) 58 | MappedValuesRDD[3] at join at joinTest.scala:20 (3 partitions) 59 | CoGroupedRDD[2] at join at joinTest.scala:20 (3 partitions) 60 | ParallelCollectionRDD[0] at parallelize at joinTest.scala:14 (3 partitions) 61 | ParallelCollectionRDD[1] at parallelize at joinTest.scala:18 (2 partitions) 62 | 63 | */ 64 | } 65 | } 
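// Editor's sketch (not part of the original repository): hashjoinTest above pre-partitions
// pairs1 with a HashPartitioner(3) before the join, while joinTest below joins two
// unpartitioned RDDs. The minimal example here, assuming only the standard Spark RDD API
// already used throughout these files, shows how to check whether join() reuses an existing
// partitioner, in which case the pre-partitioned side is not shuffled a second time.
object partitionerCheckSketch {
  import org.apache.spark.{HashPartitioner, SparkContext}
  import org.apache.spark.SparkContext._

  def main(args: Array[String]) {
    val sc = new SparkContext("local", "partitioner check sketch")

    val left = sc.parallelize(Seq((1, 'a'), (2, 'b'), (3, 'c')), 3)
      .partitionBy(new HashPartitioner(3)) // shuffled once, here
    val right = sc.parallelize(Seq((1, 'A'), (2, 'B')), 2)

    println(left.partitioner)  // Some(HashPartitioner) - already hash-partitioned
    println(right.partitioner) // None - will be shuffled by the join

    val joined = left.join(right) // picks up left's HashPartitioner(3)
    println(joined.partitioner)   // Some(HashPartitioner) covering 3 partitions
    println(joined.toDebugString) // lineage of the join, as in the examples above
  }
}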
-------------------------------------------------------------------------------- /src/internals/joinTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.HashPartitioner 6 | 7 | object joinTest { 8 | def main(args: Array[String]) { 9 | 10 | val sc = new SparkContext("local", "join Test") 11 | 12 | 13 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'), 14 | (3, 'c'), (4, 'd'), 15 | (5, 'e'), (3, 'f'), 16 | (2, 'g'), (1, 'h')) 17 | val pairs1 = sc.parallelize(data1, 3) 18 | 19 | 20 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B'), 21 | (3, 'C'), (4, 'D')) 22 | val pairs2 = sc.parallelize(data2, 2) 23 | 24 | 25 | val result = pairs1.join(pairs2) 26 | 27 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x)) 28 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x)) 29 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 30 | 31 | println(result.toDebugString) 32 | 33 | /* 34 | [pairs1-Index 0] (1,a) 35 | [pairs1-Index 0] (2,b) 36 | 37 | [pairs1-Index 1] (3,c) 38 | [pairs1-Index 1] (4,d) 39 | [pairs1-Index 1] (5,e) 40 | 41 | [pairs1-Index 2] (3,f) 42 | [pairs1-Index 2] (2,g) 43 | [pairs1-Index 2] (1,h) 44 | 45 | [pairs2-Index 0] (1,A) 46 | [pairs2-Index 0] (2,B) 47 | 48 | [pairs2-Index 1] (3,C) 49 | [pairs2-Index 1] (4,D) 50 | 51 | [PartitionIndex 0] (3,(c,C)) 52 | [PartitionIndex 0] (3,(f,C)) 53 | 54 | [PartitionIndex 1] (4,(d,D)) 55 | [PartitionIndex 1] (1,(a,A)) 56 | [PartitionIndex 1] (1,(h,A)) 57 | 58 | [PartitionIndex 2] (2,(b,B)) 59 | [PartitionIndex 2] (2,(g,B)) 60 | 61 | FlatMappedValuesRDD[4] at join at joinTest.scala:20 (3 partitions) 62 | MappedValuesRDD[3] at join at joinTest.scala:20 (3 partitions) 63 | CoGroupedRDD[2] at join at joinTest.scala:20 (3 partitions) 64 | ParallelCollectionRDD[0] at parallelize at joinTest.scala:14 (3 partitions) 65 | ParallelCollectionRDD[1] at parallelize at joinTest.scala:18 (2 partitions) 66 | 67 | */ 68 | } 69 | } -------------------------------------------------------------------------------- /src/internals/pipeTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object pipeTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "cartesian Test") 10 | 11 | val a = sc.parallelize(1 to 9, 3) 12 | val result = a.pipe("head -n 2") 13 | 14 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x)) 15 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 16 | 17 | println(result.toDebugString) 18 | } 19 | } 20 | /* 21 | [PartitionIndex 0] 1 22 | [PartitionIndex 0] 2 23 | 24 | [PartitionIndex 1] 4 25 | [PartitionIndex 1] 5 26 | 27 | [PartitionIndex 2] 7 28 | [PartitionIndex 2] 8 29 | 30 | 31 | 32 | PipedRDD[1] at pipe at pipeTest.scala:12 (3 partitions) 33 | ParallelCollectionRDD[0] at parallelize at pipeTest.scala:11 (3 partitions) 34 | 35 | */ 36 | 37 | -------------------------------------------------------------------------------- /src/internals/reduceByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | 
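// Editor's sketch (not part of the original file): reduceByKeyTest below turns
// ("A",1)-style pairs into per-key counts with reduceByKey(_ + _, 2). For comparison,
// and assuming only the imports above, the same result can be written with
// groupByKey + mapValues; reduceByKey is preferable because it combines values on the
// map side before the shuffle, while groupByKey ships every individual pair.
object reduceByKeyVsGroupByKeySketch {
  def run(sc: SparkContext) {
    val pairs = sc.parallelize(Seq(("A", 1), ("B", 1), ("C", 1), ("A", 1)), 3)

    val reduced = pairs.reduceByKey(_ + _, 2)           // map-side combine, then shuffle
    val grouped = pairs.groupByKey(2).mapValues(_.sum)  // shuffles every (key, 1) pair

    reduced.collect.foreach(println) // e.g. (A,2) (B,1) (C,1)
    grouped.collect.foreach(println) // same counts, but more data moved in the shuffle
  }
}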
object reduceByKeyTest { 7 | 8 | def main(args: Array[String]) { 9 | val sc = new SparkContext("local", "ReduceByKey Test") 10 | val data1 = Array[(String, Int)](("A", 1), ("B", 1), 11 | ("C", 1), ("B", 1), 12 | ("C", 1), ("D", 1), 13 | ("C", 1), ("A", 1)) 14 | val pairs = sc.parallelize(data1, 3) 15 | 16 | // pairs.foreachWith(i => i)((x, i) => println("[pPartitionIndex " + i + "] " + x)) 17 | 18 | // [pPartitionIndex 0] (A,1) 19 | // [pPartitionIndex 0] (B,1) 20 | 21 | // [pPartitionIndex 1] (C,1) 22 | // [pPartitionIndex 1] (B,1) 23 | // [pPartitionIndex 1] (C,1) 24 | 25 | // [pPartitionIndex 2] (D,1) 26 | // [pPartitionIndex 2] (C,1) 27 | // [pPartitionIndex 2] (A,1) 28 | 29 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 30 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 31 | val result = pairs.reduceByKey(_ + _, 2) 32 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 33 | 34 | println(result.toDebugString) 35 | 36 | // output 37 | // [PartitionIndex 0] (B,2) 38 | // [PartitionIndex 0] (D,1) 39 | // [PartitionIndex 1] (A,2) 40 | // [PartitionIndex 1] (C,3) 41 | 42 | /* 43 | MapPartitionsRDD[3] at reduceByKey at reduceByKeyTest.scala:17 (2 partitions) 44 | ShuffledRDD[2] at reduceByKey at reduceByKeyTest.scala:17 (2 partitions) 45 | MapPartitionsRDD[1] at reduceByKey at reduceByKeyTest.scala:17 (3 partitions) 46 | ParallelCollectionRDD[0] at parallelize at reduceByKeyTest.scala:14 (3 partitions) 47 | */ 48 | } 49 | } -------------------------------------------------------------------------------- /src/internals/repartitionTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object repartitionTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "Coalesce Test") 10 | val y = sc.parallelize(1 to 100, 5) 11 | 12 | //y.foreach(println) 13 | 14 | val z = y.repartition(2) 15 | 16 | val r = z.takeOrdered(7) 17 | z.foreach(println) 18 | } 19 | } -------------------------------------------------------------------------------- /src/internals/sortByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package internals 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object sortByKeyTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "sortByKey Test") 10 | val data1 = Array[(Char, Int)](('A', 5), ('B', 4), 11 | ('C', 3), ('B', 2), 12 | ('C', 1), ('D', 2), 13 | ('C', 3), ('A', 4)) 14 | val pairs = sc.parallelize(data1, 3) 15 | 16 | val result = pairs.sortByKey(true, 2) 17 | pairs.foreachWith(i => i)((x, i) => println("[pairsPartitionIndex " + i + "] " + x)) 18 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 19 | 20 | println(result.toDebugString) 21 | } 22 | 23 | } 24 | 25 | /* 26 | [pairsPartitionIndex 0] (A,5) 27 | [pairsPartitionIndex 0] (B,4) 28 | 29 | [pairsPartitionIndex 1] (C,3) 30 | [pairsPartitionIndex 1] (B,2) 31 | [pairsPartitionIndex 1] (C,1) 32 | 33 | [pairsPartitionIndex 2] (D,2) 34 | [pairsPartitionIndex 2] (C,3) 35 | [pairsPartitionIndex 2] (A,4) 36 | 37 | [PartitionIndex 0] (A,5) 38 | [PartitionIndex 0] (A,4) 39 | [PartitionIndex 0] (B,4) 40 | [PartitionIndex 0] (B,2) 41 | 42 | [PartitionIndex 1] (C,3) 43 | [PartitionIndex 1] (C,1) 44 | 
[PartitionIndex 1] (C,3) 45 | [PartitionIndex 1] (D,2) 46 | 47 | MapPartitionsRDD[4] at sortByKey at sortByKeyTest.scala:16 (2 partitions) 48 | ShuffledRDD[3] at sortByKey at sortByKeyTest.scala:16 (2 partitions) 49 | ParallelCollectionRDD[0] at parallelize at sortByKeyTest.scala:14 (3 partitions) 50 | */ 51 | -------------------------------------------------------------------------------- /src/local/examples/Aggregate.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Aggregate { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "AggregateAction Test") 10 | val data = Array[(String, Int)](("A1", 1), ("A2", 2), 11 | ("B1", 3), ("B2", 4), 12 | ("C1", 5), ("C2", 6)) 13 | 14 | val pairs = sc.parallelize(data, 3) 15 | 16 | // output: 17 | // (A1,1)(A2,2) 18 | // (B1,3)(B2,4) 19 | // (C1,5)(C2,6) 20 | pairs.foreach(print) 21 | 22 | val result = pairs.aggregate(("", 0))((U, T) => (U._1 + T._1, U._2 + T._2), (U, T) => 23 | ("[" + U._1 + T._1 + "] ", U._2 + T._2)) 24 | 25 | // output ([[[A1A2] B1B2] C1C2] ,21) 26 | println(result) 27 | } 28 | } -------------------------------------------------------------------------------- /src/local/examples/AggregateOrder.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object AggregateOrder { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "AggregateOrder Test") 10 | val data = List("12", "23", "345", "4567") 11 | 12 | val pairs = sc.parallelize(data, 2) 13 | pairs.foreach(x => println(x.length)) 14 | 15 | //val result = pairs.aggregate("")((x,y) => math.min(x.length, y.length).toString, (x,y) => x + y) 16 | 17 | val result2 = pairs.aggregate("")((x,y) => "[" + x.length + "," + y.length + "] ", (x,y) => x + y) 18 | 19 | result2.foreach(println) 20 | println(result2) 21 | 22 | } 23 | } -------------------------------------------------------------------------------- /src/local/examples/Cartesian.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Cartesian { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Cartesian Test") 8 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2), 9 | ("B1", 3), ("B2", 4), 10 | ("C1", 5), ("C1", 6)) 11 | 12 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8), 13 | ("B1", 9), ("C1", 0)) 14 | val pairs1 = sc.parallelize(data1, 3) 15 | val pairs2 = sc.parallelize(data2, 2) 16 | 17 | val resultRDD = pairs1.cartesian(pairs2) 18 | 19 | resultRDD.foreach(println) 20 | 21 | /* 22 | * Output of task1: 23 | * ((A1,1),(A1,7)) 24 | * ((A1,1),(A2,8)) 25 | * ((A2,2),(A1,7)) 26 | * ((A2,2),(A2,8)) 27 | * Output of task2: 28 | * ((A1,1),(B1,9)) 29 | * ((A1,1),(C1,0)) 30 | * ((A2,2),(B1,9)) 31 | * ((A2,2),(C1,0)) 32 | * Output of task3: 33 | * ((B1,3),(A1,7)) 34 | * ((B1,3),(A2,8)) 35 | * ((B2,4),(A1,7)) 36 | * ((B2,4),(A2,8)) 37 | * Output of task4: 38 | * ((B1,3),(B1,9)) 39 | * ((B1,3),(C1,0)) 40 | * ((B2,4),(B1,9)) 41 | * ((B2,4),(C1,0)) 42 | * Output of task5: 43 | * ((C1,5),(A1,7)) 44 | * ((C1,5),(A2,8)) 45 | * ((C1,6),(A1,7)) 46 | * ((C1,6),(A2,8)) 47 | * Output of task6: 48 | * ((C1,5),(B1,9)) 49 | * ((C1,5),(C1,0)) 50 | * ((C1,6),(B1,9)) 51 | * ((C1,6),(C1,0)) 52 | */ 53 | 54 | } 
55 | } -------------------------------------------------------------------------------- /src/local/examples/CollectAsMap.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object CollectAsMap { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "CollectAsMap Test") 10 | val data = Array[(String, Int)](("A", 1), ("B", 2), 11 | ("B", 3), ("C", 4), 12 | ("C", 5), ("C", 6)) 13 | 14 | // as same as "val pairs = sc.parallelize(data, 3)" 15 | val pairs = sc.makeRDD(data, 3) 16 | 17 | val result = pairs.collectAsMap 18 | 19 | // output Map(A -> 1, C -> 6, B -> 3) 20 | print(result) 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /src/local/examples/FlatMap.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object FlatMap { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "FlatMap Test") 9 | val data = Array[(String, Int)](("A", 1), ("B", 2), 10 | ("B", 3), ("C", 4), 11 | ("C", 5), ("C", 6) 12 | ) 13 | val pairs = sc.makeRDD(data, 3) 14 | 15 | val result = pairs.flatMap(T => (T._1 + T._2)) 16 | 17 | result.foreach(println) 18 | 19 | } 20 | } -------------------------------------------------------------------------------- /src/local/examples/GenerateRandomText.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import java.io.File 4 | import java.io.FileWriter 5 | import java.util.Random 6 | 7 | object GenerateRandomText { 8 | 9 | def main(args: Array[String]) { 10 | val outputPath = new File("/Users/xulijie/Documents/data/RandomText/randomText-10MB.txt") 11 | if(!outputPath.getParentFile().exists()) 12 | outputPath.getParentFile.mkdirs() 13 | 14 | val writer = new FileWriter(outputPath) 15 | val ranGen = new Random 16 | 17 | 18 | while (outputPath.length() < 10 * 1024 * 1024) { 19 | var index = Math.abs(ranGen.nextGaussian() * 1000) % 1000 20 | writer.write(words.apply(index.toInt)) 21 | writer.write(" ") 22 | index = Math.abs(ranGen.nextGaussian()) % 1000 23 | writer.write(words.apply(index.toInt)) 24 | writer.write("\n") 25 | } 26 | writer.close() 27 | } 28 | 29 | def printToFile(f: java.io.File)(op: java.io.PrintWriter => Unit) { 30 | val p = new java.io.PrintWriter(f) 31 | try { op(p) } finally { p.close() } 32 | } 33 | 34 | /** 35 | * A random list of 100 words from /usr/share/dict/words 36 | */ 37 | private val words = Array[String]( 38 | "diurnalness", "Homoiousian", 39 | "spiranthic", "tetragynian", 40 | "silverhead", "ungreat", 41 | "lithograph", "exploiter", 42 | "physiologian", "by", 43 | "hellbender", "Filipendula", 44 | "undeterring", "antiscolic", 45 | "pentagamist", "hypoid", 46 | "cacuminal", "sertularian", 47 | "schoolmasterism", "nonuple", 48 | "gallybeggar", "phytonic", 49 | "swearingly", "nebular", 50 | "Confervales", "thermochemically", 51 | "characinoid", "cocksuredom", 52 | "fallacious", "feasibleness", 53 | "debromination", "playfellowship", 54 | "tramplike", "testa", 55 | "participatingly", "unaccessible", 56 | "bromate", "experientialist", 57 | "roughcast", "docimastical", 58 | "choralcelo", "blightbird", 59 | "peptonate", "sombreroed", 60 | "unschematized", "antiabolitionist", 61 | "besagne", "mastication", 62 | "bromic", 
"sviatonosite", 63 | "cattimandoo", "metaphrastical", 64 | "endotheliomyoma", "hysterolysis", 65 | "unfulminated", "Hester", 66 | "oblongly", "blurredness", 67 | "authorling", "chasmy", 68 | "Scorpaenidae", "toxihaemia", 69 | "Dictograph", "Quakerishly", 70 | "deaf", "timbermonger", 71 | "strammel", "Thraupidae", 72 | "seditious", "plerome", 73 | "Arneb", "eristically", 74 | "serpentinic", "glaumrie", 75 | "socioromantic", "apocalypst", 76 | "tartrous", "Bassaris", 77 | "angiolymphoma", "horsefly", 78 | "kenno", "astronomize", 79 | "euphemious", "arsenide", 80 | "untongued", "parabolicness", 81 | "uvanite", "helpless", 82 | "gemmeous", "stormy", 83 | "templar", "erythrodextrin", 84 | "comism", "interfraternal", 85 | "preparative", "parastas", 86 | "frontoorbital", "Ophiosaurus", 87 | "diopside", "serosanguineous", 88 | "ununiformly", "karyological", 89 | "collegian", "allotropic", 90 | "depravity", "amylogenesis", 91 | "reformatory", "epidymides", 92 | "pleurotropous", "trillium", 93 | "dastardliness", "coadvice", 94 | "embryotic", "benthonic", 95 | "pomiferous", "figureheadship", 96 | "Megaluridae", "Harpa", 97 | "frenal", "commotion", 98 | "abthainry", "cobeliever", 99 | "manilla", "spiciferous", 100 | "nativeness", "obispo", 101 | "monilioid", "biopsic", 102 | "valvula", "enterostomy", 103 | "planosubulate", "pterostigma", 104 | "lifter", "triradiated", 105 | "venialness", "tum", 106 | "archistome", "tautness", 107 | "unswanlike", "antivenin", 108 | "Lentibulariaceae", "Triphora", 109 | "angiopathy", "anta", 110 | "Dawsonia", "becomma", 111 | "Yannigan", "winterproof", 112 | "antalgol", "harr", 113 | "underogating", "ineunt", 114 | "cornberry", "flippantness", 115 | "scyphostoma", "approbation", 116 | "Ghent", "Macraucheniidae", 117 | "scabbiness", "unanatomized", 118 | "photoelasticity", "eurythermal", 119 | "enation", "prepavement", 120 | "flushgate", "subsequentially", 121 | "Edo", "antihero", 122 | "Isokontae", "unforkedness", 123 | "porriginous", "daytime", 124 | "nonexecutive", "trisilicic", 125 | "morphiomania", "paranephros", 126 | "botchedly", "impugnation", 127 | "Dodecatheon", "obolus", 128 | "unburnt", "provedore", 129 | "Aktistetae", "superindifference", 130 | "Alethea", "Joachimite", 131 | "cyanophilous", "chorograph", 132 | "brooky", "figured", 133 | "periclitation", "quintette", 134 | "hondo", "ornithodelphous", 135 | "unefficient", "pondside", 136 | "bogydom", "laurinoxylon", 137 | "Shiah", "unharmed", 138 | "cartful", "noncrystallized", 139 | "abusiveness", "cromlech", 140 | "japanned", "rizzomed", 141 | "underskin", "adscendent", 142 | "allectory", "gelatinousness", 143 | "volcano", "uncompromisingly", 144 | "cubit", "idiotize", 145 | "unfurbelowed", "undinted", 146 | "magnetooptics", "Savitar", 147 | "diwata", "ramosopalmate", 148 | "Pishquow", "tomorn", 149 | "apopenptic", "Haversian", 150 | "Hysterocarpus", "ten", 151 | "outhue", "Bertat", 152 | "mechanist", "asparaginic", 153 | "velaric", "tonsure", 154 | "bubble", "Pyrales", 155 | "regardful", "glyphography", 156 | "calabazilla", "shellworker", 157 | "stradametrical", "havoc", 158 | "theologicopolitical", "sawdust", 159 | "diatomaceous", "jajman", 160 | "temporomastoid", "Serrifera", 161 | "Ochnaceae", "aspersor", 162 | "trailmaking", "Bishareen", 163 | "digitule", "octogynous", 164 | "epididymitis", "smokefarthings", 165 | "bacillite", "overcrown", 166 | "mangonism", "sirrah", 167 | "undecorated", "psychofugal", 168 | "bismuthiferous", "rechar", 169 | "Lemuridae", "frameable", 170 | "thiodiazole", "Scanic", 171 
| "sportswomanship", "interruptedness", 172 | "admissory", "osteopaedion", 173 | "tingly", "tomorrowness", 174 | "ethnocracy", "trabecular", 175 | "vitally", "fossilism", 176 | "adz", "metopon", 177 | "prefatorial", "expiscate", 178 | "diathermacy", "chronist", 179 | "nigh", "generalizable", 180 | "hysterogen", "aurothiosulphuric", 181 | "whitlowwort", "downthrust", 182 | "Protestantize", "monander", 183 | "Itea", "chronographic", 184 | "silicize", "Dunlop", 185 | "eer", "componental", 186 | "spot", "pamphlet", 187 | "antineuritic", "paradisean", 188 | "interruptor", "debellator", 189 | "overcultured", "Florissant", 190 | "hyocholic", "pneumatotherapy", 191 | "tailoress", "rave", 192 | "unpeople", "Sebastian", 193 | "thermanesthesia", "Coniferae", 194 | "swacking", "posterishness", 195 | "ethmopalatal", "whittle", 196 | "analgize", "scabbardless", 197 | "naught", "symbiogenetically", 198 | "trip", "parodist", 199 | "columniform", "trunnel", 200 | "yawler", "goodwill", 201 | "pseudohalogen", "swangy", 202 | "cervisial", "mediateness", 203 | "genii", "imprescribable", 204 | "pony", "consumptional", 205 | "carposporangial", "poleax", 206 | "bestill", "subfebrile", 207 | "sapphiric", "arrowworm", 208 | "qualminess", "ultraobscure", 209 | "thorite", "Fouquieria", 210 | "Bermudian", "prescriber", 211 | "elemicin", "warlike", 212 | "semiangle", "rotular", 213 | "misthread", "returnability", 214 | "seraphism", "precostal", 215 | "quarried", "Babylonism", 216 | "sangaree", "seelful", 217 | "placatory", "pachydermous", 218 | "bozal", "galbulus", 219 | "spermaphyte", "cumbrousness", 220 | "pope", "signifier", 221 | "Endomycetaceae", "shallowish", 222 | "sequacity", "periarthritis", 223 | "bathysphere", "pentosuria", 224 | "Dadaism", "spookdom", 225 | "Consolamentum", "afterpressure", 226 | "mutter", "louse", 227 | "ovoviviparous", "corbel", 228 | "metastoma", "biventer", 229 | "Hydrangea", "hogmace", 230 | "seizing", "nonsuppressed", 231 | "oratorize", "uncarefully", 232 | "benzothiofuran", "penult", 233 | "balanocele", "macropterous", 234 | "dishpan", "marten", 235 | "absvolt", "jirble", 236 | "parmelioid", "airfreighter", 237 | "acocotl", "archesporial", 238 | "hypoplastral", "preoral", 239 | "quailberry", "cinque", 240 | "terrestrially", "stroking", 241 | "limpet", "moodishness", 242 | "canicule", "archididascalian", 243 | "pompiloid", "overstaid", 244 | "introducer", "Italical", 245 | "Christianopaganism", "prescriptible", 246 | "subofficer", "danseuse", 247 | "cloy", "saguran", 248 | "frictionlessly", "deindividualization", 249 | "Bulanda", "ventricous", 250 | "subfoliar", "basto", 251 | "scapuloradial", "suspend", 252 | "stiffish", "Sphenodontidae", 253 | "eternal", "verbid", 254 | "mammonish", "upcushion", 255 | "barkometer", "concretion", 256 | "preagitate", "incomprehensible", 257 | "tristich", "visceral", 258 | "hemimelus", "patroller", 259 | "stentorophonic", "pinulus", 260 | "kerykeion", "brutism", 261 | "monstership", "merciful", 262 | "overinstruct", "defensibly", 263 | "bettermost", "splenauxe", 264 | "Mormyrus", "unreprimanded", 265 | "taver", "ell", 266 | "proacquittal", "infestation", 267 | "overwoven", "Lincolnlike", 268 | "chacona", "Tamil", 269 | "classificational", "lebensraum", 270 | "reeveland", "intuition", 271 | "Whilkut", "focaloid", 272 | "Eleusinian", "micromembrane", 273 | "byroad", "nonrepetition", 274 | "bacterioblast", "brag", 275 | "ribaldrous", "phytoma", 276 | "counteralliance", "pelvimetry", 277 | "pelf", "relaster", 278 | "thermoresistant", "aneurism", 279 | 
"molossic", "euphonym", 280 | "upswell", "ladhood", 281 | "phallaceous", "inertly", 282 | "gunshop", "stereotypography", 283 | "laryngic", "refasten", 284 | "twinling", "oflete", 285 | "hepatorrhaphy", "electrotechnics", 286 | "cockal", "guitarist", 287 | "topsail", "Cimmerianism", 288 | "larklike", "Llandovery", 289 | "pyrocatechol", "immatchable", 290 | "chooser", "metrocratic", 291 | "craglike", "quadrennial", 292 | "nonpoisonous", "undercolored", 293 | "knob", "ultratense", 294 | "balladmonger", "slait", 295 | "sialadenitis", "bucketer", 296 | "magnificently", "unstipulated", 297 | "unscourged", "unsupercilious", 298 | "packsack", "pansophism", 299 | "soorkee", "percent", 300 | "subirrigate", "champer", 301 | "metapolitics", "spherulitic", 302 | "involatile", "metaphonical", 303 | "stachyuraceous", "speckedness", 304 | "bespin", "proboscidiform", 305 | "gul", "squit", 306 | "yeelaman", "peristeropode", 307 | "opacousness", "shibuichi", 308 | "retinize", "yote", 309 | "misexposition", "devilwise", 310 | "pumpkinification", "vinny", 311 | "bonze", "glossing", 312 | "decardinalize", "transcortical", 313 | "serphoid", "deepmost", 314 | "guanajuatite", "wemless", 315 | "arval", "lammy", 316 | "Effie", "Saponaria", 317 | "tetrahedral", "prolificy", 318 | "excerpt", "dunkadoo", 319 | "Spencerism", "insatiately", 320 | "Gilaki", "oratorship", 321 | "arduousness", "unbashfulness", 322 | "Pithecolobium", "unisexuality", 323 | "veterinarian", "detractive", 324 | "liquidity", "acidophile", 325 | "proauction", "sural", 326 | "totaquina", "Vichyite", 327 | "uninhabitedness", "allegedly", 328 | "Gothish", "manny", 329 | "Inger", "flutist", 330 | "ticktick", "Ludgatian", 331 | "homotransplant", "orthopedical", 332 | "diminutively", "monogoneutic", 333 | "Kenipsim", "sarcologist", 334 | "drome", "stronghearted", 335 | "Fameuse", "Swaziland", 336 | "alen", "chilblain", 337 | "beatable", "agglomeratic", 338 | "constitutor", "tendomucoid", 339 | "porencephalous", "arteriasis", 340 | "boser", "tantivy", 341 | "rede", "lineamental", 342 | "uncontradictableness", "homeotypical", 343 | "masa", "folious", 344 | "dosseret", "neurodegenerative", 345 | "subtransverse", "Chiasmodontidae", 346 | "palaeotheriodont", "unstressedly", 347 | "chalcites", "piquantness", 348 | "lampyrine", "Aplacentalia", 349 | "projecting", "elastivity", 350 | "isopelletierin", "bladderwort", 351 | "strander", "almud", 352 | "iniquitously", "theologal", 353 | "bugre", "chargeably", 354 | "imperceptivity", "meriquinoidal", 355 | "mesophyte", "divinator", 356 | "perfunctory", "counterappellant", 357 | "synovial", "charioteer", 358 | "crystallographical", "comprovincial", 359 | "infrastapedial", "pleasurehood", 360 | "inventurous", "ultrasystematic", 361 | "subangulated", "supraoesophageal", 362 | "Vaishnavism", "transude", 363 | "chrysochrous", "ungrave", 364 | "reconciliable", "uninterpleaded", 365 | "erlking", "wherefrom", 366 | "aprosopia", "antiadiaphorist", 367 | "metoxazine", "incalculable", 368 | "umbellic", "predebit", 369 | "foursquare", "unimmortal", 370 | "nonmanufacture", "slangy", 371 | "predisputant", "familist", 372 | "preaffiliate", "friarhood", 373 | "corelysis", "zoonitic", 374 | "halloo", "paunchy", 375 | "neuromimesis", "aconitine", 376 | "hackneyed", "unfeeble", 377 | "cubby", "autoschediastical", 378 | "naprapath", "lyrebird", 379 | "inexistency", "leucophoenicite", 380 | "ferrogoslarite", "reperuse", 381 | "uncombable", "tambo", 382 | "propodiale", "diplomatize", 383 | "Russifier", "clanned", 384 | "corona", 
"michigan", 385 | "nonutilitarian", "transcorporeal", 386 | "bought", "Cercosporella", 387 | "stapedius", "glandularly", 388 | "pictorially", "weism", 389 | "disilane", "rainproof", 390 | "Caphtor", "scrubbed", 391 | "oinomancy", "pseudoxanthine", 392 | "nonlustrous", "redesertion", 393 | "Oryzorictinae", "gala", 394 | "Mycogone", "reappreciate", 395 | "cyanoguanidine", "seeingness", 396 | "breadwinner", "noreast", 397 | "furacious", "epauliere", 398 | "omniscribent", "Passiflorales", 399 | "uninductive", "inductivity", 400 | "Orbitolina", "Semecarpus", 401 | "migrainoid", "steprelationship", 402 | "phlogisticate", "mesymnion", 403 | "sloped", "edificator", 404 | "beneficent", "culm", 405 | "paleornithology", "unurban", 406 | "throbless", "amplexifoliate", 407 | "sesquiquintile", "sapience", 408 | "astucious", "dithery", 409 | "boor", "ambitus", 410 | "scotching", "uloid", 411 | "uncompromisingness", "hoove", 412 | "waird", "marshiness", 413 | "Jerusalem", "mericarp", 414 | "unevoked", "benzoperoxide", 415 | "outguess", "pyxie", 416 | "hymnic", "euphemize", 417 | "mendacity", "erythremia", 418 | "rosaniline", "unchatteled", 419 | "lienteria", "Bushongo", 420 | "dialoguer", "unrepealably", 421 | "rivethead", "antideflation", 422 | "vinegarish", "manganosiderite", 423 | "doubtingness", "ovopyriform", 424 | "Cephalodiscus", "Muscicapa", 425 | "Animalivora", "angina", 426 | "planispheric", "ipomoein", 427 | "cuproiodargyrite", "sandbox", 428 | "scrat", "Munnopsidae", 429 | "shola", "pentafid", 430 | "overstudiousness", "times", 431 | "nonprofession", "appetible", 432 | "valvulotomy", "goladar", 433 | "uniarticular", "oxyterpene", 434 | "unlapsing", "omega", 435 | "trophonema", "seminonflammable", 436 | "circumzenithal", "starer", 437 | "depthwise", "liberatress", 438 | "unleavened", "unrevolting", 439 | "groundneedle", "topline", 440 | "wandoo", "umangite", 441 | "ordinant", "unachievable", 442 | "oversand", "snare", 443 | "avengeful", "unexplicit", 444 | "mustafina", "sonable", 445 | "rehabilitative", "eulogization", 446 | "papery", "technopsychology", 447 | "impressor", "cresylite", 448 | "entame", "transudatory", 449 | "scotale", "pachydermatoid", 450 | "imaginary", "yeat", 451 | "slipped", "stewardship", 452 | "adatom", "cockstone", 453 | "skyshine", "heavenful", 454 | "comparability", "exprobratory", 455 | "dermorhynchous", "parquet", 456 | "cretaceous", "vesperal", 457 | "raphis", "undangered", 458 | "Glecoma", "engrain", 459 | "counteractively", "Zuludom", 460 | "orchiocatabasis", "Auriculariales", 461 | "warriorwise", "extraorganismal", 462 | "overbuilt", "alveolite", 463 | "tetchy", "terrificness", 464 | "widdle", "unpremonished", 465 | "rebilling", "sequestrum", 466 | "equiconvex", "heliocentricism", 467 | "catabaptist", "okonite", 468 | "propheticism", "helminthagogic", 469 | "calycular", "giantly", 470 | "wingable", "golem", 471 | "unprovided", "commandingness", 472 | "greave", "haply", 473 | "doina", "depressingly", 474 | "subdentate", "impairment", 475 | "decidable", "neurotrophic", 476 | "unpredict", "bicorporeal", 477 | "pendulant", "flatman", 478 | "intrabred", "toplike", 479 | "Prosobranchiata", "farrantly", 480 | "toxoplasmosis", "gorilloid", 481 | "dipsomaniacal", "aquiline", 482 | "atlantite", "ascitic", 483 | "perculsive", "prospectiveness", 484 | "saponaceous", "centrifugalization", 485 | "dinical", "infravaginal", 486 | "beadroll", "affaite", 487 | "Helvidian", "tickleproof", 488 | "abstractionism", "enhedge", 489 | "outwealth", "overcontribute", 490 | "coldfinch", 
"gymnastic", 491 | "Pincian", "Munychian", 492 | "codisjunct", "quad", 493 | "coracomandibular", "phoenicochroite", 494 | "amender", "selectivity", 495 | "putative", "semantician", 496 | "lophotrichic", "Spatangoidea", 497 | "saccharogenic", "inferent", 498 | "Triconodonta", "arrendation", 499 | "sheepskin", "taurocolla", 500 | "bunghole", "Machiavel", 501 | "triakistetrahedral", "dehairer", 502 | "prezygapophysial", "cylindric", 503 | "pneumonalgia", "sleigher", 504 | "emir", "Socraticism", 505 | "licitness", "massedly", 506 | "instructiveness", "sturdied", 507 | "redecrease", "starosta", 508 | "evictor", "orgiastic", 509 | "squdge", "meloplasty", 510 | "Tsonecan", "repealableness", 511 | "swoony", "myesthesia", 512 | "molecule", "autobiographist", 513 | "reciprocation", "refective", 514 | "unobservantness", "tricae", 515 | "ungouged", "floatability", 516 | "Mesua", "fetlocked", 517 | "chordacentrum", "sedentariness", 518 | "various", "laubanite", 519 | "nectopod", "zenick", 520 | "sequentially", "analgic", 521 | "biodynamics", "posttraumatic", 522 | "nummi", "pyroacetic", 523 | "bot", "redescend", 524 | "dispermy", "undiffusive", 525 | "circular", "trillion", 526 | "Uraniidae", "ploration", 527 | "discipular", "potentness", 528 | "sud", "Hu", 529 | "Eryon", "plugger", 530 | "subdrainage", "jharal", 531 | "abscission", "supermarket", 532 | "countergabion", "glacierist", 533 | "lithotresis", "minniebush", 534 | "zanyism", "eucalypteol", 535 | "sterilely", "unrealize", 536 | "unpatched", "hypochondriacism", 537 | "critically", "cheesecutter") 538 | } -------------------------------------------------------------------------------- /src/local/examples/GroupByAction.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.RangePartitioner 5 | 6 | object GroupByAction { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "GroupByAction Test") 10 | 11 | val data = Array[(String, Int)](("A1", 1), ("A2", 2), 12 | ("B1", 6), ("A2", 4), 13 | ("B1", 3), ("B1", 5)) 14 | 15 | val pairs = sc.parallelize(data, 3) 16 | 17 | // output: 18 | // (A1,1) 19 | // (A2,2) 20 | // 21 | // (B1,6) 22 | // (A2,4) 23 | // 24 | // (B1,3) 25 | // (B1,5) 26 | pairs.foreach(println) 27 | 28 | val result1 = pairs.groupBy(K => K._1) 29 | val result2 = pairs.groupBy((K : (String, Int)) => K._1, 1) 30 | val result3 = pairs.groupBy((K : (String, Int)) => K._1, new RangePartitioner(3, pairs)) 31 | 32 | // output of result1: 33 | // (A1,ArrayBuffer((A1,1))) 34 | // 35 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5))) 36 | // (A2,ArrayBuffer((A2,2), (A2,4))) 37 | result1.foreach(println) 38 | 39 | // output of result2: 40 | // (A1,ArrayBuffer((A1,1))) 41 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5))) 42 | // (A2,ArrayBuffer((A2,2), (A2,4))) 43 | result2.foreach(println) 44 | 45 | // output of result3: 46 | // (A1,ArrayBuffer((A1,1))) 47 | // (A2,ArrayBuffer((A2,2), (A2,4))) 48 | // 49 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5))) 50 | result3.foreach(println) 51 | 52 | } 53 | 54 | } -------------------------------------------------------------------------------- /src/local/examples/GroupByKey.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object GroupByKey { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val sc = new 
SparkContext("local", "GroupByKey Test") 11 | val data = Array[(Int, Char)]((1, 'a'), (2, 'b'), 12 | (3, 'c'), (4, 'd'), 13 | (5, 'e'), (3, 'f'), 14 | (2, 'g'), (1, 'h') 15 | 16 | ) 17 | val pairs = sc.parallelize(data, 3) 18 | 19 | val result = pairs.groupByKey(2) 20 | 21 | // output: 22 | // (B,ArrayBuffer(2, 3)) 23 | // 24 | // (A,ArrayBuffer(1)) 25 | // (C,ArrayBuffer(4, 5, 6)) 26 | //result.foreach(println) 27 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 28 | println(result.toDebugString) 29 | } 30 | } -------------------------------------------------------------------------------- /src/local/examples/GroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package local.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object GroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[2]") 31 | var numMappers = 10 32 | var numKVPairs = 100 33 | var valSize = 100 34 | var numReducers = 3 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 41 | for (i <- 0 until numKVPairs) { 42 | val byteArr = new Array[Byte](valSize) 43 | ranGen.nextBytes(byteArr) 44 | arr1(i) = (ranGen.nextInt(10), byteArr) 45 | } 46 | arr1 47 | }.cache 48 | // Enforce that everything has been calculated and in cache 49 | pairs1.count 50 | 51 | val result = pairs1.groupByKey(numReducers) 52 | println(result.count) 53 | println(result.toDebugString) 54 | 55 | sc.stop() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/local/examples/GroupWith.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object GroupWith { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local[2]", "GroupWith Test") 10 | 11 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2), 12 | ("B1", 3), ("B2", 4), 13 | ("C1", 5), ("C1", 6) 14 | ) 15 | 16 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8), 17 | ("B1", 9), ("C1", 0) 18 | ) 19 | val pairs1 = sc.parallelize(data1, 3) 20 | val pairs2 = sc.parallelize(data2, 2) 21 | 22 | val result = 
pairs1.groupWith(pairs2) 23 | result.foreach(println) 24 | 25 | // output: 26 | // (B1,(ArrayBuffer(3),ArrayBuffer(9))) 27 | // (A1,(ArrayBuffer(1),ArrayBuffer(7))) 28 | // (A2,(ArrayBuffer(2),ArrayBuffer(8))) 29 | // 30 | // (C1,(ArrayBuffer(5, 6),ArrayBuffer(0))) 31 | // (B2,(ArrayBuffer(4),ArrayBuffer())) 32 | 33 | 34 | } 35 | } -------------------------------------------------------------------------------- /src/local/examples/JoinAction.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object JoinAction { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local[2]", "JoinAction Test") 10 | 11 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2), 12 | ("B1", 3), ("B2", 4), 13 | ("C1", 5), ("C1", 6) 14 | ) 15 | 16 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8), 17 | ("B1", 9), ("C1", 0) 18 | ) 19 | val pairs1 = sc.parallelize(data1, 3) 20 | val pairs2 = sc.parallelize(data2, 2) 21 | 22 | 23 | val result = pairs1.join(pairs2) 24 | 25 | // output: 26 | // (A1,(1,7)) 27 | // (B1,(3,9)) 28 | // (A2,(2,8)) 29 | // 30 | // (C1,(5,0)) 31 | // (C1,(6,0)) 32 | result.foreach(println) 33 | } 34 | 35 | } -------------------------------------------------------------------------------- /src/local/examples/LocalWordCount.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object LocalWordCount { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local[4]", "LocalWordCount") 10 | val myFile = sc.textFile("/Users/xulijie/Documents/data/RandomText/randomText-10MB.txt") 11 | /* 12 | val counts = myFile.map( l => l.split(" ")(2) ) 13 | .map( url => (url, 1) ) 14 | .reduceByKey( _+_ ) 15 | .map{ case(url, count) => (count, url) } 16 | .sortByKey( ascending=false ) 17 | .map{ case(count, url) => (url, count) } 18 | 19 | */ 20 | val wordAndCount = myFile.flatMap(s => s.split(" ")) 21 | .map(w => (w, 1)) 22 | 23 | val result = wordAndCount.reduceByKey(_ + _) 24 | result.foreach(println) 25 | 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /src/local/examples/LookUpTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | 6 | object LookUpTest { 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "LookUp Test") 10 | 11 | val data = Array[(String, Int)](("A", 1), ("B", 2), 12 | ("B", 3), ("C", 4), 13 | ("C", 5), ("C", 6)) 14 | 15 | val pairs = sc.parallelize(data, 3) 16 | 17 | val finalRDD = pairs.lookup("B") 18 | 19 | finalRDD.foreach(println) 20 | // output: 21 | // 2 22 | // 3 23 | } 24 | } -------------------------------------------------------------------------------- /src/local/examples/MapPartitionsRDDTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object MapPartitionsRDDTest { 6 | 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "MapPartitionsRDD Test") 9 | val data = Array[(String, Int)](("A1", 1), ("A2", 2), 10 | ("B1", 1), ("B2", 4), 11 | ("C1", 3), ("C2", 4) 
12 | ) 13 | val pairs = sc.parallelize(data, 3) 14 | 15 | val finalRDD = pairs.mapPartitions(iter => iter.filter(_._2 >= 2)) 16 | // val finalRDD2 = pairs.mapPartitionsWithIndex(f, preservesPartitioning) 17 | 18 | finalRDD.toArray().foreach(println) 19 | 20 | } 21 | } -------------------------------------------------------------------------------- /src/local/examples/MapValuesTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object MapValuesTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 9 | val data1 = Array[(String, Int)](("K", 1), ("T", 2), 10 | ("T", 3), ("W", 4), 11 | ("W", 5), ("W", 6) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true)) 17 | //val result = pairs.mapValues(V => 10 * V) 18 | //result.foreach(println) 19 | } 20 | } -------------------------------------------------------------------------------- /src/local/examples/PipedRDDTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object PipedRDDTest { 6 | 7 | def main(args: Array[String]) { 8 | val sc = new SparkContext("local", "Cartesian Test") 9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 10 | ("U1", 3), ("U2", 4), 11 | ("W1", 3), ("W2", 4) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | 15 | val finalRDD = pairs.pipe("grep 2") 16 | 17 | finalRDD.foreach(println) 18 | 19 | } 20 | } -------------------------------------------------------------------------------- /src/local/examples/ReduceByKeyActionTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object ReduceByKeyActionTest { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 10 | val data1 = Array[(String, Int)](("K", 1), ("U", 2), 11 | ("U", 3), ("W", 4), 12 | ("W", 5), ("W", 6)) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | //val result = pairs.reduceByKey(_ + _, 2) 17 | //result.foreach(println) 18 | } 19 | 20 | } -------------------------------------------------------------------------------- /src/local/examples/ReduceByKeyToDriverTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object ReduceByKeyToDriverTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local[3]", "ReduceByKeyToDriver Test") 9 | val data1 = Array[(String, Int)](("K", 1), ("U", 2), 10 | ("U", 3), ("W", 4), 11 | ("W", 5), ("W", 6) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | //val result = pairs.reduceByKeyToDriver(_ + _) 17 | //result.foreach(println) 18 | } 19 | } 
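The reduceByKey, fold and mapValues calls in the three tests above (MapValuesTest, ReduceByKeyActionTest, ReduceByKeyToDriverTest) are commented out, so those programs run without printing anything. The short sketch below is not part of the repository: the object name ReduceByKeySketch is invented, and the expected output is an assumption. It simply uncomments the reduceByKey and mapValues steps on the same data shape, using the Spark 1.x style API (new SparkContext(master, appName) plus the SparkContext._ pair-RDD implicits) that the surrounding examples already rely on.

package local.examples

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._   // pair-RDD implicits, as in the other examples

// Hypothetical, uncommented variant of the reduceByKey tests above; not part of the repository.
object ReduceByKeySketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local[3]", "ReduceByKey Sketch")

    // Same key/value shape the surrounding tests use, spread over 3 partitions.
    val data = Array[(String, Int)](("K", 1), ("U", 2),
                                    ("U", 3), ("W", 4),
                                    ("W", 5), ("W", 6))
    val pairs = sc.parallelize(data, 3)

    // Sum the values per key into 2 result partitions.
    val summed = pairs.reduceByKey(_ + _, 2)

    // Scale each value without touching the keys (keeps the hash partitioning).
    val scaled = summed.mapValues(v => 10 * v)

    // Expected pairs (ordering may vary): (K,10), (U,50), (W,150)
    scaled.foreach(println)

    sc.stop()
  }
}

Running it with a local master prints one summed, scaled pair per key; keeping the aggregation in reduceByKey rather than a driver-side fold is what lets the work stay distributed across the two reduce partitions.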
-------------------------------------------------------------------------------- /src/local/examples/SparkLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package local.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | 28 | /** 29 | * Logistic regression based classification. 30 | * Usage: SparkLR [slices] 31 | */ 32 | object SparkLR { 33 | val N = 10000 // Number of data points 34 | val D = 10 // Numer of dimensions 35 | val R = 0.7 // Scaling factor 36 | val ITERATIONS = 5 37 | val rand = new Random(42) 38 | 39 | case class DataPoint(x: Vector[Double], y: Double) 40 | 41 | def generateData = { 42 | def generatePoint(i: Int) = { 43 | val y = if(i % 2 == 0) -1 else 1 44 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 45 | println(x.toString() + " " + y) 46 | DataPoint(x, y) 47 | 48 | } 49 | Array.tabulate(N)(generatePoint) 50 | } 51 | 52 | def main(args: Array[String]) { 53 | val sparkConf = new SparkConf().setAppName("SparkLR") 54 | val sc = new SparkContext(sparkConf) 55 | val numSlices = if (args.length > 0) args(0).toInt else 2 56 | val points = sc.parallelize(generateData, numSlices).cache() 57 | 58 | // Initialize w to a random value 59 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 60 | println("Initial w: " + w) 61 | 62 | for (i <- 1 to ITERATIONS) { 63 | println("On iteration " + i) 64 | val gradient = points.map { p => 65 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 66 | }.reduce(_ + _) 67 | w -= gradient 68 | } 69 | 70 | println("Final w: " + w) 71 | sc.stop() 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/local/examples/TakeActionTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object TakeActionTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "TakeAction Test") 9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 10 | ("U1", 3), ("U2", 4), 11 | ("W1", 3), ("W2", 4) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | val result = pairs.take(5) 17 | result.foreach(println) 18 | } 19 | } -------------------------------------------------------------------------------- /src/local/examples/UnionTest.scala: 
-------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object UnionTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 9 | 10 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 11 | ("U1", 3), ("U2", 4), 12 | ("W1", 5), ("W1", 6) 13 | ) 14 | 15 | val data2 = Array[(String, Int)](("K1", 7), ("K2", 8), 16 | ("U1", 9), ("W1", 0) 17 | ) 18 | val pairs1 = sc.parallelize(data1, 3) 19 | val pairs2 = sc.parallelize(data2, 2) 20 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 21 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 22 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true)) 23 | val result = pairs1.union(pairs2) 24 | result.foreach(println) 25 | //result.saveAsTextFile("E:\\Spark\\output\\join") 26 | } 27 | } -------------------------------------------------------------------------------- /src/local/examples/partitionByTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object partitionByTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 9 | val data1 = Array[(String, Int)](("K", 1), ("T", 2), 10 | ("T", 3), ("W", 4), 11 | ("W", 5), ("W", 6) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true)) 17 | //val result = pairs.partitionBy(new HashPartitioner(2)) 18 | //result.foreach(println) 19 | } 20 | } -------------------------------------------------------------------------------- /src/local/examples/reduceActionTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object reduceActionTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "MapPartitionsRDD Test") 9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 10 | ("U1", 3), ("U2", 4), 11 | ("W1", 3), ("W2", 4) 12 | ) 13 | val pairs = sc.parallelize(data1, 3) 14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 15 | val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 16 | println(result) 17 | } 18 | 19 | } -------------------------------------------------------------------------------- /src/local/examples/sortByKeyTest.scala: -------------------------------------------------------------------------------- 1 | package local.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object sortByKeyTest { 6 | def main(args: Array[String]) { 7 | 8 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test") 9 | 10 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2), 11 | ("U1", 3), ("U2", 4), 12 | ("W1", 5), ("W1", 6) 13 | ) 14 | val pairs1 = sc.parallelize(data1, 3) 15 | 16 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 17 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2)) 18 | 19 | //val result = pairs1.sortByKey() 20 | //result.foreach(println) 21 | //result.saveAsTextFile("E:\\Spark\\output\\sortByKey") 22 
| } 23 | 24 | } -------------------------------------------------------------------------------- /src/org/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/.DS_Store -------------------------------------------------------------------------------- /src/org/apache/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/apache/.DS_Store -------------------------------------------------------------------------------- /src/org/apache/spark/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/apache/spark/.DS_Store -------------------------------------------------------------------------------- /src/org/apache/spark/examples/BroadcastTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | /** 23 | * Usage: BroadcastTest [slices] [numElem] [broadcastAlgo] [blockSize] 24 | */ 25 | object BroadcastTest { 26 | def main(args: Array[String]) { 27 | 28 | val bcName = if (args.length > 2) args(2) else "Http" 29 | val blockSize = if (args.length > 3) args(3) else "4096" 30 | 31 | System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." + bcName + 32 | "BroadcastFactory") 33 | System.setProperty("spark.broadcast.blockSize", blockSize) 34 | val sparkConf = new SparkConf().setAppName("Broadcast Test") 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val slices = if (args.length > 0) args(0).toInt else 2 39 | val num = if (args.length > 1) args(1).toInt else 1000000 40 | 41 | val arr1 = new Array[Int](num) 42 | for (i <- 0 until arr1.length) { 43 | arr1(i) = i 44 | } 45 | 46 | for (i <- 0 until 3) { 47 | println("Iteration " + i) 48 | println("===========") 49 | val startTime = System.nanoTime 50 | val barr1 = sc.broadcast(arr1) 51 | val observedSizes = sc.parallelize(1 to 10, slices).map(_ => barr1.value.size) 52 | // Collect the small RDD so we can print the observed sizes locally. 
53 | observedSizes.collect().foreach(i => println(i)) 54 | println("Iteration %d took %.0f milliseconds".format(i, (System.nanoTime - startTime) / 1E6)) 55 | } 56 | 57 | sc.stop() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/CassandraCQLTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.nio.ByteBuffer 21 | 22 | import scala.collection.JavaConversions._ 23 | import scala.collection.mutable.ListBuffer 24 | import scala.collection.immutable.Map 25 | 26 | import org.apache.cassandra.hadoop.ConfigHelper 27 | import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat 28 | import org.apache.cassandra.hadoop.cql3.CqlConfigHelper 29 | import org.apache.cassandra.hadoop.cql3.CqlOutputFormat 30 | import org.apache.cassandra.utils.ByteBufferUtil 31 | import org.apache.hadoop.mapreduce.Job 32 | 33 | import org.apache.spark.{SparkConf, SparkContext} 34 | import org.apache.spark.SparkContext._ 35 | 36 | /* 37 | Need to create following keyspace and column family in cassandra before running this example 38 | Start CQL shell using ./bin/cqlsh and execute following commands 39 | CREATE KEYSPACE retail WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}; 40 | use retail; 41 | CREATE TABLE salecount (prod_id text, sale_count int, PRIMARY KEY (prod_id)); 42 | CREATE TABLE ordercf (user_id text, 43 | time timestamp, 44 | prod_id text, 45 | quantity int, 46 | PRIMARY KEY (user_id, time)); 47 | INSERT INTO ordercf (user_id, 48 | time, 49 | prod_id, 50 | quantity) VALUES ('bob', 1385983646000, 'iphone', 1); 51 | INSERT INTO ordercf (user_id, 52 | time, 53 | prod_id, 54 | quantity) VALUES ('tom', 1385983647000, 'samsung', 4); 55 | INSERT INTO ordercf (user_id, 56 | time, 57 | prod_id, 58 | quantity) VALUES ('dora', 1385983648000, 'nokia', 2); 59 | INSERT INTO ordercf (user_id, 60 | time, 61 | prod_id, 62 | quantity) VALUES ('charlie', 1385983649000, 'iphone', 2); 63 | */ 64 | 65 | /** 66 | * This example demonstrates how to read and write to cassandra column family created using CQL3 67 | * using Spark. 
68 | * Parameters : 69 | * Usage: ./bin/spark-submit examples.jar \ 70 | * --class org.apache.spark.examples.CassandraCQLTest localhost 9160 71 | */ 72 | object CassandraCQLTest { 73 | 74 | def main(args: Array[String]) { 75 | val sparkConf = new SparkConf().setAppName("CQLTestApp") 76 | 77 | val sc = new SparkContext(sparkConf) 78 | val cHost: String = args(0) 79 | val cPort: String = args(1) 80 | val KeySpace = "retail" 81 | val InputColumnFamily = "ordercf" 82 | val OutputColumnFamily = "salecount" 83 | 84 | val job = new Job() 85 | job.setInputFormatClass(classOf[CqlPagingInputFormat]) 86 | ConfigHelper.setInputInitialAddress(job.getConfiguration(), cHost) 87 | ConfigHelper.setInputRpcPort(job.getConfiguration(), cPort) 88 | ConfigHelper.setInputColumnFamily(job.getConfiguration(), KeySpace, InputColumnFamily) 89 | ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner") 90 | CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3") 91 | 92 | /** CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "user_id='bob'") */ 93 | 94 | /** An UPDATE writes one or more columns to a record in a Cassandra column family */ 95 | val query = "UPDATE " + KeySpace + "." + OutputColumnFamily + " SET sale_count = ? " 96 | CqlConfigHelper.setOutputCql(job.getConfiguration(), query) 97 | 98 | job.setOutputFormatClass(classOf[CqlOutputFormat]) 99 | ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KeySpace, OutputColumnFamily) 100 | ConfigHelper.setOutputInitialAddress(job.getConfiguration(), cHost) 101 | ConfigHelper.setOutputRpcPort(job.getConfiguration(), cPort) 102 | ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner") 103 | 104 | val casRdd = sc.newAPIHadoopRDD(job.getConfiguration(), 105 | classOf[CqlPagingInputFormat], 106 | classOf[java.util.Map[String,ByteBuffer]], 107 | classOf[java.util.Map[String,ByteBuffer]]) 108 | 109 | println("Count: " + casRdd.count) 110 | val productSaleRDD = casRdd.map { 111 | case (key, value) => { 112 | (ByteBufferUtil.string(value.get("prod_id")), ByteBufferUtil.toInt(value.get("quantity"))) 113 | } 114 | } 115 | val aggregatedRDD = productSaleRDD.reduceByKey(_ + _) 116 | aggregatedRDD.collect().foreach { 117 | case (productId, saleCount) => println(productId + ":" + saleCount) 118 | } 119 | 120 | val casoutputCF = aggregatedRDD.map { 121 | case (productId, saleCount) => { 122 | val outColFamKey = Map("prod_id" -> ByteBufferUtil.bytes(productId)) 123 | val outKey: java.util.Map[String, ByteBuffer] = outColFamKey 124 | var outColFamVal = new ListBuffer[ByteBuffer] 125 | outColFamVal += ByteBufferUtil.bytes(saleCount) 126 | val outVal: java.util.List[ByteBuffer] = outColFamVal 127 | (outKey, outVal) 128 | } 129 | } 130 | 131 | casoutputCF.saveAsNewAPIHadoopFile( 132 | KeySpace, 133 | classOf[java.util.Map[String, ByteBuffer]], 134 | classOf[java.util.List[ByteBuffer]], 135 | classOf[CqlOutputFormat], 136 | job.getConfiguration() 137 | ) 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/CassandraTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.nio.ByteBuffer 21 | import java.util.SortedMap 22 | 23 | import scala.collection.JavaConversions._ 24 | 25 | import org.apache.cassandra.db.IColumn 26 | import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat 27 | import org.apache.cassandra.hadoop.ConfigHelper 28 | import org.apache.cassandra.hadoop.ColumnFamilyInputFormat 29 | import org.apache.cassandra.thrift._ 30 | import org.apache.cassandra.utils.ByteBufferUtil 31 | import org.apache.hadoop.mapreduce.Job 32 | 33 | import org.apache.spark.{SparkConf, SparkContext} 34 | import org.apache.spark.SparkContext._ 35 | 36 | /* 37 | * This example demonstrates using Spark with Cassandra with the New Hadoop API and Cassandra 38 | * support for Hadoop. 39 | * 40 | * To run this example, run this file with the following command params - 41 | * 42 | * 43 | * So if you want to run this on localhost this will be, 44 | * localhost 9160 45 | * 46 | * The example makes some assumptions: 47 | * 1. You have already created a keyspace called casDemo and it has a column family named Words 48 | * 2. There are column family has a column named "para" which has test content. 49 | * 50 | * You can create the content by running the following script at the bottom of this file with 51 | * cassandra-cli. 
52 | * 53 | */ 54 | object CassandraTest { 55 | 56 | def main(args: Array[String]) { 57 | val sparkConf = new SparkConf().setAppName("casDemo") 58 | // Get a SparkContext 59 | val sc = new SparkContext(sparkConf) 60 | 61 | // Build the job configuration with ConfigHelper provided by Cassandra 62 | val job = new Job() 63 | job.setInputFormatClass(classOf[ColumnFamilyInputFormat]) 64 | 65 | val host: String = args(1) 66 | val port: String = args(2) 67 | 68 | ConfigHelper.setInputInitialAddress(job.getConfiguration(), host) 69 | ConfigHelper.setInputRpcPort(job.getConfiguration(), port) 70 | ConfigHelper.setOutputInitialAddress(job.getConfiguration(), host) 71 | ConfigHelper.setOutputRpcPort(job.getConfiguration(), port) 72 | ConfigHelper.setInputColumnFamily(job.getConfiguration(), "casDemo", "Words") 73 | ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "casDemo", "WordCount") 74 | 75 | val predicate = new SlicePredicate() 76 | val sliceRange = new SliceRange() 77 | sliceRange.setStart(Array.empty[Byte]) 78 | sliceRange.setFinish(Array.empty[Byte]) 79 | predicate.setSlice_range(sliceRange) 80 | ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate) 81 | 82 | ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner") 83 | ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner") 84 | 85 | // Make a new Hadoop RDD 86 | val casRdd = sc.newAPIHadoopRDD( 87 | job.getConfiguration(), 88 | classOf[ColumnFamilyInputFormat], 89 | classOf[ByteBuffer], 90 | classOf[SortedMap[ByteBuffer, IColumn]]) 91 | 92 | // Let us first get all the paragraphs from the retrieved rows 93 | val paraRdd = casRdd.map { 94 | case (key, value) => { 95 | ByteBufferUtil.string(value.get(ByteBufferUtil.bytes("para")).value()) 96 | } 97 | } 98 | 99 | // Lets get the word count in paras 100 | val counts = paraRdd.flatMap(p => p.split(" ")).map(word => (word, 1)).reduceByKey(_ + _) 101 | 102 | counts.collect().foreach { 103 | case (word, count) => println(word + ":" + count) 104 | } 105 | 106 | counts.map { 107 | case (word, count) => { 108 | val colWord = new org.apache.cassandra.thrift.Column() 109 | colWord.setName(ByteBufferUtil.bytes("word")) 110 | colWord.setValue(ByteBufferUtil.bytes(word)) 111 | colWord.setTimestamp(System.currentTimeMillis) 112 | 113 | val colCount = new org.apache.cassandra.thrift.Column() 114 | colCount.setName(ByteBufferUtil.bytes("wcount")) 115 | colCount.setValue(ByteBufferUtil.bytes(count.toLong)) 116 | colCount.setTimestamp(System.currentTimeMillis) 117 | 118 | val outputkey = ByteBufferUtil.bytes(word + "-COUNT-" + System.currentTimeMillis) 119 | 120 | val mutations: java.util.List[Mutation] = new Mutation() :: new Mutation() :: Nil 121 | mutations.get(0).setColumn_or_supercolumn(new ColumnOrSuperColumn()) 122 | mutations.get(0).column_or_supercolumn.setColumn(colWord) 123 | mutations.get(1).setColumn_or_supercolumn(new ColumnOrSuperColumn()) 124 | mutations.get(1).column_or_supercolumn.setColumn(colCount) 125 | (outputkey, mutations) 126 | } 127 | }.saveAsNewAPIHadoopFile("casDemo", classOf[ByteBuffer], classOf[List[Mutation]], 128 | classOf[ColumnFamilyOutputFormat], job.getConfiguration) 129 | } 130 | } 131 | 132 | /* 133 | create keyspace casDemo; 134 | use casDemo; 135 | 136 | create column family WordCount with comparator = UTF8Type; 137 | update column family WordCount with column_metadata = 138 | [{column_name: word, validation_class: UTF8Type}, 139 | {column_name: wcount, validation_class: LongType}]; 140 | 141 | 
create column family Words with comparator = UTF8Type; 142 | update column family Words with column_metadata = 143 | [{column_name: book, validation_class: UTF8Type}, 144 | {column_name: para, validation_class: UTF8Type}]; 145 | 146 | assume Words keys as utf8; 147 | 148 | set Words['3musk001']['book'] = 'The Three Musketeers'; 149 | set Words['3musk001']['para'] = 'On the first Monday of the month of April, 1625, the market 150 | town of Meung, in which the author of ROMANCE OF THE ROSE was born, appeared to 151 | be in as perfect a state of revolution as if the Huguenots had just made 152 | a second La Rochelle of it. Many citizens, seeing the women flying 153 | toward the High Street, leaving their children crying at the open doors, 154 | hastened to don the cuirass, and supporting their somewhat uncertain 155 | courage with a musket or a partisan, directed their steps toward the 156 | hostelry of the Jolly Miller, before which was gathered, increasing 157 | every minute, a compact group, vociferous and full of curiosity.'; 158 | 159 | set Words['3musk002']['book'] = 'The Three Musketeers'; 160 | set Words['3musk002']['para'] = 'In those times panics were common, and few days passed without 161 | some city or other registering in its archives an event of this kind. There were 162 | nobles, who made war against each other; there was the king, who made 163 | war against the cardinal; there was Spain, which made war against the 164 | king. Then, in addition to these concealed or public, secret or open 165 | wars, there were robbers, mendicants, Huguenots, wolves, and scoundrels, 166 | who made war upon everybody. The citizens always took up arms readily 167 | against thieves, wolves or scoundrels, often against nobles or 168 | Huguenots, sometimes against the king, but never against cardinal or 169 | Spain. It resulted, then, from this habit that on the said first Monday 170 | of April, 1625, the citizens, on hearing the clamor, and seeing neither 171 | the red-and-yellow standard nor the livery of the Duc de Richelieu, 172 | rushed toward the hostel of the Jolly Miller. When arrived there, the 173 | cause of the hubbub was apparent to all'; 174 | 175 | set Words['3musk003']['book'] = 'The Three Musketeers'; 176 | set Words['3musk003']['para'] = 'You ought, I say, then, to husband the means you have, however 177 | large the sum may be; but you ought also to endeavor to perfect yourself in 178 | the exercises becoming a gentleman. I will write a letter today to the 179 | Director of the Royal Academy, and tomorrow he will admit you without 180 | any expense to yourself. Do not refuse this little service. Our 181 | best-born and richest gentlemen sometimes solicit it without being able 182 | to obtain it. You will learn horsemanship, swordsmanship in all its 183 | branches, and dancing. You will make some desirable acquaintances; and 184 | from time to time you can call upon me, just to tell me how you are 185 | getting on, and to say whether I can be of further service to you.'; 186 | 187 | 188 | set Words['thelostworld001']['book'] = 'The Lost World'; 189 | set Words['thelostworld001']['para'] = 'She sat with that proud, delicate profile of hers outlined 190 | against the red curtain. How beautiful she was! And yet how aloof! We had been 191 | friends, quite good friends; but never could I get beyond the same 192 | comradeship which I might have established with one of my 193 | fellow-reporters upon the Gazette,--perfectly frank, perfectly kindly, 194 | and perfectly unsexual. 
My instincts are all against a woman being too 195 | frank and at her ease with me. It is no compliment to a man. Where 196 | the real sex feeling begins, timidity and distrust are its companions, 197 | heritage from old wicked days when love and violence went often hand in 198 | hand. The bent head, the averted eye, the faltering voice, the wincing 199 | figure--these, and not the unshrinking gaze and frank reply, are the 200 | true signals of passion. Even in my short life I had learned as much 201 | as that--or had inherited it in that race memory which we call instinct.'; 202 | 203 | set Words['thelostworld002']['book'] = 'The Lost World'; 204 | set Words['thelostworld002']['para'] = 'I always liked McArdle, the crabbed, old, round-backed, 205 | red-headed news editor, and I rather hoped that he liked me. Of course, Beaumont was 206 | the real boss; but he lived in the rarefied atmosphere of some Olympian 207 | height from which he could distinguish nothing smaller than an 208 | international crisis or a split in the Cabinet. Sometimes we saw him 209 | passing in lonely majesty to his inner sanctum, with his eyes staring 210 | vaguely and his mind hovering over the Balkans or the Persian Gulf. He 211 | was above and beyond us. But McArdle was his first lieutenant, and it 212 | was he that we knew. The old man nodded as I entered the room, and he 213 | pushed his spectacles far up on his bald forehead.'; 214 | 215 | */ 216 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/DriverSubmissionTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.collection.JavaConversions._ 21 | 22 | /** Prints out environmental information, sleeps, and then exits. Made to 23 | * test driver submission in the standalone scheduler. 
*/ 24 | object DriverSubmissionTest { 25 | def main(args: Array[String]) { 26 | if (args.size < 1) { 27 | println("Usage: DriverSubmissionTest ") 28 | System.exit(0) 29 | } 30 | val numSecondsToSleep = args(0).toInt 31 | 32 | val env = System.getenv() 33 | val properties = System.getProperties() 34 | 35 | println("Environment variables containing SPARK_TEST:") 36 | env.filter{case (k, v) => k.contains("SPARK_TEST")}.foreach(println) 37 | 38 | println("System properties containing spark.test:") 39 | properties.filter{case (k, v) => k.toString.contains("spark.test")}.foreach(println) 40 | 41 | for (i <- 1 until numSecondsToSleep) { 42 | println(s"Alive for $i out of $numSecondsToSleep seconds") 43 | Thread.sleep(1000) 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/ExceptionHandlingTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | 22 | object ExceptionHandlingTest { 23 | def main(args: Array[String]) { 24 | val sparkConf = new SparkConf().setAppName("ExceptionHandlingTest") 25 | val sc = new SparkContext(sparkConf) 26 | sc.parallelize(0 until sc.defaultParallelism).foreach { i => 27 | if (math.random > 0.75) { 28 | throw new Exception("Testing exception handling") 29 | } 30 | } 31 | 32 | sc.stop() 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/GroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object GroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test") 31 | var numMappers = if (args.length > 0) args(0).toInt else 2 32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 33 | var valSize = if (args.length > 2) args(2).toInt else 1000 34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 41 | for (i <- 0 until numKVPairs) { 42 | val byteArr = new Array[Byte](valSize) 43 | ranGen.nextBytes(byteArr) 44 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) 45 | } 46 | arr1 47 | }.cache 48 | // Enforce that everything has been calculated and in cache 49 | pairs1.count 50 | 51 | println(pairs1.groupByKey(numReducers).count) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/HBaseTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.hadoop.hbase.client.HBaseAdmin 21 | import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor} 22 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 23 | 24 | import org.apache.spark._ 25 | import org.apache.spark.rdd.NewHadoopRDD 26 | 27 | object HBaseTest { 28 | def main(args: Array[String]) { 29 | val sparkConf = new SparkConf().setAppName("HBaseTest") 30 | val sc = new SparkContext(sparkConf) 31 | val conf = HBaseConfiguration.create() 32 | // Other options for configuring scan behavior are available. 
More information available at 33 | // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html 34 | conf.set(TableInputFormat.INPUT_TABLE, args(1)) 35 | 36 | // Initialize hBase table if necessary 37 | val admin = new HBaseAdmin(conf) 38 | if(!admin.isTableAvailable(args(1))) { 39 | val tableDesc = new HTableDescriptor(args(1)) 40 | admin.createTable(tableDesc) 41 | } 42 | 43 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], 44 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 45 | classOf[org.apache.hadoop.hbase.client.Result]) 46 | 47 | hBaseRDD.count() 48 | 49 | sc.stop() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/HdfsTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark._ 21 | 22 | object HdfsTest { 23 | def main(args: Array[String]) { 24 | val sparkConf = new SparkConf().setAppName("HdfsTest") 25 | val sc = new SparkContext(sparkConf) 26 | val file = sc.textFile(args(1)) 27 | val mapped = file.map(s => s.length).cache() 28 | for (iter <- 1 to 10) { 29 | val start = System.currentTimeMillis() 30 | for (x <- mapped) { x + 2 } 31 | // println("Processing: " + x) 32 | val end = System.currentTimeMillis() 33 | println("Iteration " + iter + " took " + (end-start) + " ms") 34 | } 35 | sc.stop() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalALS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.sqrt 21 | 22 | import cern.colt.matrix._ 23 | import cern.colt.matrix.linalg._ 24 | import cern.jet.math._ 25 | 26 | /** 27 | * Alternating least squares matrix factorization. 28 | */ 29 | object LocalALS { 30 | // Parameters set through command line arguments 31 | var M = 0 // Number of movies 32 | var U = 0 // Number of users 33 | var F = 0 // Number of features 34 | var ITERATIONS = 0 35 | 36 | val LAMBDA = 0.01 // Regularization coefficient 37 | 38 | // Some COLT objects 39 | val factory2D = DoubleFactory2D.dense 40 | val factory1D = DoubleFactory1D.dense 41 | val algebra = Algebra.DEFAULT 42 | val blas = SeqBlas.seqBlas 43 | 44 | def generateR(): DoubleMatrix2D = { 45 | val mh = factory2D.random(M, F) 46 | val uh = factory2D.random(U, F) 47 | algebra.mult(mh, algebra.transpose(uh)) 48 | } 49 | 50 | def rmse(targetR: DoubleMatrix2D, ms: Array[DoubleMatrix1D], 51 | us: Array[DoubleMatrix1D]): Double = 52 | { 53 | val r = factory2D.make(M, U) 54 | for (i <- 0 until M; j <- 0 until U) { 55 | r.set(i, j, blas.ddot(ms(i), us(j))) 56 | } 57 | blas.daxpy(-1, targetR, r) 58 | val sumSqs = r.aggregate(Functions.plus, Functions.square) 59 | sqrt(sumSqs / (M * U)) 60 | } 61 | 62 | def updateMovie(i: Int, m: DoubleMatrix1D, us: Array[DoubleMatrix1D], 63 | R: DoubleMatrix2D) : DoubleMatrix1D = 64 | { 65 | val XtX = factory2D.make(F, F) 66 | val Xty = factory1D.make(F) 67 | // For each user that rated the movie 68 | for (j <- 0 until U) { 69 | val u = us(j) 70 | // Add u * u^t to XtX 71 | blas.dger(1, u, u, XtX) 72 | // Add u * rating to Xty 73 | blas.daxpy(R.get(i, j), u, Xty) 74 | } 75 | // Add regularization coefs to diagonal terms 76 | for (d <- 0 until F) { 77 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * U) 78 | } 79 | // Solve it with Cholesky 80 | val ch = new CholeskyDecomposition(XtX) 81 | val Xty2D = factory2D.make(Xty.toArray, F) 82 | val solved2D = ch.solve(Xty2D) 83 | solved2D.viewColumn(0) 84 | } 85 | 86 | def updateUser(j: Int, u: DoubleMatrix1D, ms: Array[DoubleMatrix1D], 87 | R: DoubleMatrix2D) : DoubleMatrix1D = 88 | { 89 | val XtX = factory2D.make(F, F) 90 | val Xty = factory1D.make(F) 91 | // For each movie that the user rated 92 | for (i <- 0 until M) { 93 | val m = ms(i) 94 | // Add m * m^t to XtX 95 | blas.dger(1, m, m, XtX) 96 | // Add m * rating to Xty 97 | blas.daxpy(R.get(i, j), m, Xty) 98 | } 99 | // Add regularization coefs to diagonal terms 100 | for (d <- 0 until F) { 101 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * M) 102 | } 103 | // Solve it with Cholesky 104 | val ch = new CholeskyDecomposition(XtX) 105 | val Xty2D = factory2D.make(Xty.toArray, F) 106 | val solved2D = ch.solve(Xty2D) 107 | solved2D.viewColumn(0) 108 | } 109 | 110 | def main(args: Array[String]) { 111 | args match { 112 | case Array(m, u, f, iters) => { 113 | M = m.toInt 114 | U = u.toInt 115 | F = f.toInt 116 | ITERATIONS = iters.toInt 117 | } 118 | case _ => { 119 | System.err.println("Usage: LocalALS ") 120 | System.exit(1) 121 | } 122 | } 123 | printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS) 124 | 125 | val R = generateR() 126 | 127 | // Initialize m and u randomly 128 | var ms = Array.fill(M)(factory1D.random(F)) 129 | var us = Array.fill(U)(factory1D.random(F)) 130 | 131 | // Iteratively update movies then users 132 | for (iter <- 1 to ITERATIONS) { 133 | println("Iteration " + iter + ":") 134 | ms = (0 until M).map(i => updateMovie(i, ms(i), us, R)).toArray 135 | us = (0 until U).map(j => 
updateUser(j, us(j), ms, R)).toArray 136 | println("RMSE = " + rmse(R, ms, us)) 137 | println() 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalFileLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector} 23 | 24 | object LocalFileLR { 25 | val D = 10 // Numer of dimensions 26 | val rand = new Random(42) 27 | 28 | case class DataPoint(x: Vector[Double], y: Double) 29 | 30 | def parsePoint(line: String): DataPoint = { 31 | val nums = line.split(' ').map(_.toDouble) 32 | DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) 33 | } 34 | 35 | def main(args: Array[String]) { 36 | val lines = scala.io.Source.fromFile(args(0)).getLines().toArray 37 | val points = lines.map(parsePoint _) 38 | val ITERATIONS = args(1).toInt 39 | 40 | // Initialize w to a random value 41 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 42 | println("Initial w: " + w) 43 | 44 | for (i <- 1 to ITERATIONS) { 45 | println("On iteration " + i) 46 | var gradient = DenseVector.zeros[Double](D) 47 | for (p <- points) { 48 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y 49 | gradient += p.x * scale 50 | } 51 | w -= gradient 52 | } 53 | 54 | println("Final w: " + w) 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalKMeans.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.collection.mutable.HashMap 23 | import scala.collection.mutable.HashSet 24 | 25 | import breeze.linalg.{Vector, DenseVector, squaredDistance} 26 | 27 | import org.apache.spark.SparkContext._ 28 | 29 | /** 30 | * K-means clustering. 31 | */ 32 | object LocalKMeans { 33 | val N = 1000 34 | val R = 1000 // Scaling factor 35 | val D = 10 36 | val K = 10 37 | val convergeDist = 0.001 38 | val rand = new Random(42) 39 | 40 | def generateData = { 41 | def generatePoint(i: Int) = { 42 | DenseVector.fill(D){rand.nextDouble * R} 43 | } 44 | Array.tabulate(N)(generatePoint) 45 | } 46 | 47 | def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { 48 | var index = 0 49 | var bestIndex = 0 50 | var closest = Double.PositiveInfinity 51 | 52 | for (i <- 1 to centers.size) { 53 | val vCurr = centers.get(i).get 54 | val tempDist = squaredDistance(p, vCurr) 55 | if (tempDist < closest) { 56 | closest = tempDist 57 | bestIndex = i 58 | } 59 | } 60 | 61 | bestIndex 62 | } 63 | 64 | def main(args: Array[String]) { 65 | val data = generateData 66 | var points = new HashSet[Vector[Double]] 67 | var kPoints = new HashMap[Int, Vector[Double]] 68 | var tempDist = 1.0 69 | 70 | while (points.size < K) { 71 | points.add(data(rand.nextInt(N))) 72 | } 73 | 74 | val iter = points.iterator 75 | for (i <- 1 to points.size) { 76 | kPoints.put(i, iter.next()) 77 | } 78 | 79 | println("Initial centers: " + kPoints) 80 | 81 | while(tempDist > convergeDist) { 82 | var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) 83 | 84 | var mappings = closest.groupBy[Int] (x => x._1) 85 | 86 | var pointStats = mappings.map { pair => 87 | pair._2.reduceLeft [(Int, (Vector[Double], Int))] { 88 | case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) 89 | } 90 | } 91 | 92 | var newPoints = pointStats.map {mapping => 93 | (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} 94 | 95 | tempDist = 0.0 96 | for (mapping <- newPoints) { 97 | tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) 98 | } 99 | 100 | for (newP <- newPoints) { 101 | kPoints.put(newP._1, newP._2) 102 | } 103 | } 104 | 105 | println("Final centers: " + kPoints) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector} 23 | 24 | /** 25 | * Logistic regression based classification. 26 | */ 27 | object LocalLR { 28 | val N = 10000 // Number of data points 29 | val D = 10 // Number of dimensions 30 | val R = 0.7 // Scaling factor 31 | val ITERATIONS = 5 32 | val rand = new Random(42) 33 | 34 | case class DataPoint(x: Vector[Double], y: Double) 35 | 36 | def generateData = { 37 | def generatePoint(i: Int) = { 38 | val y = if(i % 2 == 0) -1 else 1 39 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 40 | DataPoint(x, y) 41 | } 42 | Array.tabulate(N)(generatePoint) 43 | } 44 | 45 | def main(args: Array[String]) { 46 | val data = generateData 47 | 48 | // Initialize w to a random value 49 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 50 | println("Initial w: " + w) 51 | 52 | for (i <- 1 to ITERATIONS) { 53 | println("On iteration " + i) 54 | var gradient = DenseVector.zeros[Double](D) 55 | for (p <- data) { 56 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y 57 | gradient += p.x * scale 58 | } 59 | w -= gradient 60 | } 61 | 62 | println("Final w: " + w) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LocalPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.SparkContext._ 24 | 25 | object LocalPi { 26 | def main(args: Array[String]) { 27 | var count = 0 28 | for (i <- 1 to 100000) { 29 | val x = random * 2 - 1 30 | val y = random * 2 - 1 31 | if (x*x + y*y < 1) count += 1 32 | } 33 | println("Pi is roughly " + 4 * count / 100000.0) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/LogQuery.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.SparkContext._ 22 | 23 | /** 24 | * Executes a roll up-style query against Apache logs. 25 | * 26 | * Usage: LogQuery [logFile] 27 | */ 28 | object LogQuery { 29 | val exampleApacheLogs = List( 30 | """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://images.com/2013/Generic.jpg 31 | | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 32 | | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR 33 | | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR 34 | | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.350 "-" - "" 265 923 934 "" 35 | | 62.24.11.25 images.com 1358492167 - Whatup""".stripMargin.lines.mkString, 36 | """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://images.com/2013/Generic.jpg 37 | | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 38 | | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR 39 | | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR 40 | | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.352 "-" - "" 256 977 988 "" 41 | | 0 73.23.2.15 images.com 1358492557 - Whatup""".stripMargin.lines.mkString 42 | ) 43 | 44 | def main(args: Array[String]) { 45 | 46 | val sparkConf = new SparkConf().setAppName("Log Query") 47 | val sc = new SparkContext(sparkConf) 48 | 49 | val dataSet = 50 | if (args.length == 1) sc.textFile(args(0)) else sc.parallelize(exampleApacheLogs) 51 | // scalastyle:off 52 | val apacheLogRegex = 53 | """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r 54 | // scalastyle:on 55 | /** Tracks the total query count and number of aggregate bytes for a particular group. 
*/ 56 | class Stats(val count: Int, val numBytes: Int) extends Serializable { 57 | def merge(other: Stats) = new Stats(count + other.count, numBytes + other.numBytes) 58 | override def toString = "bytes=%s\tn=%s".format(numBytes, count) 59 | } 60 | 61 | def extractKey(line: String): (String, String, String) = { 62 | apacheLogRegex.findFirstIn(line) match { 63 | case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => 64 | if (user != "\"-\"") (ip, user, query) 65 | else (null, null, null) 66 | case _ => (null, null, null) 67 | } 68 | } 69 | 70 | def extractStats(line: String): Stats = { 71 | apacheLogRegex.findFirstIn(line) match { 72 | case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) => 73 | new Stats(1, bytes.toInt) 74 | case _ => new Stats(1, 0) 75 | } 76 | } 77 | 78 | dataSet.map(line => (extractKey(line), extractStats(line))) 79 | .reduceByKey((a, b) => a.merge(b)) 80 | .collect().foreach{ 81 | case (user, query) => println("%s\t%s".format(user, query))} 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/MultiBroadcastTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.rdd.RDD 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | /** 24 | * Usage: MultiBroadcastTest [slices] [numElem] 25 | */ 26 | object MultiBroadcastTest { 27 | def main(args: Array[String]) { 28 | 29 | val sparkConf = new SparkConf().setAppName("Multi-Broadcast Test") 30 | val sc = new SparkContext(sparkConf) 31 | 32 | val slices = if (args.length > 0) args(0).toInt else 2 33 | val num = if (args.length > 1) args(1).toInt else 1000000 34 | 35 | val arr1 = new Array[Int](num) 36 | for (i <- 0 until arr1.length) { 37 | arr1(i) = i 38 | } 39 | 40 | val arr2 = new Array[Int](num) 41 | for (i <- 0 until arr2.length) { 42 | arr2(i) = i 43 | } 44 | 45 | val barr1 = sc.broadcast(arr1) 46 | val barr2 = sc.broadcast(arr2) 47 | val observedSizes: RDD[(Int, Int)] = sc.parallelize(1 to 10, slices).map { _ => 48 | (barr1.value.size, barr2.value.size) 49 | } 50 | // Collect the small RDD so we can print the observed sizes locally. 
51 | observedSizes.collect().foreach(i => println(i)) 52 | 53 | sc.stop() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SimpleSkewedGroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: SimpleSkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers] [ratio] 27 | */ 28 | object SimpleSkewedGroupByTest { 29 | def main(args: Array[String]) { 30 | 31 | val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest") 32 | var numMappers = if (args.length > 0) args(0).toInt else 2 33 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 34 | var valSize = if (args.length > 2) args(2).toInt else 1000 35 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 36 | var ratio = if (args.length > 4) args(4).toInt else 5.0 37 | 38 | val sc = new SparkContext(sparkConf) 39 | 40 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 41 | val ranGen = new Random 42 | var result = new Array[(Int, Array[Byte])](numKVPairs) 43 | for (i <- 0 until numKVPairs) { 44 | val byteArr = new Array[Byte](valSize) 45 | ranGen.nextBytes(byteArr) 46 | val offset = ranGen.nextInt(1000) * numReducers 47 | if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) { 48 | // give ratio times higher chance of generating key 0 (for reducer 0) 49 | result(i) = (offset, byteArr) 50 | } else { 51 | // generate a key for one of the other reducers 52 | val key = 1 + ranGen.nextInt(numReducers-1) + offset 53 | result(i) = (key, byteArr) 54 | } 55 | } 56 | result 57 | }.cache 58 | // Enforce that everything has been calculated and in cache 59 | pairs1.count 60 | 61 | println("RESULT: " + pairs1.groupByKey(numReducers).count) 62 | // Print how many keys each reducer got (for debugging) 63 | // println("RESULT: " + pairs1.groupByKey(numReducers) 64 | // .map{case (k,v) => (k, v.size)} 65 | // .collectAsMap) 66 | 67 | sc.stop() 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SkewedGroupByTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import org.apache.spark.{SparkConf, SparkContext} 23 | import org.apache.spark.SparkContext._ 24 | 25 | /** 26 | * Usage: GroupByTest [numMappers] [numKVPairs] [KeySize] [numReducers] 27 | */ 28 | object SkewedGroupByTest { 29 | def main(args: Array[String]) { 30 | val sparkConf = new SparkConf().setAppName("GroupBy Test") 31 | var numMappers = if (args.length > 0) args(0).toInt else 2 32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000 33 | var valSize = if (args.length > 2) args(2).toInt else 1000 34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers 35 | 36 | val sc = new SparkContext(sparkConf) 37 | 38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p => 39 | val ranGen = new Random 40 | 41 | // map output sizes lineraly increase from the 1st to the last 42 | numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt 43 | 44 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs) 45 | for (i <- 0 until numKVPairs) { 46 | val byteArr = new Array[Byte](valSize) 47 | ranGen.nextBytes(byteArr) 48 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr) 49 | } 50 | arr1 51 | }.cache() 52 | // Enforce that everything has been calculated and in cache 53 | pairs1.count() 54 | 55 | println(pairs1.groupByKey(numReducers).count()) 56 | 57 | sc.stop() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkALS.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.sqrt 21 | 22 | import cern.colt.matrix._ 23 | import cern.colt.matrix.linalg._ 24 | import cern.jet.math._ 25 | 26 | import org.apache.spark._ 27 | 28 | /** 29 | * Alternating least squares matrix factorization. 
30 | */ 31 | object SparkALS { 32 | // Parameters set through command line arguments 33 | var M = 0 // Number of movies 34 | var U = 0 // Number of users 35 | var F = 0 // Number of features 36 | var ITERATIONS = 0 37 | 38 | val LAMBDA = 0.01 // Regularization coefficient 39 | 40 | // Some COLT objects 41 | val factory2D = DoubleFactory2D.dense 42 | val factory1D = DoubleFactory1D.dense 43 | val algebra = Algebra.DEFAULT 44 | val blas = SeqBlas.seqBlas 45 | 46 | def generateR(): DoubleMatrix2D = { 47 | val mh = factory2D.random(M, F) 48 | val uh = factory2D.random(U, F) 49 | algebra.mult(mh, algebra.transpose(uh)) 50 | } 51 | 52 | def rmse(targetR: DoubleMatrix2D, ms: Array[DoubleMatrix1D], 53 | us: Array[DoubleMatrix1D]): Double = 54 | { 55 | val r = factory2D.make(M, U) 56 | for (i <- 0 until M; j <- 0 until U) { 57 | r.set(i, j, blas.ddot(ms(i), us(j))) 58 | } 59 | blas.daxpy(-1, targetR, r) 60 | val sumSqs = r.aggregate(Functions.plus, Functions.square) 61 | sqrt(sumSqs / (M * U)) 62 | } 63 | 64 | def update(i: Int, m: DoubleMatrix1D, us: Array[DoubleMatrix1D], 65 | R: DoubleMatrix2D) : DoubleMatrix1D = 66 | { 67 | val U = us.size 68 | val F = us(0).size 69 | val XtX = factory2D.make(F, F) 70 | val Xty = factory1D.make(F) 71 | // For each user that rated the movie 72 | for (j <- 0 until U) { 73 | val u = us(j) 74 | // Add u * u^t to XtX 75 | blas.dger(1, u, u, XtX) 76 | // Add u * rating to Xty 77 | blas.daxpy(R.get(i, j), u, Xty) 78 | } 79 | // Add regularization coefs to diagonal terms 80 | for (d <- 0 until F) { 81 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * U) 82 | } 83 | // Solve it with Cholesky 84 | val ch = new CholeskyDecomposition(XtX) 85 | val Xty2D = factory2D.make(Xty.toArray, F) 86 | val solved2D = ch.solve(Xty2D) 87 | solved2D.viewColumn(0) 88 | } 89 | 90 | def main(args: Array[String]) { 91 | var slices = 0 92 | 93 | val options = (0 to 4).map(i => if (i < args.length) Some(args(i)) else None) 94 | 95 | options.toArray match { 96 | case Array(m, u, f, iters, slices_) => 97 | M = m.getOrElse("100").toInt 98 | U = u.getOrElse("500").toInt 99 | F = f.getOrElse("10").toInt 100 | ITERATIONS = iters.getOrElse("5").toInt 101 | slices = slices_.getOrElse("2").toInt 102 | case _ => 103 | System.err.println("Usage: SparkALS [M] [U] [F] [iters] [slices]") 104 | System.exit(1) 105 | } 106 | printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS) 107 | val sparkConf = new SparkConf().setAppName("SparkALS") 108 | val sc = new SparkContext(sparkConf) 109 | 110 | val R = generateR() 111 | 112 | // Initialize m and u randomly 113 | var ms = Array.fill(M)(factory1D.random(F)) 114 | var us = Array.fill(U)(factory1D.random(F)) 115 | 116 | // Iteratively update movies then users 117 | val Rc = sc.broadcast(R) 118 | var msb = sc.broadcast(ms) 119 | var usb = sc.broadcast(us) 120 | for (iter <- 1 to ITERATIONS) { 121 | println("Iteration " + iter + ":") 122 | ms = sc.parallelize(0 until M, slices) 123 | .map(i => update(i, msb.value(i), usb.value, Rc.value)) 124 | .collect() 125 | msb = sc.broadcast(ms) // Re-broadcast ms because it was updated 126 | us = sc.parallelize(0 until U, slices) 127 | .map(i => update(i, usb.value(i), msb.value, algebra.transpose(Rc.value))) 128 | .collect() 129 | usb = sc.broadcast(us) // Re-broadcast us because it was updated 130 | println("RMSE = " + rmse(R, ms, us)) 131 | println() 132 | } 133 | 134 | sc.stop() 135 | } 136 | } 137 | -------------------------------------------------------------------------------- 
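LocalALS and SparkALS above reduce each factor update to the same regularized least-squares problem: accumulate XtX as the sum of u * u^T over the ratings (plus LAMBDA * n on the diagonal) and Xty as the sum of rating * u, then solve XtX * w = Xty with COLT's CholeskyDecomposition. The following is a minimal sketch of that assembly using plain Scala arrays; normalEquations is a hypothetical helper, not a file in this repository, and it stops short of the Cholesky solve that the COLT code performs.

    // Assemble (XtX, Xty) for one movie, given the user factors `us`
    // (each of length f) and that movie's ratings (one entry per user).
    def normalEquations(us: Array[Array[Double]], ratings: Array[Double],
                        f: Int, lambda: Double): (Array[Array[Double]], Array[Double]) = {
      val xtx = Array.fill(f, f)(0.0)   // running sum of u * u^T
      val xty = Array.fill(f)(0.0)      // running sum of rating * u
      for (j <- us.indices) {
        val u = us(j)
        for (a <- 0 until f; b <- 0 until f) xtx(a)(b) += u(a) * u(b)
        for (a <- 0 until f) xty(a) += ratings(j) * u(a)
      }
      // Ridge term: mirrors XtX.set(d, d, XtX.get(d, d) + LAMBDA * U) above.
      for (d <- 0 until f) xtx(d)(d) += lambda * us.length
      (xtx, xty)
    }

SparkALS distributes exactly this per-row work with sc.parallelize(0 until M, slices).map(...), broadcasting R, ms and us so each task reads them without re-shipping the full matrices on every iteration.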
/src/org/apache/spark/examples/SparkHdfsLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | import org.apache.spark.deploy.SparkHadoopUtil 28 | import org.apache.spark.scheduler.InputFormatInfo 29 | 30 | 31 | /** 32 | * Logistic regression based classification. 33 | */ 34 | object SparkHdfsLR { 35 | val D = 10 // Numer of dimensions 36 | val rand = new Random(42) 37 | 38 | case class DataPoint(x: Vector[Double], y: Double) 39 | 40 | def parsePoint(line: String): DataPoint = { 41 | val tok = new java.util.StringTokenizer(line, " ") 42 | var y = tok.nextToken.toDouble 43 | var x = new Array[Double](D) 44 | var i = 0 45 | while (i < D) { 46 | x(i) = tok.nextToken.toDouble; i += 1 47 | } 48 | DataPoint(new DenseVector(x), y) 49 | } 50 | 51 | def main(args: Array[String]) { 52 | if (args.length < 2) { 53 | System.err.println("Usage: SparkHdfsLR ") 54 | System.exit(1) 55 | } 56 | 57 | val sparkConf = new SparkConf().setAppName("SparkHdfsLR") 58 | val inputPath = args(0) 59 | val conf = SparkHadoopUtil.get.newConfiguration() 60 | val sc = new SparkContext(sparkConf, 61 | InputFormatInfo.computePreferredLocations( 62 | Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) 63 | )) 64 | val lines = sc.textFile(inputPath) 65 | val points = lines.map(parsePoint _).cache() 66 | val ITERATIONS = args(1).toInt 67 | 68 | // Initialize w to a random value 69 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 70 | println("Initial w: " + w) 71 | 72 | for (i <- 1 to ITERATIONS) { 73 | println("On iteration " + i) 74 | val gradient = points.map { p => 75 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 76 | }.reduce(_ + _) 77 | w -= gradient 78 | } 79 | 80 | println("Final w: " + w) 81 | sc.stop() 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkKMeans.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import breeze.linalg.{Vector, DenseVector, squaredDistance} 23 | 24 | import org.apache.spark.{SparkConf, SparkContext} 25 | import org.apache.spark.SparkContext._ 26 | 27 | /** 28 | * K-means clustering. 29 | */ 30 | object SparkKMeans { 31 | val R = 1000 // Scaling factor 32 | val rand = new Random(42) 33 | 34 | def parseVector(line: String): Vector[Double] = { 35 | DenseVector(line.split(' ').map(_.toDouble)) 36 | } 37 | 38 | def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { 39 | var index = 0 40 | var bestIndex = 0 41 | var closest = Double.PositiveInfinity 42 | 43 | for (i <- 0 until centers.length) { 44 | val tempDist = squaredDistance(p, centers(i)) 45 | if (tempDist < closest) { 46 | closest = tempDist 47 | bestIndex = i 48 | } 49 | } 50 | 51 | bestIndex 52 | } 53 | 54 | def main(args: Array[String]) { 55 | if (args.length < 3) { 56 | System.err.println("Usage: SparkKMeans ") 57 | System.exit(1) 58 | } 59 | val sparkConf = new SparkConf().setAppName("SparkKMeans") 60 | val sc = new SparkContext(sparkConf) 61 | val lines = sc.textFile(args(0)) 62 | val data = lines.map(parseVector _).cache() 63 | val K = args(1).toInt 64 | val convergeDist = args(2).toDouble 65 | 66 | val kPoints = data.takeSample(withReplacement = false, K, 42).toArray 67 | var tempDist = 1.0 68 | 69 | while(tempDist > convergeDist) { 70 | val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) 71 | 72 | val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)} 73 | 74 | val newPoints = pointStats.map {pair => 75 | (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() 76 | 77 | tempDist = 0.0 78 | for (i <- 0 until K) { 79 | tempDist += squaredDistance(kPoints(i), newPoints(i)) 80 | } 81 | 82 | for (newP <- newPoints) { 83 | kPoints(newP._1) = newP._2 84 | } 85 | println("Finished iteration (delta = " + tempDist + ")") 86 | } 87 | 88 | println("Final centers:") 89 | kPoints.foreach(println) 90 | sc.stop() 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | 28 | /** 29 | * Logistic regression based classification. 30 | * Usage: SparkLR [slices] 31 | */ 32 | object SparkLR { 33 | val N = 10000 // Number of data points 34 | val D = 10 // Numer of dimensions 35 | val R = 0.7 // Scaling factor 36 | val ITERATIONS = 5 37 | val rand = new Random(42) 38 | 39 | case class DataPoint(x: Vector[Double], y: Double) 40 | 41 | def generateData = { 42 | def generatePoint(i: Int) = { 43 | val y = if(i % 2 == 0) -1 else 1 44 | val x = DenseVector.fill(D){rand.nextGaussian + y * R} 45 | DataPoint(x, y) 46 | } 47 | Array.tabulate(N)(generatePoint) 48 | } 49 | 50 | def main(args: Array[String]) { 51 | val sparkConf = new SparkConf().setAppName("SparkLR") 52 | val sc = new SparkContext(sparkConf) 53 | val numSlices = if (args.length > 0) args(0).toInt else 2 54 | val points = sc.parallelize(generateData, numSlices).cache() 55 | 56 | // Initialize w to a random value 57 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 58 | println("Initial w: " + w) 59 | 60 | for (i <- 1 to ITERATIONS) { 61 | println("On iteration " + i) 62 | val gradient = points.map { p => 63 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 64 | }.reduce(_ + _) 65 | w -= gradient 66 | } 67 | 68 | println("Final w: " + w) 69 | sc.stop() 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkPageRank.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import org.apache.spark.SparkContext._ 21 | import org.apache.spark.{SparkConf, SparkContext} 22 | 23 | /** 24 | * Computes the PageRank of URLs from an input file. Input file should 25 | * be in format of: 26 | * URL neighbor URL 27 | * URL neighbor URL 28 | * URL neighbor URL 29 | * ... 30 | * where URL and their neighbors are separated by space(s). 
31 | */ 32 | object SparkPageRank { 33 | def main(args: Array[String]) { 34 | val sparkConf = new SparkConf().setAppName("PageRank") 35 | var iters = args(1).toInt 36 | val ctx = new SparkContext(sparkConf) 37 | val lines = ctx.textFile(args(0), 1) 38 | val links = lines.map{ s => 39 | val parts = s.split("\\s+") 40 | (parts(0), parts(1)) 41 | }.distinct().groupByKey().cache() 42 | var ranks = links.mapValues(v => 1.0) 43 | 44 | for (i <- 1 to iters) { 45 | val contribs = links.join(ranks).values.flatMap{ case (urls, rank) => 46 | val size = urls.size 47 | urls.map(url => (url, rank / size)) 48 | } 49 | ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _) 50 | } 51 | 52 | val output = ranks.collect() 53 | output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + ".")) 54 | 55 | ctx.stop() 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | 24 | /** Computes an approximation to pi */ 25 | object SparkPi { 26 | def main(args: Array[String]) { 27 | val conf = new SparkConf().setAppName("Spark Pi") 28 | val spark = new SparkContext(conf) 29 | val slices = if (args.length > 0) args(0).toInt else 2 30 | val n = 100000 * slices 31 | val count = spark.parallelize(1 to n, slices).map { i => 32 | val x = random * 2 - 1 33 | val y = random * 2 - 1 34 | if (x*x + y*y < 1) 1 else 0 35 | }.reduce(_ + _) 36 | println("Pi is roughly " + 4.0 * count / n) 37 | spark.stop() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkTC.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.util.Random 21 | import scala.collection.mutable 22 | 23 | import org.apache.spark.{SparkConf, SparkContext} 24 | import org.apache.spark.SparkContext._ 25 | 26 | /** 27 | * Transitive closure on a graph. 28 | */ 29 | object SparkTC { 30 | val numEdges = 200 31 | val numVertices = 100 32 | val rand = new Random(42) 33 | 34 | def generateGraph = { 35 | val edges: mutable.Set[(Int, Int)] = mutable.Set.empty 36 | while (edges.size < numEdges) { 37 | val from = rand.nextInt(numVertices) 38 | val to = rand.nextInt(numVertices) 39 | if (from != to) edges.+=((from, to)) 40 | } 41 | edges.toSeq 42 | } 43 | 44 | def main(args: Array[String]) { 45 | val sparkConf = new SparkConf().setAppName("SparkTC") 46 | val spark = new SparkContext(sparkConf) 47 | val slices = if (args.length > 0) args(0).toInt else 2 48 | var tc = spark.parallelize(generateGraph, slices).cache() 49 | 50 | // Linear transitive closure: each round grows paths by one edge, 51 | // by joining the graph's edges with the already-discovered paths. 52 | // e.g. join the path (y, z) from the TC with the edge (x, y) from 53 | // the graph to obtain the path (x, z). 54 | 55 | // Because join() joins on keys, the edges are stored in reversed order. 56 | val edges = tc.map(x => (x._2, x._1)) 57 | 58 | // This join is iterated until a fixed point is reached. 59 | var oldCount = 0L 60 | var nextCount = tc.count() 61 | do { 62 | oldCount = nextCount 63 | // Perform the join, obtaining an RDD of (y, (z, x)) pairs, 64 | // then project the result to obtain the new (x, z) paths. 65 | tc = tc.union(tc.join(edges).map(x => (x._2._2, x._2._1))).distinct().cache() 66 | nextCount = tc.count() 67 | } while (nextCount != oldCount) 68 | 69 | println("TC has " + tc.count() + " edges.") 70 | spark.stop() 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkTachyonHdfsLR.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import java.util.Random 21 | 22 | import scala.math.exp 23 | 24 | import breeze.linalg.{Vector, DenseVector} 25 | 26 | import org.apache.spark._ 27 | import org.apache.spark.deploy.SparkHadoopUtil 28 | import org.apache.spark.scheduler.InputFormatInfo 29 | import org.apache.spark.storage.StorageLevel 30 | 31 | 32 | /** 33 | * Logistic regression based classification. 34 | * This example uses Tachyon to persist rdds during computation. 
35 | */ 36 | object SparkTachyonHdfsLR { 37 | val D = 10 // Numer of dimensions 38 | val rand = new Random(42) 39 | 40 | case class DataPoint(x: Vector[Double], y: Double) 41 | 42 | def parsePoint(line: String): DataPoint = { 43 | val tok = new java.util.StringTokenizer(line, " ") 44 | var y = tok.nextToken.toDouble 45 | var x = new Array[Double](D) 46 | var i = 0 47 | while (i < D) { 48 | x(i) = tok.nextToken.toDouble; i += 1 49 | } 50 | DataPoint(new DenseVector(x), y) 51 | } 52 | 53 | def main(args: Array[String]) { 54 | val inputPath = args(0) 55 | val conf = SparkHadoopUtil.get.newConfiguration() 56 | val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") 57 | val sc = new SparkContext(sparkConf, 58 | InputFormatInfo.computePreferredLocations( 59 | Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) 60 | )) 61 | val lines = sc.textFile(inputPath) 62 | val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) 63 | val ITERATIONS = args(1).toInt 64 | 65 | // Initialize w to a random value 66 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1} 67 | println("Initial w: " + w) 68 | 69 | for (i <- 1 to ITERATIONS) { 70 | println("On iteration " + i) 71 | val gradient = points.map { p => 72 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y 73 | }.reduce(_ + _) 74 | w -= gradient 75 | } 76 | 77 | println("Final w: " + w) 78 | sc.stop() 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/org/apache/spark/examples/SparkTachyonPi.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.apache.spark.examples 19 | 20 | import scala.math.random 21 | 22 | import org.apache.spark._ 23 | import org.apache.spark.storage.StorageLevel 24 | 25 | /** 26 | * Computes an approximation to pi 27 | * This example uses Tachyon to persist rdds during computation. 
28 | */ 29 | object SparkTachyonPi { 30 | def main(args: Array[String]) { 31 | val sparkConf = new SparkConf().setAppName("SparkTachyonPi") 32 | val spark = new SparkContext(sparkConf) 33 | 34 | val slices = if (args.length > 0) args(0).toInt else 2 35 | val n = 100000 * slices 36 | 37 | val rdd = spark.parallelize(1 to n, slices) 38 | rdd.persist(StorageLevel.OFF_HEAP) 39 | val count = rdd.map { i => 40 | val x = random * 2 - 1 41 | val y = random * 2 - 1 42 | if (x * x + y * y < 1) 1 else 0 43 | }.reduce(_ + _) 44 | println("Pi is roughly " + 4.0 * count / n) 45 | 46 | spark.stop() 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/pretty/examples/Aggregate.scala: -------------------------------------------------------------------------------- 1 | package pretty.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Aggregate { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sc = new SparkContext("local", "Aggregate Test") 10 | val d = List("0", "1", "2", "3", "4", "5", "6", "7", "8", "9") 11 | 12 | val data = sc.parallelize(d, 2) 13 | 14 | val result = data.aggregate("a")((x,y) => "[" + x + "," + y + "]", 15 | (x,y) => x + y) 16 | 17 | println(result) 18 | // output: 19 | // a[[[[[a,0],1],2],3],4][[[[[a,5],6],7],8],9] 20 | } 21 | } -------------------------------------------------------------------------------- /src/pretty/examples/Coalesce.scala: -------------------------------------------------------------------------------- 1 | package pretty.examples 2 | 3 | import org.apache.spark.SparkContext 4 | 5 | object Coalesce { 6 | def main(args: Array[String]) { 7 | val sc = new SparkContext("local", "Coalesce Test") 8 | 9 | val data = sc.parallelize(1 to 20, 10) 10 | 11 | val result = data.coalesce(2) 12 | result.foreach(x => print(x + " ")) 13 | 14 | // equals "repartition(2)" 15 | val resultWithHashPartition = data.coalesce(2, true) 16 | resultWithHashPartition.foreach(x => print(x + " ")) 17 | } 18 | } -------------------------------------------------------------------------------- /src/pretty/examples/CogroupPair.scala: -------------------------------------------------------------------------------- 1 | package pretty.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object CogroupPair { 8 | def main(args: Array[String]) { 9 | val sc = new SparkContext("local", "Cogroup Test") 10 | 11 | val data1 = Array[(String, Int)](("A", 1), ("A", 2), 12 | ("B", 3), ("B", 4), 13 | ("C", 5), ("C", 6)) 14 | 15 | val data2 = Array[(String, Int)](("A", 7), ("A", 8), 16 | ("B", 9), ("C", 0)) 17 | 18 | val data3 = Array[(String, Int)](("A", 10), ("B", 11)) 19 | 20 | val pairs1 = sc.parallelize(data1, 3) 21 | val pairs2 = sc.parallelize(data2, 2) 22 | val pairs3 = sc.parallelize(data3, 3) 23 | 24 | val result1 = pairs1.cogroup(pairs2) 25 | result1.foreach(println) 26 | 27 | // val result2 = pairs1.cogroup(pairs2, pairs3) 28 | // result2.foreach(println) 29 | // 30 | // val result3 = pairs1.cogroup(pairs2, 1) 31 | // result3.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 32 | // 33 | // val result4 = pairs1.cogroup(pairs2, new RangePartitioner(2, pairs1)) 34 | // result4.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x)) 35 | } 36 | } -------------------------------------------------------------------------------- /src/pretty/examples/GroupByKeyPair.scala: 
-------------------------------------------------------------------------------- 1 | package pretty.examples 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.RangePartitioner 6 | 7 | object GroupByKeyPair { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sc = new SparkContext("local", "GroupByKeyPair Test") 12 | val d = sc.parallelize(1 to 100, 10) 13 | 14 | val pairs = d.keyBy(x => x % 10) 15 | 16 | val result1 = pairs.groupByKey() 17 | val result2 = pairs.groupByKey(3) 18 | val result3 = pairs.groupByKey(new RangePartitioner(3, pairs)) 19 | 20 | println("Result 1:") 21 | result1.foreach(println) 22 | 23 | println("Result 2:") 24 | result2.foreach(println) 25 | 26 | println("Result 3:") 27 | result3.foreach(println) 28 | 29 | } 30 | } --------------------------------------------------------------------------------
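GroupByKeyPair above shuffles every individual value so that the full per-key Iterable can be materialized. When only a per-key aggregate is needed, reduceByKey combines values on the map side before the shuffle, so far less data crosses the network. The sketch below is a hypothetical companion, not a file in this repository, written in the same local-mode style as the other pretty.examples programs.

    package pretty.examples

    import org.apache.spark.SparkContext
    import org.apache.spark.SparkContext._

    object ReduceByKeyPairSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "ReduceByKeyPair Sketch")
        val d = sc.parallelize(1 to 100, 10)

        // Same keying as GroupByKeyPair: key = x % 10.
        val pairs = d.keyBy(x => x % 10)

        // Per-key sums; partial sums are combined within each partition
        // before the shuffle, unlike groupByKey, which transfers every value.
        val sums = pairs.reduceByKey(_ + _)

        sums.foreach(println) // e.g. (0,550), (5,500), ...
      }
    }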