├── .cache
├── .classpath
├── .gitignore
├── .project
├── .settings
│   ├── org.eclipse.jdt.core.prefs
│   └── org.scala-ide.sdt.core.prefs
└── src
    ├── .DS_Store
    ├── api
    │   └── examples
    │       ├── Cartesian.scala
    │       ├── Checkpoint.scala
    │       ├── Coalesce.scala
    │       ├── Cogroup.scala
    │       ├── Collect.scala
    │       ├── CollectAsMap.scala
    │       ├── CombineByKey.scala
    │       ├── CountApproxDistinct.scala
    │       ├── GroupByKeyPair.scala
    │       ├── IntersectionTest.scala
    │       ├── Sample.scala
    │       └── Utils.scala
    ├── internals
    │   ├── IntersectionTest.scala
    │   ├── RepartitionTest2.scala
    │   ├── broadcastTest.scala
    │   ├── cartesianTest.scala
    │   ├── coalesceTest.scala
    │   ├── cogroupTest.scala
    │   ├── complexStages.scala
    │   ├── distinctTest.scala
    │   ├── groupByKeyTest.scala
    │   ├── hashjoinTest.scala
    │   ├── joinTest.scala
    │   ├── pipeTest.scala
    │   ├── reduceByKeyTest.scala
    │   ├── repartitionTest.scala
    │   └── sortByKeyTest.scala
    ├── local
    │   └── examples
    │       ├── Aggregate.scala
    │       ├── AggregateOrder.scala
    │       ├── Cartesian.scala
    │       ├── CollectAsMap.scala
    │       ├── FlatMap.scala
    │       ├── GenerateRandomText.scala
    │       ├── GroupByAction.scala
    │       ├── GroupByKey.scala
    │       ├── GroupByTest.scala
    │       ├── GroupWith.scala
    │       ├── JoinAction.scala
    │       ├── LocalWordCount.scala
    │       ├── LookUpTest.scala
    │       ├── MapPartitionsRDDTest.scala
    │       ├── MapValuesTest.scala
    │       ├── PipedRDDTest.scala
    │       ├── ReduceByKeyActionTest.scala
    │       ├── ReduceByKeyToDriverTest.scala
    │       ├── SparkLR.scala
    │       ├── TakeActionTest.scala
    │       ├── UnionTest.scala
    │       ├── partitionByTest.scala
    │       ├── reduceActionTest.scala
    │       └── sortByKeyTest.scala
    ├── org
    │   ├── .DS_Store
    │   └── apache
    │       ├── .DS_Store
    │       └── spark
    │           ├── .DS_Store
    │           └── examples
    │               ├── BroadcastTest.scala
    │               ├── CassandraCQLTest.scala
    │               ├── CassandraTest.scala
    │               ├── DriverSubmissionTest.scala
    │               ├── ExceptionHandlingTest.scala
    │               ├── GroupByTest.scala
    │               ├── HBaseTest.scala
    │               ├── HdfsTest.scala
    │               ├── LocalALS.scala
    │               ├── LocalFileLR.scala
    │               ├── LocalKMeans.scala
    │               ├── LocalLR.scala
    │               ├── LocalPi.scala
    │               ├── LogQuery.scala
    │               ├── MultiBroadcastTest.scala
    │               ├── SimpleSkewedGroupByTest.scala
    │               ├── SkewedGroupByTest.scala
    │               ├── SparkALS.scala
    │               ├── SparkHdfsLR.scala
    │               ├── SparkKMeans.scala
    │               ├── SparkLR.scala
    │               ├── SparkPageRank.scala
    │               ├── SparkPi.scala
    │               ├── SparkTC.scala
    │               ├── SparkTachyonHdfsLR.scala
    │               └── SparkTachyonPi.scala
    └── pretty
        └── examples
            ├── Aggregate.scala
            ├── Coalesce.scala
            ├── CogroupPair.scala
            └── GroupByKeyPair.scala
/.cache:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/.cache
--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 | <name>SparkLearning</name>
4 | <comment></comment>
5 | <projects>
6 | </projects>
7 | <buildSpec>
8 | <buildCommand>
9 | <name>org.scala-ide.sdt.core.scalabuilder</name>
10 | <arguments>
11 | </arguments>
12 | </buildCommand>
13 | </buildSpec>
14 | <natures>
15 | <nature>org.scala-ide.sdt.core.scalanature</nature>
16 | <nature>org.eclipse.jdt.core.javanature</nature>
17 | </natures>
18 | </projectDescription>
19 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.7
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.7
12 |
--------------------------------------------------------------------------------
/.settings/org.scala-ide.sdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | organizeimports.expandcollapse=expand
3 | organizeimports.groups=java$scala$org$com
4 | organizeimports.scalapackage=false
5 | organizeimports.wildcards=scalaz$scalaz.Scalaz
6 |
--------------------------------------------------------------------------------
/src/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/.DS_Store
--------------------------------------------------------------------------------
/src/api/examples/Cartesian.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Cartesian {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext("local", "Cartesian Test")
8 |
9 | val x = sc.parallelize(List(1, 2, 3, 4, 5))
10 | val y = sc.parallelize(List(6, 7, 8, 9, 10))
11 |
12 | println(x ++ y ++ x)
13 | val result = x.cartesian(y)
14 | //result.collect
15 | result.foreach(println)
16 | }
17 | }
--------------------------------------------------------------------------------
/src/api/examples/Checkpoint.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Checkpoint {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext("local", "Checkpoint Test")
8 |
9 | sc.setCheckpointDir("/Users/xulijie/Documents/data/checkpoint")
10 | val a = sc.parallelize(1 to 4, 2)
11 | a.checkpoint
12 | a.count
13 | }
14 | }
--------------------------------------------------------------------------------
/src/api/examples/Coalesce.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Coalesce {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext("local", "Coalesce Test")
8 |
9 | val y = sc.parallelize(1 to 10, 10)
10 |
11 | y.foreach(println)
12 |
13 | val z = y.coalesce(2, true)
14 |
15 | z.foreach(println)
16 | }
17 | }
--------------------------------------------------------------------------------
/src/api/examples/Cogroup.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object Cogroup {
7 | def main(args: Array[String]) {
8 | val sc = new SparkContext("local", "Cogroup Test")
9 |
10 | val a = sc.parallelize(List(1, 2, 1, 3), 2)
11 | val b = sc.parallelize(List(1, 2, 3, 4, 5, 6), 3)
12 | val d = a.map((_, "b"))
13 | //b.foreach(println)
14 | // output:
15 | // (1,b)
16 | // (2,b)
17 | // (1,b)
18 | // (3,b)
19 | val e = b.map((_, "c"))
20 | //c.foreach(println)
21 | // output:
22 | // (1,c)
23 | // (2,c)
24 | // (1,c)
25 | // (3,c)
26 |
27 | //val result = b.cogroup(c)
28 | val result = d.cogroup(e, 4)
29 | result.foreach(println)
30 | println(result.toDebugString)
31 | // output:
32 | // (1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
33 | // (3,(ArrayBuffer(b),ArrayBuffer(c)))
34 | // (2,(ArrayBuffer(b),ArrayBuffer(c)))
35 |
36 | /*
37 | * MappedValuesRDD[5] at cogroup at Cogroup.scala:28 (3 partitions)
38 | * CoGroupedRDD[4] at cogroup at Cogroup.scala:28 (3 partitions)
39 | * MappedRDD[2] at map at Cogroup.scala:12 (2 partitions)
40 | * ParallelCollectionRDD[0] at parallelize at Cogroup.scala:10 (2 partitions)
41 | * MappedRDD[3] at map at Cogroup.scala:19 (3 partitions)
42 | * ParallelCollectionRDD[1] at parallelize at Cogroup.scala:11 (3 partitions)
43 | *
44 | */
45 | }
46 | }
--------------------------------------------------------------------------------
/src/api/examples/Collect.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Collect {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext("local", "Collect Test")
8 |
9 | val c = sc.parallelize(List("Gnu", "cat", "Rat", "Dog", "Gnu", "Rat"), 2)
10 |
11 | val result = c.collect
12 | result.foreach(println)
13 | }
14 | }
--------------------------------------------------------------------------------
/src/api/examples/CollectAsMap.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object CollectAsMap {
7 | def main(args: Array[String]) {
8 | val sc = new SparkContext("local", "CollectAsMap Test")
9 |
10 | val a = sc.parallelize(List(1, 2, 1, 3), 1)
11 | val b = a.zip(a)
12 |
13 | val result = b.collectAsMap
14 |
15 | result.foreach(println)
16 |
17 | // output:
18 | // (2,2)
19 | // (1,1)
20 | // (3,3)
21 | }
22 | }
--------------------------------------------------------------------------------
/src/api/examples/CombineByKey.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object CombineByKey {
7 | def main(args: Array[String]) {
8 | val sc = new SparkContext("local", "CombineByKey Test")
9 |
10 | val a = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
11 | val b = sc.parallelize(List(1, 1, 2, 2, 2, 1, 2, 2, 2), 3)
12 | val c = b.zip(a)
13 |
14 | val d = c.combineByKey(List(_), (x:List[String], y:String)
15 | => y :: x, (x:List[String], y:List[String]) => x ::: y)
16 |
17 | val result = d.collect
18 | result.foreach(println)
19 | println("RDD graph:\n" + d.toDebugString)
20 | }
21 | }
--------------------------------------------------------------------------------
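A companion sketch (mine, not a file in this repository) of the same combineByKey signature put to a more typical use: computing a per-key average with a (sum, count) combiner. All names below are made up for illustration.

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

// Hypothetical sketch: per-key average via combineByKey.
object CombineByKeyAverageSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "CombineByKey Average Sketch")
    val scores = sc.parallelize(List(("a", 1.0), ("b", 4.0), ("a", 3.0), ("b", 2.0)), 2)

    val sumCount = scores.combineByKey(
      (v: Double) => (v, 1),                                              // createCombiner: first value seen for a key
      (acc: (Double, Int), v: Double) => (acc._1 + v, acc._2 + 1),        // mergeValue: fold a value into a partition-local combiner
      (a: (Double, Int), b: (Double, Int)) => (a._1 + b._1, a._2 + b._2)) // mergeCombiners: merge combiners across partitions

    sumCount.mapValues { case (sum, count) => sum / count }.collect.foreach(println)
    // (a,2.0) and (b,3.0)
  }
}
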
/src/api/examples/CountApproxDistinct.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object CountApproxDistinct {
7 | def main(args: Array[String]) {
8 | val sc = new SparkContext("local", "CountApproxDistinct Test")
9 |
10 | val a = sc.parallelize(1 to 10000, 20)
11 | val b = a++a++a++a++a
12 |
13 | val result = b.countApproxDistinct(0.1)
14 | println(result)
15 | //println(b.countApproxDistinct(0.05))
16 | //println(b.countApproxDistinct(0.01))
17 | //println(b.countApproxDistinct(0.001))
18 |
19 | }
20 | }
--------------------------------------------------------------------------------
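For contrast, a small sketch (not part of the repository) that puts the approximate count next to the exact one; the 0.05 relative accuracy is just an illustrative value.

import org.apache.spark.SparkContext

// Hypothetical sketch: approximate vs. exact distinct counts on data with duplicates.
object CountApproxDistinctSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "CountApproxDistinct Sketch")
    val data = sc.parallelize(1 to 10000, 20)
    val withDuplicates = data ++ data ++ data   // 30000 elements, 10000 distinct values

    println("exact:  " + withDuplicates.distinct.count)            // exactly 10000, but needs a shuffle
    println("approx: " + withDuplicates.countApproxDistinct(0.05)) // close to 10000, no full shuffle required
  }
}
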
/src/api/examples/GroupByKeyPair.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.RangePartitioner
6 |
7 | object GroupByKeyPair {
8 |
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext("local", "GroupByKeyPair Test")
12 | val d = sc.parallelize(1 to 100, 10)
13 |
14 | val pairs = d.keyBy(x => x % 10)
15 |
16 | val result1 = pairs.groupByKey()
17 | //val result2 = pairs.groupByKey(3)
18 | //val result3 = pairs.groupByKey(new RangePartitioner(3, pairs))
19 |
20 | println("Result 1:")
21 | result1.foreach(println)
22 |
23 | //println("Result 2:")
24 | //result2.foreach(println)
25 |
26 | //println("Result 3:")
27 | //result3.foreach(println)
28 |
29 | }
30 | }
--------------------------------------------------------------------------------
/src/api/examples/IntersectionTest.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.RangePartitioner
6 |
7 | object IntersectionTest {
8 |
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext("local", "Intersection Test")
12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3)
13 | val b = sc.parallelize(List(1, 2, 5, 6), 2)
14 | //val c = sc.parallelize(List(1, 2, 3), 1)
15 |
16 | val r = a.intersection(b)
17 | //r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
18 |
19 | println(r.toDebugString)
20 | // [PartitionIndex 1] 1
21 | // [PartitionIndex 2] 5
22 | // [PartitionIndex 2] 2
23 | }
24 | }
--------------------------------------------------------------------------------
/src/api/examples/Sample.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 |
6 | object Sample {
7 |
8 | def main(args: Array[String]) {
9 |
10 | val sc = new SparkContext("local", "Sample Test")
11 | val d = sc.parallelize(1 to 100, 10)
12 |
13 | val result1 = d.sample(false, 0.1, 0)
14 | val result2 = d.sample(true, 0.1, 0)
15 |
16 | println(result1.toDebugString)
17 |
18 | println("result 1:")
19 | result1.collect.foreach(x => print(x + " "))
20 | println("\nresutl 2:")
21 | result2.collect.foreach(x => print(x + " "))
22 | //result1.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
23 | //result2.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
24 | }
25 | }
--------------------------------------------------------------------------------
/src/api/examples/Utils.scala:
--------------------------------------------------------------------------------
1 | package api.examples
2 |
3 | class Utils {
4 |
5 | //def print(rdd: RDD[T])
6 | }
--------------------------------------------------------------------------------
/src/internals/IntersectionTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.RangePartitioner
6 |
7 | object IntersectionTest {
8 |
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext("local", "Intersection Test")
12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3)
13 | val b = sc.parallelize(List(1, 2, 5, 6), 2)
14 |
15 |
16 | val r = a.intersection(b)
17 |
18 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x))
19 | b.foreachWith(i => i)((x, i) => println("[bIndex " + i + "] " + x))
20 | r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
21 |
22 | println(r.toDebugString)
23 |
24 | /*
25 | [aIndex 0] 1
26 | [aIndex 0] 2
27 |
28 | [aIndex 1] 3
29 | [aIndex 1] 3
30 |
31 | [aIndex 2] 4
32 | [aIndex 2] 5
33 |
34 | [bIndex 0] 1
35 | [bIndex 0] 2
36 |
37 | [bIndex 1] 5
38 | [bIndex 1] 6
39 |
40 | [PartitionIndex 1] 1
41 |
42 | [PartitionIndex 2] 5
43 | [PartitionIndex 2] 2
44 |
45 | MappedRDD[7] at intersection at IntersectionTest.scala:16 (3 partitions)
46 | FilteredRDD[6] at intersection at IntersectionTest.scala:16 (3 partitions)
47 | MappedValuesRDD[5] at intersection at IntersectionTest.scala:16 (3 partitions)
48 | CoGroupedRDD[4] at intersection at IntersectionTest.scala:16 (3 partitions)
49 | MappedRDD[2] at intersection at IntersectionTest.scala:16 (3 partitions)
50 | ParallelCollectionRDD[0] at parallelize at IntersectionTest.scala:12 (3 partitions)
51 | MappedRDD[3] at intersection at IntersectionTest.scala:16 (2 partitions)
52 | ParallelCollectionRDD[1] at parallelize at IntersectionTest.scala:13 (2 partitions)
53 | */
54 | }
55 | }
--------------------------------------------------------------------------------
/src/internals/RepartitionTest2.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.HashPartitioner
6 |
7 | object RepartitionTest2 {
8 | def main(args: Array[String]) {
9 |
10 | val sc = new SparkContext("local", "repartition Test")
11 | val data = Array[(Int, Char)]((3, 'a'), (2, 'b'),
12 | (1, 'c'), (4, 'd'))
13 | val pairs1 = sc.parallelize(data, 3).partitionBy(new HashPartitioner(2))
14 |
15 | pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x))
16 | }
17 | }
18 | /*
19 | [pairs1-Index 0] (3,a)
20 | [pairs1-Index 0] (2,b)
21 | [pairs1-Index 0] (1,c)
22 |
23 | [pairs1-Index 1] (4,d)
24 | */
--------------------------------------------------------------------------------
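A companion sketch (mine, not a repository file) that inspects what partitionBy produced: the partitioner itself and each partition's contents, using glom purely for illustration.

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.HashPartitioner

// Hypothetical sketch: inspecting an RDD after partitionBy.
object PartitionBySketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "partitionBy Sketch")
    val pairs = sc.parallelize(Array((3, 'a'), (2, 'b'), (1, 'c'), (4, 'd')), 3)
      .partitionBy(new HashPartitioner(2))

    println(pairs.partitioner) // Some(org.apache.spark.HashPartitioner@...)

    // glom() turns each partition into an array so its contents can be printed together.
    pairs.glom().collect.zipWithIndex.foreach { case (part, i) =>
      println("partition " + i + ": " + part.mkString(", "))
    }
  }
}
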
/src/internals/broadcastTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.SparkContext
5 |
6 | object broadcast {
7 | def main(args: Array[String]) {
8 |
9 | val bcName = "Http"
10 | val blockSize = "4096"
11 |
12 | System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." + bcName +
13 | "BroadcastFactory")
14 | System.setProperty("spark.broadcast.blockSize", blockSize)
15 | val sparkConf = new SparkConf().setAppName("Broadcast Test").setMaster("local")
16 |
17 | val sc = new SparkContext(sparkConf)
18 |
19 | val slices = 2
20 | val num = 100
21 |
22 | val arr1 = new Array[Int](num)
23 |
24 | for (i <- 0 until arr1.length) {
25 | arr1(i) = i
26 | }
27 |
28 | val data = sc.makeRDD(List(1, 2, 3, 4, 5, 6), 2)
29 |
30 | val barr1 = sc.broadcast(arr1)
31 | val observedSizes = sc.parallelize(1 to 4, slices).map(_ => barr1.value.size)
32 | // Collect the small RDD so we can print the observed sizes locally.
33 | observedSizes.collect().foreach(i => println(i))
34 |
35 | //println(barr1.value.size)
36 | //barr1.value.collect
37 | }
38 | }
--------------------------------------------------------------------------------
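Alongside the size-measuring test above, a sketch (not a repository file) of the more common use of a broadcast variable: shipping a small lookup table to the workers once instead of capturing it in every task closure. The table contents are made up.

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

// Hypothetical sketch: broadcasting a small lookup table.
object BroadcastLookupSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("Broadcast Lookup Sketch").setMaster("local"))

    // Small, read-only table that every task needs.
    val countryNames = Map("CN" -> "China", "US" -> "United States", "JP" -> "Japan")
    val bcNames = sc.broadcast(countryNames)

    val codes = sc.parallelize(List("CN", "JP", "US", "CN"), 2)
    val resolved = codes.map(code => bcNames.value.getOrElse(code, "unknown"))

    resolved.collect.foreach(println)
  }
}
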
/src/internals/cartesianTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object cartesianTest {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "cartesian Test")
10 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'),
11 | (3, 'c'), (4, 'd'))
12 | val pairs1 = sc.parallelize(data1, 2)
13 |
14 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B'))
15 | val pairs2 = sc.parallelize(data2, 2)
16 |
17 | val result = pairs1.cartesian(pairs2)
18 |
19 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x))
20 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x))
21 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
22 |
23 | //println(result.toDebugString)
24 | }
25 | }
26 | /*
27 | [pairs1-Index 0] (1,a)
28 | [pairs1-Index 0] (2,b)
29 |
30 | [pairs1-Index 1] (3,c)
31 | [pairs1-Index 1] (4,d)
32 |
33 | [pairs2-Index 0] (1,A)
34 | [pairs2-Index 1] (2,B)
35 |
36 | [PartitionIndex 0] ((1,a),(1,A))
37 | [PartitionIndex 0] ((2,b),(1,A))
38 |
39 | [PartitionIndex 1] ((1,a),(2,B))
40 | [PartitionIndex 1] ((2,b),(2,B))
41 |
42 | [PartitionIndex 2] ((3,c),(1,A))
43 | [PartitionIndex 2] ((4,d),(1,A))
44 |
45 | [PartitionIndex 3] ((3,c),(2,B))
46 | [PartitionIndex 3] ((4,d),(2,B))
47 |
48 |
49 |
50 | CartesianRDD[2] at cartesian at cartesianTest.scala:17 (4 partitions)
51 | ParallelCollectionRDD[0] at parallelize at cartesianTest.scala:12 (2 partitions)
52 | ParallelCollectionRDD[1] at parallelize at cartesianTest.scala:15 (2 partitions)
53 |
54 | */
55 |
56 |
--------------------------------------------------------------------------------
/src/internals/coalesceTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object coalesceTest {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext("local", "Coalesce Test")
8 |
9 | //val y = sc.parallelize(1 to 10, 5)
10 | val y = sc.parallelize(List(1, 2, 3, 4, 5, 2, 5, 8, 3, 10), 5)
11 | // y.foreachWith(i => i)((x, i) => println("[yPartitionIndex " + i + "] " + x))
12 |
13 | val z = y.coalesce(10, false)
14 |
15 | y.foreachWith(i => i)((x, i) => println("[yPartitionIndex " + i + "] " + x))
16 | z.foreachWith(i => i)((x, i) => println("[zPartitionIndex " + i + "] " + x))
17 |
18 | println(z.toDebugString)
19 | }
20 | }
21 |
22 | /*
23 | [yPartitionIndex 0] 1
24 | [yPartitionIndex 0] 2
25 |
26 | [yPartitionIndex 1] 3
27 | [yPartitionIndex 1] 4
28 |
29 | [yPartitionIndex 2] 5
30 | [yPartitionIndex 2] 6
31 |
32 | [yPartitionIndex 3] 7
33 | [yPartitionIndex 3] 8
34 |
35 | [yPartitionIndex 4] 9
36 | [yPartitionIndex 4] 10
37 |
38 | [zPartitionIndex 0] 1
39 | [zPartitionIndex 0] 2
40 |
41 | [zPartitionIndex 1] 3
42 | [zPartitionIndex 1] 4
43 | [zPartitionIndex 1] 5
44 | [zPartitionIndex 1] 6
45 |
46 | [zPartitionIndex 2] 7
47 | [zPartitionIndex 2] 8
48 | [zPartitionIndex 2] 9
49 | [zPartitionIndex 2] 10
50 |
51 |
52 | CoalescedRDD[1] at coalesce at coalesceTest.scala:13 (3 partitions)
53 | ParallelCollectionRDD[0] at parallelize at coalesceTest.scala:9 (5 partitions)
54 |
55 |
56 | [zPartitionIndex 0] 6
57 | [zPartitionIndex 0] 7
58 | [zPartitionIndex 0] 9
59 |
60 | [zPartitionIndex 1] 1
61 | [zPartitionIndex 1] 3
62 | [zPartitionIndex 1] 8
63 | [zPartitionIndex 1] 10
64 |
65 | [zPartitionIndex 2] 2
66 | [zPartitionIndex 2] 4
67 | [zPartitionIndex 2] 5
68 |
69 |
70 |
71 |
72 | MappedRDD[4] at coalesce at coalesceTest.scala:13 (3 partitions)
73 | CoalescedRDD[3] at coalesce at coalesceTest.scala:13 (3 partitions)
74 | ShuffledRDD[2] at coalesce at coalesceTest.scala:13 (3 partitions)
75 | MapPartitionsRDD[1] at coalesce at coalesceTest.scala:13 (5 partitions)
76 | ParallelCollectionRDD[0] at parallelize at coalesceTest.scala:9 (5 partitions)
77 |
78 |
79 | */
--------------------------------------------------------------------------------
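One detail worth noting next to the example above: with shuffle = false, coalesce only merges existing partitions, so it can never increase the partition count; coalesce(10, false) on a 5-partition RDD still yields 5 partitions. A minimal sketch (mine, not a repository file):

import org.apache.spark.SparkContext

// Hypothetical sketch: coalesce without a shuffle cannot grow the partition count.
object CoalescePartitionCountSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "Coalesce Sketch")
    val y = sc.parallelize(1 to 10, 5)

    println(y.coalesce(10, false).partitions.length) // still 5: partitions can only be merged
    println(y.coalesce(10, true).partitions.length)  // 10: the shuffle redistributes the data
    println(y.coalesce(2, false).partitions.length)  // 2: shrinking works without a shuffle
  }
}
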
/src/internals/cogroupTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.RangePartitioner
6 |
7 | object cogroupTest {
8 |
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext("local", "cogroup Test")
12 | val a = sc.parallelize(List(1, 2, 3, 3, 4, 5), 3).map(x => (x, 'a'))
13 | val b = sc.parallelize(List(1, 2, 5, 6), 2).map(y => (y, 'b'))
14 |
15 |
16 | val r = a.cogroup(b)
17 |
18 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x))
19 | b.foreachWith(i => i)((x, i) => println("[bIndex " + i + "] " + x))
20 | r.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
21 |
22 | println(r.toDebugString)
23 |
24 | /*
25 | [aIndex 0] (1,a)
26 | [aIndex 0] (2,a)
27 |
28 | [aIndex 1] (3,a)
29 | [aIndex 1] (3,a)
30 |
31 | [aIndex 2] (4,a)
32 | [aIndex 2] (5,a)
33 |
34 | [bIndex 0] (1,b)
35 | [bIndex 0] (2,b)
36 |
37 | [bIndex 1] (5,b)
38 | [bIndex 1] (6,b)
39 |
40 | [PartitionIndex 0] (6,(ArrayBuffer(),ArrayBuffer(b)))
41 | [PartitionIndex 0] (3,(ArrayBuffer(a, a),ArrayBuffer()))
42 |
43 | [PartitionIndex 1] (4,(ArrayBuffer(a),ArrayBuffer()))
44 | [PartitionIndex 1] (1,(ArrayBuffer(a),ArrayBuffer(b)))
45 |
46 |
47 | [PartitionIndex 2] (5,(ArrayBuffer(a),ArrayBuffer(b)))
48 | [PartitionIndex 2] (2,(ArrayBuffer(a),ArrayBuffer(b)))
49 |
50 | MappedValuesRDD[5] at cogroup at cogroupTest.scala:16 (3 partitions)
51 | CoGroupedRDD[4] at cogroup at cogroupTest.scala:16 (3 partitions)
52 | MappedRDD[1] at map at cogroupTest.scala:12 (3 partitions)
53 | ParallelCollectionRDD[0] at parallelize at cogroupTest.scala:12 (3 partitions)
54 | MappedRDD[3] at map at cogroupTest.scala:13 (2 partitions)
55 | ParallelCollectionRDD[2] at parallelize at cogroupTest.scala:13 (2 partitions)
56 | */
57 | }
58 | }
--------------------------------------------------------------------------------
/src/internals/complexStages.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.HashPartitioner
6 |
7 |
8 | object complexStagesTest {
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext("local", "complexStages Test")
12 |
13 |
14 | val data1 = Array[(Int, Char)](
15 | (1, 'a'), (2, 'b'),
16 | (3, 'c'), (4, 'd'),
17 | (5, 'e'), (3, 'f'),
18 | (2, 'g'), (1, 'h'))
19 | val rangePairs1 = sc.parallelize(data1, 3)
20 |
21 | val hashPairs1 = rangePairs1.partitionBy(new HashPartitioner(3))
22 |
23 |
24 | val data2 = Array[(Int, String)]((1, "A"), (2, "B"),
25 | (3, "C"), (4, "D"))
26 |
27 | val pairs2 = sc.parallelize(data2, 2)
28 | val rangePairs2 = pairs2.map(x => (x._1, x._2.charAt(0)))
29 |
30 |
31 | val data3 = Array[(Int, Char)]((1, 'X'), (2, 'Y'))
32 | val rangePairs3 = sc.parallelize(data3, 2)
33 |
34 |
35 | val rangePairs = rangePairs2.union(rangePairs3)
36 |
37 |
38 | val result = hashPairs1.join(rangePairs)
39 |
40 | result.foreachWith(i => i)((x, i) => println("[result " + i + "] " + x))
41 |
42 | println(result.toDebugString)
43 | }
44 | }
--------------------------------------------------------------------------------
/src/internals/distinctTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object distinctTest {
7 | def main(args: Array[String]) {
8 | val sc = new SparkContext("local", "distinct test")
9 |
10 | val pairs = sc.parallelize(List(1, 2, 2, 3, 2, 1, 4, 5), 3)
11 |
12 | val result = pairs.distinct(2)
13 |
14 | // output
15 | // [PartitionIndex 0] 1
16 | // [PartitionIndex 0] 2
17 |
18 | // [PartitionIndex 1] 2
19 | // [PartitionIndex 1] 3
20 | // [PartitionIndex 1] 2
21 |
22 | // [PartitionIndex 2] 1
23 | // [PartitionIndex 2] 4
24 | // [PartitionIndex 2] 5
25 |
26 | pairs.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
27 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
28 |
29 | // output
30 | // [PartitionIndex 0] 4
31 | // [PartitionIndex 0] 2
32 |
33 | // [PartitionIndex 1] 1
34 | // [PartitionIndex 1] 3
35 | // [PartitionIndex 1] 5
36 |
37 | println(result.toDebugString)
38 | }
39 |
40 | /*
41 | MappedRDD[5] at distinct at distinctTest.scala:12 (2 partitions)
42 | MapPartitionsRDD[4] at distinct at distinctTest.scala:12 (2 partitions)
43 | ShuffledRDD[3] at distinct at distinctTest.scala:12 (2 partitions)
44 | MapPartitionsRDD[2] at distinct at distinctTest.scala:12 (3 partitions)
45 | MappedRDD[1] at distinct at distinctTest.scala:12 (3 partitions)
46 | ParallelCollectionRDD[0] at parallelize at distinctTest.scala:10 (3 partitions)
47 | *
48 | */
49 | }
--------------------------------------------------------------------------------
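The lineage printed above (map, then a shuffle, then map again) reflects distinct being built on top of reduceByKey. A hand-written equivalent, as a sketch rather than a repository file:

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

// Hypothetical sketch: roughly what distinct(2) does under the hood.
object DistinctViaReduceByKeySketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "distinct sketch")
    val pairs = sc.parallelize(List(1, 2, 2, 3, 2, 1, 4, 5), 3)

    // Key every element, collapse duplicate keys with reduceByKey into 2 partitions, drop the dummy value.
    val manualDistinct = pairs.map(x => (x, 1)).reduceByKey((a, b) => a, 2).map(_._1)

    manualDistinct.collect.sorted.foreach(println) // 1 2 3 4 5
  }
}
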
/src/internals/groupByKeyTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.SparkConf
6 |
7 | object groupByKeyTest {
8 |
9 | def main(args: Array[String]) {
10 | val conf = new SparkConf().setAppName("GroupByKey").setMaster("local")
11 | val sc = new SparkContext(conf)
12 | sc.setCheckpointDir("/Users/xulijie/Documents/data/checkpoint")
13 |
14 | val data = Array[(Int, Char)]((1, 'a'), (2, 'b'),
15 | (3, 'c'), (4, 'd'),
16 | (5, 'e'), (3, 'f'),
17 | (2, 'g'), (1, 'h')
18 |
19 | )
20 | val pairs = sc.parallelize(data, 3)
21 |
22 | pairs.checkpoint
23 | pairs.count
24 |
25 | val result = pairs.groupByKey(2)
26 |
27 | // output:
28 | //pairs.foreachWith(i => i)((x, i) => println("[dataPartitionIndex " + i + "] " + x))
29 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
30 |
31 | println(result.toDebugString)
32 |
33 | /*
34 | [dataPartitionIndex 0] (1,a)
35 | [dataPartitionIndex 0] (2,b)
36 |
37 | [dataPartitionIndex 1] (3,c)
38 | [dataPartitionIndex 1] (4,d)
39 | [dataPartitionIndex 1] (5,e)
40 |
41 | [dataPartitionIndex 2] (3,f)
42 | [dataPartitionIndex 2] (2,g)
43 | [dataPartitionIndex 2] (1,h)
44 |
45 | [PartitionIndex 0] (4,ArrayBuffer(d))
46 | [PartitionIndex 0] (2,ArrayBuffer(b, g))
47 |
48 | [PartitionIndex 1] (1,ArrayBuffer(a, h))
49 | [PartitionIndex 1] (3,ArrayBuffer(c, f))
50 | [PartitionIndex 1] (5,ArrayBuffer(e))
51 |
52 | MappedValuesRDD[3] at groupByKey at groupByKeyTest.scala:19 (2 partitions)
53 | MapPartitionsRDD[2] at groupByKey at groupByKeyTest.scala:19 (2 partitions)
54 | ShuffledRDD[1] at groupByKey at groupByKeyTest.scala:19 (2 partitions)
55 | ParallelCollectionRDD[0] at parallelize at groupByKeyTest.scala:17 (3 partitions)
56 | */
57 | }
58 | }
--------------------------------------------------------------------------------
/src/internals/hashjoinTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.HashPartitioner
6 |
7 | object hashjoinTest {
8 | def main(args: Array[String]) {
9 |
10 | val sc = new SparkContext("local", "hashjoin Test")
11 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'),
12 | (3, 'c'), (4, 'd'),
13 | (5, 'e'), (3, 'f'),
14 | (2, 'g'), (1, 'h'))
15 | val pairs1 = sc.parallelize(data1, 3).partitionBy(new HashPartitioner(3))
16 |
17 |
18 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B'),
19 | (3, 'C'), (4, 'D'))
20 | val pairs2 = sc.parallelize(data2, 2)
21 |
22 | val result = pairs1.join(pairs2)
23 |
24 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x))
25 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x))
26 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
27 |
28 | println(result.toDebugString)
29 | /*
30 | [pairs1-Index 0] (1,a)
31 | [pairs1-Index 0] (2,b)
32 |
33 | [pairs1-Index 1] (3,c)
34 | [pairs1-Index 1] (4,d)
35 | [pairs1-Index 1] (5,e)
36 |
37 | [pairs1-Index 2] (3,f)
38 | [pairs1-Index 2] (2,g)
39 | [pairs1-Index 2] (1,h)
40 |
41 | [pairs2-Index 0] (1,A)
42 | [pairs2-Index 0] (2,B)
43 |
44 | [pairs2-Index 1] (3,C)
45 | [pairs2-Index 1] (4,D)
46 |
47 | [PartitionIndex 0] (3,(c,C))
48 | [PartitionIndex 0] (3,(f,C))
49 |
50 | [PartitionIndex 1] (4,(d,D))
51 | [PartitionIndex 1] (1,(a,A))
52 | [PartitionIndex 1] (1,(h,A))
53 |
54 | [PartitionIndex 2] (2,(b,B))
55 | [PartitionIndex 2] (2,(g,B))
56 |
57 | FlatMappedValuesRDD[4] at join at joinTest.scala:20 (3 partitions)
58 | MappedValuesRDD[3] at join at joinTest.scala:20 (3 partitions)
59 | CoGroupedRDD[2] at join at joinTest.scala:20 (3 partitions)
60 | ParallelCollectionRDD[0] at parallelize at joinTest.scala:14 (3 partitions)
61 | ParallelCollectionRDD[1] at parallelize at joinTest.scala:18 (2 partitions)
62 |
63 | */
64 | }
65 | }
--------------------------------------------------------------------------------
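A related sketch (mine, not a repository file): when both sides of a join are already partitioned with the same partitioner, the join reuses it, so neither input has to be shuffled again.

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.HashPartitioner

// Hypothetical sketch: joining two co-partitioned RDDs.
object CoPartitionedJoinSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "co-partitioned join sketch")
    val part = new HashPartitioner(3)

    val left  = sc.parallelize(Array((1, 'a'), (2, 'b'), (3, 'c')), 3).partitionBy(part)
    val right = sc.parallelize(Array((1, 'A'), (2, 'B'), (4, 'D')), 2).partitionBy(part)

    val joined = left.join(right) // picks up `part`, so the join itself adds no new shuffle
    println(joined.toDebugString)
    joined.collect.foreach(println) // (1,(a,A)) and (2,(b,B))
  }
}
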
/src/internals/joinTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.HashPartitioner
6 |
7 | object joinTest {
8 | def main(args: Array[String]) {
9 |
10 | val sc = new SparkContext("local", "join Test")
11 |
12 |
13 | val data1 = Array[(Int, Char)]((1, 'a'), (2, 'b'),
14 | (3, 'c'), (4, 'd'),
15 | (5, 'e'), (3, 'f'),
16 | (2, 'g'), (1, 'h'))
17 | val pairs1 = sc.parallelize(data1, 3)
18 |
19 |
20 | val data2 = Array[(Int, Char)]((1, 'A'), (2, 'B'),
21 | (3, 'C'), (4, 'D'))
22 | val pairs2 = sc.parallelize(data2, 2)
23 |
24 |
25 | val result = pairs1.join(pairs2)
26 |
27 | //pairs1.foreachWith(i => i)((x, i) => println("[pairs1-Index " + i + "] " + x))
28 | //pairs2.foreachWith(i => i)((x, i) => println("[pairs2-Index " + i + "] " + x))
29 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
30 |
31 | println(result.toDebugString)
32 |
33 | /*
34 | [pairs1-Index 0] (1,a)
35 | [pairs1-Index 0] (2,b)
36 |
37 | [pairs1-Index 1] (3,c)
38 | [pairs1-Index 1] (4,d)
39 | [pairs1-Index 1] (5,e)
40 |
41 | [pairs1-Index 2] (3,f)
42 | [pairs1-Index 2] (2,g)
43 | [pairs1-Index 2] (1,h)
44 |
45 | [pairs2-Index 0] (1,A)
46 | [pairs2-Index 0] (2,B)
47 |
48 | [pairs2-Index 1] (3,C)
49 | [pairs2-Index 1] (4,D)
50 |
51 | [PartitionIndex 0] (3,(c,C))
52 | [PartitionIndex 0] (3,(f,C))
53 |
54 | [PartitionIndex 1] (4,(d,D))
55 | [PartitionIndex 1] (1,(a,A))
56 | [PartitionIndex 1] (1,(h,A))
57 |
58 | [PartitionIndex 2] (2,(b,B))
59 | [PartitionIndex 2] (2,(g,B))
60 |
61 | FlatMappedValuesRDD[4] at join at joinTest.scala:20 (3 partitions)
62 | MappedValuesRDD[3] at join at joinTest.scala:20 (3 partitions)
63 | CoGroupedRDD[2] at join at joinTest.scala:20 (3 partitions)
64 | ParallelCollectionRDD[0] at parallelize at joinTest.scala:14 (3 partitions)
65 | ParallelCollectionRDD[1] at parallelize at joinTest.scala:18 (2 partitions)
66 |
67 | */
68 | }
69 | }
--------------------------------------------------------------------------------
/src/internals/pipeTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object pipeTest {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "cartesian Test")
10 |
11 | val a = sc.parallelize(1 to 9, 3)
12 | val result = a.pipe("head -n 2")
13 |
14 | a.foreachWith(i => i)((x, i) => println("[aIndex " + i + "] " + x))
15 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
16 |
17 | println(result.toDebugString)
18 | }
19 | }
20 | /*
21 | [PartitionIndex 0] 1
22 | [PartitionIndex 0] 2
23 |
24 | [PartitionIndex 1] 4
25 | [PartitionIndex 1] 5
26 |
27 | [PartitionIndex 2] 7
28 | [PartitionIndex 2] 8
29 |
30 |
31 |
32 | PipedRDD[1] at pipe at pipeTest.scala:12 (3 partitions)
33 | ParallelCollectionRDD[0] at parallelize at pipeTest.scala:11 (3 partitions)
34 |
35 | */
36 |
37 |
--------------------------------------------------------------------------------
/src/internals/reduceByKeyTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object reduceByKeyTest {
7 |
8 | def main(args: Array[String]) {
9 | val sc = new SparkContext("local", "ReduceByKey Test")
10 | val data1 = Array[(String, Int)](("A", 1), ("B", 1),
11 | ("C", 1), ("B", 1),
12 | ("C", 1), ("D", 1),
13 | ("C", 1), ("A", 1))
14 | val pairs = sc.parallelize(data1, 3)
15 |
16 | // pairs.foreachWith(i => i)((x, i) => println("[pPartitionIndex " + i + "] " + x))
17 |
18 | // [pPartitionIndex 0] (A,1)
19 | // [pPartitionIndex 0] (B,1)
20 |
21 | // [pPartitionIndex 1] (C,1)
22 | // [pPartitionIndex 1] (B,1)
23 | // [pPartitionIndex 1] (C,1)
24 |
25 | // [pPartitionIndex 2] (D,1)
26 | // [pPartitionIndex 2] (C,1)
27 | // [pPartitionIndex 2] (A,1)
28 |
29 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
30 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
31 | val result = pairs.reduceByKey(_ + _, 2)
32 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
33 |
34 | println(result.toDebugString)
35 |
36 | // output
37 | // [PartitionIndex 0] (B,2)
38 | // [PartitionIndex 0] (D,1)
39 | // [PartitionIndex 1] (A,2)
40 | // [PartitionIndex 1] (C,3)
41 |
42 | /*
43 | MapPartitionsRDD[3] at reduceByKey at reduceByKeyTest.scala:17 (2 partitions)
44 | ShuffledRDD[2] at reduceByKey at reduceByKeyTest.scala:17 (2 partitions)
45 | MapPartitionsRDD[1] at reduceByKey at reduceByKeyTest.scala:17 (3 partitions)
46 | ParallelCollectionRDD[0] at parallelize at reduceByKeyTest.scala:14 (3 partitions)
47 | */
48 | }
49 | }
--------------------------------------------------------------------------------
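As a companion (not a repository file), the same per-key sum written with groupByKey for comparison; reduceByKey combines values inside each map-side partition before the shuffle, while groupByKey ships every pair across it.

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

// Hypothetical sketch: reduceByKey vs. groupByKey for a per-key sum.
object ReduceVsGroupByKeySketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "reduceByKey sketch")
    val pairs = sc.parallelize(List(("A", 1), ("B", 1), ("C", 1), ("B", 1), ("C", 1), ("A", 1)), 3)

    val viaReduce = pairs.reduceByKey(_ + _)            // partial sums computed before the shuffle
    val viaGroup  = pairs.groupByKey().mapValues(_.sum) // all pairs shuffled, summed afterwards

    viaReduce.collect.sorted.foreach(println) // (A,2) (B,2) (C,2)
    viaGroup.collect.sorted.foreach(println)  // same result
  }
}
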
/src/internals/repartitionTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object repartitionTest {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "Coalesce Test")
10 | val y = sc.parallelize(1 to 100, 5)
11 |
12 | //y.foreach(println)
13 |
14 | val z = y.repartition(2)
15 |
16 | val r = z.takeOrdered(7)
17 | z.foreach(println)
18 | }
19 | }
--------------------------------------------------------------------------------
/src/internals/sortByKeyTest.scala:
--------------------------------------------------------------------------------
1 | package internals
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object sortByKeyTest {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "sortByKey Test")
10 | val data1 = Array[(Char, Int)](('A', 5), ('B', 4),
11 | ('C', 3), ('B', 2),
12 | ('C', 1), ('D', 2),
13 | ('C', 3), ('A', 4))
14 | val pairs = sc.parallelize(data1, 3)
15 |
16 | val result = pairs.sortByKey(true, 2)
17 | pairs.foreachWith(i => i)((x, i) => println("[pairsPartitionIndex " + i + "] " + x))
18 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
19 |
20 | println(result.toDebugString)
21 | }
22 |
23 | }
24 |
25 | /*
26 | [pairsPartitionIndex 0] (A,5)
27 | [pairsPartitionIndex 0] (B,4)
28 |
29 | [pairsPartitionIndex 1] (C,3)
30 | [pairsPartitionIndex 1] (B,2)
31 | [pairsPartitionIndex 1] (C,1)
32 |
33 | [pairsPartitionIndex 2] (D,2)
34 | [pairsPartitionIndex 2] (C,3)
35 | [pairsPartitionIndex 2] (A,4)
36 |
37 | [PartitionIndex 0] (A,5)
38 | [PartitionIndex 0] (A,4)
39 | [PartitionIndex 0] (B,4)
40 | [PartitionIndex 0] (B,2)
41 |
42 | [PartitionIndex 1] (C,3)
43 | [PartitionIndex 1] (C,1)
44 | [PartitionIndex 1] (C,3)
45 | [PartitionIndex 1] (D,2)
46 |
47 | MapPartitionsRDD[4] at sortByKey at sortByKeyTest.scala:16 (2 partitions)
48 | ShuffledRDD[3] at sortByKey at sortByKeyTest.scala:16 (2 partitions)
49 | ParallelCollectionRDD[0] at parallelize at sortByKeyTest.scala:14 (3 partitions)
50 | */
51 |
--------------------------------------------------------------------------------
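Companion sketch (not a repository file): sortByKey range-partitions the data, so each output partition holds a disjoint, already-sorted key range; glom makes that visible. The exact split depends on RangePartitioner's sampling.

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._

// Hypothetical sketch: each sortByKey output partition covers a disjoint key range.
object SortByKeyRangesSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "sortByKey sketch")
    val pairs = sc.parallelize(Array(('A', 5), ('B', 4), ('C', 3), ('B', 2), ('C', 1), ('D', 2)), 3)

    val sorted = pairs.sortByKey(true, 2)
    sorted.glom().collect.zipWithIndex.foreach { case (part, i) =>
      println("partition " + i + ": " + part.mkString(" "))
    }
    // e.g. partition 0: (A,5) (B,4) (B,2)    partition 1: (C,3) (C,1) (D,2)
  }
}
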
/src/local/examples/Aggregate.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Aggregate {
6 |
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "AggregateAction Test")
10 | val data = Array[(String, Int)](("A1", 1), ("A2", 2),
11 | ("B1", 3), ("B2", 4),
12 | ("C1", 5), ("C2", 6))
13 |
14 | val pairs = sc.parallelize(data, 3)
15 |
16 | // output:
17 | // (A1,1)(A2,2)
18 | // (B1,3)(B2,4)
19 | // (C1,5)(C2,6)
20 | pairs.foreach(print)
21 |
22 | val result = pairs.aggregate(("", 0))((U, T) => (U._1 + T._1, U._2 + T._2), (U, T) =>
23 | ("[" + U._1 + T._1 + "] ", U._2 + T._2))
24 |
25 | // output ([[[A1A2] B1B2] C1C2] ,21)
26 | println(result)
27 | }
28 | }
--------------------------------------------------------------------------------
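A companion sketch (mine, not a repository file) of aggregate used for arithmetic instead of string concatenation: a (sum, count) accumulator computed in one pass, with the mean derived at the end. The zero value (0, 0) is my own choice.

import org.apache.spark.SparkContext

// Hypothetical sketch: aggregate computing (sum, count) in a single pass.
object AggregateMeanSketch {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "Aggregate Mean Sketch")
    val nums = sc.parallelize(1 to 6, 3)

    val (sum, count) = nums.aggregate((0, 0))(
      (acc, x) => (acc._1 + x, acc._2 + 1),   // seqOp: fold one element into a partition's accumulator
      (a, b) => (a._1 + b._1, a._2 + b._2))   // combOp: merge the per-partition accumulators

    println("mean = " + sum.toDouble / count) // 21 / 6 = 3.5
  }
}
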
/src/local/examples/AggregateOrder.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object AggregateOrder {
6 |
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "AggregateOrder Test")
10 | val data = List("12", "23", "345", "4567")
11 |
12 | val pairs = sc.parallelize(data, 2)
13 | pairs.foreach(x => println(x.length))
14 |
15 | //val result = pairs.aggregate("")((x,y) => math.min(x.length, y.length).toString, (x,y) => x + y)
16 |
17 | val result2 = pairs.aggregate("")((x,y) => "[" + x.length + "," + y.length + "] ", (x,y) => x + y)
18 |
19 | result2.foreach(println)
20 | println(result2)
21 |
22 | }
23 | }
--------------------------------------------------------------------------------
/src/local/examples/Cartesian.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Cartesian {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext("local", "Cartesian Test")
8 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
9 | ("B1", 3), ("B2", 4),
10 | ("C1", 5), ("C1", 6))
11 |
12 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
13 | ("B1", 9), ("C1", 0))
14 | val pairs1 = sc.parallelize(data1, 3)
15 | val pairs2 = sc.parallelize(data2, 2)
16 |
17 | val resultRDD = pairs1.cartesian(pairs2)
18 |
19 | resultRDD.foreach(println)
20 |
21 | /*
22 | * Output of task1:
23 | * ((A1,1),(A1,7))
24 | * ((A1,1),(A2,8))
25 | * ((A2,2),(A1,7))
26 | * ((A2,2),(A2,8))
27 | * Output of task2:
28 | * ((A1,1),(B1,9))
29 | * ((A1,1),(C1,0))
30 | * ((A2,2),(B1,9))
31 | * ((A2,2),(C1,0))
32 | * Output of task3:
33 | * ((B1,3),(A1,7))
34 | * ((B1,3),(A2,8))
35 | * ((B2,4),(A1,7))
36 | * ((B2,4),(A2,8))
37 | * Output of task4:
38 | * ((B1,3),(B1,9))
39 | * ((B1,3),(C1,0))
40 | * ((B2,4),(B1,9))
41 | * ((B2,4),(C1,0))
42 | * Output of task5:
43 | * ((C1,5),(A1,7))
44 | * ((C1,5),(A2,8))
45 | * ((C1,6),(A1,7))
46 | * ((C1,6),(A2,8))
47 | * Output of task6:
48 | * ((C1,5),(B1,9))
49 | * ((C1,5),(C1,0))
50 | * ((C1,6),(B1,9))
51 | * ((C1,6),(C1,0))
52 | */
53 |
54 | }
55 | }
--------------------------------------------------------------------------------
/src/local/examples/CollectAsMap.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object CollectAsMap {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "CollectAsMap Test")
10 | val data = Array[(String, Int)](("A", 1), ("B", 2),
11 | ("B", 3), ("C", 4),
12 | ("C", 5), ("C", 6))
13 |
14 | // same as "val pairs = sc.parallelize(data, 3)"
15 | val pairs = sc.makeRDD(data, 3)
16 |
17 | val result = pairs.collectAsMap
18 |
19 | // output Map(A -> 1, C -> 6, B -> 3)
20 | print(result)
21 | }
22 |
23 | }
--------------------------------------------------------------------------------
/src/local/examples/FlatMap.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object FlatMap {
6 | def main(args: Array[String]) {
7 |
8 | val sc = new SparkContext("local", "FlatMap Test")
9 | val data = Array[(String, Int)](("A", 1), ("B", 2),
10 | ("B", 3), ("C", 4),
11 | ("C", 5), ("C", 6)
12 | )
13 | val pairs = sc.makeRDD(data, 3)
14 |
15 | val result = pairs.flatMap(T => (T._1 + T._2))
16 |
17 | result.foreach(println)
18 |
19 | }
20 | }
--------------------------------------------------------------------------------
/src/local/examples/GenerateRandomText.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import java.io.File
4 | import java.io.FileWriter
5 | import java.util.Random
6 |
7 | object GenerateRandomText {
8 |
9 | def main(args: Array[String]) {
10 | val outputPath = new File("/Users/xulijie/Documents/data/RandomText/randomText-10MB.txt")
11 | if(!outputPath.getParentFile().exists())
12 | outputPath.getParentFile.mkdirs()
13 |
14 | val writer = new FileWriter(outputPath)
15 | val ranGen = new Random
16 |
17 |
18 | while (outputPath.length() < 10 * 1024 * 1024) {
19 | var index = Math.abs(ranGen.nextGaussian() * 1000) % 1000
20 | writer.write(words.apply(index.toInt))
21 | writer.write(" ")
22 | index = Math.abs(ranGen.nextGaussian() * 1000) % 1000
23 | writer.write(words.apply(index.toInt))
24 | writer.write("\n")
25 | }
26 | writer.close()
27 | }
28 |
29 | def printToFile(f: java.io.File)(op: java.io.PrintWriter => Unit) {
30 | val p = new java.io.PrintWriter(f)
31 | try { op(p) } finally { p.close() }
32 | }
33 |
34 | /**
35 | * A random list of 1000 words from /usr/share/dict/words
36 | */
37 | private val words = Array[String](
38 | "diurnalness", "Homoiousian",
39 | "spiranthic", "tetragynian",
40 | "silverhead", "ungreat",
41 | "lithograph", "exploiter",
42 | "physiologian", "by",
43 | "hellbender", "Filipendula",
44 | "undeterring", "antiscolic",
45 | "pentagamist", "hypoid",
46 | "cacuminal", "sertularian",
47 | "schoolmasterism", "nonuple",
48 | "gallybeggar", "phytonic",
49 | "swearingly", "nebular",
50 | "Confervales", "thermochemically",
51 | "characinoid", "cocksuredom",
52 | "fallacious", "feasibleness",
53 | "debromination", "playfellowship",
54 | "tramplike", "testa",
55 | "participatingly", "unaccessible",
56 | "bromate", "experientialist",
57 | "roughcast", "docimastical",
58 | "choralcelo", "blightbird",
59 | "peptonate", "sombreroed",
60 | "unschematized", "antiabolitionist",
61 | "besagne", "mastication",
62 | "bromic", "sviatonosite",
63 | "cattimandoo", "metaphrastical",
64 | "endotheliomyoma", "hysterolysis",
65 | "unfulminated", "Hester",
66 | "oblongly", "blurredness",
67 | "authorling", "chasmy",
68 | "Scorpaenidae", "toxihaemia",
69 | "Dictograph", "Quakerishly",
70 | "deaf", "timbermonger",
71 | "strammel", "Thraupidae",
72 | "seditious", "plerome",
73 | "Arneb", "eristically",
74 | "serpentinic", "glaumrie",
75 | "socioromantic", "apocalypst",
76 | "tartrous", "Bassaris",
77 | "angiolymphoma", "horsefly",
78 | "kenno", "astronomize",
79 | "euphemious", "arsenide",
80 | "untongued", "parabolicness",
81 | "uvanite", "helpless",
82 | "gemmeous", "stormy",
83 | "templar", "erythrodextrin",
84 | "comism", "interfraternal",
85 | "preparative", "parastas",
86 | "frontoorbital", "Ophiosaurus",
87 | "diopside", "serosanguineous",
88 | "ununiformly", "karyological",
89 | "collegian", "allotropic",
90 | "depravity", "amylogenesis",
91 | "reformatory", "epidymides",
92 | "pleurotropous", "trillium",
93 | "dastardliness", "coadvice",
94 | "embryotic", "benthonic",
95 | "pomiferous", "figureheadship",
96 | "Megaluridae", "Harpa",
97 | "frenal", "commotion",
98 | "abthainry", "cobeliever",
99 | "manilla", "spiciferous",
100 | "nativeness", "obispo",
101 | "monilioid", "biopsic",
102 | "valvula", "enterostomy",
103 | "planosubulate", "pterostigma",
104 | "lifter", "triradiated",
105 | "venialness", "tum",
106 | "archistome", "tautness",
107 | "unswanlike", "antivenin",
108 | "Lentibulariaceae", "Triphora",
109 | "angiopathy", "anta",
110 | "Dawsonia", "becomma",
111 | "Yannigan", "winterproof",
112 | "antalgol", "harr",
113 | "underogating", "ineunt",
114 | "cornberry", "flippantness",
115 | "scyphostoma", "approbation",
116 | "Ghent", "Macraucheniidae",
117 | "scabbiness", "unanatomized",
118 | "photoelasticity", "eurythermal",
119 | "enation", "prepavement",
120 | "flushgate", "subsequentially",
121 | "Edo", "antihero",
122 | "Isokontae", "unforkedness",
123 | "porriginous", "daytime",
124 | "nonexecutive", "trisilicic",
125 | "morphiomania", "paranephros",
126 | "botchedly", "impugnation",
127 | "Dodecatheon", "obolus",
128 | "unburnt", "provedore",
129 | "Aktistetae", "superindifference",
130 | "Alethea", "Joachimite",
131 | "cyanophilous", "chorograph",
132 | "brooky", "figured",
133 | "periclitation", "quintette",
134 | "hondo", "ornithodelphous",
135 | "unefficient", "pondside",
136 | "bogydom", "laurinoxylon",
137 | "Shiah", "unharmed",
138 | "cartful", "noncrystallized",
139 | "abusiveness", "cromlech",
140 | "japanned", "rizzomed",
141 | "underskin", "adscendent",
142 | "allectory", "gelatinousness",
143 | "volcano", "uncompromisingly",
144 | "cubit", "idiotize",
145 | "unfurbelowed", "undinted",
146 | "magnetooptics", "Savitar",
147 | "diwata", "ramosopalmate",
148 | "Pishquow", "tomorn",
149 | "apopenptic", "Haversian",
150 | "Hysterocarpus", "ten",
151 | "outhue", "Bertat",
152 | "mechanist", "asparaginic",
153 | "velaric", "tonsure",
154 | "bubble", "Pyrales",
155 | "regardful", "glyphography",
156 | "calabazilla", "shellworker",
157 | "stradametrical", "havoc",
158 | "theologicopolitical", "sawdust",
159 | "diatomaceous", "jajman",
160 | "temporomastoid", "Serrifera",
161 | "Ochnaceae", "aspersor",
162 | "trailmaking", "Bishareen",
163 | "digitule", "octogynous",
164 | "epididymitis", "smokefarthings",
165 | "bacillite", "overcrown",
166 | "mangonism", "sirrah",
167 | "undecorated", "psychofugal",
168 | "bismuthiferous", "rechar",
169 | "Lemuridae", "frameable",
170 | "thiodiazole", "Scanic",
171 | "sportswomanship", "interruptedness",
172 | "admissory", "osteopaedion",
173 | "tingly", "tomorrowness",
174 | "ethnocracy", "trabecular",
175 | "vitally", "fossilism",
176 | "adz", "metopon",
177 | "prefatorial", "expiscate",
178 | "diathermacy", "chronist",
179 | "nigh", "generalizable",
180 | "hysterogen", "aurothiosulphuric",
181 | "whitlowwort", "downthrust",
182 | "Protestantize", "monander",
183 | "Itea", "chronographic",
184 | "silicize", "Dunlop",
185 | "eer", "componental",
186 | "spot", "pamphlet",
187 | "antineuritic", "paradisean",
188 | "interruptor", "debellator",
189 | "overcultured", "Florissant",
190 | "hyocholic", "pneumatotherapy",
191 | "tailoress", "rave",
192 | "unpeople", "Sebastian",
193 | "thermanesthesia", "Coniferae",
194 | "swacking", "posterishness",
195 | "ethmopalatal", "whittle",
196 | "analgize", "scabbardless",
197 | "naught", "symbiogenetically",
198 | "trip", "parodist",
199 | "columniform", "trunnel",
200 | "yawler", "goodwill",
201 | "pseudohalogen", "swangy",
202 | "cervisial", "mediateness",
203 | "genii", "imprescribable",
204 | "pony", "consumptional",
205 | "carposporangial", "poleax",
206 | "bestill", "subfebrile",
207 | "sapphiric", "arrowworm",
208 | "qualminess", "ultraobscure",
209 | "thorite", "Fouquieria",
210 | "Bermudian", "prescriber",
211 | "elemicin", "warlike",
212 | "semiangle", "rotular",
213 | "misthread", "returnability",
214 | "seraphism", "precostal",
215 | "quarried", "Babylonism",
216 | "sangaree", "seelful",
217 | "placatory", "pachydermous",
218 | "bozal", "galbulus",
219 | "spermaphyte", "cumbrousness",
220 | "pope", "signifier",
221 | "Endomycetaceae", "shallowish",
222 | "sequacity", "periarthritis",
223 | "bathysphere", "pentosuria",
224 | "Dadaism", "spookdom",
225 | "Consolamentum", "afterpressure",
226 | "mutter", "louse",
227 | "ovoviviparous", "corbel",
228 | "metastoma", "biventer",
229 | "Hydrangea", "hogmace",
230 | "seizing", "nonsuppressed",
231 | "oratorize", "uncarefully",
232 | "benzothiofuran", "penult",
233 | "balanocele", "macropterous",
234 | "dishpan", "marten",
235 | "absvolt", "jirble",
236 | "parmelioid", "airfreighter",
237 | "acocotl", "archesporial",
238 | "hypoplastral", "preoral",
239 | "quailberry", "cinque",
240 | "terrestrially", "stroking",
241 | "limpet", "moodishness",
242 | "canicule", "archididascalian",
243 | "pompiloid", "overstaid",
244 | "introducer", "Italical",
245 | "Christianopaganism", "prescriptible",
246 | "subofficer", "danseuse",
247 | "cloy", "saguran",
248 | "frictionlessly", "deindividualization",
249 | "Bulanda", "ventricous",
250 | "subfoliar", "basto",
251 | "scapuloradial", "suspend",
252 | "stiffish", "Sphenodontidae",
253 | "eternal", "verbid",
254 | "mammonish", "upcushion",
255 | "barkometer", "concretion",
256 | "preagitate", "incomprehensible",
257 | "tristich", "visceral",
258 | "hemimelus", "patroller",
259 | "stentorophonic", "pinulus",
260 | "kerykeion", "brutism",
261 | "monstership", "merciful",
262 | "overinstruct", "defensibly",
263 | "bettermost", "splenauxe",
264 | "Mormyrus", "unreprimanded",
265 | "taver", "ell",
266 | "proacquittal", "infestation",
267 | "overwoven", "Lincolnlike",
268 | "chacona", "Tamil",
269 | "classificational", "lebensraum",
270 | "reeveland", "intuition",
271 | "Whilkut", "focaloid",
272 | "Eleusinian", "micromembrane",
273 | "byroad", "nonrepetition",
274 | "bacterioblast", "brag",
275 | "ribaldrous", "phytoma",
276 | "counteralliance", "pelvimetry",
277 | "pelf", "relaster",
278 | "thermoresistant", "aneurism",
279 | "molossic", "euphonym",
280 | "upswell", "ladhood",
281 | "phallaceous", "inertly",
282 | "gunshop", "stereotypography",
283 | "laryngic", "refasten",
284 | "twinling", "oflete",
285 | "hepatorrhaphy", "electrotechnics",
286 | "cockal", "guitarist",
287 | "topsail", "Cimmerianism",
288 | "larklike", "Llandovery",
289 | "pyrocatechol", "immatchable",
290 | "chooser", "metrocratic",
291 | "craglike", "quadrennial",
292 | "nonpoisonous", "undercolored",
293 | "knob", "ultratense",
294 | "balladmonger", "slait",
295 | "sialadenitis", "bucketer",
296 | "magnificently", "unstipulated",
297 | "unscourged", "unsupercilious",
298 | "packsack", "pansophism",
299 | "soorkee", "percent",
300 | "subirrigate", "champer",
301 | "metapolitics", "spherulitic",
302 | "involatile", "metaphonical",
303 | "stachyuraceous", "speckedness",
304 | "bespin", "proboscidiform",
305 | "gul", "squit",
306 | "yeelaman", "peristeropode",
307 | "opacousness", "shibuichi",
308 | "retinize", "yote",
309 | "misexposition", "devilwise",
310 | "pumpkinification", "vinny",
311 | "bonze", "glossing",
312 | "decardinalize", "transcortical",
313 | "serphoid", "deepmost",
314 | "guanajuatite", "wemless",
315 | "arval", "lammy",
316 | "Effie", "Saponaria",
317 | "tetrahedral", "prolificy",
318 | "excerpt", "dunkadoo",
319 | "Spencerism", "insatiately",
320 | "Gilaki", "oratorship",
321 | "arduousness", "unbashfulness",
322 | "Pithecolobium", "unisexuality",
323 | "veterinarian", "detractive",
324 | "liquidity", "acidophile",
325 | "proauction", "sural",
326 | "totaquina", "Vichyite",
327 | "uninhabitedness", "allegedly",
328 | "Gothish", "manny",
329 | "Inger", "flutist",
330 | "ticktick", "Ludgatian",
331 | "homotransplant", "orthopedical",
332 | "diminutively", "monogoneutic",
333 | "Kenipsim", "sarcologist",
334 | "drome", "stronghearted",
335 | "Fameuse", "Swaziland",
336 | "alen", "chilblain",
337 | "beatable", "agglomeratic",
338 | "constitutor", "tendomucoid",
339 | "porencephalous", "arteriasis",
340 | "boser", "tantivy",
341 | "rede", "lineamental",
342 | "uncontradictableness", "homeotypical",
343 | "masa", "folious",
344 | "dosseret", "neurodegenerative",
345 | "subtransverse", "Chiasmodontidae",
346 | "palaeotheriodont", "unstressedly",
347 | "chalcites", "piquantness",
348 | "lampyrine", "Aplacentalia",
349 | "projecting", "elastivity",
350 | "isopelletierin", "bladderwort",
351 | "strander", "almud",
352 | "iniquitously", "theologal",
353 | "bugre", "chargeably",
354 | "imperceptivity", "meriquinoidal",
355 | "mesophyte", "divinator",
356 | "perfunctory", "counterappellant",
357 | "synovial", "charioteer",
358 | "crystallographical", "comprovincial",
359 | "infrastapedial", "pleasurehood",
360 | "inventurous", "ultrasystematic",
361 | "subangulated", "supraoesophageal",
362 | "Vaishnavism", "transude",
363 | "chrysochrous", "ungrave",
364 | "reconciliable", "uninterpleaded",
365 | "erlking", "wherefrom",
366 | "aprosopia", "antiadiaphorist",
367 | "metoxazine", "incalculable",
368 | "umbellic", "predebit",
369 | "foursquare", "unimmortal",
370 | "nonmanufacture", "slangy",
371 | "predisputant", "familist",
372 | "preaffiliate", "friarhood",
373 | "corelysis", "zoonitic",
374 | "halloo", "paunchy",
375 | "neuromimesis", "aconitine",
376 | "hackneyed", "unfeeble",
377 | "cubby", "autoschediastical",
378 | "naprapath", "lyrebird",
379 | "inexistency", "leucophoenicite",
380 | "ferrogoslarite", "reperuse",
381 | "uncombable", "tambo",
382 | "propodiale", "diplomatize",
383 | "Russifier", "clanned",
384 | "corona", "michigan",
385 | "nonutilitarian", "transcorporeal",
386 | "bought", "Cercosporella",
387 | "stapedius", "glandularly",
388 | "pictorially", "weism",
389 | "disilane", "rainproof",
390 | "Caphtor", "scrubbed",
391 | "oinomancy", "pseudoxanthine",
392 | "nonlustrous", "redesertion",
393 | "Oryzorictinae", "gala",
394 | "Mycogone", "reappreciate",
395 | "cyanoguanidine", "seeingness",
396 | "breadwinner", "noreast",
397 | "furacious", "epauliere",
398 | "omniscribent", "Passiflorales",
399 | "uninductive", "inductivity",
400 | "Orbitolina", "Semecarpus",
401 | "migrainoid", "steprelationship",
402 | "phlogisticate", "mesymnion",
403 | "sloped", "edificator",
404 | "beneficent", "culm",
405 | "paleornithology", "unurban",
406 | "throbless", "amplexifoliate",
407 | "sesquiquintile", "sapience",
408 | "astucious", "dithery",
409 | "boor", "ambitus",
410 | "scotching", "uloid",
411 | "uncompromisingness", "hoove",
412 | "waird", "marshiness",
413 | "Jerusalem", "mericarp",
414 | "unevoked", "benzoperoxide",
415 | "outguess", "pyxie",
416 | "hymnic", "euphemize",
417 | "mendacity", "erythremia",
418 | "rosaniline", "unchatteled",
419 | "lienteria", "Bushongo",
420 | "dialoguer", "unrepealably",
421 | "rivethead", "antideflation",
422 | "vinegarish", "manganosiderite",
423 | "doubtingness", "ovopyriform",
424 | "Cephalodiscus", "Muscicapa",
425 | "Animalivora", "angina",
426 | "planispheric", "ipomoein",
427 | "cuproiodargyrite", "sandbox",
428 | "scrat", "Munnopsidae",
429 | "shola", "pentafid",
430 | "overstudiousness", "times",
431 | "nonprofession", "appetible",
432 | "valvulotomy", "goladar",
433 | "uniarticular", "oxyterpene",
434 | "unlapsing", "omega",
435 | "trophonema", "seminonflammable",
436 | "circumzenithal", "starer",
437 | "depthwise", "liberatress",
438 | "unleavened", "unrevolting",
439 | "groundneedle", "topline",
440 | "wandoo", "umangite",
441 | "ordinant", "unachievable",
442 | "oversand", "snare",
443 | "avengeful", "unexplicit",
444 | "mustafina", "sonable",
445 | "rehabilitative", "eulogization",
446 | "papery", "technopsychology",
447 | "impressor", "cresylite",
448 | "entame", "transudatory",
449 | "scotale", "pachydermatoid",
450 | "imaginary", "yeat",
451 | "slipped", "stewardship",
452 | "adatom", "cockstone",
453 | "skyshine", "heavenful",
454 | "comparability", "exprobratory",
455 | "dermorhynchous", "parquet",
456 | "cretaceous", "vesperal",
457 | "raphis", "undangered",
458 | "Glecoma", "engrain",
459 | "counteractively", "Zuludom",
460 | "orchiocatabasis", "Auriculariales",
461 | "warriorwise", "extraorganismal",
462 | "overbuilt", "alveolite",
463 | "tetchy", "terrificness",
464 | "widdle", "unpremonished",
465 | "rebilling", "sequestrum",
466 | "equiconvex", "heliocentricism",
467 | "catabaptist", "okonite",
468 | "propheticism", "helminthagogic",
469 | "calycular", "giantly",
470 | "wingable", "golem",
471 | "unprovided", "commandingness",
472 | "greave", "haply",
473 | "doina", "depressingly",
474 | "subdentate", "impairment",
475 | "decidable", "neurotrophic",
476 | "unpredict", "bicorporeal",
477 | "pendulant", "flatman",
478 | "intrabred", "toplike",
479 | "Prosobranchiata", "farrantly",
480 | "toxoplasmosis", "gorilloid",
481 | "dipsomaniacal", "aquiline",
482 | "atlantite", "ascitic",
483 | "perculsive", "prospectiveness",
484 | "saponaceous", "centrifugalization",
485 | "dinical", "infravaginal",
486 | "beadroll", "affaite",
487 | "Helvidian", "tickleproof",
488 | "abstractionism", "enhedge",
489 | "outwealth", "overcontribute",
490 | "coldfinch", "gymnastic",
491 | "Pincian", "Munychian",
492 | "codisjunct", "quad",
493 | "coracomandibular", "phoenicochroite",
494 | "amender", "selectivity",
495 | "putative", "semantician",
496 | "lophotrichic", "Spatangoidea",
497 | "saccharogenic", "inferent",
498 | "Triconodonta", "arrendation",
499 | "sheepskin", "taurocolla",
500 | "bunghole", "Machiavel",
501 | "triakistetrahedral", "dehairer",
502 | "prezygapophysial", "cylindric",
503 | "pneumonalgia", "sleigher",
504 | "emir", "Socraticism",
505 | "licitness", "massedly",
506 | "instructiveness", "sturdied",
507 | "redecrease", "starosta",
508 | "evictor", "orgiastic",
509 | "squdge", "meloplasty",
510 | "Tsonecan", "repealableness",
511 | "swoony", "myesthesia",
512 | "molecule", "autobiographist",
513 | "reciprocation", "refective",
514 | "unobservantness", "tricae",
515 | "ungouged", "floatability",
516 | "Mesua", "fetlocked",
517 | "chordacentrum", "sedentariness",
518 | "various", "laubanite",
519 | "nectopod", "zenick",
520 | "sequentially", "analgic",
521 | "biodynamics", "posttraumatic",
522 | "nummi", "pyroacetic",
523 | "bot", "redescend",
524 | "dispermy", "undiffusive",
525 | "circular", "trillion",
526 | "Uraniidae", "ploration",
527 | "discipular", "potentness",
528 | "sud", "Hu",
529 | "Eryon", "plugger",
530 | "subdrainage", "jharal",
531 | "abscission", "supermarket",
532 | "countergabion", "glacierist",
533 | "lithotresis", "minniebush",
534 | "zanyism", "eucalypteol",
535 | "sterilely", "unrealize",
536 | "unpatched", "hypochondriacism",
537 | "critically", "cheesecutter")
538 | }
--------------------------------------------------------------------------------
/src/local/examples/GroupByAction.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.RangePartitioner
5 |
6 | object GroupByAction {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "GroupByAction Test")
10 |
11 | val data = Array[(String, Int)](("A1", 1), ("A2", 2),
12 | ("B1", 6), ("A2", 4),
13 | ("B1", 3), ("B1", 5))
14 |
15 | val pairs = sc.parallelize(data, 3)
16 |
17 | // output:
18 | // (A1,1)
19 | // (A2,2)
20 | //
21 | // (B1,6)
22 | // (A2,4)
23 | //
24 | // (B1,3)
25 | // (B1,5)
26 | pairs.foreach(println)
27 |
28 | val result1 = pairs.groupBy(K => K._1)
29 | val result2 = pairs.groupBy((K : (String, Int)) => K._1, 1)
30 | val result3 = pairs.groupBy((K : (String, Int)) => K._1, new RangePartitioner(3, pairs))
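   |     // result1 groups with the default partitioner (a HashPartitioner sized to
   |     // pairs' 3 partitions), result2 forces a single output partition, and result3
   |     // range-partitions keys into 3 sorted ranges, which is why the A* and B* keys
   |     // land in different partitions in the outputs below.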
31 |
32 | // output of result1:
33 | // (A1,ArrayBuffer((A1,1)))
34 | //
35 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
36 | // (A2,ArrayBuffer((A2,2), (A2,4)))
37 | result1.foreach(println)
38 |
39 | // output of result2:
40 | // (A1,ArrayBuffer((A1,1)))
41 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
42 | // (A2,ArrayBuffer((A2,2), (A2,4)))
43 | result2.foreach(println)
44 |
45 | // output of result3:
46 | // (A1,ArrayBuffer((A1,1)))
47 | // (A2,ArrayBuffer((A2,2), (A2,4)))
48 | //
49 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
50 | result3.foreach(println)
51 |
52 | }
53 |
54 | }
--------------------------------------------------------------------------------
/src/local/examples/GroupByKey.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object GroupByKey {
7 |
8 | def main(args: Array[String]) {
9 |
10 | val sc = new SparkContext("local", "GroupByKey Test")
11 | val data = Array[(Int, Char)]((1, 'a'), (2, 'b'),
12 | (3, 'c'), (4, 'd'),
13 | (5, 'e'), (3, 'f'),
14 | (2, 'g'), (1, 'h')
15 |
16 | )
17 | val pairs = sc.parallelize(data, 3)
18 |
19 | val result = pairs.groupByKey(2)
20 |
21 |     // output (grouped by Int key into 2 hash partitions), e.g.:
22 |     // (4,ArrayBuffer(d)), (2,ArrayBuffer(b, g))
23 |     //
24 |     // (1,ArrayBuffer(a, h)), (3,ArrayBuffer(c, f)),
25 |     // (5,ArrayBuffer(e))
26 | //result.foreach(println)
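   |     // foreachWith builds a per-partition value from the partition index (here the
   |     // index itself) and passes it to the closure, so each group is printed together
   |     // with the partition it belongs to.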
27 | result.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
28 | println(result.toDebugString)
29 | }
30 | }
--------------------------------------------------------------------------------
/src/local/examples/GroupByTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package local.examples
19 |
20 | import java.util.Random
21 |
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 |
25 | /**
26 |  * Usage: GroupByTest [numMappers] [numKVPairs] [valSize] [numReducers] (values are hard-coded below for this local run)
27 | */
28 | object GroupByTest {
29 | def main(args: Array[String]) {
30 | val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[2]")
31 | var numMappers = 10
32 | var numKVPairs = 100
33 | var valSize = 100
34 | var numReducers = 3
35 |
36 | val sc = new SparkContext(sparkConf)
37 |
38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
39 | val ranGen = new Random
40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
41 | for (i <- 0 until numKVPairs) {
42 | val byteArr = new Array[Byte](valSize)
43 | ranGen.nextBytes(byteArr)
44 | arr1(i) = (ranGen.nextInt(10), byteArr)
45 | }
46 | arr1
47 | }.cache
48 |     // Ensure that everything has been computed and is in the cache
49 | pairs1.count
50 |
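   |     // Keys come from ranGen.nextInt(10), so the groupByKey below yields at most 10
   |     // distinct groups spread over numReducers (= 3) partitions; result.count prints
   |     // that number of groups.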
51 | val result = pairs1.groupByKey(numReducers)
52 | println(result.count)
53 | println(result.toDebugString)
54 |
55 | sc.stop()
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/src/local/examples/GroupWith.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object GroupWith {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local[2]", "GroupWith Test")
10 |
11 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
12 | ("B1", 3), ("B2", 4),
13 | ("C1", 5), ("C1", 6)
14 | )
15 |
16 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
17 | ("B1", 9), ("C1", 0)
18 | )
19 | val pairs1 = sc.parallelize(data1, 3)
20 | val pairs2 = sc.parallelize(data2, 2)
21 |
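   |     // groupWith is an alias for cogroup: for each key it gathers the values from
   |     // both RDDs into a pair of buffers, keeping keys that appear on only one side
   |     // (B2 below gets an empty buffer from pairs2).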
22 | val result = pairs1.groupWith(pairs2)
23 | result.foreach(println)
24 |
25 | // output:
26 | // (B1,(ArrayBuffer(3),ArrayBuffer(9)))
27 | // (A1,(ArrayBuffer(1),ArrayBuffer(7)))
28 | // (A2,(ArrayBuffer(2),ArrayBuffer(8)))
29 | //
30 | // (C1,(ArrayBuffer(5, 6),ArrayBuffer(0)))
31 | // (B2,(ArrayBuffer(4),ArrayBuffer()))
32 |
33 |
34 | }
35 | }
--------------------------------------------------------------------------------
/src/local/examples/JoinAction.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object JoinAction {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local[2]", "JoinAction Test")
10 |
11 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
12 | ("B1", 3), ("B2", 4),
13 | ("C1", 5), ("C1", 6)
14 | )
15 |
16 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
17 | ("B1", 9), ("C1", 0)
18 | )
19 | val pairs1 = sc.parallelize(data1, 3)
20 | val pairs2 = sc.parallelize(data2, 2)
21 |
22 |
23 | val result = pairs1.join(pairs2)
24 |
25 | // output:
26 | // (A1,(1,7))
27 | // (B1,(3,9))
28 | // (A2,(2,8))
29 | //
30 | // (C1,(5,0))
31 | // (C1,(6,0))
32 | result.foreach(println)
33 | }
34 |
35 | }
--------------------------------------------------------------------------------
/src/local/examples/LocalWordCount.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object LocalWordCount {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local[4]", "LocalWordCount")
10 | val myFile = sc.textFile("/Users/xulijie/Documents/data/RandomText/randomText-10MB.txt")
11 | /*
12 | val counts = myFile.map( l => l.split(" ")(2) )
13 | .map( url => (url, 1) )
14 | .reduceByKey( _+_ )
15 | .map{ case(url, count) => (count, url) }
16 | .sortByKey( ascending=false )
17 | .map{ case(count, url) => (url, count) }
18 |
19 | */
20 | val wordAndCount = myFile.flatMap(s => s.split(" "))
21 | .map(w => (w, 1))
22 |
23 | val result = wordAndCount.reduceByKey(_ + _)
24 | result.foreach(println)
25 |
26 | }
27 |
28 | }
--------------------------------------------------------------------------------
/src/local/examples/LookUpTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object LookUpTest {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "LookUp Test")
10 |
11 | val data = Array[(String, Int)](("A", 1), ("B", 2),
12 | ("B", 3), ("C", 4),
13 | ("C", 5), ("C", 6))
14 |
15 | val pairs = sc.parallelize(data, 3)
16 |
17 | val finalRDD = pairs.lookup("B")
18 |
19 | finalRDD.foreach(println)
20 | // output:
21 | // 2
22 | // 3
23 | }
24 | }
--------------------------------------------------------------------------------
/src/local/examples/MapPartitionsRDDTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object MapPartitionsRDDTest {
6 |
7 | def main(args: Array[String]) {
8 | val sc = new SparkContext("local", "MapPartitionsRDD Test")
9 | val data = Array[(String, Int)](("A1", 1), ("A2", 2),
10 | ("B1", 1), ("B2", 4),
11 | ("C1", 3), ("C2", 4)
12 | )
13 | val pairs = sc.parallelize(data, 3)
14 |
15 | val finalRDD = pairs.mapPartitions(iter => iter.filter(_._2 >= 2))
16 | // val finalRDD2 = pairs.mapPartitionsWithIndex(f, preservesPartitioning)
17 |
18 |     finalRDD.collect().foreach(println)
19 |
20 | }
21 | }
--------------------------------------------------------------------------------
/src/local/examples/MapValuesTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | object MapValuesTest {
6 | def main(args: Array[String]) {
7 |
8 |     val sc = new SparkContext("local", "MapValues Test")
9 | val data1 = Array[(String, Int)](("K", 1), ("T", 2),
10 | ("T", 3), ("W", 4),
11 | ("W", 5), ("W", 6)
12 | )
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
17 |     val result = pairs.mapValues(V => 10 * V)
18 |     result.foreach(println)
19 | }
20 | }
--------------------------------------------------------------------------------
/src/local/examples/PipedRDDTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object PipedRDDTest {
6 |
7 | def main(args: Array[String]) {
8 |     val sc = new SparkContext("local", "PipedRDD Test")
9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2),
10 | ("U1", 3), ("U2", 4),
11 | ("W1", 3), ("W2", 4)
12 | )
13 | val pairs = sc.parallelize(data1, 3)
14 |
15 | val finalRDD = pairs.pipe("grep 2")
16 |
17 | finalRDD.foreach(println)
18 |
19 | }
20 | }
--------------------------------------------------------------------------------
/src/local/examples/ReduceByKeyActionTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | object ReduceByKeyActionTest {
6 |
7 | def main(args: Array[String]) {
8 |
9 |     val sc = new SparkContext("local", "ReduceByKeyAction Test")
10 | val data1 = Array[(String, Int)](("K", 1), ("U", 2),
11 | ("U", 3), ("W", 4),
12 | ("W", 5), ("W", 6))
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 |     val result = pairs.reduceByKey(_ + _, 2)
17 |     result.foreach(println)
18 | }
19 |
20 | }
--------------------------------------------------------------------------------
/src/local/examples/ReduceByKeyToDriverTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object ReduceByKeyToDriverTest {
6 | def main(args: Array[String]) {
7 |
8 | val sc = new SparkContext("local[3]", "ReduceByKeyToDriver Test")
9 | val data1 = Array[(String, Int)](("K", 1), ("U", 2),
10 | ("U", 3), ("W", 4),
11 | ("W", 5), ("W", 6)
12 | )
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 | //val result = pairs.reduceByKeyToDriver(_ + _)
17 | //result.foreach(println)
18 | }
19 | }
--------------------------------------------------------------------------------
/src/local/examples/SparkLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package local.examples
19 |
20 | import java.util.Random
21 |
22 | import scala.math.exp
23 |
24 | import breeze.linalg.{Vector, DenseVector}
25 |
26 | import org.apache.spark._
27 |
28 | /**
29 | * Logistic regression based classification.
30 | * Usage: SparkLR [slices]
31 | */
32 | object SparkLR {
33 | val N = 10000 // Number of data points
34 |   val D = 10   // Number of dimensions
35 | val R = 0.7 // Scaling factor
36 | val ITERATIONS = 5
37 | val rand = new Random(42)
38 |
39 | case class DataPoint(x: Vector[Double], y: Double)
40 |
41 | def generateData = {
42 | def generatePoint(i: Int) = {
43 | val y = if(i % 2 == 0) -1 else 1
44 | val x = DenseVector.fill(D){rand.nextGaussian + y * R}
45 | println(x.toString() + " " + y)
46 | DataPoint(x, y)
47 |
48 | }
49 | Array.tabulate(N)(generatePoint)
50 | }
51 |
52 | def main(args: Array[String]) {
53 | val sparkConf = new SparkConf().setAppName("SparkLR")
54 | val sc = new SparkContext(sparkConf)
55 | val numSlices = if (args.length > 0) args(0).toInt else 2
56 | val points = sc.parallelize(generateData, numSlices).cache()
57 |
58 | // Initialize w to a random value
59 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
60 | println("Initial w: " + w)
61 |
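   |     // Plain batch gradient descent on the logistic loss: each iteration sums
   |     // (1 / (1 + exp(-y * w.x)) - 1) * y * x over all points and subtracts that
   |     // gradient from w (fixed step size of 1).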
62 | for (i <- 1 to ITERATIONS) {
63 | println("On iteration " + i)
64 | val gradient = points.map { p =>
65 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
66 | }.reduce(_ + _)
67 | w -= gradient
68 | }
69 |
70 | println("Final w: " + w)
71 | sc.stop()
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/local/examples/TakeActionTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object TakeActionTest {
6 | def main(args: Array[String]) {
7 |
8 | val sc = new SparkContext("local", "TakeAction Test")
9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2),
10 | ("U1", 3), ("U2", 4),
11 | ("W1", 3), ("W2", 4)
12 | )
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 | val result = pairs.take(5)
17 | result.foreach(println)
18 | }
19 | }
--------------------------------------------------------------------------------
/src/local/examples/UnionTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object UnionTest {
6 | def main(args: Array[String]) {
7 |
8 |     val sc = new SparkContext("local", "Union Test")
9 |
10 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2),
11 | ("U1", 3), ("U2", 4),
12 | ("W1", 5), ("W1", 6)
13 | )
14 |
15 | val data2 = Array[(String, Int)](("K1", 7), ("K2", 8),
16 | ("U1", 9), ("W1", 0)
17 | )
18 | val pairs1 = sc.parallelize(data1, 3)
19 | val pairs2 = sc.parallelize(data2, 2)
20 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
21 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
22 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
23 | val result = pairs1.union(pairs2)
24 | result.foreach(println)
25 | //result.saveAsTextFile("E:\\Spark\\output\\join")
26 | }
27 | }
--------------------------------------------------------------------------------
/src/local/examples/partitionByTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object partitionByTest {
6 | def main(args: Array[String]) {
7 |
8 |     val sc = new SparkContext("local", "PartitionBy Test")
9 | val data1 = Array[(String, Int)](("K", 1), ("T", 2),
10 | ("T", 3), ("W", 4),
11 | ("W", 5), ("W", 6)
12 | )
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
17 | //val result = pairs.partitionBy(new HashPartitioner(2))
18 | //result.foreach(println)
19 | }
20 | }
--------------------------------------------------------------------------------
/src/local/examples/reduceActionTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object reduceActionTest {
6 | def main(args: Array[String]) {
7 |
8 |     val sc = new SparkContext("local", "ReduceAction Test")
9 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2),
10 | ("U1", 3), ("U2", 4),
11 | ("W1", 3), ("W2", 4)
12 | )
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
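   |     // Note: RDD.fold applies the zero value ("K0", 10) once per partition and once
   |     // more when merging the partial results on the driver, so with 3 partitions the
   |     // zero value is folded in 4 times: the printed sum includes an extra 4 * 10 and
   |     // the concatenated key contains "K0" four times.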
16 | println(result)
17 | }
18 |
19 | }
--------------------------------------------------------------------------------
/src/local/examples/sortByKeyTest.scala:
--------------------------------------------------------------------------------
1 | package local.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | object sortByKeyTest {
6 | def main(args: Array[String]) {
7 |
8 |     val sc = new SparkContext("local", "SortByKey Test")
9 |
10 | val data1 = Array[(String, Int)](("K1", 1), ("K2", 2),
11 | ("U1", 3), ("U2", 4),
12 | ("W1", 5), ("W1", 6)
13 | )
14 | val pairs1 = sc.parallelize(data1, 3)
15 |
16 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
17 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
18 |
19 |     val result = pairs1.sortByKey()
20 |     result.foreach(println)
21 | //result.saveAsTextFile("E:\\Spark\\output\\sortByKey")
22 | }
23 |
24 | }
--------------------------------------------------------------------------------
/src/org/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/.DS_Store
--------------------------------------------------------------------------------
/src/org/apache/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/apache/.DS_Store
--------------------------------------------------------------------------------
/src/org/apache/spark/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JerryLead/SparkLearning/b1150869e97dff8b623898c88b3635abd9f3bd54/src/org/apache/spark/.DS_Store
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/BroadcastTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.{SparkConf, SparkContext}
21 |
22 | /**
23 | * Usage: BroadcastTest [slices] [numElem] [broadcastAlgo] [blockSize]
24 | */
25 | object BroadcastTest {
26 | def main(args: Array[String]) {
27 |
28 | val bcName = if (args.length > 2) args(2) else "Http"
29 | val blockSize = if (args.length > 3) args(3) else "4096"
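   |     // bcName selects the broadcast implementation by class-name prefix: "Http" or
   |     // "Torrent" map to HttpBroadcastFactory / TorrentBroadcastFactory in Spark 1.x;
   |     // blockSize (in KB) is only used by the torrent implementation.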
30 |
31 | System.setProperty("spark.broadcast.factory", "org.apache.spark.broadcast." + bcName +
32 | "BroadcastFactory")
33 | System.setProperty("spark.broadcast.blockSize", blockSize)
34 | val sparkConf = new SparkConf().setAppName("Broadcast Test")
35 |
36 | val sc = new SparkContext(sparkConf)
37 |
38 | val slices = if (args.length > 0) args(0).toInt else 2
39 | val num = if (args.length > 1) args(1).toInt else 1000000
40 |
41 | val arr1 = new Array[Int](num)
42 | for (i <- 0 until arr1.length) {
43 | arr1(i) = i
44 | }
45 |
46 | for (i <- 0 until 3) {
47 | println("Iteration " + i)
48 | println("===========")
49 | val startTime = System.nanoTime
50 | val barr1 = sc.broadcast(arr1)
51 | val observedSizes = sc.parallelize(1 to 10, slices).map(_ => barr1.value.size)
52 | // Collect the small RDD so we can print the observed sizes locally.
53 | observedSizes.collect().foreach(i => println(i))
54 | println("Iteration %d took %.0f milliseconds".format(i, (System.nanoTime - startTime) / 1E6))
55 | }
56 |
57 | sc.stop()
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/CassandraCQLTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.nio.ByteBuffer
21 |
22 | import scala.collection.JavaConversions._
23 | import scala.collection.mutable.ListBuffer
24 | import scala.collection.immutable.Map
25 |
26 | import org.apache.cassandra.hadoop.ConfigHelper
27 | import org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat
28 | import org.apache.cassandra.hadoop.cql3.CqlConfigHelper
29 | import org.apache.cassandra.hadoop.cql3.CqlOutputFormat
30 | import org.apache.cassandra.utils.ByteBufferUtil
31 | import org.apache.hadoop.mapreduce.Job
32 |
33 | import org.apache.spark.{SparkConf, SparkContext}
34 | import org.apache.spark.SparkContext._
35 |
36 | /*
37 |   Need to create the following keyspace and column family in Cassandra before running this example.
38 |   Start the CQL shell using ./bin/cqlsh and execute the following commands:
39 | CREATE KEYSPACE retail WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
40 | use retail;
41 | CREATE TABLE salecount (prod_id text, sale_count int, PRIMARY KEY (prod_id));
42 | CREATE TABLE ordercf (user_id text,
43 | time timestamp,
44 | prod_id text,
45 | quantity int,
46 | PRIMARY KEY (user_id, time));
47 | INSERT INTO ordercf (user_id,
48 | time,
49 | prod_id,
50 | quantity) VALUES ('bob', 1385983646000, 'iphone', 1);
51 | INSERT INTO ordercf (user_id,
52 | time,
53 | prod_id,
54 | quantity) VALUES ('tom', 1385983647000, 'samsung', 4);
55 | INSERT INTO ordercf (user_id,
56 | time,
57 | prod_id,
58 | quantity) VALUES ('dora', 1385983648000, 'nokia', 2);
59 | INSERT INTO ordercf (user_id,
60 | time,
61 | prod_id,
62 | quantity) VALUES ('charlie', 1385983649000, 'iphone', 2);
63 | */
64 |
65 | /**
66 | * This example demonstrates how to read and write to cassandra column family created using CQL3
67 | * using Spark.
68 | * Parameters :
69 | * Usage: ./bin/spark-submit examples.jar \
70 | * --class org.apache.spark.examples.CassandraCQLTest localhost 9160
71 | */
72 | object CassandraCQLTest {
73 |
74 | def main(args: Array[String]) {
75 | val sparkConf = new SparkConf().setAppName("CQLTestApp")
76 |
77 | val sc = new SparkContext(sparkConf)
78 | val cHost: String = args(0)
79 | val cPort: String = args(1)
80 | val KeySpace = "retail"
81 | val InputColumnFamily = "ordercf"
82 | val OutputColumnFamily = "salecount"
83 |
84 | val job = new Job()
85 | job.setInputFormatClass(classOf[CqlPagingInputFormat])
86 | ConfigHelper.setInputInitialAddress(job.getConfiguration(), cHost)
87 | ConfigHelper.setInputRpcPort(job.getConfiguration(), cPort)
88 | ConfigHelper.setInputColumnFamily(job.getConfiguration(), KeySpace, InputColumnFamily)
89 | ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner")
90 | CqlConfigHelper.setInputCQLPageRowSize(job.getConfiguration(), "3")
91 |
92 | /** CqlConfigHelper.setInputWhereClauses(job.getConfiguration(), "user_id='bob'") */
93 |
94 | /** An UPDATE writes one or more columns to a record in a Cassandra column family */
95 | val query = "UPDATE " + KeySpace + "." + OutputColumnFamily + " SET sale_count = ? "
96 | CqlConfigHelper.setOutputCql(job.getConfiguration(), query)
97 |
98 | job.setOutputFormatClass(classOf[CqlOutputFormat])
99 | ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KeySpace, OutputColumnFamily)
100 | ConfigHelper.setOutputInitialAddress(job.getConfiguration(), cHost)
101 | ConfigHelper.setOutputRpcPort(job.getConfiguration(), cPort)
102 | ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner")
103 |
104 | val casRdd = sc.newAPIHadoopRDD(job.getConfiguration(),
105 | classOf[CqlPagingInputFormat],
106 | classOf[java.util.Map[String,ByteBuffer]],
107 | classOf[java.util.Map[String,ByteBuffer]])
108 |
109 | println("Count: " + casRdd.count)
110 | val productSaleRDD = casRdd.map {
111 | case (key, value) => {
112 | (ByteBufferUtil.string(value.get("prod_id")), ByteBufferUtil.toInt(value.get("quantity")))
113 | }
114 | }
115 | val aggregatedRDD = productSaleRDD.reduceByKey(_ + _)
116 | aggregatedRDD.collect().foreach {
117 | case (productId, saleCount) => println(productId + ":" + saleCount)
118 | }
119 |
120 | val casoutputCF = aggregatedRDD.map {
121 | case (productId, saleCount) => {
122 | val outColFamKey = Map("prod_id" -> ByteBufferUtil.bytes(productId))
123 | val outKey: java.util.Map[String, ByteBuffer] = outColFamKey
124 | var outColFamVal = new ListBuffer[ByteBuffer]
125 | outColFamVal += ByteBufferUtil.bytes(saleCount)
126 | val outVal: java.util.List[ByteBuffer] = outColFamVal
127 | (outKey, outVal)
128 | }
129 | }
130 |
131 | casoutputCF.saveAsNewAPIHadoopFile(
132 | KeySpace,
133 | classOf[java.util.Map[String, ByteBuffer]],
134 | classOf[java.util.List[ByteBuffer]],
135 | classOf[CqlOutputFormat],
136 | job.getConfiguration()
137 | )
138 | }
139 | }
140 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/CassandraTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.nio.ByteBuffer
21 | import java.util.SortedMap
22 |
23 | import scala.collection.JavaConversions._
24 |
25 | import org.apache.cassandra.db.IColumn
26 | import org.apache.cassandra.hadoop.ColumnFamilyOutputFormat
27 | import org.apache.cassandra.hadoop.ConfigHelper
28 | import org.apache.cassandra.hadoop.ColumnFamilyInputFormat
29 | import org.apache.cassandra.thrift._
30 | import org.apache.cassandra.utils.ByteBufferUtil
31 | import org.apache.hadoop.mapreduce.Job
32 |
33 | import org.apache.spark.{SparkConf, SparkContext}
34 | import org.apache.spark.SparkContext._
35 |
36 | /*
37 | * This example demonstrates using Spark with Cassandra with the New Hadoop API and Cassandra
38 | * support for Hadoop.
39 | *
40 |  * To run this example, run this file with the following command params -
41 |  * <cassandra_node> <cassandra_port>
42 |  *
43 | * So if you want to run this on localhost this will be,
44 | * localhost 9160
45 | *
46 | * The example makes some assumptions:
47 | * 1. You have already created a keyspace called casDemo and it has a column family named Words
48 |  * 2. The Words column family has a column named "para" which holds the test content.
49 | *
50 | * You can create the content by running the following script at the bottom of this file with
51 | * cassandra-cli.
52 | *
53 | */
54 | object CassandraTest {
55 |
56 | def main(args: Array[String]) {
57 | val sparkConf = new SparkConf().setAppName("casDemo")
58 | // Get a SparkContext
59 | val sc = new SparkContext(sparkConf)
60 |
61 | // Build the job configuration with ConfigHelper provided by Cassandra
62 | val job = new Job()
63 | job.setInputFormatClass(classOf[ColumnFamilyInputFormat])
64 |
65 | val host: String = args(1)
66 | val port: String = args(2)
67 |
68 | ConfigHelper.setInputInitialAddress(job.getConfiguration(), host)
69 | ConfigHelper.setInputRpcPort(job.getConfiguration(), port)
70 | ConfigHelper.setOutputInitialAddress(job.getConfiguration(), host)
71 | ConfigHelper.setOutputRpcPort(job.getConfiguration(), port)
72 | ConfigHelper.setInputColumnFamily(job.getConfiguration(), "casDemo", "Words")
73 | ConfigHelper.setOutputColumnFamily(job.getConfiguration(), "casDemo", "WordCount")
74 |
75 | val predicate = new SlicePredicate()
76 | val sliceRange = new SliceRange()
77 | sliceRange.setStart(Array.empty[Byte])
78 | sliceRange.setFinish(Array.empty[Byte])
79 | predicate.setSlice_range(sliceRange)
80 | ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate)
81 |
82 | ConfigHelper.setInputPartitioner(job.getConfiguration(), "Murmur3Partitioner")
83 | ConfigHelper.setOutputPartitioner(job.getConfiguration(), "Murmur3Partitioner")
84 |
85 | // Make a new Hadoop RDD
86 | val casRdd = sc.newAPIHadoopRDD(
87 | job.getConfiguration(),
88 | classOf[ColumnFamilyInputFormat],
89 | classOf[ByteBuffer],
90 | classOf[SortedMap[ByteBuffer, IColumn]])
91 |
92 | // Let us first get all the paragraphs from the retrieved rows
93 | val paraRdd = casRdd.map {
94 | case (key, value) => {
95 | ByteBufferUtil.string(value.get(ByteBufferUtil.bytes("para")).value())
96 | }
97 | }
98 |
99 | // Lets get the word count in paras
100 | val counts = paraRdd.flatMap(p => p.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
101 |
102 | counts.collect().foreach {
103 | case (word, count) => println(word + ":" + count)
104 | }
105 |
106 | counts.map {
107 | case (word, count) => {
108 | val colWord = new org.apache.cassandra.thrift.Column()
109 | colWord.setName(ByteBufferUtil.bytes("word"))
110 | colWord.setValue(ByteBufferUtil.bytes(word))
111 | colWord.setTimestamp(System.currentTimeMillis)
112 |
113 | val colCount = new org.apache.cassandra.thrift.Column()
114 | colCount.setName(ByteBufferUtil.bytes("wcount"))
115 | colCount.setValue(ByteBufferUtil.bytes(count.toLong))
116 | colCount.setTimestamp(System.currentTimeMillis)
117 |
118 | val outputkey = ByteBufferUtil.bytes(word + "-COUNT-" + System.currentTimeMillis)
119 |
120 | val mutations: java.util.List[Mutation] = new Mutation() :: new Mutation() :: Nil
121 | mutations.get(0).setColumn_or_supercolumn(new ColumnOrSuperColumn())
122 | mutations.get(0).column_or_supercolumn.setColumn(colWord)
123 | mutations.get(1).setColumn_or_supercolumn(new ColumnOrSuperColumn())
124 | mutations.get(1).column_or_supercolumn.setColumn(colCount)
125 | (outputkey, mutations)
126 | }
127 | }.saveAsNewAPIHadoopFile("casDemo", classOf[ByteBuffer], classOf[List[Mutation]],
128 | classOf[ColumnFamilyOutputFormat], job.getConfiguration)
129 | }
130 | }
131 |
132 | /*
133 | create keyspace casDemo;
134 | use casDemo;
135 |
136 | create column family WordCount with comparator = UTF8Type;
137 | update column family WordCount with column_metadata =
138 | [{column_name: word, validation_class: UTF8Type},
139 | {column_name: wcount, validation_class: LongType}];
140 |
141 | create column family Words with comparator = UTF8Type;
142 | update column family Words with column_metadata =
143 | [{column_name: book, validation_class: UTF8Type},
144 | {column_name: para, validation_class: UTF8Type}];
145 |
146 | assume Words keys as utf8;
147 |
148 | set Words['3musk001']['book'] = 'The Three Musketeers';
149 | set Words['3musk001']['para'] = 'On the first Monday of the month of April, 1625, the market
150 | town of Meung, in which the author of ROMANCE OF THE ROSE was born, appeared to
151 | be in as perfect a state of revolution as if the Huguenots had just made
152 | a second La Rochelle of it. Many citizens, seeing the women flying
153 | toward the High Street, leaving their children crying at the open doors,
154 | hastened to don the cuirass, and supporting their somewhat uncertain
155 | courage with a musket or a partisan, directed their steps toward the
156 | hostelry of the Jolly Miller, before which was gathered, increasing
157 | every minute, a compact group, vociferous and full of curiosity.';
158 |
159 | set Words['3musk002']['book'] = 'The Three Musketeers';
160 | set Words['3musk002']['para'] = 'In those times panics were common, and few days passed without
161 | some city or other registering in its archives an event of this kind. There were
162 | nobles, who made war against each other; there was the king, who made
163 | war against the cardinal; there was Spain, which made war against the
164 | king. Then, in addition to these concealed or public, secret or open
165 | wars, there were robbers, mendicants, Huguenots, wolves, and scoundrels,
166 | who made war upon everybody. The citizens always took up arms readily
167 | against thieves, wolves or scoundrels, often against nobles or
168 | Huguenots, sometimes against the king, but never against cardinal or
169 | Spain. It resulted, then, from this habit that on the said first Monday
170 | of April, 1625, the citizens, on hearing the clamor, and seeing neither
171 | the red-and-yellow standard nor the livery of the Duc de Richelieu,
172 | rushed toward the hostel of the Jolly Miller. When arrived there, the
173 | cause of the hubbub was apparent to all';
174 |
175 | set Words['3musk003']['book'] = 'The Three Musketeers';
176 | set Words['3musk003']['para'] = 'You ought, I say, then, to husband the means you have, however
177 | large the sum may be; but you ought also to endeavor to perfect yourself in
178 | the exercises becoming a gentleman. I will write a letter today to the
179 | Director of the Royal Academy, and tomorrow he will admit you without
180 | any expense to yourself. Do not refuse this little service. Our
181 | best-born and richest gentlemen sometimes solicit it without being able
182 | to obtain it. You will learn horsemanship, swordsmanship in all its
183 | branches, and dancing. You will make some desirable acquaintances; and
184 | from time to time you can call upon me, just to tell me how you are
185 | getting on, and to say whether I can be of further service to you.';
186 |
187 |
188 | set Words['thelostworld001']['book'] = 'The Lost World';
189 | set Words['thelostworld001']['para'] = 'She sat with that proud, delicate profile of hers outlined
190 | against the red curtain. How beautiful she was! And yet how aloof! We had been
191 | friends, quite good friends; but never could I get beyond the same
192 | comradeship which I might have established with one of my
193 | fellow-reporters upon the Gazette,--perfectly frank, perfectly kindly,
194 | and perfectly unsexual. My instincts are all against a woman being too
195 | frank and at her ease with me. It is no compliment to a man. Where
196 | the real sex feeling begins, timidity and distrust are its companions,
197 | heritage from old wicked days when love and violence went often hand in
198 | hand. The bent head, the averted eye, the faltering voice, the wincing
199 | figure--these, and not the unshrinking gaze and frank reply, are the
200 | true signals of passion. Even in my short life I had learned as much
201 | as that--or had inherited it in that race memory which we call instinct.';
202 |
203 | set Words['thelostworld002']['book'] = 'The Lost World';
204 | set Words['thelostworld002']['para'] = 'I always liked McArdle, the crabbed, old, round-backed,
205 | red-headed news editor, and I rather hoped that he liked me. Of course, Beaumont was
206 | the real boss; but he lived in the rarefied atmosphere of some Olympian
207 | height from which he could distinguish nothing smaller than an
208 | international crisis or a split in the Cabinet. Sometimes we saw him
209 | passing in lonely majesty to his inner sanctum, with his eyes staring
210 | vaguely and his mind hovering over the Balkans or the Persian Gulf. He
211 | was above and beyond us. But McArdle was his first lieutenant, and it
212 | was he that we knew. The old man nodded as I entered the room, and he
213 | pushed his spectacles far up on his bald forehead.';
214 |
215 | */
216 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/DriverSubmissionTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.collection.JavaConversions._
21 |
22 | /** Prints out environmental information, sleeps, and then exits. Made to
23 | * test driver submission in the standalone scheduler. */
24 | object DriverSubmissionTest {
25 | def main(args: Array[String]) {
26 | if (args.size < 1) {
27 |       println("Usage: DriverSubmissionTest <seconds-to-sleep>")
28 | System.exit(0)
29 | }
30 | val numSecondsToSleep = args(0).toInt
31 |
32 | val env = System.getenv()
33 | val properties = System.getProperties()
34 |
35 | println("Environment variables containing SPARK_TEST:")
36 | env.filter{case (k, v) => k.contains("SPARK_TEST")}.foreach(println)
37 |
38 | println("System properties containing spark.test:")
39 | properties.filter{case (k, v) => k.toString.contains("spark.test")}.foreach(println)
40 |
41 | for (i <- 1 until numSecondsToSleep) {
42 | println(s"Alive for $i out of $numSecondsToSleep seconds")
43 | Thread.sleep(1000)
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/ExceptionHandlingTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.{SparkConf, SparkContext}
21 |
22 | object ExceptionHandlingTest {
23 | def main(args: Array[String]) {
24 | val sparkConf = new SparkConf().setAppName("ExceptionHandlingTest")
25 | val sc = new SparkContext(sparkConf)
26 | sc.parallelize(0 until sc.defaultParallelism).foreach { i =>
27 | if (math.random > 0.75) {
28 | throw new Exception("Testing exception handling")
29 | }
30 | }
31 |
32 | sc.stop()
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/GroupByTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 |
25 | /**
26 |  * Usage: GroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]
27 | */
28 | object GroupByTest {
29 | def main(args: Array[String]) {
30 | val sparkConf = new SparkConf().setAppName("GroupBy Test")
31 | var numMappers = if (args.length > 0) args(0).toInt else 2
32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000
33 | var valSize = if (args.length > 2) args(2).toInt else 1000
34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers
35 |
36 | val sc = new SparkContext(sparkConf)
37 |
38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
39 | val ranGen = new Random
40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
41 | for (i <- 0 until numKVPairs) {
42 | val byteArr = new Array[Byte](valSize)
43 | ranGen.nextBytes(byteArr)
44 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
45 | }
46 | arr1
47 | }.cache
48 |     // Ensure that everything has been computed and is in the cache
49 | pairs1.count
50 |
51 | println(pairs1.groupByKey(numReducers).count)
52 |
53 | sc.stop()
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/HBaseTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.hadoop.hbase.client.HBaseAdmin
21 | import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
22 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
23 |
24 | import org.apache.spark._
25 | import org.apache.spark.rdd.NewHadoopRDD
26 |
27 | object HBaseTest {
28 | def main(args: Array[String]) {
29 | val sparkConf = new SparkConf().setAppName("HBaseTest")
30 | val sc = new SparkContext(sparkConf)
31 | val conf = HBaseConfiguration.create()
32 | // Other options for configuring scan behavior are available. More information available at
33 | // http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/mapreduce/TableInputFormat.html
34 | conf.set(TableInputFormat.INPUT_TABLE, args(1))
35 |
36 | // Initialize hBase table if necessary
37 | val admin = new HBaseAdmin(conf)
38 | if(!admin.isTableAvailable(args(1))) {
39 | val tableDesc = new HTableDescriptor(args(1))
40 | admin.createTable(tableDesc)
41 | }
42 |
43 | val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
44 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
45 | classOf[org.apache.hadoop.hbase.client.Result])
46 |
47 | hBaseRDD.count()
48 |
49 | sc.stop()
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/HdfsTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark._
21 |
22 | object HdfsTest {
23 | def main(args: Array[String]) {
24 | val sparkConf = new SparkConf().setAppName("HdfsTest")
25 | val sc = new SparkContext(sparkConf)
26 | val file = sc.textFile(args(1))
27 | val mapped = file.map(s => s.length).cache()
28 | for (iter <- 1 to 10) {
29 | val start = System.currentTimeMillis()
30 | for (x <- mapped) { x + 2 }
31 | // println("Processing: " + x)
32 | val end = System.currentTimeMillis()
33 | println("Iteration " + iter + " took " + (end-start) + " ms")
34 | }
35 | sc.stop()
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/LocalALS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.math.sqrt
21 |
22 | import cern.colt.matrix._
23 | import cern.colt.matrix.linalg._
24 | import cern.jet.math._
25 |
26 | /**
27 | * Alternating least squares matrix factorization.
28 | */
29 | object LocalALS {
30 | // Parameters set through command line arguments
31 | var M = 0 // Number of movies
32 | var U = 0 // Number of users
33 | var F = 0 // Number of features
34 | var ITERATIONS = 0
35 |
36 | val LAMBDA = 0.01 // Regularization coefficient
37 |
38 | // Some COLT objects
39 | val factory2D = DoubleFactory2D.dense
40 | val factory1D = DoubleFactory1D.dense
41 | val algebra = Algebra.DEFAULT
42 | val blas = SeqBlas.seqBlas
43 |
44 | def generateR(): DoubleMatrix2D = {
45 | val mh = factory2D.random(M, F)
46 | val uh = factory2D.random(U, F)
47 | algebra.mult(mh, algebra.transpose(uh))
48 | }
49 |
50 | def rmse(targetR: DoubleMatrix2D, ms: Array[DoubleMatrix1D],
51 | us: Array[DoubleMatrix1D]): Double =
52 | {
53 | val r = factory2D.make(M, U)
54 | for (i <- 0 until M; j <- 0 until U) {
55 | r.set(i, j, blas.ddot(ms(i), us(j)))
56 | }
57 | blas.daxpy(-1, targetR, r)
58 | val sumSqs = r.aggregate(Functions.plus, Functions.square)
59 | sqrt(sumSqs / (M * U))
60 | }
61 |
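   |   // updateMovie and updateUser below solve the regularized normal equations
   |   //   (X^T X + LAMBDA * n * I) v = X^T y
   |   // for one row of the factor matrix being updated, using a Cholesky decomposition,
   |   // where n is U (when updating a movie) or M (when updating a user).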
62 | def updateMovie(i: Int, m: DoubleMatrix1D, us: Array[DoubleMatrix1D],
63 | R: DoubleMatrix2D) : DoubleMatrix1D =
64 | {
65 | val XtX = factory2D.make(F, F)
66 | val Xty = factory1D.make(F)
67 | // For each user that rated the movie
68 | for (j <- 0 until U) {
69 | val u = us(j)
70 | // Add u * u^t to XtX
71 | blas.dger(1, u, u, XtX)
72 | // Add u * rating to Xty
73 | blas.daxpy(R.get(i, j), u, Xty)
74 | }
75 | // Add regularization coefs to diagonal terms
76 | for (d <- 0 until F) {
77 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * U)
78 | }
79 | // Solve it with Cholesky
80 | val ch = new CholeskyDecomposition(XtX)
81 | val Xty2D = factory2D.make(Xty.toArray, F)
82 | val solved2D = ch.solve(Xty2D)
83 | solved2D.viewColumn(0)
84 | }
85 |
86 | def updateUser(j: Int, u: DoubleMatrix1D, ms: Array[DoubleMatrix1D],
87 | R: DoubleMatrix2D) : DoubleMatrix1D =
88 | {
89 | val XtX = factory2D.make(F, F)
90 | val Xty = factory1D.make(F)
91 | // For each movie that the user rated
92 | for (i <- 0 until M) {
93 | val m = ms(i)
94 | // Add m * m^t to XtX
95 | blas.dger(1, m, m, XtX)
96 | // Add m * rating to Xty
97 | blas.daxpy(R.get(i, j), m, Xty)
98 | }
99 | // Add regularization coefs to diagonal terms
100 | for (d <- 0 until F) {
101 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * M)
102 | }
103 | // Solve it with Cholesky
104 | val ch = new CholeskyDecomposition(XtX)
105 | val Xty2D = factory2D.make(Xty.toArray, F)
106 | val solved2D = ch.solve(Xty2D)
107 | solved2D.viewColumn(0)
108 | }
109 |
110 | def main(args: Array[String]) {
111 | args match {
112 | case Array(m, u, f, iters) => {
113 | M = m.toInt
114 | U = u.toInt
115 | F = f.toInt
116 | ITERATIONS = iters.toInt
117 | }
118 | case _ => {
119 |         System.err.println("Usage: LocalALS <M> <U> <F> <iters>")
120 | System.exit(1)
121 | }
122 | }
123 | printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS)
124 |
125 | val R = generateR()
126 |
127 | // Initialize m and u randomly
128 | var ms = Array.fill(M)(factory1D.random(F))
129 | var us = Array.fill(U)(factory1D.random(F))
130 |
131 | // Iteratively update movies then users
132 | for (iter <- 1 to ITERATIONS) {
133 | println("Iteration " + iter + ":")
134 | ms = (0 until M).map(i => updateMovie(i, ms(i), us, R)).toArray
135 | us = (0 until U).map(j => updateUser(j, us(j), ms, R)).toArray
136 | println("RMSE = " + rmse(R, ms, us))
137 | println()
138 | }
139 | }
140 | }
141 |
--------------------------------------------------------------------------------
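updateMovie and updateUser both assemble XtX = sum(u * u^t) and Xty = sum(rating * u), add LAMBDA * n to the diagonal, and solve the regularized normal equations (XtX + LAMBDA * n * I) w = Xty with a Cholesky factorization. A minimal sketch of that solve using Breeze (which the other examples in this repo already depend on); the object and method names are illustrative, not part of the repository:

import breeze.linalg.{DenseMatrix, DenseVector}

object AlsUpdateSketch {
  // Solves (XtX + lambda * n * I) w = Xty, the system that
  // updateMovie/updateUser build before the Cholesky solve.
  def solveFactor(xtx: DenseMatrix[Double], xty: DenseVector[Double],
                  lambda: Double, n: Int): DenseVector[Double] = {
    val f = xty.length
    val regularized = xtx + (DenseMatrix.eye[Double](f) * (lambda * n))
    regularized \ xty // Breeze's linear solver plays the role of CholeskyDecomposition.solve
  }
}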
/src/org/apache/spark/examples/LocalFileLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import breeze.linalg.{Vector, DenseVector}
23 |
24 | object LocalFileLR {
25 |   val D = 10   // Number of dimensions
26 | val rand = new Random(42)
27 |
28 | case class DataPoint(x: Vector[Double], y: Double)
29 |
30 | def parsePoint(line: String): DataPoint = {
31 | val nums = line.split(' ').map(_.toDouble)
32 | DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
33 | }
34 |
35 | def main(args: Array[String]) {
36 | val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
37 | val points = lines.map(parsePoint _)
38 | val ITERATIONS = args(1).toInt
39 |
40 | // Initialize w to a random value
41 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
42 | println("Initial w: " + w)
43 |
44 | for (i <- 1 to ITERATIONS) {
45 | println("On iteration " + i)
46 | var gradient = DenseVector.zeros[Double](D)
47 | for (p <- points) {
48 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
49 | gradient += p.x * scale
50 | }
51 | w -= gradient
52 | }
53 |
54 | println("Final w: " + w)
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
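The inner loop above accumulates the gradient of the logistic loss for labels y in {-1, +1}: each point contributes x * ((1 / (1 + exp(-y * w.x)) - 1) * y). A minimal sketch of that per-point term factored out as a function; the object name is an assumption for illustration:

import breeze.linalg.Vector

object LogisticGradientSketch {
  // Gradient contribution of one data point (x, y), y in {-1, +1},
  // matching the `scale` expression used in the loop above.
  def pointGradient(w: Vector[Double], x: Vector[Double], y: Double): Vector[Double] = {
    val scale = (1.0 / (1.0 + math.exp(-y * (w dot x))) - 1.0) * y
    x * scale
  }
}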
/src/org/apache/spark/examples/LocalKMeans.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import scala.collection.mutable.HashMap
23 | import scala.collection.mutable.HashSet
24 |
25 | import breeze.linalg.{Vector, DenseVector, squaredDistance}
26 |
27 | import org.apache.spark.SparkContext._
28 |
29 | /**
30 | * K-means clustering.
31 | */
32 | object LocalKMeans {
33 | val N = 1000
34 | val R = 1000 // Scaling factor
35 | val D = 10
36 | val K = 10
37 | val convergeDist = 0.001
38 | val rand = new Random(42)
39 |
40 | def generateData = {
41 | def generatePoint(i: Int) = {
42 | DenseVector.fill(D){rand.nextDouble * R}
43 | }
44 | Array.tabulate(N)(generatePoint)
45 | }
46 |
47 | def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
48 | var index = 0
49 | var bestIndex = 0
50 | var closest = Double.PositiveInfinity
51 |
52 | for (i <- 1 to centers.size) {
53 | val vCurr = centers.get(i).get
54 | val tempDist = squaredDistance(p, vCurr)
55 | if (tempDist < closest) {
56 | closest = tempDist
57 | bestIndex = i
58 | }
59 | }
60 |
61 | bestIndex
62 | }
63 |
64 | def main(args: Array[String]) {
65 | val data = generateData
66 | var points = new HashSet[Vector[Double]]
67 | var kPoints = new HashMap[Int, Vector[Double]]
68 | var tempDist = 1.0
69 |
70 | while (points.size < K) {
71 | points.add(data(rand.nextInt(N)))
72 | }
73 |
74 | val iter = points.iterator
75 | for (i <- 1 to points.size) {
76 | kPoints.put(i, iter.next())
77 | }
78 |
79 | println("Initial centers: " + kPoints)
80 |
81 | while(tempDist > convergeDist) {
82 | var closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))
83 |
84 | var mappings = closest.groupBy[Int] (x => x._1)
85 |
86 | var pointStats = mappings.map { pair =>
87 | pair._2.reduceLeft [(Int, (Vector[Double], Int))] {
88 | case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))
89 | }
90 | }
91 |
92 | var newPoints = pointStats.map {mapping =>
93 | (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))}
94 |
95 | tempDist = 0.0
96 | for (mapping <- newPoints) {
97 | tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
98 | }
99 |
100 | for (newP <- newPoints) {
101 | kPoints.put(newP._1, newP._2)
102 | }
103 | }
104 |
105 | println("Final centers: " + kPoints)
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
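closestPoint above scans the 1-based center map imperatively and keeps the index of the smallest squared distance. The same nearest-center search can be written functionally; a minimal sketch, with the object name being illustrative only:

import breeze.linalg.{Vector, squaredDistance}

object ClosestCenterSketch {
  // Index of the center with the smallest squared Euclidean distance to p.
  def closestPoint(p: Vector[Double], centers: IndexedSeq[Vector[Double]]): Int =
    centers.indices.minBy(i => squaredDistance(p, centers(i)))
}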
/src/org/apache/spark/examples/LocalLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import breeze.linalg.{Vector, DenseVector}
23 |
24 | /**
25 | * Logistic regression based classification.
26 | */
27 | object LocalLR {
28 | val N = 10000 // Number of data points
29 | val D = 10 // Number of dimensions
30 | val R = 0.7 // Scaling factor
31 | val ITERATIONS = 5
32 | val rand = new Random(42)
33 |
34 | case class DataPoint(x: Vector[Double], y: Double)
35 |
36 | def generateData = {
37 | def generatePoint(i: Int) = {
38 | val y = if(i % 2 == 0) -1 else 1
39 | val x = DenseVector.fill(D){rand.nextGaussian + y * R}
40 | DataPoint(x, y)
41 | }
42 | Array.tabulate(N)(generatePoint)
43 | }
44 |
45 | def main(args: Array[String]) {
46 | val data = generateData
47 |
48 | // Initialize w to a random value
49 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
50 | println("Initial w: " + w)
51 |
52 | for (i <- 1 to ITERATIONS) {
53 | println("On iteration " + i)
54 | var gradient = DenseVector.zeros[Double](D)
55 | for (p <- data) {
56 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
57 | gradient += p.x * scale
58 | }
59 | w -= gradient
60 | }
61 |
62 | println("Final w: " + w)
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/LocalPi.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.math.random
21 |
22 | import org.apache.spark._
23 | import org.apache.spark.SparkContext._
24 |
25 | object LocalPi {
26 | def main(args: Array[String]) {
27 | var count = 0
28 | for (i <- 1 to 100000) {
29 | val x = random * 2 - 1
30 | val y = random * 2 - 1
31 | if (x*x + y*y < 1) count += 1
32 | }
33 | println("Pi is roughly " + 4 * count / 100000.0)
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/LogQuery.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.{SparkConf, SparkContext}
21 | import org.apache.spark.SparkContext._
22 |
23 | /**
24 |  * Executes a roll-up-style query against Apache logs.
25 | *
26 | * Usage: LogQuery [logFile]
27 | */
28 | object LogQuery {
29 | val exampleApacheLogs = List(
30 | """10.10.10.10 - "FRED" [18/Jan/2013:17:56:07 +1100] "GET http://images.com/2013/Generic.jpg
31 | | HTTP/1.1" 304 315 "http://referall.com/" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;
32 | | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR
33 | | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR
34 | | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.350 "-" - "" 265 923 934 ""
35 | | 62.24.11.25 images.com 1358492167 - Whatup""".stripMargin.lines.mkString,
36 | """10.10.10.10 - "FRED" [18/Jan/2013:18:02:37 +1100] "GET http://images.com/2013/Generic.jpg
37 | | HTTP/1.1" 304 306 "http:/referall.com" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1;
38 | | GTB7.4; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648; .NET CLR
39 | | 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 1.0.3705; .NET CLR 1.1.4322; .NET CLR
40 | | 3.5.30729; Release=ARP)" "UD-1" - "image/jpeg" "whatever" 0.352 "-" - "" 256 977 988 ""
41 | | 0 73.23.2.15 images.com 1358492557 - Whatup""".stripMargin.lines.mkString
42 | )
43 |
44 | def main(args: Array[String]) {
45 |
46 | val sparkConf = new SparkConf().setAppName("Log Query")
47 | val sc = new SparkContext(sparkConf)
48 |
49 | val dataSet =
50 | if (args.length == 1) sc.textFile(args(0)) else sc.parallelize(exampleApacheLogs)
51 | // scalastyle:off
52 | val apacheLogRegex =
53 | """^([\d.]+) (\S+) (\S+) \[([\w\d:/]+\s[+\-]\d{4})\] "(.+?)" (\d{3}) ([\d\-]+) "([^"]+)" "([^"]+)".*""".r
54 | // scalastyle:on
55 | /** Tracks the total query count and number of aggregate bytes for a particular group. */
56 | class Stats(val count: Int, val numBytes: Int) extends Serializable {
57 | def merge(other: Stats) = new Stats(count + other.count, numBytes + other.numBytes)
58 | override def toString = "bytes=%s\tn=%s".format(numBytes, count)
59 | }
60 |
61 | def extractKey(line: String): (String, String, String) = {
62 | apacheLogRegex.findFirstIn(line) match {
63 | case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) =>
64 | if (user != "\"-\"") (ip, user, query)
65 | else (null, null, null)
66 | case _ => (null, null, null)
67 | }
68 | }
69 |
70 | def extractStats(line: String): Stats = {
71 | apacheLogRegex.findFirstIn(line) match {
72 | case Some(apacheLogRegex(ip, _, user, dateTime, query, status, bytes, referer, ua)) =>
73 | new Stats(1, bytes.toInt)
74 | case _ => new Stats(1, 0)
75 | }
76 | }
77 |
78 | dataSet.map(line => (extractKey(line), extractStats(line)))
79 | .reduceByKey((a, b) => a.merge(b))
80 | .collect().foreach{
81 |         case (key, stats) => println("%s\t%s".format(key, stats))}
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
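The last three statements are the whole roll-up: every log line becomes a ((ip, user, query), Stats) pair, reduceByKey merges the Stats per key, and the merged totals are printed on the driver. A minimal, self-contained sketch of that pattern on hypothetical data; the local master, object name, and sample keys are assumptions:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._ // pair-RDD operations such as reduceByKey

object RollupSketch {
  case class Stats(count: Int, numBytes: Int) {
    def merge(other: Stats): Stats = Stats(count + other.count, numBytes + other.numBytes)
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("RollupSketch"))
    val records = Seq(("10.0.0.1", Stats(1, 300)),
                      ("10.0.0.2", Stats(1, 150)),
                      ("10.0.0.1", Stats(1, 200)))
    sc.parallelize(records)
      .reduceByKey(_ merge _)   // one merged Stats per key
      .collect()
      .foreach { case (ip, s) => println(ip + "\tbytes=" + s.numBytes + "\tn=" + s.count) }
    sc.stop()
  }
}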
/src/org/apache/spark/examples/MultiBroadcastTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.rdd.RDD
21 | import org.apache.spark.{SparkConf, SparkContext}
22 |
23 | /**
24 | * Usage: MultiBroadcastTest [slices] [numElem]
25 | */
26 | object MultiBroadcastTest {
27 | def main(args: Array[String]) {
28 |
29 | val sparkConf = new SparkConf().setAppName("Multi-Broadcast Test")
30 | val sc = new SparkContext(sparkConf)
31 |
32 | val slices = if (args.length > 0) args(0).toInt else 2
33 | val num = if (args.length > 1) args(1).toInt else 1000000
34 |
35 | val arr1 = new Array[Int](num)
36 | for (i <- 0 until arr1.length) {
37 | arr1(i) = i
38 | }
39 |
40 | val arr2 = new Array[Int](num)
41 | for (i <- 0 until arr2.length) {
42 | arr2(i) = i
43 | }
44 |
45 | val barr1 = sc.broadcast(arr1)
46 | val barr2 = sc.broadcast(arr2)
47 | val observedSizes: RDD[(Int, Int)] = sc.parallelize(1 to 10, slices).map { _ =>
48 | (barr1.value.size, barr2.value.size)
49 | }
50 | // Collect the small RDD so we can print the observed sizes locally.
51 | observedSizes.collect().foreach(i => println(i))
52 |
53 | sc.stop()
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/SimpleSkewedGroupByTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 |
25 | /**
26 | * Usage: SimpleSkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers] [ratio]
27 | */
28 | object SimpleSkewedGroupByTest {
29 | def main(args: Array[String]) {
30 |
31 | val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
32 | var numMappers = if (args.length > 0) args(0).toInt else 2
33 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000
34 | var valSize = if (args.length > 2) args(2).toInt else 1000
35 | var numReducers = if (args.length > 3) args(3).toInt else numMappers
36 | var ratio = if (args.length > 4) args(4).toInt else 5.0
37 |
38 | val sc = new SparkContext(sparkConf)
39 |
40 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
41 | val ranGen = new Random
42 | var result = new Array[(Int, Array[Byte])](numKVPairs)
43 | for (i <- 0 until numKVPairs) {
44 | val byteArr = new Array[Byte](valSize)
45 | ranGen.nextBytes(byteArr)
46 | val offset = ranGen.nextInt(1000) * numReducers
47 | if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
48 | // give ratio times higher chance of generating key 0 (for reducer 0)
49 | result(i) = (offset, byteArr)
50 | } else {
51 | // generate a key for one of the other reducers
52 | val key = 1 + ranGen.nextInt(numReducers-1) + offset
53 | result(i) = (key, byteArr)
54 | }
55 | }
56 | result
57 | }.cache
58 |     // Enforce that everything has been calculated and is in the cache
59 | pairs1.count
60 |
61 | println("RESULT: " + pairs1.groupByKey(numReducers).count)
62 | // Print how many keys each reducer got (for debugging)
63 | // println("RESULT: " + pairs1.groupByKey(numReducers)
64 | // .map{case (k,v) => (k, v.size)}
65 | // .collectAsMap)
66 |
67 | sc.stop()
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/SkewedGroupByTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 |
25 | /**
26 |  * Usage: SkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]
27 | */
28 | object SkewedGroupByTest {
29 | def main(args: Array[String]) {
30 | val sparkConf = new SparkConf().setAppName("GroupBy Test")
31 | var numMappers = if (args.length > 0) args(0).toInt else 2
32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000
33 | var valSize = if (args.length > 2) args(2).toInt else 1000
34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers
35 |
36 | val sc = new SparkContext(sparkConf)
37 |
38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
39 | val ranGen = new Random
40 |
41 |       // map output sizes linearly increase from the 1st to the last
42 | numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt
43 |
44 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
45 | for (i <- 0 until numKVPairs) {
46 | val byteArr = new Array[Byte](valSize)
47 | ranGen.nextBytes(byteArr)
48 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
49 | }
50 | arr1
51 | }.cache()
52 |     // Enforce that everything has been calculated and is in the cache
53 | pairs1.count()
54 |
55 | println(pairs1.groupByKey(numReducers).count())
56 |
57 | sc.stop()
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/SparkALS.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.math.sqrt
21 |
22 | import cern.colt.matrix._
23 | import cern.colt.matrix.linalg._
24 | import cern.jet.math._
25 |
26 | import org.apache.spark._
27 |
28 | /**
29 | * Alternating least squares matrix factorization.
30 | */
31 | object SparkALS {
32 | // Parameters set through command line arguments
33 | var M = 0 // Number of movies
34 | var U = 0 // Number of users
35 | var F = 0 // Number of features
36 | var ITERATIONS = 0
37 |
38 | val LAMBDA = 0.01 // Regularization coefficient
39 |
40 | // Some COLT objects
41 | val factory2D = DoubleFactory2D.dense
42 | val factory1D = DoubleFactory1D.dense
43 | val algebra = Algebra.DEFAULT
44 | val blas = SeqBlas.seqBlas
45 |
46 | def generateR(): DoubleMatrix2D = {
47 | val mh = factory2D.random(M, F)
48 | val uh = factory2D.random(U, F)
49 | algebra.mult(mh, algebra.transpose(uh))
50 | }
51 |
52 | def rmse(targetR: DoubleMatrix2D, ms: Array[DoubleMatrix1D],
53 | us: Array[DoubleMatrix1D]): Double =
54 | {
55 | val r = factory2D.make(M, U)
56 | for (i <- 0 until M; j <- 0 until U) {
57 | r.set(i, j, blas.ddot(ms(i), us(j)))
58 | }
59 | blas.daxpy(-1, targetR, r)
60 | val sumSqs = r.aggregate(Functions.plus, Functions.square)
61 | sqrt(sumSqs / (M * U))
62 | }
63 |
64 | def update(i: Int, m: DoubleMatrix1D, us: Array[DoubleMatrix1D],
65 | R: DoubleMatrix2D) : DoubleMatrix1D =
66 | {
67 | val U = us.size
68 | val F = us(0).size
69 | val XtX = factory2D.make(F, F)
70 | val Xty = factory1D.make(F)
71 | // For each user that rated the movie
72 | for (j <- 0 until U) {
73 | val u = us(j)
74 | // Add u * u^t to XtX
75 | blas.dger(1, u, u, XtX)
76 | // Add u * rating to Xty
77 | blas.daxpy(R.get(i, j), u, Xty)
78 | }
79 | // Add regularization coefs to diagonal terms
80 | for (d <- 0 until F) {
81 | XtX.set(d, d, XtX.get(d, d) + LAMBDA * U)
82 | }
83 | // Solve it with Cholesky
84 | val ch = new CholeskyDecomposition(XtX)
85 | val Xty2D = factory2D.make(Xty.toArray, F)
86 | val solved2D = ch.solve(Xty2D)
87 | solved2D.viewColumn(0)
88 | }
89 |
90 | def main(args: Array[String]) {
91 | var slices = 0
92 |
93 | val options = (0 to 4).map(i => if (i < args.length) Some(args(i)) else None)
94 |
95 | options.toArray match {
96 | case Array(m, u, f, iters, slices_) =>
97 | M = m.getOrElse("100").toInt
98 | U = u.getOrElse("500").toInt
99 | F = f.getOrElse("10").toInt
100 | ITERATIONS = iters.getOrElse("5").toInt
101 | slices = slices_.getOrElse("2").toInt
102 | case _ =>
103 | System.err.println("Usage: SparkALS [M] [U] [F] [iters] [slices]")
104 | System.exit(1)
105 | }
106 | printf("Running with M=%d, U=%d, F=%d, iters=%d\n", M, U, F, ITERATIONS)
107 | val sparkConf = new SparkConf().setAppName("SparkALS")
108 | val sc = new SparkContext(sparkConf)
109 |
110 | val R = generateR()
111 |
112 | // Initialize m and u randomly
113 | var ms = Array.fill(M)(factory1D.random(F))
114 | var us = Array.fill(U)(factory1D.random(F))
115 |
116 | // Iteratively update movies then users
117 | val Rc = sc.broadcast(R)
118 | var msb = sc.broadcast(ms)
119 | var usb = sc.broadcast(us)
120 | for (iter <- 1 to ITERATIONS) {
121 | println("Iteration " + iter + ":")
122 | ms = sc.parallelize(0 until M, slices)
123 | .map(i => update(i, msb.value(i), usb.value, Rc.value))
124 | .collect()
125 | msb = sc.broadcast(ms) // Re-broadcast ms because it was updated
126 | us = sc.parallelize(0 until U, slices)
127 | .map(i => update(i, usb.value(i), msb.value, algebra.transpose(Rc.value)))
128 | .collect()
129 | usb = sc.broadcast(us) // Re-broadcast us because it was updated
130 | println("RMSE = " + rmse(R, ms, us))
131 | println()
132 | }
133 |
134 | sc.stop()
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
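SparkALS keeps R, ms and us on the executors as broadcast variables and creates a fresh broadcast whenever the driver updates ms or us, because a broadcast value is read-only once shipped. A minimal sketch of that re-broadcast pattern in isolation; the local master, object name, and toy "+1" update are assumptions:

import org.apache.spark.{SparkConf, SparkContext}

object RebroadcastSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("RebroadcastSketch"))
    var factors = Array.fill(4)(1.0)
    var factorsB = sc.broadcast(factors)
    for (iter <- 1 to 3) {
      // Tasks read the current snapshot through the broadcast variable.
      factors = sc.parallelize(0 until factors.length, 2)
        .map(i => factorsB.value(i) + 1.0)
        .collect()
      factorsB = sc.broadcast(factors) // re-broadcast after the driver-side update
    }
    println(factors.mkString(", "))
    sc.stop()
  }
}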
/src/org/apache/spark/examples/SparkHdfsLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import scala.math.exp
23 |
24 | import breeze.linalg.{Vector, DenseVector}
25 |
26 | import org.apache.spark._
27 | import org.apache.spark.deploy.SparkHadoopUtil
28 | import org.apache.spark.scheduler.InputFormatInfo
29 |
30 |
31 | /**
32 | * Logistic regression based classification.
33 | */
34 | object SparkHdfsLR {
35 |   val D = 10   // Number of dimensions
36 | val rand = new Random(42)
37 |
38 | case class DataPoint(x: Vector[Double], y: Double)
39 |
40 | def parsePoint(line: String): DataPoint = {
41 | val tok = new java.util.StringTokenizer(line, " ")
42 | var y = tok.nextToken.toDouble
43 | var x = new Array[Double](D)
44 | var i = 0
45 | while (i < D) {
46 | x(i) = tok.nextToken.toDouble; i += 1
47 | }
48 | DataPoint(new DenseVector(x), y)
49 | }
50 |
51 | def main(args: Array[String]) {
52 | if (args.length < 2) {
53 |       System.err.println("Usage: SparkHdfsLR <file> <iters>")
54 | System.exit(1)
55 | }
56 |
57 | val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
58 | val inputPath = args(0)
59 | val conf = SparkHadoopUtil.get.newConfiguration()
60 | val sc = new SparkContext(sparkConf,
61 | InputFormatInfo.computePreferredLocations(
62 | Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
63 | ))
64 | val lines = sc.textFile(inputPath)
65 | val points = lines.map(parsePoint _).cache()
66 | val ITERATIONS = args(1).toInt
67 |
68 | // Initialize w to a random value
69 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
70 | println("Initial w: " + w)
71 |
72 | for (i <- 1 to ITERATIONS) {
73 | println("On iteration " + i)
74 | val gradient = points.map { p =>
75 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
76 | }.reduce(_ + _)
77 | w -= gradient
78 | }
79 |
80 | println("Final w: " + w)
81 | sc.stop()
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/SparkKMeans.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import breeze.linalg.{Vector, DenseVector, squaredDistance}
23 |
24 | import org.apache.spark.{SparkConf, SparkContext}
25 | import org.apache.spark.SparkContext._
26 |
27 | /**
28 | * K-means clustering.
29 | */
30 | object SparkKMeans {
31 | val R = 1000 // Scaling factor
32 | val rand = new Random(42)
33 |
34 | def parseVector(line: String): Vector[Double] = {
35 | DenseVector(line.split(' ').map(_.toDouble))
36 | }
37 |
38 | def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = {
39 | var index = 0
40 | var bestIndex = 0
41 | var closest = Double.PositiveInfinity
42 |
43 | for (i <- 0 until centers.length) {
44 | val tempDist = squaredDistance(p, centers(i))
45 | if (tempDist < closest) {
46 | closest = tempDist
47 | bestIndex = i
48 | }
49 | }
50 |
51 | bestIndex
52 | }
53 |
54 | def main(args: Array[String]) {
55 | if (args.length < 3) {
56 |       System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")
57 | System.exit(1)
58 | }
59 | val sparkConf = new SparkConf().setAppName("SparkKMeans")
60 | val sc = new SparkContext(sparkConf)
61 | val lines = sc.textFile(args(0))
62 | val data = lines.map(parseVector _).cache()
63 | val K = args(1).toInt
64 | val convergeDist = args(2).toDouble
65 |
66 | val kPoints = data.takeSample(withReplacement = false, K, 42).toArray
67 | var tempDist = 1.0
68 |
69 | while(tempDist > convergeDist) {
70 | val closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))
71 |
72 | val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)}
73 |
74 | val newPoints = pointStats.map {pair =>
75 | (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap()
76 |
77 | tempDist = 0.0
78 | for (i <- 0 until K) {
79 | tempDist += squaredDistance(kPoints(i), newPoints(i))
80 | }
81 |
82 | for (newP <- newPoints) {
83 | kPoints(newP._1) = newP._2
84 | }
85 | println("Finished iteration (delta = " + tempDist + ")")
86 | }
87 |
88 | println("Final centers:")
89 | kPoints.foreach(println)
90 | sc.stop()
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/SparkLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import scala.math.exp
23 |
24 | import breeze.linalg.{Vector, DenseVector}
25 |
26 | import org.apache.spark._
27 |
28 | /**
29 | * Logistic regression based classification.
30 | * Usage: SparkLR [slices]
31 | */
32 | object SparkLR {
33 | val N = 10000 // Number of data points
34 |   val D = 10   // Number of dimensions
35 | val R = 0.7 // Scaling factor
36 | val ITERATIONS = 5
37 | val rand = new Random(42)
38 |
39 | case class DataPoint(x: Vector[Double], y: Double)
40 |
41 | def generateData = {
42 | def generatePoint(i: Int) = {
43 | val y = if(i % 2 == 0) -1 else 1
44 | val x = DenseVector.fill(D){rand.nextGaussian + y * R}
45 | DataPoint(x, y)
46 | }
47 | Array.tabulate(N)(generatePoint)
48 | }
49 |
50 | def main(args: Array[String]) {
51 | val sparkConf = new SparkConf().setAppName("SparkLR")
52 | val sc = new SparkContext(sparkConf)
53 | val numSlices = if (args.length > 0) args(0).toInt else 2
54 | val points = sc.parallelize(generateData, numSlices).cache()
55 |
56 | // Initialize w to a random value
57 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
58 | println("Initial w: " + w)
59 |
60 | for (i <- 1 to ITERATIONS) {
61 | println("On iteration " + i)
62 | val gradient = points.map { p =>
63 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
64 | }.reduce(_ + _)
65 | w -= gradient
66 | }
67 |
68 | println("Final w: " + w)
69 | sc.stop()
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/SparkPageRank.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.{SparkConf, SparkContext}
22 |
23 | /**
24 | * Computes the PageRank of URLs from an input file. Input file should
25 |  * be in the format:
26 | * URL neighbor URL
27 | * URL neighbor URL
28 | * URL neighbor URL
29 | * ...
30 |  * where a URL and its neighbors are separated by space(s).
31 | */
32 | object SparkPageRank {
33 | def main(args: Array[String]) {
34 | val sparkConf = new SparkConf().setAppName("PageRank")
35 | var iters = args(1).toInt
36 | val ctx = new SparkContext(sparkConf)
37 | val lines = ctx.textFile(args(0), 1)
38 | val links = lines.map{ s =>
39 | val parts = s.split("\\s+")
40 | (parts(0), parts(1))
41 | }.distinct().groupByKey().cache()
42 | var ranks = links.mapValues(v => 1.0)
43 |
44 | for (i <- 1 to iters) {
45 | val contribs = links.join(ranks).values.flatMap{ case (urls, rank) =>
46 | val size = urls.size
47 | urls.map(url => (url, rank / size))
48 | }
49 | ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _)
50 | }
51 |
52 | val output = ranks.collect()
53 | output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + "."))
54 |
55 | ctx.stop()
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
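Each iteration above sends every page's current rank, split evenly across its outgoing links, to its neighbors, then applies rank = 0.15 + 0.85 * (sum of received contributions). A minimal local sketch of the same update on a hypothetical three-node graph, without Spark; the graph and object name are assumptions:

object PageRankSketch {
  def main(args: Array[String]): Unit = {
    val links = Map("a" -> Seq("b", "c"), "b" -> Seq("c"), "c" -> Seq("a"))
    var ranks = links.keys.map(url => url -> 1.0).toMap
    for (_ <- 1 to 10) {
      // Each page sends rank / outDegree to every page it links to.
      val contribs = links.toSeq.flatMap { case (url, neighbors) =>
        neighbors.map(n => n -> ranks(url) / neighbors.size)
      }
      // New rank: damping term plus the damped sum of incoming contributions.
      ranks = contribs.groupBy(_._1).map { case (url, cs) =>
        url -> (0.15 + 0.85 * cs.map(_._2).sum)
      }
    }
    ranks.foreach { case (url, rank) => println(url + " has rank: " + rank + ".") }
  }
}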
/src/org/apache/spark/examples/SparkPi.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.math.random
21 |
22 | import org.apache.spark._
23 |
24 | /** Computes an approximation to pi */
25 | object SparkPi {
26 | def main(args: Array[String]) {
27 | val conf = new SparkConf().setAppName("Spark Pi")
28 | val spark = new SparkContext(conf)
29 | val slices = if (args.length > 0) args(0).toInt else 2
30 | val n = 100000 * slices
31 | val count = spark.parallelize(1 to n, slices).map { i =>
32 | val x = random * 2 - 1
33 | val y = random * 2 - 1
34 | if (x*x + y*y < 1) 1 else 0
35 | }.reduce(_ + _)
36 | println("Pi is roughly " + 4.0 * count / n)
37 | spark.stop()
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/SparkTC.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.util.Random
21 | import scala.collection.mutable
22 |
23 | import org.apache.spark.{SparkConf, SparkContext}
24 | import org.apache.spark.SparkContext._
25 |
26 | /**
27 | * Transitive closure on a graph.
28 | */
29 | object SparkTC {
30 | val numEdges = 200
31 | val numVertices = 100
32 | val rand = new Random(42)
33 |
34 | def generateGraph = {
35 | val edges: mutable.Set[(Int, Int)] = mutable.Set.empty
36 | while (edges.size < numEdges) {
37 | val from = rand.nextInt(numVertices)
38 | val to = rand.nextInt(numVertices)
39 | if (from != to) edges.+=((from, to))
40 | }
41 | edges.toSeq
42 | }
43 |
44 | def main(args: Array[String]) {
45 | val sparkConf = new SparkConf().setAppName("SparkTC")
46 | val spark = new SparkContext(sparkConf)
47 | val slices = if (args.length > 0) args(0).toInt else 2
48 | var tc = spark.parallelize(generateGraph, slices).cache()
49 |
50 | // Linear transitive closure: each round grows paths by one edge,
51 | // by joining the graph's edges with the already-discovered paths.
52 | // e.g. join the path (y, z) from the TC with the edge (x, y) from
53 | // the graph to obtain the path (x, z).
54 |
55 | // Because join() joins on keys, the edges are stored in reversed order.
56 | val edges = tc.map(x => (x._2, x._1))
57 |
58 | // This join is iterated until a fixed point is reached.
59 | var oldCount = 0L
60 | var nextCount = tc.count()
61 | do {
62 | oldCount = nextCount
63 | // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
64 | // then project the result to obtain the new (x, z) paths.
65 | tc = tc.union(tc.join(edges).map(x => (x._2._2, x._2._1))).distinct().cache()
66 | nextCount = tc.count()
67 | } while (nextCount != oldCount)
68 |
69 | println("TC has " + tc.count() + " edges.")
70 | spark.stop()
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
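The do/while loop above repeatedly joins the discovered paths with the (reversed) edge set, unions in the new (x, z) pairs, and stops once the count no longer grows. A minimal local sketch of the same fixed-point computation on plain Scala sets, with no Spark; the toy edge set is an assumption:

object TransitiveClosureSketch {
  def closure(edges: Set[(Int, Int)]): Set[(Int, Int)] = {
    var tc = edges
    var oldSize = -1
    while (tc.size != oldSize) {
      oldSize = tc.size
      // Join paths (x, y) with edges (y, z) to grow each path by one hop.
      val newPaths = for ((x, y) <- tc; (y2, z) <- edges if y == y2) yield (x, z)
      tc = tc ++ newPaths
    }
    tc
  }

  def main(args: Array[String]): Unit =
    println(closure(Set((1, 2), (2, 3), (3, 4))))
}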
/src/org/apache/spark/examples/SparkTachyonHdfsLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import scala.math.exp
23 |
24 | import breeze.linalg.{Vector, DenseVector}
25 |
26 | import org.apache.spark._
27 | import org.apache.spark.deploy.SparkHadoopUtil
28 | import org.apache.spark.scheduler.InputFormatInfo
29 | import org.apache.spark.storage.StorageLevel
30 |
31 |
32 | /**
33 | * Logistic regression based classification.
34 |  * This example uses Tachyon to persist RDDs during computation.
35 | */
36 | object SparkTachyonHdfsLR {
37 |   val D = 10   // Number of dimensions
38 | val rand = new Random(42)
39 |
40 | case class DataPoint(x: Vector[Double], y: Double)
41 |
42 | def parsePoint(line: String): DataPoint = {
43 | val tok = new java.util.StringTokenizer(line, " ")
44 | var y = tok.nextToken.toDouble
45 | var x = new Array[Double](D)
46 | var i = 0
47 | while (i < D) {
48 | x(i) = tok.nextToken.toDouble; i += 1
49 | }
50 | DataPoint(new DenseVector(x), y)
51 | }
52 |
53 | def main(args: Array[String]) {
54 | val inputPath = args(0)
55 | val conf = SparkHadoopUtil.get.newConfiguration()
56 | val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
57 | val sc = new SparkContext(sparkConf,
58 | InputFormatInfo.computePreferredLocations(
59 | Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
60 | ))
61 | val lines = sc.textFile(inputPath)
62 | val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
63 | val ITERATIONS = args(1).toInt
64 |
65 | // Initialize w to a random value
66 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
67 | println("Initial w: " + w)
68 |
69 | for (i <- 1 to ITERATIONS) {
70 | println("On iteration " + i)
71 | val gradient = points.map { p =>
72 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
73 | }.reduce(_ + _)
74 | w -= gradient
75 | }
76 |
77 | println("Final w: " + w)
78 | sc.stop()
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/src/org/apache/spark/examples/SparkTachyonPi.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.math.random
21 |
22 | import org.apache.spark._
23 | import org.apache.spark.storage.StorageLevel
24 |
25 | /**
26 |  * Computes an approximation to pi.
27 |  * This example uses Tachyon to persist RDDs during computation.
28 | */
29 | object SparkTachyonPi {
30 | def main(args: Array[String]) {
31 | val sparkConf = new SparkConf().setAppName("SparkTachyonPi")
32 | val spark = new SparkContext(sparkConf)
33 |
34 | val slices = if (args.length > 0) args(0).toInt else 2
35 | val n = 100000 * slices
36 |
37 | val rdd = spark.parallelize(1 to n, slices)
38 | rdd.persist(StorageLevel.OFF_HEAP)
39 | val count = rdd.map { i =>
40 | val x = random * 2 - 1
41 | val y = random * 2 - 1
42 | if (x * x + y * y < 1) 1 else 0
43 | }.reduce(_ + _)
44 | println("Pi is roughly " + 4.0 * count / n)
45 |
46 | spark.stop()
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/pretty/examples/Aggregate.scala:
--------------------------------------------------------------------------------
1 | package pretty.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Aggregate {
6 |
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "Aggregate Test")
10 | val d = List("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
11 |
12 | val data = sc.parallelize(d, 2)
13 |
14 | val result = data.aggregate("a")((x,y) => "[" + x + "," + y + "]",
15 | (x,y) => x + y)
16 |
17 | println(result)
18 | // output:
19 | // a[[[[[a,0],1],2],3],4][[[[[a,5],6],7],8],9]
20 | }
21 | }
--------------------------------------------------------------------------------
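aggregate takes a zero value, a seqOp that folds elements into the accumulator within each partition, and a combOp that merges the per-partition results, which is why the string above is wrapped once per element in each of the two partitions before the two halves are concatenated. A minimal numeric sketch of the same signature, computing a mean as a (sum, count) pair; the object name is illustrative:

import org.apache.spark.SparkContext

object AggregateMeanSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "Aggregate Mean Sketch")
    val data = sc.parallelize(1 to 10, 2)
    val (sum, count) = data.aggregate((0, 0))(
      (acc, x) => (acc._1 + x, acc._2 + 1),  // seqOp: fold an element into the per-partition accumulator
      (a, b) => (a._1 + b._1, a._2 + b._2))  // combOp: merge two partition accumulators
    println("mean = " + sum.toDouble / count)
    sc.stop()
  }
}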
/src/pretty/examples/Coalesce.scala:
--------------------------------------------------------------------------------
1 | package pretty.examples
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Coalesce {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext("local", "Coalesce Test")
8 |
9 | val data = sc.parallelize(1 to 20, 10)
10 |
11 | val result = data.coalesce(2)
12 | result.foreach(x => print(x + " "))
13 |
14 |     // equivalent to "repartition(2)"
15 | val resultWithHashPartition = data.coalesce(2, true)
16 | resultWithHashPartition.foreach(x => print(x + " "))
17 | }
18 | }
--------------------------------------------------------------------------------
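coalesce(2) merges the ten partitions into two without a shuffle, while coalesce(2, true) inserts a shuffle and is what repartition(2) delegates to; the data is the same either way, only the partitioning differs. A minimal sketch that makes the resulting partition counts visible, reusing the local master from the example above; the object name is an assumption:

import org.apache.spark.SparkContext

object CoalescePartitionsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "Coalesce Partitions Sketch")
    val data = sc.parallelize(1 to 20, 10)
    println(data.partitions.length)                   // 10
    println(data.coalesce(2).partitions.length)       // 2, no shuffle
    println(data.coalesce(2, true).partitions.length) // 2, with a shuffle (same as repartition(2))
    sc.stop()
  }
}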
/src/pretty/examples/CogroupPair.scala:
--------------------------------------------------------------------------------
1 | package pretty.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.RangePartitioner
6 |
7 | object CogroupPair {
8 | def main(args: Array[String]) {
9 | val sc = new SparkContext("local", "Cogroup Test")
10 |
11 | val data1 = Array[(String, Int)](("A", 1), ("A", 2),
12 | ("B", 3), ("B", 4),
13 | ("C", 5), ("C", 6))
14 |
15 | val data2 = Array[(String, Int)](("A", 7), ("A", 8),
16 | ("B", 9), ("C", 0))
17 |
18 | val data3 = Array[(String, Int)](("A", 10), ("B", 11))
19 |
20 | val pairs1 = sc.parallelize(data1, 3)
21 | val pairs2 = sc.parallelize(data2, 2)
22 | val pairs3 = sc.parallelize(data3, 3)
23 |
24 | val result1 = pairs1.cogroup(pairs2)
25 | result1.foreach(println)
26 |
27 | // val result2 = pairs1.cogroup(pairs2, pairs3)
28 | // result2.foreach(println)
29 | //
30 | // val result3 = pairs1.cogroup(pairs2, 1)
31 | // result3.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
32 | //
33 | // val result4 = pairs1.cogroup(pairs2, new RangePartitioner(2, pairs1))
34 | // result4.foreachWith(i => i)((x, i) => println("[PartitionIndex " + i + "] " + x))
35 | }
36 | }
--------------------------------------------------------------------------------
/src/pretty/examples/GroupByKeyPair.scala:
--------------------------------------------------------------------------------
1 | package pretty.examples
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 | import org.apache.spark.RangePartitioner
6 |
7 | object GroupByKeyPair {
8 |
9 | def main(args: Array[String]) {
10 |
11 | val sc = new SparkContext("local", "GroupByKeyPair Test")
12 | val d = sc.parallelize(1 to 100, 10)
13 |
14 | val pairs = d.keyBy(x => x % 10)
15 |
16 | val result1 = pairs.groupByKey()
17 | val result2 = pairs.groupByKey(3)
18 | val result3 = pairs.groupByKey(new RangePartitioner(3, pairs))
19 |
20 | println("Result 1:")
21 | result1.foreach(println)
22 |
23 | println("Result 2:")
24 | result2.foreach(println)
25 |
26 | println("Result 3:")
27 | result3.foreach(println)
28 |
29 | }
30 | }
--------------------------------------------------------------------------------