├── data
│   ├── people.txt
│   └── inputfile.txt
├── renovate.json
├── README.md
├── src
│   ├── main
│   │   ├── scala
│   │   │   ├── com
│   │   │   │   └── javachen
│   │   │   │       └── spark
│   │   │   │           └── examples
│   │   │   │               ├── rdd
│   │   │   │               │   ├── Pipe.scala
│   │   │   │               │   ├── ActionTest.scala
│   │   │   │               │   ├── TransformTest.scala
│   │   │   │               │   ├── FlatMap.scala
│   │   │   │               │   ├── Lookup.scala
│   │   │   │               │   ├── CollectAsMap.scala
│   │   │   │               │   ├── MapPartitions.scala
│   │   │   │               │   ├── ReduceByKey.scala
│   │   │   │               │   ├── AggregateOrder.scala
│   │   │   │               │   ├── MapValues.scala
│   │   │   │               │   ├── Aggregate.scala
│   │   │   │               │   ├── PartitionBy.scala
│   │   │   │               │   ├── Join.scala
│   │   │   │               │   ├── ScalaWordCount.scala
│   │   │   │               │   ├── GroupWith.scala
│   │   │   │               │   ├── GroupByKey.scala
│   │   │   │               │   ├── Cartesian.scala
│   │   │   │               │   └── GroupByAction.scala
│   │   │   │               ├── sparksql
│   │   │   │               │   ├── ScalaSparkSQLByReflection.scala
│   │   │   │               │   └── ScalaSparkSQLBySchema.scala
│   │   │   │               └── mllib
│   │   │   │                   ├── EvaluateResult.scala
│   │   │   │                   ├── ScalaMovieLensALS.scala
│   │   │   │                   ├── MovieSimilarities.scala
│   │   │   │                   ├── MovieLensALS.scala
│   │   │   │                   └── ScalaLocalALS.scala
│   │   │   └── org
│   │   │       └── apache
│   │   │           └── spark
│   │   │               └── examples
│   │   │                   ├── LocalPi.scala
│   │   │                   ├── ExceptionHandlingTest.scala
│   │   │                   ├── SparkPi.scala
│   │   │                   ├── HdfsTest.scala
│   │   │                   ├── SparkTachyonPi.scala
│   │   │                   ├── DriverSubmissionTest.scala
│   │   │                   ├── MultiBroadcastTest.scala
│   │   │                   ├── GroupByTest.scala
│   │   │                   ├── BroadcastTest.scala
│   │   │                   ├── SkewedGroupByTest.scala
│   │   │                   ├── SparkTC.scala
│   │   │                   ├── LocalFileLR.scala
│   │   │                   ├── SparkPageRank.scala
│   │   │                   ├── LocalLR.scala
│   │   │                   ├── SimpleSkewedGroupByTest.scala
│   │   │                   ├── SparkLR.scala
│   │   │                   ├── SparkTachyonHdfsLR.scala
│   │   │                   ├── SparkKMeans.scala
│   │   │                   ├── SparkHdfsLR.scala
│   │   │                   ├── LocalKMeans.scala
│   │   │                   ├── LogQuery.scala
│   │   │                   ├── LocalALS.scala
│   │   │                   └── SparkALS.scala
│   │   ├── python
│   │   │   ├── PythonWordCount.py
│   │   │   ├── PythonALS.py
│   │   │   ├── PythonSparkSQLByReflection.py
│   │   │   └── PythonSparkSQLBySchema.py
│   │   └── java
│   │       └── com
│   │           └── javachen
│   │               └── spark
│   │                   └── examples
│   │                       ├── sparksql
│   │                       │   ├── JavaSparkSQLBySchema.java
│   │                       │   └── JavaSparkSQLByReflection.java
│   │                       ├── mllib
│   │                       │   └── JavaALS.java
│   │                       └── rdd
│   │                           └── JavaWordCount.java
│   └── test
│       └── java
│           └── com
│               └── javachen
│                   └── spark
│                       └── AppTest.java
├── .gitignore
├── LICENSE
└── pom.xml
/data/people.txt:
--------------------------------------------------------------------------------
1 | Michael, 29
2 | Andy, 30
3 | Justin, 19
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 | "extends": [
4 | "config:recommended"
5 | ]
6 | }
7 |
--------------------------------------------------------------------------------
/data/inputfile.txt:
--------------------------------------------------------------------------------
1 | apple
2 | banana counter
3 | counter one two three
4 | three one
5 | five seven eight
6 | twenty one three five counter six
7 | one siz helga
8 | apple banana fiver
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # learning-spark
2 |
3 | Learning to write Spark examples
4 |
5 | ## Links
6 |
7 | - https://github.com/JerryLead/SparkLearning
8 | - https://github.com/ceteri/spark-exercises
9 | - https://github.com/databricks/reference-apps
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Pipe.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | /**
4 | *
5 | * @author june.
6 | * @date 2015-05-12 17:21.
7 | */
8 | object Pipe {
9 |
10 | }
11 |
--------------------------------------------------------------------------------
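
Note: Pipe.scala above is an empty stub. Purely as an illustrative sketch (not part of the repo), an RDD.pipe example in the same Spark 1.x style could look like the following; it assumes a POSIX `wc -l` command is available on the machine running it:

    package com.javachen.spark.examples.rdd

    import org.apache.spark.SparkContext

    // illustrative sketch only: RDD.pipe streams each partition through an external command
    object PipeSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "Pipe Sketch")
        val data = sc.parallelize(Seq("one", "two", "three", "four", "five"), 2)

        // every element of a partition is written to `wc -l`'s stdin, one line per element;
        // the command's stdout lines become the elements of the new RDD, so this yields
        // one line count per partition
        val lineCounts = data.pipe("wc -l")
        lineCounts.foreach(println) // e.g. 2 and 3 for this 5-element, 2-partition RDD

        sc.stop()
      }
    }
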
/src/main/scala/com/javachen/spark/examples/rdd/ActionTest.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | /**
4 | *
5 | * @author june.
6 | * @date 2015-05-12 17:25.
7 | */
8 | object ActionTest {
9 |
10 | }
11 |
--------------------------------------------------------------------------------
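
ActionTest above is likewise an empty stub. A minimal sketch of the kind of RDD actions such a test could exercise (illustrative only, not part of the repo):

    package com.javachen.spark.examples.rdd

    import org.apache.spark.SparkContext

    // illustrative sketch only: a few common RDD actions on a small local data set
    object ActionSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "Action Sketch")
        val nums = sc.parallelize(1 to 10, 2)

        println(nums.count())               // 10
        println(nums.reduce(_ + _))         // 55
        println(nums.take(3).mkString(",")) // 1,2,3
        println(nums.collect().mkString(","))

        sc.stop()
      }
    }
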
/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | target/
4 | *.class
5 |
6 | # Mobile Tools for Java (J2ME)
7 | .mtj.tmp/
8 |
9 | # Package Files #
10 | *.jar
11 | *.war
12 | *.ear
13 |
14 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
15 | hs_err_pid*
16 |
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/TransformTest.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | /**
4 | *
5 | * @author june.
6 | * @date 2015-05-12 17:25.
7 | */
8 | object TransformTest {
9 |
10 | }
11 |
--------------------------------------------------------------------------------
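
TransformTest above is also an empty stub. A minimal sketch of a few common transformations chained together (illustrative only, not part of the repo):

    package com.javachen.spark.examples.rdd

    import org.apache.spark.SparkContext
    import org.apache.spark.SparkContext._

    // illustrative sketch only: flatMap, map, reduceByKey and filter chained on a toy data set
    object TransformSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "Transform Sketch")
        val lines = sc.parallelize(Seq("apple pie", "banana split", "apple juice"), 2)

        val counts = lines
          .flatMap(_.split(" "))  // split lines into words
          .map(word => (word, 1)) // pair each word with 1
          .reduceByKey(_ + _)     // sum the counts per word
          .filter(_._2 > 1)       // keep words that appear more than once

        counts.foreach(println)   // e.g. (apple,2)
        sc.stop()
      }
    }
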
/src/main/scala/com/javachen/spark/examples/rdd/FlatMap.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object FlatMap {
6 | def main(args: Array[String]) {
7 |
8 | val sc = new SparkContext("local", "FlatMap Test")
9 | val data = Array[(String, Int)](("A", 1), ("B", 2),
10 | ("B", 3), ("C", 4),
11 | ("C", 5), ("C", 6)
12 | )
13 | val pairs = sc.makeRDD(data, 3)
14 |
15 | val result = pairs.flatMap(T => (T._1 + T._2)) // T._1 + T._2 is a String such as "A1"; flatMap flattens it into its characters
16 |
17 | result.foreach(println)
18 |
19 | }
20 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Lookup.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object Lookup {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "LookUp Test")
10 |
11 | val data = Array[(String, Int)](("A", 1), ("B", 2),
12 | ("B", 3), ("C", 4),
13 | ("C", 5), ("C", 6))
14 |
15 | val pairs = sc.parallelize(data, 3)
16 |
17 | val finalRDD = pairs.lookup("B") // lookup is an action: it returns a local Seq[Int], not an RDD
18 |
19 | finalRDD.foreach(println)
20 | // output:
21 | // 2
22 | // 3
23 | }
24 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/CollectAsMap.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object CollectAsMap {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "CollectAsMap Test")
10 | val data = Array[(String, Int)](("A", 1), ("B", 2),
11 | ("B", 3), ("C", 4),
12 | ("C", 5), ("C", 6))
13 |
14 | // same as "val pairs = sc.parallelize(data, 3)"
15 | val pairs = sc.makeRDD(data, 3)
16 |
17 | val result = pairs.collectAsMap
18 |
19 | // output: Map(A -> 1, C -> 6, B -> 3) -- when a key has multiple values, collectAsMap keeps only one of them
20 | print(result)
21 | }
22 |
23 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/MapPartitions.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object MapPartitions {
6 |
7 | def main(args: Array[String]) {
8 | val sc = new SparkContext("local", "MapPartitionsRDD Test")
9 | val data = Array[(String, Int)](("A1", 1), ("A2", 2),
10 | ("B1", 1), ("B2", 4),
11 | ("C1", 3), ("C2", 4)
12 | )
13 | val pairs = sc.parallelize(data, 3)
14 |
15 | val finalRDD = pairs.mapPartitions(iter => iter.filter(_._2 >= 2))
16 | // val finalRDD2 = pairs.mapPartitionsWithIndex(f, preservesPartitioning)
17 |
18 | finalRDD.collect().foreach(println)
19 |
20 | }
21 | }
--------------------------------------------------------------------------------
/src/main/python/PythonWordCount.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from pyspark import SparkContext
4 |
5 | if __name__ == "__main__":
6 | file=sys.argv[1]
7 | threshold=int(sys.argv[2])
8 | sc = SparkContext(appName="PythonWordCount")
9 | lines = sc.textFile(file, 1)
10 | counts = lines.flatMap(lambda x: x.split(' ')) \
11 | .map(lambda x: (x, 1)) \
12 | .reduceByKey(lambda a, b: a + b) \
13 | .filter(lambda (a, b) : b >= threshold) \
14 | .flatMap(lambda (a, b): list(a)) \
15 | .map(lambda x: (x, 1)) \
16 | .reduceByKey(lambda a, b: a + b)
17 |
18 | print ",".join(str(t) for t in counts.collect())
19 | sc.stop()
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/ReduceByKey.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object ReduceByKey {
6 |
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
10 | val data1 = Array[(String, Int)](("K", 1), ("U", 2),
11 | ("U", 3), ("W", 4),
12 | ("W", 5), ("W", 6))
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 | val result = pairs.reduceByKey(_ + _, 2)
17 | result.foreach(println)
18 | }
19 |
20 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/AggregateOrder.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object AggregateOrder {
6 |
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "AggregateOrder Test")
10 | val data = List("12", "23", "345", "4567")
11 |
12 | val pairs = sc.parallelize(data, 2)
13 | pairs.foreach(x => println(x.length))
14 |
15 | //val result = pairs.aggregate("")((x,y) => math.min(x.length, y.length).toString, (x,y) => x + y)
16 |
17 | val result2 = pairs.aggregate("")((x,y) => "[" + x.length + "," + y.length + "] ", (x,y) => x + y)
18 |
19 | result2.foreach(println) // result2 is a String, so this prints it character by character
20 | println(result2)
21 |
22 | }
23 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/MapValues.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object MapValues {
6 | def main(args: Array[String]) {
7 |
8 | val sc = new SparkContext("local", "MapValues Test")
9 | val data1 = Array[(String, Int)](("K", 1), ("T", 2),
10 | ("T", 3), ("W", 4),
11 | ("W", 5), ("W", 6)
12 | )
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 | //val result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
17 | val result = pairs.mapValues(V => 10 * V)
18 | result.foreach(println)
19 | }
20 | }
--------------------------------------------------------------------------------
/src/test/java/com/javachen/spark/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.javachen.spark;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase {
12 | /**
13 | * Create the test case
14 | *
15 | * @param testName name of the test case
16 | */
17 | public AppTest(String testName) {
18 | super(testName);
19 | }
20 |
21 | /**
22 | * @return the suite of tests being tested
23 | */
24 | public static Test suite() {
25 | return new TestSuite(AppTest.class);
26 | }
27 |
28 | /**
29 | * Rigorous Test :-)
30 | */
31 | public void testApp() {
32 | assertTrue(true);
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Aggregate.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Aggregate {
6 |
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "AggregateAction Test")
10 | val data = Array[(String, Int)](("A1", 1), ("A2", 2),
11 | ("B1", 3), ("B2", 4),
12 | ("C1", 5), ("C2", 6))
13 |
14 | val pairs = sc.parallelize(data, 3)
15 |
16 | // output:
17 | // (A1,1)(A2,2)
18 | // (B1,3)(B2,4)
19 | // (C1,5)(C2,6)
20 | pairs.foreach(print)
21 |
22 | val result = pairs.aggregate(("", 0))((U, T) => (U._1 + T._1, U._2 + T._2), (U, T) =>
23 | ("[" + U._1 + T._1 + "] ", U._2 + T._2))
24 |
25 | // output ([[[A1A2] B1B2] C1C2] ,21)
26 | println(result)
27 | }
28 | }
--------------------------------------------------------------------------------
/src/main/python/PythonALS.py:
--------------------------------------------------------------------------------
1 | from pyspark.mllib.recommendation import ALS
2 | from numpy import array
3 |
4 | # Load and parse the data; sc is an existing SparkContext (e.g. in the pyspark shell)
5 | data = sc.textFile("data/mllib/als/test.data")
6 | ratings = data.map(lambda line: array([float(x) for x in line.split(',')]))
7 |
8 | # Build the recommendation model using Alternating Least Squares
9 | rank = 10
10 | numIterations = 20
11 | model = ALS.train(ratings, rank, numIterations)
12 |
13 | # Evaluate the model on training data
14 | testdata = ratings.map(lambda p: (int(p[0]), int(p[1])))
15 | predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
16 | ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
17 | MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).reduce(lambda x, y: x + y)/ratesAndPreds.count()
18 | print("Mean Squared Error = " + str(MSE))
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/PartitionBy.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.{RangePartitioner,HashPartitioner, SparkContext}
4 |
5 | object PartitionBy {
6 | def main(args: Array[String]) {
7 |
8 | val sc = new SparkContext("local", "PartitionBy Test")
9 | val data1 = Array[(String, Int)](("K", 1), ("T", 2),
10 | ("T", 3), ("W", 4),
11 | ("W", 5), ("W", 6)
12 | )
13 | val pairs = sc.parallelize(data1, 3)
14 | //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
15 | //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
16 | var result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
17 | result = pairs.partitionBy(new HashPartitioner(2))
18 | result.foreach(println)
19 | }
20 | }
--------------------------------------------------------------------------------
/src/main/python/PythonSparkSQLByReflection.py:
--------------------------------------------------------------------------------
1 | # sc is an existing SparkContext.
2 | from pyspark.sql import SQLContext, Row
3 |
4 | sqlContext = SQLContext(sc)
5 |
6 | # Load a text file and convert each line to a Row.
7 | lines = sc.textFile("people.txt")
8 | parts = lines.map(lambda l: l.split(","))
9 | people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
10 |
11 | # Infer the schema, and register the DataFrame as a table.
12 | schemaPeople = sqlContext.inferSchema(people)
13 | schemaPeople.registerTempTable("people")
14 |
15 | # SQL can be run over DataFrames that have been registered as a table.
16 | teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
17 |
18 | # The results of SQL queries are RDDs and support all the normal RDD operations.
19 | teenNames = teenagers.map(lambda p: "Name: " + p.name)
20 | for teenName in teenNames.collect():
21 | print teenName
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Join.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object Join {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local[2]", "JoinAction Test")
10 |
11 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
12 | ("B1", 3), ("B2", 4),
13 | ("C1", 5), ("C1", 6)
14 | )
15 |
16 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
17 | ("B1", 9), ("C1", 0)
18 | )
19 | val pairs1 = sc.parallelize(data1, 3)
20 | val pairs2 = sc.parallelize(data2, 2)
21 |
22 |
23 | val result = pairs1.join(pairs2)
24 |
25 | // output:
26 | // (A1,(1,7))
27 | // (B1,(3,9))
28 | // (A2,(2,8))
29 | //
30 | // (C1,(5,0))
31 | // (C1,(6,0))
32 | result.foreach(println)
33 | }
34 |
35 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/ScalaWordCount.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | object ScalaWordCount {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext(new SparkConf().setAppName("ScalaWordCount"))
8 | val threshold = args(1).toInt
9 |
10 | // split each document into words
11 | val tokenized = sc.textFile(args(0)).flatMap(_.split(" "))
12 |
13 | // count the occurrence of each word
14 | val wordCounts = tokenized.map((_, 1)).reduceByKey(_ + _)
15 |
16 | // filter out words with fewer than threshold occurrences
17 | val filtered = wordCounts.filter(_._2 >= threshold)
18 |
19 | // count characters
20 | val charCounts = filtered.flatMap(_._1.toCharArray).map((_, 1)).reduceByKey(_ + _)
21 |
22 | System.out.println(charCounts.collect().mkString(", "))
23 | }
24 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/GroupWith.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.SparkContext._
5 |
6 | object GroupWith {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local[2]", "GroupWith Test")
10 |
11 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
12 | ("B1", 3), ("B2", 4),
13 | ("C1", 5), ("C1", 6)
14 | )
15 |
16 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
17 | ("B1", 9), ("C1", 0)
18 | )
19 | val pairs1 = sc.parallelize(data1, 3)
20 | val pairs2 = sc.parallelize(data2, 2)
21 |
22 | val result = pairs1.groupWith(pairs2) // groupWith is an alias for cogroup
23 | result.foreach(println)
24 |
25 | // output:
26 | // (B1,(ArrayBuffer(3),ArrayBuffer(9)))
27 | // (A1,(ArrayBuffer(1),ArrayBuffer(7)))
28 | // (A2,(ArrayBuffer(2),ArrayBuffer(8)))
29 | //
30 | // (C1,(ArrayBuffer(5, 6),ArrayBuffer(0)))
31 | // (B2,(ArrayBuffer(4),ArrayBuffer()))
32 |
33 |
34 | }
35 | }
--------------------------------------------------------------------------------
/src/main/python/PythonSparkSQLBySchema.py:
--------------------------------------------------------------------------------
1 | # Import SQLContext and data types
2 | from pyspark.sql import *
3 |
4 | # sc is an existing SparkContext.
5 | sqlContext = SQLContext(sc)
6 |
7 | # Load a text file and convert each line to a tuple.
8 | lines = sc.textFile("people.txt")
9 | parts = lines.map(lambda l: l.split(","))
10 | people = parts.map(lambda p: (p[0], p[1].strip()))
11 |
12 | # The schema is encoded in a string.
13 | schemaString = "name age"
14 |
15 | fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
16 | schema = StructType(fields)
17 |
18 | # Apply the schema to the RDD.
19 | schemaPeople = sqlContext.createDataFrame(people, schema)
20 |
21 | # Register the DataFrame as a table.
22 | schemaPeople.registerTempTable("people")
23 |
24 | # SQL can be run over DataFrames that have been registered as a table.
25 | results = sqlContext.sql("SELECT name FROM people")
26 |
27 | # The results of SQL queries are RDDs and support all the normal RDD operations.
28 | names = results.map(lambda p: "Name: " + p.name)
29 | for name in names.collect():
30 | print name
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/GroupByKey.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import java.util.Random
4 |
5 | import org.apache.spark.{SparkConf, SparkContext}
6 | import org.apache.spark.SparkContext._
7 |
8 | /**
9 |  * Groups randomly generated (Int, Array[Byte]) pairs; numMappers, numKVPairs, valSize and numReducers are hardcoded below rather than read from args.
10 |  */
11 | object GroupByKey {
12 | def main(args: Array[String]) {
13 | val sparkConf = new SparkConf().setAppName("GroupBy Test").setMaster("local[2]")
14 | var numMappers = 10
15 | var numKVPairs = 100
16 | var valSize = 100
17 | var numReducers = 3
18 |
19 | val sc = new SparkContext(sparkConf)
20 |
21 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
22 | val ranGen = new Random
23 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
24 | for (i <- 0 until numKVPairs) {
25 | val byteArr = new Array[Byte](valSize)
26 | ranGen.nextBytes(byteArr)
27 | arr1(i) = (ranGen.nextInt(10), byteArr)
28 | }
29 | arr1
30 | }.cache
31 | // Enforce that everything has been calculated and in cache
32 | pairs1.count
33 |
34 | val result = pairs1.groupByKey(numReducers)
35 | println(result.count)
36 | println(result.toDebugString)
37 |
38 | sc.stop()
39 | }
40 | }
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/LocalPi.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.math.random
21 |
22 | import org.apache.spark._
23 | import org.apache.spark.SparkContext._
24 |
25 | object LocalPi {
26 | def main(args: Array[String]) {
27 | var count = 0
28 | for (i <- 1 to 100000) {
29 | val x = random * 2 - 1
30 | val y = random * 2 - 1
31 | if (x*x + y*y < 1) count += 1
32 | }
33 | println("Pi is roughly " + 4 * count / 100000.0)
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/ExceptionHandlingTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.{SparkConf, SparkContext}
21 |
22 | object ExceptionHandlingTest {
23 | def main(args: Array[String]) {
24 | val sparkConf = new SparkConf().setAppName("ExceptionHandlingTest")
25 | val sc = new SparkContext(sparkConf)
26 | sc.parallelize(0 until sc.defaultParallelism).foreach { i =>
27 | if (math.random > 0.75) {
28 | throw new Exception("Testing exception handling")
29 | }
30 | }
31 |
32 | sc.stop()
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/Cartesian.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 |
5 | object Cartesian {
6 | def main(args: Array[String]) {
7 | val sc = new SparkContext("local", "Cartesian Test")
8 | val data1 = Array[(String, Int)](("A1", 1), ("A2", 2),
9 | ("B1", 3), ("B2", 4),
10 | ("C1", 5), ("C1", 6))
11 |
12 | val data2 = Array[(String, Int)](("A1", 7), ("A2", 8),
13 | ("B1", 9), ("C1", 0))
14 | val pairs1 = sc.parallelize(data1, 3)
15 | val pairs2 = sc.parallelize(data2, 2)
16 |
17 | val resultRDD = pairs1.cartesian(pairs2)
18 |
19 | resultRDD.foreach(println)
20 |
21 | /*
22 | * Output of task1:
23 | * ((A1,1),(A1,7))
24 | * ((A1,1),(A2,8))
25 | * ((A2,2),(A1,7))
26 | * ((A2,2),(A2,8))
27 | * Output of task2:
28 | * ((A1,1),(B1,9))
29 | * ((A1,1),(C1,0))
30 | * ((A2,2),(B1,9))
31 | * ((A2,2),(C1,0))
32 | * Output of task3:
33 | * ((B1,3),(A1,7))
34 | * ((B1,3),(A2,8))
35 | * ((B2,4),(A1,7))
36 | * ((B2,4),(A2,8))
37 | * Output of task4:
38 | * ((B1,3),(B1,9))
39 | * ((B1,3),(C1,0))
40 | * ((B2,4),(B1,9))
41 | * ((B2,4),(C1,0))
42 | * Output of task5:
43 | * ((C1,5),(A1,7))
44 | * ((C1,5),(A2,8))
45 | * ((C1,6),(A1,7))
46 | * ((C1,6),(A2,8))
47 | * Output of task6:
48 | * ((C1,5),(B1,9))
49 | * ((C1,5),(C1,0))
50 | * ((C1,6),(B1,9))
51 | * ((C1,6),(C1,0))
52 | */
53 |
54 | }
55 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/rdd/GroupByAction.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.rdd
2 |
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.RangePartitioner
5 |
6 | object GroupByAction {
7 | def main(args: Array[String]) {
8 |
9 | val sc = new SparkContext("local", "GroupByAction Test")
10 |
11 | val data = Array[(String, Int)](("A1", 1), ("A2", 2),
12 | ("B1", 6), ("A2", 4),
13 | ("B1", 3), ("B1", 5))
14 |
15 | val pairs = sc.parallelize(data, 3)
16 |
17 | // output:
18 | // (A1,1)
19 | // (A2,2)
20 | //
21 | // (B1,6)
22 | // (A2,4)
23 | //
24 | // (B1,3)
25 | // (B1,5)
26 | pairs.foreach(println)
27 |
28 | val result1 = pairs.groupBy(K => K._1)
29 | val result2 = pairs.groupBy((K : (String, Int)) => K._1, 1)
30 | val result3 = pairs.groupBy((K : (String, Int)) => K._1, new RangePartitioner(3, pairs))
31 |
32 | // output of result1:
33 | // (A1,ArrayBuffer((A1,1)))
34 | //
35 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
36 | // (A2,ArrayBuffer((A2,2), (A2,4)))
37 | result1.foreach(println)
38 |
39 | // output of result2:
40 | // (A1,ArrayBuffer((A1,1)))
41 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
42 | // (A2,ArrayBuffer((A2,2), (A2,4)))
43 | result2.foreach(println)
44 |
45 | // output of result3:
46 | // (A1,ArrayBuffer((A1,1)))
47 | // (A2,ArrayBuffer((A2,2), (A2,4)))
48 | //
49 | // (B1,ArrayBuffer((B1,6), (B1,3), (B1,5)))
50 | result3.foreach(println)
51 |
52 | }
53 |
54 | }
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/sparksql/ScalaSparkSQLByReflection.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.sparksql
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | object ScalaSparkSQLByReflection {
6 |
7 | // Define the schema using a case class.
8 | // Note: Case classes in Scala 2.10 can support only up to 22 fields. To work around this limit,
9 | // you can use custom classes that implement the Product interface.
10 | case class People(name: String, age: Int)
11 |
12 | def main(args: Array[String]) {
13 | val sc = new SparkContext(new SparkConf().setAppName("ScalaSparkSQL"))
14 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
15 |
16 | // this is used to implicitly convert an RDD to a DataFrame.
17 | import sqlContext.implicits._
18 |
19 | // Create an RDD of People objects and register it as a table.
20 | val people = sc.textFile("people.txt").map(_.split(",")).map(p => People(p(0), p(1).trim.toInt)).toDF()
21 | people.registerTempTable("people")
22 |
23 | // SQL statements can be run by using the sql methods provided by sqlContext.
24 | val teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
25 |
26 | // The results of SQL queries are DataFrames and support all the normal RDD operations.
27 | // The columns of a row in the result can be accessed by ordinal.
28 | teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
29 |
30 | people.saveAsParquetFile("people.parquet")
31 |
32 | val parquetFile = sqlContext.parquetFile("people.parquet")
33 | }
34 | }
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkPi.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.math.random
21 |
22 | import org.apache.spark._
23 |
24 | /** Computes an approximation to pi */
25 | object SparkPi {
26 | def main(args: Array[String]) {
27 | val conf = new SparkConf().setAppName("Spark Pi")
28 | val spark = new SparkContext(conf)
29 | val slices = if (args.length > 0) args(0).toInt else 2
30 | val n = math.min(100000L * slices, Int.MaxValue).toInt // avoid overflow
31 | val count = spark.parallelize(1 until n, slices).map { i =>
32 | val x = random * 2 - 1
33 | val y = random * 2 - 1
34 | if (x*x + y*y < 1) 1 else 0
35 | }.reduce(_ + _)
36 | println("Pi is roughly " + 4.0 * count / n)
37 | spark.stop()
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/HdfsTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark._
21 |
22 |
23 | object HdfsTest {
24 |
25 | /** Usage: HdfsTest [file] */
26 | def main(args: Array[String]) {
27 | if (args.length < 1) {
28 | System.err.println("Usage: HdfsTest <file>")
29 | System.exit(1)
30 | }
31 | val sparkConf = new SparkConf().setAppName("HdfsTest")
32 | val sc = new SparkContext(sparkConf)
33 | val file = sc.textFile(args(0))
34 | val mapped = file.map(s => s.length).cache()
35 | for (iter <- 1 to 10) {
36 | val start = System.currentTimeMillis()
37 | for (x <- mapped) { x + 2 }
38 | val end = System.currentTimeMillis()
39 | println("Iteration " + iter + " took " + (end-start) + " ms")
40 | }
41 | sc.stop()
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/sparksql/ScalaSparkSQLBySchema.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.sparksql
2 |
3 | import org.apache.spark.sql.types.{StringType, StructField, StructType}
4 | import org.apache.spark.{SparkConf, SparkContext}
5 |
6 | object ScalaSparkSQLBySchema {
7 |
8 | def main(args: Array[String]) {
9 | val sc = new SparkContext(new SparkConf().setAppName("ScalaSparkSQL"))
10 | val sqlContext = new org.apache.spark.sql.SQLContext(sc)
11 |
12 | // Create an RDD
13 | val people = sc.textFile("people.txt")
14 |
15 | // The schema is encoded in a string
16 | val schemaString = "name age"
17 |
18 | // Import Spark SQL data types and Row.
19 | import org.apache.spark.sql._
20 |
21 | // Generate the schema based on the string of schema
22 | val schema =
23 | StructType(
24 | schemaString.split(" ").map(fieldName => StructField(fieldName, StringType, true)))
25 |
26 | // Convert records of the RDD (people) to Rows.
27 | val rowRDD = people.map(_.split(",")).map(p => Row(p(0), p(1).trim))
28 |
29 | // Apply the schema to the RDD.
30 | val peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema)
31 |
32 | // Register the DataFrames as a table.
33 | peopleDataFrame.registerTempTable("people")
34 |
35 | // SQL statements can be run by using the sql methods provided by sqlContext.
36 | val results = sqlContext.sql("SELECT name FROM people")
37 |
38 | // The results of SQL queries are DataFrames and support all the normal RDD operations.
39 | // The columns of a row in the result can be accessed by ordinal.
40 | results.map(t => "Name: " + t(0)).collect().foreach(println)
41 | }
42 | }
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkTachyonPi.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.math.random
21 |
22 | import org.apache.spark._
23 | import org.apache.spark.storage.StorageLevel
24 |
25 | /**
26 | * Computes an approximation to pi
27 | * This example uses Tachyon to persist rdds during computation.
28 | */
29 | object SparkTachyonPi {
30 | def main(args: Array[String]) {
31 | val sparkConf = new SparkConf().setAppName("SparkTachyonPi")
32 | val spark = new SparkContext(sparkConf)
33 |
34 | val slices = if (args.length > 0) args(0).toInt else 2
35 | val n = 100000 * slices
36 |
37 | val rdd = spark.parallelize(1 to n, slices)
38 | rdd.persist(StorageLevel.OFF_HEAP)
39 | val count = rdd.map { i =>
40 | val x = random * 2 - 1
41 | val y = random * 2 - 1
42 | if (x * x + y * y < 1) 1 else 0
43 | }.reduce(_ + _)
44 | println("Pi is roughly " + 4.0 * count / n)
45 |
46 | spark.stop()
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/DriverSubmissionTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.collection.JavaConversions._
21 |
22 | import org.apache.spark.util.Utils
23 |
24 | /** Prints out environmental information, sleeps, and then exits. Made to
25 | * test driver submission in the standalone scheduler. */
26 | object DriverSubmissionTest {
27 | def main(args: Array[String]) {
28 | if (args.size < 1) {
29 | println("Usage: DriverSubmissionTest ")
30 | System.exit(0)
31 | }
32 | val numSecondsToSleep = args(0).toInt
33 |
34 | val env = System.getenv()
35 | val properties = Utils.getSystemProperties
36 |
37 | println("Environment variables containing SPARK_TEST:")
38 | env.filter{case (k, v) => k.contains("SPARK_TEST")}.foreach(println)
39 |
40 | println("System properties containing spark.test:")
41 | properties.filter{case (k, v) => k.toString.contains("spark.test")}.foreach(println)
42 |
43 | for (i <- 1 until numSecondsToSleep) {
44 | println(s"Alive for $i out of $numSecondsToSleep seconds")
45 | Thread.sleep(1000)
46 | }
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/MultiBroadcastTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.rdd.RDD
21 | import org.apache.spark.{SparkConf, SparkContext}
22 |
23 | /**
24 | * Usage: MultiBroadcastTest [slices] [numElem]
25 | */
26 | object MultiBroadcastTest {
27 | def main(args: Array[String]) {
28 |
29 | val sparkConf = new SparkConf().setAppName("Multi-Broadcast Test")
30 | val sc = new SparkContext(sparkConf)
31 |
32 | val slices = if (args.length > 0) args(0).toInt else 2
33 | val num = if (args.length > 1) args(1).toInt else 1000000
34 |
35 | val arr1 = new Array[Int](num)
36 | for (i <- 0 until arr1.length) {
37 | arr1(i) = i
38 | }
39 |
40 | val arr2 = new Array[Int](num)
41 | for (i <- 0 until arr2.length) {
42 | arr2(i) = i
43 | }
44 |
45 | val barr1 = sc.broadcast(arr1)
46 | val barr2 = sc.broadcast(arr2)
47 | val observedSizes: RDD[(Int, Int)] = sc.parallelize(1 to 10, slices).map { _ =>
48 | (barr1.value.size, barr2.value.size)
49 | }
50 | // Collect the small RDD so we can print the observed sizes locally.
51 | observedSizes.collect().foreach(i => println(i))
52 |
53 | sc.stop()
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/GroupByTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 |
25 | /**
26 | * Usage: GroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]
27 | */
28 | object GroupByTest {
29 | def main(args: Array[String]) {
30 | val sparkConf = new SparkConf().setAppName("GroupBy Test")
31 | var numMappers = if (args.length > 0) args(0).toInt else 2
32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000
33 | var valSize = if (args.length > 2) args(2).toInt else 1000
34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers
35 |
36 | val sc = new SparkContext(sparkConf)
37 |
38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
39 | val ranGen = new Random
40 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
41 | for (i <- 0 until numKVPairs) {
42 | val byteArr = new Array[Byte](valSize)
43 | ranGen.nextBytes(byteArr)
44 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
45 | }
46 | arr1
47 | }.cache()
48 | // Enforce that everything has been calculated and in cache
49 | pairs1.count()
50 |
51 | println(pairs1.groupByKey(numReducers).count())
52 |
53 | sc.stop()
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/BroadcastTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.{SparkConf, SparkContext}
21 |
22 | /**
23 | * Usage: BroadcastTest [slices] [numElem] [broadcastAlgo] [blockSize]
24 | */
25 | object BroadcastTest {
26 | def main(args: Array[String]) {
27 |
28 | val bcName = if (args.length > 2) args(2) else "Http"
29 | val blockSize = if (args.length > 3) args(3) else "4096"
30 |
31 | val sparkConf = new SparkConf().setAppName("Broadcast Test")
32 | .set("spark.broadcast.factory", s"org.apache.spark.broadcast.${bcName}BroadcastFactory")
33 | .set("spark.broadcast.blockSize", blockSize)
34 | val sc = new SparkContext(sparkConf)
35 |
36 | val slices = if (args.length > 0) args(0).toInt else 2
37 | val num = if (args.length > 1) args(1).toInt else 1000000
38 |
39 | val arr1 = (0 until num).toArray
40 |
41 | for (i <- 0 until 3) {
42 | println("Iteration " + i)
43 | println("===========")
44 | val startTime = System.nanoTime
45 | val barr1 = sc.broadcast(arr1)
46 | val observedSizes = sc.parallelize(1 to 10, slices).map(_ => barr1.value.size)
47 | // Collect the small RDD so we can print the observed sizes locally.
48 | observedSizes.collect().foreach(i => println(i))
49 | println("Iteration %d took %.0f milliseconds".format(i, (System.nanoTime - startTime) / 1E6))
50 | }
51 |
52 | sc.stop()
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SkewedGroupByTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 |
25 | /**
26 | * Usage: SkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers]
27 | */
28 | object SkewedGroupByTest {
29 | def main(args: Array[String]) {
30 | val sparkConf = new SparkConf().setAppName("GroupBy Test")
31 | var numMappers = if (args.length > 0) args(0).toInt else 2
32 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000
33 | var valSize = if (args.length > 2) args(2).toInt else 1000
34 | var numReducers = if (args.length > 3) args(3).toInt else numMappers
35 |
36 | val sc = new SparkContext(sparkConf)
37 |
38 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
39 | val ranGen = new Random
40 |
41 | // map output sizes linearly increase from the 1st to the last
42 | numKVPairs = (1.0 * (p + 1) / numMappers * numKVPairs).toInt
43 |
44 | var arr1 = new Array[(Int, Array[Byte])](numKVPairs)
45 | for (i <- 0 until numKVPairs) {
46 | val byteArr = new Array[Byte](valSize)
47 | ranGen.nextBytes(byteArr)
48 | arr1(i) = (ranGen.nextInt(Int.MaxValue), byteArr)
49 | }
50 | arr1
51 | }.cache()
52 | // Enforce that everything has been calculated and in cache
53 | pairs1.count()
54 |
55 | println(pairs1.groupByKey(numReducers).count())
56 |
57 | sc.stop()
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/scala/com/javachen/spark/examples/mllib/EvaluateResult.scala:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.mllib
2 |
3 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
4 | import org.apache.spark.rdd.RDD
5 |
6 | /**
7 | *
8 | * Created by june on 2015-05-27 09:13.
9 | */
10 | object EvaluateResult {
11 | def coverage(training: RDD[Rating],userRecommends:RDD[(Int, List[Int])])={
12 | userRecommends.flatMap(_._2).distinct().count.toDouble / training.map(_.product).distinct().count
13 | }
14 |
15 | def popularity(training: RDD[Rating],userRecommends:RDD[(Int, List[Int])])={
16 | var ret = 0.0
17 | var n=0
18 | val item_popularity=training.map{ case Rating(user, product, rate) =>
19 | (product,(user, rate))
20 | }.groupByKey(4).map{case (product,list)=>
21 | (product,list.size)
22 | }.collectAsMap()
23 |
24 | userRecommends.flatMap(_._2).collect().foreach { p =>
25 | ret = ret + math.log(1 + item_popularity.get(p).get)
26 | n = n + 1
27 | }
28 |
29 | ret/n
30 | }
31 |
32 | def recallAndPrecisionAndF1(training: RDD[Rating],userRecommends:RDD[(Int, List[Int])]):(Double, Double,Double) = {
33 | val usersProducts: RDD[(Int, Int)] = training.map { case Rating(user, product, rate) =>
34 | (user, product)
35 | }
36 |
37 | val groupData=userRecommends.join(usersProducts.groupByKey().map {case (k,v) => (k,v.toList)})
38 |
39 | val (hit, testNum, recNum) = groupData.map{ case (user, (mItems, tItems)) =>
40 | var count = 0
41 | // precision: number of recommended items that were hit / total number of recommended items; topN is the recommendation cap
42 | val precNum = mItems.length
43 | for (i <- 0 until precNum)
44 | if (tItems.contains(mItems(i)))
45 | count += 1
46 | (count, tItems.length, precNum) }.reduce( (t1, t2) => (t1._1 + t2._1, t1._2 + t2._2, t1._3 + t2._3) )
47 |
48 | val recall: Double = hit * 1.0 / testNum
49 | val precision: Double = hit * 1.0 / recNum
50 | val f1: Double = 2 * recall * precision / (recall + precision)
51 |
52 | println(s"$hit,$testNum,$recNum")
53 | (recall,precision,f1)
54 | }
55 |
56 | def recallAndPrecision(test:RDD[Rating],result:RDD[Rating]):Double = {
57 | val numHit: Long = result.intersection(test).count
58 | val recall: Double = numHit * 1.0 / test.count
59 | val precision: Double = numHit * 1.0 / result.count
60 | val f1: Double = 2 * recall * precision / (recall + precision)
61 | System.out.println("recall : " + recall + "\nprecision : " + precision + "\nf1 : " + f1)
62 | f1
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
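
EvaluateResult above only defines the metric helpers. A hypothetical usage sketch follows; the toy `training` ratings and `userRecommends` lists are invented for illustration and are not part of the repo:

    package com.javachen.spark.examples.mllib

    import org.apache.spark.mllib.recommendation.Rating
    import org.apache.spark.{SparkConf, SparkContext}

    // illustrative sketch only: wire the three metric helpers together on toy data
    object EvaluateResultSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext(new SparkConf().setAppName("EvaluateResultSketch").setMaster("local"))

        // (user, product, rating) triples seen in training
        val training = sc.parallelize(Seq(Rating(1, 10, 5.0), Rating(1, 11, 3.0), Rating(2, 10, 4.0)))
        // per-user recommendation lists; every recommended product also appears in training,
        // which EvaluateResult.popularity requires (it looks each item up in the training counts)
        val userRecommends = sc.parallelize(Seq((1, List(10, 11)), (2, List(10))))

        println("coverage   = " + EvaluateResult.coverage(training, userRecommends))
        println("popularity = " + EvaluateResult.popularity(training, userRecommends))
        println("(recall, precision, f1) = " + EvaluateResult.recallAndPrecisionAndF1(training, userRecommends))

        sc.stop()
      }
    }
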
/src/main/java/com/javachen/spark/examples/sparksql/JavaSparkSQLBySchema.java:
--------------------------------------------------------------------------------
1 | //package com.javachen.spark.examples.sparksql;
2 | //
3 | //import org.apache.spark.SparkConf;
4 | //import org.apache.spark.api.java.JavaRDD;
5 | //import org.apache.spark.api.java.JavaSparkContext;
6 | //import org.apache.spark.api.java.function.Function;
7 | //import org.apache.spark.sql.DataFrame;
8 | //import org.apache.spark.sql.Row;
9 | //import org.apache.spark.sql.SQLContext;
10 | //
11 | //import java.util.List;
12 | //
13 | //public class JavaSparkSQLBySchema {
14 | // public static void main(String[] args) throws Exception {
15 | // SparkConf sparkConf = new SparkConf().setAppName("JavaSparkSQLBySchema");
16 | // JavaSparkContext ctx = new JavaSparkContext(sparkConf);
17 | // SQLContext sqlContext = new SQLContext(ctx);
18 | //
19 | // // Load a text file and convert each line to a JavaBean.
20 | // JavaRDD people = ctx.textFile("people.txt");
21 | //
22 | // // The schema is encoded in a string
23 | // String schemaString = "name age";
24 | //
25 | // // Generate the schema based on the string of schema
26 | // List fields = new ArrayList();
27 | // for (String fieldName : schemaString.split(" ")) {
28 | // fields.add(DataType.createStructField(fieldName, DataType.StringType, true));
29 | // }
30 | // StructType schema = DataType.createStructType(fields);
31 | //
32 | // // Convert records of the RDD (people) to Rows.
33 | // JavaRDD rowRDD = people.map(
34 | // new Function() {
35 | // public Row call(String record) throws Exception {
36 | // String[] fields = record.split(",");
37 | // return Row.create(fields[0], fields[1].trim());
38 | // }
39 | // });
40 | //
41 | // // Apply the schema to the RDD.
42 | // DataFrame peopleDataFrame = sqlContext.createDataFrame(rowRDD, schema);
43 | //
44 | // // Register the DataFrame as a table.
45 | // peopleDataFrame.registerTempTable("people");
46 | //
47 | // // SQL can be run over RDDs that have been registered as tables.
48 | // DataFrame results = sqlContext.sql("SELECT name FROM people");
49 | //
50 | // // The results of SQL queries are DataFrames and support all the normal RDD operations.
51 | // // The columns of a row in the result can be accessed by ordinal.
52 | // List names = results.map(new Function() {
53 | // public String call(Row row) {
54 | // return "Name: " + row.getString(0);
55 | // }
56 | // }).collect();
57 | // }
58 | //
59 | //}
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkTC.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import scala.util.Random
21 | import scala.collection.mutable
22 |
23 | import org.apache.spark.{SparkConf, SparkContext}
24 | import org.apache.spark.SparkContext._
25 |
26 | /**
27 | * Transitive closure on a graph.
28 | */
29 | object SparkTC {
30 | val numEdges = 200
31 | val numVertices = 100
32 | val rand = new Random(42)
33 |
34 | def generateGraph = {
35 | val edges: mutable.Set[(Int, Int)] = mutable.Set.empty
36 | while (edges.size < numEdges) {
37 | val from = rand.nextInt(numVertices)
38 | val to = rand.nextInt(numVertices)
39 | if (from != to) edges.+=((from, to))
40 | }
41 | edges.toSeq
42 | }
43 |
44 | def main(args: Array[String]) {
45 | val sparkConf = new SparkConf().setAppName("SparkTC")
46 | val spark = new SparkContext(sparkConf)
47 | val slices = if (args.length > 0) args(0).toInt else 2
48 | var tc = spark.parallelize(generateGraph, slices).cache()
49 |
50 | // Linear transitive closure: each round grows paths by one edge,
51 | // by joining the graph's edges with the already-discovered paths.
52 | // e.g. join the path (y, z) from the TC with the edge (x, y) from
53 | // the graph to obtain the path (x, z).
54 |
55 | // Because join() joins on keys, the edges are stored in reversed order.
56 | val edges = tc.map(x => (x._2, x._1))
57 |
58 | // This join is iterated until a fixed point is reached.
59 | var oldCount = 0L
60 | var nextCount = tc.count()
61 | do {
62 | oldCount = nextCount
63 | // Perform the join, obtaining an RDD of (y, (z, x)) pairs,
64 | // then project the result to obtain the new (x, z) paths.
65 | tc = tc.union(tc.join(edges).map(x => (x._2._2, x._2._1))).distinct().cache()
66 | nextCount = tc.count()
67 | } while (nextCount != oldCount)
68 |
69 | println("TC has " + tc.count() + " edges.")
70 | spark.stop()
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
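SparkTC grows the closure by repeatedly joining the already-discovered paths with the reversed edge set until the pair count stops changing. A tiny non-Spark sketch of the same fixed-point idea on an in-memory Set may make the join step easier to follow (the function name transitiveClosure is illustrative):

// Repeatedly extend paths by one edge until no new (x, z) pair appears.
def transitiveClosure(edges: Set[(Int, Int)]): Set[(Int, Int)] = {
  var tc = edges
  var changed = true
  while (changed) {
    // "Join" (x, y) with (y, z) to obtain the new path (x, z).
    val newPaths = for ((x, y) <- tc; (y2, z) <- edges if y == y2) yield (x, z)
    val next = tc ++ newPaths
    changed = next.size != tc.size
    tc = next
  }
  tc
}

// Example: 1 -> 2 -> 3 also yields (1, 3).
// transitiveClosure(Set((1, 2), (2, 3))) == Set((1, 2), (2, 3), (1, 3))
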
/src/main/scala/org/apache/spark/examples/LocalFileLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import breeze.linalg.{Vector, DenseVector}
23 |
24 | /**
25 | * Logistic regression based classification.
26 | *
27 | * This is an example implementation for learning how to use Spark. For more conventional use,
28 | * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
29 | * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
30 | */
31 | object LocalFileLR {
32 | val D = 10 // Number of dimensions
33 | val rand = new Random(42)
34 |
35 | case class DataPoint(x: Vector[Double], y: Double)
36 |
37 | def parsePoint(line: String): DataPoint = {
38 | val nums = line.split(' ').map(_.toDouble)
39 | DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
40 | }
41 |
42 | def showWarning() {
43 | System.err.println(
44 | """WARN: This is a naive implementation of Logistic Regression and is given as an example!
45 | |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
46 | |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
47 | |for more conventional use.
48 | """.stripMargin)
49 | }
50 |
51 | def main(args: Array[String]) {
52 |
53 | showWarning()
54 |
55 | val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
56 | val points = lines.map(parsePoint _)
57 | val ITERATIONS = args(1).toInt
58 |
59 | // Initialize w to a random value
60 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
61 | println("Initial w: " + w)
62 |
63 | for (i <- 1 to ITERATIONS) {
64 | println("On iteration " + i)
65 | var gradient = DenseVector.zeros[Double](D)
66 | for (p <- points) {
67 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
68 | gradient += p.x * scale
69 | }
70 | w -= gradient
71 | }
72 |
73 | println("Final w: " + w)
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
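LocalFileLR expects every input line to hold the label followed by D feature values, all space-separated. The helper below is a hypothetical way to generate such a file for experimentation; it is not part of the example, and the file name lr-input.txt is arbitrary.

import java.io.PrintWriter
import scala.util.Random

object MakeLRInput {
  def main(args: Array[String]): Unit = {
    val D = 10
    val rand = new Random(42)
    val out = new PrintWriter("lr-input.txt")
    for (i <- 0 until 100) {
      val y = if (i % 2 == 0) -1.0 else 1.0                // label first
      val x = Array.fill(D)(rand.nextGaussian() + y * 0.7) // then D feature values
      out.println((y +: x).mkString(" "))
    }
    out.close()
  }
}

// The example can then be run with, e.g., the arguments: lr-input.txt 5
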
/src/main/scala/org/apache/spark/examples/SparkPageRank.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import org.apache.spark.SparkContext._
21 | import org.apache.spark.{SparkConf, SparkContext}
22 |
23 | /**
24 | * Computes the PageRank of URLs from an input file. Input file should
25 | * be in format of:
26 | * URL neighbor URL
27 | * URL neighbor URL
28 | * URL neighbor URL
29 | * ...
30 | * where URLs and their neighbors are separated by space(s).
31 | *
32 | * This is an example implementation for learning how to use Spark. For more conventional use,
33 | * please refer to org.apache.spark.graphx.lib.PageRank
34 | */
35 | object SparkPageRank {
36 |
37 | def showWarning() {
38 | System.err.println(
39 | """WARN: This is a naive implementation of PageRank and is given as an example!
40 | |Please use the PageRank implementation found in org.apache.spark.graphx.lib.PageRank
41 | |for more conventional use.
42 | """.stripMargin)
43 | }
44 |
45 | def main(args: Array[String]) {
46 | if (args.length < 1) {
47 | System.err.println("Usage: SparkPageRank <file> <iter>")
48 | System.exit(1)
49 | }
50 |
51 | showWarning()
52 |
53 | val sparkConf = new SparkConf().setAppName("PageRank")
54 | val iters = if (args.length > 1) args(1).toInt else 10
55 | val ctx = new SparkContext(sparkConf)
56 | val lines = ctx.textFile(args(0), 1)
57 | val links = lines.map{ s =>
58 | val parts = s.split("\\s+")
59 | (parts(0), parts(1))
60 | }.distinct().groupByKey().cache()
61 | var ranks = links.mapValues(v => 1.0)
62 |
63 | for (i <- 1 to iters) {
64 | val contribs = links.join(ranks).values.flatMap{ case (urls, rank) =>
65 | val size = urls.size
66 | urls.map(url => (url, rank / size))
67 | }
68 | ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _)
69 | }
70 |
71 | val output = ranks.collect()
72 | output.foreach(tup => println(tup._1 + " has rank: " + tup._2 + "."))
73 |
74 | ctx.stop()
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
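The loop above is the classic damped PageRank update: each page splits its current rank evenly across its out-links, and every page's new rank is 0.15 plus 0.85 times the sum of what it receives. A small local sketch of one such iteration on plain Scala maps (the graph A->B, A->C, B->C, C->A is chosen only for illustration):

val links = Map("A" -> Seq("B", "C"), "B" -> Seq("C"), "C" -> Seq("A"))
var ranks: Map[String, Double] = links.map { case (url, _) => url -> 1.0 }

// Each page sends rank / outDegree to every neighbour...
val contribs = links.toSeq.flatMap { case (url, outs) =>
  outs.map(dest => dest -> ranks(url) / outs.size)
}
// ...and the received contributions are summed and damped.
ranks = contribs.groupBy(_._1).map { case (url, cs) =>
  url -> (0.15 + 0.85 * cs.map(_._2).sum)
}
// After one iteration: A -> 1.0, B -> 0.575, C -> 1.425
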
/src/main/scala/org/apache/spark/examples/LocalLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import breeze.linalg.{Vector, DenseVector}
23 |
24 | /**
25 | * Logistic regression based classification.
26 | *
27 | * This is an example implementation for learning how to use Spark. For more conventional use,
28 | * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
29 | * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
30 | */
31 | object LocalLR {
32 | val N = 10000 // Number of data points
33 | val D = 10 // Number of dimensions
34 | val R = 0.7 // Scaling factor
35 | val ITERATIONS = 5
36 | val rand = new Random(42)
37 |
38 | case class DataPoint(x: Vector[Double], y: Double)
39 |
40 | def generateData = {
41 | def generatePoint(i: Int) = {
42 | val y = if(i % 2 == 0) -1 else 1
43 | val x = DenseVector.fill(D){rand.nextGaussian + y * R}
44 | DataPoint(x, y)
45 | }
46 | Array.tabulate(N)(generatePoint)
47 | }
48 |
49 | def showWarning() {
50 | System.err.println(
51 | """WARN: This is a naive implementation of Logistic Regression and is given as an example!
52 | |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
53 | |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
54 | |for more conventional use.
55 | """.stripMargin)
56 | }
57 |
58 | def main(args: Array[String]) {
59 |
60 | showWarning()
61 |
62 | val data = generateData
63 | // Initialize w to a random value
64 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
65 | println("Initial w: " + w)
66 |
67 | for (i <- 1 to ITERATIONS) {
68 | println("On iteration " + i)
69 | var gradient = DenseVector.zeros[Double](D)
70 | for (p <- data) {
71 | val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
72 | gradient += p.x * scale
73 | }
74 | w -= gradient
75 | }
76 |
77 | println("Final w: " + w)
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
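The inner loop shared by these LR examples is batch gradient descent on the logistic loss log(1 + exp(-y * w.x)): each point contributes x * (1 / (1 + exp(-y * w.x)) - 1) * y to the gradient. A single-point sketch of that update with the same Breeze types (the numbers are chosen only for illustration):

import breeze.linalg.DenseVector
import scala.math.exp

val w = DenseVector(0.5, -0.25)
val x = DenseVector(1.0, 2.0)
val y = 1.0

val margin = y * w.dot(x)                    // 0.0 for these numbers
val scale = (1 / (1 + exp(-margin)) - 1) * y // -0.5
val gradient = x * scale                     // this point's gradient contribution
val wNext = w - gradient                     // one descent step (step size 1)
// wNext == DenseVector(1.0, 0.75)
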
/src/main/scala/org/apache/spark/examples/SimpleSkewedGroupByTest.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 |
25 | /**
26 | * Usage: SimpleSkewedGroupByTest [numMappers] [numKVPairs] [valSize] [numReducers] [ratio]
27 | */
28 | object SimpleSkewedGroupByTest {
29 | def main(args: Array[String]) {
30 |
31 | val sparkConf = new SparkConf().setAppName("SimpleSkewedGroupByTest")
32 | var numMappers = if (args.length > 0) args(0).toInt else 2
33 | var numKVPairs = if (args.length > 1) args(1).toInt else 1000
34 | var valSize = if (args.length > 2) args(2).toInt else 1000
35 | var numReducers = if (args.length > 3) args(3).toInt else numMappers
36 | var ratio = if (args.length > 4) args(4).toInt else 5.0
37 |
38 | val sc = new SparkContext(sparkConf)
39 |
40 | val pairs1 = sc.parallelize(0 until numMappers, numMappers).flatMap { p =>
41 | val ranGen = new Random
42 | var result = new Array[(Int, Array[Byte])](numKVPairs)
43 | for (i <- 0 until numKVPairs) {
44 | val byteArr = new Array[Byte](valSize)
45 | ranGen.nextBytes(byteArr)
46 | val offset = ranGen.nextInt(1000) * numReducers
47 | if (ranGen.nextDouble < ratio / (numReducers + ratio - 1)) {
48 | // give ratio times higher chance of generating key 0 (for reducer 0)
49 | result(i) = (offset, byteArr)
50 | } else {
51 | // generate a key for one of the other reducers
52 | val key = 1 + ranGen.nextInt(numReducers-1) + offset
53 | result(i) = (key, byteArr)
54 | }
55 | }
56 | result
57 | }.cache
58 | // Enforce that everything has been calculated and is in cache
59 | pairs1.count
60 |
61 | println("RESULT: " + pairs1.groupByKey(numReducers).count)
62 | // Print how many keys each reducer got (for debugging)
63 | // println("RESULT: " + pairs1.groupByKey(numReducers)
64 | // .map{case (k,v) => (k, v.size)}
65 | // .collectAsMap)
66 |
67 | sc.stop()
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
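The branch on ranGen.nextDouble is what creates the skew: with probability ratio / (numReducers + ratio - 1) a pair gets a key destined for reducer 0, and the remaining probability is shared evenly by the other reducers. With the defaults above that works out roughly as follows (a plain calculation, not part of the example):

val numReducers = 2
val ratio = 5.0
val pReducer0 = ratio / (numReducers + ratio - 1)    // 5 / 6 ≈ 0.833
val pEachOther = (1 - pReducer0) / (numReducers - 1) // ≈ 0.167 per remaining reducer
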
/src/main/scala/org/apache/spark/examples/SparkLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import scala.math.exp
23 |
24 | import breeze.linalg.{Vector, DenseVector}
25 |
26 | import org.apache.spark._
27 |
28 | /**
29 | * Logistic regression based classification.
30 | * Usage: SparkLR [slices]
31 | *
32 | * This is an example implementation for learning how to use Spark. For more conventional use,
33 | * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
34 | * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
35 | */
36 | object SparkLR {
37 | val N = 10000 // Number of data points
38 | val D = 10 // Number of dimensions
39 | val R = 0.7 // Scaling factor
40 | val ITERATIONS = 5
41 | val rand = new Random(42)
42 |
43 | case class DataPoint(x: Vector[Double], y: Double)
44 |
45 | def generateData = {
46 | def generatePoint(i: Int) = {
47 | val y = if(i % 2 == 0) -1 else 1
48 | val x = DenseVector.fill(D){rand.nextGaussian + y * R}
49 | DataPoint(x, y)
50 | }
51 | Array.tabulate(N)(generatePoint)
52 | }
53 |
54 | def showWarning() {
55 | System.err.println(
56 | """WARN: This is a naive implementation of Logistic Regression and is given as an example!
57 | |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
58 | |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
59 | |for more conventional use.
60 | """.stripMargin)
61 | }
62 |
63 | def main(args: Array[String]) {
64 |
65 | showWarning()
66 |
67 | val sparkConf = new SparkConf().setAppName("SparkLR")
68 | val sc = new SparkContext(sparkConf)
69 | val numSlices = if (args.length > 0) args(0).toInt else 2
70 | val points = sc.parallelize(generateData, numSlices).cache()
71 |
72 | // Initialize w to a random value
73 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
74 | println("Initial w: " + w)
75 |
76 | for (i <- 1 to ITERATIONS) {
77 | println("On iteration " + i)
78 | val gradient = points.map { p =>
79 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
80 | }.reduce(_ + _)
81 | w -= gradient
82 | }
83 |
84 | println("Final w: " + w)
85 |
86 | sc.stop()
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
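In SparkLR the current weight vector w is captured in the task closure and shipped with every iteration's tasks. For a larger D one might instead broadcast it explicitly; below is a hedged sketch of that variant, reusing sc, points, rand, D and ITERATIONS from the example above.

import breeze.linalg.DenseVector
import scala.math.exp

var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
for (i <- 1 to ITERATIONS) {
  val bw = sc.broadcast(w)                 // ship w to the executors once per iteration
  val gradient = points.map { p =>
    p.x * (1 / (1 + exp(-p.y * bw.value.dot(p.x))) - 1) * p.y
  }.reduce(_ + _)
  bw.unpersist()                           // drop the executor-side copies
  w -= gradient
}
println("Final w: " + w)
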
/src/main/scala/org/apache/spark/examples/SparkTachyonHdfsLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import scala.math.exp
23 |
24 | import breeze.linalg.{Vector, DenseVector}
25 | import org.apache.hadoop.conf.Configuration
26 |
27 | import org.apache.spark._
28 | import org.apache.spark.scheduler.InputFormatInfo
29 | import org.apache.spark.storage.StorageLevel
30 |
31 |
32 | /**
33 | * Logistic regression based classification.
34 | * This example uses Tachyon to persist RDDs during computation.
35 | *
36 | * This is an example implementation for learning how to use Spark. For more conventional use,
37 | * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
38 | * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
39 | */
40 | object SparkTachyonHdfsLR {
41 | val D = 10 // Number of dimensions
42 | val rand = new Random(42)
43 |
44 | def showWarning() {
45 | System.err.println(
46 | """WARN: This is a naive implementation of Logistic Regression and is given as an example!
47 | |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
48 | |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
49 | |for more conventional use.
50 | """.stripMargin)
51 | }
52 |
53 | case class DataPoint(x: Vector[Double], y: Double)
54 |
55 | def parsePoint(line: String): DataPoint = {
56 | val tok = new java.util.StringTokenizer(line, " ")
57 | var y = tok.nextToken.toDouble
58 | var x = new Array[Double](D)
59 | var i = 0
60 | while (i < D) {
61 | x(i) = tok.nextToken.toDouble; i += 1
62 | }
63 | DataPoint(new DenseVector(x), y)
64 | }
65 |
66 | def main(args: Array[String]) {
67 |
68 | showWarning()
69 |
70 | val inputPath = args(0)
71 | val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
72 | val conf = new Configuration()
73 | val sc = new SparkContext(sparkConf,
74 | InputFormatInfo.computePreferredLocations(
75 | Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
76 | ))
77 | val lines = sc.textFile(inputPath)
78 | val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
79 | val ITERATIONS = args(1).toInt
80 |
81 | // Initialize w to a random value
82 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
83 | println("Initial w: " + w)
84 |
85 | for (i <- 1 to ITERATIONS) {
86 | println("On iteration " + i)
87 | val gradient = points.map { p =>
88 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
89 | }.reduce(_ + _)
90 | w -= gradient
91 | }
92 |
93 | println("Final w: " + w)
94 | sc.stop()
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/src/main/scala/org/apache/spark/examples/SparkKMeans.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import breeze.linalg.{Vector, DenseVector, squaredDistance}
21 |
22 | import org.apache.spark.{SparkConf, SparkContext}
23 | import org.apache.spark.SparkContext._
24 |
25 | /**
26 | * K-means clustering.
27 | *
28 | * This is an example implementation for learning how to use Spark. For more conventional use,
29 | * please refer to org.apache.spark.mllib.clustering.KMeans
30 | */
31 | object SparkKMeans {
32 |
33 | def parseVector(line: String): Vector[Double] = {
34 | DenseVector(line.split(' ').map(_.toDouble))
35 | }
36 |
37 | def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = {
38 | var bestIndex = 0
39 | var closest = Double.PositiveInfinity
40 |
41 | for (i <- 0 until centers.length) {
42 | val tempDist = squaredDistance(p, centers(i))
43 | if (tempDist < closest) {
44 | closest = tempDist
45 | bestIndex = i
46 | }
47 | }
48 |
49 | bestIndex
50 | }
51 |
52 | def showWarning() {
53 | System.err.println(
54 | """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
55 | |Please use the KMeans method found in org.apache.spark.mllib.clustering
56 | |for more conventional use.
57 | """.stripMargin)
58 | }
59 |
60 | def main(args: Array[String]) {
61 |
62 | if (args.length < 3) {
63 | System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")
64 | System.exit(1)
65 | }
66 |
67 | showWarning()
68 |
69 | val sparkConf = new SparkConf().setAppName("SparkKMeans")
70 | val sc = new SparkContext(sparkConf)
71 | val lines = sc.textFile(args(0))
72 | val data = lines.map(parseVector _).cache()
73 | val K = args(1).toInt
74 | val convergeDist = args(2).toDouble
75 |
76 | val kPoints = data.takeSample(withReplacement = false, K, 42).toArray
77 | var tempDist = 1.0
78 |
79 | while(tempDist > convergeDist) {
80 | val closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))
81 |
82 | val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)}
83 |
84 | val newPoints = pointStats.map {pair =>
85 | (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap()
86 |
87 | tempDist = 0.0
88 | for (i <- 0 until K) {
89 | tempDist += squaredDistance(kPoints(i), newPoints(i))
90 | }
91 |
92 | for (newP <- newPoints) {
93 | kPoints(newP._1) = newP._2
94 | }
95 | println("Finished iteration (delta = " + tempDist + ")")
96 | }
97 |
98 | println("Final centers:")
99 | kPoints.foreach(println)
100 | sc.stop()
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
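Each pass of the while loop above assigns every point to its nearest center (map), sums the points and counts per center (reduceByKey), and recomputes the centers as the means. The same iteration on plain Scala collections with toy data may help when reading the Spark version (all values are illustrative):

import breeze.linalg.{DenseVector, Vector, squaredDistance}

val points: Seq[Vector[Double]] = Seq(
  DenseVector(0.0, 0.0), DenseVector(0.0, 1.0),
  DenseVector(5.0, 5.0), DenseVector(6.0, 5.0))
val centers: Array[Vector[Double]] = Array(DenseVector(0.0, 0.5), DenseVector(5.5, 5.0))

def closest(p: Vector[Double]): Int =
  centers.indices.minBy(i => squaredDistance(p, centers(i)))

// Assign each point to its nearest center, then recompute each center as the mean.
val newCenters: Map[Int, Vector[Double]] = points.groupBy(closest).map { case (i, ps) =>
  i -> (ps.reduce(_ + _) * (1.0 / ps.size))
}
// newCenters(0) ≈ (0.0, 0.5), newCenters(1) ≈ (5.5, 5.0) for this toy data
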
/src/main/scala/org/apache/spark/examples/SparkHdfsLR.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed to the Apache Software Foundation (ASF) under one or more
3 | * contributor license agreements. See the NOTICE file distributed with
4 | * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0
6 | * (the "License"); you may not use this file except in compliance with
7 | * the License. You may obtain a copy of the License at
8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0
10 | *
11 | * Unless required by applicable law or agreed to in writing, software
12 | * distributed under the License is distributed on an "AS IS" BASIS,
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | * See the License for the specific language governing permissions and
15 | * limitations under the License.
16 | */
17 |
18 | package org.apache.spark.examples
19 |
20 | import java.util.Random
21 |
22 | import scala.math.exp
23 |
24 | import breeze.linalg.{Vector, DenseVector}
25 | import org.apache.hadoop.conf.Configuration
26 |
27 | import org.apache.spark._
28 | import org.apache.spark.scheduler.InputFormatInfo
29 |
30 |
31 | /**
32 | * Logistic regression based classification.
33 | *
34 | * This is an example implementation for learning how to use Spark. For more conventional use,
35 | * please refer to either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
36 | * org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS based on your needs.
37 | */
38 | object SparkHdfsLR {
39 | val D = 10 // Number of dimensions
40 | val rand = new Random(42)
41 |
42 | case class DataPoint(x: Vector[Double], y: Double)
43 |
44 | def parsePoint(line: String): DataPoint = {
45 | val tok = new java.util.StringTokenizer(line, " ")
46 | var y = tok.nextToken.toDouble
47 | var x = new Array[Double](D)
48 | var i = 0
49 | while (i < D) {
50 | x(i) = tok.nextToken.toDouble; i += 1
51 | }
52 | DataPoint(new DenseVector(x), y)
53 | }
54 |
55 | def showWarning() {
56 | System.err.println(
57 | """WARN: This is a naive implementation of Logistic Regression and is given as an example!
58 | |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
59 | |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
60 | |for more conventional use.
61 | """.stripMargin)
62 | }
63 |
64 | def main(args: Array[String]) {
65 |
66 | if (args.length < 2) {
67 | System.err.println("Usage: SparkHdfsLR <file> <iters>")
68 | System.exit(1)
69 | }
70 |
71 | showWarning()
72 |
73 | val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
74 | val inputPath = args(0)
75 | val conf = new Configuration()
76 | val sc = new SparkContext(sparkConf,
77 | InputFormatInfo.computePreferredLocations(
78 | Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
79 | ))
80 | val lines = sc.textFile(inputPath)
81 | val points = lines.map(parsePoint _).cache()
82 | val ITERATIONS = args(1).toInt
83 |
84 | // Initialize w to a random value
85 | var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
86 | println("Initial w: " + w)
87 |
88 | for (i <- 1 to ITERATIONS) {
89 | println("On iteration " + i)
90 | val gradient = points.map { p =>
91 | p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
92 | }.reduce(_ + _)
93 | w -= gradient
94 | }
95 |
96 | println("Final w: " + w)
97 | sc.stop()
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/src/main/java/com/javachen/spark/examples/mllib/JavaALS.java:
--------------------------------------------------------------------------------
1 | package com.javachen.spark.examples.mllib;
2 |
3 | import org.apache.spark.SparkConf;
4 | import org.apache.spark.api.java.JavaDoubleRDD;
5 | import org.apache.spark.api.java.JavaPairRDD;
6 | import org.apache.spark.api.java.JavaRDD;
7 | import org.apache.spark.api.java.JavaSparkContext;
8 | import org.apache.spark.api.java.function.Function;
9 | import org.apache.spark.mllib.recommendation.ALS;
10 | import org.apache.spark.mllib.recommendation.MatrixFactorizationModel;
11 | import org.apache.spark.mllib.recommendation.Rating;
12 | import scala.Tuple2;
13 |
14 | public class JavaALS {
15 | public static void main(String[] args) {
16 | SparkConf conf = new SparkConf().setAppName("Java Collaborative Filtering Example");
17 | JavaSparkContext sc = new JavaSparkContext(conf);
18 |
19 | // Load and parse the data
20 | String path = "data/mllib/als/test.data";
21 | JavaRDD<String> data = sc.textFile(path);
22 | JavaRDD<Rating> ratings = data.map(
23 | new Function<String, Rating>() {
24 | public Rating call(String s) {
25 | String[] sarray = s.split(",");
26 | return new Rating(Integer.parseInt(sarray[0]), Integer.parseInt(sarray[1]),
27 | Double.parseDouble(sarray[2]));
28 | }
29 | }
30 | );
31 |
32 | // Build the recommendation model using ALS
33 | int rank = 10;
34 | int numIterations = 20;
35 | MatrixFactorizationModel model = ALS.train(JavaRDD.toRDD(ratings), rank, numIterations, 0.01);
36 |
37 | // Evaluate the model on rating data
38 | JavaRDD<Tuple2<Object, Object>> userProducts = ratings.map(
39 | new Function<Rating, Tuple2<Object, Object>>() {
40 | public Tuple2