├── .gitignore
├── Readme.md
├── pom.xml
└── src
└── main
└── scala
└── simpleexample
├── SparkFileExample.scala
├── SparkKafkaExample.scala
└── SparkStreamingExample.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.iml
3 | target
4 |
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
1 | # Spark Streaming Examples
2 |
3 |
4 | ### Links
5 |
6 | * http://spark.apache.org/docs/latest/streaming-kafka-integration.html
7 | * http://stackoverflow.com/questions/22338025/kafka-consumers-in-spark-streaming-parallel-consumption-in-worker-nodes
8 | * http://stackoverflow.com/questions/22132968/run-spark-kafka-wordcount-java-example-without-run-example-script
9 | * http://spark.apache.org/docs/latest/streaming-programming-guide.html
10 | * https://issues.apache.org/jira/browse/SPARK-944
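11 | 
12 | ### Build & Run
13 | 
14 | The project builds a jar with Maven (e.g. `mvn clean package`). The examples can then be submitted with `spark-submit`; the command below is the one given in the comment at the top of `SparkFileExample.scala` (the resource settings and the `/spark_log` input path are just that example's values):
15 | 
16 |     spark-submit --master yarn-client \
17 |       --num-executors 2 \
18 |       --driver-memory 512m \
19 |       --executor-memory 512m \
20 |       --executor-cores 1 \
21 |       --class simpleexample.SparkFileExample \
22 |       spark-streaming-simple-example-0.1-SNAPSHOT.jar /spark_log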
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 | 
6 |   <groupId>spark-streaming-simple-example</groupId>
7 |   <artifactId>spark-streaming-simple-example</artifactId>
8 |   <version>0.1-SNAPSHOT</version>
9 |   <packaging>jar</packaging>
10 | 
11 |   <name>Spark Streaming Examples</name>
12 | 
13 |   <properties>
14 |     <main.class>spark-streaming-simple-example.SparkKMeansApp</main.class> <!-- element name assumed -->
15 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16 | 
17 |     <spark.core.version>1.3.1</spark.core.version>
18 | 
19 |     <hbase.version>1.0.1</hbase.version>
20 |     <slf4j.version>1.7.5</slf4j.version>
21 |     <guava.version>13.0.1</guava.version> <!-- element name assumed -->
22 |     <scalatest.version>2.2.4</scalatest.version> <!-- element name assumed -->
23 |   </properties>
24 | 
25 |   <repositories>
26 |     <repository>
27 |       <id>apache release</id>
28 |       <url>https://repository.apache.org/content/repositories/releases/</url>
29 |     </repository>
30 |     <repository>
31 |       <id>scala-tools.org</id>
32 |       <name>Scala-tools Maven2 Repository</name>
33 |       <url>http://scala-tools.org/repo-releases</url>
34 |     </repository>
35 |   </repositories>
36 | 
37 |   <pluginRepositories>
38 |     <pluginRepository>
39 |       <id>scala-tools.org</id>
40 |       <name>Scala-tools Maven2 Repository</name>
41 |       <url>http://scala-tools.org/repo-releases</url>
42 |     </pluginRepository>
43 |   </pluginRepositories>
44 | 
45 |   <dependencies>
46 |     <dependency>
47 |       <groupId>org.apache.spark</groupId>
48 |       <artifactId>spark-core_2.10</artifactId>
49 |       <version>${spark.core.version}</version>
50 |       <scope>provided</scope>
51 |       <exclusions>
52 |         <exclusion>
53 |           <groupId>org.slf4j</groupId>
54 |           <artifactId>slf4j-log4j12</artifactId>
55 |         </exclusion>
56 |       </exclusions>
57 |     </dependency>
58 |     <dependency>
59 |       <groupId>org.apache.spark</groupId>
60 |       <artifactId>spark-streaming_2.10</artifactId>
61 |       <version>${spark.core.version}</version>
62 |       <exclusions>
63 |         <exclusion>
64 |           <groupId>org.slf4j</groupId>
65 |           <artifactId>slf4j-log4j12</artifactId>
66 |         </exclusion>
67 |       </exclusions>
68 |     </dependency>
69 |     <dependency>
70 |       <groupId>org.apache.spark</groupId>
71 |       <artifactId>spark-hive_2.10</artifactId>
72 |       <version>${spark.core.version}</version>
73 |       <exclusions>
74 |         <exclusion>
75 |           <groupId>org.slf4j</groupId>
76 |           <artifactId>slf4j-log4j12</artifactId>
77 |         </exclusion>
78 |       </exclusions>
79 |       <scope>provided</scope>
80 |     </dependency>
81 |     <dependency>
82 |       <groupId>org.apache.spark</groupId>
83 |       <artifactId>spark-streaming-kafka_2.10</artifactId>
84 |       <version>${spark.core.version}</version>
85 |       <exclusions>
86 |         <exclusion>
87 |           <groupId>org.slf4j</groupId>
88 |           <artifactId>slf4j-log4j12</artifactId>
89 |         </exclusion>
90 |       </exclusions>
91 |     </dependency>
92 | 
97 |     <dependency>
98 |       <groupId>org.apache.hbase</groupId>
99 |       <artifactId>hbase-client</artifactId>
100 |       <version>${hbase.version}</version>
101 |     </dependency>
102 |     <dependency>
103 |       <groupId>org.apache.hbase</groupId>
104 |       <artifactId>hbase-common</artifactId>
105 |       <version>${hbase.version}</version>
106 |     </dependency>
107 |     <dependency>
108 |       <groupId>org.apache.hbase</groupId>
109 |       <artifactId>hbase-server</artifactId>
110 |       <version>${hbase.version}</version>
111 |     </dependency>
112 |     <dependency>
113 |       <groupId>org.apache.hbase</groupId>
114 |       <artifactId>hbase-hadoop-compat</artifactId>
115 |       <version>${hbase.version}</version>
116 |     </dependency>
117 |     <dependency>
118 |       <groupId>org.apache.hbase</groupId>
119 |       <artifactId>hbase-protocol</artifactId>
120 |       <version>${hbase.version}</version>
121 |     </dependency>
122 |     <dependency>
123 |       <groupId>org.apache.hbase</groupId>
124 |       <artifactId>hbase-hadoop2-compat</artifactId>
125 |       <version>${hbase.version}</version>
126 |     </dependency>
127 | 
213 |     <dependency>
214 |       <groupId>org.slf4j</groupId>
215 |       <artifactId>slf4j-api</artifactId>
216 |       <version>${slf4j.version}</version>
217 |       <scope>provided</scope>
218 |     </dependency>
219 |     <dependency>
220 |       <groupId>org.scalanlp</groupId>
221 |       <artifactId>breeze_2.10</artifactId>
222 |       <version>0.9</version>
223 |     </dependency>
224 |     <dependency>
225 |       <groupId>org.scala-lang</groupId>
226 |       <artifactId>scala-library</artifactId>
227 |       <version>2.10.3</version>
228 |       <scope>provided</scope>
229 |     </dependency>
230 |   </dependencies>
231 | 
232 |   <build>
233 |     <pluginManagement>
234 |       <plugins>
235 |         <plugin>
236 |           <groupId>org.scala-tools</groupId>
237 |           <artifactId>maven-scala-plugin</artifactId>
238 |           <version>2.15.2</version>
239 |         </plugin>
240 |         <plugin>
241 |           <groupId>org.apache.maven.plugins</groupId>
242 |           <artifactId>maven-compiler-plugin</artifactId>
243 |           <version>3.1</version>
244 |           <configuration>
245 |             <source>1.6</source>
246 |             <target>1.6</target>
247 |           </configuration>
248 |         </plugin>
249 |       </plugins>
250 |     </pluginManagement>
251 | 
252 |     <plugins>
253 |       <plugin>
254 |         <groupId>org.scala-tools</groupId>
255 |         <artifactId>maven-scala-plugin</artifactId>
256 |         <executions>
257 |           <execution>
258 |             <id>scala-compile-first</id>
259 |             <phase>process-resources</phase>
260 |             <goals>
261 |               <goal>add-source</goal>
262 |               <goal>compile</goal>
263 |             </goals>
264 |           </execution>
265 |           <execution>
266 |             <id>scala-test-compile</id>
267 |             <phase>process-test-resources</phase>
268 |             <goals>
269 |               <goal>testCompile</goal>
270 |             </goals>
271 |           </execution>
272 |         </executions>
273 |       </plugin>
274 |       <plugin>
275 |         <groupId>org.apache.maven.plugins</groupId>
276 |         <artifactId>maven-shade-plugin</artifactId>
277 |         <version>1.4</version>
278 |         <configuration>
279 |           <shadedArtifactAttached>true</shadedArtifactAttached> <!-- element name assumed -->
280 |         </configuration>
281 |         <executions>
282 |           <execution>
283 |             <phase>package</phase>
284 |             <goals>
285 |               <goal>shade</goal>
286 |             </goals>
287 |             <configuration>
288 | 
289 | 
291 | 
293 | 
294 | 
295 | 
296 |               <filters>
297 |                 <filter>
298 |                   <artifact>*:*</artifact>
299 |                   <excludes>
300 |                     <exclude>META-INF/*.SF</exclude>
301 |                     <exclude>META-INF/*.DSA</exclude>
302 |                     <exclude>META-INF/*.RSA</exclude>
303 |                   </excludes>
304 |                 </filter>
305 |               </filters>
306 |             </configuration>
307 |           </execution>
308 |         </executions>
309 |       </plugin>
310 |     </plugins>
311 |   </build>
312 | 
313 | </project>
--------------------------------------------------------------------------------
/src/main/scala/simpleexample/SparkFileExample.scala:
--------------------------------------------------------------------------------
1 | package simpleexample
2 |
3 | import org.apache.hadoop.io.{LongWritable, Text}
4 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
5 | import org.apache.spark.{SparkContext, SparkConf}
6 | import org.apache.spark.sql.hive.HiveContext
7 | import org.apache.spark.streaming.{Seconds, StreamingContext}
8 |
9 | /*
10 | Submitting:
11 | spark-submit --master yarn-client \
12 | --num-executors 2 \
13 | --driver-memory 512m \
14 | --executor-memory 512m \
15 | --executor-cores 1 \
16 | --class simpleexample.SparkFileExample \
17 | spark-streaming-simple-example-0.1-SNAPSHOT.jar /spark_log
18 | */
19 | object SparkFileExample {
20 |
21 | def main(args: Array[String]): Unit = {
22 | if(args.length < 1) {
23 | System.err.println("Usage: SparkFileExample <input-directory>")
24 | System.exit(1)
25 | }
26 |
27 | val sparkConf = new SparkConf().setAppName("SpoolDirSpark")
28 | val ssc = new StreamingContext(sparkConf, Seconds(2))
29 |
30 | val hiveContext = new HiveContext(ssc.sparkContext)
31 | import hiveContext.implicits._
32 | import hiveContext.sql
33 |
34 | val inputDirectory = args(0)
35 |
36 | val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](inputDirectory).map{ case (x, y) => (x.toString, y.toString) }
37 |
38 | lines.print()
39 |
40 | // ToDo
41 | // lines.foreachRDD { rdd =>
42 | // rdd.foreachPartition { line =>
43 | // line.foreach { item =>
44 | // val values = item.toString().split(",")
45 | // val date = values(0)
46 | // val open = values(1)
47 | // val high = values(2)
48 | // val low = values(3)
49 | // val close = values(4)
50 | // val volume = values(5)
51 | // val adj_close = values(6)
52 | // val year = date.split("-")(0)
53 | // sql(f"INSERT INTO TABLE stocks PARTITION (year= '$year') VALUES ('$date', $open, $high, $low, $close, $volume, $adj_close);")
54 | // }
55 | // }
56 | // }
57 |
58 | ssc.start()
59 | ssc.awaitTermination()
60 |
61 | }
62 | }
63 |
64 |
--------------------------------------------------------------------------------
/src/main/scala/simpleexample/SparkKafkaExample.scala:
--------------------------------------------------------------------------------
1 | package simpleexample
2 |
3 | import java.util
4 |
5 | import kafka.serializer.{DefaultDecoder, StringDecoder}
6 | import org.apache.hadoop.conf.Configuration
7 | import org.apache.hadoop.hbase.HBaseConfiguration
8 | import org.apache.hadoop.hbase.client.{Put}
9 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
10 | import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
11 | import org.apache.hadoop.hbase.util.Bytes
12 | import org.apache.hadoop.io.{LongWritable, Writable, IntWritable, Text}
13 | import org.apache.hadoop.mapred.{TextOutputFormat, JobConf}
14 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
15 | import org.apache.spark.SparkConf
16 | import org.apache.spark.rdd.PairRDDFunctions
17 | import org.apache.spark.storage.StorageLevel
18 | import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}
19 | import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
20 |
21 | import scala.collection.mutable.ListBuffer
22 |
23 | /**
24 | * Created by hkropp on 19/04/15.
25 | */
26 | object SparkKafkaExample
27 | {
28 |
29 | def main(args: Array[String]): Unit =
30 | {
31 | if (args.length < 3)
32 | {
33 | System.err.println("Usage: SparkKafkaExample <broker-list> <zookeeper> <topic>")
34 | System.exit(1)
35 | }
36 |
37 | val Array(broker, zk, topic) = args
38 |
39 | val sparkConf = new SparkConf().setAppName("KafkaHBaseWordCount")
40 | val ssc = new StreamingContext(sparkConf, Seconds(10))
41 | //ssc.checkpoint("./checkpoints") // checkpointing dir
42 | // ssc.checkpoint("hdfs://checkpoints") // dir in hdfs for prod
43 |
44 | val kafkaConf = Map("metadata.broker.list" -> broker,
45 | "zookeeper.connect" -> zk,
46 | "group.id" -> "kafka-spark-streaming-example",
47 | "zookeeper.connection.timeout.ms" -> "1000")
48 |
49 | /* Kafka integration with receiver */
50 | val lines = KafkaUtils.createStream[Array[Byte], String, DefaultDecoder, StringDecoder](
51 | ssc, kafkaConf, Map(topic -> 1),
52 | StorageLevel.MEMORY_ONLY_SER).map(_._2)
53 |
54 | /* Experimental DirectStream w/o receiver */
55 | // val lines = KafkaUtils.createDirectStream[Array[Byte], String, DefaultDecoder, StringDecoder](
56 | // ssc,
57 | // kafkaConf,
58 | // Set(topic)).map(_._2)
59 |
60 | /* Getting Kafka offsets from RDDs
61 | lines.foreachRDD { rdd =>
62 | val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
63 | offsetRanges.foreach( println(_) )
64 | }*/
65 |
66 | val words = lines.flatMap(_.split(" "))
67 |
68 | val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
69 |
70 | // .reduceByKeyAndWindow(_ + _, _ - _, Minutes(5), Seconds(2), 2)
71 | //wordCounts.print()
72 |
73 | /*words.map(x => (x, 1L)).saveAsNewAPIHadoopFiles(
74 | "prefix", "suffix",
75 | classOf[Text],
76 | classOf[IntWritable],
77 | classOf[org.apache.hadoop.hbase.mapreduce.TableOutputFormat[Text]],
78 | conf)*/
79 |
80 | wordCounts.foreachRDD ( rdd => {
81 |
82 | val conf = HBaseConfiguration.create()
83 | conf.set(TableOutputFormat.OUTPUT_TABLE, "stream_count")
84 | conf.set("hbase.zookeeper.quorum", "localhost:2181")
85 | conf.set("hbase.master", "localhost:60000");
86 | conf.set("hbase.rootdir", "file:///tmp/hbase")
87 |
88 | val jobConf = new Configuration(conf)
89 | jobConf.set("mapreduce.job.output.key.class", classOf[Text].getName)
90 | jobConf.set("mapreduce.job.output.value.class", classOf[Put].getName)
91 | jobConf.set("mapreduce.outputformat.class", classOf[TableOutputFormat[Text]].getName)
92 |
93 | rdd.map(convert).saveAsNewAPIHadoopDataset(jobConf) // convert (word, count) pairs into HBase Puts for TableOutputFormat
94 |
95 | //rdd.saveAsTextFile("/user/vagrant/tmp/sparktest_out")
96 | //new PairRDDFunctions(rdd.map(convert)).saveAsNewAPIHadoopDataset(jobConf)
97 | /*rdd.foreach({
98 | case (value, count) => {
99 | println("##########################################")
100 | println("value --> " + value + " with count --> " + count)
101 | println("##########################################")
102 | }
103 | })*/
104 | //val connection = connect("stream_count")
105 | //rdd.foreach( record => connection.put(putRequest(record)) )
106 | })
107 |
108 | wordCounts.print()
109 |
110 | ssc.start()
111 | ssc.awaitTermination()
112 | }
113 |
114 | def putRequest(t: (String, Long)) = {
115 | val p = new Put(Bytes.toBytes(t._1))
116 | p.add(Bytes.toBytes("word"), Bytes.toBytes("count"), Bytes.toBytes(t._2))
117 | }
118 |
119 | def convert(t: (String, Long)) = {
120 | val p = new Put(Bytes.toBytes(t._1))
121 | p.add(Bytes.toBytes("word"), Bytes.toBytes("count"), Bytes.toBytes(t._2))
122 | (t._1, p)
123 | }
124 | }
125 |
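126 | /*
127 |  Rough usage sketch (hosts, ports and the topic name are placeholders; the jar name matches the build above):
128 |    # the job writes into the HBase table used in foreachRDD:
129 |    hbase shell> create 'stream_count', 'word'
130 |    # submit with broker list, ZooKeeper quorum and topic as arguments:
131 |    spark-submit --class simpleexample.SparkKafkaExample \
132 |      spark-streaming-simple-example-0.1-SNAPSHOT.jar broker-host:9092 zk-host:2181 wordcount-topic
133 |  */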
--------------------------------------------------------------------------------
/src/main/scala/simpleexample/SparkStreamingExample.scala:
--------------------------------------------------------------------------------
1 | package simpleexample
2 |
3 | import org.apache.spark.SparkConf
4 | import org.apache.spark.streaming.{Seconds, StreamingContext}
5 | import org.apache.spark.storage.StorageLevel
6 |
7 | /**
8 | * Created by hkropp on 22/03/15.
9 | */
10 | object SparkStreamingExample {
11 | def main(args: Array[String]) {
12 |
13 | if (args.length < 2) {
14 | System.err.println("Usage: NetworkWordCount <hostname> <port>")
15 | System.exit(1)
16 | }
17 |
18 | // Create a local StreamingContext with two working threads and a batch interval of 1 second.
19 | // The master requires 2 cores to prevent a starvation scenario.
20 | val sparkConf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
21 | val ssc = new StreamingContext(sparkConf, Seconds(1))
22 |
23 | // Create a socket stream on target ip:port and count the
24 | // words in the input stream of \n-delimited text (e.g. generated by 'nc').
25 | // Note that skipping replication in the storage level is only fine when running locally;
26 | // replication is necessary in a distributed scenario for fault tolerance.
27 | val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
28 | val words = lines.flatMap(_.split(" "))
29 | val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
30 | wordCounts.print()
31 | ssc.start()
32 | ssc.awaitTermination()
33 | }
34 | }
35 |
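36 | /*
37 |  Rough usage sketch (host and port are placeholder values):
38 |    terminal 1:  nc -lk 9999
39 |    terminal 2:  spark-submit --class simpleexample.SparkStreamingExample \
40 |                   spark-streaming-simple-example-0.1-SNAPSHOT.jar localhost 9999
41 |  */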
--------------------------------------------------------------------------------