├── .gitignore
├── Readme.md
├── pom.xml
└── src
    └── main
        └── scala
            └── simpleexample
                ├── SparkFileExample.scala
                ├── SparkKafkaExample.scala
                └── SparkStreamingExample.scala

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.idea
*.iml
target

--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
# Spark Streaming Examples


### Links

* http://spark.apache.org/docs/latest/streaming-kafka-integration.html
* http://stackoverflow.com/questions/22338025/kafka-consumers-in-spark-streaming-parallel-consumption-in-worker-nodes
* http://stackoverflow.com/questions/22132968/run-spark-kafka-wordcount-java-example-without-run-example-script
* http://spark.apache.org/docs/latest/streaming-programming-guide.html
* https://issues.apache.org/jira/browse/SPARK-944

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>spark-streaming-simple-example</groupId>
    <artifactId>spark-streaming-simple-example</artifactId>
    <version>0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>Spark Streaming Examples</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <spark.core.version>1.3.1</spark.core.version>
        <hbase.version>1.0.1</hbase.version>
        <slf4j.version>1.7.5</slf4j.version>
        <!-- the names of the following properties are assumed; only their values are certain -->
        <main.class>spark-streaming-simple-example.SparkKMeansApp</main.class>
        <guava.version>13.0.1</guava.version>
        <scalatest.version>2.2.4</scalatest.version>
    </properties>

    <repositories>
        <repository>
            <id>apache release</id>
            <url>https://repository.apache.org/content/repositories/releases/</url>
        </repository>
        <repository>
            <id>scala-tools.org</id>
            <name>Scala-tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
    </pluginRepositories>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>${spark.core.version}</version>
            <scope>provided</scope>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.10</artifactId>
            <version>${spark.core.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.10</artifactId>
            <version>${spark.core.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka_2.10</artifactId>
            <version>${spark.core.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-common</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-hadoop-compat</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-protocol</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-hadoop2-compat</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.scalanlp</groupId>
            <artifactId>breeze_2.10</artifactId>
            <version>0.9</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.10.3</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>org.scala-tools</groupId>
                    <artifactId>maven-scala-plugin</artifactId>
                    <version>2.15.2</version>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.1</version>
                    <configuration>
                        <source>1.6</source>
                        <target>1.6</target>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>1.4</version>
                <configuration>
                    <!-- element name assumed for the boolean flag kept from the original -->
                    <shadedArtifactAttached>true</shadedArtifactAttached>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/src/main/scala/simpleexample/SparkFileExample.scala:
--------------------------------------------------------------------------------
package simpleexample

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.{Seconds, StreamingContext}

/*
  Submitting:
    spark-submit --master yarn-client \
      --num-executors 2 \
      --driver-memory 512m \
      --executor-memory 512m \
      --executor-cores 1 \
      --class simpleexample.SparkFileExample \
      spark-streaming-simple-example-0.1-SNAPSHOT.jar /spark_log
*/
object SparkFileExample {

  def main(args: Array[String]): Unit = {
    if (args.length < 1) {
      System.err.println("Usage: SparkFileExample <input-directory>")
      System.exit(1)
    }

    val sparkConf = new SparkConf().setAppName("SpoolDirSpark")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    val hiveContext = new HiveContext(ssc.sparkContext)
    import hiveContext.implicits._
    import hiveContext.sql

    val inputDirectory = args(0)

    val lines = ssc.fileStream[LongWritable, Text, TextInputFormat](inputDirectory)
      .map { case (x, y) => (x.toString, y.toString) }

    lines.print()

    // ToDo
    // lines.foreachRDD { rdd =>
    //   rdd.foreachPartition { line =>
    //     line.foreach { item =>
    //       val values = item.toString().split(",")
    //       val date = values(0)
    //       val open = values(1)
    //       val high = values(2)
    //       val low = values(3)
    //       val close = values(4)
    //       val volume = values(5)
    //       val adj_close = values(6)
    //       val year = date.split("-")(0)
    //       sql(f"INSERT INTO TABLE stocks PARTITION (year= '$year') VALUES ('$date', $open, $high, $low, $close, $volume, $adj_close);")
    //     }
    //   }
    // }

    ssc.start()
    ssc.awaitTermination()

  }
}
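A possible completion of the ToDo block above (a sketch, not part of the repository): calling `sql` inside `foreachPartition` would run on the executors, where the `HiveContext` is not usable, so each micro-batch is instead collected to the driver and the author's INSERT statement is issued there. It assumes the partitioned Hive table `stocks` from the commented-out statement already exists and that batches are small enough to collect.

    // Inside main, in place of the ToDo block:
    lines.foreachRDD { rdd =>
      // Collect the (small) micro-batch to the driver, where the HiveContext lives.
      rdd.collect().foreach { case (_, line) =>
        val values = line.split(",")
        if (values.length >= 7) {
          val Array(date, open, high, low, close, volume, adjClose) = values.take(7)
          val year = date.split("-")(0)
          // Same statement as in the ToDo comment, executed per record on the driver.
          sql(s"INSERT INTO TABLE stocks PARTITION (year = '$year') " +
              s"VALUES ('$date', $open, $high, $low, $close, $volume, $adjClose)")
        }
      }
    }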
--------------------------------------------------------------------------------
/src/main/scala/simpleexample/SparkKafkaExample.scala:
--------------------------------------------------------------------------------
package simpleexample

import java.util

import kafka.serializer.{DefaultDecoder, StringDecoder}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.{LongWritable, Writable, IntWritable, Text}
import org.apache.hadoop.mapred.{TextOutputFormat, JobConf}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.rdd.PairRDDFunctions
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer

/**
 * Created by hkropp on 19/04/15.
 */
object SparkKafkaExample {

  def main(args: Array[String]): Unit = {
    if (args.length < 3) {
      System.err.println("Usage: SparkKafkaExample <broker> <zookeeper> <topic>")
      System.exit(1)
    }

    val Array(broker, zk, topic) = args

    val sparkConf = new SparkConf().setAppName("KafkaHBaseWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(10))
    // ssc.checkpoint("./checkpoints")      // checkpointing dir
    // ssc.checkpoint("hdfs://checkpoints") // dir in hdfs for prod

    val kafkaConf = Map("metadata.broker.list" -> broker,
                        "zookeeper.connect" -> zk,
                        "group.id" -> "kafka-spark-streaming-example",
                        "zookeeper.connection.timeout.ms" -> "1000")

    /* Kafka integration with receiver */
    val lines = KafkaUtils.createStream[Array[Byte], String, DefaultDecoder, StringDecoder](
      ssc, kafkaConf, Map(topic -> 1),
      StorageLevel.MEMORY_ONLY_SER).map(_._2)

    /* Experimental DirectStream w/o receiver */
    // val lines = KafkaUtils.createDirectStream[Array[Byte], String, DefaultDecoder, StringDecoder](
    //   ssc,
    //   kafkaConf,
    //   Set(topic)).map(_._2)

    /* Getting Kafka offsets from RDDs (only works with the direct stream)
    lines.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetRanges.foreach( println(_) )
    }*/

    val words = lines.flatMap(_.split(" "))

    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    // .reduceByKeyAndWindow(_ + _, _ - _, Minutes(5), Seconds(2), 2)
    // wordCounts.print()

    /*words.map(x => (x, 1L)).saveAsNewAPIHadoopFiles(
      "prefix", "suffix",
      classOf[Text],
      classOf[IntWritable],
      classOf[org.apache.hadoop.hbase.mapreduce.TableOutputFormat[Text]],
      conf)*/

    wordCounts.foreachRDD { rdd =>

      val conf = HBaseConfiguration.create()
      conf.set(TableOutputFormat.OUTPUT_TABLE, "stream_count")
      conf.set("hbase.zookeeper.quorum", "localhost:2181")
      conf.set("hbase.master", "localhost:60000")
      conf.set("hbase.rootdir", "file:///tmp/hbase")

      val jobConf = new Configuration(conf)
      jobConf.set("mapreduce.job.output.key.class", classOf[Text].getName)
      jobConf.set("mapreduce.job.output.value.class", classOf[Put].getName)
      jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName)

      // Map each (word, count) pair to an HBase Put before writing;
      // TableOutputFormat expects mutations as values, not plain longs.
      rdd.map(convert).saveAsNewAPIHadoopDataset(jobConf)

      //rdd.saveAsTextFile("/user/vagrant/tmp/sparktest_out")
      //new PairRDDFunctions(rdd.map(convert)).saveAsNewAPIHadoopDataset(jobConf)
      /*rdd.foreach({
        case (value, count) => {
          println("##########################################")
          println("value --> " + value + " with count --> " + count)
          println("##########################################")
        }
      })*/
      //val connection = connect("stream_count")
      //rdd.foreach( record => connection.put(putRequest(record)) )
    }

    wordCounts.print()

    ssc.start()
    ssc.awaitTermination()
  }

  def putRequest(t: (String, Long)) = {
    val p = new Put(Bytes.toBytes(t._1))
    p.add(Bytes.toBytes("word"), Bytes.toBytes("count"), Bytes.toBytes(t._2))
  }

  def convert(t: (String, Long)) = {
    val p = new Put(Bytes.toBytes(t._1))
    p.add(Bytes.toBytes("word"), Bytes.toBytes("count"), Bytes.toBytes(t._2))
    (t._1, p)
  }
}

--------------------------------------------------------------------------------
/src/main/scala/simpleexample/SparkStreamingExample.scala:
--------------------------------------------------------------------------------
package simpleexample

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel

/**
 * Created by hkropp on 22/03/15.
 */
object SparkStreamingExample {
  def main(args: Array[String]) {

    if (args.length < 2) {
      System.err.println("Usage: NetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    // Create a local StreamingContext with two worker threads and a batch interval of 1 second.
    // The master needs at least 2 cores to prevent a starvation scenario.
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Create a socket stream on the target ip:port and count the
    // words in the input stream of \n-delimited text (e.g. generated by 'nc').
    // Note that a storage level without replication is only suitable for running locally;
    // in a distributed setup replication is necessary for fault tolerance.
    val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _)
    wordCounts.print()
    ssc.start()
    ssc.awaitTermination()
  }
}

--------------------------------------------------------------------------------
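The reduceByKeyAndWindow call left commented out in SparkKafkaExample.scala hints at a windowed variant of the word count. A minimal sketch of how that could look for the socket example above (hypothetical, not part of the repository; it assumes a writable checkpoint directory, which Spark Streaming requires when an inverse reduce function is used):

    package simpleexample

    import org.apache.spark.SparkConf
    import org.apache.spark.storage.StorageLevel
    import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}

    object WindowedNetworkWordCount {
      def main(args: Array[String]): Unit = {
        if (args.length < 2) {
          System.err.println("Usage: WindowedNetworkWordCount <hostname> <port>")
          System.exit(1)
        }

        val sparkConf = new SparkConf().setMaster("local[2]").setAppName("WindowedNetworkWordCount")
        val ssc = new StreamingContext(sparkConf, Seconds(2))
        // Checkpointing is required when reduceByKeyAndWindow is used with an inverse function.
        ssc.checkpoint("./checkpoints")

        val lines = ssc.socketTextStream(args(0), args(1).toInt, StorageLevel.MEMORY_AND_DISK_SER)
        val wordCounts = lines.flatMap(_.split(" "))
          .map(x => (x, 1L))
          // Count words over a sliding 5-minute window, recomputed every 2 seconds.
          .reduceByKeyAndWindow(_ + _, _ - _, Minutes(5), Seconds(2), 2)

        wordCounts.print()
        ssc.start()
        ssc.awaitTermination()
      }
    }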