├── .gitignore
├── README.md
├── TwitterAvroSource.conf
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       └── com
    │           └── stdatalabs
    │               ├── Kafka
    │               │   └── KafkaTwitterProducer.java
    │               └── Streaming
    │                   ├── FlumeSparkPopularHashTags.scala
    │                   ├── KafkaDirectPopularHashTags.scala
    │                   ├── KafkaSparkPopularHashTags.scala
    │                   ├── RecoverableKafkaPopularHashTags.scala
    │                   └── SparkPopularHashTags.scala
    └── test
        └── java
            └── org
                └── stdatalabs
                    └── TwitterPopularHashTags
                        └── AppTest.java

/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .*/
3 | target/
4 | .classpath
5 | .cache-main
6 | .cache-tests
7 | .settings
8 | .project
9 | tweets.txt
10 | checkpoint/
11 | 
12 | # Compiled source #
13 | ###################
14 | *.com
15 | *.class
16 | *.dll
17 | *.exe
18 | *.o
19 | *.so
20 | checkpoint
21 | 
22 | # Packages #
23 | ############
24 | # it's better to unpack these files and commit the raw source
25 | # git has its own built in compression methods
26 | *.7z
27 | *.dmg
28 | *.gz
29 | *.iso
30 | *.jar
31 | *.rar
32 | *.tar
33 | *.zip
34 | 
35 | # Logs and databases #
36 | ######################
37 | *.log
38 | *.sql
39 | *.sqlite
40 | 
41 | # OS generated files #
42 | ######################
43 | .DS_Store
44 | .DS_Store?
45 | ._*
46 | .Spotlight-V100
47 | .Trashes
48 | ehthumbs.db
49 | Thumbs.db
50 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SparkTwitterPopularHashTags
2 | 
3 | A Spark Streaming project that analyzes popular hashtags from live Twitter data streams. Data is ingested from different input sources (the Twitter source, Flume, and Kafka) and processed downstream using Spark Streaming.
4 | 
5 | ## Requirements
6 | - IDE
7 | - Apache Maven 3.x
8 | - Java 6 or 7
9 | 
10 | ## General Info
11 | The source folder is organized into two packages, Kafka and Streaming. Each class in the Streaming package explores a different approach to consuming data from the Twitter source. Below is the list of classes:
12 | * com/stdatalabs/Kafka
13 |   * KafkaTwitterProducer.java -- A Kafka producer that publishes Twitter data to a Kafka broker
14 | * com/stdatalabs/Streaming
15 |   * SparkPopularHashTags.scala -- Receives data from the Twitter datasource
16 |   * FlumeSparkPopularHashTags.scala -- Receives data from the Flume Twitter producer
17 |   * KafkaSparkPopularHashTags.scala -- Receives data from the Kafka producer
18 |   * RecoverableKafkaPopularHashTags.scala -- Spark-Kafka receiver-based approach. Ensures at-least-once semantics
19 |   * KafkaDirectPopularHashTags.scala -- Spark-Kafka direct approach. Ensures exactly-once semantics
20 | * TwitterAvroSource.conf
21 |   -- Flume conf for running the Twitter Avro source
22 | 
23 | ## Description
24 | * ##### A Spark Streaming application that receives tweets on certain keywords from the Twitter datasource and finds the popular hashtags.
25 | Discussed in blog --
26 | [Spark Streaming part 1: Real time twitter sentiment analysis](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-1-real-time.html)
27 | 
28 | * ##### A Spark Streaming - Flume integration to find popular hashtags from Twitter. It receives events from a Flume source that connects to Twitter and pushes tweets as Avro events to the sink.
29 | Discussed in blog -- 30 | [Spark streaming part 2: Real time twitter sentiment analysis using Flume](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-2-real-time_10.html) 31 | 32 | * ##### A Spark Streaming - Kafka integration to receive twitter data from kafka producer and find the popular hashtags 33 | Discussed in blog -- 34 | [Spark streaming part 3: Real time twitter sentiment analysis using kafka](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-3-real-time.html) 35 | 36 | * ##### A Spark Streaming - Kafka integration to ensure at-least once semantics 37 | Discussed in blog -- 38 | [Data guarantees in Spark Streaming with kafka integration](http://stdatalabs.blogspot.in/2016/10/data-guarantees-in-spark-streaming-with.html) 39 | 40 | * ##### A Spark Streaming - Kafka integration to ensure exactly once semantics 41 | Discussed in blog -- 42 | [Data guarantees in Spark Streaming with kafka integration](http://stdatalabs.blogspot.in/2016/10/data-guarantees-in-spark-streaming-with.html) 43 | 44 | 45 | 46 | ### More articles on hadoop technology stack at [stdatalabs](http://stdatalabs.blogspot.com) 47 | 48 | -------------------------------------------------------------------------------- /TwitterAvroSource.conf: -------------------------------------------------------------------------------- 1 | TwitterAgent.sources = Twitter 2 | TwitterAgent.channels = MemChannel 3 | TwitterAgent.sinks = avroSink 4 | 5 | 6 | 7 | # Describing/Configuring the source 8 | TwitterAgent.sources.Twitter.type = org.apache.flume.source.twitter.TwitterSource 9 | TwitterAgent.sources.Twitter.consumerKey= 10 | TwitterAgent.sources.Twitter.consumerSecret= 11 | TwitterAgent.sources.Twitter.accessToken= 12 | TwitterAgent.sources.Twitter.accessTokenSecret= 13 | 14 | 15 | 16 | # Describing/Configuring the sink 17 | TwitterAgent.sinks.avroSink.type = avro 18 | TwitterAgent.sinks.avroSink.batch-size = 1 19 | TwitterAgent.sinks.avroSink.hostname = ubuntu 20 | TwitterAgent.sinks.avroSink.port = 9988 21 | 22 | 23 | 24 | # Describing/Configuring the memory channel 25 | TwitterAgent.channels.MemChannel.type = memory 26 | TwitterAgent.channels.MemChannel.capacity = 10000 27 | TwitterAgent.channels.MemChannel.transactionCapacity = 100 28 | 29 | 30 | 31 | # Linking the source and sink to the memory channel 32 | TwitterAgent.sources.Twitter.channels = MemChannel 33 | TwitterAgent.sinks.avroSink.channel = MemChannel 34 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.stdatalabs.Streaming 6 | SparkTwitterAnalysis 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | TwitterPopularHashTags 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | junit 26 | junit 27 | 3.8.1 28 | test 29 | 30 | 31 | org.apache.spark 32 | spark-core_2.10 33 | 1.4.1 34 | compile 35 | 36 | 37 | 38 | org.apache.spark 39 | spark-streaming_2.10 40 | 1.4.1 41 | compile 42 | 43 | 44 | 45 | org.apache.spark 46 | spark-mllib_2.10 47 | 1.4.1 48 | 49 | 50 | org.apache.spark 51 | spark-sql_2.10 52 | 1.4.1 53 | 54 | 55 | org.apache.spark 56 | spark-hive_2.10 57 | 1.4.1 58 | 59 | 60 | 61 | org.apache.spark 62 | spark-streaming-twitter_2.10 63 | 1.4.1 64 | 65 | 66 | org.apache.spark 67 | spark-streaming-kafka_2.10 68 | 1.4.1 69 | 70 | 71 | org.apache.hadoop 72 | hadoop-client 73 | 2.4.1 74 | compile 75 | 76 | 77 | org.apache.hadoop 78 | 
hadoop-common 79 | 2.4.1 80 | compile 81 | 82 | 83 | 84 | org.apache.spark 85 | spark-streaming-flume_2.10 86 | 1.4.1 87 | 88 | 89 | 90 | org.apache.spark 91 | spark-streaming-flume-sink_2.10 92 | 1.4.1 93 | 94 | 95 | 96 | org.twitter4j 97 | twitter4j-core 98 | 3.0.3 99 | 100 | 101 | 102 | org.twitter4j 103 | twitter4j-stream 104 | 3.0.3 105 | 106 | 107 | 108 | org.twitter4j 109 | twitter4j-async 110 | 3.0.3 111 | 112 | 113 | org.apache.storm 114 | storm-core 115 | 0.10.0 116 | provided 117 | 118 | 119 | org.slf4j 120 | slf4j-log4j12 121 | 122 | 123 | 124 | 125 | org.apache.kafka 126 | kafka_2.10 127 | 0.8.2.1 128 | 129 | 130 | org.slf4j 131 | slf4j-log4j12 132 | 133 | 134 | 135 | 136 | org.apache.storm 137 | storm-kafka 138 | 0.10.0-beta1 139 | 140 | 141 | org.apache.storm 142 | storm-hdfs 143 | 0.10.0-beta1 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /src/main/scala/com/stdatalabs/Kafka/KafkaTwitterProducer.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.Kafka; 2 | 3 | import java.util.Arrays; 4 | import java.util.Properties; 5 | import java.util.concurrent.LinkedBlockingQueue; 6 | 7 | import twitter4j.*; 8 | import twitter4j.conf.*; 9 | 10 | import twitter4j.StallWarning; 11 | import twitter4j.Status; 12 | import twitter4j.StatusDeletionNotice; 13 | import twitter4j.StatusListener; 14 | import twitter4j.TwitterStream; 15 | import twitter4j.TwitterStreamFactory; 16 | import twitter4j.conf.ConfigurationBuilder; 17 | import twitter4j.json.DataObjectFactory; 18 | 19 | import org.apache.kafka.clients.producer.Producer; 20 | import org.apache.kafka.clients.producer.KafkaProducer; 21 | import org.apache.kafka.clients.producer.ProducerRecord; 22 | import kafka.producer.KeyedMessage; 23 | 24 | /** 25 | * A Kafka Producer that gets tweets on certain keywords 26 | * from twitter datasource and publishes to a kafka topic 27 | * 28 | * Arguments: ... 
29 |  * - Twitter consumer key
30 |  * - Twitter consumer secret
31 |  * - Twitter access token
32 |  * - Twitter access token secret
33 |  * - The kafka topic to subscribe to
34 |  * - The keyword to filter tweets
35 |  * - Any number of keywords to filter tweets
36 |  *
37 |  * More discussion at stdatalabs.blogspot.com
38 |  *
39 |  * @author Sachin Thirumala
40 |  */
41 | 
42 | public class KafkaTwitterProducer {
43 |     public static void main(String[] args) throws Exception {
44 |         final LinkedBlockingQueue<Status> queue = new LinkedBlockingQueue<Status>(1000);
45 | 
46 |         if (args.length < 5) {
47 |             System.out.println(
48 |                     "Usage: KafkaTwitterProducer <consumer-key> <consumer-secret> <access-token> <access-token-secret> <topic-name> <keyword_1> ... <keyword_n>");
49 |             return;
50 |         }
51 | 
52 |         String consumerKey = args[0].toString();
53 |         String consumerSecret = args[1].toString();
54 |         String accessToken = args[2].toString();
55 |         String accessTokenSecret = args[3].toString();
56 |         String topicName = args[4].toString();
57 |         String[] arguments = args.clone();
58 |         String[] keyWords = Arrays.copyOfRange(arguments, 5, arguments.length);
59 | 
60 |         // Set twitter oAuth tokens in the configuration
61 |         ConfigurationBuilder cb = new ConfigurationBuilder();
62 |         cb.setDebugEnabled(true).setOAuthConsumerKey(consumerKey).setOAuthConsumerSecret(consumerSecret)
63 |                 .setOAuthAccessToken(accessToken).setOAuthAccessTokenSecret(accessTokenSecret);
64 | 
65 |         // Create twitterstream using the configuration
66 |         TwitterStream twitterStream = new TwitterStreamFactory(cb.build()).getInstance();
67 |         StatusListener listener = new StatusListener() {
68 | 
69 |             @Override
70 |             public void onStatus(Status status) {
71 |                 queue.offer(status);
72 |             }
73 | 
74 |             @Override
75 |             public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
76 |                 System.out.println("Got a status deletion notice id:" + statusDeletionNotice.getStatusId());
77 |             }
78 | 
79 |             @Override
80 |             public void onTrackLimitationNotice(int numberOfLimitedStatuses) {
81 |                 System.out.println("Got track limitation notice:" + numberOfLimitedStatuses);
82 |             }
83 | 
84 |             @Override
85 |             public void onScrubGeo(long userId, long upToStatusId) {
86 |                 System.out.println("Got scrub_geo event userId:" + userId + " upToStatusId:" + upToStatusId);
87 |             }
88 | 
89 |             @Override
90 |             public void onStallWarning(StallWarning warning) {
91 |                 System.out.println("Got stall warning:" + warning);
92 |             }
93 | 
94 |             @Override
95 |             public void onException(Exception ex) {
96 |                 ex.printStackTrace();
97 |             }
98 |         };
99 |         twitterStream.addListener(listener);
100 | 
101 |         // Filter keywords
102 |         FilterQuery query = new FilterQuery().track(keyWords);
103 |         twitterStream.filter(query);
104 | 
105 |         // Thread.sleep(5000);
106 | 
107 |         // Add Kafka producer config settings
108 |         Properties props = new Properties();
109 |         props.put("metadata.broker.list", "localhost:9092");
110 |         props.put("bootstrap.servers", "localhost:9092");
111 |         props.put("acks", "all");
112 |         props.put("retries", 0);
113 |         props.put("batch.size", 16384);
114 |         props.put("linger.ms", 1);
115 |         props.put("buffer.memory", 33554432);
116 | 
117 |         props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
118 |         props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
119 | 
120 |         Producer<String, String> producer = new KafkaProducer<String, String>(props);
121 |         int i = 0;
122 |         int j = 0;
123 | 
124 |         // poll for new tweets in the queue.
If new tweets are added, send them 125 | // to the topic 126 | while (true) { 127 | Status ret = queue.poll(); 128 | 129 | if (ret == null) { 130 | Thread.sleep(100); 131 | // i++; 132 | } else { 133 | for (HashtagEntity hashtage : ret.getHashtagEntities()) { 134 | System.out.println("Tweet:" + ret); 135 | System.out.println("Hashtag: " + hashtage.getText()); 136 | // producer.send(new ProducerRecord( 137 | // topicName, Integer.toString(j++), hashtage.getText())); 138 | producer.send(new ProducerRecord(topicName, Integer.toString(j++), ret.getText())); 139 | } 140 | } 141 | } 142 | // producer.close(); 143 | // Thread.sleep(500); 144 | // twitterStream.shutdown(); 145 | } 146 | 147 | } 148 | -------------------------------------------------------------------------------- /src/main/scala/com/stdatalabs/Streaming/FlumeSparkPopularHashTags.scala: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.Streaming 2 | 3 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.streaming.twitter._ 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming._ 8 | import org.apache.spark.{ SparkContext, SparkConf } 9 | import org.apache.spark.storage.StorageLevel 10 | import org.apache.spark.streaming.flume._ 11 | 12 | /** 13 | * A Spark Streaming - Flume integration to find Popular hashtags from twitter 14 | * It receives events from a Flume source that connects to twitter and pushes 15 | * tweets as avro events to sink. 16 | * 17 | * More discussion at stdatalabs.blogspot.com 18 | * 19 | * @author Sachin Thirumala 20 | */ 21 | 22 | object FlumeSparkPopularHashTags { 23 | val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Flume Source - PopularHashTags") 24 | val sc = new SparkContext(conf) 25 | def main(args: Array[String]) { 26 | sc.setLogLevel("WARN") 27 | // Set the Spark StreamingContext to create a DStream for every 5 seconds 28 | val ssc = new StreamingContext(sc, Seconds(5)) 29 | val filter = args.takeRight(args.length) 30 | 31 | // Create stream using FlumeUtils to receive data from flume at hostname: and port: 32 | val stream = FlumeUtils.createStream(ssc, "ubuntu", 9988) 33 | val tweets = stream.map(e => new String(e.event.getBody.array)) 34 | tweets.print() 35 | 36 | // Split the stream on space and extract hashtags 37 | val hashTags = tweets.flatMap(status => status.split(" ").filter(_.startsWith("#"))) 38 | // Get the top hashtags over the previous 60 sec window 39 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 40 | .map { case (topic, count) => (count, topic) } 41 | .transform(_.sortByKey(false)) 42 | 43 | // Get the top hashtags over the previous 10 sec window 44 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 45 | .map { case (topic, count) => (count, topic) } 46 | .transform(_.sortByKey(false)) 47 | 48 | // Print popular hashtags 49 | topCounts60.foreachRDD(rdd => { 50 | val topList = rdd.take(10) 51 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 52 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 53 | }) 54 | 55 | topCounts10.foreachRDD(rdd => { 56 | val topList = rdd.take(10) 57 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 58 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 59 | }) 60 | 61 | 
stream.count().map(cnt => "Received " + cnt + " flume events.").print()
62 |     ssc.start()
63 |     ssc.awaitTermination()
64 |   }
65 | }
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/KafkaDirectPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 | 
3 | import java.util.HashMap
4 | 
5 | import kafka.serializer.StringDecoder
6 | 
7 | import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord }
8 | import org.apache.spark.SparkConf
9 | import org.apache.spark.streaming._
10 | import org.apache.spark.streaming.kafka._
11 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
12 | import org.apache.spark.SparkContext._
13 | import org.apache.spark.streaming.twitter._
14 | import org.apache.spark.SparkConf
15 | import org.apache.spark.streaming._
16 | import org.apache.spark.{ SparkContext, SparkConf }
17 | import org.apache.spark.storage.StorageLevel
18 | import _root_.kafka.serializer.StringDecoder
19 | 
20 | /**
21 |  * A Spark Streaming - Kafka integration to receive twitter data from a
22 |  * kafka topic and find the popular hashtags, while also ensuring exactly-once semantics,
23 |  * i.e. each record is processed only once
24 |  *
25 |  * Arguments:
26 |  * - List of one or more Kafka brokers
27 |  * - List of one or more kafka topics to consume from
28 |  * - The directory to store and retrieve checkpoint data
29 |  *
30 |  * More discussion at stdatalabs.blogspot.com
31 |  *
32 |  * @author Sachin Thirumala
33 |  */
34 | 
35 | object KafkaDirectPopularHashTags {
36 | 
37 |   def createContext(brokers: String, topics: String, checkpointDirectory: String): StreamingContext = {
38 | 
39 |     // If you do not see this printed, the StreamingContext has been loaded
40 |     // from existing checkpoint data instead of being created anew
41 |     println("Creating new context")
42 | 
43 |     val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka DirectReceiver - PopularHashTags").set("spark.executor.memory", "1g")
44 | 
45 |     val sc = new SparkContext(conf)
46 | 
47 |     sc.setLogLevel("WARN")
48 | 
49 |     // Set the Spark StreamingContext to create a DStream for every 2 seconds
50 |     val ssc = new StreamingContext(sc, Seconds(2))
51 |     ssc.checkpoint(checkpointDirectory)
52 | 
53 |     // Define the Kafka parameters, broker list must be specified
54 |     val kafkaParams = Map[String, String](
55 |       "metadata.broker.list" -> brokers,
56 |       // start from the largest available offset, i.e. only consume messages that arrive after the stream starts
57 |       "auto.offset.reset" -> "largest")
58 | 
59 |     // Define which topics to read from
60 |     val topicsSet = topics.split(",").toSet
61 | 
62 |     // Map value from the kafka message (k, v) pair
63 |     val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
64 |     // Filter hashtags
65 |     val hashTags = lines.map(_._2).flatMap(_.split(" ")).filter(_.startsWith("#"))
66 | 
67 |     // Get the top hashtags over the previous 60/10 sec window
68 |     val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
69 |       .map { case (topic, count) => (count, topic) }
70 |       .transform(_.sortByKey(false))
71 | 
72 |     val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
73 |       .map { case (topic, count) => (count, topic) }
74 |       .transform(_.sortByKey(false))
75 | 
76 |     lines.print()
77 | 
78 |     // Print popular hashtags
79 |     topCounts60.foreachRDD(rdd => {
80 |       val topList = rdd.take(10)
81 |       println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
82 |       topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
83 |     })
84 | 
85 |     topCounts10.foreachRDD(rdd => {
86 |       val topList = rdd.take(10)
87 |       println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
88 |       topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
89 |     })
90 | 
91 |     lines.count().map(cnt => "Received " + cnt + " kafka messages.").print()
92 | 
93 |     ssc
94 | 
95 |   }
96 | 
97 |   def main(args: Array[String]) {
98 | 
99 |     if (args.length < 3) {
100 |       System.err.println(s"""
101 |         |Usage: KafkaDirectPopularHashTags <brokers> <topics> <checkpointDirectory>
102 |         |  <brokers> is a list of one or more Kafka brokers
103 |         |  <topics> is a list of one or more kafka topics to consume from
104 |         |  <checkpointDirectory> is the directory where the checkpoint metadata is stored
105 |         |
106 |         """.stripMargin)
107 |       System.exit(1)
108 |     }
109 | 
110 |     // Destructure the arguments: brokers, topic name, checkpoint directory
111 |     val Array(brokers, topics, checkpointDirectory) = args
112 | 
113 |     val ssc = StreamingContext.getOrCreate(checkpointDirectory,
114 |       () => createContext(brokers, topics, checkpointDirectory))
115 | 
116 |     ssc.start()
117 |     ssc.awaitTermination()
118 |   }
119 | 
120 | }
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/KafkaSparkPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 | 
3 | import java.util.HashMap
4 | 
5 | import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord }
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.streaming._
8 | import org.apache.spark.streaming.kafka._
9 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
10 | import org.apache.spark.SparkContext._
11 | import org.apache.spark.streaming.twitter._
12 | import org.apache.spark.SparkConf
13 | import org.apache.spark.streaming._
14 | import org.apache.spark.{ SparkContext, SparkConf }
15 | import org.apache.spark.storage.StorageLevel
16 | 
17 | /**
18 |  * A Spark Streaming - Kafka integration to receive twitter
19 |  * data from a kafka topic and find the popular hashtags
20 |  *
21 |  * Arguments:
22 |  * - The zookeeper hostname
23 |  * - The Kafka consumer group
24 |  * - The kafka topic to subscribe to
25 |  * - Number of kafka receivers to run in parallel
26 |  *
27 |  * More discussion at stdatalabs.blogspot.com
28 |  *
29 |  * @author Sachin Thirumala
30 |  */
31 | 
32 | object KafkaSparkPopularHashTags {
33 | 
34 |   val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka Producer - PopularHashTags").set("spark.executor.memory", "1g")
35 | 
36 |   conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
37 | 
38 |   val sc = new SparkContext(conf)
39 | 
40 |   def main(args: Array[String]) {
41 | 
42 |     sc.setLogLevel("WARN")
43 | 
44 |     // Destructure the arguments: zookeeper hostname/ip, consumer group, topic name, number of threads
45 |     val Array(zkQuorum, group, topics, numThreads) = args
46 | 
47 |     // Set the Spark StreamingContext to create a DStream for every 2 seconds
48 |     val ssc = new StreamingContext(sc, Seconds(2))
49 |     ssc.checkpoint("checkpoint")
50 | 
51 |     // Map each topic to a thread
52 |     val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
53 |     // Map value from the kafka message (k, v) pair
54 |     val lines = KafkaUtils.createStream(ssc, zkQuorum, group,
topicMap).map(_._2) 55 | // Filter hashtags 56 | val hashTags = lines.flatMap(_.split(" ")).filter(_.startsWith("#")) 57 | 58 | // Get the top hashtags over the previous 60/10 sec window 59 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 60 | .map { case (topic, count) => (count, topic) } 61 | .transform(_.sortByKey(false)) 62 | 63 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 64 | .map { case (topic, count) => (count, topic) } 65 | .transform(_.sortByKey(false)) 66 | 67 | lines.print() 68 | 69 | // Print popular hashtags 70 | topCounts60.foreachRDD(rdd => { 71 | val topList = rdd.take(10) 72 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 73 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 74 | }) 75 | 76 | topCounts10.foreachRDD(rdd => { 77 | val topList = rdd.take(10) 78 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 79 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 80 | }) 81 | 82 | lines.count().map(cnt => "Received " + cnt + " kafka messages.").print() 83 | 84 | ssc.start() 85 | ssc.awaitTermination() 86 | } 87 | } -------------------------------------------------------------------------------- /src/main/scala/com/stdatalabs/Streaming/RecoverableKafkaPopularHashTags.scala: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.Streaming 2 | 3 | import java.util.HashMap 4 | 5 | import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord } 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming._ 8 | import org.apache.spark.streaming.kafka._ 9 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 10 | import org.apache.spark.SparkContext._ 11 | import org.apache.spark.streaming.twitter._ 12 | import org.apache.spark.SparkConf 13 | import org.apache.spark.streaming._ 14 | import org.apache.spark.{ SparkContext, SparkConf } 15 | import org.apache.spark.storage.StorageLevel 16 | 17 | /** 18 | * A Spark Streaming - Kafka integration to receive twitter data from kafka 19 | * topic and find the popular hashtags and also ensure at-least once semantics 20 | * i.e, zero data loss 21 | * 22 | * Arguments: 23 | * - The zookeeper hostname 24 | * - The Kafka consumer group 25 | * - The kafka topic to subscribe to 26 | * - Number of kafka receivers to run in parallel 27 | * - The directory to store and retrieve checkpoint data 28 | * 29 | * More discussion at stdatalabs.blogspot.com 30 | * 31 | * @author Sachin Thirumala 32 | */ 33 | 34 | object RecoverableKafkaPopularHashTags { 35 | 36 | def createContext(zkQuorum: String, group: String, topics: String, numThreads: String, checkpointDirectory: String): StreamingContext = { 37 | 38 | // If you do not see this printed, that means the StreamingContext has been loaded 39 | // from the new checkpoint 40 | println("Creating new context") 41 | val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka Producer - PopularHashTags").set("spark.executor.memory", "1g") 42 | 43 | conf.set("spark.streaming.receiver.writeAheadLog.enable", "true") 44 | 45 | val sc = new SparkContext(conf) 46 | 47 | sc.setLogLevel("WARN") 48 | 49 | // Set the Spark StreamingContext to create a DStream for every 2 seconds 50 | val ssc = new StreamingContext(sc, Seconds(2)) 51 | 52 | ssc.checkpoint(checkpointDirectory) 53 | 54 | // 
Map each topic to a thread 55 | val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap 56 | // Map value from the kafka message (k, v) pair 57 | val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2) 58 | // Filter hashtags 59 | val hashTags = lines.flatMap(_.split(" ")).filter(_.startsWith("#")) 60 | 61 | // Get the top hashtags over the previous 60/10 sec window 62 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 63 | .map { case (topic, count) => (count, topic) } 64 | .transform(_.sortByKey(false)) 65 | 66 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 67 | .map { case (topic, count) => (count, topic) } 68 | .transform(_.sortByKey(false)) 69 | 70 | lines.print() 71 | 72 | // Print popular hashtags 73 | topCounts60.foreachRDD(rdd => { 74 | val topList = rdd.take(10) 75 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 76 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 77 | }) 78 | 79 | topCounts10.foreachRDD(rdd => { 80 | val topList = rdd.take(10) 81 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 82 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 83 | }) 84 | 85 | lines.count().map(cnt => "Received " + cnt + " kafka messages.").print() 86 | 87 | ssc 88 | 89 | } 90 | 91 | def main(args: Array[String]) { 92 | 93 | if (args.length != 5) { 94 | System.err.println("Your arguments were " + args.mkString("[", ", ", "]")) 95 | System.err.println( 96 | """ 97 | |Usage: RecoverableKafkaPopularHashTags 98 | | 99 | """.stripMargin 100 | ) 101 | System.exit(1) 102 | } 103 | 104 | // Create an array of arguments: zookeeper hostname/ip,consumer group, topicname, num of threads 105 | val Array(zkQuorum, group, topics, numThreads, checkpointDirectory) = args 106 | 107 | val ssc = StreamingContext.getOrCreate(checkpointDirectory, 108 | () => createContext(zkQuorum, group, topics, numThreads, checkpointDirectory)) 109 | 110 | ssc.start() 111 | ssc.awaitTermination() 112 | } 113 | 114 | } -------------------------------------------------------------------------------- /src/main/scala/com/stdatalabs/Streaming/SparkPopularHashTags.scala: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.Streaming 2 | 3 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.streaming.twitter._ 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming._ 8 | import org.apache.spark.{ SparkContext, SparkConf } 9 | import org.apache.spark.storage.StorageLevel 10 | import org.apache.spark.streaming.flume._ 11 | 12 | /** 13 | * A Spark Streaming application that receives tweets on certain 14 | * keywords from twitter datasource and find the popular hashtags 15 | * 16 | * Arguments: ... 
17 | * - Twitter consumer key 18 | * - Twitter consumer secret 19 | * - Twitter access token 20 | * - Twitter access token secret 21 | * - The keyword to filter tweets 22 | * - Any number of keywords to filter tweets 23 | * 24 | * More discussion at stdatalabs.blogspot.com 25 | * 26 | * @author Sachin Thirumala 27 | */ 28 | 29 | object SparkPopularHashTags { 30 | val conf = new SparkConf().setMaster("local[4]").setAppName("Spark Streaming - PopularHashTags") 31 | val sc = new SparkContext(conf) 32 | 33 | def main(args: Array[String]) { 34 | 35 | sc.setLogLevel("WARN") 36 | 37 | val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) 38 | val filters = args.takeRight(args.length - 4) 39 | 40 | // Set the system properties so that Twitter4j library used by twitter stream 41 | // can use them to generat OAuth credentials 42 | System.setProperty("twitter4j.oauth.consumerKey", consumerKey) 43 | System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) 44 | System.setProperty("twitter4j.oauth.accessToken", accessToken) 45 | System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) 46 | 47 | // Set the Spark StreamingContext to create a DStream for every 5 seconds 48 | val ssc = new StreamingContext(sc, Seconds(5)) 49 | // Pass the filter keywords as arguements 50 | 51 | // val stream = FlumeUtils.createStream(ssc, args(0), args(1).toInt) 52 | val stream = TwitterUtils.createStream(ssc, None, filters) 53 | 54 | // Split the stream on space and extract hashtags 55 | val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) 56 | 57 | // Get the top hashtags over the previous 60 sec window 58 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 59 | .map { case (topic, count) => (count, topic) } 60 | .transform(_.sortByKey(false)) 61 | 62 | // Get the top hashtags over the previous 10 sec window 63 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 64 | .map { case (topic, count) => (count, topic) } 65 | .transform(_.sortByKey(false)) 66 | 67 | // print tweets in the currect DStream 68 | stream.print() 69 | 70 | // Print popular hashtags 71 | topCounts60.foreachRDD(rdd => { 72 | val topList = rdd.take(10) 73 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 74 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 75 | }) 76 | topCounts10.foreachRDD(rdd => { 77 | val topList = rdd.take(10) 78 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 79 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 80 | }) 81 | 82 | ssc.start() 83 | ssc.awaitTermination() 84 | } 85 | } -------------------------------------------------------------------------------- /src/test/java/org/stdatalabs/TwitterPopularHashTags/AppTest.java: -------------------------------------------------------------------------------- 1 | package org.stdatalabs.TwitterPopularHashTags; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 |  */
10 | public class AppTest
11 |     extends TestCase
12 | {
13 |     /**
14 |      * Create the test case
15 |      *
16 |      * @param testName name of the test case
17 |      */
18 |     public AppTest( String testName )
19 |     {
20 |         super( testName );
21 |     }
22 | 
23 |     /**
24 |      * @return the suite of tests being tested
25 |      */
26 |     public static Test suite()
27 |     {
28 |         return new TestSuite( AppTest.class );
29 |     }
30 | 
31 |     /**
32 |      * Rigorous test :-)
33 |      */
34 |     public void testApp()
35 |     {
36 |         assertTrue( true );
37 |     }
38 | }
39 | 
--------------------------------------------------------------------------------
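
A note on the exactly-once claim for the direct approach (KafkaDirectPopularHashTags.scala): checkpointing recovers the stream on restart, but end-to-end exactly-once output also requires saving each batch's results together with the Kafka offsets that batch covered. The sketch below is illustrative only, written against the same Spark 1.4 / spark-streaming-kafka_2.10 (Kafka 0.8) APIs declared in the pom; DirectStreamOffsetsSketch and saveResultsAndOffsets are hypothetical names, not part of this project, and the atomic sink is left as a placeholder.

// Sketch only (not part of this repo): reading the offset ranges that the
// Kafka direct stream attaches to each RDD, so results and offsets can be
// stored together in one atomic action.
package com.stdatalabs.Streaming

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.kafka.{ HasOffsetRanges, KafkaUtils, OffsetRange }

object DirectStreamOffsetsSketch {
  def main(args: Array[String]) {
    val Array(brokers, topics) = args
    val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamOffsetsSketch")
    val ssc = new StreamingContext(conf, Seconds(2))

    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics.split(",").toSet)

    lines.foreachRDD { rdd =>
      // Each RDD produced by the direct stream knows exactly which Kafka
      // offsets it covers; cast to HasOffsetRanges to read them.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val hashTagCounts = rdd.map(_._2).flatMap(_.split(" ")).filter(_.startsWith("#"))
        .map((_, 1)).reduceByKey(_ + _).collect()
      // Committing the counts and the offsets in a single transaction (e.g. one
      // database commit) is what upgrades the direct stream's delivery guarantee
      // to effectively exactly-once output.
      saveResultsAndOffsets(hashTagCounts, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }

  // Hypothetical placeholder sink: a real implementation would write counts and
  // offsets atomically to an external store instead of printing them.
  def saveResultsAndOffsets(counts: Array[(String, Int)], offsets: Array[OffsetRange]): Unit =
    offsets.foreach(o => println(s"${o.topic}-${o.partition} [${o.fromOffset}, ${o.untilOffset}) -> ${counts.length} distinct hashtags"))
}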