├── .gitignore
├── README.md
├── TwitterAvroSource.conf
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       └── com
    │           └── stdatalabs
    │               ├── Kafka
    │               │   └── KafkaTwitterProducer.java
    │               └── Streaming
    │                   ├── FlumeSparkPopularHashTags.scala
    │                   ├── KafkaDirectPopularHashTags.scala
    │                   ├── KafkaSparkPopularHashTags.scala
    │                   ├── RecoverableKafkaPopularHashTags.scala
    │                   └── SparkPopularHashTags.scala
    └── test
        └── java
            └── org
                └── stdatalabs
                    └── TwitterPopularHashTags
                        └── AppTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .*/
3 | target/
4 | .classpath
5 | .cache-main
6 | .cache-tests
7 | .settings
8 | .project
9 | tweets.txt
10 | checkpoint/
11 |
12 | # Compiled source #
13 | ###################
14 | *.com
15 | *.class
16 | *.dll
17 | *.exe
18 | *.o
19 | *.so
20 | checkpoint
21 |
22 | # Packages #
23 | ############
24 | # it's better to unpack these files and commit the raw source
25 | # git has its own built in compression methods
26 | *.7z
27 | *.dmg
28 | *.gz
29 | *.iso
30 | *.jar
31 | *.rar
32 | *.tar
33 | *.zip
34 |
35 | # Logs and databases #
36 | ######################
37 | *.log
38 | *.sql
39 | *.sqlite
40 |
41 | # OS generated files #
42 | ######################
43 | .DS_Store
44 | .DS_Store?
45 | ._*
46 | .Spotlight-V100
47 | .Trashes
48 | ehthumbs.db
49 | Thumbs.db
50 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SparkTwitterPopularHashTags
2 |
3 | A Spark Streaming project that analyzes popular hashtags from live Twitter data streams. Data is ingested from different input sources (the Twitter source, Flume, and Kafka) and processed downstream using Spark Streaming.
4 |
5 | ## Requirements
6 | - IDE
7 | - Apache Maven 3.x
8 | - JDK 6 or 7
9 |
10 | ## General Info
11 | The source folder is organized into two packages, Kafka and Streaming. Each class in the Streaming package explores a different approach to consuming data from the Twitter source. Below is the list of classes; a short sketch contrasting the two Kafka integration approaches follows the list.
12 | * com/stdatalabs/Kafka
13 |   * KafkaTwitterProducer.java -- A Kafka producer that publishes Twitter data to a Kafka broker
14 | * com/stdatalabs/Streaming
15 |   * SparkPopularHashTags.scala -- Receives data directly from the Twitter datasource
16 |   * FlumeSparkPopularHashTags.scala -- Receives data from a Flume agent running the Twitter source
17 |   * KafkaSparkPopularHashTags.scala -- Receives data from a Kafka producer
18 |   * RecoverableKafkaPopularHashTags.scala -- Spark-Kafka receiver-based approach. Ensures at-least-once semantics
19 |   * KafkaDirectPopularHashTags.scala -- Spark-Kafka direct approach. Ensures exactly-once semantics
20 | * TwitterAvroSource.conf
21 |   -- Flume configuration for running the Twitter avro source
22 |
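For orientation, here is a minimal, self-contained sketch contrasting the two Kafka integration styles used by the classes above. It is illustrative only: the broker, ZooKeeper, group, and topic names are placeholders rather than values from this project, and the real hashtag logic lives in the Streaming classes.

```scala
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.kafka.KafkaUtils

object KafkaApproachesSketch {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[4]").setAppName("KafkaApproachesSketch")
    val ssc = new StreamingContext(conf, Seconds(2))

    // Receiver-based stream (KafkaSparkPopularHashTags / RecoverableKafkaPopularHashTags):
    // offsets are tracked in ZooKeeper; with the write-ahead log and checkpointing enabled
    // this gives at-least-once delivery.
    val receiverLines = KafkaUtils
      .createStream(ssc, "localhost:2181", "spark-group", Map("twitter-topic" -> 1))
      .map(_._2)

    // Direct stream (KafkaDirectPopularHashTags): no receiver; Spark computes the offset
    // ranges itself and, with checkpointing (as in the real job), enables exactly-once
    // processing of the stream transformations.
    val directLines = KafkaUtils
      .createDirectStream[String, String, StringDecoder, StringDecoder](
        ssc, Map("metadata.broker.list" -> "localhost:9092"), Set("twitter-topic"))
      .map(_._2)

    receiverLines.print()
    directLines.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```
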
23 | ## Description
24 | * ##### A Spark Streaming application that receives tweets on certain keywords from the Twitter datasource and finds the popular hashtags.
25 | Discussed in blog --
26 | [Spark Streaming part 1: Real time twitter sentiment analysis](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-1-real-time.html)
27 |
28 | * ##### A Spark Streaming - Flume integration to find popular hashtags from Twitter. It receives events from a Flume agent whose Twitter source pushes tweets as avro events to the sink (a minimal receiver sketch appears after this list).
29 | Discussed in blog --
30 | [Spark streaming part 2: Real time twitter sentiment analysis using Flume](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-2-real-time_10.html)
31 |
32 | * ##### A Spark Streaming - Kafka integration to receive Twitter data from a Kafka producer and find the popular hashtags
33 | Discussed in blog --
34 | [Spark streaming part 3: Real time twitter sentiment analysis using kafka](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-3-real-time.html)
35 |
36 | * ##### A Spark Streaming - Kafka integration to ensure at-least-once semantics (the driver-recovery pattern shared by both data-guarantee variants is sketched after this list)
37 | Discussed in blog --
38 | [Data guarantees in Spark Streaming with kafka integration](http://stdatalabs.blogspot.in/2016/10/data-guarantees-in-spark-streaming-with.html)
39 |
40 | * ##### A Spark Streaming - Kafka integration to ensure exactly-once semantics
41 | Discussed in blog --
42 | [Data guarantees in Spark Streaming with kafka integration](http://stdatalabs.blogspot.in/2016/10/data-guarantees-in-spark-streaming-with.html)
43 |
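On the Flume path, the agent defined in TwitterAvroSource.conf pushes tweets as avro events to an avro sink at ubuntu:9988, and the Spark job simply listens on that same host and port. Below is a minimal sketch of the receiving side, mirroring FlumeSparkPopularHashTags.scala; the hostname and port are the values from the conf file and must match your environment.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.flume.FlumeUtils

object FlumeReceiverSketch {
  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeReceiverSketch")
    val ssc = new StreamingContext(conf, Seconds(5))

    // Listen on the host/port configured for the Flume avro sink in TwitterAvroSource.conf
    val stream = FlumeUtils.createStream(ssc, "ubuntu", 9988)

    // Each Flume event body carries one tweet as raw bytes
    val tweets = stream.map(e => new String(e.event.getBody.array))
    tweets.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```
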
44 |
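The two data-guarantee variants share the same driver-recovery skeleton: the StreamingContext is always obtained through StreamingContext.getOrCreate, so after a crash or restart it is rebuilt from the checkpoint directory instead of being created from scratch. Here is a stripped-down sketch of that pattern; it uses a socket source as a stand-in for Kafka, and the checkpoint path is a placeholder.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }

object CheckpointRecoverySketch {
  def createContext(checkpointDir: String): StreamingContext = {
    // Runs only when no checkpoint exists yet; on recovery the saved context is restored instead
    println("Creating new context")
    val conf = new SparkConf()
      .setMaster("local[4]")
      .setAppName("CheckpointRecoverySketch")
      // Write-ahead log protects receiver-based input against driver failure
      .set("spark.streaming.receiver.writeAheadLog.enable", "true")
    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint(checkpointDir)

    // Stand-in source; the real jobs consume from Kafka instead
    ssc.socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .filter(_.startsWith("#"))
      .countByValueAndWindow(Seconds(60), Seconds(2))
      .print()
    ssc
  }

  def main(args: Array[String]) {
    val checkpointDir = "checkpoint"
    // On restart, getOrCreate rebuilds the context and its pending batches from the checkpoint
    val ssc = StreamingContext.getOrCreate(checkpointDir, () => createContext(checkpointDir))
    ssc.start()
    ssc.awaitTermination()
  }
}
```
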
45 |
46 | ### More articles on the Hadoop technology stack at [stdatalabs](http://stdatalabs.blogspot.com)
47 |
48 |
--------------------------------------------------------------------------------
/TwitterAvroSource.conf:
--------------------------------------------------------------------------------
1 | TwitterAgent.sources = Twitter
2 | TwitterAgent.channels = MemChannel
3 | TwitterAgent.sinks = avroSink
4 |
5 |
6 |
7 | # Describing/Configuring the source
8 | TwitterAgent.sources.Twitter.type = org.apache.flume.source.twitter.TwitterSource
9 | TwitterAgent.sources.Twitter.consumerKey=
10 | TwitterAgent.sources.Twitter.consumerSecret=
11 | TwitterAgent.sources.Twitter.accessToken=
12 | TwitterAgent.sources.Twitter.accessTokenSecret=
13 |
14 |
15 |
16 | # Describing/Configuring the sink
17 | TwitterAgent.sinks.avroSink.type = avro
18 | TwitterAgent.sinks.avroSink.batch-size = 1
19 | TwitterAgent.sinks.avroSink.hostname = ubuntu
20 | TwitterAgent.sinks.avroSink.port = 9988
21 |
22 |
23 |
24 | # Describing/Configuring the memory channel
25 | TwitterAgent.channels.MemChannel.type = memory
26 | TwitterAgent.channels.MemChannel.capacity = 10000
27 | TwitterAgent.channels.MemChannel.transactionCapacity = 100
28 |
29 |
30 |
31 | # Linking the source and sink to the memory channel
32 | TwitterAgent.sources.Twitter.channels = MemChannel
33 | TwitterAgent.sinks.avroSink.channel = MemChannel
34 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
3 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4 |   <modelVersion>4.0.0</modelVersion>
5 |
6 |   <groupId>com.stdatalabs.Streaming</groupId>
7 |   <artifactId>SparkTwitterAnalysis</artifactId>
8 |   <version>0.0.1-SNAPSHOT</version>
9 |   <packaging>jar</packaging>
10 |
11 |   <name>TwitterPopularHashTags</name>
12 |   <url>http://maven.apache.org</url>
13 |
14 |   <properties>
15 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16 |   </properties>
17 |
18 |   <dependencies>
19 |     <dependency>
20 |       <groupId>junit</groupId>
21 |       <artifactId>junit</artifactId>
22 |       <version>3.8.1</version>
23 |       <scope>test</scope>
24 |     </dependency>
25 |     <dependency>
26 |       <groupId>org.apache.spark</groupId>
27 |       <artifactId>spark-core_2.10</artifactId>
28 |       <version>1.4.1</version>
29 |       <scope>compile</scope>
30 |     </dependency>
31 |     <dependency>
32 |       <groupId>org.apache.spark</groupId>
33 |       <artifactId>spark-streaming_2.10</artifactId>
34 |       <version>1.4.1</version>
35 |       <scope>compile</scope>
36 |     </dependency>
37 |     <dependency>
38 |       <groupId>org.apache.spark</groupId>
39 |       <artifactId>spark-mllib_2.10</artifactId>
40 |       <version>1.4.1</version>
41 |     </dependency>
42 |     <dependency>
43 |       <groupId>org.apache.spark</groupId>
44 |       <artifactId>spark-sql_2.10</artifactId>
45 |       <version>1.4.1</version>
46 |     </dependency>
47 |     <dependency>
48 |       <groupId>org.apache.spark</groupId>
49 |       <artifactId>spark-hive_2.10</artifactId>
50 |       <version>1.4.1</version>
51 |     </dependency>
52 |     <dependency>
53 |       <groupId>org.apache.spark</groupId>
54 |       <artifactId>spark-streaming-twitter_2.10</artifactId>
55 |       <version>1.4.1</version>
56 |     </dependency>
57 |     <dependency>
58 |       <groupId>org.apache.spark</groupId>
59 |       <artifactId>spark-streaming-kafka_2.10</artifactId>
60 |       <version>1.4.1</version>
61 |     </dependency>
62 |     <dependency>
63 |       <groupId>org.apache.hadoop</groupId>
64 |       <artifactId>hadoop-client</artifactId>
65 |       <version>2.4.1</version>
66 |       <scope>compile</scope>
67 |     </dependency>
68 |     <dependency>
69 |       <groupId>org.apache.hadoop</groupId>
70 |       <artifactId>hadoop-common</artifactId>
71 |       <version>2.4.1</version>
72 |       <scope>compile</scope>
73 |     </dependency>
74 |     <dependency>
75 |       <groupId>org.apache.spark</groupId>
76 |       <artifactId>spark-streaming-flume_2.10</artifactId>
77 |       <version>1.4.1</version>
78 |     </dependency>
79 |     <dependency>
80 |       <groupId>org.apache.spark</groupId>
81 |       <artifactId>spark-streaming-flume-sink_2.10</artifactId>
82 |       <version>1.4.1</version>
83 |     </dependency>
84 |     <dependency>
85 |       <groupId>org.twitter4j</groupId>
86 |       <artifactId>twitter4j-core</artifactId>
87 |       <version>3.0.3</version>
88 |     </dependency>
89 |     <dependency>
90 |       <groupId>org.twitter4j</groupId>
91 |       <artifactId>twitter4j-stream</artifactId>
92 |       <version>3.0.3</version>
93 |     </dependency>
94 |     <dependency>
95 |       <groupId>org.twitter4j</groupId>
96 |       <artifactId>twitter4j-async</artifactId>
97 |       <version>3.0.3</version>
98 |     </dependency>
99 |     <dependency>
100 |       <groupId>org.apache.storm</groupId>
101 |       <artifactId>storm-core</artifactId>
102 |       <version>0.10.0</version>
103 |       <scope>provided</scope>
104 |       <exclusions>
105 |         <exclusion>
106 |           <groupId>org.slf4j</groupId>
107 |           <artifactId>slf4j-log4j12</artifactId>
108 |         </exclusion>
109 |       </exclusions>
110 |     </dependency>
111 |     <dependency>
112 |       <groupId>org.apache.kafka</groupId>
113 |       <artifactId>kafka_2.10</artifactId>
114 |       <version>0.8.2.1</version>
115 |       <exclusions>
116 |         <exclusion>
117 |           <groupId>org.slf4j</groupId>
118 |           <artifactId>slf4j-log4j12</artifactId>
119 |         </exclusion>
120 |       </exclusions>
121 |     </dependency>
122 |     <dependency>
123 |       <groupId>org.apache.storm</groupId>
124 |       <artifactId>storm-kafka</artifactId>
125 |       <version>0.10.0-beta1</version>
126 |     </dependency>
127 |     <dependency>
128 |       <groupId>org.apache.storm</groupId>
129 |       <artifactId>storm-hdfs</artifactId>
130 |       <version>0.10.0-beta1</version>
131 |     </dependency>
132 |   </dependencies>
133 | </project>
134 |
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Kafka/KafkaTwitterProducer.java:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Kafka;
2 |
3 | import java.util.Arrays;
4 | import java.util.Properties;
5 | import java.util.concurrent.LinkedBlockingQueue;
6 |
7 | import twitter4j.FilterQuery;
8 | import twitter4j.HashtagEntity;
9 | import twitter4j.StallWarning;
10 | import twitter4j.Status;
11 | import twitter4j.StatusDeletionNotice;
12 | import twitter4j.StatusListener;
13 | import twitter4j.TwitterStream;
14 | import twitter4j.TwitterStreamFactory;
15 | import twitter4j.conf.ConfigurationBuilder;
16 |
17 | import org.apache.kafka.clients.producer.KafkaProducer;
18 | import org.apache.kafka.clients.producer.Producer;
19 | import org.apache.kafka.clients.producer.ProducerRecord;
23 |
24 | /**
25 | * A Kafka Producer that gets tweets on certain keywords
26 | * from twitter datasource and publishes to a kafka topic
27 | *
28 | * Arguments: ...
29 | * - Twitter consumer key
30 | * - Twitter consumer secret
31 | * - Twitter access token
32 | * - Twitter access token secret
33 | * - The kafka topic to publish to
34 | * - The keyword to filter tweets
35 | * - Any number of keywords to filter tweets
36 | *
37 | * More discussion at stdatalabs.blogspot.com
38 | *
39 | * @author Sachin Thirumala
40 | */
41 |
42 | public class KafkaTwitterProducer {
43 | public static void main(String[] args) throws Exception {
44 | final LinkedBlockingQueue<Status> queue = new LinkedBlockingQueue<Status>(1000);
45 |
46 | if (args.length < 5) {
47 | System.out.println(
48 | "Usage: KafkaTwitterProducer <consumer-key> <consumer-secret> <access-token> <access-token-secret> <topic-name> [<keyword> ...]");
49 | return;
50 | }
51 |
52 | String consumerKey = args[0].toString();
53 | String consumerSecret = args[1].toString();
54 | String accessToken = args[2].toString();
55 | String accessTokenSecret = args[3].toString();
56 | String topicName = args[4].toString();
57 | String[] arguments = args.clone();
58 | String[] keyWords = Arrays.copyOfRange(arguments, 5, arguments.length);
59 |
60 | // Set twitter oAuth tokens in the configuration
61 | ConfigurationBuilder cb = new ConfigurationBuilder();
62 | cb.setDebugEnabled(true).setOAuthConsumerKey(consumerKey).setOAuthConsumerSecret(consumerSecret)
63 | .setOAuthAccessToken(accessToken).setOAuthAccessTokenSecret(accessTokenSecret);
64 |
65 | // Create twitterstream using the configuration
66 | TwitterStream twitterStream = new TwitterStreamFactory(cb.build()).getInstance();
67 | StatusListener listener = new StatusListener() {
68 |
69 | @Override
70 | public void onStatus(Status status) {
71 | queue.offer(status);
72 | }
73 |
74 | @Override
75 | public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
76 | System.out.println("Got a status deletion notice id:" + statusDeletionNotice.getStatusId());
77 | }
78 |
79 | @Override
80 | public void onTrackLimitationNotice(int numberOfLimitedStatuses) {
81 | System.out.println("Got track limitation notice:" + numberOfLimitedStatuses);
82 | }
83 |
84 | @Override
85 | public void onScrubGeo(long userId, long upToStatusId) {
86 | System.out.println("Got scrub_geo event userId:" + userId + "upToStatusId:" + upToStatusId);
87 | }
88 |
89 | @Override
90 | public void onStallWarning(StallWarning warning) {
91 | System.out.println("Got stall warning:" + warning);
92 | }
93 |
94 | @Override
95 | public void onException(Exception ex) {
96 | ex.printStackTrace();
97 | }
98 | };
99 | twitterStream.addListener(listener);
100 |
101 | // Filter keywords
102 | FilterQuery query = new FilterQuery().track(keyWords);
103 | twitterStream.filter(query);
104 |
105 | // Thread.sleep(5000);
106 |
107 | // Add Kafka producer config settings
108 | Properties props = new Properties();
109 | props.put("metadata.broker.list", "localhost:9092");
110 | props.put("bootstrap.servers", "localhost:9092");
111 | props.put("acks", "all");
112 | props.put("retries", 0);
113 | props.put("batch.size", 16384);
114 | props.put("linger.ms", 1);
115 | props.put("buffer.memory", 33554432);
116 |
117 | props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
118 | props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
119 |
120 | Producer<String, String> producer = new KafkaProducer<String, String>(props);
121 | int i = 0;
122 | int j = 0;
123 |
124 | // poll for new tweets in the queue. If new tweets are added, send them
125 | // to the topic
126 | while (true) {
127 | Status ret = queue.poll();
128 |
129 | if (ret == null) {
130 | Thread.sleep(100);
131 | // i++;
132 | } else {
133 | for (HashtagEntity hashtage : ret.getHashtagEntities()) {
134 | System.out.println("Tweet:" + ret);
135 | System.out.println("Hashtag: " + hashtage.getText());
136 | // producer.send(new ProducerRecord(
137 | // topicName, Integer.toString(j++), hashtage.getText()));
138 | producer.send(new ProducerRecord<String, String>(topicName, Integer.toString(j++), ret.getText()));
139 | }
140 | }
141 | }
142 | // producer.close();
143 | // Thread.sleep(500);
144 | // twitterStream.shutdown();
145 | }
146 |
147 | }
148 |
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/FlumeSparkPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 |
3 | import org.apache.spark.{ SparkContext, SparkConf }
4 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
5 | import org.apache.spark.streaming.flume._
11 |
12 | /**
13 | * A Spark Streaming - Flume integration to find Popular hashtags from twitter
14 | * It receives events from a Flume source that connects to twitter and pushes
15 | * tweets as avro events to sink.
16 | *
17 | * More discussion at stdatalabs.blogspot.com
18 | *
19 | * @author Sachin Thirumala
20 | */
21 |
22 | object FlumeSparkPopularHashTags {
23 | val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Flume Source - PopularHashTags")
24 | val sc = new SparkContext(conf)
25 | def main(args: Array[String]) {
26 | sc.setLogLevel("WARN")
27 | // Set the Spark StreamingContext to create a DStream for every 5 seconds
28 | val ssc = new StreamingContext(sc, Seconds(5))
29 | val filter = args.takeRight(args.length)
30 |
31 | // Create a stream using FlumeUtils to receive data from the Flume avro sink (hostname: ubuntu, port: 9988)
32 | val stream = FlumeUtils.createStream(ssc, "ubuntu", 9988)
33 | val tweets = stream.map(e => new String(e.event.getBody.array))
34 | tweets.print()
35 |
36 | // Split the stream on space and extract hashtags
37 | val hashTags = tweets.flatMap(status => status.split(" ").filter(_.startsWith("#")))
38 | // Get the top hashtags over the previous 60 sec window
39 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
40 | .map { case (topic, count) => (count, topic) }
41 | .transform(_.sortByKey(false))
42 |
43 | // Get the top hashtags over the previous 10 sec window
44 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
45 | .map { case (topic, count) => (count, topic) }
46 | .transform(_.sortByKey(false))
47 |
48 | // Print popular hashtags
49 | topCounts60.foreachRDD(rdd => {
50 | val topList = rdd.take(10)
51 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
52 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
53 | })
54 |
55 | topCounts10.foreachRDD(rdd => {
56 | val topList = rdd.take(10)
57 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
58 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
59 | })
60 |
61 | stream.count().map(cnt => "Received " + cnt + " flume events.").print()
62 | ssc.start()
63 | ssc.awaitTermination()
64 | }
65 | }
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/KafkaDirectPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 |
3 | import kafka.serializer.StringDecoder
4 |
5 | import org.apache.spark.{ SparkContext, SparkConf }
6 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
7 | import org.apache.spark.streaming.kafka._
19 |
20 | /**
21 | * A Spark Streaming - Kafka integration to receive twitter data from
22 | * a kafka topic, find the popular hashtags, and ensure exactly-once semantics,
23 | * i.e., each record is processed only once
24 | *
25 | * Arguments:
26 | * - List of one or more Kafka brokers
27 | * - List of one or more kafka topics to consume from
28 | * - The directory to store and retrieve checkpoint data
29 | *
30 | * More discussion at stdatalabs.blogspot.com
31 | *
32 | * @author Sachin Thirumala
33 | */
34 |
35 | object KafkaDirectPopularHashTags {
36 |
37 | def createContext(brokers: String, topics: String, checkpointDirectory: String): StreamingContext = {
38 |
39 | // If you do not see this printed, the StreamingContext has been recovered
40 | // from an existing checkpoint
41 | println("Creating new context")
42 |
43 | val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka DirectReceiver - PopularHashTags").set("spark.executor.memory", "1g")
44 |
45 | val sc = new SparkContext(conf)
46 |
47 | sc.setLogLevel("WARN")
48 |
49 | // Set the Spark StreamingContext to create a DStream for every 2 seconds
50 | val ssc = new StreamingContext(sc, Seconds(2))
51 | ssc.checkpoint("checkpoint")
52 |
53 | // Define the Kafka parameters, broker list must be specified
54 | val kafkaParams = Map[String, String](
55 | "metadata.broker.list" -> brokers,
56 | // "largest" starts from the latest offset; use "smallest" to read from the beginning of the kafka log
57 | "auto.offset.reset" -> "largest")
58 |
59 | // Define which topics to read from
60 | val topicsSet = topics.split(",").toSet
61 |
62 | // Map value from the kafka message (k, v) pair
63 | val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
64 | // Filter hashtags
65 | val hashTags = lines.map(_._2).flatMap(_.split(" ")).filter(_.startsWith("#"))
66 |
67 | // Get the top hashtags over the previous 60/10 sec window
68 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
69 | .map { case (topic, count) => (count, topic) }
70 | .transform(_.sortByKey(false))
71 |
72 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
73 | .map { case (topic, count) => (count, topic) }
74 | .transform(_.sortByKey(false))
75 |
76 | lines.print()
77 |
78 | // Print popular hashtags
79 | topCounts60.foreachRDD(rdd => {
80 | val topList = rdd.take(10)
81 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
82 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
83 | })
84 |
85 | topCounts10.foreachRDD(rdd => {
86 | val topList = rdd.take(10)
87 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
88 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
89 | })
90 |
91 | lines.count().map(cnt => "Received " + cnt + " kafka messages.").print()
92 |
93 | ssc
94 |
95 | }
96 |
97 | def main(args: Array[String]) {
98 |
99 | if (args.length < 3) {
100 | System.err.println(s"""
101 | |Usage: KafkaDirectPopularHashTags <brokers> <topics> <checkpointDirectory>
102 | | <brokers> is a list of one or more Kafka brokers
103 | | <topics> is a list of one or more kafka topics to consume from
104 | | <checkpointDirectory> is the directory where the checkpoint metadata is stored
105 | |
106 | """.stripMargin)
107 | System.exit(1)
108 | }
109 |
110 | // Extract the arguments: brokers, topic names, checkpoint directory
111 | val Array(brokers, topics, checkpointDirectory) = args
112 |
113 | val ssc = StreamingContext.getOrCreate(checkpointDirectory,
114 | () => createContext(brokers, topics, checkpointDirectory))
115 |
116 | ssc.start()
117 | ssc.awaitTermination()
118 | }
119 |
120 | }
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/KafkaSparkPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 |
3 | import org.apache.spark.{ SparkContext, SparkConf }
4 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
5 | import org.apache.spark.streaming.kafka._
16 |
17 | /**
18 | * A Spark Streaming - Kafka integration to receive twitter
19 | * data from kafka topic and find the popular hashtags
20 | *
21 | * Arguments:
22 | * - The zookeeper hostname
23 | * - The Kafka consumer group
24 | * - The kafka topic to subscribe to
25 | * - Number of kafka receivers to run in parallel
26 | *
27 | * More discussion at stdatalabs.blogspot.com
28 | *
29 | * @author Sachin Thirumala
30 | */
31 |
32 | object KafkaSparkPopularHashTags {
33 |
34 | val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka Producer - PopularHashTags").set("spark.executor.memory", "1g")
35 |
36 | conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
37 |
38 | val sc = new SparkContext(conf)
39 |
40 | def main(args: Array[String]) {
41 |
42 | sc.setLogLevel("WARN")
43 |
44 | // Extract the arguments: zookeeper quorum, consumer group, topic names, number of threads
45 | val Array(zkQuorum, group, topics, numThreads) = args
46 |
47 | // Set the Spark StreamingContext to create a DStream for every 2 seconds
48 | val ssc = new StreamingContext(sc, Seconds(2))
49 | ssc.checkpoint("checkpoint")
50 |
51 | // Map each topic to a thread
52 | val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
53 | // Map value from the kafka message (k, v) pair
54 | val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)
55 | // Filter hashtags
56 | val hashTags = lines.flatMap(_.split(" ")).filter(_.startsWith("#"))
57 |
58 | // Get the top hashtags over the previous 60/10 sec window
59 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
60 | .map { case (topic, count) => (count, topic) }
61 | .transform(_.sortByKey(false))
62 |
63 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
64 | .map { case (topic, count) => (count, topic) }
65 | .transform(_.sortByKey(false))
66 |
67 | lines.print()
68 |
69 | // Print popular hashtags
70 | topCounts60.foreachRDD(rdd => {
71 | val topList = rdd.take(10)
72 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
73 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
74 | })
75 |
76 | topCounts10.foreachRDD(rdd => {
77 | val topList = rdd.take(10)
78 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
79 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
80 | })
81 |
82 | lines.count().map(cnt => "Received " + cnt + " kafka messages.").print()
83 |
84 | ssc.start()
85 | ssc.awaitTermination()
86 | }
87 | }
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/RecoverableKafkaPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 |
3 | import org.apache.spark.{ SparkContext, SparkConf }
4 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
5 | import org.apache.spark.streaming.kafka._
16 |
17 | /**
18 | * A Spark Streaming - Kafka integration to receive twitter data from kafka
19 | * topic, find the popular hashtags, and ensure at-least-once semantics,
20 | * i.e., zero data loss
21 | *
22 | * Arguments:
23 | * - The zookeeper hostname
24 | * - The Kafka consumer group
25 | * - The kafka topic to subscribe to
26 | * - Number of kafka receivers to run in parallel
27 | * - The directory to store and retrieve checkpoint data
28 | *
29 | * More discussion at stdatalabs.blogspot.com
30 | *
31 | * @author Sachin Thirumala
32 | */
33 |
34 | object RecoverableKafkaPopularHashTags {
35 |
36 | def createContext(zkQuorum: String, group: String, topics: String, numThreads: String, checkpointDirectory: String): StreamingContext = {
37 |
38 | // If you do not see this printed, the StreamingContext has been recovered
39 | // from an existing checkpoint
40 | println("Creating new context")
41 | val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka Producer - PopularHashTags").set("spark.executor.memory", "1g")
42 |
43 | conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
44 |
45 | val sc = new SparkContext(conf)
46 |
47 | sc.setLogLevel("WARN")
48 |
49 | // Set the Spark StreamingContext to create a DStream for every 2 seconds
50 | val ssc = new StreamingContext(sc, Seconds(2))
51 |
52 | ssc.checkpoint(checkpointDirectory)
53 |
54 | // Map each topic to a thread
55 | val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
56 | // Map value from the kafka message (k, v) pair
57 | val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)
58 | // Filter hashtags
59 | val hashTags = lines.flatMap(_.split(" ")).filter(_.startsWith("#"))
60 |
61 | // Get the top hashtags over the previous 60/10 sec window
62 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
63 | .map { case (topic, count) => (count, topic) }
64 | .transform(_.sortByKey(false))
65 |
66 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
67 | .map { case (topic, count) => (count, topic) }
68 | .transform(_.sortByKey(false))
69 |
70 | lines.print()
71 |
72 | // Print popular hashtags
73 | topCounts60.foreachRDD(rdd => {
74 | val topList = rdd.take(10)
75 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
76 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
77 | })
78 |
79 | topCounts10.foreachRDD(rdd => {
80 | val topList = rdd.take(10)
81 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
82 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
83 | })
84 |
85 | lines.count().map(cnt => "Received " + cnt + " kafka messages.").print()
86 |
87 | ssc
88 |
89 | }
90 |
91 | def main(args: Array[String]) {
92 |
93 | if (args.length != 5) {
94 | System.err.println("Your arguments were " + args.mkString("[", ", ", "]"))
95 | System.err.println(
96 | """
97 | |Usage: RecoverableKafkaPopularHashTags <zkQuorum> <group> <topics> <numThreads> <checkpointDirectory>
98 | |
99 | """.stripMargin
100 | )
101 | System.exit(1)
102 | }
103 |
104 | // Extract the arguments: zookeeper quorum, consumer group, topic names, number of threads
105 | val Array(zkQuorum, group, topics, numThreads, checkpointDirectory) = args
106 |
107 | val ssc = StreamingContext.getOrCreate(checkpointDirectory,
108 | () => createContext(zkQuorum, group, topics, numThreads, checkpointDirectory))
109 |
110 | ssc.start()
111 | ssc.awaitTermination()
112 | }
113 |
114 | }
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/SparkPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 |
3 | import org.apache.spark.{ SparkContext, SparkConf }
4 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
5 | import org.apache.spark.streaming.twitter._
6 | import org.apache.spark.streaming.flume._
11 |
12 | /**
13 | * A Spark Streaming application that receives tweets on certain
14 | * keywords from twitter datasource and find the popular hashtags
15 | *
16 | * Arguments: ...
17 | * - Twitter consumer key
18 | * - Twitter consumer secret
19 | * - Twitter access token
20 | * - Twitter access token secret
21 | * - The keyword to filter tweets
22 | * - Any number of keywords to filter tweets
23 | *
24 | * More discussion at stdatalabs.blogspot.com
25 | *
26 | * @author Sachin Thirumala
27 | */
28 |
29 | object SparkPopularHashTags {
30 | val conf = new SparkConf().setMaster("local[4]").setAppName("Spark Streaming - PopularHashTags")
31 | val sc = new SparkContext(conf)
32 |
33 | def main(args: Array[String]) {
34 |
35 | sc.setLogLevel("WARN")
36 |
37 | val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
38 | val filters = args.takeRight(args.length - 4)
39 |
40 | // Set the system properties so that Twitter4j library used by twitter stream
41 | // can use them to generate OAuth credentials
42 | System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
43 | System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
44 | System.setProperty("twitter4j.oauth.accessToken", accessToken)
45 | System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
46 |
47 | // Set the Spark StreamingContext to create a DStream for every 5 seconds
48 | val ssc = new StreamingContext(sc, Seconds(5))
49 | // Pass the filter keywords as arguments
50 |
51 | // val stream = FlumeUtils.createStream(ssc, args(0), args(1).toInt)
52 | val stream = TwitterUtils.createStream(ssc, None, filters)
53 |
54 | // Split the stream on space and extract hashtags
55 | val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))
56 |
57 | // Get the top hashtags over the previous 60 sec window
58 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
59 | .map { case (topic, count) => (count, topic) }
60 | .transform(_.sortByKey(false))
61 |
62 | // Get the top hashtags over the previous 10 sec window
63 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
64 | .map { case (topic, count) => (count, topic) }
65 | .transform(_.sortByKey(false))
66 |
67 | // Print tweets in the current DStream
68 | stream.print()
69 |
70 | // Print popular hashtags
71 | topCounts60.foreachRDD(rdd => {
72 | val topList = rdd.take(10)
73 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
74 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
75 | })
76 | topCounts10.foreachRDD(rdd => {
77 | val topList = rdd.take(10)
78 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
79 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
80 | })
81 |
82 | ssc.start()
83 | ssc.awaitTermination()
84 | }
85 | }
--------------------------------------------------------------------------------
/src/test/java/org/stdatalabs/TwitterPopularHashTags/AppTest.java:
--------------------------------------------------------------------------------
1 | package org.stdatalabs.TwitterPopularHashTags;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigorous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------