├── .gitignore
├── README.md
├── TwitterAvroSource.conf
├── pom.xml
└── src
    ├── main
    │   └── scala
    │       └── com
    │           └── stdatalabs
    │               ├── Kafka
    │               │   └── KafkaTwitterProducer.java
    │               └── Streaming
    │                   ├── FlumeSparkPopularHashTags.scala
    │                   ├── KafkaDirectPopularHashTags.scala
    │                   ├── KafkaSparkPopularHashTags.scala
    │                   ├── RecoverableKafkaPopularHashTags.scala
    │                   └── SparkPopularHashTags.scala
    └── test
        └── java
            └── org
                └── stdatalabs
                    └── TwitterPopularHashTags
                        └── AppTest.java

/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .*/
3 | target/
4 | .classpath
5 | .cache-main
6 | .cache-tests
7 | .settings
8 | .project
9 | tweets.txt
10 | checkpoint/
11 | 
12 | # Compiled source #
13 | ###################
14 | *.com
15 | *.class
16 | *.dll
17 | *.exe
18 | *.o
19 | *.so
20 | checkpoint
21 | 
22 | # Packages #
23 | ############
24 | # it's better to unpack these files and commit the raw source
25 | # git has its own built in compression methods
26 | *.7z
27 | *.dmg
28 | *.gz
29 | *.iso
30 | *.jar
31 | *.rar
32 | *.tar
33 | *.zip
34 | 
35 | # Logs and databases #
36 | ######################
37 | *.log
38 | *.sql
39 | *.sqlite
40 | 
41 | # OS generated files #
42 | ######################
43 | .DS_Store
44 | .DS_Store?
45 | ._*
46 | .Spotlight-V100
47 | .Trashes
48 | ehthumbs.db
49 | Thumbs.db
50 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SparkTwitterPopularHashTags
2 | 
3 | A Spark Streaming project that analyzes popular hashtags from live Twitter data streams. Data is ingested from different input sources (the Twitter source, Flume, and Kafka) and processed downstream using Spark Streaming.
4 | 
5 | ## Requirements
6 | - IDE
7 | - Apache Maven 3.x
8 | - Java 6 or 7
9 | 
10 | ## General Info
11 | The source folder is organized into two packages, Kafka and Streaming. Each class in the Streaming package explores a different approach to consuming data from the Twitter source. Below is the list of classes:
12 | * com/stdatalabs/Kafka
13 |   * KafkaTwitterProducer.java -- A Kafka producer that publishes Twitter data to a Kafka broker
14 | * com/stdatalabs/Streaming
15 |   * SparkPopularHashTags.scala -- Receives data from the Twitter datasource
16 |   * FlumeSparkPopularHashTags.scala -- Receives data from the Flume Twitter producer
17 |   * KafkaSparkPopularHashTags.scala -- Receives data from the Kafka producer
18 |   * RecoverableKafkaPopularHashTags.scala -- Spark-Kafka receiver-based approach. Ensures at-least-once semantics
19 |   * KafkaDirectPopularHashTags.scala -- Spark-Kafka direct approach. Ensures exactly-once semantics
20 | * TwitterAvroSource.conf
21 |   -- Flume conf for running the Twitter Avro source
22 | 
23 | ## Description
24 | * ##### A Spark Streaming application that receives tweets on certain keywords from the Twitter datasource and finds the popular hashtags.
25 | Discussed in blog --
26 | [Spark Streaming part 1: Real time twitter sentiment analysis](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-1-real-time.html)
27 | 
28 | * ##### A Spark Streaming - Flume integration to find popular hashtags from Twitter. It receives events from a Flume source that connects to Twitter and pushes tweets as Avro events to the sink.
29 | Discussed in blog -- 30 | [Spark streaming part 2: Real time twitter sentiment analysis using Flume](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-2-real-time_10.html) 31 | 32 | * ##### A Spark Streaming - Kafka integration to receive twitter data from kafka producer and find the popular hashtags 33 | Discussed in blog -- 34 | [Spark streaming part 3: Real time twitter sentiment analysis using kafka](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-3-real-time.html) 35 | 36 | * ##### A Spark Streaming - Kafka integration to ensure at-least once semantics 37 | Discussed in blog -- 38 | [Data guarantees in Spark Streaming with kafka integration](http://stdatalabs.blogspot.in/2016/10/data-guarantees-in-spark-streaming-with.html) 39 | 40 | * ##### A Spark Streaming - Kafka integration to ensure exactly once semantics 41 | Discussed in blog -- 42 | [Data guarantees in Spark Streaming with kafka integration](http://stdatalabs.blogspot.in/2016/10/data-guarantees-in-spark-streaming-with.html) 43 | 44 | 45 | 46 | ### More articles on hadoop technology stack at [stdatalabs](http://stdatalabs.blogspot.com) 47 | 48 | -------------------------------------------------------------------------------- /TwitterAvroSource.conf: -------------------------------------------------------------------------------- 1 | TwitterAgent.sources = Twitter 2 | TwitterAgent.channels = MemChannel 3 | TwitterAgent.sinks = avroSink 4 | 5 | 6 | 7 | # Describing/Configuring the source 8 | TwitterAgent.sources.Twitter.type = org.apache.flume.source.twitter.TwitterSource 9 | TwitterAgent.sources.Twitter.consumerKey= 10 | TwitterAgent.sources.Twitter.consumerSecret= 11 | TwitterAgent.sources.Twitter.accessToken= 12 | TwitterAgent.sources.Twitter.accessTokenSecret= 13 | 14 | 15 | 16 | # Describing/Configuring the sink 17 | TwitterAgent.sinks.avroSink.type = avro 18 | TwitterAgent.sinks.avroSink.batch-size = 1 19 | TwitterAgent.sinks.avroSink.hostname = ubuntu 20 | TwitterAgent.sinks.avroSink.port = 9988 21 | 22 | 23 | 24 | # Describing/Configuring the memory channel 25 | TwitterAgent.channels.MemChannel.type = memory 26 | TwitterAgent.channels.MemChannel.capacity = 10000 27 | TwitterAgent.channels.MemChannel.transactionCapacity = 100 28 | 29 | 30 | 31 | # Linking the source and sink to the memory channel 32 | TwitterAgent.sources.Twitter.channels = MemChannel 33 | TwitterAgent.sinks.avroSink.channel = MemChannel 34 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.stdatalabs.Streaming 6 | SparkTwitterAnalysis 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | TwitterPopularHashTags 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | junit 20 | junit 21 | 3.8.1 22 | test 23 | 24 | 25 | junit 26 | junit 27 | 3.8.1 28 | test 29 | 30 | 31 | org.apache.spark 32 | spark-core_2.10 33 | 1.4.1 34 | compile 35 | 36 | 37 | 38 | org.apache.spark 39 | spark-streaming_2.10 40 | 1.4.1 41 | compile 42 | 43 | 44 | 45 | org.apache.spark 46 | spark-mllib_2.10 47 | 1.4.1 48 | 49 | 50 | org.apache.spark 51 | spark-sql_2.10 52 | 1.4.1 53 | 54 | 55 | org.apache.spark 56 | spark-hive_2.10 57 | 1.4.1 58 | 59 | 60 | 61 | org.apache.spark 62 | spark-streaming-twitter_2.10 63 | 1.4.1 64 | 65 | 66 | org.apache.spark 67 | spark-streaming-kafka_2.10 68 | 1.4.1 69 | 70 | 71 | org.apache.hadoop 72 | hadoop-client 73 | 2.4.1 74 | compile 75 | 76 | 77 | org.apache.hadoop 78 | 
hadoop-common 79 | 2.4.1 80 | compile 81 | 82 | 83 | 84 | org.apache.spark 85 | spark-streaming-flume_2.10 86 | 1.4.1 87 | 88 | 89 | 90 | org.apache.spark 91 | spark-streaming-flume-sink_2.10 92 | 1.4.1 93 | 94 | 95 | 96 | org.twitter4j 97 | twitter4j-core 98 | 3.0.3 99 | 100 | 101 | 102 | org.twitter4j 103 | twitter4j-stream 104 | 3.0.3 105 | 106 | 107 | 108 | org.twitter4j 109 | twitter4j-async 110 | 3.0.3 111 | 112 | 113 | org.apache.storm 114 | storm-core 115 | 0.10.0 116 | provided 117 | 118 | 119 | org.slf4j 120 | slf4j-log4j12 121 | 122 | 123 | 124 | 125 | org.apache.kafka 126 | kafka_2.10 127 | 0.8.2.1 128 | 129 | 130 | org.slf4j 131 | slf4j-log4j12 132 | 133 | 134 | 135 | 136 | org.apache.storm 137 | storm-kafka 138 | 0.10.0-beta1 139 | 140 | 141 | org.apache.storm 142 | storm-hdfs 143 | 0.10.0-beta1 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /src/main/scala/com/stdatalabs/Kafka/KafkaTwitterProducer.java: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.Kafka; 2 | 3 | import java.util.Arrays; 4 | import java.util.Properties; 5 | import java.util.concurrent.LinkedBlockingQueue; 6 | 7 | import twitter4j.*; 8 | import twitter4j.conf.*; 9 | 10 | import twitter4j.StallWarning; 11 | import twitter4j.Status; 12 | import twitter4j.StatusDeletionNotice; 13 | import twitter4j.StatusListener; 14 | import twitter4j.TwitterStream; 15 | import twitter4j.TwitterStreamFactory; 16 | import twitter4j.conf.ConfigurationBuilder; 17 | import twitter4j.json.DataObjectFactory; 18 | 19 | import org.apache.kafka.clients.producer.Producer; 20 | import org.apache.kafka.clients.producer.KafkaProducer; 21 | import org.apache.kafka.clients.producer.ProducerRecord; 22 | import kafka.producer.KeyedMessage; 23 | 24 | /** 25 | * A Kafka Producer that gets tweets on certain keywords 26 | * from twitter datasource and publishes to a kafka topic 27 | * 28 | * Arguments: ... 
29 |  * - Twitter consumer key
30 |  * - Twitter consumer secret
31 |  * - Twitter access token
32 |  * - Twitter access token secret
33 |  * - The kafka topic to subscribe to
34 |  * - The keyword to filter tweets
35 |  * - Any number of keywords to filter tweets
36 |  *
37 |  * More discussion at stdatalabs.blogspot.com
38 |  *
39 |  * @author Sachin Thirumala
40 |  */
41 | 
42 | public class KafkaTwitterProducer {
43 |     public static void main(String[] args) throws Exception {
44 |         final LinkedBlockingQueue<Status> queue = new LinkedBlockingQueue<Status>(1000);
45 | 
46 |         if (args.length < 5) {
47 |             System.out.println(
48 |                     "Usage: KafkaTwitterProducer <consumer-key> <consumer-secret> <access-token> <access-token-secret> <topic-name> <keyword_1> ... <keyword_n>");
49 |             return;
50 |         }
51 | 
52 |         String consumerKey = args[0].toString();
53 |         String consumerSecret = args[1].toString();
54 |         String accessToken = args[2].toString();
55 |         String accessTokenSecret = args[3].toString();
56 |         String topicName = args[4].toString();
57 |         String[] arguments = args.clone();
58 |         String[] keyWords = Arrays.copyOfRange(arguments, 5, arguments.length);
59 | 
60 |         // Set twitter oAuth tokens in the configuration
61 |         ConfigurationBuilder cb = new ConfigurationBuilder();
62 |         cb.setDebugEnabled(true).setOAuthConsumerKey(consumerKey).setOAuthConsumerSecret(consumerSecret)
63 |                 .setOAuthAccessToken(accessToken).setOAuthAccessTokenSecret(accessTokenSecret);
64 | 
65 |         // Create twitterstream using the configuration
66 |         TwitterStream twitterStream = new TwitterStreamFactory(cb.build()).getInstance();
67 |         StatusListener listener = new StatusListener() {
68 | 
69 |             @Override
70 |             public void onStatus(Status status) {
71 |                 queue.offer(status);
72 |             }
73 | 
74 |             @Override
75 |             public void onDeletionNotice(StatusDeletionNotice statusDeletionNotice) {
76 |                 System.out.println("Got a status deletion notice id:" + statusDeletionNotice.getStatusId());
77 |             }
78 | 
79 |             @Override
80 |             public void onTrackLimitationNotice(int numberOfLimitedStatuses) {
81 |                 System.out.println("Got track limitation notice:" + numberOfLimitedStatuses);
82 |             }
83 | 
84 |             @Override
85 |             public void onScrubGeo(long userId, long upToStatusId) {
86 |                 System.out.println("Got scrub_geo event userId:" + userId + " upToStatusId:" + upToStatusId);
87 |             }
88 | 
89 |             @Override
90 |             public void onStallWarning(StallWarning warning) {
91 |                 System.out.println("Got stall warning:" + warning);
92 |             }
93 | 
94 |             @Override
95 |             public void onException(Exception ex) {
96 |                 ex.printStackTrace();
97 |             }
98 |         };
99 |         twitterStream.addListener(listener);
100 | 
101 |         // Filter keywords
102 |         FilterQuery query = new FilterQuery().track(keyWords);
103 |         twitterStream.filter(query);
104 | 
105 |         // Thread.sleep(5000);
106 | 
107 |         // Add Kafka producer config settings
108 |         Properties props = new Properties();
109 |         props.put("metadata.broker.list", "localhost:9092");
110 |         props.put("bootstrap.servers", "localhost:9092");
111 |         props.put("acks", "all");
112 |         props.put("retries", 0);
113 |         props.put("batch.size", 16384);
114 |         props.put("linger.ms", 1);
115 |         props.put("buffer.memory", 33554432);
116 | 
117 |         props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
118 |         props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
119 | 
120 |         Producer<String, String> producer = new KafkaProducer<String, String>(props);
121 |         int i = 0;
122 |         int j = 0;
123 | 
124 |         // poll for new tweets in the queue.
If new tweets are added, send them 125 | // to the topic 126 | while (true) { 127 | Status ret = queue.poll(); 128 | 129 | if (ret == null) { 130 | Thread.sleep(100); 131 | // i++; 132 | } else { 133 | for (HashtagEntity hashtage : ret.getHashtagEntities()) { 134 | System.out.println("Tweet:" + ret); 135 | System.out.println("Hashtag: " + hashtage.getText()); 136 | // producer.send(new ProducerRecord( 137 | // topicName, Integer.toString(j++), hashtage.getText())); 138 | producer.send(new ProducerRecord(topicName, Integer.toString(j++), ret.getText())); 139 | } 140 | } 141 | } 142 | // producer.close(); 143 | // Thread.sleep(500); 144 | // twitterStream.shutdown(); 145 | } 146 | 147 | } 148 | -------------------------------------------------------------------------------- /src/main/scala/com/stdatalabs/Streaming/FlumeSparkPopularHashTags.scala: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.Streaming 2 | 3 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.streaming.twitter._ 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming._ 8 | import org.apache.spark.{ SparkContext, SparkConf } 9 | import org.apache.spark.storage.StorageLevel 10 | import org.apache.spark.streaming.flume._ 11 | 12 | /** 13 | * A Spark Streaming - Flume integration to find Popular hashtags from twitter 14 | * It receives events from a Flume source that connects to twitter and pushes 15 | * tweets as avro events to sink. 16 | * 17 | * More discussion at stdatalabs.blogspot.com 18 | * 19 | * @author Sachin Thirumala 20 | */ 21 | 22 | object FlumeSparkPopularHashTags { 23 | val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Flume Source - PopularHashTags") 24 | val sc = new SparkContext(conf) 25 | def main(args: Array[String]) { 26 | sc.setLogLevel("WARN") 27 | // Set the Spark StreamingContext to create a DStream for every 5 seconds 28 | val ssc = new StreamingContext(sc, Seconds(5)) 29 | val filter = args.takeRight(args.length) 30 | 31 | // Create stream using FlumeUtils to receive data from flume at hostname: and port: 32 | val stream = FlumeUtils.createStream(ssc, "ubuntu", 9988) 33 | val tweets = stream.map(e => new String(e.event.getBody.array)) 34 | tweets.print() 35 | 36 | // Split the stream on space and extract hashtags 37 | val hashTags = tweets.flatMap(status => status.split(" ").filter(_.startsWith("#"))) 38 | // Get the top hashtags over the previous 60 sec window 39 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 40 | .map { case (topic, count) => (count, topic) } 41 | .transform(_.sortByKey(false)) 42 | 43 | // Get the top hashtags over the previous 10 sec window 44 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 45 | .map { case (topic, count) => (count, topic) } 46 | .transform(_.sortByKey(false)) 47 | 48 | // Print popular hashtags 49 | topCounts60.foreachRDD(rdd => { 50 | val topList = rdd.take(10) 51 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 52 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 53 | }) 54 | 55 | topCounts10.foreachRDD(rdd => { 56 | val topList = rdd.take(10) 57 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 58 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 59 | }) 60 | 61 | 
stream.count().map(cnt => "Received " + cnt + " flume events.").print()
62 |     ssc.start()
63 |     ssc.awaitTermination()
64 |   }
65 | }
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/KafkaDirectPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 | 
3 | import java.util.HashMap
4 | 
5 | import kafka.serializer.StringDecoder
6 | 
7 | import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord }
8 | import org.apache.spark.SparkConf
9 | import org.apache.spark.streaming._
10 | import org.apache.spark.streaming.kafka._
11 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
12 | import org.apache.spark.SparkContext._
13 | import org.apache.spark.streaming.twitter._
14 | import org.apache.spark.SparkConf
15 | import org.apache.spark.streaming._
16 | import org.apache.spark.{ SparkContext, SparkConf }
17 | import org.apache.spark.storage.StorageLevel
18 | import _root_.kafka.serializer.StringDecoder
19 | 
20 | /**
21 |  * A Spark Streaming - Kafka integration to receive twitter data from a
22 |  * kafka topic and find the popular hashtags, while also ensuring exactly-once semantics,
23 |  * i.e. each record is processed only once
24 |  *
25 |  * Arguments:
26 |  * - List of one or more Kafka brokers
27 |  * - List of one or more kafka topics to consume from
28 |  * - The directory to store and retrieve checkpoint data
29 |  *
30 |  * More discussion at stdatalabs.blogspot.com
31 |  *
32 |  * @author Sachin Thirumala
33 |  */
34 | 
35 | object KafkaDirectPopularHashTags {
36 | 
37 |   def createContext(brokers: String, topics: String, checkpointDirectory: String): StreamingContext = {
38 | 
39 |     // If you do not see this printed, the StreamingContext has been loaded
40 |     // from existing checkpoint data instead of being created anew
41 |     println("Creating new context")
42 | 
43 |     val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka DirectReceiver - PopularHashTags").set("spark.executor.memory", "1g")
44 | 
45 |     val sc = new SparkContext(conf)
46 | 
47 |     sc.setLogLevel("WARN")
48 | 
49 |     // Set the Spark StreamingContext to create a DStream for every 2 seconds
50 |     val ssc = new StreamingContext(sc, Seconds(2))
51 |     ssc.checkpoint(checkpointDirectory)
52 | 
53 |     // Define the Kafka parameters, broker list must be specified
54 |     val kafkaParams = Map[String, String](
55 |       "metadata.broker.list" -> brokers,
56 |       // start from the largest available offset, i.e. only consume messages that arrive after the stream starts
57 |       "auto.offset.reset" -> "largest")
58 | 
59 |     // Define which topics to read from
60 |     val topicsSet = topics.split(",").toSet
61 | 
62 |     // Map value from the kafka message (k, v) pair
63 |     val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topicsSet)
64 |     // Filter hashtags
65 |     val hashTags = lines.map(_._2).flatMap(_.split(" ")).filter(_.startsWith("#"))
66 | 
67 |     // Get the top hashtags over the previous 60/10 sec window
68 |     val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60))
69 |       .map { case (topic, count) => (count, topic) }
70 |       .transform(_.sortByKey(false))
71 | 
72 |     val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10))
73 |       .map { case (topic, count) => (count, topic) }
74 |       .transform(_.sortByKey(false))
75 | 
76 |     lines.print()
77 | 
78 |     // Print popular hashtags
79 |     topCounts60.foreachRDD(rdd => {
80 |       val topList = rdd.take(10)
81 |       println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
82 |       topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
83 |     })
84 | 
85 |     topCounts10.foreachRDD(rdd => {
86 |       val topList = rdd.take(10)
87 |       println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
88 |       topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
89 |     })
90 | 
91 |     lines.count().map(cnt => "Received " + cnt + " kafka messages.").print()
92 | 
93 |     ssc
94 | 
95 |   }
96 | 
97 |   def main(args: Array[String]) {
98 | 
99 |     if (args.length < 3) {
100 |       System.err.println(s"""
101 |         |Usage: KafkaDirectPopularHashTags <brokers> <topics> <checkpointDirectory>
102 |         |  <brokers> is a list of one or more Kafka brokers
103 |         |  <topics> is a list of one or more kafka topics to consume from
104 |         |  <checkpointDirectory> is the directory where the checkpoint metadata is stored
105 |         |
106 |         """.stripMargin)
107 |       System.exit(1)
108 |     }
109 | 
110 |     // Destructure the arguments: brokers, topic name, checkpoint directory
111 |     val Array(brokers, topics, checkpointDirectory) = args
112 | 
113 |     val ssc = StreamingContext.getOrCreate(checkpointDirectory,
114 |       () => createContext(brokers, topics, checkpointDirectory))
115 | 
116 |     ssc.start()
117 |     ssc.awaitTermination()
118 |   }
119 | 
120 | }
--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/Streaming/KafkaSparkPopularHashTags.scala:
--------------------------------------------------------------------------------
1 | package com.stdatalabs.Streaming
2 | 
3 | import java.util.HashMap
4 | 
5 | import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord }
6 | import org.apache.spark.SparkConf
7 | import org.apache.spark.streaming._
8 | import org.apache.spark.streaming.kafka._
9 | import org.apache.spark.streaming.{ Seconds, StreamingContext }
10 | import org.apache.spark.SparkContext._
11 | import org.apache.spark.streaming.twitter._
12 | import org.apache.spark.SparkConf
13 | import org.apache.spark.streaming._
14 | import org.apache.spark.{ SparkContext, SparkConf }
15 | import org.apache.spark.storage.StorageLevel
16 | 
17 | /**
18 |  * A Spark Streaming - Kafka integration to receive twitter
19 |  * data from a kafka topic and find the popular hashtags
20 |  *
21 |  * Arguments:
22 |  * - The zookeeper hostname
23 |  * - The Kafka consumer group
24 |  * - The kafka topic to subscribe to
25 |  * - Number of kafka receivers to run in parallel
26 |  *
27 |  * More discussion at stdatalabs.blogspot.com
28 |  *
29 |  * @author Sachin Thirumala
30 |  */
31 | 
32 | object KafkaSparkPopularHashTags {
33 | 
34 |   val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka Producer - PopularHashTags").set("spark.executor.memory", "1g")
35 | 
36 |   conf.set("spark.streaming.receiver.writeAheadLog.enable", "true")
37 | 
38 |   val sc = new SparkContext(conf)
39 | 
40 |   def main(args: Array[String]) {
41 | 
42 |     sc.setLogLevel("WARN")
43 | 
44 |     // Destructure the arguments: zookeeper hostname/ip, consumer group, topic name, number of threads
45 |     val Array(zkQuorum, group, topics, numThreads) = args
46 | 
47 |     // Set the Spark StreamingContext to create a DStream for every 2 seconds
48 |     val ssc = new StreamingContext(sc, Seconds(2))
49 |     ssc.checkpoint("checkpoint")
50 | 
51 |     // Map each topic to a thread
52 |     val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap
53 |     // Map value from the kafka message (k, v) pair
54 |     val lines = KafkaUtils.createStream(ssc, zkQuorum, group,
topicMap).map(_._2) 55 | // Filter hashtags 56 | val hashTags = lines.flatMap(_.split(" ")).filter(_.startsWith("#")) 57 | 58 | // Get the top hashtags over the previous 60/10 sec window 59 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 60 | .map { case (topic, count) => (count, topic) } 61 | .transform(_.sortByKey(false)) 62 | 63 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 64 | .map { case (topic, count) => (count, topic) } 65 | .transform(_.sortByKey(false)) 66 | 67 | lines.print() 68 | 69 | // Print popular hashtags 70 | topCounts60.foreachRDD(rdd => { 71 | val topList = rdd.take(10) 72 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 73 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 74 | }) 75 | 76 | topCounts10.foreachRDD(rdd => { 77 | val topList = rdd.take(10) 78 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 79 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 80 | }) 81 | 82 | lines.count().map(cnt => "Received " + cnt + " kafka messages.").print() 83 | 84 | ssc.start() 85 | ssc.awaitTermination() 86 | } 87 | } -------------------------------------------------------------------------------- /src/main/scala/com/stdatalabs/Streaming/RecoverableKafkaPopularHashTags.scala: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.Streaming 2 | 3 | import java.util.HashMap 4 | 5 | import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord } 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming._ 8 | import org.apache.spark.streaming.kafka._ 9 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 10 | import org.apache.spark.SparkContext._ 11 | import org.apache.spark.streaming.twitter._ 12 | import org.apache.spark.SparkConf 13 | import org.apache.spark.streaming._ 14 | import org.apache.spark.{ SparkContext, SparkConf } 15 | import org.apache.spark.storage.StorageLevel 16 | 17 | /** 18 | * A Spark Streaming - Kafka integration to receive twitter data from kafka 19 | * topic and find the popular hashtags and also ensure at-least once semantics 20 | * i.e, zero data loss 21 | * 22 | * Arguments: 23 | * - The zookeeper hostname 24 | * - The Kafka consumer group 25 | * - The kafka topic to subscribe to 26 | * - Number of kafka receivers to run in parallel 27 | * - The directory to store and retrieve checkpoint data 28 | * 29 | * More discussion at stdatalabs.blogspot.com 30 | * 31 | * @author Sachin Thirumala 32 | */ 33 | 34 | object RecoverableKafkaPopularHashTags { 35 | 36 | def createContext(zkQuorum: String, group: String, topics: String, numThreads: String, checkpointDirectory: String): StreamingContext = { 37 | 38 | // If you do not see this printed, that means the StreamingContext has been loaded 39 | // from the new checkpoint 40 | println("Creating new context") 41 | val conf = new SparkConf().setMaster("local[6]").setAppName("Spark Streaming - Kafka Producer - PopularHashTags").set("spark.executor.memory", "1g") 42 | 43 | conf.set("spark.streaming.receiver.writeAheadLog.enable", "true") 44 | 45 | val sc = new SparkContext(conf) 46 | 47 | sc.setLogLevel("WARN") 48 | 49 | // Set the Spark StreamingContext to create a DStream for every 2 seconds 50 | val ssc = new StreamingContext(sc, Seconds(2)) 51 | 52 | ssc.checkpoint(checkpointDirectory) 53 | 54 | // 
Map each topic to a thread 55 | val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap 56 | // Map value from the kafka message (k, v) pair 57 | val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2) 58 | // Filter hashtags 59 | val hashTags = lines.flatMap(_.split(" ")).filter(_.startsWith("#")) 60 | 61 | // Get the top hashtags over the previous 60/10 sec window 62 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 63 | .map { case (topic, count) => (count, topic) } 64 | .transform(_.sortByKey(false)) 65 | 66 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 67 | .map { case (topic, count) => (count, topic) } 68 | .transform(_.sortByKey(false)) 69 | 70 | lines.print() 71 | 72 | // Print popular hashtags 73 | topCounts60.foreachRDD(rdd => { 74 | val topList = rdd.take(10) 75 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 76 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 77 | }) 78 | 79 | topCounts10.foreachRDD(rdd => { 80 | val topList = rdd.take(10) 81 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 82 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 83 | }) 84 | 85 | lines.count().map(cnt => "Received " + cnt + " kafka messages.").print() 86 | 87 | ssc 88 | 89 | } 90 | 91 | def main(args: Array[String]) { 92 | 93 | if (args.length != 5) { 94 | System.err.println("Your arguments were " + args.mkString("[", ", ", "]")) 95 | System.err.println( 96 | """ 97 | |Usage: RecoverableKafkaPopularHashTags 98 | | 99 | """.stripMargin 100 | ) 101 | System.exit(1) 102 | } 103 | 104 | // Create an array of arguments: zookeeper hostname/ip,consumer group, topicname, num of threads 105 | val Array(zkQuorum, group, topics, numThreads, checkpointDirectory) = args 106 | 107 | val ssc = StreamingContext.getOrCreate(checkpointDirectory, 108 | () => createContext(zkQuorum, group, topics, numThreads, checkpointDirectory)) 109 | 110 | ssc.start() 111 | ssc.awaitTermination() 112 | } 113 | 114 | } -------------------------------------------------------------------------------- /src/main/scala/com/stdatalabs/Streaming/SparkPopularHashTags.scala: -------------------------------------------------------------------------------- 1 | package com.stdatalabs.Streaming 2 | 3 | import org.apache.spark.streaming.{ Seconds, StreamingContext } 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.streaming.twitter._ 6 | import org.apache.spark.SparkConf 7 | import org.apache.spark.streaming._ 8 | import org.apache.spark.{ SparkContext, SparkConf } 9 | import org.apache.spark.storage.StorageLevel 10 | import org.apache.spark.streaming.flume._ 11 | 12 | /** 13 | * A Spark Streaming application that receives tweets on certain 14 | * keywords from twitter datasource and find the popular hashtags 15 | * 16 | * Arguments: ... 
17 | * - Twitter consumer key 18 | * - Twitter consumer secret 19 | * - Twitter access token 20 | * - Twitter access token secret 21 | * - The keyword to filter tweets 22 | * - Any number of keywords to filter tweets 23 | * 24 | * More discussion at stdatalabs.blogspot.com 25 | * 26 | * @author Sachin Thirumala 27 | */ 28 | 29 | object SparkPopularHashTags { 30 | val conf = new SparkConf().setMaster("local[4]").setAppName("Spark Streaming - PopularHashTags") 31 | val sc = new SparkContext(conf) 32 | 33 | def main(args: Array[String]) { 34 | 35 | sc.setLogLevel("WARN") 36 | 37 | val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4) 38 | val filters = args.takeRight(args.length - 4) 39 | 40 | // Set the system properties so that Twitter4j library used by twitter stream 41 | // can use them to generat OAuth credentials 42 | System.setProperty("twitter4j.oauth.consumerKey", consumerKey) 43 | System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret) 44 | System.setProperty("twitter4j.oauth.accessToken", accessToken) 45 | System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret) 46 | 47 | // Set the Spark StreamingContext to create a DStream for every 5 seconds 48 | val ssc = new StreamingContext(sc, Seconds(5)) 49 | // Pass the filter keywords as arguements 50 | 51 | // val stream = FlumeUtils.createStream(ssc, args(0), args(1).toInt) 52 | val stream = TwitterUtils.createStream(ssc, None, filters) 53 | 54 | // Split the stream on space and extract hashtags 55 | val hashTags = stream.flatMap(status => status.getText.split(" ").filter(_.startsWith("#"))) 56 | 57 | // Get the top hashtags over the previous 60 sec window 58 | val topCounts60 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(60)) 59 | .map { case (topic, count) => (count, topic) } 60 | .transform(_.sortByKey(false)) 61 | 62 | // Get the top hashtags over the previous 10 sec window 63 | val topCounts10 = hashTags.map((_, 1)).reduceByKeyAndWindow(_ + _, Seconds(10)) 64 | .map { case (topic, count) => (count, topic) } 65 | .transform(_.sortByKey(false)) 66 | 67 | // print tweets in the currect DStream 68 | stream.print() 69 | 70 | // Print popular hashtags 71 | topCounts60.foreachRDD(rdd => { 72 | val topList = rdd.take(10) 73 | println("\nPopular topics in last 60 seconds (%s total):".format(rdd.count())) 74 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 75 | }) 76 | topCounts10.foreachRDD(rdd => { 77 | val topList = rdd.take(10) 78 | println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count())) 79 | topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) } 80 | }) 81 | 82 | ssc.start() 83 | ssc.awaitTermination() 84 | } 85 | } -------------------------------------------------------------------------------- /src/test/java/org/stdatalabs/TwitterPopularHashTags/AppTest.java: -------------------------------------------------------------------------------- 1 | package org.stdatalabs.TwitterPopularHashTags; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 
9 |  */
10 | public class AppTest
11 |     extends TestCase
12 | {
13 |     /**
14 |      * Create the test case
15 |      *
16 |      * @param testName name of the test case
17 |      */
18 |     public AppTest( String testName )
19 |     {
20 |         super( testName );
21 |     }
22 | 
23 |     /**
24 |      * @return the suite of tests being tested
25 |      */
26 |     public static Test suite()
27 |     {
28 |         return new TestSuite( AppTest.class );
29 |     }
30 | 
31 |     /**
32 |      * Rigorous test :-)
33 |      */
34 |     public void testApp()
35 |     {
36 |         assertTrue( true );
37 |     }
38 | }
39 | 
--------------------------------------------------------------------------------
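
A note on the exactly-once claim for the direct approach (KafkaDirectPopularHashTags.scala): checkpointing recovers the stream on restart, but end-to-end exactly-once output also requires saving each batch's results together with the Kafka offsets that batch covered. The sketch below is illustrative only, written against the same Spark 1.4 / spark-streaming-kafka_2.10 (Kafka 0.8) APIs declared in the pom; DirectStreamOffsetsSketch and saveResultsAndOffsets are hypothetical names, not part of this project, and the atomic sink is left as a placeholder.

// Sketch only (not part of this repo): reading the offset ranges that the
// Kafka direct stream attaches to each RDD, so results and offsets can be
// stored together in one atomic action.
package com.stdatalabs.Streaming

import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.streaming.kafka.{ HasOffsetRanges, KafkaUtils, OffsetRange }

object DirectStreamOffsetsSketch {
  def main(args: Array[String]) {
    val Array(brokers, topics) = args
    val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamOffsetsSketch")
    val ssc = new StreamingContext(conf, Seconds(2))

    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers)
    val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics.split(",").toSet)

    lines.foreachRDD { rdd =>
      // Each RDD produced by the direct stream knows exactly which Kafka
      // offsets it covers; cast to HasOffsetRanges to read them.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val hashTagCounts = rdd.map(_._2).flatMap(_.split(" ")).filter(_.startsWith("#"))
        .map((_, 1)).reduceByKey(_ + _).collect()
      // Committing the counts and the offsets in a single transaction (e.g. one
      // database commit) is what upgrades the direct stream's delivery guarantee
      // to effectively exactly-once output.
      saveResultsAndOffsets(hashTagCounts, offsetRanges)
    }

    ssc.start()
    ssc.awaitTermination()
  }

  // Hypothetical placeholder sink: a real implementation would write counts and
  // offsets atomically to an external store instead of printing them.
  def saveResultsAndOffsets(counts: Array[(String, Int)], offsets: Array[OffsetRange]): Unit =
    offsets.foreach(o => println(s"${o.topic}-${o.partition} [${o.fromOffset}, ${o.untilOffset}) -> ${counts.length} distinct hashtags"))
}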