├── spark-streaming
│   ├── .gitignore
│   ├── twitter4j.properties
│   ├── src
│   │   └── main
│   │       └── java
│   │           └── TopHashtags.scala
│   ├── pom.xml
│   └── README.md
└── README.md

/spark-streaming/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
target/

--------------------------------------------------------------------------------
/spark-streaming/twitter4j.properties:
--------------------------------------------------------------------------------
debug=true
jsonStoreEnabled=true
oauth.consumerKey=AAAAAAAAAAAAAAAAAA
oauth.consumerSecret=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
oauth.accessToken=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
oauth.accessTokenSecret=AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Tweet Analysis with Spark

This tutorial shows how to use Spark to analyze crawls of Twitter's streaming API, which are stored as a sequence of JSON objects, one per line. We assume CDH 5.3.0.

Fire up the Spark shell:

```
$ export HADOOP_CONF_DIR=/etc/hadoop/conf
$ spark-shell --master yarn-client --jars gson-2.3.1.jar --num-executors 10
```

Setting the `HADOOP_CONF_DIR` environment variable is necessary because of [this issue](https://issues.cloudera.org/browse/DISTRO-664). You can download the Gson jar [here](http://search.maven.org/#artifactdetails%7Ccom.google.code.gson%7Cgson%7C2.3.1%7Cjar).

If you want to play with individual tweets:

```
import com.google.gson._

val jsonParser = new JsonParser()
val gson = new GsonBuilder().setPrettyPrinting().create()

val raw = sc.textFile("/path/to/file")

// Take five JSON records and pretty-print each one.
for (r <- raw.take(5)) {
  println(gson.toJson(jsonParser.parse(r)))
}
```

Here's the [Gson API javadoc](http://google-gson.googlecode.com/svn/trunk/gson/docs/javadocs/index.html).

Now let's analyze the tweets at scale. Here's how we start:

```
import com.google.gson._

val raw = sc.textFile("/path/to/file")

val tweets =
  raw.map(line => {
    val jsonParser = new JsonParser()
    val obj = jsonParser.parse(line).getAsJsonObject()
    if (obj.has("delete")) {
      null
    } else {
      (obj.get("id").getAsLong(), obj.get("text").getAsString())
    }
  }).filter(x => x != null)
```

The variable `tweets` now holds an RDD of (tweet id, tweet text) pairs, with delete notices filtered out.

Let's continue with word count:

```
val wc =
  tweets.flatMap(t => t._2.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
```

Finally, let's keep only the common terms (those appearing more than 100 times), sort them, and save the output to disk:

```
wc.filter(t => t._2 > 100).sortByKey().saveAsTextFile("wc_out")
```
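
Note that `sortByKey()` orders the output alphabetically by term. If you would rather rank terms by frequency, one possible variation (a minimal sketch using only standard RDD operations) is to flip each pair and sort by count:

```
// Swap (word, count) into (count, word), sort descending by count,
// and inspect the 20 most frequent terms in the driver.
val top20 = wc.map(t => (t._2, t._1)).sortByKey(false).take(20)
top20.foreach(println)
```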

--------------------------------------------------------------------------------
/spark-streaming/src/main/java/TopHashtags.scala:
--------------------------------------------------------------------------------
/*
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

// Needed for all Spark jobs.
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext._

// Only needed for Spark Streaming.
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.StreamingContext._

// Only needed for utilities for streaming from Twitter.
import org.apache.spark.streaming.twitter._

object TopHashtags {
  def main(args: Array[String]): Unit = {

    // Set up the Spark configuration with our app name and any other config
    // parameters you want (e.g., Kryo serialization or executor memory).
    val sparkConf = new SparkConf().setAppName("TopHashtags")

    // Use the config to create a streaming context that creates a new RDD
    // with a batch interval of every 5 seconds.
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    // Use the streaming context and the TwitterUtils to create the
    // Twitter stream.
    val stream = TwitterUtils.createStream(ssc, None)

    // Each tweet comes as a twitter4j.Status object, which we can use to
    // extract hash tags. We use flatMap() since each status could have
    // ZERO OR MORE hashtags.
    val hashTags = stream.flatMap(status => status.getHashtagEntities)

    // Convert hashtag to (hashtag, 1) pair for future reduction.
    val hashTagPairs = hashTags.map(hashtag => ("#" + hashtag.getText, 1))

    // Use reduceByKeyAndWindow to reduce our hashtag pairs by summing their
    // counts over the last 10 seconds of batch intervals (in this case, 2 RDDs).
    val topCounts10 = hashTagPairs.reduceByKeyAndWindow((l, r) => {l + r}, Seconds(10))

    // topCounts10 will provide a new RDD for every window. Calling transform()
    // on each of these RDDs gives us a per-window transformation. We use
    // this transformation to sort each RDD by the hashtag counts. The FALSE
    // flag tells the sortBy() function to sort in descending order.
    val sortedTopCounts10 = topCounts10.transform(rdd =>
      rdd.sortBy(hashtagPair => hashtagPair._2, false))

    // Print popular hashtags.
    sortedTopCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach{case (tag, count) => println("%s (%d tweets)".format(tag, count))}
    })

    // Finally, start the streaming operation and continue until killed.
    ssc.start()
    ssc.awaitTermination()
  }
}

--------------------------------------------------------------------------------
/spark-streaming/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>edu.umd</groupId>
  <artifactId>SparkStreamingTwitterDemo</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <scala.version>2.10.4</scala.version>
  </properties>

  <licenses>
    <license>
      <name>The Apache Software License, Version 2.0</name>
      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
      <distribution>repo</distribution>
    </license>
  </licenses>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
    <repository>
      <id>cloudera</id>
      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.2.0</version>
    </dependency>
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.3.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>1.2.0-cdh5.3.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-twitter_2.10</artifactId>
      <version>1.2.0-cdh5.3.0</version>
    </dependency>

    <!-- Test dependencies -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>
      <artifactId>scalatest</artifactId>
      <version>1.2</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/java</sourceDirectory>
    <testSourceDirectory>src/test/java</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <version>2.15.0</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
            <configuration>
              <args>
                <arg>-deprecation</arg>
                <arg>-make:transitive</arg>
                <arg>-dependencyfile</arg>
                <arg>${project.build.directory}/.scala_dependencies</arg>
              </args>
            </configuration>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.5.3</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.6</version>
        <configuration>
          <useFile>false</useFile>
          <disableXmlReport>true</disableXmlReport>
          <includes>
            <include>**/*Test.*</include>
            <include>**/*Suite.*</include>
          </includes>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

--------------------------------------------------------------------------------
/spark-streaming/README.md:
--------------------------------------------------------------------------------
# Real-Time Twitter Analytics with Spark Streaming

This project is a quick tutorial showing how to use Spark's streaming capabilities to analyze Twitter data directly from Twitter's [sample stream](https://dev.twitter.com/streaming/public).
As an exercise, we will extract the most popular hashtags in Twitter's sample stream over the past few minutes.

The code we'll be using is written in Scala; it should be compiled with Maven and submitted to your cluster (or standalone Spark installation) using `spark-submit`.
The code we reference here can also be executed in `spark-shell`.


## Setting Up the Environment

1. The first thing to note is that you'll need a set of credentials from Twitter's developer site to authenticate against Twitter's servers and access the sample stream. You can get these credentials from Twitter's [Application Management](https://apps.twitter.com/) site.

1. Spark Streaming includes a special library for Twitter access, called `spark-streaming-twitter_2.10`, which is included in our `pom.xml` file as a dependency (along with `spark-core_2.10` and `spark-streaming_2.10`).

1. The Spark Streaming Twitter library uses [Twitter4j](http://twitter4j.org/) 3.0.3 to handle authentication and handshaking with Twitter's streaming API. As a result, we need to populate a `twitter4j.properties` file with the `oauth.consumerKey`, `oauth.consumerSecret`, `oauth.accessToken`, and `oauth.accessTokenSecret` values we got from Twitter's app management site. For simplicity, this properties file should be in the current directory from which you launch your Spark application. (A programmatic alternative is sketched after this list.)
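
If you prefer not to rely on a `twitter4j.properties` file, Twitter4j also accepts credentials supplied in code. Here is a minimal sketch of that alternative, using placeholder strings rather than real keys and assuming the `ssc` streaming context and `TwitterUtils` import introduced below:

```
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

// Build a Twitter4j configuration from the four OAuth values obtained
// from Twitter's app management site (placeholders shown here).
val twitterConf = new ConfigurationBuilder()
  .setOAuthConsumerKey("YOUR_CONSUMER_KEY")
  .setOAuthConsumerSecret("YOUR_CONSUMER_SECRET")
  .setOAuthAccessToken("YOUR_ACCESS_TOKEN")
  .setOAuthAccessTokenSecret("YOUR_ACCESS_TOKEN_SECRET")
  .build()

// Pass the resulting authorization to TwitterUtils instead of None.
val auth = Some(new OAuthAuthorization(twitterConf))
val stream = TwitterUtils.createStream(ssc, auth)
```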

## Coding For Streams

All code referenced below appears in `src/main/java/TopHashtags.scala`.

Now that we're ready to code, we first need to make sure we have all the necessary imports. Fortunately, for this simple example, we only need a few.

```
// Needed for all Spark jobs.
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext._

// Only needed for Spark Streaming.
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.StreamingContext._

// Only needed for utilities for streaming from Twitter.
import org.apache.spark.streaming.twitter._
```

In our main function, we can now configure Spark's streaming resources. The Spark config is straightforward, but creating the StreamingContext object takes an additional parameter called the _batch interval_. This interval should be set so that your cluster can process each batch in less time than the interval itself, and it can have big implications for the rest of your system if set too low. Additionally, the batch interval provides a lower bound on how granular your real-time computation can be (e.g., no faster than every 5 seconds). We use 5 seconds here as a conservative choice. See Spark's [Performance Tuning](https://spark.apache.org/docs/latest/streaming-programming-guide.html#setting-the-right-batch-interval) guide for more information.

```
// Set up the Spark configuration with our app name and any other config
// parameters you want (e.g., Kryo serialization or executor memory).
val sparkConf = new SparkConf().setAppName("TopHashtags")

// Use the config to create a streaming context that creates a new RDD
// with a batch interval of every 5 seconds.
val ssc = new StreamingContext(sparkConf, Seconds(5))
```

We then use the Spark streaming context to create our Twitter stream (though we won't connect to it just yet). Spark provides easy utility functions to create this stream. The `None` argument refers to authentication information, which we leave blank to force Twitter4j's default authentication (i.e., use the `twitter4j.properties` file). The `createStream()` function can also take a set of filter keywords, but we omit that in `TopHashtags.scala`; a brief sketch appears after the next code block. For more information, see the [TwitterUtils API](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.streaming.twitter.TwitterUtils$).

```
// Use the streaming context and the TwitterUtils to create the
// Twitter stream.
val stream = TwitterUtils.createStream(ssc, None)
```
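
For reference, here is a minimal sketch of the optional keyword filter mentioned above; the keywords themselves are arbitrary examples:

```
// Only receive statuses that match at least one of these keywords.
val filters = Seq("spark", "hadoop", "bigdata")
val filteredStream = TwitterUtils.createStream(ssc, None, filters)
```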

Our next goal is to define the operations on the batches of tweets Spark Streaming gives us. Since we want to count hashtags, our first step is to extract the hashtags from each tweet.

```
// Each tweet comes as a twitter4j.Status object, which we can use to
// extract hash tags. We use flatMap() since each status could have
// ZERO OR MORE hashtags.
val hashTags = stream.flatMap(status => status.getHashtagEntities)
```

We then map each twitter4j.HashtagEntity object to a pair whose first element is the hashtag text prefixed with the "#" symbol and whose second element is 1, which we will use in our reduction for counting.

```
// Convert hashtag to (hashtag, 1) pair for future reduction.
val hashTagPairs = hashTags.map(hashtag => ("#" + hashtag.getText, 1))
```

To count the occurrences of a given hashtag, we would normally use reduceByKey(), but since we are using Spark Streaming, we can also make use of its *sliding window* capabilities and instead use reduceByKeyAndWindow(). This function takes an additional time parameter that controls how many previous RDDs it uses when performing its reduceByKey operation. In this case, we're only looking at the past 10 seconds of RDDs, and since our batch interval is 5 seconds, we're only looking at the past two RDDs.

```
// Use reduceByKeyAndWindow to reduce our hashtag pairs by summing their
// counts over the last 10 seconds of batch intervals (in this case, 2 RDDs).
val topCounts10 = hashTagPairs.reduceByKeyAndWindow((l, r) => {l + r}, Seconds(10))
```

The topCounts10 variable now points to a stream of RDDs, where each RDD contains pairs of (hashtag, count). We want to sort each of these RDDs by the hashtag counts, so we call transform(), which applies an arbitrary function to each RDD.

```
// topCounts10 will provide a new RDD for every window. Calling transform()
// on each of these RDDs gives us a per-window transformation. We use
// this transformation to sort each RDD by the hashtag counts. The FALSE
// flag tells the sortBy() function to sort in descending order.
val sortedTopCounts10 = topCounts10.transform(rdd =>
  rdd.sortBy(hashtagPair => hashtagPair._2, false))
```

For each of these sorted RDDs, we want to print the ten most popular hashtags. The foreachRDD() function makes this step easy: we simply take the first 10 pairs from the RDD using take(10) and print them out.

```
// Print popular hashtags.
sortedTopCounts10.foreachRDD(rdd => {
  val topList = rdd.take(10)
  println("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
  topList.foreach{case (tag, count) => println("%s (%d tweets)".format(tag, count))}
})
```

Once we've set up all the operations on the data we stream in, it's finally time to start the stream.

```
// Finally, start the streaming operation and continue until killed.
ssc.start()
ssc.awaitTermination()
```

## Building and Running Your Code

The last steps are to build and run the code.

1. To build, simply run `mvn clean package` in this directory.

1. To run your code:

```
$ spark-submit --class TopHashtags --master yarn-client \
  ./target/SparkStreamingTwitterDemo-1.0-SNAPSHOT-jar-with-dependencies.jar
```

Note that you might need to set the following environment variable due to [this issue](https://issues.cloudera.org/browse/DISTRO-664):

```
$ export HADOOP_CONF_DIR=/etc/hadoop/conf
```

__NOTE__: Be sure to update your `twitter4j.properties` file and have it in the directory where you run `spark-submit`.
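
If you don't have a YARN cluster handy, the same jar can also be run locally for a quick test. Spark Streaming's receiver needs its own thread, so the local master should be given at least two (one to receive the stream, one to process it). A possible invocation, assuming the assembly jar built above, would be:

```
$ spark-submit --class TopHashtags --master "local[2]" \
  ./target/SparkStreamingTwitterDemo-1.0-SNAPSHOT-jar-with-dependencies.jar
```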
133 | --------------------------------------------------------------------------------