├── visualizations
    ├── TotalTweets.json
    ├── Sentiments.json
    ├── PIELanguage.json
    ├── TrendingHashTags.json
    ├── TwitterSentimentAnalysis_dashboard.json
    ├── TextBasedSentiment.json
    └── Quarterhourly-Analysis.json
├── .gitignore
├── src
    ├── test
    │   └── java
    │   │   └── com
    │   │       └── stdatalabs
    │   │           └── SparkES
    │   │               └── AppTest.java
    └── main
    │   └── scala
    │       └── com
    │           └── stdatalabs
    │               └── SparkES
    │                   ├── SentimentUtils.scala
    │                   └── TwitterSentimentAnalysis.scala
├── README.md
├── pom.xml
└── dependency-reduced-pom.xml


/visualizations/TotalTweets.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "_id": "Total-Tweets",
 4 |     "_type": "visualization",
 5 |     "_source": {
 6 |       "title": "Total Tweets",
 7 |       "visState": "{\"type\":\"metric\",\"params\":{\"fontSize\":60},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}}],\"listeners\":{}}",
 8 |       "description": "",
 9 |       "version": 1,
10 |       "kibanaSavedObjectMeta": {
11 |         "searchSourceJSON": "{\"index\":\"twitter_020717\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}"
12 |       }
13 |     }
14 |   }
15 | ]


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.iml
 2 | .*/
 3 | target/
 4 | .classpath
 5 | .cache-main
 6 | .cache-tests
 7 | .settings
 8 | .project
 9 | tweets.txt
10 | checkpoint/
11 | 
12 | # Compiled source #
13 | ###################
14 | *.com
15 | *.class
16 | *.dll
17 | *.exe
18 | *.o
19 | *.so
20 | checkpoint
21 | 
22 | # Packages #
23 | ############
24 | # it's better to unpack these files and commit the raw source
25 | # git has its own built in compression methods
26 | *.7z
27 | *.dmg
28 | *.gz
29 | *.iso
30 | *.jar
31 | *.rar
32 | *.tar
33 | *.zip
34 | 
35 | # Logs and databases #
36 | ######################
37 | *.log
38 | *.sql
39 | *.sqlite
40 | 
41 | # OS generated files #
42 | ######################
43 | .DS_Store
44 | .DS_Store?
45 | ._*
46 | .Spotlight-V100
47 | .Trashes
48 | ehthumbs.db
49 | Thumbs.db
50 | 


--------------------------------------------------------------------------------
/visualizations/Sentiments.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "_id": "Sentiments",
 4 |     "_type": "visualization",
 5 |     "_source": {
 6 |       "title": "Sentiments",
 7 |       "visState": "{\"type\":\"table\",\"params\":{\"perPage\":5,\"showMeticsAtAllLevels\":false,\"showPartialRows\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"bucket\",\"params\":{\"field\":\"sentiment\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}",
 8 |       "description": "",
 9 |       "version": 1,
10 |       "kibanaSavedObjectMeta": {
11 |         "searchSourceJSON": "{\"index\":\"twitter_020717\",\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"filter\":[]}"
12 |       }
13 |     }
14 |   }
15 | ]


--------------------------------------------------------------------------------
/src/test/java/com/stdatalabs/SparkES/AppTest.java:
--------------------------------------------------------------------------------
 1 | package com.stdatalabs.SparkES;
 2 | 
 3 | import junit.framework.Test;
 4 | import junit.framework.TestCase;
 5 | import junit.framework.TestSuite;
 6 | 
 7 | /**
 8 |  * Unit test for simple App.
 9 |  */
10 | public class AppTest extends TestCase {
11 | 
12 |     /**
13 |      * Creates the test case and passes its name on to {@link TestCase}.
14 |      *
15 |      * @param testName name of the test case
16 |      */
17 |     public AppTest(String testName) {
18 |         super(testName);
19 |     }
20 | 
21 |     /**
22 |      * Wraps this class in a {@link TestSuite}.
23 |      *
24 |      * @return the suite of tests being tested
25 |      */
26 |     public static Test suite() {
27 |         return new TestSuite(AppTest.class);
28 |     }
29 | 
30 |     /**
31 |      * Placeholder test: asserts a constant {@code true}, so it always passes
32 |      * and exercises no application code.
33 |      */
34 |     public void testApp() {
35 |         final boolean alwaysTrue = true;
36 |         assertTrue(alwaysTrue);
37 |     }
38 | }
39 | 


--------------------------------------------------------------------------------
/visualizations/PIELanguage.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "_id": "PIE-Sentiment-Language",
 4 |     "_type": "visualization",
 5 |     "_source": {
 6 |       "title": "PIE Sentiment Language",
 7 |       "visState": "{\"aggs\":[{\"id\":\"1\",\"params\":{},\"schema\":\"metric\",\"type\":\"count\"},{\"id\":\"3\",\"params\":{\"field\":\"language\",\"order\":\"desc\",\"orderBy\":\"1\",\"size\":5},\"schema\":\"segment\",\"type\":\"terms\"},{\"id\":\"4\",\"params\":{\"field\":\"sentiment\",\"order\":\"desc\",\"orderBy\":\"1\",\"size\":5},\"schema\":\"segment\",\"type\":\"terms\"}],\"listeners\":{},\"params\":{\"addLegend\":true,\"addTooltip\":true,\"isDonut\":false,\"shareYAxis\":true},\"type\":\"pie\"}",
 8 |       "description": "",
 9 |       "version": 1,
10 |       "kibanaSavedObjectMeta": {
11 |         "searchSourceJSON": "{\"index\":\"twitter_020717\",\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}},\"filter\":[]}"
12 |       }
13 |     }
14 |   }
15 | ]


--------------------------------------------------------------------------------
/visualizations/TrendingHashTags.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "_id": "Trending-HashTags",
 4 |     "_type": "visualization",
 5 |     "_source": {
 6 |       "title": "Trending HashTags",
 7 |       "visState": "{\"type\":\"pie\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"isDonut\":false},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"hashtags\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"custom\",\"orderAgg\":{\"id\":\"2-orderAgg\",\"type\":\"count\",\"schema\":\"orderAgg\",\"params\":{}}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"sentiment\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}",
 8 |       "description": "",
 9 |       "version": 1,
10 |       "kibanaSavedObjectMeta": {
11 |         "searchSourceJSON": "{\"index\":\"twitter_020717\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}"
12 |       }
13 |     }
14 |   }
15 | ]


--------------------------------------------------------------------------------
/visualizations/TwitterSentimentAnalysis_dashboard.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "_id": "Twitter-Sentiment-DashBoard-020717",
 4 |     "_type": "dashboard",
 5 |     "_source": {
 6 |       "title": "Twitter Sentiment DashBoard - 020717",
 7 |       "hits": 0,
 8 |       "description": "",
 9 |       "panelsJSON": "[{\"col\":1,\"id\":\"PIE-Sentiment-Language\",\"row\":9,\"size_x\":5,\"size_y\":4,\"type\":\"visualization\"},{\"col\":6,\"id\":\"Trending-HashTags\",\"row\":9,\"size_x\":7,\"size_y\":4,\"type\":\"visualization\"},{\"col\":5,\"id\":\"QuarterHourly-Analysis\",\"row\":1,\"size_x\":8,\"size_y\":4,\"type\":\"visualization\"},{\"col\":1,\"id\":\"Total-Tweets\",\"row\":5,\"size_x\":4,\"size_y\":4,\"type\":\"visualization\"},{\"col\":1,\"id\":\"Sentiments\",\"row\":1,\"size_x\":4,\"size_y\":4,\"type\":\"visualization\"},{\"id\":\"Text-Based-Sentiment\",\"type\":\"visualization\",\"size_x\":8,\"size_y\":4,\"col\":5,\"row\":5}]",
10 |       "version": 1,
11 |       "timeRestore": false,
12 |       "kibanaSavedObjectMeta": {
13 |         "searchSourceJSON": "{\"filter\":[{\"query\":{\"query_string\":{\"analyze_wildcard\":true,\"query\":\"*\"}}}]}"
14 |       }
15 |     }
16 |   }
17 | ]


--------------------------------------------------------------------------------
/visualizations/TextBasedSentiment.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "_id": "Text-Based-Sentiment",
 4 |     "_type": "visualization",
 5 |     "_source": {
 6 |       "title": "Text Based Sentiment",
 7 |       "visState": "{\"type\":\"histogram\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"scale\":\"linear\",\"mode\":\"grouped\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"sentiment\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"text\",\"include\":{\"flags\":[\"CASE_INSENSITIVE\"],\"pattern\":\"Transport||Health||Energy||Education\"},\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}",
 8 |       "description": "",
 9 |       "version": 1,
10 |       "kibanaSavedObjectMeta": {
11 |         "searchSourceJSON": "{\"index\":\"twitter_020717\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}"
12 |       }
13 |     }
14 |   }
15 | ]


--------------------------------------------------------------------------------
/visualizations/Quarterhourly-Analysis.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "_id": "QuarterHourly-Analysis",
 4 |     "_type": "visualization",
 5 |     "_source": {
 6 |       "title": "QuarterHourly Analysis",
 7 |       "visState": "{\"type\":\"line\",\"params\":{\"shareYAxis\":true,\"addTooltip\":true,\"addLegend\":true,\"showCircles\":true,\"smoothLines\":false,\"interpolate\":\"linear\",\"scale\":\"linear\",\"drawLinesBetweenPoints\":true,\"radiusRatio\":\"55\",\"times\":[],\"addTimeMarker\":false,\"defaultYExtents\":false,\"setYExtents\":false,\"yAxis\":{}},\"aggs\":[{\"id\":\"1\",\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"created_at\",\"interval\":\"custom\",\"customInterval\":\"15m\",\"min_doc_count\":1,\"extended_bounds\":{}}},{\"id\":\"3\",\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"sentiment\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\"}}],\"listeners\":{}}",
 8 |       "description": "",
 9 |       "version": 1,
10 |       "kibanaSavedObjectMeta": {
11 |         "searchSourceJSON": "{\"index\":\"twitter_020717\",\"query\":{\"query_string\":{\"query\":\"*\",\"analyze_wildcard\":true}},\"filter\":[]}"
12 |       }
13 |     }
14 |   }
15 | ]


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SparkTwitterPopularHashTags
 2 | 
 3 | A Spark Streaming project that analyzes popular hashtags in live Twitter data streams. Data is ingested from several input sources (the Twitter source, Flume and Kafka) and processed downstream using Spark Streaming.
 4 | 
 5 | ## Requirements
 6 | - IDE 
 7 | - Apache Maven 3.x
 8 | - JVM 6 or 7
 9 | 
10 | ## General Info
11 | The source folder is organized into 2 packages, i.e. Kafka and Streaming. Each class in the Streaming package explores a different approach to consuming data from the Twitter source. Below is the list of classes:
12 | * com/stdatalabs/Kafka
13 |      * KafkaTwitterProducer.java --   A Kafka Producer that publishes twitter data to a kafka broker
14 | * com/stdatalabs/Streaming
15 |     * SparkPopularHashTags.scala -- Receives data from Twitter datasource
16 |     * FlumeSparkPopularHashTags.scala -- Receives data from Flume Twitter producer
17 |     * KafkaSparkPopularHashTags.scala -- Receives data from Kafka Producer
18 |     * RecoverableKafkaPopularHashTags.scala -- Spark-Kafka receiver based approach. Ensures at-least once semantics
19 |     * KafkaDirectPopularHashTags.scala -- Spark-Kafka Direct approach. Ensures exactly once semantics
20 | * TwitterAvroSource.conf 
21 |     -- Flume conf for running Twitter avro source
22 | 
23 | ## Description
24 | * ##### A Spark Streaming application that receives tweets on certain keywords from twitter datasource and finds the popular hashtags. 
25 |   Discussed in blog -- 
26 |      [Spark Streaming part 1: Real time twitter sentiment analysis](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-1-real-time.html)
27 | 
28 | * ##### A Spark Streaming - Flume integration to find Popular hashtags from twitter. It receives events from a Flume source that connects to twitter and pushes tweets as avro events to sink.
29 |     Discussed in blog -- 
30 |      [Spark streaming part 2: Real time twitter sentiment analysis using Flume](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-2-real-time_10.html)
31 |      
32 | * ##### A Spark Streaming - Kafka integration to receive twitter data from kafka producer and find the popular hashtags
33 |     Discussed in blog -- 
34 |      [Spark streaming part 3: Real time twitter sentiment analysis using kafka](http://stdatalabs.blogspot.in/2016/09/spark-streaming-part-3-real-time.html)
35 |      
36 | * ##### A Spark Streaming - Kafka integration to ensure at-least once semantics
37 |     Discussed in blog -- 
38 |      [Data guarantees in Spark Streaming with kafka integration](http://stdatalabs.blogspot.in/2016/10/data-guarantees-in-spark-streaming-with.html)
39 |      
40 | * ##### A Spark Streaming - Kafka integration to ensure exactly once semantics
41 |     Discussed in blog -- 
42 |      [Data guarantees in Spark Streaming with kafka integration](http://stdatalabs.blogspot.in/2016/10/data-guarantees-in-spark-streaming-with.html)
43 | 
44 | 
45 | 
46 | ### More articles on the Hadoop technology stack at [stdatalabs](http://stdatalabs.blogspot.com)
47 | 
48 | 


--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/SparkES/SentimentUtils.scala:
--------------------------------------------------------------------------------
 1 | package com.stdatalabs.SparkES
 2 | 
 3 | import java.util.Properties
 4 | 
 5 | import edu.stanford.nlp.ling.CoreAnnotations
 6 | import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations
 7 | import edu.stanford.nlp.pipeline.StanfordCoreNLP
 8 | import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
 9 | 
10 | import scala.collection.JavaConversions._
11 | import scala.collection.mutable.ListBuffer
12 | 
13 | object SentimentUtils {
14 | 
15 |   /** CoreNLP properties naming the annotators the sentiment pipeline runs. */
16 |   val nlpPropts = {
17 |     val propts = new Properties()
18 |     /* Annotators - Meaning http://corenlp.run/
19 |        tokenize   - Tokenize the sentence.
20 |        ssplit     - Split the text into sentences. Identify full stops, exclamation marks etc. and split there
21 |        pos        - Reads text in some language and assigns parts of speech to each word (and other token), such as noun, verb, adjective, etc.
22 |        lemma      - Group together the different inflected forms of a word so they can be analysed as a single item.
23 |        parse      - Provide syntactic analysis http://nlp.stanford.edu:8080/parser/index.jsp
24 |        sentiment  - Provide model for sentiment analysis
25 |        * */
26 |     propts.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, sentiment")
27 |     propts
28 |   }
29 | 
30 |   // Built on first use and then reused: constructing a StanfordCoreNLP instance
31 |   // for every message would repeat the whole annotator set-up on each call.
32 |   // NOTE(review): the instance is shared by every caller in the JVM - confirm the
33 |   // CoreNLP version in use allows concurrent process() calls.
34 |   private lazy val pipeline = new StanfordCoreNLP(nlpPropts)
35 | 
36 |   /**
37 |    * Classifies the overall sentiment of a message.
38 |    *
39 |    * Each sentence's predicted sentiment class is weighted by the sentence's length in
40 |    * characters; the weighted average is then bucketed into a label.
41 |    *
42 |    * @param message text to analyse; null or blank text is reported as "NOT_UNDERSTOOD"
43 |    * @return one of "NEGATIVE", "NEUTRAL", "POSITIVE" or "NOT_UNDERSTOOD"
44 |    */
45 |   def detectSentiment(message: String): String = {
46 |     if (message == null || message.trim.isEmpty) {
47 |       // Nothing to score; also avoids a 0/0 average below.
48 |       "NOT_UNDERSTOOD"
49 |     } else {
50 |       // Run message through the pipeline
51 |       val annotation = pipeline.process(message)
52 | 
53 |       // One (predicted class, length in characters) pair per sentence of the message
54 |       val scored = for (sentence <- annotation.get(classOf[CoreAnnotations.SentencesAnnotation])) yield {
55 |         val parseTree = sentence.get(classOf[SentimentCoreAnnotations.AnnotatedTree])
56 |         (RNNCoreAnnotations.getPredictedClass(parseTree).toDouble, sentence.toString.length)
57 |       }
58 | 
59 |       val totalLength = scored.map(_._2).sum
60 |       if (totalLength == 0) {
61 |         // No sentence text to weight by, so there is no meaningful average.
62 |         "NOT_UNDERSTOOD"
63 |       } else {
64 |         val weightedSentiment = scored.map { case (cls, len) => cls * len }.sum / totalLength
65 | 
66 |         // NOTE(review): an average of exactly 0.0 is labelled NOT_UNDERSTOOD rather than
67 |         // NEGATIVE; the thresholds are kept unchanged so already-indexed labels stay comparable.
68 |         if (weightedSentiment <= 0.0) "NOT_UNDERSTOOD"
69 |         else if (weightedSentiment < 1.6) "NEGATIVE"
70 |         else if (weightedSentiment <= 2.0) "NEUTRAL"
71 |         else if (weightedSentiment < 5.0) "POSITIVE"
72 |         else "NOT_UNDERSTOOD"
73 |       }
74 |     }
75 |   }
76 | }


--------------------------------------------------------------------------------
/src/main/scala/com/stdatalabs/SparkES/TwitterSentimentAnalysis.scala:
--------------------------------------------------------------------------------
 1 | package com.stdatalabs.SparkES
 2 | 
 3 | import com.stdatalabs.SparkES.SentimentUtils._
 4 | import org.apache.spark.SparkConf
 5 | import org.apache.spark.streaming.twitter._
 6 | import org.apache.spark.streaming.{Seconds, StreamingContext}
 7 | import org.elasticsearch.spark._
 8 | import org.apache.spark.SparkContext
 9 | import org.apache.spark.SparkContext._
10 | 
11 | import java.util.Date
12 | import java.text.SimpleDateFormat
13 | import java.util.Locale
14 | 
15 | import scala.util.Try
16 | 
17 | /**
18 |  * Twitter sentiment analysis using the Stanford CoreNLP library,
19 |  * storing the results in Elasticsearch.
20 |  *
21 |  * Arguments: <consumerKey> <consumerSecret> <accessToken> <accessTokenSecret> [<keyword_1> ... <keyword_n>]
22 |  * <consumerKey>       - Twitter consumer key
23 |  * <consumerSecret>    - Twitter consumer secret
24 |  * <accessToken>       - Twitter access token
25 |  * <accessTokenSecret> - Twitter access token secret
26 |  * <keyword_1>         - The keyword to filter tweets
27 |  * <keyword_n>         - Any number of keywords to filter tweets
28 |  *
29 |  * Every argument after the fourth is passed to the Twitter stream as a filter keyword.
30 |  *
31 |  * @author Sachin Thirumala
32 |  */
33 | 
34 | object TwitterSentimentAnalysis {
35 | 
36 |   /**
37 |    * Entry point: streams tweets matching the filter keywords, tags each one with a
38 |    * sentiment label and writes it to Elasticsearch.
39 |    *
40 |    * @param args the four Twitter OAuth values followed by optional filter keywords;
41 |    *             the process exits with status 1 when fewer than four are given
42 |    */
43 |   def main(args: Array[String]): Unit = {
44 | 
45 |     if (args.length < 4) {
46 |       System.err.println("Usage: TwitterSentimentAnalysis <consumer key> <consumer secret> " +
47 |         "<access token> <access token secret> [<filters>]")
48 |       System.exit(1)
49 |     }
50 | 
51 |     val Array(consumerKey, consumerSecret, accessToken, accessTokenSecret) = args.take(4)
52 |     // Everything after the OAuth values is a filter keyword (may be empty)
53 |     val filters = args.drop(4)
54 | 
55 |     // Elasticsearch resource (index/type) that receives the tweet documents
56 |     val esResource = "twitter_020717/tweet"
57 | 
58 |     // set twitter oAuth keys
59 |     System.setProperty("twitter4j.oauth.consumerKey", consumerKey)
60 |     System.setProperty("twitter4j.oauth.consumerSecret", consumerSecret)
61 |     System.setProperty("twitter4j.oauth.accessToken", accessToken)
62 |     System.setProperty("twitter4j.oauth.accessTokenSecret", accessTokenSecret)
63 | 
64 |     val conf = new SparkConf().setAppName("TwitterSentimentAnalysis")
65 | 
66 |     // Streaming context with a 5 second batch interval
67 |     val ssc = new StreamingContext(conf, Seconds(5))
68 | 
69 |     // Stream of tweets matching the filter keywords
70 |     val tweets = TwitterUtils.createStream(ssc, None, filters)
71 | 
72 |     tweets.print()
73 | 
74 |     /* Extract and transform the required fields of each tweet and attach its sentiment label.
75 |      * An RDD can be saved into Elasticsearch as long as its content can be translated to a document,
76 |      * so every tweet is turned into a Map before being stored in twitter_020717/tweet.
77 |      */
78 |     tweets.foreachRDD { rdd =>
79 |       rdd.map { t =>
80 |         Map(
81 |           "user" -> t.getUser.getScreenName,
82 |           "created_at" -> t.getCreatedAt.getTime.toString,
83 |           // Option: the key is still present when the tweet carries no geo location
84 |           "location" -> Option(t.getGeoLocation).map(geo => { s"${geo.getLatitude},${geo.getLongitude}" }),
85 |           "text" -> t.getText,
86 |           "hashtags" -> t.getHashtagEntities.map(_.getText),
87 |           "retweet" -> t.getRetweetCount,
88 |           "language" -> t.getLang.toString(),
89 |           "sentiment" -> detectSentiment(t.getText)
90 |         )
91 |       }.saveToEs(esResource)
92 |     }
93 | 
94 |     ssc.start()
95 |     ssc.awaitTermination()
96 | 
97 |   }
98 | }


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 | 	<modelVersion>4.0.0</modelVersion>
  4 | 
  5 | 	<groupId>com.stdatalabs</groupId>
  6 | 	<artifactId>SparkES</artifactId>
  7 | 	<version>0.0.1-SNAPSHOT</version>
  8 | 	<packaging>jar</packaging>
  9 | 
 10 | 	<name>SparkES</name>
 11 | 	<url>http://maven.apache.org</url>
 12 | 
 13 | 	<properties>
 14 | 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 15 | 	</properties>
 16 | 	
 17 | 	<repositories>
 18 |     <repository>
 19 |       <id>cloudera</id>
 20 |       <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
 21 |     </repository>
 22 |   </repositories>
 23 | 
 24 | 	<dependencies>
 25 | 		<dependency>
 26 | 			<groupId>junit</groupId>
 27 | 			<artifactId>junit</artifactId>
 28 | 			<version>3.8.1</version>
 29 | 			<scope>test</scope>
 30 | 		</dependency>
 31 | 
 32 | 		<dependency>
 33 | 			<groupId>org.apache.spark</groupId>
 34 | 			<artifactId>spark-core_2.10</artifactId>
 35 | 			<version>1.5.0</version>
 36 | 			<scope>provided</scope>
 37 | 		</dependency>
 38 | 
 39 | 		<dependency>
 40 | 			<groupId>org.apache.spark</groupId>
 41 | 			<artifactId>spark-streaming_2.10</artifactId>
 42 | 			<version>1.5.0</version>
 43 | 			<scope>provided</scope>
 44 | 		</dependency>
 45 | 		<dependency>
 46 | 			<groupId>org.apache.spark</groupId>
 47 | 			<artifactId>spark-mllib_2.10</artifactId>
 48 | 			<version>1.5.0</version>
 49 | 			<scope>provided</scope>
 50 | 		</dependency>
 51 | 		<dependency>
 52 | 			<groupId>org.apache.spark</groupId>
 53 | 			<artifactId>spark-sql_2.10</artifactId>
 54 | 			<version>1.5.0</version>
 55 | 			<scope>provided</scope>
 56 | 		</dependency>
 57 | 		<dependency>
 58 | 			<groupId>org.apache.spark</groupId>
 59 | 			<artifactId>spark-hive_2.10</artifactId>
 60 | 			<version>1.5.0</version>
 61 | 			<scope>provided</scope>
 62 | 		</dependency>
 63 | 		<dependency>
 64 | 			<groupId>org.twitter4j</groupId>
 65 | 			<artifactId>twitter4j-core</artifactId>
 66 | 			<version>3.0.6</version>
 67 | 		</dependency>
 68 | 		<!-- https://mvnrepository.com/artifact/org.twitter4j/twitter4j-stream -->
 69 | 		<dependency>
 70 | 			<groupId>org.twitter4j</groupId>
 71 | 			<artifactId>twitter4j-stream</artifactId>
 72 | 			<version>3.0.6</version>
 73 | 		</dependency>
 74 | 		<!-- https://mvnrepository.com/artifact/org.twitter4j/twitter4j-async -->
 75 | 		<dependency>
 76 | 			<groupId>org.twitter4j</groupId>
 77 | 			<artifactId>twitter4j-async</artifactId>
 78 | 			<version>3.0.6</version>
 79 | 		</dependency>
 80 | 		<dependency>
 81 | 			<groupId>org.apache.spark</groupId>
 82 | 			<artifactId>spark-streaming-twitter_2.10</artifactId>
 83 | 			<version>1.5.0-cdh5.5.0</version>
 84 | 			<scope>compile</scope>
 85 | 		</dependency>
 86 | 		<dependency>
 87 | 			<groupId>edu.stanford.nlp</groupId>
 88 | 			<artifactId>stanford-corenlp</artifactId>
 89 | 			<version>3.4.1</version>
 90 | 		</dependency>
 91 | 		<dependency>
 92 | 			<groupId>edu.stanford.nlp</groupId>
 93 | 			<artifactId>stanford-corenlp</artifactId>
 94 | 			<version>3.4.1</version>
 95 | 			<classifier>models</classifier>
 96 | 		</dependency>
 97 | 		
 98 | 		<dependency>
 99 | 			<groupId>org.elasticsearch</groupId>
100 | 			<artifactId>elasticsearch-spark-13_2.10</artifactId>
101 | 			<version>5.4.0</version>
102 | 		</dependency>
103 | 	</dependencies>
104 | 
105 | 	<build>
106 | 		<plugins>
107 | 			<plugin>
108 | 				<groupId>org.apache.maven.plugins</groupId>
109 | 				<artifactId>maven-shade-plugin</artifactId>
110 | 				<version>2.4.3</version>
111 | 				<executions>
112 | 					<execution>
113 | 						<phase>package</phase>
114 | 						<goals>
115 | 							<goal>shade</goal>
116 | 						</goals>
117 | 						<configuration>
118 | 							<transformers>
119 | 								<transformer
120 | 									implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
121 | 									<mainClass>com.stdatalabs.SparkES.TwitterSentimentAnalysis</mainClass>
122 | 								</transformer>
123 | 							</transformers>
124 | 						</configuration>
125 | 					</execution>
126 | 				</executions>
127 | 			</plugin>
128 | 		</plugins>
129 | 	</build>
130 | </project>
131 | 


--------------------------------------------------------------------------------
/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  3 |   <modelVersion>4.0.0</modelVersion>
  4 |   <groupId>com.stdatalabs</groupId>
  5 |   <artifactId>SparkES</artifactId>
  6 |   <name>SparkES</name>
  7 |   <version>0.0.1-SNAPSHOT</version>
  8 |   <url>http://maven.apache.org</url>
  9 |   <build>
 10 |     <plugins>
 11 |       <plugin>
 12 |         <artifactId>maven-shade-plugin</artifactId>
 13 |         <version>2.4.3</version>
 14 |         <executions>
 15 |           <execution>
 16 |             <phase>package</phase>
 17 |             <goals>
 18 |               <goal>shade</goal>
 19 |             </goals>
 20 |             <configuration>
 21 |               <transformers>
 22 |                 <transformer>
 23 |                   <mainClass>com.stdatalabs.SparkES.TwitterSentimentAnalysis</mainClass>
 24 |                 </transformer>
 25 |               </transformers>
 26 |             </configuration>
 27 |           </execution>
 28 |         </executions>
 29 |       </plugin>
 30 |     </plugins>
 31 |   </build>
 32 |   <repositories>
 33 |     <repository>
 34 |       <id>cloudera</id>
 35 |       <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
 36 |     </repository>
 37 |   </repositories>
 38 |   <dependencies>
 39 |     <dependency>
 40 |       <groupId>junit</groupId>
 41 |       <artifactId>junit</artifactId>
 42 |       <version>3.8.1</version>
 43 |       <scope>test</scope>
 44 |     </dependency>
 45 |     <dependency>
 46 |       <groupId>org.apache.spark</groupId>
 47 |       <artifactId>spark-core_2.10</artifactId>
 48 |       <version>1.5.0</version>
 49 |       <scope>provided</scope>
 50 |       <exclusions>
 51 |         <exclusion>
 52 |           <artifactId>avro-mapred</artifactId>
 53 |           <groupId>org.apache.avro</groupId>
 54 |         </exclusion>
 55 |         <exclusion>
 56 |           <artifactId>chill_2.10</artifactId>
 57 |           <groupId>com.twitter</groupId>
 58 |         </exclusion>
 59 |         <exclusion>
 60 |           <artifactId>chill-java</artifactId>
 61 |           <groupId>com.twitter</groupId>
 62 |         </exclusion>
 63 |         <exclusion>
 64 |           <artifactId>hadoop-client</artifactId>
 65 |           <groupId>org.apache.hadoop</groupId>
 66 |         </exclusion>
 67 |         <exclusion>
 68 |           <artifactId>spark-launcher_2.10</artifactId>
 69 |           <groupId>org.apache.spark</groupId>
 70 |         </exclusion>
 71 |         <exclusion>
 72 |           <artifactId>spark-network-common_2.10</artifactId>
 73 |           <groupId>org.apache.spark</groupId>
 74 |         </exclusion>
 75 |         <exclusion>
 76 |           <artifactId>spark-network-shuffle_2.10</artifactId>
 77 |           <groupId>org.apache.spark</groupId>
 78 |         </exclusion>
 79 |         <exclusion>
 80 |           <artifactId>spark-unsafe_2.10</artifactId>
 81 |           <groupId>org.apache.spark</groupId>
 82 |         </exclusion>
 83 |         <exclusion>
 84 |           <artifactId>jets3t</artifactId>
 85 |           <groupId>net.java.dev.jets3t</groupId>
 86 |         </exclusion>
 87 |         <exclusion>
 88 |           <artifactId>curator-recipes</artifactId>
 89 |           <groupId>org.apache.curator</groupId>
 90 |         </exclusion>
 91 |         <exclusion>
 92 |           <artifactId>javax.servlet</artifactId>
 93 |           <groupId>org.eclipse.jetty.orbit</groupId>
 94 |         </exclusion>
 95 |         <exclusion>
 96 |           <artifactId>commons-lang3</artifactId>
 97 |           <groupId>org.apache.commons</groupId>
 98 |         </exclusion>
 99 |         <exclusion>
100 |           <artifactId>commons-math3</artifactId>
101 |           <groupId>org.apache.commons</groupId>
102 |         </exclusion>
103 |         <exclusion>
104 |           <artifactId>jsr305</artifactId>
105 |           <groupId>com.google.code.findbugs</groupId>
106 |         </exclusion>
107 |         <exclusion>
108 |           <artifactId>slf4j-api</artifactId>
109 |           <groupId>org.slf4j</groupId>
110 |         </exclusion>
111 |         <exclusion>
112 |           <artifactId>jul-to-slf4j</artifactId>
113 |           <groupId>org.slf4j</groupId>
114 |         </exclusion>
115 |         <exclusion>
116 |           <artifactId>jcl-over-slf4j</artifactId>
117 |           <groupId>org.slf4j</groupId>
118 |         </exclusion>
119 |         <exclusion>
120 |           <artifactId>log4j</artifactId>
121 |           <groupId>log4j</groupId>
122 |         </exclusion>
123 |         <exclusion>
124 |           <artifactId>slf4j-log4j12</artifactId>
125 |           <groupId>org.slf4j</groupId>
126 |         </exclusion>
127 |         <exclusion>
128 |           <artifactId>compress-lzf</artifactId>
129 |           <groupId>com.ning</groupId>
130 |         </exclusion>
131 |         <exclusion>
132 |           <artifactId>snappy-java</artifactId>
133 |           <groupId>org.xerial.snappy</groupId>
134 |         </exclusion>
135 |         <exclusion>
136 |           <artifactId>lz4</artifactId>
137 |           <groupId>net.jpountz.lz4</groupId>
138 |         </exclusion>
139 |         <exclusion>
140 |           <artifactId>RoaringBitmap</artifactId>
141 |           <groupId>org.roaringbitmap</groupId>
142 |         </exclusion>
143 |         <exclusion>
144 |           <artifactId>commons-net</artifactId>
145 |           <groupId>commons-net</groupId>
146 |         </exclusion>
147 |         <exclusion>
148 |           <artifactId>akka-remote_2.10</artifactId>
149 |           <groupId>com.typesafe.akka</groupId>
150 |         </exclusion>
151 |         <exclusion>
152 |           <artifactId>akka-slf4j_2.10</artifactId>
153 |           <groupId>com.typesafe.akka</groupId>
154 |         </exclusion>
155 |         <exclusion>
156 |           <artifactId>json4s-jackson_2.10</artifactId>
157 |           <groupId>org.json4s</groupId>
158 |         </exclusion>
159 |         <exclusion>
160 |           <artifactId>jersey-server</artifactId>
161 |           <groupId>com.sun.jersey</groupId>
162 |         </exclusion>
163 |         <exclusion>
164 |           <artifactId>jersey-core</artifactId>
165 |           <groupId>com.sun.jersey</groupId>
166 |         </exclusion>
167 |         <exclusion>
168 |           <artifactId>mesos</artifactId>
169 |           <groupId>org.apache.mesos</groupId>
170 |         </exclusion>
171 |         <exclusion>
172 |           <artifactId>netty-all</artifactId>
173 |           <groupId>io.netty</groupId>
174 |         </exclusion>
175 |         <exclusion>
176 |           <artifactId>stream</artifactId>
177 |           <groupId>com.clearspring.analytics</groupId>
178 |         </exclusion>
179 |         <exclusion>
180 |           <artifactId>metrics-core</artifactId>
181 |           <groupId>io.dropwizard.metrics</groupId>
182 |         </exclusion>
183 |         <exclusion>
184 |           <artifactId>metrics-jvm</artifactId>
185 |           <groupId>io.dropwizard.metrics</groupId>
186 |         </exclusion>
187 |         <exclusion>
188 |           <artifactId>metrics-json</artifactId>
189 |           <groupId>io.dropwizard.metrics</groupId>
190 |         </exclusion>
191 |         <exclusion>
192 |           <artifactId>metrics-graphite</artifactId>
193 |           <groupId>io.dropwizard.metrics</groupId>
194 |         </exclusion>
195 |         <exclusion>
196 |           <artifactId>jackson-databind</artifactId>
197 |           <groupId>com.fasterxml.jackson.core</groupId>
198 |         </exclusion>
199 |         <exclusion>
200 |           <artifactId>jackson-module-scala_2.10</artifactId>
201 |           <groupId>com.fasterxml.jackson.module</groupId>
202 |         </exclusion>
203 |         <exclusion>
204 |           <artifactId>ivy</artifactId>
205 |           <groupId>org.apache.ivy</groupId>
206 |         </exclusion>
207 |         <exclusion>
208 |           <artifactId>oro</artifactId>
209 |           <groupId>oro</groupId>
210 |         </exclusion>
211 |         <exclusion>
212 |           <artifactId>tachyon-client</artifactId>
213 |           <groupId>org.tachyonproject</groupId>
214 |         </exclusion>
215 |         <exclusion>
216 |           <artifactId>pyrolite</artifactId>
217 |           <groupId>net.razorvine</groupId>
218 |         </exclusion>
219 |         <exclusion>
220 |           <artifactId>py4j</artifactId>
221 |           <groupId>net.sf.py4j</groupId>
222 |         </exclusion>
223 |       </exclusions>
224 |     </dependency>
225 |     <dependency>
226 |       <groupId>org.apache.spark</groupId>
227 |       <artifactId>spark-streaming_2.10</artifactId>
228 |       <version>1.5.0</version>
229 |       <scope>provided</scope>
230 |     </dependency>
231 |     <dependency>
232 |       <groupId>org.apache.spark</groupId>
233 |       <artifactId>spark-mllib_2.10</artifactId>
234 |       <version>1.5.0</version>
235 |       <scope>provided</scope>
236 |       <exclusions>
237 |         <exclusion>
238 |           <artifactId>spark-graphx_2.10</artifactId>
239 |           <groupId>org.apache.spark</groupId>
240 |         </exclusion>
241 |         <exclusion>
242 |           <artifactId>breeze_2.10</artifactId>
243 |           <groupId>org.scalanlp</groupId>
244 |         </exclusion>
245 |         <exclusion>
246 |           <artifactId>pmml-model</artifactId>
247 |           <groupId>org.jpmml</groupId>
248 |         </exclusion>
249 |         <exclusion>
250 |           <artifactId>commons-math3</artifactId>
251 |           <groupId>org.apache.commons</groupId>
252 |         </exclusion>
253 |       </exclusions>
254 |     </dependency>
255 |     <dependency>
256 |       <groupId>org.apache.spark</groupId>
257 |       <artifactId>spark-sql_2.10</artifactId>
258 |       <version>1.5.0</version>
259 |       <scope>provided</scope>
260 |       <exclusions>
261 |         <exclusion>
262 |           <artifactId>spark-catalyst_2.10</artifactId>
263 |           <groupId>org.apache.spark</groupId>
264 |         </exclusion>
265 |         <exclusion>
266 |           <artifactId>parquet-column</artifactId>
267 |           <groupId>org.apache.parquet</groupId>
268 |         </exclusion>
269 |         <exclusion>
270 |           <artifactId>parquet-hadoop</artifactId>
271 |           <groupId>org.apache.parquet</groupId>
272 |         </exclusion>
273 |         <exclusion>
274 |           <artifactId>jackson-databind</artifactId>
275 |           <groupId>com.fasterxml.jackson.core</groupId>
276 |         </exclusion>
277 |       </exclusions>
278 |     </dependency>
279 |     <dependency>
280 |       <groupId>org.apache.spark</groupId>
281 |       <artifactId>spark-hive_2.10</artifactId>
282 |       <version>1.5.0</version>
283 |       <scope>provided</scope>
284 |       <exclusions>
285 |         <exclusion>
286 |           <artifactId>parquet-hadoop-bundle</artifactId>
287 |           <groupId>com.twitter</groupId>
288 |         </exclusion>
289 |         <exclusion>
290 |           <artifactId>hive-exec</artifactId>
291 |           <groupId>org.spark-project.hive</groupId>
292 |         </exclusion>
293 |         <exclusion>
294 |           <artifactId>hive-metastore</artifactId>
295 |           <groupId>org.spark-project.hive</groupId>
296 |         </exclusion>
297 |         <exclusion>
298 |           <artifactId>avro</artifactId>
299 |           <groupId>org.apache.avro</groupId>
300 |         </exclusion>
301 |         <exclusion>
302 |           <artifactId>commons-httpclient</artifactId>
303 |           <groupId>commons-httpclient</groupId>
304 |         </exclusion>
305 |         <exclusion>
306 |           <artifactId>calcite-avatica</artifactId>
307 |           <groupId>org.apache.calcite</groupId>
308 |         </exclusion>
309 |         <exclusion>
310 |           <artifactId>calcite-core</artifactId>
311 |           <groupId>org.apache.calcite</groupId>
312 |         </exclusion>
313 |         <exclusion>
314 |           <artifactId>httpclient</artifactId>
315 |           <groupId>org.apache.httpcomponents</groupId>
316 |         </exclusion>
317 |         <exclusion>
318 |           <artifactId>jackson-mapper-asl</artifactId>
319 |           <groupId>org.codehaus.jackson</groupId>
320 |         </exclusion>
321 |         <exclusion>
322 |           <artifactId>commons-codec</artifactId>
323 |           <groupId>commons-codec</groupId>
324 |         </exclusion>
325 |         <exclusion>
326 |           <artifactId>jodd-core</artifactId>
327 |           <groupId>org.jodd</groupId>
328 |         </exclusion>
329 |         <exclusion>
330 |           <artifactId>datanucleus-core</artifactId>
331 |           <groupId>org.datanucleus</groupId>
332 |         </exclusion>
333 |         <exclusion>
334 |           <artifactId>libthrift</artifactId>
335 |           <groupId>org.apache.thrift</groupId>
336 |         </exclusion>
337 |         <exclusion>
338 |           <artifactId>libfb303</artifactId>
339 |           <groupId>org.apache.thrift</groupId>
340 |         </exclusion>
341 |         <exclusion>
342 |           <artifactId>avro-mapred</artifactId>
343 |           <groupId>org.apache.avro</groupId>
344 |         </exclusion>
345 |         <exclusion>
346 |           <artifactId>jsr305</artifactId>
347 |           <groupId>com.google.code.findbugs</groupId>
348 |         </exclusion>
349 |       </exclusions>
350 |     </dependency>
351 |   </dependencies>
352 |   <properties>
353 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
354 |   </properties>
355 | </project>
356 | 
357 | 


--------------------------------------------------------------------------------