├── CountryAnalysis.xlsx
├── CountrySentiment.png
├── HiveCode.txt
├── README.md
├── analysis1.py
├── dictionary.tsv
├── sentiment2.xlsx
├── time_zone_map.tsv
└── tweetdata.txt

/CountryAnalysis.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AshwanthRamji/Depression-Sentiment-Analysis-with-Twitter-Data/44c043d83bf141e4e7fd74db5e4cb4e5d0ddb449/CountryAnalysis.xlsx
--------------------------------------------------------------------------------
/CountrySentiment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AshwanthRamji/Depression-Sentiment-Analysis-with-Twitter-Data/44c043d83bf141e4e7fd74db5e4cb4e5d0ddb449/CountrySentiment.png
--------------------------------------------------------------------------------
/HiveCode.txt:
--------------------------------------------------------------------------------

-- Create the tweets_raw table containing the records as received from Twitter.
-- NOTE(review): the nested STRUCT/ARRAY type parameters were garbled in this
-- copy (e.g. "user:STRUCT>", "urls:ARRAY>" -- invalid HiveQL with empty type
-- lists). They are reconstructed below from the standard Hortonworks Twitter
-- sentiment tutorial DDL this script follows; confirm against the raw file.
CREATE EXTERNAL TABLE tweets_raw (
    id BIGINT,
    created_at STRING,
    source STRING,
    favorited BOOLEAN,
    retweet_count INT,
    retweeted_status STRUCT<
        text:STRING,
        user:STRUCT<screen_name:STRING, name:STRING>>,
    entities STRUCT<
        urls:ARRAY<STRUCT<expanded_url:STRING>>,
        user_mentions:ARRAY<STRUCT<screen_name:STRING, name:STRING>>,
        hashtags:ARRAY<STRUCT<text:STRING>>>,
    text STRING,
    user STRUCT<
        screen_name:STRING,
        name:STRING,
        friends_count:INT,
        followers_count:INT,
        statuses_count:INT,
        verified:BOOLEAN,
        utc_offset:STRING,    -- was INT but nulls are strings
        time_zone:STRING>,
    in_reply_to_screen_name STRING,
    year INT,
    month INT,
    day INT,
    hour INT
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/user/ashwa/twitterdata';

-- Sanity check: confirm the JSON records were mapped onto the table.
SELECT * FROM tweets_raw LIMIT 100;

-- Create the sentiment dictionary (ONE TIME PROCESS).
-- Each row: word type, word length, the word, part of speech, stemmed flag,
-- and polarity ('positive' / 'negative' / other).
CREATE EXTERNAL TABLE dictionary (
    type STRING,
    length INT,
    word STRING,
    pos STRING,
    stemmed STRING,
    polarity STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/ashwa/dictionarydata';

-- Create the time zone to country mapper (ONE TIME PROCESS).
CREATE EXTERNAL TABLE time_zone_map (
    time_zone STRING,
    country STRING,
    notes STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/ashwa/timedata';

-- Clean up tweets: parse created_at into a real timestamp.
-- The year is hard-coded to '2017' because SUBSTRING(created_at, 5, 15)
-- keeps only the "MMM dd hh:mm:ss" portion of Twitter's date format.
CREATE VIEW tweets_simple AS
SELECT
    id,
    CAST(FROM_UNIXTIME(UNIX_TIMESTAMP(
        CONCAT('2017 ', SUBSTRING(created_at, 5, 15)),
        'yyyy MMM dd hh:mm:ss')) AS TIMESTAMP) AS ts,
    text,
    user.time_zone
FROM tweets_raw;

-- Attach a country to each tweet via its time zone. LEFT OUTER JOIN keeps
-- tweets whose time zone has no mapping (their country is NULL).
CREATE VIEW tweets_clean AS
SELECT
    id,
    ts,
    text,
    m.country
FROM tweets_simple t
LEFT OUTER JOIN time_zone_map m ON t.time_zone = m.time_zone;

-- Compute sentiment: explode each tweet into sentences (l1), then into
-- individual lowercase words (l2).
CREATE VIEW l1 AS
SELECT id, words
FROM tweets_raw
LATERAL VIEW EXPLODE(SENTENCES(LOWER(text))) dummy AS words;

CREATE VIEW l2 AS
SELECT id, word
FROM l1
LATERAL VIEW EXPLODE(words) dummy AS word;

-- Score each word against the dictionary: -1 / +1 / 0 (unknown words score 0
-- via the LEFT OUTER JOIN's NULL polarity falling through to ELSE).
CREATE VIEW l3 AS
SELECT
    id,
    l2.word,
    CASE d.polarity
        WHEN 'negative' THEN -1
        WHEN 'positive' THEN 1
        ELSE 0
    END AS polarity
FROM l2
LEFT OUTER JOIN dictionary d ON l2.word = d.word;

-- Per-tweet sentiment = sign of the summed word polarities.
-- (Note: the warehouse dir must exist first:
--  hadoop fs -mkdir /apps/hive/warehouse)
CREATE TABLE tweets_sentiment AS
SELECT
    id,
    CASE
        WHEN SUM(polarity) > 0 THEN 'positive'
        WHEN SUM(polarity) < 0 THEN 'negative'
        ELSE 'neutral'
    END AS sentiment
FROM l3
GROUP BY id;

-- Put everything back together and re-number sentiment for BI tools.
-- (Could be STORED AS ORC.)
CREATE TABLE tweetsbi AS
SELECT
    t.id,
    t.country,
    CASE s.sentiment
        WHEN 'positive' THEN 1
        WHEN 'neutral' THEN 0
        WHEN 'negative' THEN -1
    END AS sentiment
FROM tweets_clean t
LEFT OUTER JOIN tweets_sentiment s ON t.id = s.id;

-- Same as tweetsbi, without the country column.
CREATE TABLE tweetsbi2 AS
SELECT
    t.id,
    CASE s.sentiment
        WHEN 'positive' THEN 1
        WHEN 'neutral' THEN 0
        WHEN 'negative' THEN -1
    END AS sentiment
FROM tweets_clean t
LEFT OUTER JOIN tweets_sentiment s ON t.id = s.id;

-- Raw tweet text export.
CREATE TABLE tweetsbi5 AS
SELECT
    t.text
FROM tweets_clean t
LEFT OUTER JOIN tweets_sentiment s ON t.id = s.id;
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Depression-Sentiment-Analysis-with-Twitter-Data
Sentiment Analysis using Python, Twitter API and Hive

# Steps
Step 1: Get Twitter sentiments for keywords – depression, anxiety, mental health.

Step 2: Store Twitter sentiments in a text file, collected for an hour. Got 3500 tweets.

Step 3: Install and set up Hadoop and Hive. Once set up, create a table to store the necessary tweet details. Use JsonSerDe to convert the JSON format according to our tables.

Step 4: Create a table called tweets_raw containing the records as received from Twitter.

Step 5: Load the time zone file and the dictionary file into the Hadoop file system (HDFS).

Step 6: The time zone file contains the time zone and the associated country.

Step 7: The dictionary contains words with their polarity. Each word taken from the tweet is compared with the dictionary and given a score.

Step 8: The polarity scores are summed for each tweet; if the sum is above 0 it is a positive tweet, if equal to 0 it is neutral, and if less than 0 it is a negative tweet.

Step 9: In this way tweets are classified as positive or negative.
22 | 23 | Step10. Stores as an excel file and fed to python. Here using Naive Bayes classifier to classify tweets as positive or negative and also see the efficiency of the algorithm. The algorithm ran on the test set and were able to get a 95% accuracy in predicting positive and negative tweets. 24 | -------------------------------------------------------------------------------- /analysis1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jun 1 19:43:52 2017 4 | 5 | @author: ashwa 6 | """ 7 | 8 | from tweepy.streaming import StreamListener 9 | from tweepy import OAuthHandler 10 | from tweepy import Stream 11 | 12 | #Variables that contains the user credentials to access Twitter API 13 | consumer_key = '' 14 | consumer_secret = '' 15 | access_token = '' 16 | access_secret = '' 17 | 18 | 19 | #This is a basic listener that just prints received tweets to stdout. 20 | class StdOutListener(StreamListener): 21 | 22 | def on_data(self, data): 23 | print(data) 24 | return True 25 | 26 | def on_error(self, status): 27 | print(status) 28 | 29 | 30 | if __name__ == '__main__': 31 | 32 | #This handles Twitter authetification and the connection to Twitter Streaming API 33 | l = StdOutListener() 34 | auth = OAuthHandler(consumer_key, consumer_secret) 35 | auth.set_access_token(access_token, access_secret) 36 | stream = Stream(auth, l) 37 | 38 | #This line filter Twitter Streams to capture data by the keywords: 'depression', 'anxiety', 'mental health' 39 | stream.filter(track=['Depression', 'Anxiety', 'mental health']) 40 | 41 | import json 42 | import pandas as pd 43 | import matplotlib.pyplot as plt 44 | 45 | tweets_data_path = 'tweetdata.txt' 46 | 47 | tweets_data = [] 48 | tweets_file = open(tweets_data_path, "r") 49 | for line in tweets_file: 50 | try: 51 | tweet = json.loads(line) 52 | tweets_data.append(tweet) 53 | except: 54 | continue 55 | 56 | print (len(tweets_data)) 57 | ''' 58 
| tweets = pd.DataFrame() 59 | 60 | tweets['id'] = map(lambda tweet: tweet.get('id', None),tweets_data) 61 | tweets['text'] = map(lambda tweet: tweet.get('text', None),tweets_data) 62 | 63 | print(tweets.head()) 64 | print(tweets) 65 | ''' 66 | sent = pd.read_excel('sentiment2.xlsx') 67 | print(sent.head()) 68 | print(sent['id']) 69 | print(len(sent)) 70 | 71 | x = [] 72 | y = [] 73 | for i in range(len(tweets_data)): 74 | if tweets_data[i]['id']==sent['id'][i]: 75 | x.append(tweets_data[i]['text']) 76 | y.append(sent['sentiment'][i]) 77 | print(x[0].split(" ")) 78 | print(y[0]) 79 | ''' 80 | for i in range(len(x)): 81 | x[i] = x[i].split(" ") 82 | print(x[0]) 83 | print(x) 84 | ''' 85 | 86 | from sklearn.naive_bayes import MultinomialNB 87 | from sklearn.feature_extraction.text import CountVectorizer 88 | from sklearn import metrics 89 | 90 | vectorizer = CountVectorizer(stop_words='english') 91 | train_features = vectorizer.fit_transform(x) 92 | 93 | actual = y[:-500] 94 | 95 | 96 | 97 | nb = MultinomialNB() 98 | nb.fit(train_features, [int(r) for r in y]) 99 | 100 | test_features = vectorizer.transform(x[:-500]) 101 | 102 | test_try= vectorizer.transform(["Can we all stop treating anxiety like it's a choice and something cool to have thank you"]) 103 | test_try2= vectorizer.transform(["I want to die depression sucks"]) 104 | predict2 = nb.predict(test_try) 105 | predict3 = nb.predict(test_try2) 106 | 107 | #print(predict2) 108 | predictions = nb.predict(test_features) 109 | 110 | print() 111 | 112 | fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1) 113 | print("Multinomial naive bayes AUC: {0}".format(metrics.auc(fpr, tpr))) 114 | 115 | print(predict2) 116 | print(predict3) 117 | 118 | 119 | -------------------------------------------------------------------------------- /sentiment2.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AshwanthRamji/Depression-Sentiment-Analysis-with-Twitter-Data/44c043d83bf141e4e7fd74db5e4cb4e5d0ddb449/sentiment2.xlsx -------------------------------------------------------------------------------- /time_zone_map.tsv: -------------------------------------------------------------------------------- 1 | time_zone country Column1 Abu Dhabi UNITED ARAB EMIRATES Adelaide AUSTRALIA Alaska UNITED STATES Almaty KAZAKHSTAN Amsterdam NETHERLANDS Arizona UNITED STATES Astana KAZAKHSTAN Athens GREECE Atlantic Time (Canada) CANADA Auckland NEW ZEALAND Azores SPAIN Baghdad IRAQ Baku AZERBAIJAN Bangkok THAILAND Beijing CHINA Belgrade BELGIUM Berlin GERMANY Bern GERMANY Bogota COLOMBIA Brasilia BRAZIL Bratislava SLOVAKIA Brisbane AUSTRALIA Brussels BELGIUM Bucharest ROMANIA Budapest HUNGARY Buenos Aires ARGENTINA Cairo EGYPT Canberra AUSTRALIA Cape Verde Is. CAPE VERDE Caracas VENEZUELA Casablanca MOROCCO Central America MEXICO Inferred Central Time (US & Canada) UNITED STATES Inferred Chennai INDIA Chihuahua MEXICO Chongqing CHINA Copenhagen DENMARK Darwin AUSTRALIA Dhaka BANGLADESH Dublin IRELAND Eastern Time (US & Canada) UNITED STATES Inferred Edinburgh UNITED KINGDOM Ekaterinburg RUSSIAN FEDERATION Fiji SAUDI ARABIA Georgetown GUYANA Greenland GREENLAND Guadalajara MEXICO Guam GUAM Hanoi VIET NAM Harare ZIMBABWE Hawaii UNITED STATES Helsinki FINLAND Hobart AUSTRALIA Hong Kong HONG KONG Indiana (East) UNITED STATES Inferred International Date Line West Irkutsk RUSSIAN FEDERATION Islamabad PAKISTAN Istanbul TURKEY Jakarta INDONESIA Jerusalem ISRAEL Kabul AFGHANISTAN Kamchatka RUSSIAN FEDERATION Karachi PAKISTAN Kathmandu NEPAL Kolkata INDIA Krasnoyarsk RUSSIAN FEDERATION Kuala Lumpur MALAYSIA Kuwait KUWAIT Kyiv UKRAINE La Paz BOLIVIA Lima PERU Lisbon PORTUGAL Ljubljana SLOVENIA London UNITED KINGDOM Madrid SPAIN Magadan ROMANIA Marshall Is. 
MARSHALL ISLANDS Mazatlan MEXICO Melbourne AUSTRALIA Mexico City MEXICO Mid-Atlantic UNITED STATES Inferred Midway Island UNITED STATES Minsk BELARUS Monrovia LIBERIA Monterrey MEXICO Moscow RUSSIAN FEDERATION Mountain Time (US & Canada) UNITED STATES Inferred Mumbai INDIA Muscat OMAN Nairobi KENYA New Caledonia FRANCE New Delhi INDIA Newfoundland UNITED STATES Novosibirsk UKRAINE Nuku'alofa TONGA Osaka JAPAN Pacific Time (US & Canada) UNITED STATES Inferred Paris FRANCE Perth AUSTRALIA Port Moresby PAPUA NEW GUINEA Prague CZECH REPUBLIC Pretoria SOUTH AFRICA Quito ECUADOR Rangoon MYANMAR Riga LATVIA Riyadh SAUDI ARABIA Rome ITALY Samoa PAPUA NEW GUINEA Santiago CHILE Sapporo JAPAN Sarajevo BOSNIA AND HERZEGOWINA Saskatchewan CANADA Seoul KOREA (S) Singapore SINGAPORE Skopje MACEDONIA Sofia BULGARIA Solomon Is. SOLOMON ISLANDS Sri Jayawardenepura SRI LANKA St. Petersburg RUSSIAN FEDERATION Stockholm SWEDEN Sydney AUSTRALIA Taipei TAIWAN Tallinn ESTONIA Tashkent UZBEKISTAN Tbilisi GEORGIA Tehran IRAN (ISLAMIC REPUBLIC OF) Tijuana MEXICO Tokyo JAPAN Ulaan Bataar MONGOLIA Urumqi CHINA Vienna AUSTRIA Vilnius LITHUANIA Vladivostok RUSSIAN FEDERATION Volgograd RUSSIAN FEDERATION Warsaw POLAND Wellington NEW ZEALAND West Central Africa NIGERIA Inferred Yakutsk RUSSIAN FEDERATION Yerevan ARMENIA Zagreb CROATIA --------------------------------------------------------------------------------