├── CountryAnalysis.xlsx
├── CountrySentiment.png
├── HiveCode.txt
├── README.md
├── analysis1.py
├── dictionary.tsv
├── sentiment2.xlsx
├── time_zone_map.tsv
└── tweetdata.txt

/CountryAnalysis.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AshwanthRamji/Depression-Sentiment-Analysis-with-Twitter-Data/44c043d83bf141e4e7fd74db5e4cb4e5d0ddb449/CountryAnalysis.xlsx
--------------------------------------------------------------------------------
/CountrySentiment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AshwanthRamji/Depression-Sentiment-Analysis-with-Twitter-Data/44c043d83bf141e4e7fd74db5e4cb4e5d0ddb449/CountrySentiment.png
--------------------------------------------------------------------------------
/HiveCode.txt:
--------------------------------------------------------------------------------

-- Create the tweets_raw table containing the records as received from Twitter.
-- NOTE(review): the nested STRUCT/ARRAY type parameters were garbled in this
-- copy (e.g. "user:STRUCT>", "urls:ARRAY>" -- invalid HiveQL with empty type
-- lists). They are reconstructed below from the standard Hortonworks Twitter
-- sentiment tutorial DDL this script follows; confirm against the raw file.
CREATE EXTERNAL TABLE tweets_raw (
    id BIGINT,
    created_at STRING,
    source STRING,
    favorited BOOLEAN,
    retweet_count INT,
    retweeted_status STRUCT<
        text:STRING,
        user:STRUCT<screen_name:STRING, name:STRING>>,
    entities STRUCT<
        urls:ARRAY<STRUCT<expanded_url:STRING>>,
        user_mentions:ARRAY<STRUCT<screen_name:STRING, name:STRING>>,
        hashtags:ARRAY<STRUCT<text:STRING>>>,
    text STRING,
    user STRUCT<
        screen_name:STRING,
        name:STRING,
        friends_count:INT,
        followers_count:INT,
        statuses_count:INT,
        verified:BOOLEAN,
        utc_offset:STRING,    -- was INT but nulls are strings
        time_zone:STRING>,
    in_reply_to_screen_name STRING,
    year INT,
    month INT,
    day INT,
    hour INT
)
ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
LOCATION '/user/ashwa/twitterdata';

-- Sanity check: confirm the JSON records were mapped onto the table.
SELECT * FROM tweets_raw LIMIT 100;

-- Create the sentiment dictionary (ONE TIME PROCESS).
-- Each row: word type, word length, the word, part of speech, stemmed flag,
-- and polarity ('positive' / 'negative' / other).
CREATE EXTERNAL TABLE dictionary (
    type STRING,
    length INT,
    word STRING,
    pos STRING,
    stemmed STRING,
    polarity STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/ashwa/dictionarydata';

-- Create the time zone to country mapper (ONE TIME PROCESS).
CREATE EXTERNAL TABLE time_zone_map (
    time_zone STRING,
    country STRING,
    notes STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/user/ashwa/timedata';

-- Clean up tweets: parse created_at into a real timestamp.
-- The year is hard-coded to '2017' because SUBSTRING(created_at, 5, 15)
-- keeps only the "MMM dd hh:mm:ss" portion of Twitter's date format.
CREATE VIEW tweets_simple AS
SELECT
    id,
    CAST(FROM_UNIXTIME(UNIX_TIMESTAMP(
        CONCAT('2017 ', SUBSTRING(created_at, 5, 15)),
        'yyyy MMM dd hh:mm:ss')) AS TIMESTAMP) AS ts,
    text,
    user.time_zone
FROM tweets_raw;

-- Attach a country to each tweet via its time zone. LEFT OUTER JOIN keeps
-- tweets whose time zone has no mapping (their country is NULL).
CREATE VIEW tweets_clean AS
SELECT
    id,
    ts,
    text,
    m.country
FROM tweets_simple t
LEFT OUTER JOIN time_zone_map m ON t.time_zone = m.time_zone;

-- Compute sentiment: explode each tweet into sentences (l1), then into
-- individual lowercase words (l2).
CREATE VIEW l1 AS
SELECT id, words
FROM tweets_raw
LATERAL VIEW EXPLODE(SENTENCES(LOWER(text))) dummy AS words;

CREATE VIEW l2 AS
SELECT id, word
FROM l1
LATERAL VIEW EXPLODE(words) dummy AS word;

-- Score each word against the dictionary: -1 / +1 / 0 (unknown words score 0
-- via the LEFT OUTER JOIN's NULL polarity falling through to ELSE).
CREATE VIEW l3 AS
SELECT
    id,
    l2.word,
    CASE d.polarity
        WHEN 'negative' THEN -1
        WHEN 'positive' THEN 1
        ELSE 0
    END AS polarity
FROM l2
LEFT OUTER JOIN dictionary d ON l2.word = d.word;

-- Per-tweet sentiment = sign of the summed word polarities.
-- (Note: the warehouse dir must exist first:
--  hadoop fs -mkdir /apps/hive/warehouse)
CREATE TABLE tweets_sentiment AS
SELECT
    id,
    CASE
        WHEN SUM(polarity) > 0 THEN 'positive'
        WHEN SUM(polarity) < 0 THEN 'negative'
        ELSE 'neutral'
    END AS sentiment
FROM l3
GROUP BY id;

-- Put everything back together and re-number sentiment for BI tools.
-- (Could be STORED AS ORC.)
CREATE TABLE tweetsbi AS
SELECT
    t.id,
    t.country,
    CASE s.sentiment
        WHEN 'positive' THEN 1
        WHEN 'neutral' THEN 0
        WHEN 'negative' THEN -1
    END AS sentiment
FROM tweets_clean t
LEFT OUTER JOIN tweets_sentiment s ON t.id = s.id;

-- Same as tweetsbi, without the country column.
CREATE TABLE tweetsbi2 AS
SELECT
    t.id,
    CASE s.sentiment
        WHEN 'positive' THEN 1
        WHEN 'neutral' THEN 0
        WHEN 'negative' THEN -1
    END AS sentiment
FROM tweets_clean t
LEFT OUTER JOIN tweets_sentiment s ON t.id = s.id;

-- Raw tweet text export.
CREATE TABLE tweetsbi5 AS
SELECT
    t.text
FROM tweets_clean t
LEFT OUTER JOIN tweets_sentiment s ON t.id = s.id;
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Depression-Sentiment-Analysis-with-Twitter-Data
Sentiment Analysis using Python, Twitter API and Hive

# Steps
Step 1: Get Twitter sentiments for keywords – depression, anxiety, mental health.

Step 2: Store Twitter sentiments in a text file, collected for an hour. Got 3500 tweets.

Step 3: Install and set up Hadoop and Hive. Once set up, create a table to store the necessary tweet details. Use JsonSerDe to convert the JSON format according to our tables.

Step 4: Create a table called tweets_raw containing the records as received from Twitter.

Step 5: Load the time zone file and the dictionary file into the Hadoop file system (HDFS).

Step 6: The time zone file contains the time zone and the associated country.

Step 7: The dictionary contains words with their polarity. Each word taken from the tweet is compared with the dictionary and given a score.

Step 8: The polarity scores are summed for each tweet; if the sum is above 0 it is a positive tweet, if equal to 0 it is neutral, and if less than 0 it is a negative tweet.

Step 9: In this way tweets are classified as positive or negative.
22 | 23 | Step10. Stores as an excel file and fed to python. Here using Naive Bayes classifier to classify tweets as positive or negative and also see the efficiency of the algorithm. The algorithm ran on the test set and were able to get a 95% accuracy in predicting positive and negative tweets. 24 | -------------------------------------------------------------------------------- /analysis1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jun 1 19:43:52 2017 4 | 5 | @author: ashwa 6 | """ 7 | 8 | from tweepy.streaming import StreamListener 9 | from tweepy import OAuthHandler 10 | from tweepy import Stream 11 | 12 | #Variables that contains the user credentials to access Twitter API 13 | consumer_key = '' 14 | consumer_secret = '' 15 | access_token = '' 16 | access_secret = '' 17 | 18 | 19 | #This is a basic listener that just prints received tweets to stdout. 20 | class StdOutListener(StreamListener): 21 | 22 | def on_data(self, data): 23 | print(data) 24 | return True 25 | 26 | def on_error(self, status): 27 | print(status) 28 | 29 | 30 | if __name__ == '__main__': 31 | 32 | #This handles Twitter authetification and the connection to Twitter Streaming API 33 | l = StdOutListener() 34 | auth = OAuthHandler(consumer_key, consumer_secret) 35 | auth.set_access_token(access_token, access_secret) 36 | stream = Stream(auth, l) 37 | 38 | #This line filter Twitter Streams to capture data by the keywords: 'depression', 'anxiety', 'mental health' 39 | stream.filter(track=['Depression', 'Anxiety', 'mental health']) 40 | 41 | import json 42 | import pandas as pd 43 | import matplotlib.pyplot as plt 44 | 45 | tweets_data_path = 'tweetdata.txt' 46 | 47 | tweets_data = [] 48 | tweets_file = open(tweets_data_path, "r") 49 | for line in tweets_file: 50 | try: 51 | tweet = json.loads(line) 52 | tweets_data.append(tweet) 53 | except: 54 | continue 55 | 56 | print (len(tweets_data)) 57 | ''' 58 
| tweets = pd.DataFrame() 59 | 60 | tweets['id'] = map(lambda tweet: tweet.get('id', None),tweets_data) 61 | tweets['text'] = map(lambda tweet: tweet.get('text', None),tweets_data) 62 | 63 | print(tweets.head()) 64 | print(tweets) 65 | ''' 66 | sent = pd.read_excel('sentiment2.xlsx') 67 | print(sent.head()) 68 | print(sent['id']) 69 | print(len(sent)) 70 | 71 | x = [] 72 | y = [] 73 | for i in range(len(tweets_data)): 74 | if tweets_data[i]['id']==sent['id'][i]: 75 | x.append(tweets_data[i]['text']) 76 | y.append(sent['sentiment'][i]) 77 | print(x[0].split(" ")) 78 | print(y[0]) 79 | ''' 80 | for i in range(len(x)): 81 | x[i] = x[i].split(" ") 82 | print(x[0]) 83 | print(x) 84 | ''' 85 | 86 | from sklearn.naive_bayes import MultinomialNB 87 | from sklearn.feature_extraction.text import CountVectorizer 88 | from sklearn import metrics 89 | 90 | vectorizer = CountVectorizer(stop_words='english') 91 | train_features = vectorizer.fit_transform(x) 92 | 93 | actual = y[:-500] 94 | 95 | 96 | 97 | nb = MultinomialNB() 98 | nb.fit(train_features, [int(r) for r in y]) 99 | 100 | test_features = vectorizer.transform(x[:-500]) 101 | 102 | test_try= vectorizer.transform(["Can we all stop treating anxiety like it's a choice and something cool to have thank you"]) 103 | test_try2= vectorizer.transform(["I want to die depression sucks"]) 104 | predict2 = nb.predict(test_try) 105 | predict3 = nb.predict(test_try2) 106 | 107 | #print(predict2) 108 | predictions = nb.predict(test_features) 109 | 110 | print() 111 | 112 | fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1) 113 | print("Multinomial naive bayes AUC: {0}".format(metrics.auc(fpr, tpr))) 114 | 115 | print(predict2) 116 | print(predict3) 117 | 118 | 119 | -------------------------------------------------------------------------------- /sentiment2.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AshwanthRamji/Depression-Sentiment-Analysis-with-Twitter-Data/44c043d83bf141e4e7fd74db5e4cb4e5d0ddb449/sentiment2.xlsx -------------------------------------------------------------------------------- /time_zone_map.tsv: -------------------------------------------------------------------------------- 1 | time_zone country Column1 Abu Dhabi UNITED ARAB EMIRATES Adelaide AUSTRALIA Alaska UNITED STATES Almaty KAZAKHSTAN Amsterdam NETHERLANDS Arizona UNITED STATES Astana KAZAKHSTAN Athens GREECE Atlantic Time (Canada) CANADA Auckland NEW ZEALAND Azores SPAIN Baghdad IRAQ Baku AZERBAIJAN Bangkok THAILAND Beijing CHINA Belgrade BELGIUM Berlin GERMANY Bern GERMANY Bogota COLOMBIA Brasilia BRAZIL Bratislava SLOVAKIA Brisbane AUSTRALIA Brussels BELGIUM Bucharest ROMANIA Budapest HUNGARY Buenos Aires ARGENTINA Cairo EGYPT Canberra AUSTRALIA Cape Verde Is. CAPE VERDE Caracas VENEZUELA Casablanca MOROCCO Central America MEXICO Inferred Central Time (US & Canada) UNITED STATES Inferred Chennai INDIA Chihuahua MEXICO Chongqing CHINA Copenhagen DENMARK Darwin AUSTRALIA Dhaka BANGLADESH Dublin IRELAND Eastern Time (US & Canada) UNITED STATES Inferred Edinburgh UNITED KINGDOM Ekaterinburg RUSSIAN FEDERATION Fiji SAUDI ARABIA Georgetown GUYANA Greenland GREENLAND Guadalajara MEXICO Guam GUAM Hanoi VIET NAM Harare ZIMBABWE Hawaii UNITED STATES Helsinki FINLAND Hobart AUSTRALIA Hong Kong HONG KONG Indiana (East) UNITED STATES Inferred International Date Line West Irkutsk RUSSIAN FEDERATION Islamabad PAKISTAN Istanbul TURKEY Jakarta INDONESIA Jerusalem ISRAEL Kabul AFGHANISTAN Kamchatka RUSSIAN FEDERATION Karachi PAKISTAN Kathmandu NEPAL Kolkata INDIA Krasnoyarsk RUSSIAN FEDERATION Kuala Lumpur MALAYSIA Kuwait KUWAIT Kyiv UKRAINE La Paz BOLIVIA Lima PERU Lisbon PORTUGAL Ljubljana SLOVENIA London UNITED KINGDOM Madrid SPAIN Magadan ROMANIA Marshall Is. 
MARSHALL ISLANDS Mazatlan MEXICO Melbourne AUSTRALIA Mexico City MEXICO Mid-Atlantic UNITED STATES Inferred Midway Island UNITED STATES Minsk BELARUS Monrovia LIBERIA Monterrey MEXICO Moscow RUSSIAN FEDERATION Mountain Time (US & Canada) UNITED STATES Inferred Mumbai INDIA Muscat OMAN Nairobi KENYA New Caledonia FRANCE New Delhi INDIA Newfoundland UNITED STATES Novosibirsk UKRAINE Nuku'alofa TONGA Osaka JAPAN Pacific Time (US & Canada) UNITED STATES Inferred Paris FRANCE Perth AUSTRALIA Port Moresby PAPUA NEW GUINEA Prague CZECH REPUBLIC Pretoria SOUTH AFRICA Quito ECUADOR Rangoon MYANMAR Riga LATVIA Riyadh SAUDI ARABIA Rome ITALY Samoa PAPUA NEW GUINEA Santiago CHILE Sapporo JAPAN Sarajevo BOSNIA AND HERZEGOWINA Saskatchewan CANADA Seoul KOREA (S) Singapore SINGAPORE Skopje MACEDONIA Sofia BULGARIA Solomon Is. SOLOMON ISLANDS Sri Jayawardenepura SRI LANKA St. Petersburg RUSSIAN FEDERATION Stockholm SWEDEN Sydney AUSTRALIA Taipei TAIWAN Tallinn ESTONIA Tashkent UZBEKISTAN Tbilisi GEORGIA Tehran IRAN (ISLAMIC REPUBLIC OF) Tijuana MEXICO Tokyo JAPAN Ulaan Bataar MONGOLIA Urumqi CHINA Vienna AUSTRIA Vilnius LITHUANIA Vladivostok RUSSIAN FEDERATION Volgograd RUSSIAN FEDERATION Warsaw POLAND Wellington NEW ZEALAND West Central Africa NIGERIA Inferred Yakutsk RUSSIAN FEDERATION Yerevan ARMENIA Zagreb CROATIA --------------------------------------------------------------------------------