#!/usr/bin/python

__author__ = "Eric Chiang"
__copyright__ = "Copyright 2013, Eric Chiang"
__email__ = "eric.chiang.m@gmail.com"

__license__ = "GPL"
__version__ = "3.0"

from Queue import Queue
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVR
from threading import Thread
from time import sleep
from tweet_text import TweetTokenizer
import numpy as np
import pandas as pd
import sys

"""
Cross validation file

Estimates a mean squared error for every target variable with k-fold cross
validation. One SVR is fitted per variable on binary bag-of-words features
and folds are processed by a small pool of threads.
"""


def run_fold(train_indices, test_indices, vectorizer, train_data, pred, queue):
    """Fit and score every target variable for one cross-validation fold.

    Reads the module-level ``raw_tweets`` array and ``variables`` list.

    Args:
        train_indices: integer positions of the rows used for fitting.
        test_indices: integer positions of the held-out rows.
        vectorizer: already fitted vectorizer used to featurize the tweets.
        train_data: data frame holding one column per target variable.
        pred: data frame that receives the held-out predictions in place.
        queue: queue that is sent "done" on success or "failed" on error.

    Raises:
        Exception: anything raised while vectorizing or fitting is re-raised
            after "failed" has been posted to ``queue``.
    """
    print("Processing Fold")
    try:
        X_train = vectorizer.transform(raw_tweets[train_indices].tolist())
        X_test = vectorizer.transform(raw_tweets[test_indices].tolist())
        for variable in variables:
            y_train = np.array(train_data[variable])[train_indices]
            clf = SVR()
            clf.fit(X_train, y_train)
            # One positional assignment on the frame itself; the chained
            # pred[variable][...] form may silently write to a copy.
            column = pred.columns.get_loc(variable)
            pred.iloc[test_indices, column] = clf.predict(X_test)
    except Exception:
        # Always report back, otherwise the scheduler below waits forever
        queue.put("failed")
        raise
    # When finished notify queue
    queue.put("done")


# Load train data from file
train_data = pd.read_csv('data/train.csv', quotechar='"')

# Numpy array used to allow for test/train splitting
raw_tweets = np.array(train_data['tweet'])
variables = train_data.columns[4:].tolist()

if len(variables) != 24:
    # write() only accepts strings, so format the list first
    sys.stderr.write("%s\n" % (variables,))
    raise Exception("Bad number of sentiments")

# Allocate data frame for predictions
pred = train_data.copy()

# Build vectorizer
tokenizer = TweetTokenizer()
print("Building vectorizer")
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize_tweet,
                             max_features=50,
                             binary=True,
                             ngram_range=(1, 1))

# Vectorization is done after folds are split. It takes more time to slice a
# sparse matrix than a list of strings.
vectorizer.fit(raw_tweets)


q = Queue()
num_threads = 4
num_folds = 8
threads = []
statuses = []

# Spawn a thread for each fold
k_fold = KFold(n=len(raw_tweets), n_folds=num_folds, indices=True)
for train_indices, test_indices in k_fold:
    if len(threads) >= num_threads:
        # Blocks until one of the running folds reports back
        statuses.append(q.get())
    t = Thread(target=run_fold, args=(train_indices, test_indices,
                                      vectorizer, train_data, pred, q))
    t.daemon = True
    t.start()
    threads.append(t)

# Wait for all threads to complete
for t in threads:
    t.join()

# Collect the reports that were not consumed while throttling
while not q.empty():
    statuses.append(q.get())

if "failed" in statuses:
    # Rows of a failed fold still hold the true labels copied into ``pred``,
    # which would make the errors below look better than they are.
    raise Exception("At least one fold failed")

# Calculate (and print) mean squared errors for each variable
print("Mean Squared Errors:")
for variable in variables:
    y_pred = np.array(pred[variable])
    y_actu = np.array(train_data[variable])

    mse = np.average((y_pred - y_actu) ** 2)
    print("'%s': %f" % (variable, mse))
-------------------------------------------------------------------------------- 1 | s = sentiment 2 | w = when 3 | k = kind 4 | ============================================================ 5 | s1,"I can't tell" 6 | s2,"Negative" 7 | s3,"Neutral / author is just sharing information" 8 | s4,"Positive" 9 | s5,"Tweet not related to weather condition" 10 | w1,"current (same day) weather" 11 | w2,"future (forecast)" 12 | w3,"I can't tell" 13 | w4,"past weather" 14 | k1,"clouds" 15 | k2,"cold" 16 | k3,"dry" 17 | k4,"hot" 18 | k5,"humid" 19 | k6,"hurricane" 20 | k7,"I can't tell" 21 | k8,"ice" 22 | k9,"other" 23 | k10,"rain" 24 | k11,"snow" 25 | k12,"storms" 26 | k13,"sun" 27 | k14,"tornado" 28 | k15,"wind" 29 | -------------------------------------------------------------------------------- /doc.md: -------------------------------------------------------------------------------- 1 | Documentation for inital attempt at competition (and before I joined yhat) 2 | 3 |

Overview

4 | 5 | Sentiment classification of tweets! Each tweet is linked to a confidence 6 | interval of 15 unique sentiments. 7 | 8 | Sentiments include (see data/variableNames.txt for details): 9 | 16 | 17 |

Methodology

18 | 19 | Simple pipeline of feature generation to classification. 20 | 21 | Feature space is comprised of word frequencies for the m most common words and only 22 | considers unigram information. Linear ridge regression is used for 23 | classification. Classifier predominantly preferred because of speed. 24 | 25 |

Results

26 | 27 | The cross validation estimate for total RMSE across all 15 features is about 0.169 28 | when m = 270 (aka feature space incorporates the 270 most common words). 29 | Current leader (2013-11-04) has attained around a 0.146 RMSE. Have only been 30 | able to bump m up to 270 before my computer maxed out its active memory. I need 31 | to upgrade. 32 | Cross validation RMSE: 0.169 (m = 270; alpha = 1e-7; folds = 10) 33 | 34 | UPDATE (2013-11-05):
35 | scipy.sparse matrices implemented. Have had successful runs using 600 features, 36 | though generating test and training folds now takes significantly longer. Took 37 | an hour to split 600 features into 10 folds on my machine with no 38 | parallelization. Each fold is written to file then retrieved individually for cv 39 | to maintain low active memory use; at this point this package definitely does 40 | not optimize for speed. Will try for 1000 features next.
41 | Cross validation RMSE: 0.164 (m = 600; ridge regression; alpha = 1e-7) 42 | 43 | UPDATE (2013-11-06):
44 | Successful run with 1000 features. Memory failure when attempting 2000. 45 | Considering workarounds.
46 | Cross validation RMSE: 0.160 (m = 1000; alpha = 1e-7; folds = 10) 47 | 48 | UPDATE (2013-11-07):
49 | Observations now split into train/test folds before being mapped to feature 50 | space. This means fold splitting is performed on a numpy array rather than a 51 | large sparse matrix, which creates a faster and lower memory use pipeline. 52 | Successful runs with 2000 and 3000 features. Very clearly hitting diminishing 53 | returns when considering expansions of the feature space.
54 | Cross validation RMSE: 0.1578 (m = 3000; alpha = 1e-7; folds = 10) 55 | 56 | UPDATE (2013-11-08):
57 | First two submissions to the Kaggle.com leaderboards. First submission used a 58 | feature space of the m most frequent words in the training data, the second 59 | considered the m most frequent words in the combination of the training and 60 | test. The first strategy proved marginally better though both resulted in a 61 | surprisingly high error score by Kaggle as compared to cross validation. Will 62 | discuss those discrepancies in detail under the 'Details/Musings' header since 63 | the differences are far more significant than what might be allowable.
64 | Kaggle RMSE: 0.16350 (m = 3000; alpha = 1e-7) 65 | 66 | UPDATE (2013-11-09):
67 | Significant expansion of rare word mapping. Numeric and alphanumeric data 68 | mapped to a larger set of possible values. Emoticon information now preserved 69 | as inspired by Go, Bhayani, and Huang's demonstration of its usefulness in 70 | Twitter sentiment classification [2]. The presence of exclamation and question marks 71 | is also collected; both were noted by Pang, Lee and Vaithyanathan to be an 72 | unobvious source of information to humans (grad students) picking indicator 73 | words [1]. Details under the 'Details/Musings' header.
74 | Cross validation RMSE: 0.1578 (m = 3000; alpha = 1e-7; folds = 3)
75 | Cross validation RMSE: 0.1576 (m = 4000; alpha = 1e-7; folds = 3)
76 | Cross validation RMSE: 0.1586 (m = 5000; alpha = 1e-7; folds = 3)
77 | Kaggle RMSE: 0.16298 (m = 3000; alpha = 1e-7)
78 | Kaggle RMSE: 0.16314 (m = 4000; alpha = 1e-7)
79 | Kaggle RMSE: 0.16370 (m = 5000; alpha = 1e-7)
80 | 81 | UPDATE (2013-11-10):
82 | Considering four new modifications after yesterday's results:
83 | Increasing the alpha value to trade some bias for lower variance. Considering word 84 | presence rather than word count for feature generation [1]. Reworking 85 | alteration of scores predicted out of range (above 1.0 or below 0.0). Trying 86 | different classifiers. 87 | 88 | UPDATE (2013-11-11):
89 | Elastic net tested and immediately showed improvements over ridge. Takes 90 | significantly longer to run than ridge but still completes in a reasonable 91 | amount of time. Attempted to test support vector machine regression since it 92 | has been shown to be successful as a sentiment classifier. Unfortunately, time 93 | complexity is an issue since it took roughly twice the time of elastic net in 94 | initial testing. Am now using a free EC2 micro instance on AWS to run 95 | regressions since I need to use my laptop for other work. If anyone wants to 96 | donate cycles message me! Currently placed 26th with elastic net results.
97 | Kaggle RMSE: 0.16014 (m = 3000; elastic net; alpha = 1e-5)
98 | Kaggle RMSE: 0.15970 (m = 4000; elastic net; alpha = 1e-5) 99 | 100 | 101 |

Details/Musings

102 | 103 | 2013-11-07:
104 | To be honest I've been surprised by how successful my approach has been so far. 105 | There has been very little thought put into parsing tweets or improving 106 | information retrieval pre-regression. The only efforts so far have been mapping 107 | rare words; grouping similar low frequency words which would intuitively hold 108 | similar meanings. The binning efforts which have been implemented so far pertain 109 | to numeric and alphanumeric words. 110 |
Currently bins: 111 |
122 | Initial efforts to further partition bins failed to improve the accuracy of the 123 | hypothesis. Though admittedly, those efforts took place when the feature 124 | space was an order of magnitude smaller. 125 | 126 | 2013-11-08:
127 | My initial hypothesis to explain why the cross validation errors were so 128 | different from the Kaggle scores is that the high number of folds leads to a 129 | high train/test ratio, allowing the training data to better encompass the test. 130 | Will run 3 fold CV (rather than 10 fold) against future submissions to test this 131 | theory. Very adamant in bringing CV in line with Kaggle since CV results will 132 | be critical in determining if different variables require different classifiers. 133 | 134 | Also, for documentation, all reported scores will now specify if they were 135 | produced by cross validation or by blind test (what Kaggle does). Number of 136 | folds will also be noted. This should have been the policy from the beginning, 137 | but it's hard to voluntarily admit that your model might not be as good as it 138 | appears. Luckily, Kaggle provides the benchmark and self reporting of scores 139 | remains a rare phenomenon in other fields. 140 | 141 | 2013-11-09:
142 | Current rare word mapping strategies. 143 | 144 | Before stripping tweets of punctuation and splitting into words, the string is 145 | checked for exclamation marks, question marks and emoticons. Positive emoticons 146 | are mapped to ':)', negative emoticons are mapped to ':(', neutral emoticons 147 | are mapped to ':/', and winky faces ';)' are maintained. Links, mentions, and RT's 148 | are now also preserved (they were previously not). Exclamation and question 149 | mark methodology taken from [1]. Emoticon methodology taken from [2]. 150 | 151 | After tweets are cleared of punctuation and split, each individual word is 152 | determined to be either numeric, alphanumeric, or alphabetical. If the word is 153 | numeric, it is rounded down to the nearest ten, grouping numbers of similar 154 | size together. If the word is alphanumeric, the numbers are removed and a 155 | special '#num' tag is appended to the word. This has had the very pleasing 156 | result of producing common strings such as 'mph #num', 'pm #num', and 'f #num' 157 | which arise organically from the program rather than having to be selected for. 158 | 159 | 160 |

References

#!/usr/bin/python

__author__ = "Eric Chiang"
__copyright__ = "Copyright 2013, Eric Chiang"
__email__ = "eric.chiang.m@gmail.com"

__license__ = "GPL"
__version__ = "3.0"

# Credentials live in account.py, the module upload.py imports and the one
# listed in .gitignore.
from account import username, apikey
from yhat import Yhat
import pandas as pd
import numpy as np
import sys

"""
Score test data using yhat and create a submission

Usage: score.py OUTPUT_CSV

Every target variable is predicted by its own "TweetClassifier_<variable>"
model hosted on yhat; the predictions fill the matching column of the sample
submission, which is then written to OUTPUT_CSV.
"""

if len(sys.argv) != 2:
    sys.stderr.write("Please specify a file to write predictions out to!\n")
    sys.exit(2)

sub_file = sys.argv[1]

# Record the best model (yhat model version to query for each variable)
best_model = {
    's1': 1, 's2': 1, 's3': 1, 's4': 1, 's5': 1,
    'w1': 1, 'w2': 1, 'w3': 1, 'w4': 1,
    'k1': 1, 'k2': 1, 'k3': 1, 'k4': 1, 'k5': 1,
    'k6': 1, 'k7': 1, 'k8': 1, 'k9': 1, 'k10': 1,
    'k11': 1, 'k12': 1, 'k13': 1, 'k14': 1, 'k15': 1,
}

test_data = pd.read_csv('data/test.csv', quotechar='"')

sub_data = pd.read_csv('data/sampleSubmission.csv', quotechar='"')

# Predictions are written by position, so both files must list the same ids
if not np.all(test_data['id'] == sub_data['id']):
    raise Exception("IDs do not match")

yh = Yhat(username, apikey)

# The first column of the sample submission is the id; the rest are targets
variables = sub_data.columns[1:]
raw_tweets = test_data['tweet'].tolist()

for variable in variables:
    model_version = best_model[variable]
    model_name = "TweetClassifier_%s" % (variable,)
    results_from_server = yh.raw_predict(model_name,
                                         model_version,
                                         raw_tweets)
    pred = results_from_server['prediction']['scores']
    sub_data[variable] = pred

try:
    with open(sub_file, 'w') as out_file:
        sub_data.to_csv(out_file, index=False)
except IOError:
    sys.stderr.write("IO error: could not write data to file\n")
    # Signal the failure to the caller instead of exiting with status 0
    sys.exit(1)
#!/usr/bin/python

__author__ = "Eric Chiang"
__copyright__ = "Copyright 2013, Eric Chiang"
__email__ = "eric.chiang.m@gmail.com"

__license__ = "GPL"
__version__ = "3.0"

import re
import string

"""
Tokenizer to capture information from tweets

Includes rare word binning
"""


class TweetTokenizer(object):
    """Turn a raw tweet into a list of feature tokens.

    Ordinary words are lowercased and stripped of punctuation. Exclamation
    and question marks, emoticons, mentions, links, times, temperatures,
    wind speeds, percentages and plain numbers are replaced by upper-case
    marker tokens such as ``EXL``, ``SMILEY``, ``TEMP`` or ``NICENUMBER``.
    """

    def __init__(self):
        """Compile the patterns and emoticon tables used by tokenize_tweet."""
        self.re_number = re.compile(r'^\d*\.?\d+$')
        self.re_alphanum = re.compile(r'\d')
        self.re_weather = re.compile(
            r"#WEATHER.* (\d+\.\d+)F.* (\d+\.\d+)% Humidity. (\d+\.\d+)MPH")
        self.re_time = re.compile(r'\d{1,2}:\d\d[ ]?(am|pm)?')  # Time
        self.re_temp = re.compile(
            r'(\d+\.?\d*) ?(fahrenheit|celcius|f|c|degrees|degree)(\W|$)')
        self.re_velo = re.compile(r'\d+\.?\d* ?mph')  # Velocity
        self.re_perc = re.compile(r'\d+\.?\d* ?(%|percent)')
        self.re_punc = re.compile(r'[%s]' % re.escape(string.punctuation))
        self.re_numb = re.compile(r'^-?\d+\.?\d*$')

        self.pos_emoticons = [':)', ':-)', ' : )', ':D', '=)', ' : D ',
                              '(:', '(=']
        self.neg_emoticons = [':(', ':-(', ' : (', '=(', '):', ')=']
        # Not consulted by tokenize_tweet; kept for callers reading it
        self.ntr_emoticons = [':/', '=/', ':\\', '=\\', ':S', '=S',
                              ':|', '=|']

        self.meta_mention = '@mention'
        self.meta_link = '{link}'

    @staticmethod
    def _weather_tokens(match):
        """Build the fixed token list for an automated '#WEATHER' report."""
        temp = float(match.group(1))
        humd = float(match.group(2))
        mph = float(match.group(3))
        if temp > 85:
            sent = 'HOTNUMBER'
        elif temp > 45:
            sent = 'NICENUMBER'
        else:
            sent = 'COLDNUMBER'

        # Round every reading down to the nearest ten
        temp = str(int(temp / 10.0) * 10)
        humd = str(int(humd / 10.0) * 10)
        mph = str(int(mph / 10.0) * 10)
        return ['WEATHER', 'MPH' + mph, 'TEMP' + temp, 'HUMD' + humd,
                sent, 'TEMP']

    @staticmethod
    def _number_bucket(value):
        """Map a number onto its coarse size marker."""
        if value > 120:
            return 'LARGENUMBER'
        if value > 85:
            return 'HOTNUMBER'
        if value > 45:
            return 'NICENUMBER'
        if value > 10:
            return 'COLDNUMBER'
        return 'SMALLNUMBER'

    def tokenize_tweet(self, tweet):
        """Return the list of feature tokens for a single tweet.

        Args:
            tweet: raw tweet text.

        Returns:
            A list of strings: lowercased words without punctuation mixed
            with upper-case marker tokens. Tweets matching the '#WEATHER'
            report pattern yield a fixed six-token summary instead.
        """
        # This particular pattern represents 2% of tweet sample
        m = self.re_weather.match(tweet)
        if m:
            return self._weather_tokens(m)

        # Emoticons spelled with a capital letter (':D') can no longer be
        # found once the text is lowercased, so rewrite them to a caseless
        # smiley first. Matching the raw text keeps this case sensitive and
        # avoids false hits such as "weather:dry".
        for emoticon in self.pos_emoticons:
            if emoticon != emoticon.lower():
                tweet = tweet.replace(emoticon, ' :) ')

        tweet = tweet.lower()

        tweet = tweet.replace('!', ' EXL ')
        tweet = tweet.replace('?', ' QST ')

        for emoticon in self.pos_emoticons:
            tweet = tweet.replace(emoticon, ' SMILEY ')
        for emoticon in self.neg_emoticons:
            tweet = tweet.replace(emoticon, ' FROWNY ')
        tweet = tweet.replace(';)', ' WINKY ')

        tweet = tweet.replace(self.meta_mention, ' MENTION ')
        tweet = tweet.replace(self.meta_link, ' LINK ')

        tweet = self.re_time.sub(' TIME ', tweet)
        tweet = self.re_temp.sub(r' TEMP \1 ', tweet)
        tweet = self.re_velo.sub(r' WIND ', tweet)
        tweet = self.re_perc.sub(r' PERC ', tweet)

        return_tokens = []
        for token in tweet.split():
            # Test for a number before stripping punctuation: removing the
            # decimal point first turned "72.5" into 725.
            if not self.re_numb.match(token):
                token = self.re_punc.sub('', token)
            if self.re_numb.match(token):
                token = self._number_bucket(float(token))
            if token:
                return_tokens.append(token)

        return return_tokens


# Test
if __name__ == '__main__':
    import pandas as pd
    tokenizer = TweetTokenizer()
    train_data = pd.read_csv('data/train.csv', quotechar='"')
    tweets = train_data['tweet'].tolist()
    for tweet in tweets[:100]:
        print(tweet)
        print(" ".join(tokenizer.tokenize_tweet(tweet)))
#!/usr/bin/python

__author__ = "Eric Chiang"
__copyright__ = "Copyright 2013, Eric Chiang"
__email__ = "eric.chiang.m@gmail.com"

__license__ = "GPL"
__version__ = "3.0"


from account import username, apikey
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVR
from yhat import Yhat, BaseModel

import nltk
import numpy as np
import pandas as pd
import sys


class TweetClassifier(BaseModel):
    """Yhat classifier to be uploaded to server.

    Instances are created with ``clf`` and ``vectorizer`` keyword arguments
    and read them back as ``self.clf`` and ``self.vectorizer``.
    """

    def transform(self, raw_tweets):
        """Map a list of raw tweet strings to the vectorizer's features."""
        # NOTE(review): presumably imported here so nltk is loaded where the
        # model is executed remotely - confirm against the yhat docs.
        import nltk
        return self.vectorizer.transform(raw_tweets)

    def predict(self, x):
        """Score feature rows and clamp every score into [0.0, 1.0].

        Returns:
            dict with the single key "scores" holding the clamped array.
        """
        pred = np.array(self.clf.predict(x))
        return {"scores": np.clip(pred, 0.0, 1.0)}


train_data = pd.read_csv('data/train.csv', quotechar='"')

raw_tweets = train_data['tweet'].tolist()
# Small sample used to compare local and remote predictions after upload
sanity_raw = raw_tweets[:100]

sentiments = train_data.columns[4:].tolist()
vectorizer = CountVectorizer(tokenizer=nltk.word_tokenize,
                             stop_words='english',
                             max_features=3000,
                             binary=True,
                             ngram_range=(1, 1))

yh = Yhat(username, apikey)

X_train = vectorizer.fit_transform(raw_tweets)

# Largest absolute difference tolerated between local and server scores
sanity_tolerance = 1e-3

for sentiment in sentiments:
    print("Processing '%s'" % sentiment)
    clf = SVR()
    y_train = train_data[sentiment].tolist()

    print("Training classifier")
    clf.fit(X_train, y_train)

    tweet_clf = TweetClassifier(clf=clf, vectorizer=vectorizer)
    model_name = "TweetClassifier_%s" % (sentiment,)

    print("Uploading to yhat")
    upload_status = yh.upload(model_name, tweet_clf)
    model_version = upload_status['version']

    print("'%s':'%s' uploaded to yhat" % (model_name, model_version))

    # Sanity check uploaded classifier by comparing remote against local scores

    print("Preforming sanity check")
    print("Predicting local scores")
    local_sanity = tweet_clf.predict(tweet_clf.transform(sanity_raw))['scores']
    local_sanity = np.array(local_sanity)

    print("Getting scores from server")
    results_from_server = yh.raw_predict(model_name, model_version, sanity_raw)
    try:
        server_sanity = results_from_server['prediction']['scores']
    except (KeyError, TypeError):
        # Unexpected response shape: show it and stop. A bare except would
        # also have swallowed KeyboardInterrupt and SystemExit.
        print(results_from_server)
        sys.exit(3)
    server_sanity = np.array(server_sanity)

    # Because of float point scores compare difference of scores to some level
    # of tolerance rather than checking equality
    score_diff = np.abs(local_sanity - server_sanity)

    sanity_status = np.all(score_diff < sanity_tolerance)

    if not sanity_status:
        sys.stderr.write("Sanity check failed\n")
        sys.stderr.write("Local sanity scores\n%s\n" % (local_sanity,))
        sys.stderr.write("Server sanity scores\n%s\n" % (server_sanity,))
        raise Exception("Sanity check failed")

    print("Sanity check passed")