├── .gitignore
├── README.md
├── cross_validate.py
├── data
├── sampleSubmission.csv
├── test.csv
├── train.csv
└── variableNames.txt
├── doc.md
├── score.py
├── tweet_text.py
└── upload.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.swp
3 | *.sh
4 | submissions/*
5 | account.py
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | cloudy-tweets
2 | =============
3 |
4 | Machine Learning solution for Kaggle.com's
5 | "Partly Sunny with a Chance of Hashtags"
6 | competition.
7 |
8 |
--------------------------------------------------------------------------------
/cross_validate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | __author__ = "Eric Chiang"
4 | __copyright__ = "Copyright 2013, Eric Chiang"
5 | __email__ = "eric.chiang.m@gmail.com"
6 |
7 | __license__ = "GPL"
8 | __version__ = "3.0"
9 |
10 | from Queue import Queue
11 | from sklearn.cross_validation import KFold
12 | from sklearn.feature_extraction.text import CountVectorizer
13 | from sklearn.svm import SVR
14 | from threading import Thread
15 | from time import sleep
16 | from tweet_text import TweetTokenizer
17 | import numpy as np
18 | import pandas as pd
19 | import sys
20 |
21 | """
22 | Cross validation file
23 | """
24 |
25 |
def run_fold(train_indices,test_indices,vectorizer,train_data,pred,queue):
    """Fit and score one cross-validation fold (used as a thread target).

    For every column named in the module-level ``variables`` list, an ``SVR``
    is fit on the training rows and its predictions for the held-out rows are
    written into ``pred``.

    Args:
        train_indices: indices of the rows used for fitting.
        test_indices: indices of the held-out rows to predict.
        vectorizer: already-fitted vectorizer; only ``transform`` is called.
        train_data: frame holding the target columns.
        pred: frame that receives the predictions (modified in place).
        queue: receives the string "done" once the fold has ended.

    Also reads the module-level ``raw_tweets`` array.
    """
    print("Processing Fold")
    try:
        train_raw = raw_tweets[train_indices].tolist()
        test_raw = raw_tweets[test_indices].tolist()
        X_train = vectorizer.transform(train_raw)
        X_test = vectorizer.transform(test_raw)
        for variable in variables:
            y_train = np.array(train_data[variable])[train_indices]
            clf = SVR()
            clf.fit(X_train,y_train)
            # One .loc assignment instead of pred[variable][test_indices]:
            # chained indexing may write to a temporary and leave `pred`
            # untouched. NOTE(review): assumes `pred` keeps the default
            # 0..n-1 index so that fold indices are valid labels - confirm.
            pred.loc[test_indices,variable] = clf.predict(X_test)
    finally:
        # Notify the queue even when the fold fails; otherwise the spawning
        # loop would wait forever for a "done" that never arrives.
        queue.put("done")
41 |
# Load train data from file. The path is handed to pandas directly so that
# pandas opens and closes the file itself (no dangling file handle).
train_data = pd.read_csv('data/train.csv',quotechar='"')

# Numpy array used to allow for test/train splitting
raw_tweets = np.array(train_data['tweet'])
# Every column from the fifth one onwards is treated as a target variable
variables = train_data.columns[4:].tolist()

if len(variables) != 24:
    # write() needs a string; passing the list itself raised a TypeError
    # before the intended exception below could be reached.
    sys.stderr.write("%s\n" % (variables,))
    raise Exception("Bad number of sentiments")

# Allocate data frame for predictions
pred = train_data.copy()

# Build vectorizer
tokenizer = TweetTokenizer()
print("Building vectorizer")
vectorizer = CountVectorizer(tokenizer=tokenizer.tokenize_tweet,
                             max_features=50,
                             binary=True,
                             ngram_range=(1,1))

# Only the vocabulary is fit here; the tweets are transformed after the folds
# are split. It takes more time to slice a sparse matrix than a list of
# strings.
vectorizer.fit(raw_tweets)
67 |
68 |
q = Queue()
num_threads = 4
num_folds = 8
threads = []

# Spawn a thread for each fold, keeping at most `num_threads` folds running
k_fold = KFold(n=len(raw_tweets),n_folds=num_folds,indices=True)
for train_indices,test_indices in k_fold:
    if len(threads) >= num_threads:
        # Wait until one running fold reports "done" before starting another.
        # NOTE(review): polling with sleep() rather than a blocking q.get()
        # is kept on purpose - presumably so the main thread stays
        # interruptible while waiting; confirm.
        while q.empty():
            sleep(2)
        q.get()
    t = Thread(target=run_fold, args=(train_indices,test_indices,
                                      vectorizer,train_data,pred,q))
    # The attribute is spelled `daemon`; the former `t.deamon = True` only
    # created an unused attribute and left the thread non-daemonic.
    t.daemon = True
    t.start()
    threads.append(t)

# Wait for all threads to complete
for t in threads:
    t.join()
90 |
# Report the mean squared error between the predictions and the actual
# training values, one line per variable
print("Mean Squared Errors:")
for name in variables:
    residual = np.array(pred[name]) - np.array(train_data[name])
    print("'%s': %f" % (name,np.average(residual**2)))
99 |
--------------------------------------------------------------------------------
/data/variableNames.txt:
--------------------------------------------------------------------------------
1 | s = sentiment
2 | w = when
3 | k = kind
4 | ============================================================
5 | s1,"I can't tell"
6 | s2,"Negative"
7 | s3,"Neutral / author is just sharing information"
8 | s4,"Positive"
9 | s5,"Tweet not related to weather condition"
10 | w1,"current (same day) weather"
11 | w2,"future (forecast)"
12 | w3,"I can't tell"
13 | w4,"past weather"
14 | k1,"clouds"
15 | k2,"cold"
16 | k3,"dry"
17 | k4,"hot"
18 | k5,"humid"
19 | k6,"hurricane"
20 | k7,"I can't tell"
21 | k8,"ice"
22 | k9,"other"
23 | k10,"rain"
24 | k11,"snow"
25 | k12,"storms"
26 | k13,"sun"
27 | k14,"tornado"
28 | k15,"wind"
29 |
--------------------------------------------------------------------------------
/doc.md:
--------------------------------------------------------------------------------
1 | Documentation for initial attempt at competition (and before I joined yhat)
2 |
3 |
Overview
4 |
5 | Sentiment classification of tweets! Each tweet is linked to a confidence
6 | interval of 15 unique sentiments.
7 |
8 | Sentiments include (see data/variableNames.txt for details):
9 |
10 | - current weather
11 | - future forecast
12 | - hot
13 | - rain
14 | - snow
15 |
16 |
17 | Methodology
18 |
19 | Simple pipeline of feature generation to classification.
20 |
21 | Feature space is composed of word frequencies for the m most common words and
22 | only considers unigram information. Linear ridge regression is used for
23 | classification; this classifier was preferred predominantly because of speed.
24 |
25 | Results
26 |
27 | Cross validation estimate for total RMSE across all 15 features is about 0.169
28 | when m = 270 (aka the feature space incorporates the 270 most common words).
29 | Current leader (2013-11-04) has attained around a 0.146 RMSE. Have only been
30 | able to bump m up to 270 before my computer maxed out its active memory. I need
31 | to upgrade.
32 | Cross validation RMSE: 0.169 (m = 270; alpha = 1e-7; folds = 10)
33 |
34 | UPDATE (2013-11-05):
35 | scipy.sparse matrices implemented. Have had successful runs using 600 features,
36 | though generating test and training folds now takes significantly longer. Took
37 | an hour to split 600 features into 10 folds on my machine with no
38 | parallelization. Each fold is written to file then retrieved individually for cv
39 | to maintain low active memory use; at this point this package definitely does
40 | not optimize for speed. Will try for 1000 features next.
41 | Cross validation RMSE: 0.164 (m = 600; ridge regression; alpha = 1e-7)
42 |
43 | UPDATE (2013-11-06):
44 | Successful run with 1000 features. Memory failure when attempting 2000.
45 | Considering work arounds.
46 | Cross validation RMSE: 0.160 (m = 1000; alpha = 1e-7; folds = 10)
47 |
48 | UPDATE (2013-11-07):
49 | Observations now split into train/test folds before being mapped to feature
50 | space. This means fold splitting is performed on a numpy array rather than a
51 | large sparse matrix which creates a faster and lower memory use pipeline.
52 | Successful runs with 2000 and 3000 features. Very clearly hitting diminishing
53 | returns when considering expansions of the feature space.
54 | Cross validation RMSE: 0.1578 (m = 3000; alpha = 1e-7; folds = 10)
55 |
56 | UPDATE (2013-11-08):
57 | First two submissions to the Kaggle.com leaderboards. First submission used a
58 | feature space of the m most frequent words in the training data, the second
59 | considered the m most frequent words in the combination of the training and
60 | test. The first strategy proved marginally better though both resulted in a
61 | surprisingly high error score by Kaggle as compared to cross validation. Will
62 | discuss those discrepancies in detail under the 'Details/Musings' header since
63 | the differences are far more significant than what might be allowable.
64 | Kaggle RMSE: 0.16350 (m = 3000; alpha = 1e-7)
65 |
66 | UPDATE (2013-11-09):
67 | Significant expansion of rare word mapping. Numeric and alphanumeric data
68 | mapped to a larger set of possible values. Emoticon information now preserved
69 | as inspired by Go, Bhayani, and Huang's demonstration of its usefulness in
70 | Twitter sentiment classification [2]. The presence of exclamation and question
71 | marks is also collected; both were noted by Pang, Lee and Vaithyanathan to be an
72 | unobvious source of information to humans (grad students) picking indicator
73 | words [1]. Details under the 'Details/Musings' header.
74 | Cross validation RMSE: 0.1578 (m = 3000; alpha = 1e-7; folds = 3)
75 | Cross validation RMSE: 0.1576 (m = 4000; alpha = 1e-7; folds = 3)
76 | Cross validation RMSE: 0.1586 (m = 5000; alpha = 1e-7; folds = 3)
77 | Kaggle RMSE: 0.16298 (m = 3000; alpha = 1e-7)
78 | Kaggle RMSE: 0.16314 (m = 4000; alpha = 1e-7)
79 | Kaggle RMSE: 0.16370 (m = 5000; alpha = 1e-7)
80 |
81 | UPDATE (2013-11-10):
82 | Considering four new modifications after yesterday's results:
83 | Increasing the alpha value to reduce variance at the cost of some bias.
84 | Considering word presence rather than word count for feature generation [1].
85 | Reworking alteration of scores predicted out of range (above 1.0 or below 0.0).
86 | Trying different classifiers.
87 |
88 | UPDATE (2013-11-11):
89 | Elastic net tested and immediately showed improvements over ridge. Takes
90 | significantly longer to run than ridge but still completes in a reasonable
91 | amount of time. Attempted to test support vector machine regression since it
92 | has been shown to be successful as a sentiment classifier. Unfortunately, time
93 | complexity is an issue since it took roughly twice the time of elastic net in
94 | initial testing. Am now using a free EC2 micro instance on AWS to run
95 | regressions since I need to use my laptop for other work. If anyone wants to
96 | donate cycles message me! Currently placed 26th with elastic net results.
97 | Kaggle RMSE: 0.16014 (m = 3000; elastic net; alpha = 1e-5)
98 | Kaggle RMSE: 0.15970 (m = 4000; elastic net; alpha = 1e-5)
99 |
100 |
101 | Details/Musings
102 |
103 | 2013-11-07:
104 | To be honest I've been surprised by how successful my approach has been so far.
105 | There has been very little thought put into parsing tweets or improving
106 | information retrieval pre-regression. The only efforts so far have been mapping
107 | rare words; grouping similar low frequency words which would intuitively hold
108 | similar meanings. The binning efforts which have been implemented so far pertain
109 | to numeric and alphanumeric words.
110 |
Current bins:
111 |
112 | - Alphanumeric
113 | - Numeric
114 |
115 | - Over 90
116 | - Between 90 and 50
117 | - Between 50 and 10
118 | - Less than 10
119 |
120 |
121 |
122 | Initial efforts to further partition bins failed to improve the accuracy of the
123 | hypothesis. Though admittedly, those efforts took place when the feature
124 | space was an order of magnitude smaller.
125 |
126 | 2013-11-08:
127 | My initial hypothesis to explain why the cross validation errors were so
128 | different from the Kaggle scores is that the high number of folds led to a
129 | high train/test ratio, allowing the training data to better encompass the test.
130 | Will run 3 fold CV (rather than 10 fold) against future submissions to test this
131 | theory. Very adamant in bringing CV in line with Kaggle since CV results will
132 | be critical in determining if different variables require different classifiers.
133 |
134 | Also, for documentation, all reported scores will now specify if they were
135 | produced by cross validation or by blind test (what Kaggle does). Number of
136 | folds will also be noted. This should have been the policy from the beginning,
137 | but it's hard to voluntarily admit that your model might not be as good as it
138 | appears. Luckily, Kaggle provides the benchmark and self reporting of scores
139 | remains a rare phenomenon in other fields.
140 |
141 | 2013-11-09
142 | Current rare word mapping strategies.
143 |
144 | Before stripping tweets of punctuation and splitting into words, the string is
145 | checked for exclamation marks, question marks and emoticons. Positive emoticons
146 | are mapped to ':)', negative emoticons are mapped to ':(', neutral emoticons
147 | are mapped to ':/', and winky faces ';)' are maintained. Links, mentions, and RT's
148 | are now also preserved (they were previously not). Exclamation and question
149 | mark methodology taken from [1]. Emoticon methodology taken from [2].
150 |
151 | After tweets are cleared of punctuation and split, each individual word is
152 | determined to either be numeric, alphanumeric, or alphabetical. If the word is
153 | numeric, it is rounded down to the nearest ten, grouping numbers of similar
154 | size together. If the word is alphanumeric, the numbers are removed and a
155 | special '#num' tag is appended to the word. This has had the very pleasing
156 | result of producing common strings such as 'mph #num', 'pm #num', and 'f #num'
157 | which arise organically from the program rather than having to be selected for.
158 |
159 |
160 | References
161 | [1] B. Pang, L. Lee, and S. Vaithyanathan. Thumbs up? Sentiment classification
162 | using machine learning techniques. 2002
163 | [2] A. Go, R. Bhayani, and L. Huang. Twitter Sentiment Classification using
164 | Distant Supervision. 2009
165 |
--------------------------------------------------------------------------------
/score.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | __author__ = "Eric Chiang"
4 | __copyright__ = "Copyright 2013, Eric Chiang"
5 | __email__ = "eric.chiang.m@gmail.com"
6 |
7 | __license__ = "GPL"
8 | __version__ = "3.0"
9 |
10 | from account_info import username,apikey
11 | from yhat import Yhat
12 | import pandas as pd
13 | import numpy as np
14 | import sys
15 |
16 | """
17 | Score train data using yhat and create a submission
18 | """
19 |
# Exactly one argument is expected: the path of the submission file to write
if len(sys.argv) != 2:
    sys.stderr.write("Please specify a file to write predictions out to!\n")
    sys.exit(2)

sub_file = sys.argv[1]

# Record the best model: variable name -> model version requested from yhat
best_model = {
    's1':1, 's2':1, 's3':1, 's4':1, 's5':1,
    'w1':1, 'w2':1, 'w3':1, 'w4':1,
    'k1':1, 'k2':1, 'k3':1, 'k4':1, 'k5':1,
    'k6':1, 'k7':1, 'k8':1, 'k9':1, 'k10':1,
    'k11':1, 'k12':1, 'k13':1, 'k14':1, 'k15':1,
}

# Paths are handed to pandas directly so no file handle is left open
test_data = pd.read_csv('data/test.csv',quotechar='"')
sub_data = pd.read_csv('data/sampleSubmission.csv',quotechar='"')

# The sample submission is used as the output frame, so its rows have to line
# up with the test tweets
if not np.all(test_data['id'] == sub_data['id']):
    raise Exception("IDs do not match")

# NOTE(review): the credentials are imported from `account_info` in this
# file, while upload.py imports them from `account` (the module listed in
# .gitignore) - confirm which module name is intended.
yh = Yhat(username, apikey)

# Every column after the first is filled with predictions. This name was
# previously misspelled `variabless`, so the loop below raised a NameError.
variables = sub_data.columns[1:]
raw_tweets = test_data['tweet'].tolist()

for variable in variables:
    model_version = best_model[variable]
    model_name = "TweetClassifier_%s" % (variable,)
    results_from_server = yh.raw_predict(model_name,
                                         model_version,
                                         raw_tweets)
    pred = results_from_server['prediction']['scores']
    sub_data[variable] = pred

try:
    with open(sub_file,'w') as out:
        sub_data.to_csv(out,index=False)
except IOError:
    # Report the failure through the exit status too, instead of finishing
    # "successfully" without a submission file.
    sys.stderr.write("IO error: could not write data to file\n")
    sys.exit(1)
61 |
--------------------------------------------------------------------------------
/tweet_text.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | __author__ = "Eric Chiang"
4 | __copyright__ = "Copyright 2013, Eric Chiang"
5 | __email__ = "eric.chiang.m@gmail.com"
6 |
7 | __license__ = "GPL"
8 | __version__ = "3.0"
9 |
10 | import re
11 | import string
12 |
13 | """
14 | Tokenizer to capture information from tweets
15 |
16 | Includes rare word binning
17 | """
18 |
class TweetTokenizer(object):
    """Convert a raw tweet into a list of feature tokens.

    Besides lower-cased words, the output contains upper-case marker tokens
    (``EXL``, ``QST``, ``SMILEY``, ``FROWNY``, ``WINKY``, ``MENTION``,
    ``LINK``, ``TIME``, ``TEMP``, ``WIND``, ``PERC``) and bucket names that
    replace stand-alone numbers (``SMALLNUMBER``, ``COLDNUMBER``,
    ``NICENUMBER``, ``HOTNUMBER``, ``LARGENUMBER``).
    """

    # Punctuation that may be peeled off the front of a token before the
    # numeric test; '-' is excluded so that a leading minus sign survives.
    _LEADING_PUNCT = string.punctuation.replace('-','')

    def __init__(self):
        # re_number and re_alphanum are not used by tokenize_tweet; they are
        # kept because they are public attributes.
        self.re_number   = re.compile(r'^\d*\.?\d+$')
        self.re_alphanum = re.compile(r'\d')
        self.re_weather  = re.compile(r"#WEATHER.* (\d+\.\d+)F.* (\d+\.\d+)% Humidity. (\d+\.\d+)MPH")
        self.re_time     = re.compile(r'\d{1,2}:\d\d[ ]?(am|pm)?')   # Time
        # 'celsius' added next to the misspelled 'celcius', which is kept
        # because tweets may contain the misspelling as well.
        self.re_temp     = re.compile(r'(\d+\.?\d*) ?(fahrenheit|celsius|celcius|f|c|degrees|degree)(\W|$)')
        self.re_velo     = re.compile(r'\d+\.?\d* ?mph')             # Velocity
        self.re_perc     = re.compile(r'\d+\.?\d* ?(%|percent)')
        self.re_punc     = re.compile(r'[%s]' % re.escape(string.punctuation))
        self.re_numb     = re.compile(r'^-?\d+\.?\d*$')

        self.pos_emoticons = [':)',':-)',' : )',':D','=)',' : D ','(:','(=']
        self.neg_emoticons = [':(',':-(',' : (','=(','):',')=']
        # NOTE(review): the neutral emoticons are defined but never mapped to
        # a token by tokenize_tweet - confirm whether that is intended.
        self.ntr_emoticons = [':/','=/',':\\','=\\',':S','=S',':|','=|']

        self.meta_mention = '@mention'
        self.meta_link    = '{link}'

    @staticmethod
    def _decade(value):
        """Return *value* truncated to a multiple of ten, as a string."""
        return str(int(value / 10.0) * 10)

    @staticmethod
    def _bucket(value):
        """Return the bucket token that replaces a stand-alone number."""
        if value > 120:
            return 'LARGENUMBER'
        if value > 85:
            return 'HOTNUMBER'
        if value > 45:
            return 'NICENUMBER'
        if value > 10:
            return 'COLDNUMBER'
        return 'SMALLNUMBER'

    def _as_number(self,token):
        """Return *token* as a float if it is a number wrapped in punctuation.

        Only surrounding punctuation is removed, so the decimal point and a
        leading minus sign stay intact. Returns None for anything else.
        """
        candidate = token.rstrip(string.punctuation).lstrip(self._LEADING_PUNCT)
        if self.re_numb.match(candidate):
            return float(candidate)
        return None

    def tokenize_tweet(self,tweet):
        """Tokenize one tweet.

        Args:
            tweet: the raw tweet text.

        Returns:
            A list of token strings (empty for an empty tweet).
        """
        # This particular pattern represents 2% of tweet sample
        m = self.re_weather.match(tweet)
        if m:
            temp = float(m.group(1))
            humd = float(m.group(2))
            mph  = float(m.group(3))
            if temp > 85:
                sent = 'HOTNUMBER'
            elif temp > 45:
                sent = 'NICENUMBER'
            else:
                sent = 'COLDNUMBER'
            return ['WEATHER','MPH'+self._decade(mph),'TEMP'+self._decade(temp),
                    'HUMD'+self._decade(humd),sent,'TEMP']

        # Emoticons written with capital letters (e.g. ':D') cannot be found
        # once the tweet is lower-cased, so they are first rewritten to the
        # first emoticon of their list (':)' / ':('), which has no letters
        # and is picked up by the regular replacement below.
        for emoticons in (self.pos_emoticons,self.neg_emoticons):
            for emoticon in emoticons:
                if emoticon != emoticon.lower():
                    tweet = tweet.replace(emoticon,emoticons[0])

        tweet = tweet.lower()

        tweet = tweet.replace('!',' EXL ')
        tweet = tweet.replace('?',' QST ')

        for emoticon in self.pos_emoticons:
            tweet = tweet.replace(emoticon,' SMILEY ')
        for emoticon in self.neg_emoticons:
            tweet = tweet.replace(emoticon,' FROWNY ')
        tweet = tweet.replace(';)',' WINKY ')

        tweet = tweet.replace(self.meta_mention,' MENTION ')
        tweet = tweet.replace(self.meta_link,' LINK ')

        tweet = self.re_time.sub(' TIME ',tweet)
        # The temperature value itself is kept and bucketed below
        tweet = self.re_temp.sub(r' TEMP \1 ',tweet)
        tweet = self.re_velo.sub(r' WIND ',tweet)
        tweet = self.re_perc.sub(r' PERC ',tweet)

        return_tokens = []

        for token in tweet.split():
            # Test for a number before stripping punctuation: stripping
            # first turned '72.5' into '725' and dropped minus signs, which
            # put decimal values into the wrong bucket.
            value = self._as_number(token)
            if value is None:
                token = self.re_punc.sub('',token)
                if self.re_numb.match(token):
                    value = float(token)
            if value is not None:
                token = self._bucket(value)
            # Tokens that consisted only of punctuation are dropped
            if token:
                return_tokens.append(token)

        return return_tokens
108 |
# Test
if __name__ == '__main__':
    import pandas as pd
    # Print the first 100 training tweets, each followed by its tokens
    frame = pd.read_csv(open('data/train.csv','r'),quotechar='"')
    demo = TweetTokenizer()
    for text in frame['tweet'].tolist()[:100]:
        print(text)
        print(" ".join(demo.tokenize_tweet(text)))
118 |
--------------------------------------------------------------------------------
/upload.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | __author__ = "Eric Chiang"
4 | __copyright__ = "Copyright 2013, Eric Chiang"
5 | __email__ = "eric.chiang.m@gmail.com"
6 |
7 | __license__ = "GPL"
8 | __version__ = "3.0"
9 |
10 |
11 | from account import username,apikey
12 | from sklearn.feature_extraction.text import CountVectorizer
13 | from sklearn.svm import SVR
14 | from yhat import Yhat,BaseModel
15 |
16 | import nltk
17 | import numpy as np
18 | import pandas as pd
19 | import sys
20 |
class TweetClassifier(BaseModel):
    """Model wrapper that is uploaded to the yhat server.

    Uses the ``vectorizer`` and ``clf`` attributes of the instance.
    """
    def transform(self,raw_tweets):
        """Vectorize a list of raw tweets with the stored vectorizer."""
        # NOTE(review): nltk is imported inside the method, presumably so the
        # import is also executed where the model is deployed - confirm.
        import nltk
        return self.vectorizer.transform(raw_tweets)

    def predict(self,x):
        """Predict scores for ``x`` and limit them to the range [0.0, 1.0]."""
        scores = np.array(self.clf.predict(x))
        scores[scores > 1.0] = 1.0
        scores[scores < 0.0] = 0.0
        return {"scores" : scores}
33 |
# The path is handed to pandas directly so no file handle is left open
train_data = pd.read_csv('data/train.csv',quotechar='"')

raw_tweets = train_data['tweet'].tolist()
# Small sample used to compare local predictions against the server's
sanity_raw = raw_tweets[:100]

# Every column from the fifth one onwards gets its own model
sentiments = train_data.columns[4:].tolist()
vectorizer = CountVectorizer(tokenizer=nltk.word_tokenize,
                             stop_words='english',
                             max_features=3000,
                             binary=True,
                             ngram_range=(1,1))

yh = Yhat(username,apikey)

# The feature matrix is built once and shared by all models
X_train = vectorizer.fit_transform(raw_tweets)
49 |
# Train, upload and sanity-check one model per target column
for sentiment in sentiments:
    print("Processing '%s'" % sentiment)
    clf = SVR()
    y_train = train_data[sentiment].tolist()

    print("Training classifier")
    clf.fit(X_train,y_train)

    tweet_clf = TweetClassifier(clf=clf,vectorizer=vectorizer)
    model_name = "TweetClassifier_%s" % (sentiment,)

    print("Uploading to yhat")
    upload_status = yh.upload(model_name,tweet_clf)
    model_version = upload_status['version']

    print("'%s':'%s' uploaded to yhat" % (model_name,model_version))

    # Sanity check uploaded classifier by comparing remote against local scores

    print("Preforming sanity check")
    print("Predicting local scores")
    local_sanity = tweet_clf.predict(tweet_clf.transform(sanity_raw))['scores']
    local_sanity = np.array(local_sanity)

    print("Getting scores from server")
    results_from_server = yh.raw_predict(model_name,model_version,sanity_raw)
    try:
        server_sanity = results_from_server['prediction']['scores']
    except (KeyError,TypeError):
        # The response lacks the expected structure: show it and stop. Only
        # lookup failures are caught; the former bare `except:` also caught
        # KeyboardInterrupt and SystemExit.
        print(results_from_server)
        sys.exit(3)
    server_sanity = np.array(server_sanity)

    # Because of float point scores compare difference of scores to some level
    # of tolerance rather than checking equality
    score_diff = np.abs(local_sanity - server_sanity)

    sanity_tolerance = 1e-3
    sanity_status = np.all(score_diff < sanity_tolerance)

    if not sanity_status:
        sys.stderr.write("Sanity check failed\n")
        sys.stderr.write("Local sanity scores\n%s\n" % (local_sanity,))
        sys.stderr.write("Server sanity scores\n%s\n" % (server_sanity,))
        raise Exception("Sanity check failed")

    print("Sanity check passed")
97 |
--------------------------------------------------------------------------------