├── .gitignore ├── .ipynb_checkpoints └── twitter_sentiment_analysis-checkpoint.ipynb ├── README.md ├── cleanup.py ├── data ├── emoticons.txt ├── test.csv └── train.csv ├── emoticons.py ├── main.py ├── preprocessing.py ├── twitter_sentiment_analysis.ipynb └── word2vec.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 2 | 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 4 | 5 | data/processed.csv 6 | 7 | 8 | # User-specific stuff: 9 | 10 | .idea/workspace.xml 11 | 12 | .idea/tasks.xml 13 | 14 | __pycache__ 15 | data/*.csv 16 | 17 | 18 | 19 | 20 | # Sensitive or high-churn files: 21 | 22 | .idea/dataSources/ 23 | 24 | .idea/dataSources.ids 25 | 26 | .idea/dataSources.xml 27 | 28 | .idea/dataSources.local.xml 29 | 30 | .idea/sqlDataSources.xml 31 | 32 | .idea/dynamic.xml 33 | 34 | .idea/uiDesigner.xml 35 | 36 | 37 | 38 | # Gradle: 39 | 40 | .idea/gradle.xml 41 | 42 | .idea/libraries 43 | 44 | 45 | 46 | # Mongo Explorer plugin: 47 | 48 | .idea/mongoSettings.xml 49 | 50 | 51 | 52 | ## File-based project format: 53 | 54 | *.iws 55 | 56 | 57 | 58 | ## Plugin-specific files: 59 | 60 | 61 | 62 | # IntelliJ 63 | 64 | /out/ 65 | 66 | 67 | 68 | # mpeltonen/sbt-idea plugin 69 | 70 | .idea_modules/ 71 | 72 | 73 | 74 | # JIRA plugin 75 | 76 | atlassian-ide-plugin.xml 77 | 78 | 79 | 80 | # Crashlytics plugin (for Android Studio and IntelliJ) 81 | 82 | com_crashlytics_export_strings.xml 83 | 84 | crashlytics.properties 85 | 86 | crashlytics-build.properties 87 | 88 | .idea 89 | 90 | 91 | fabric.properties 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook + Python code of twitter sentiment analysis 2 | Details and full description: 3 | http://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost 4 | -------------------------------------------------------------------------------- /cleanup.py: -------------------------------------------------------------------------------- 1 | import re as regex 2 | 3 | 4 | class TwitterCleanuper: 5 | def iterate(self): 6 | for cleanup_method in [self.remove_urls, 7 | self.remove_usernames, 8 | self.remove_na, 9 | self.remove_special_chars, 10 | self.remove_numbers]: 11 | yield cleanup_method 12 | 13 | @staticmethod 14 | def remove_by_regex(tweets, regexp): 15 | tweets.loc[:, "text"].replace(regexp, "", inplace=True) 16 | return tweets 17 | 18 | def remove_urls(self, tweets): 19 | return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"http.?://[^\s]+[\s]?")) 20 | 21 | def remove_na(self, tweets): 22 | return tweets[tweets["text"] != "Not Available"] 23 | 24 | def remove_special_chars(self, tweets): # it unrolls the hashtags to normal words 25 | for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$", 26 | "@", "%", "^", "*", "(", ")", "{", "}", 27 | "[", "]", "|", "/", "\\", ">", "<", "-", 28 | "!", "?", ".", "'", 29 | "--", "---", "#"]): 30 | tweets.loc[:, "text"].replace(remove, "", inplace=True) 31 | return tweets 32 | 33 | def remove_usernames(self, tweets): 34 | return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?")) 35 | 36 | def remove_numbers(self, tweets): 37 | return TwitterCleanuper.remove_by_regex(tweets, 
regex.compile(r"\s?[0-9]+\.?[0-9]*")) 38 | -------------------------------------------------------------------------------- /data/emoticons.txt: -------------------------------------------------------------------------------- 1 | positive: 2 | :-) 3 | :) 4 | :-] 5 | :] 6 | :-3 7 | :3 8 | :-> 9 | :> 10 | 8-) 11 | 8) 12 | :-} 13 | :} 14 | :o) 15 | :c) 16 | :^) 17 | =] 18 | =) 19 | :-D 20 | :D 21 | 8-D 22 | 8D 23 | x-D 24 | xD 25 | X-D 26 | XD 27 | =D 28 | =3 29 | B^D 30 | :-)) 31 | ;-) 32 | ;) 33 | *-) 34 | *) 35 | ;-] 36 | ;] 37 | ;^) 38 | ;D 39 | :-P 40 | :P 41 | X-P 42 | x-p 43 | :-p 44 | :p 45 | :-? 46 | :? 47 | :-? 48 | :? 49 | :-b 50 | :b 51 | =p 52 | >:P 53 | :* 54 | :-* 55 | ^.^ 56 | ^_^ 57 | ^-^ 58 | xd 59 | negative: 60 | :-( 61 | :( 62 | :-c 63 | :c 64 | :-< 65 | :< 66 | :-[ 67 | :[ 68 | :-|| 69 | >:[ 70 | :{ 71 | :@ 72 | >:( 73 | :-/ 74 | :/ 75 | >:\ 76 | >:/ 77 | :\ 78 | =/ 79 | =\ 80 | :L 81 | =L 82 | :S 83 | :-| 84 | :| 85 | :-X 86 | :X 87 | -.- 88 | -,- -------------------------------------------------------------------------------- /emoticons.py: -------------------------------------------------------------------------------- 1 | class EmoticonDetector: 2 | emoticons = {} 3 | 4 | def __init__(self, emoticon_file="data\\emoticons.txt"): 5 | from pathlib import Path 6 | content = Path(emoticon_file).read_text() 7 | positive = True 8 | for line in content.split("\n"): 9 | if "positive" in line.lower(): 10 | positive = True 11 | continue 12 | elif "negative" in line.lower(): 13 | positive = False 14 | continue 15 | 16 | self.emoticons[line] = positive 17 | 18 | def is_positive(self, emoticon): 19 | if emoticon in self.emoticons: 20 | return self.emoticons[emoticon] 21 | return False 22 | 23 | def is_emoticon(self, to_check): 24 | return to_check in self.emoticons 25 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from multiprocessing import Process 3 | from time import time 4 | 5 | import pandas as pd 6 | from sklearn.ensemble import RandomForestClassifier, VotingClassifier 7 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score 8 | from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV 9 | from sklearn.naive_bayes import BernoulliNB 10 | from xgboost import XGBClassifier as XGBoostClassifier 11 | from sklearn.svm import LinearSVC 12 | 13 | from cleanup import TwitterCleanuper 14 | from preprocessing import TwitterData 15 | from word2vec import Word2VecProvider 16 | 17 | 18 | def preprocess(results, data_path, is_testing, data_name, min_occurrences=5, cache_output=None): 19 | twitter_data = TwitterData() 20 | twitter_data.initialize(data_path, is_testing) 21 | twitter_data.build_features() 22 | twitter_data.cleanup(TwitterCleanuper()) 23 | twitter_data.tokenize() 24 | twitter_data.stem() 25 | twitter_data.build_wordlist(min_occurrences=min_occurrences) 26 | #twitter_data.build_data_model() 27 | # twitter_data.build_ngrams() 28 | # twitter_data.build_ngram_model() 29 | # twitter_data.build_data_model(with_ngram=2) 30 | # word2vec = Word2VecProvider() 31 | # word2vec.load("H:\\Programowanie\\glove.twitter.27B.200d.txt") 32 | # twitter_data.build_word2vec_model(word2vec) 33 | if cache_output is not None: 34 | twitter_data.data_model.to_csv(cache_output, index_label="idx", float_format="%.6f") 35 | results[data_name] = twitter_data.data_model 36 | 
37 | 38 | def preprare_data(min_occurrences): 39 | import os 40 | training_data = None 41 | testing_data = None 42 | print("Loading data...") 43 | test_data_file_name = "data\\processed_test_word2vec_bow_" + str(min_occurrences) + ".csv" 44 | train_data_file_name = "data\\processed_train_word2vec_bow_" + str(min_occurrences) + ".csv" 45 | use_cache = os.path.isfile(train_data_file_name) and os.path.isfile( 46 | test_data_file_name) 47 | if use_cache: 48 | training_data = TwitterData() 49 | training_data.initialize(None, from_cached=train_data_file_name) 50 | training_data = training_data.data_model 51 | 52 | testing_data = TwitterData() 53 | testing_data.initialize(None, from_cached=test_data_file_name) 54 | testing_data = testing_data.data_model 55 | print("Loaded from cached files...") 56 | else: 57 | print("Preprocessing data...") 58 | with multiprocessing.Manager() as manager: 59 | 60 | results = manager.dict() 61 | 62 | preprocess_training = Process(target=preprocess, args=( 63 | results, "data\\train.csv", False, "train", min_occurrences, train_data_file_name,)) 64 | 65 | preprocess_testing = Process(target=preprocess, args=( 66 | results, "data\\test.csv", True, "test", min_occurrences, test_data_file_name,)) 67 | 68 | preprocess_training.start() 69 | preprocess_testing.start() 70 | print("Multiple processes started...") 71 | 72 | preprocess_testing.join() 73 | print("Preprocessed testing data...") 74 | 75 | preprocess_training.join() 76 | print("Preprocessed training data...") 77 | 78 | training_data = results["train"] 79 | testing_data = results["test"] 80 | 81 | print("Data preprocessed & cached...") 82 | 83 | return training_data, testing_data 84 | 85 | 86 | def log(text): 87 | print(text) 88 | with open("log.txt", "a") as log_file: 89 | log_file.write(str(text) + "\n") 90 | 91 | 92 | def test_classifier(X_train, y_train, X_test, y_test, classifier): 93 | log("") 94 | log("===============================================") 95 | classifier_name = str(type(classifier).__name__) 96 | log("Testing " + classifier_name) 97 | now = time() 98 | list_of_labels = sorted(list(set(y_train))) 99 | model = classifier.fit(X_train, y_train) 100 | log("Learing time {0}s".format(time() - now)) 101 | now = time() 102 | predictions = model.predict(X_test) 103 | log("Predicting time {0}s".format(time() - now)) 104 | 105 | from sklearn.metrics import classification_report 106 | precision = precision_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels) 107 | recall = recall_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels) 108 | accuracy = accuracy_score(y_test, predictions) 109 | log("=================== Results ===================") 110 | log(classification_report(y_test, predictions, labels=list_of_labels)) 111 | log(" Negative Neutral Positive") 112 | log("F1 " + str(f1_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels))) 113 | log("Precision" + str(precision)) 114 | log("Recall " + str(recall)) 115 | log("Accuracy " + str(accuracy)) 116 | log("===============================================") 117 | 118 | return precision, recall, accuracy 119 | 120 | 121 | def cv(classifier, X_train, y_train): 122 | log("") 123 | log("===============================================") 124 | classifier_name = str(type(classifier).__name__) 125 | log("Testing " + classifier_name) 126 | now = time() 127 | 128 | log("Crossvalidating...") 129 | # recall = [cross_val_score(classifier, X_train, y_train, scoring="recall_micro", cv=10, 
n_jobs=-1)] 130 | accuracy = [cross_val_score(classifier, X_train, y_train, cv=8, n_jobs=-1)] 131 | # precision = [cross_val_score(classifier, X_train, y_train, scoring="precision_micro", cv=10, n_jobs=-1)] 132 | recall = -1 133 | precision = -1 134 | log("Crosvalidation completed in {0}s".format(time() - now)) 135 | log("=================== Results ===================") 136 | log("Accuracy: " + str(accuracy)) 137 | log("Precision: " + str(precision)) 138 | log("Recall: " + str(recall)) 139 | log("===============================================") 140 | log("CV time: {0}".format(time() - now)) 141 | return accuracy, precision, recall 142 | 143 | 144 | import numpy as np 145 | 146 | 147 | # Utility function to report best scores 148 | def report(results, n_top=3): 149 | for i in range(1, n_top + 1): 150 | candidates = np.flatnonzero(results['rank_test_score'] == i) 151 | for candidate in candidates: 152 | log("Model with rank: {0}".format(i)) 153 | log("Mean validation score: {0:.3f} (std: {1:.3f})".format( 154 | results['mean_test_score'][candidate], 155 | results['std_test_score'][candidate])) 156 | log("Parameters: {0}".format(results['params'][candidate])) 157 | log("") 158 | 159 | 160 | def best_fit(X_train, y_train): 161 | log("") 162 | 163 | seed = 666 164 | import time as ttt 165 | attributes = len(X_train.columns) 166 | examples = len(X_train) 167 | now = time() 168 | log(ttt.ctime()) 169 | # Parameters for SVM 170 | # parameters = { 171 | # "dual": [True, False], 172 | # "tol": [1e-3, 1e-4, 1e-5], 173 | # "C": [1.0, 1.5, 2.0, 5.0, 10, 100, 1000] 174 | # } 175 | # rand_search = RandomizedSearchCV(LinearSVC(max_iter=5000), param_distributions=parameters, cv=8,n_jobs=-1,n_iter=20) 176 | # 177 | # 178 | # rand_search.fit(X_train,y_train) 179 | # report(rand_search.cv_results_, 10) 180 | # log(ttt.ctime()) 181 | # log(time() - now) 182 | # return 183 | 184 | # Parameters for Bagging 185 | # parameters = { 186 | # "n_estimators": [2, 3, 5, 13, 51, 201, 303, 403, 505], 187 | # "max_features": list(map(lambda x: int(x), 188 | # [sqrt(attributes), 2 * sqrt(attributes), 3 * sqrt(attributes), attributes / 2, 189 | # attributes / 3, attributes / 4])) 190 | # } 191 | # 192 | # rand_search = RandomizedSearchCV(BaggingClassifier( 193 | # base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, C=1.0, tol=0.0001, dual=True), 194 | # random_state=seed, n_jobs=1), param_distributions=parameters, n_jobs=-1, n_iter=3, cv=8, 195 | # scoring=make_scorer(f1_score, average="micro", labels=["positive", "negative", "neutral"])) 196 | # 197 | # now = time() 198 | # log(ttt.ctime()) 199 | # rand_search.fit(X_train, y_train) 200 | # 201 | # report(rand_search.cv_results_, 10) 202 | log(ttt.ctime()) 203 | log(time() - now) 204 | 205 | # Parameters for RF 206 | # log("RF:") 207 | # parameters = { 208 | # "n_estimators":[103, 201, 305, 403, 666, 1001, 5007, 10001], 209 | # "max_depth":[None, 5, 20, 40, 73, 100, 1000, 2000], 210 | # "criterion":["gini", "entropy"] 211 | # } 212 | # 213 | # rand_search = RandomizedSearchCV(RandomForestClassifier(random_state=seed,n_jobs=-1),param_distributions=parameters, 214 | # n_iter=15,scoring="accuracy", 215 | # n_jobs=1,cv=10) 216 | # now = time() 217 | # log(ttt.ctime()) 218 | # rand_search.fit(X_train, y_train) 219 | # 220 | # report(rand_search.cv_results_, 10) 221 | # log(ttt.ctime()) 222 | # log(time() - now) 223 | 224 | # Parameters for XGBoost 225 | log("XGB:") 226 | parameters = { 227 | "n_estimators":[103,201, 403], 228 | 
"max_depth":[3,10,15], 229 | "objective":["multi:softmax","binary:logistic"], 230 | "learning_rate":[0.05, 0.1, 0.15, 0.3] 231 | } 232 | 233 | rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed),param_distributions=parameters, 234 | n_iter=5,scoring="accuracy", 235 | n_jobs=-1,cv=8) 236 | 237 | 238 | now = time() 239 | log(ttt.ctime()) 240 | rand_search.fit(X_train, y_train) 241 | 242 | report(rand_search.cv_results_, 10) 243 | log(ttt.ctime()) 244 | log(time() - now) 245 | 246 | parameters = { 247 | "n_estimators": [403, 666, 1000], 248 | "max_depth": [40,50,90,100,200], 249 | "subsample":[1.0, 0.6, 0.9], 250 | "objective": ["multi:softmax", "binary:logistic"], 251 | "learning_rate": [0.1, 0.15, 0.5] 252 | } 253 | 254 | rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed,), param_distributions=parameters, 255 | n_iter=5, scoring="accuracy", 256 | n_jobs=-1, cv=8) 257 | 258 | now = time() 259 | log(ttt.ctime()) 260 | rand_search.fit(X_train, y_train) 261 | 262 | report(rand_search.cv_results_, 10) 263 | log(ttt.ctime()) 264 | log(time() - now) 265 | 266 | return 267 | 268 | 269 | # Parameters for VotingClassifier 270 | # parameters = { 271 | # "weights": [ 272 | # [1, 1, 1], 273 | # [2, 1, 1], 274 | # [2, 2, 1], 275 | # [4, 1, 5], 276 | # [1, 1, 2], 277 | # [5, 1, 2], 278 | # [5, 2, 1], 279 | # [5, 3, 2], 280 | # [6, 2, 1], 281 | # [6, 1, 5], 282 | # [6, 1, 2], 283 | # [7, 1, 6], 284 | # [7, 2, 3], 285 | # ] 286 | # } 287 | log("Voting RF XGB NB:") 288 | parameters = { 289 | "weights": [ 290 | [1, 1, 1], 291 | [2, 1, 1], 292 | [1, 1, 2], 293 | [4, 1, 5], 294 | [3, 1, 3], 295 | [3, 1, 4] 296 | ] 297 | } 298 | 299 | rand_search = GridSearchCV(VotingClassifier([ 300 | ("randomforest", RandomForestClassifier(n_estimators=403, random_state=seed, max_depth=73, n_jobs=-1)), 301 | ("naivebayes", BernoulliNB()), 302 | ("xgboost", XGBoostClassifier(n_estimators=103, seed=seed, max_depth=3, objective="multi:softmax")) 303 | ], voting="soft", n_jobs=1), scoring="accuracy", n_jobs=-1, cv=8, param_grid=parameters) 304 | rand_search.fit(X_train, y_train) 305 | # 306 | report(rand_search.cv_results_, 10) 307 | log(ttt.ctime()) 308 | log(time() - now) 309 | 310 | 311 | def numbers_to_boolean(df): 312 | for column in filter(lambda col: col.startswith("number_of_"), df.columns): 313 | df[column] = (df[column] >= 1).astype(int) 314 | 315 | 316 | if __name__ == "__main__": 317 | 318 | def main(): 319 | result_col_names = ["min_occ", "precision", "recall", "accuracy"] 320 | result_col_names = ["min_occ", "precision_negative", "precision_neutral", "precision_positive", 321 | "recall_negative", 322 | "recall_neutral", "recall_positive", "accuracy"] 323 | 324 | results_df = pd.DataFrame(columns=result_col_names) 325 | for m in range(3, 4): 326 | print("Preparing data with min_occurrences=" + str(m)) 327 | training_data, testing_data = preprare_data(m) 328 | log("********************************************************") 329 | log("Validating for {0} min_occurrences:".format(m)) 330 | # drop idx & id columns 331 | if training_data.columns[0] == "idx": 332 | training_data = training_data.iloc[:, 1:] 333 | 334 | if testing_data.columns[0] == "idx": 335 | testing_data = testing_data.iloc[:, 1:] 336 | 337 | if "original_id" in training_data.columns: 338 | training_data.drop("original_id", axis=1, inplace=True) 339 | 340 | if "original_id" in testing_data.columns: 341 | testing_data.drop("original_id", axis=1, inplace=True) 342 | 343 | # continue 344 | import random 345 | seed = 666 346 | 
random.seed(seed) 347 | X_train, X_test, y_train, y_test = train_test_split(training_data.iloc[:, 1:], training_data.iloc[:, 0], 348 | train_size=0.7, stratify=training_data.iloc[:, 0], 349 | random_state=seed) 350 | 351 | use_full_set = True 352 | if use_full_set: 353 | X_train = training_data.iloc[:, 1:] 354 | y_train = training_data.iloc[:, 0] 355 | 356 | X_test = testing_data.iloc[:, 1:] 357 | y_test = testing_data.iloc[:, 0] 358 | 359 | # from sklearn.preprocessing import StandardScaler 360 | # scaler = StandardScaler() 361 | # scaler.fit(X_train) 362 | # 363 | # scaler.transform(X_train) 364 | # scaler.transform(X_test) 365 | 366 | # numbers_to_boolean(X_train) 367 | # numbers_to_boolean(X_test) 368 | 369 | from math import sqrt 370 | 371 | classifiers = [ 372 | # MLPClassifier(hidden_layer_sizes=(900, 666, 500, 100, 50, 13), random_state=seed, max_iter=5000) 373 | # LinearSVC(random_state=seed,class_weight="balanced",max_iter=5000,C=1.0,tol=1e-5,dual=True), 374 | # SVC(random_state=seed, class_weight="balanced", max_iter=10000, kernel="linear",probability=True) 375 | # BaggingClassifier(base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, 376 | # C=1.0, tol=1e-5, dual=True), 377 | # n_estimators=403, n_jobs=-1, random_state=seed, 378 | # max_features=410), 379 | # VotingClassifier([ 380 | # ("svm", BaggingClassifier( 381 | # base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, 382 | # C=2.0, tol=0.001, dual=True), 383 | # n_estimators=8, n_jobs=-1, random_state=seed, 384 | # )), 385 | # ("naivebayes", BernoulliNB()), 386 | # ("randomforest", RandomForestClassifier(max_depth=73, n_estimators=403, n_jobs=-1)) 387 | # ], voting="soft", weights=[1,1,1]), 388 | # XGBoostClassifier(n_estimators=103,seed=seed,max_depth=4, objective="multi:softmax"), 389 | # LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, C=2.0, tol=0.001, dual=True) 390 | # LogisticRegression(max_iter=5000,n_jobs=-1,solver="sag",random_state=seed), 391 | # RandomForestClassifier(n_jobs=-1,random_state=seed,n_estimators=403) 392 | # VotingClassifier([ 393 | # ("randomforest", 394 | # RandomForestClassifier(n_estimators=403, random_state=seed, max_depth=73, n_jobs=-1)), 395 | # ("naivebayes", BernoulliNB()), 396 | # ("xgboost", XGBoostClassifier(n_estimators=103, seed=seed, max_depth=3, objective="multi:softmax")) 397 | # ], voting="soft", weights=[4, 1, 5], n_jobs=-1), 398 | BernoulliNB() 399 | # SVC(C=2.0,kernel="linear",tol=0.001,random_state=seed), 400 | 401 | # BaggingClassifier( 402 | # base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, 403 | # C=2.0, tol=0.001, dual=True), 404 | # n_estimators=8, n_jobs=-1, random_state=seed, 405 | # ) 406 | # BernoulliNB(), 407 | # LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, C=2.0, tol=0.001, dual=True) 408 | ] 409 | # 410 | best_fit(X_train, y_train) 411 | continue 412 | # classifier = joblib.load("VotingClassifier_train.bin") 413 | # predictions = classifier.predict(X_test) 414 | # 415 | # not_predicted_idx = [] 416 | # for idx, predicted in enumerate(predictions): 417 | # if predicted == y_test.iloc[idx]: 418 | # continue 419 | # not_predicted_idx.append(idx) 420 | # 421 | # bad_predictions = X_test.iloc[not_predicted_idx, :] 422 | # bad_predictions = bad_predictions.assign(label=y_test.iloc[not_predicted_idx]) 423 | # bad_predictions.to_csv("data\\not_predicted.csv", index_label="idx") 424 | # 425 | # continue 426 | for classifier in 
classifiers: 427 | # precision, recall, accuracy = test_classifier(X_train, y_train, X_test, y_test, classifier) 428 | precision, recall, accuracy = cv(classifier,X_train,y_train) 429 | continue 430 | _tmp = [m] 431 | for idx in range(0, len(precision)): 432 | _tmp.append(precision[idx]) 433 | 434 | for idx in range(0, len(recall)): 435 | _tmp.append(recall[idx]) 436 | 437 | _tmp.append(accuracy) 438 | results_df = results_df.append(pd.DataFrame([_tmp], columns=result_col_names)) 439 | results_df.to_csv("results_" + str(type(classifier).__name__) + "_train.csv", index_label="idx") 440 | # joblib.dump(classifier, str(type(classifier).__name__)+"_train.bin") 441 | 442 | print(results_df) 443 | 444 | print("Done!") 445 | 446 | 447 | main() 448 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import nltk 3 | import pandas as pd 4 | from emoticons import EmoticonDetector 5 | import re as regex 6 | import numpy as np 7 | 8 | 9 | class TwitterData: 10 | data = [] 11 | processed_data = [] 12 | wordlist = [] 13 | ngrams = [] 14 | 15 | whitelist = ["n't", "not"] 16 | data_model = None 17 | data_labels = None 18 | is_testing = False 19 | 20 | def initialize(self, csv_file, is_testing_set=False, from_cached=None): 21 | if from_cached is not None: 22 | self.data_model = pd.read_csv(from_cached) 23 | return 24 | 25 | self.is_testing = is_testing_set 26 | 27 | if not is_testing_set: 28 | self.data = pd.read_csv(csv_file, header=0, names=["id", "emotion", "text"]) 29 | self.data = self.data[self.data["emotion"].isin(["positive", "negative", "neutral"])] 30 | else: 31 | self.data = pd.read_csv(csv_file, header=0, names=["id", "text"]) 32 | not_null_text = 1 ^ pd.isnull(self.data["text"]) 33 | not_null_id = 1 ^ pd.isnull(self.data["id"]) 34 | self.data = self.data.loc[not_null_id & not_null_text, :] 35 | 36 | self.processed_data = self.data 37 | self.wordlist = [] 38 | self.data_model = None 39 | self.data_labels = None 40 | 41 | def cleanup(self, cleanuper): 42 | t = self.processed_data 43 | for cleanup_method in cleanuper.iterate(): 44 | if not self.is_testing: 45 | t = cleanup_method(t) 46 | else: 47 | if cleanup_method.__name__ != "remove_na": 48 | t = cleanup_method(t) 49 | 50 | self.processed_data = t 51 | 52 | def build_features(self): 53 | def count_by_lambda(expression, word_array): 54 | return len(list(filter(expression, word_array))) 55 | 56 | def count_occurences(character, word_array): 57 | counter = 0 58 | for j, word in enumerate(word_array): 59 | for char in word: 60 | if char == character: 61 | counter += 1 62 | 63 | return counter 64 | 65 | def count_by_regex(regex, plain_text): 66 | return len(regex.findall(plain_text)) 67 | 68 | self.add_column("splitted_text", map(lambda txt: txt.split(" "), self.processed_data["text"])) 69 | 70 | # number of uppercase words 71 | uppercase = list(map(lambda txt: count_by_lambda(lambda word: word == word.upper(), txt), 72 | self.processed_data["splitted_text"])) 73 | self.add_column("number_of_uppercase", uppercase) 74 | 75 | # number of ! 76 | exclamations = list(map(lambda txt: count_occurences("!", txt), 77 | self.processed_data["splitted_text"])) 78 | 79 | self.add_column("number_of_exclamation", exclamations) 80 | 81 | # number of ? 
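        # (this and the other counts become the number_of_* columns that build_data_model()
        #  and build_word2vec_model() later include as extra features, and that
        #  numbers_to_boolean() in main.py can binarise)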
82 | questions = list(map(lambda txt: count_occurences("?", txt), 83 | self.processed_data["splitted_text"])) 84 | 85 | self.add_column("number_of_question", questions) 86 | 87 | # number of ... 88 | ellipsis = list(map(lambda txt: count_by_regex(regex.compile(r"\.\s?\.\s?\."), txt), 89 | self.processed_data["text"])) 90 | 91 | self.add_column("number_of_ellipsis", ellipsis) 92 | 93 | # number of hashtags 94 | hashtags = list(map(lambda txt: count_occurences("#", txt), 95 | self.processed_data["splitted_text"])) 96 | 97 | self.add_column("number_of_hashtags", hashtags) 98 | 99 | # number of mentions 100 | mentions = list(map(lambda txt: count_occurences("@", txt), 101 | self.processed_data["splitted_text"])) 102 | 103 | self.add_column("number_of_mentions", mentions) 104 | 105 | # number of quotes 106 | quotes = list(map(lambda plain_text: int(count_occurences("'", [plain_text.strip("'").strip('"')]) / 2 + 107 | count_occurences('"', [plain_text.strip("'").strip('"')]) / 2), 108 | self.processed_data["text"])) 109 | 110 | self.add_column("number_of_quotes", quotes) 111 | 112 | # number of urls 113 | urls = list(map(lambda txt: count_by_regex(regex.compile(r"http.?://[^\s]+[\s]?"), txt), 114 | self.processed_data["text"])) 115 | 116 | self.add_column("number_of_urls", urls) 117 | 118 | # number of positive emoticons 119 | ed = EmoticonDetector() 120 | positive_emo = list( 121 | map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and ed.is_positive(word), txt), 122 | self.processed_data["splitted_text"])) 123 | 124 | self.add_column("number_of_positive_emo", positive_emo) 125 | 126 | # number of negative emoticons 127 | negative_emo = list(map( 128 | lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and not ed.is_positive(word), txt), 129 | self.processed_data["splitted_text"])) 130 | 131 | self.add_column("number_of_negative_emo", negative_emo) 132 | 133 | pass 134 | 135 | def add_column(self, column_name, column_content): 136 | self.processed_data.loc[:, column_name] = pd.Series(column_content, index=self.processed_data.index) 137 | 138 | def stem(self, stemmer=nltk.PorterStemmer()): 139 | def stem_and_join(row): 140 | row["text"] = list(map(lambda str: stemmer.stem(str.lower()), row["text"])) 141 | return row 142 | 143 | self.processed_data = self.processed_data.apply(stem_and_join, axis=1) 144 | 145 | def tokenize(self, tokenizer=nltk.word_tokenize): 146 | def tokenize_row(row): 147 | row["text"] = tokenizer(row["text"]) 148 | row["tokenized_text"] = [] + row["text"] 149 | return row 150 | 151 | self.processed_data = self.processed_data.apply(tokenize_row, axis=1) 152 | 153 | def build_wordlist(self, min_occurrences=5, max_occurences=500, stopwords=nltk.corpus.stopwords.words("english"), 154 | whitelist=None): 155 | self.wordlist = [] 156 | whitelist = self.whitelist if whitelist is None else whitelist 157 | import os 158 | if os.path.isfile("data\\wordlist.csv"): 159 | word_df = pd.read_csv("data\\wordlist.csv") 160 | word_df = word_df[word_df["occurrences"] > min_occurrences] 161 | self.wordlist = list(word_df.loc[:, "word"]) 162 | return 163 | 164 | words = Counter() 165 | for idx in self.processed_data.index: 166 | words.update(self.processed_data.loc[idx, "text"]) 167 | 168 | for idx, stop_word in enumerate(stopwords): 169 | if stop_word not in whitelist: 170 | del words[stop_word] 171 | 172 | word_df = pd.DataFrame(data={"word": [k for k, v in words.most_common() if min_occurrences < v < max_occurences], 173 | "occurrences": [v for k, v in 
words.most_common() if min_occurrences < v < max_occurences]}, 174 | columns=["word", "occurrences"]) 175 | 176 | word_df.to_csv("data\\wordlist.csv", index_label="idx") 177 | self.wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences] 178 | 179 | def build_ngrams(self, ngram=2, stopwords=nltk.corpus.stopwords.words("english"), 180 | whitelist=None): 181 | whitelist = self.whitelist if whitelist is None else whitelist 182 | stopwords = list(filter(lambda sw: sw not in whitelist, stopwords)) 183 | ngrams = Counter() 184 | for idx in self.processed_data.index: 185 | tokens = self.processed_data.loc[idx, "text"] 186 | ngrams.update(self.generate_ngrams(tokens, ngram, stopwords)) 187 | 188 | self.ngrams = [ng for ng, cnt in ngrams.most_common() if cnt >= 2] 189 | 190 | def generate_ngrams(self, tokens, ngram, stopwords): 191 | return list(map(lambda ng: str.join("_", ng), 192 | nltk.ngrams( 193 | filter(lambda word: word not in stopwords, tokens), 194 | ngram))) 195 | 196 | def build_ngram_model(self, stopwords=nltk.corpus.stopwords.words("english"), 197 | whitelist=None, ngram=2): 198 | whitelist = self.whitelist if whitelist is None else whitelist 199 | stopwords = list(filter(lambda sw: sw not in whitelist, stopwords)) 200 | extra_columns = [col for col in self.processed_data.columns if col.startswith("number_of")] 201 | label_column = [] 202 | if not self.is_testing: 203 | label_column = ["label"] 204 | 205 | columns = label_column + extra_columns + list(self.ngrams) 206 | labels = [] 207 | rows = [] 208 | for idx in self.processed_data.index: 209 | current_row = [] 210 | 211 | if not self.is_testing: 212 | # add label 213 | current_label = self.processed_data.loc[idx, "emotion"] 214 | labels.append(current_label) 215 | current_row.append(current_label) 216 | 217 | for _, col in enumerate(extra_columns): 218 | current_row.append(self.processed_data.loc[idx, col]) 219 | 220 | # add ngrams 221 | tokens = self.processed_data.loc[idx, "text"] 222 | current_ngrams = self.generate_ngrams(tokens, ngram, stopwords) 223 | for _, ng in enumerate(self.ngrams): 224 | current_row.append(1 if ng in current_ngrams else 0) 225 | 226 | rows.append(current_row) 227 | 228 | self.data_model = pd.DataFrame(rows, columns=columns) 229 | self.data_labels = pd.Series(labels) 230 | return self.data_model, self.data_labels 231 | 232 | def build_word2vec_model(self, word2vec_provider, stopwords=nltk.corpus.stopwords.words("english"), whitelist=None): 233 | whitelist = self.whitelist if whitelist is None else whitelist 234 | stopwords = list(filter(lambda sw: sw not in whitelist, stopwords)) 235 | extra_columns = [col for col in self.processed_data.columns if col.startswith("number_of")] 236 | similarity_columns = ["bad_similarity", "good_similarity", "information_similarity"] 237 | label_column = [] 238 | if not self.is_testing: 239 | label_column = ["label"] 240 | 241 | columns = label_column + ["original_id"] + extra_columns + similarity_columns + list( 242 | map(lambda i: "word2vec_{0}".format(i), range(0, word2vec_provider.dimensions))) + list( 243 | map(lambda w: w + "_bow",self.wordlist)) 244 | labels = [] 245 | rows = [] 246 | for idx in self.processed_data.index: 247 | current_row = [] 248 | 249 | if not self.is_testing: 250 | # add label 251 | current_label = self.processed_data.loc[idx, "emotion"] 252 | labels.append(current_label) 253 | current_row.append(current_label) 254 | 255 | current_row.append(self.processed_data.loc[idx, "id"]) 256 | 257 | for _, col in 
enumerate(extra_columns):
258 |                 current_row.append(self.processed_data.loc[idx, col])
259 | 
260 |             # average (normalised) similarity of the tweet's tokens to the probe words "bad", "good", "information"
261 |             tokens = self.processed_data.loc[idx, "tokenized_text"]
262 |             for main_word in map(lambda w: w.split("_")[0], similarity_columns):
263 |                 current_similarities = [abs(sim) for sim in
264 |                                         map(lambda word: word2vec_provider.get_similarity(main_word, word.lower()), tokens) if
265 |                                         sim is not None]
266 |                 if len(current_similarities) <= 1:
267 |                     current_row.append(0 if len(current_similarities) == 0 else current_similarities[0])
268 |                     continue
269 |                 max_sim = max(current_similarities)
270 |                 min_sim = min(current_similarities)
271 |                 current_similarities = [((sim - min_sim) / (max_sim - min_sim)) for sim in
272 |                                         current_similarities]  # normalize to <0;1>
273 |                 current_row.append(np.array(current_similarities).mean())
274 | 
275 |             # add word2vec vector: mean of the embeddings of all in-vocabulary tokens (zero vector if there are none)
276 |             tokens = self.processed_data.loc[idx, "tokenized_text"]
277 |             current_word2vec = []
278 |             for _, word in enumerate(tokens):
279 |                 vec = word2vec_provider.get_vector(word.lower())
280 |                 if vec is not None:
281 |                     current_word2vec.append(vec)
282 | 
283 |             averaged_word2vec = list(np.array(current_word2vec).mean(axis=0)) if current_word2vec else [0.0] * word2vec_provider.dimensions
284 |             # averaged_word2vec = map(lambda avg: (avg if abs(avg) > 0.0001 else 0), averaged_word2vec)
285 |             current_row += averaged_word2vec
286 | 
287 |             # add bag-of-words
288 |             tokens = set(self.processed_data.loc[idx, "text"])
289 |             for _, word in enumerate(self.wordlist):
290 |                 current_row.append(1 if word in tokens else 0)
291 | 
292 |             rows.append(current_row)
293 | 
294 |         self.data_model = pd.DataFrame(rows, columns=columns)
295 |         self.data_labels = pd.Series(labels)
296 |         return self.data_model, self.data_labels
297 | 
298 |     def build_data_model(self, with_ngram=None):
299 |         extra_columns = [col for col in self.processed_data.columns if col.startswith("number_of")]
300 |         label_column = []
301 |         if not self.is_testing:
302 |             label_column = ["label"]
303 | 
304 |         columns = label_column + extra_columns + list(
305 |             map(lambda w: w + "_bow", self.wordlist))
306 |         if with_ngram is not None:
307 |             columns += list(self.ngrams)
308 |         labels = []
309 |         rows = []
310 |         for idx in self.processed_data.index:
311 |             current_row = []
312 | 
313 |             if not self.is_testing:
314 |                 # add label
315 |                 current_label = self.processed_data.loc[idx, "emotion"]
316 |                 labels.append(current_label)
317 |                 current_row.append(current_label)
318 | 
319 |             for _, col in enumerate(extra_columns):
320 |                 current_row.append(self.processed_data.loc[idx, col])
321 | 
322 |             # add tokens (bag-of-words)
323 |             tokens = set(self.processed_data.loc[idx, "text"])
324 |             for _, word in enumerate(self.wordlist):
325 |                 current_row.append(1 if word in tokens else 0)
326 | 
327 |             if with_ngram is not None:
328 |                 current_ngrams = self.generate_ngrams(self.processed_data.loc[idx, "text"], with_ngram, [])
329 |                 for _, ng in enumerate(self.ngrams):
330 |                     current_row.append(1 if ng in current_ngrams else 0)
331 | 
332 |             rows.append(current_row)
333 | 
334 |         self.data_model = pd.DataFrame(rows, columns=columns)
335 |         self.data_labels = pd.Series(labels)
336 |         return self.data_model, self.data_labels
337 | 
--------------------------------------------------------------------------------
/word2vec.py:
--------------------------------------------------------------------------------
1 | import gensim
2 | 
3 | 
4 | class Word2VecProvider(object):
5 |     word2vec = None
6 | 
7 |     dimensions = 0
8 | 
9 |     def load(self, path_to_word2vec):
10 |         self.word2vec = 
gensim.models.KeyedVectors.load_word2vec_format(path_to_word2vec, binary=False)  # KeyedVectors.load_word2vec_format supersedes the deprecated Word2Vec.load_word2vec_format
11 |         self.word2vec.init_sims(replace=True)  # precompute the L2-normalised vectors used by get_vector()
12 |         self.dimensions = self.word2vec.vector_size
13 | 
14 |     def get_vector(self, word):
15 |         if word not in self.word2vec.vocab:  # vocab / syn0norm / init_sims follow the pre-4.0 gensim API
16 |             return None
17 | 
18 |         return self.word2vec.syn0norm[self.word2vec.vocab[word].index]
19 | 
20 |     def get_similarity(self, word1, word2):
21 |         if word1 not in self.word2vec.vocab or word2 not in self.word2vec.vocab:
22 |             return None
23 | 
24 |         return self.word2vec.similarity(word1, word2)
25 | 
--------------------------------------------------------------------------------
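
Usage sketch: a minimal way to tie the modules above together on the training set (the
preprocessing steps that preprocess() in main.py runs, plus the word2vec feature model
that main.py keeps commented out), assuming the NLTK "punkt" and "stopwords" corpora
are available and that the embeddings path (a placeholder below) points to a file in
plain-text word2vec format:

    from cleanup import TwitterCleanuper
    from preprocessing import TwitterData
    from word2vec import Word2VecProvider
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import BernoulliNB

    data = TwitterData()
    data.initialize("data/train.csv", is_testing_set=False)
    data.build_features()                   # count features: uppercase, !, ?, #, @, URLs, emoticons, ...
    data.cleanup(TwitterCleanuper())        # strip URLs, usernames, numbers, special characters
    data.tokenize()
    data.stem()
    data.build_wordlist(min_occurrences=3)  # bag-of-words vocabulary (cached to data\wordlist.csv)

    embeddings = Word2VecProvider()
    embeddings.load("glove.twitter.27B.200d.txt")  # placeholder path
    features, _ = data.build_word2vec_model(embeddings)

    X = features.drop(columns=["label", "original_id"])
    y = features["label"]
    print(cross_val_score(BernoulliNB(), X, y, cv=8, n_jobs=-1))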