├── .gitignore ├── .ipynb_checkpoints └── twitter_sentiment_analysis-checkpoint.ipynb ├── README.md ├── cleanup.py ├── data ├── emoticons.txt ├── test.csv └── train.csv ├── emoticons.py ├── main.py ├── preprocessing.py ├── twitter_sentiment_analysis.ipynb └── word2vec.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 2 | 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 4 | 5 | data/processed.csv 6 | 7 | 8 | # User-specific stuff: 9 | 10 | .idea/workspace.xml 11 | 12 | .idea/tasks.xml 13 | 14 | __pycache__ 15 | data/*.csv 16 | 17 | 18 | 19 | 20 | # Sensitive or high-churn files: 21 | 22 | .idea/dataSources/ 23 | 24 | .idea/dataSources.ids 25 | 26 | .idea/dataSources.xml 27 | 28 | .idea/dataSources.local.xml 29 | 30 | .idea/sqlDataSources.xml 31 | 32 | .idea/dynamic.xml 33 | 34 | .idea/uiDesigner.xml 35 | 36 | 37 | 38 | # Gradle: 39 | 40 | .idea/gradle.xml 41 | 42 | .idea/libraries 43 | 44 | 45 | 46 | # Mongo Explorer plugin: 47 | 48 | .idea/mongoSettings.xml 49 | 50 | 51 | 52 | ## File-based project format: 53 | 54 | *.iws 55 | 56 | 57 | 58 | ## Plugin-specific files: 59 | 60 | 61 | 62 | # IntelliJ 63 | 64 | /out/ 65 | 66 | 67 | 68 | # mpeltonen/sbt-idea plugin 69 | 70 | .idea_modules/ 71 | 72 | 73 | 74 | # JIRA plugin 75 | 76 | atlassian-ide-plugin.xml 77 | 78 | 79 | 80 | # Crashlytics plugin (for Android Studio and IntelliJ) 81 | 82 | com_crashlytics_export_strings.xml 83 | 84 | crashlytics.properties 85 | 86 | crashlytics-build.properties 87 | 88 | .idea 89 | 90 | 91 | fabric.properties 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook + Python code of twitter sentiment analysis 2 | Details and full description: 3 | http://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost 4 | -------------------------------------------------------------------------------- /cleanup.py: -------------------------------------------------------------------------------- 1 | import re as regex 2 | 3 | 4 | class TwitterCleanuper: 5 | def iterate(self): 6 | for cleanup_method in [self.remove_urls, 7 | self.remove_usernames, 8 | self.remove_na, 9 | self.remove_special_chars, 10 | self.remove_numbers]: 11 | yield cleanup_method 12 | 13 | @staticmethod 14 | def remove_by_regex(tweets, regexp): 15 | tweets.loc[:, "text"].replace(regexp, "", inplace=True) 16 | return tweets 17 | 18 | def remove_urls(self, tweets): 19 | return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"http.?://[^\s]+[\s]?")) 20 | 21 | def remove_na(self, tweets): 22 | return tweets[tweets["text"] != "Not Available"] 23 | 24 | def remove_special_chars(self, tweets): # it unrolls the hashtags to normal words 25 | for remove in map(lambda r: regex.compile(regex.escape(r)), [",", ":", "\"", "=", "&", ";", "%", "$", 26 | "@", "%", "^", "*", "(", ")", "{", "}", 27 | "[", "]", "|", "/", "\\", ">", "<", "-", 28 | "!", "?", ".", "'", 29 | "--", "---", "#"]): 30 | tweets.loc[:, "text"].replace(remove, "", inplace=True) 31 | return tweets 32 | 33 | def remove_usernames(self, tweets): 34 | return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?")) 35 | 36 | def remove_numbers(self, tweets): 37 | return TwitterCleanuper.remove_by_regex(tweets, 
regex.compile(r"\s?[0-9]+\.?[0-9]*")) 38 | -------------------------------------------------------------------------------- /data/emoticons.txt: -------------------------------------------------------------------------------- 1 | positive: 2 | :-) 3 | :) 4 | :-] 5 | :] 6 | :-3 7 | :3 8 | :-> 9 | :> 10 | 8-) 11 | 8) 12 | :-} 13 | :} 14 | :o) 15 | :c) 16 | :^) 17 | =] 18 | =) 19 | :-D 20 | :D 21 | 8-D 22 | 8D 23 | x-D 24 | xD 25 | X-D 26 | XD 27 | =D 28 | =3 29 | B^D 30 | :-)) 31 | ;-) 32 | ;) 33 | *-) 34 | *) 35 | ;-] 36 | ;] 37 | ;^) 38 | ;D 39 | :-P 40 | :P 41 | X-P 42 | x-p 43 | :-p 44 | :p 45 | :-? 46 | :? 47 | :-? 48 | :? 49 | :-b 50 | :b 51 | =p 52 | >:P 53 | :* 54 | :-* 55 | ^.^ 56 | ^_^ 57 | ^-^ 58 | xd 59 | negative: 60 | :-( 61 | :( 62 | :-c 63 | :c 64 | :-< 65 | :< 66 | :-[ 67 | :[ 68 | :-|| 69 | >:[ 70 | :{ 71 | :@ 72 | >:( 73 | :-/ 74 | :/ 75 | >:\ 76 | >:/ 77 | :\ 78 | =/ 79 | =\ 80 | :L 81 | =L 82 | :S 83 | :-| 84 | :| 85 | :-X 86 | :X 87 | -.- 88 | -,- -------------------------------------------------------------------------------- /emoticons.py: -------------------------------------------------------------------------------- 1 | class EmoticonDetector: 2 | emoticons = {} 3 | 4 | def __init__(self, emoticon_file="data\\emoticons.txt"): 5 | from pathlib import Path 6 | content = Path(emoticon_file).read_text() 7 | positive = True 8 | for line in content.split("\n"): 9 | if "positive" in line.lower(): 10 | positive = True 11 | continue 12 | elif "negative" in line.lower(): 13 | positive = False 14 | continue 15 | 16 | self.emoticons[line] = positive 17 | 18 | def is_positive(self, emoticon): 19 | if emoticon in self.emoticons: 20 | return self.emoticons[emoticon] 21 | return False 22 | 23 | def is_emoticon(self, to_check): 24 | return to_check in self.emoticons 25 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | from multiprocessing import Process 3 | from time import time 4 | 5 | import pandas as pd 6 | from sklearn.ensemble import RandomForestClassifier, VotingClassifier 7 | from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score 8 | from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV 9 | from sklearn.naive_bayes import BernoulliNB 10 | from xgboost import XGBClassifier as XGBoostClassifier 11 | from sklearn.svm import LinearSVC 12 | 13 | from cleanup import TwitterCleanuper 14 | from preprocessing import TwitterData 15 | from word2vec import Word2VecProvider 16 | 17 | 18 | def preprocess(results, data_path, is_testing, data_name, min_occurrences=5, cache_output=None): 19 | twitter_data = TwitterData() 20 | twitter_data.initialize(data_path, is_testing) 21 | twitter_data.build_features() 22 | twitter_data.cleanup(TwitterCleanuper()) 23 | twitter_data.tokenize() 24 | twitter_data.stem() 25 | twitter_data.build_wordlist(min_occurrences=min_occurrences) 26 | #twitter_data.build_data_model() 27 | # twitter_data.build_ngrams() 28 | # twitter_data.build_ngram_model() 29 | # twitter_data.build_data_model(with_ngram=2) 30 | # word2vec = Word2VecProvider() 31 | # word2vec.load("H:\\Programowanie\\glove.twitter.27B.200d.txt") 32 | # twitter_data.build_word2vec_model(word2vec) 33 | if cache_output is not None: 34 | twitter_data.data_model.to_csv(cache_output, index_label="idx", float_format="%.6f") 35 | results[data_name] = twitter_data.data_model 36 | 
37 | 38 | def preprare_data(min_occurrences): 39 | import os 40 | training_data = None 41 | testing_data = None 42 | print("Loading data...") 43 | test_data_file_name = "data\\processed_test_word2vec_bow_" + str(min_occurrences) + ".csv" 44 | train_data_file_name = "data\\processed_train_word2vec_bow_" + str(min_occurrences) + ".csv" 45 | use_cache = os.path.isfile(train_data_file_name) and os.path.isfile( 46 | test_data_file_name) 47 | if use_cache: 48 | training_data = TwitterData() 49 | training_data.initialize(None, from_cached=train_data_file_name) 50 | training_data = training_data.data_model 51 | 52 | testing_data = TwitterData() 53 | testing_data.initialize(None, from_cached=test_data_file_name) 54 | testing_data = testing_data.data_model 55 | print("Loaded from cached files...") 56 | else: 57 | print("Preprocessing data...") 58 | with multiprocessing.Manager() as manager: 59 | 60 | results = manager.dict() 61 | 62 | preprocess_training = Process(target=preprocess, args=( 63 | results, "data\\train.csv", False, "train", min_occurrences, train_data_file_name,)) 64 | 65 | preprocess_testing = Process(target=preprocess, args=( 66 | results, "data\\test.csv", True, "test", min_occurrences, test_data_file_name,)) 67 | 68 | preprocess_training.start() 69 | preprocess_testing.start() 70 | print("Multiple processes started...") 71 | 72 | preprocess_testing.join() 73 | print("Preprocessed testing data...") 74 | 75 | preprocess_training.join() 76 | print("Preprocessed training data...") 77 | 78 | training_data = results["train"] 79 | testing_data = results["test"] 80 | 81 | print("Data preprocessed & cached...") 82 | 83 | return training_data, testing_data 84 | 85 | 86 | def log(text): 87 | print(text) 88 | with open("log.txt", "a") as log_file: 89 | log_file.write(str(text) + "\n") 90 | 91 | 92 | def test_classifier(X_train, y_train, X_test, y_test, classifier): 93 | log("") 94 | log("===============================================") 95 | classifier_name = str(type(classifier).__name__) 96 | log("Testing " + classifier_name) 97 | now = time() 98 | list_of_labels = sorted(list(set(y_train))) 99 | model = classifier.fit(X_train, y_train) 100 | log("Learing time {0}s".format(time() - now)) 101 | now = time() 102 | predictions = model.predict(X_test) 103 | log("Predicting time {0}s".format(time() - now)) 104 | 105 | from sklearn.metrics import classification_report 106 | precision = precision_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels) 107 | recall = recall_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels) 108 | accuracy = accuracy_score(y_test, predictions) 109 | log("=================== Results ===================") 110 | log(classification_report(y_test, predictions, labels=list_of_labels)) 111 | log(" Negative Neutral Positive") 112 | log("F1 " + str(f1_score(y_test, predictions, average=None, pos_label=None, labels=list_of_labels))) 113 | log("Precision" + str(precision)) 114 | log("Recall " + str(recall)) 115 | log("Accuracy " + str(accuracy)) 116 | log("===============================================") 117 | 118 | return precision, recall, accuracy 119 | 120 | 121 | def cv(classifier, X_train, y_train): 122 | log("") 123 | log("===============================================") 124 | classifier_name = str(type(classifier).__name__) 125 | log("Testing " + classifier_name) 126 | now = time() 127 | 128 | log("Crossvalidating...") 129 | # recall = [cross_val_score(classifier, X_train, y_train, scoring="recall_micro", cv=10, 
n_jobs=-1)] 130 | accuracy = [cross_val_score(classifier, X_train, y_train, cv=8, n_jobs=-1)] 131 | # precision = [cross_val_score(classifier, X_train, y_train, scoring="precision_micro", cv=10, n_jobs=-1)] 132 | recall = -1 133 | precision = -1 134 | log("Crosvalidation completed in {0}s".format(time() - now)) 135 | log("=================== Results ===================") 136 | log("Accuracy: " + str(accuracy)) 137 | log("Precision: " + str(precision)) 138 | log("Recall: " + str(recall)) 139 | log("===============================================") 140 | log("CV time: {0}".format(time() - now)) 141 | return accuracy, precision, recall 142 | 143 | 144 | import numpy as np 145 | 146 | 147 | # Utility function to report best scores 148 | def report(results, n_top=3): 149 | for i in range(1, n_top + 1): 150 | candidates = np.flatnonzero(results['rank_test_score'] == i) 151 | for candidate in candidates: 152 | log("Model with rank: {0}".format(i)) 153 | log("Mean validation score: {0:.3f} (std: {1:.3f})".format( 154 | results['mean_test_score'][candidate], 155 | results['std_test_score'][candidate])) 156 | log("Parameters: {0}".format(results['params'][candidate])) 157 | log("") 158 | 159 | 160 | def best_fit(X_train, y_train): 161 | log("") 162 | 163 | seed = 666 164 | import time as ttt 165 | attributes = len(X_train.columns) 166 | examples = len(X_train) 167 | now = time() 168 | log(ttt.ctime()) 169 | # Parameters for SVM 170 | # parameters = { 171 | # "dual": [True, False], 172 | # "tol": [1e-3, 1e-4, 1e-5], 173 | # "C": [1.0, 1.5, 2.0, 5.0, 10, 100, 1000] 174 | # } 175 | # rand_search = RandomizedSearchCV(LinearSVC(max_iter=5000), param_distributions=parameters, cv=8,n_jobs=-1,n_iter=20) 176 | # 177 | # 178 | # rand_search.fit(X_train,y_train) 179 | # report(rand_search.cv_results_, 10) 180 | # log(ttt.ctime()) 181 | # log(time() - now) 182 | # return 183 | 184 | # Parameters for Bagging 185 | # parameters = { 186 | # "n_estimators": [2, 3, 5, 13, 51, 201, 303, 403, 505], 187 | # "max_features": list(map(lambda x: int(x), 188 | # [sqrt(attributes), 2 * sqrt(attributes), 3 * sqrt(attributes), attributes / 2, 189 | # attributes / 3, attributes / 4])) 190 | # } 191 | # 192 | # rand_search = RandomizedSearchCV(BaggingClassifier( 193 | # base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, C=1.0, tol=0.0001, dual=True), 194 | # random_state=seed, n_jobs=1), param_distributions=parameters, n_jobs=-1, n_iter=3, cv=8, 195 | # scoring=make_scorer(f1_score, average="micro", labels=["positive", "negative", "neutral"])) 196 | # 197 | # now = time() 198 | # log(ttt.ctime()) 199 | # rand_search.fit(X_train, y_train) 200 | # 201 | # report(rand_search.cv_results_, 10) 202 | log(ttt.ctime()) 203 | log(time() - now) 204 | 205 | # Parameters for RF 206 | # log("RF:") 207 | # parameters = { 208 | # "n_estimators":[103, 201, 305, 403, 666, 1001, 5007, 10001], 209 | # "max_depth":[None, 5, 20, 40, 73, 100, 1000, 2000], 210 | # "criterion":["gini", "entropy"] 211 | # } 212 | # 213 | # rand_search = RandomizedSearchCV(RandomForestClassifier(random_state=seed,n_jobs=-1),param_distributions=parameters, 214 | # n_iter=15,scoring="accuracy", 215 | # n_jobs=1,cv=10) 216 | # now = time() 217 | # log(ttt.ctime()) 218 | # rand_search.fit(X_train, y_train) 219 | # 220 | # report(rand_search.cv_results_, 10) 221 | # log(ttt.ctime()) 222 | # log(time() - now) 223 | 224 | # Parameters for XGBoost 225 | log("XGB:") 226 | parameters = { 227 | "n_estimators":[103,201, 403], 228 | 
"max_depth":[3,10,15], 229 | "objective":["multi:softmax","binary:logistic"], 230 | "learning_rate":[0.05, 0.1, 0.15, 0.3] 231 | } 232 | 233 | rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed),param_distributions=parameters, 234 | n_iter=5,scoring="accuracy", 235 | n_jobs=-1,cv=8) 236 | 237 | 238 | now = time() 239 | log(ttt.ctime()) 240 | rand_search.fit(X_train, y_train) 241 | 242 | report(rand_search.cv_results_, 10) 243 | log(ttt.ctime()) 244 | log(time() - now) 245 | 246 | parameters = { 247 | "n_estimators": [403, 666, 1000], 248 | "max_depth": [40,50,90,100,200], 249 | "subsample":[1.0, 0.6, 0.9], 250 | "objective": ["multi:softmax", "binary:logistic"], 251 | "learning_rate": [0.1, 0.15, 0.5] 252 | } 253 | 254 | rand_search = RandomizedSearchCV(XGBoostClassifier(seed=seed,), param_distributions=parameters, 255 | n_iter=5, scoring="accuracy", 256 | n_jobs=-1, cv=8) 257 | 258 | now = time() 259 | log(ttt.ctime()) 260 | rand_search.fit(X_train, y_train) 261 | 262 | report(rand_search.cv_results_, 10) 263 | log(ttt.ctime()) 264 | log(time() - now) 265 | 266 | return 267 | 268 | 269 | # Parameters for VotingClassifier 270 | # parameters = { 271 | # "weights": [ 272 | # [1, 1, 1], 273 | # [2, 1, 1], 274 | # [2, 2, 1], 275 | # [4, 1, 5], 276 | # [1, 1, 2], 277 | # [5, 1, 2], 278 | # [5, 2, 1], 279 | # [5, 3, 2], 280 | # [6, 2, 1], 281 | # [6, 1, 5], 282 | # [6, 1, 2], 283 | # [7, 1, 6], 284 | # [7, 2, 3], 285 | # ] 286 | # } 287 | log("Voting RF XGB NB:") 288 | parameters = { 289 | "weights": [ 290 | [1, 1, 1], 291 | [2, 1, 1], 292 | [1, 1, 2], 293 | [4, 1, 5], 294 | [3, 1, 3], 295 | [3, 1, 4] 296 | ] 297 | } 298 | 299 | rand_search = GridSearchCV(VotingClassifier([ 300 | ("randomforest", RandomForestClassifier(n_estimators=403, random_state=seed, max_depth=73, n_jobs=-1)), 301 | ("naivebayes", BernoulliNB()), 302 | ("xgboost", XGBoostClassifier(n_estimators=103, seed=seed, max_depth=3, objective="multi:softmax")) 303 | ], voting="soft", n_jobs=1), scoring="accuracy", n_jobs=-1, cv=8, param_grid=parameters) 304 | rand_search.fit(X_train, y_train) 305 | # 306 | report(rand_search.cv_results_, 10) 307 | log(ttt.ctime()) 308 | log(time() - now) 309 | 310 | 311 | def numbers_to_boolean(df): 312 | for column in filter(lambda col: col.startswith("number_of_"), df.columns): 313 | df[column] = (df[column] >= 1).astype(int) 314 | 315 | 316 | if __name__ == "__main__": 317 | 318 | def main(): 319 | result_col_names = ["min_occ", "precision", "recall", "accuracy"] 320 | result_col_names = ["min_occ", "precision_negative", "precision_neutral", "precision_positive", 321 | "recall_negative", 322 | "recall_neutral", "recall_positive", "accuracy"] 323 | 324 | results_df = pd.DataFrame(columns=result_col_names) 325 | for m in range(3, 4): 326 | print("Preparing data with min_occurrences=" + str(m)) 327 | training_data, testing_data = preprare_data(m) 328 | log("********************************************************") 329 | log("Validating for {0} min_occurrences:".format(m)) 330 | # drop idx & id columns 331 | if training_data.columns[0] == "idx": 332 | training_data = training_data.iloc[:, 1:] 333 | 334 | if testing_data.columns[0] == "idx": 335 | testing_data = testing_data.iloc[:, 1:] 336 | 337 | if "original_id" in training_data.columns: 338 | training_data.drop("original_id", axis=1, inplace=True) 339 | 340 | if "original_id" in testing_data.columns: 341 | testing_data.drop("original_id", axis=1, inplace=True) 342 | 343 | # continue 344 | import random 345 | seed = 666 346 | 
random.seed(seed) 347 | X_train, X_test, y_train, y_test = train_test_split(training_data.iloc[:, 1:], training_data.iloc[:, 0], 348 | train_size=0.7, stratify=training_data.iloc[:, 0], 349 | random_state=seed) 350 | 351 | use_full_set = True 352 | if use_full_set: 353 | X_train = training_data.iloc[:, 1:] 354 | y_train = training_data.iloc[:, 0] 355 | 356 | X_test = testing_data.iloc[:, 1:] 357 | y_test = testing_data.iloc[:, 0] 358 | 359 | # from sklearn.preprocessing import StandardScaler 360 | # scaler = StandardScaler() 361 | # scaler.fit(X_train) 362 | # 363 | # scaler.transform(X_train) 364 | # scaler.transform(X_test) 365 | 366 | # numbers_to_boolean(X_train) 367 | # numbers_to_boolean(X_test) 368 | 369 | from math import sqrt 370 | 371 | classifiers = [ 372 | # MLPClassifier(hidden_layer_sizes=(900, 666, 500, 100, 50, 13), random_state=seed, max_iter=5000) 373 | # LinearSVC(random_state=seed,class_weight="balanced",max_iter=5000,C=1.0,tol=1e-5,dual=True), 374 | # SVC(random_state=seed, class_weight="balanced", max_iter=10000, kernel="linear",probability=True) 375 | # BaggingClassifier(base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, 376 | # C=1.0, tol=1e-5, dual=True), 377 | # n_estimators=403, n_jobs=-1, random_state=seed, 378 | # max_features=410), 379 | # VotingClassifier([ 380 | # ("svm", BaggingClassifier( 381 | # base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, 382 | # C=2.0, tol=0.001, dual=True), 383 | # n_estimators=8, n_jobs=-1, random_state=seed, 384 | # )), 385 | # ("naivebayes", BernoulliNB()), 386 | # ("randomforest", RandomForestClassifier(max_depth=73, n_estimators=403, n_jobs=-1)) 387 | # ], voting="soft", weights=[1,1,1]), 388 | # XGBoostClassifier(n_estimators=103,seed=seed,max_depth=4, objective="multi:softmax"), 389 | # LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, C=2.0, tol=0.001, dual=True) 390 | # LogisticRegression(max_iter=5000,n_jobs=-1,solver="sag",random_state=seed), 391 | # RandomForestClassifier(n_jobs=-1,random_state=seed,n_estimators=403) 392 | # VotingClassifier([ 393 | # ("randomforest", 394 | # RandomForestClassifier(n_estimators=403, random_state=seed, max_depth=73, n_jobs=-1)), 395 | # ("naivebayes", BernoulliNB()), 396 | # ("xgboost", XGBoostClassifier(n_estimators=103, seed=seed, max_depth=3, objective="multi:softmax")) 397 | # ], voting="soft", weights=[4, 1, 5], n_jobs=-1), 398 | BernoulliNB() 399 | # SVC(C=2.0,kernel="linear",tol=0.001,random_state=seed), 400 | 401 | # BaggingClassifier( 402 | # base_estimator=LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, 403 | # C=2.0, tol=0.001, dual=True), 404 | # n_estimators=8, n_jobs=-1, random_state=seed, 405 | # ) 406 | # BernoulliNB(), 407 | # LinearSVC(random_state=seed, class_weight="balanced", max_iter=5000, C=2.0, tol=0.001, dual=True) 408 | ] 409 | # 410 | best_fit(X_train, y_train) 411 | continue 412 | # classifier = joblib.load("VotingClassifier_train.bin") 413 | # predictions = classifier.predict(X_test) 414 | # 415 | # not_predicted_idx = [] 416 | # for idx, predicted in enumerate(predictions): 417 | # if predicted == y_test.iloc[idx]: 418 | # continue 419 | # not_predicted_idx.append(idx) 420 | # 421 | # bad_predictions = X_test.iloc[not_predicted_idx, :] 422 | # bad_predictions = bad_predictions.assign(label=y_test.iloc[not_predicted_idx]) 423 | # bad_predictions.to_csv("data\\not_predicted.csv", index_label="idx") 424 | # 425 | # continue 426 | for classifier in 
classifiers: 427 | # precision, recall, accuracy = test_classifier(X_train, y_train, X_test, y_test, classifier) 428 | precision, recall, accuracy = cv(classifier,X_train,y_train) 429 | continue 430 | _tmp = [m] 431 | for idx in range(0, len(precision)): 432 | _tmp.append(precision[idx]) 433 | 434 | for idx in range(0, len(recall)): 435 | _tmp.append(recall[idx]) 436 | 437 | _tmp.append(accuracy) 438 | results_df = results_df.append(pd.DataFrame([_tmp], columns=result_col_names)) 439 | results_df.to_csv("results_" + str(type(classifier).__name__) + "_train.csv", index_label="idx") 440 | # joblib.dump(classifier, str(type(classifier).__name__)+"_train.bin") 441 | 442 | print(results_df) 443 | 444 | print("Done!") 445 | 446 | 447 | main() 448 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import nltk 3 | import pandas as pd 4 | from emoticons import EmoticonDetector 5 | import re as regex 6 | import numpy as np 7 | 8 | 9 | class TwitterData: 10 | data = [] 11 | processed_data = [] 12 | wordlist = [] 13 | ngrams = [] 14 | 15 | whitelist = ["n't", "not"] 16 | data_model = None 17 | data_labels = None 18 | is_testing = False 19 | 20 | def initialize(self, csv_file, is_testing_set=False, from_cached=None): 21 | if from_cached is not None: 22 | self.data_model = pd.read_csv(from_cached) 23 | return 24 | 25 | self.is_testing = is_testing_set 26 | 27 | if not is_testing_set: 28 | self.data = pd.read_csv(csv_file, header=0, names=["id", "emotion", "text"]) 29 | self.data = self.data[self.data["emotion"].isin(["positive", "negative", "neutral"])] 30 | else: 31 | self.data = pd.read_csv(csv_file, header=0, names=["id", "text"]) 32 | not_null_text = 1 ^ pd.isnull(self.data["text"]) 33 | not_null_id = 1 ^ pd.isnull(self.data["id"]) 34 | self.data = self.data.loc[not_null_id & not_null_text, :] 35 | 36 | self.processed_data = self.data 37 | self.wordlist = [] 38 | self.data_model = None 39 | self.data_labels = None 40 | 41 | def cleanup(self, cleanuper): 42 | t = self.processed_data 43 | for cleanup_method in cleanuper.iterate(): 44 | if not self.is_testing: 45 | t = cleanup_method(t) 46 | else: 47 | if cleanup_method.__name__ != "remove_na": 48 | t = cleanup_method(t) 49 | 50 | self.processed_data = t 51 | 52 | def build_features(self): 53 | def count_by_lambda(expression, word_array): 54 | return len(list(filter(expression, word_array))) 55 | 56 | def count_occurences(character, word_array): 57 | counter = 0 58 | for j, word in enumerate(word_array): 59 | for char in word: 60 | if char == character: 61 | counter += 1 62 | 63 | return counter 64 | 65 | def count_by_regex(regex, plain_text): 66 | return len(regex.findall(plain_text)) 67 | 68 | self.add_column("splitted_text", map(lambda txt: txt.split(" "), self.processed_data["text"])) 69 | 70 | # number of uppercase words 71 | uppercase = list(map(lambda txt: count_by_lambda(lambda word: word == word.upper(), txt), 72 | self.processed_data["splitted_text"])) 73 | self.add_column("number_of_uppercase", uppercase) 74 | 75 | # number of ! 76 | exclamations = list(map(lambda txt: count_occurences("!", txt), 77 | self.processed_data["splitted_text"])) 78 | 79 | self.add_column("number_of_exclamation", exclamations) 80 | 81 | # number of ? 
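        # (this and the other counts become the number_of_* columns that build_data_model()
        #  and build_word2vec_model() later include as extra features, and that
        #  numbers_to_boolean() in main.py can binarise)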
82 | questions = list(map(lambda txt: count_occurences("?", txt), 83 | self.processed_data["splitted_text"])) 84 | 85 | self.add_column("number_of_question", questions) 86 | 87 | # number of ... 88 | ellipsis = list(map(lambda txt: count_by_regex(regex.compile(r"\.\s?\.\s?\."), txt), 89 | self.processed_data["text"])) 90 | 91 | self.add_column("number_of_ellipsis", ellipsis) 92 | 93 | # number of hashtags 94 | hashtags = list(map(lambda txt: count_occurences("#", txt), 95 | self.processed_data["splitted_text"])) 96 | 97 | self.add_column("number_of_hashtags", hashtags) 98 | 99 | # number of mentions 100 | mentions = list(map(lambda txt: count_occurences("@", txt), 101 | self.processed_data["splitted_text"])) 102 | 103 | self.add_column("number_of_mentions", mentions) 104 | 105 | # number of quotes 106 | quotes = list(map(lambda plain_text: int(count_occurences("'", [plain_text.strip("'").strip('"')]) / 2 + 107 | count_occurences('"', [plain_text.strip("'").strip('"')]) / 2), 108 | self.processed_data["text"])) 109 | 110 | self.add_column("number_of_quotes", quotes) 111 | 112 | # number of urls 113 | urls = list(map(lambda txt: count_by_regex(regex.compile(r"http.?://[^\s]+[\s]?"), txt), 114 | self.processed_data["text"])) 115 | 116 | self.add_column("number_of_urls", urls) 117 | 118 | # number of positive emoticons 119 | ed = EmoticonDetector() 120 | positive_emo = list( 121 | map(lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and ed.is_positive(word), txt), 122 | self.processed_data["splitted_text"])) 123 | 124 | self.add_column("number_of_positive_emo", positive_emo) 125 | 126 | # number of negative emoticons 127 | negative_emo = list(map( 128 | lambda txt: count_by_lambda(lambda word: ed.is_emoticon(word) and not ed.is_positive(word), txt), 129 | self.processed_data["splitted_text"])) 130 | 131 | self.add_column("number_of_negative_emo", negative_emo) 132 | 133 | pass 134 | 135 | def add_column(self, column_name, column_content): 136 | self.processed_data.loc[:, column_name] = pd.Series(column_content, index=self.processed_data.index) 137 | 138 | def stem(self, stemmer=nltk.PorterStemmer()): 139 | def stem_and_join(row): 140 | row["text"] = list(map(lambda str: stemmer.stem(str.lower()), row["text"])) 141 | return row 142 | 143 | self.processed_data = self.processed_data.apply(stem_and_join, axis=1) 144 | 145 | def tokenize(self, tokenizer=nltk.word_tokenize): 146 | def tokenize_row(row): 147 | row["text"] = tokenizer(row["text"]) 148 | row["tokenized_text"] = [] + row["text"] 149 | return row 150 | 151 | self.processed_data = self.processed_data.apply(tokenize_row, axis=1) 152 | 153 | def build_wordlist(self, min_occurrences=5, max_occurences=500, stopwords=nltk.corpus.stopwords.words("english"), 154 | whitelist=None): 155 | self.wordlist = [] 156 | whitelist = self.whitelist if whitelist is None else whitelist 157 | import os 158 | if os.path.isfile("data\\wordlist.csv"): 159 | word_df = pd.read_csv("data\\wordlist.csv") 160 | word_df = word_df[word_df["occurrences"] > min_occurrences] 161 | self.wordlist = list(word_df.loc[:, "word"]) 162 | return 163 | 164 | words = Counter() 165 | for idx in self.processed_data.index: 166 | words.update(self.processed_data.loc[idx, "text"]) 167 | 168 | for idx, stop_word in enumerate(stopwords): 169 | if stop_word not in whitelist: 170 | del words[stop_word] 171 | 172 | word_df = pd.DataFrame(data={"word": [k for k, v in words.most_common() if min_occurrences < v < max_occurences], 173 | "occurrences": [v for k, v in 
words.most_common() if min_occurrences < v < max_occurences]}, 174 | columns=["word", "occurrences"]) 175 | 176 | word_df.to_csv("data\\wordlist.csv", index_label="idx") 177 | self.wordlist = [k for k, v in words.most_common() if min_occurrences < v < max_occurences] 178 | 179 | def build_ngrams(self, ngram=2, stopwords=nltk.corpus.stopwords.words("english"), 180 | whitelist=None): 181 | whitelist = self.whitelist if whitelist is None else whitelist 182 | stopwords = list(filter(lambda sw: sw not in whitelist, stopwords)) 183 | ngrams = Counter() 184 | for idx in self.processed_data.index: 185 | tokens = self.processed_data.loc[idx, "text"] 186 | ngrams.update(self.generate_ngrams(tokens, ngram, stopwords)) 187 | 188 | self.ngrams = [ng for ng, cnt in ngrams.most_common() if cnt >= 2] 189 | 190 | def generate_ngrams(self, tokens, ngram, stopwords): 191 | return list(map(lambda ng: str.join("_", ng), 192 | nltk.ngrams( 193 | filter(lambda word: word not in stopwords, tokens), 194 | ngram))) 195 | 196 | def build_ngram_model(self, stopwords=nltk.corpus.stopwords.words("english"), 197 | whitelist=None, ngram=2): 198 | whitelist = self.whitelist if whitelist is None else whitelist 199 | stopwords = list(filter(lambda sw: sw not in whitelist, stopwords)) 200 | extra_columns = [col for col in self.processed_data.columns if col.startswith("number_of")] 201 | label_column = [] 202 | if not self.is_testing: 203 | label_column = ["label"] 204 | 205 | columns = label_column + extra_columns + list(self.ngrams) 206 | labels = [] 207 | rows = [] 208 | for idx in self.processed_data.index: 209 | current_row = [] 210 | 211 | if not self.is_testing: 212 | # add label 213 | current_label = self.processed_data.loc[idx, "emotion"] 214 | labels.append(current_label) 215 | current_row.append(current_label) 216 | 217 | for _, col in enumerate(extra_columns): 218 | current_row.append(self.processed_data.loc[idx, col]) 219 | 220 | # add ngrams 221 | tokens = self.processed_data.loc[idx, "text"] 222 | current_ngrams = self.generate_ngrams(tokens, ngram, stopwords) 223 | for _, ng in enumerate(self.ngrams): 224 | current_row.append(1 if ng in current_ngrams else 0) 225 | 226 | rows.append(current_row) 227 | 228 | self.data_model = pd.DataFrame(rows, columns=columns) 229 | self.data_labels = pd.Series(labels) 230 | return self.data_model, self.data_labels 231 | 232 | def build_word2vec_model(self, word2vec_provider, stopwords=nltk.corpus.stopwords.words("english"), whitelist=None): 233 | whitelist = self.whitelist if whitelist is None else whitelist 234 | stopwords = list(filter(lambda sw: sw not in whitelist, stopwords)) 235 | extra_columns = [col for col in self.processed_data.columns if col.startswith("number_of")] 236 | similarity_columns = ["bad_similarity", "good_similarity", "information_similarity"] 237 | label_column = [] 238 | if not self.is_testing: 239 | label_column = ["label"] 240 | 241 | columns = label_column + ["original_id"] + extra_columns + similarity_columns + list( 242 | map(lambda i: "word2vec_{0}".format(i), range(0, word2vec_provider.dimensions))) + list( 243 | map(lambda w: w + "_bow",self.wordlist)) 244 | labels = [] 245 | rows = [] 246 | for idx in self.processed_data.index: 247 | current_row = [] 248 | 249 | if not self.is_testing: 250 | # add label 251 | current_label = self.processed_data.loc[idx, "emotion"] 252 | labels.append(current_label) 253 | current_row.append(current_label) 254 | 255 | current_row.append(self.processed_data.loc[idx, "id"]) 256 | 257 | for _, col in 
enumerate(extra_columns):
258 |                 current_row.append(self.processed_data.loc[idx, col])
259 | 
260 |             # average (normalised) similarity of the tweet's tokens to the probe words "bad", "good", "information"
261 |             tokens = self.processed_data.loc[idx, "tokenized_text"]
262 |             for main_word in map(lambda w: w.split("_")[0], similarity_columns):
263 |                 current_similarities = [abs(sim) for sim in
264 |                                         map(lambda word: word2vec_provider.get_similarity(main_word, word.lower()), tokens) if
265 |                                         sim is not None]
266 |                 if len(current_similarities) <= 1:
267 |                     current_row.append(0 if len(current_similarities) == 0 else current_similarities[0])
268 |                     continue
269 |                 max_sim = max(current_similarities)
270 |                 min_sim = min(current_similarities)
271 |                 current_similarities = [((sim - min_sim) / (max_sim - min_sim)) for sim in
272 |                                         current_similarities]  # normalize to <0;1>
273 |                 current_row.append(np.array(current_similarities).mean())
274 | 
275 |             # add word2vec vector: mean of the embeddings of all in-vocabulary tokens (zero vector if there are none)
276 |             tokens = self.processed_data.loc[idx, "tokenized_text"]
277 |             current_word2vec = []
278 |             for _, word in enumerate(tokens):
279 |                 vec = word2vec_provider.get_vector(word.lower())
280 |                 if vec is not None:
281 |                     current_word2vec.append(vec)
282 | 
283 |             averaged_word2vec = list(np.array(current_word2vec).mean(axis=0)) if current_word2vec else [0.0] * word2vec_provider.dimensions
284 |             # averaged_word2vec = map(lambda avg: (avg if abs(avg) > 0.0001 else 0), averaged_word2vec)
285 |             current_row += averaged_word2vec
286 | 
287 |             # add bag-of-words
288 |             tokens = set(self.processed_data.loc[idx, "text"])
289 |             for _, word in enumerate(self.wordlist):
290 |                 current_row.append(1 if word in tokens else 0)
291 | 
292 |             rows.append(current_row)
293 | 
294 |         self.data_model = pd.DataFrame(rows, columns=columns)
295 |         self.data_labels = pd.Series(labels)
296 |         return self.data_model, self.data_labels
297 | 
298 |     def build_data_model(self, with_ngram=None):
299 |         extra_columns = [col for col in self.processed_data.columns if col.startswith("number_of")]
300 |         label_column = []
301 |         if not self.is_testing:
302 |             label_column = ["label"]
303 | 
304 |         columns = label_column + extra_columns + list(
305 |             map(lambda w: w + "_bow", self.wordlist))
306 |         if with_ngram is not None:
307 |             columns += list(self.ngrams)
308 |         labels = []
309 |         rows = []
310 |         for idx in self.processed_data.index:
311 |             current_row = []
312 | 
313 |             if not self.is_testing:
314 |                 # add label
315 |                 current_label = self.processed_data.loc[idx, "emotion"]
316 |                 labels.append(current_label)
317 |                 current_row.append(current_label)
318 | 
319 |             for _, col in enumerate(extra_columns):
320 |                 current_row.append(self.processed_data.loc[idx, col])
321 | 
322 |             # add tokens (bag-of-words)
323 |             tokens = set(self.processed_data.loc[idx, "text"])
324 |             for _, word in enumerate(self.wordlist):
325 |                 current_row.append(1 if word in tokens else 0)
326 | 
327 |             if with_ngram is not None:
328 |                 current_ngrams = self.generate_ngrams(self.processed_data.loc[idx, "text"], with_ngram, [])
329 |                 for _, ng in enumerate(self.ngrams):
330 |                     current_row.append(1 if ng in current_ngrams else 0)
331 | 
332 |             rows.append(current_row)
333 | 
334 |         self.data_model = pd.DataFrame(rows, columns=columns)
335 |         self.data_labels = pd.Series(labels)
336 |         return self.data_model, self.data_labels
337 | 
--------------------------------------------------------------------------------
/word2vec.py:
--------------------------------------------------------------------------------
1 | import gensim
2 | 
3 | 
4 | class Word2VecProvider(object):
5 |     word2vec = None
6 | 
7 |     dimensions = 0
8 | 
9 |     def load(self, path_to_word2vec):
10 |         self.word2vec = 
gensim.models.KeyedVectors.load_word2vec_format(path_to_word2vec, binary=False)  # KeyedVectors.load_word2vec_format supersedes the deprecated Word2Vec.load_word2vec_format
11 |         self.word2vec.init_sims(replace=True)  # precompute the L2-normalised vectors used by get_vector()
12 |         self.dimensions = self.word2vec.vector_size
13 | 
14 |     def get_vector(self, word):
15 |         if word not in self.word2vec.vocab:  # vocab / syn0norm / init_sims follow the pre-4.0 gensim API
16 |             return None
17 | 
18 |         return self.word2vec.syn0norm[self.word2vec.vocab[word].index]
19 | 
20 |     def get_similarity(self, word1, word2):
21 |         if word1 not in self.word2vec.vocab or word2 not in self.word2vec.vocab:
22 |             return None
23 | 
24 |         return self.word2vec.similarity(word1, word2)
25 | 
--------------------------------------------------------------------------------
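
Usage sketch: a minimal way to tie the modules above together on the training set (the
preprocessing steps that preprocess() in main.py runs, plus the word2vec feature model
that main.py keeps commented out), assuming the NLTK "punkt" and "stopwords" corpora
are available and that the embeddings path (a placeholder below) points to a file in
plain-text word2vec format:

    from cleanup import TwitterCleanuper
    from preprocessing import TwitterData
    from word2vec import Word2VecProvider
    from sklearn.model_selection import cross_val_score
    from sklearn.naive_bayes import BernoulliNB

    data = TwitterData()
    data.initialize("data/train.csv", is_testing_set=False)
    data.build_features()                   # count features: uppercase, !, ?, #, @, URLs, emoticons, ...
    data.cleanup(TwitterCleanuper())        # strip URLs, usernames, numbers, special characters
    data.tokenize()
    data.stem()
    data.build_wordlist(min_occurrences=3)  # bag-of-words vocabulary (cached to data\wordlist.csv)

    embeddings = Word2VecProvider()
    embeddings.load("glove.twitter.27B.200d.txt")  # placeholder path
    features, _ = data.build_word2vec_model(embeddings)

    X = features.drop(columns=["label", "original_id"])
    y = features["label"]
    print(cross_val_score(BernoulliNB(), X, y, cv=8, n_jobs=-1))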