├── .gitignore ├── README.md ├── classification ├── Classifier.py ├── Evaluator.py ├── NeuralNets.py └── __init__.py ├── embedding ├── __init__.py └── sswe_extractor.py ├── feature_extracting ├── SennaFeatureExtractor.py ├── WordEmbeddingFeatureExtractor.py └── __init__.py ├── main.py ├── models ├── Dataset.py └── __init__.py ├── preprocessing ├── __init__.py ├── csv_header_change.py ├── csv_tsv.py ├── preprocesstweets.py ├── process_json_file.py ├── stopwords.txt ├── store_tweet_in_db.py └── twitter_streaming.py └── text_mining_project_report.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning-Sentiment-Specific-Word-embedding-for-Twitter-Sentiment-Classification 2 | -------------------------------------------------------------------------------- /classification/Classifier.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from sklearn import metrics 4 | 5 | from feature_extractor import FeatureExtractor 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.neural_network import MLPClassifier 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.svm import SVC 10 | from sklearn.ensemble import RandomForestClassifier 11 | 12 | 13 | class Classifier(object): 14 | """docstring for Classifier""" 15 | 16 | def __init__(self, models="multinomial"): 17 | super(Classifier, self).__init__() 18 | self.models = models 19 | if models == "multinomial": 20 | self.classifier = MultinomialNB() 21 | elif models == "svm": 22 | self.classifier = SVC(kernel='linear') 23 | elif models == "rfc": 24 | self.classifier = RandomForestClassifier() 25 | elif models == "nn": 26 | 
            self.classifier = MLPClassifier()
27 | 
28 |     def classify(self, dataset):
29 |         contents = dataset.get_contents()
30 |         labels = dataset.get_labels()
31 |         return self.classify_raw(contents, labels)
32 | 
33 |     def classify_raw(self, dataset, labels):
34 |         self.classifier = self.classifier.fit(dataset, labels)
35 |         return self.classifier
36 | 
37 |     """Return predictions for dataset using Dataset class"""
38 | 
39 |     def test(self, dataset):
40 |         contents = dataset.get_contents()
41 |         return self.test_raw(contents)
42 | 
43 |     """Return predictions for dataset using raw array dataset"""
44 | 
45 |     def test_raw(self, dataset):
46 |         predictions = self.classifier.predict(dataset)
47 |         return predictions
48 | 
49 |     def get_classifier_type(self):
50 |         if self.models == "multinomial":
51 |             return "Multinomial Naive-Bayes"
52 |         elif self.models == "svm":
53 |             return "Support Vector Machine"
54 |         elif self.models == "rfc":
55 |             return "Random Forest Classifier"
56 |         elif self.models == "nn":
57 |             return "Multilayer Perceptron (Neural Network)"
58 |         else:
59 |             return "Unknown classifier"
60 | 
61 | 
62 | def main(filename):
63 |     fe = FeatureExtractor("tfidf", filename)
64 |     fe.load_dataset()
65 |     fe.load_labels()
66 | 
67 |     bow = fe.build_bag()
68 |     bag = fe.build_tfidf()
69 | 
70 |     print "** Using Multinomial NB Models **"
71 | 
72 |     # TFIDF
73 |     clf = Classifier(models="multinomial")
74 |     clf.classify_raw(bag, fe.raw_labels)
75 | 
76 |     preds = clf.test_raw(bag)
77 |     # for doc, cat in zip(fe.dataset, preds):
78 |     #     print "%r => %s" % (doc, cat)
79 | 
80 |     print "TFIDF accuracy score: %f" % (metrics.accuracy_score(fe.raw_labels, preds, normalize=True))
81 |     f1_pos = metrics.f1_score(fe.raw_labels, preds, pos_label='positive')
82 |     f1_neg = metrics.f1_score(fe.raw_labels, preds, pos_label='negative')
83 |     f1_neu = metrics.f1_score(fe.raw_labels, preds, pos_label='neutral')
84 |     print "TFIDF F1 score: %f" % f1_pos
85 |     print "TFIDF F1 negative score: %f" % f1_neg
86 |     print "TFIDF F1 neutral score: %f" % f1_neu
87 | 
88 |     print "\nAverage F-measure: %f" % ((f1_pos + f1_neg + f1_neu) / 3)
89 | 
90 |     # bag of words
91 |     clf = Classifier(models="multinomial")
92 |     clf.classify_raw(bow, fe.raw_labels)
93 |     preds = clf.test_raw(bow)
94 | 
95 |     print "BOW accuracy score: %f" % (metrics.accuracy_score(fe.raw_labels, preds, normalize=True))
96 |     print "BOW F1 score: %f" % (metrics.f1_score(fe.raw_labels, preds, pos_label='positive'))
97 | 
98 |     print "\n** Using SVM **"
99 | 
100 |     # TFIDF
101 |     clf = Classifier(models="svm")
102 |     clf.classify_raw(bag, fe.raw_labels)
103 | 
104 |     preds = clf.test_raw(bag)
105 |     # for doc, cat in zip(fe.dataset, preds):
106 |     #     print "%r => %s" % (doc, cat)
107 | 
108 |     print "TFIDF accuracy score: %f" % (metrics.accuracy_score(fe.raw_labels, preds, normalize=True))
109 | 
110 |     # bag of words
111 |     clf = Classifier(models="svm")
112 |     clf.classify_raw(bow, fe.raw_labels)
113 |     preds = clf.test_raw(bow)
114 | 
115 |     print "BOW accuracy score: %f" % (metrics.accuracy_score(fe.raw_labels, preds, normalize=True))
116 | 
117 |     X_train, X_test, y_train, y_test = train_test_split(bow, fe.raw_labels, test_size=0.4, random_state=0)
118 |     clf = Classifier(models="svm")
119 |     clf.classify_raw(X_train, y_train)
120 |     preds = clf.test_raw(X_test)
121 | 
122 |     print "Using 60/40, BOW accuracy: %f" % (metrics.accuracy_score(y_test, preds, normalize=True))
123 |     print "Using 60/40, BOW F1: %f" % (metrics.f1_score(y_test, preds, pos_label='positive'))
124 | 
125 | 
126 | if __name__ == '__main__':
127 |     main(sys.argv[1])
128 | 
-------------------------------------------------------------------------------- /classification/Evaluator.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from sklearn.metrics import f1_score 3 | from sklearn.metrics import confusion_matrix 4 | from sklearn.model_selection import cross_val_score, cross_val_predict 5 | from sklearn.model_selection import KFold 6 | 7 | import copy 8 | import csv 9 | 10 | import numpy as np 11 | 12 | verbose_level = 0 # verbose level 13 | n_job = 3 # number of CPU used in evaluation 14 | seed = 7 # seed for our random state cross validation 15 | 16 | from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk. 17 | tokenizer = TweetTokenizer() 18 | from sklearn.preprocessing import scale 19 | 20 | class Evaluator(object): 21 | """docstring for Evaluator""" 22 | def __init__(self): 23 | super(Evaluator, self).__init__() 24 | 25 | def eval_with_test_set(self, model, feature_extractors, training_set, test_set, outfile="results_test.csv"): 26 | if not isinstance(feature_extractors, list): 27 | return 28 | 29 | training_contents = training_set.get_contents() 30 | training_labels = training_set.get_labels() 31 | 32 | # build training features 33 | 34 | # Print properties 35 | print "Evaluation method: Test Set" 36 | print "Classifier: %s" % (model.get_classifier_type()) 37 | 38 | test_contents = test_set.get_contents() 39 | test_labels = test_set.get_labels() 40 | 41 | field_names = ["id", "content", "polarity"] 42 | fe_predictions = dict() 43 | 44 | for feature_extractor in feature_extractors: 45 | fe = copy.copy(feature_extractor) 46 | print "\nFeature Extractor: %s" % (fe.get_name()) 47 | field_names.append(fe.get_name()) 48 | 49 | # build our feature extractor from the training dataset contents 50 | 51 | fe.set_dataset(training_contents) 52 | fe.build() 53 | training_contents = [tweet.split() for tweet in training_contents] 54 | 55 | training_features = fe.extract_features(training_contents) 56 | #print("training features :") 57 | #print(training_features) 58 | # build features for our test dataset 59 | test_contents = [tweet.split() for tweet in test_contents] 60 | test_features = fe.extract_existing_features(test_contents) 61 | #print("test features :") 62 | #print(test_features) 63 | # build training models 64 | model.classify_raw(training_features, training_labels) 65 | 66 | # start evaluating with test set 67 | test_predictions = model.test_raw(test_features) 68 | fe_predictions[fe.get_name()] = test_predictions 69 | 70 | # evaluate confusion matrix 71 | cnf_matrix = confusion_matrix(test_labels, test_predictions,labels=['positive', 'negative','neutral']) 72 | 73 | print "Average F-measure: %f" % (f1_score(test_labels, test_predictions, average='macro')) 74 | print "Average accuracy : %f" % (f1_score(test_labels, test_predictions, average='micro')) 75 | print "\nConfusion Matrix:" 76 | print "\t\tPositive\tNegative\tNeutral (predicted labels)" 77 | print "Positive\t%d\t\t%d\t\t%d" % (cnf_matrix[0][0], cnf_matrix[0][1],cnf_matrix[0][2]) 78 | print "Negative\t%d\t\t%d\t\t%d" % (cnf_matrix[1][0], cnf_matrix[1][1],cnf_matrix[1][2]) 79 | print "Neutral \t%d\t\t%d\t\t%d" % (cnf_matrix[2][0], cnf_matrix[2][1],cnf_matrix[2][2]) 80 | print "(actual labels)\n" 81 | 82 | with open(outfile, "wb") as csvfile: 83 | writer = csv.DictWriter(csvfile, fieldnames=field_names) 84 | writer.writeheader() 85 | for i in xrange(len(test_contents)): 86 | row = { 87 | 'id': i + 1, 88 | 
'content': test_contents[i], 89 | 'polarity': test_labels[i], 90 | } 91 | # append results 92 | for j in xrange(len(feature_extractors)): 93 | row[feature_extractors[j].get_name()] = fe_predictions[feature_extractors[j].get_name()][i] 94 | 95 | writer.writerow(row) 96 | 97 | def eval_with_cross_validation(self, model, feature_extractors, training_set, num_fold=10, cv=None): 98 | if not isinstance(feature_extractors, list): 99 | return 100 | 101 | # if model 102 | training_contents = training_set.get_contents() 103 | training_labels = training_set.get_labels() 104 | 105 | # Print properties 106 | print "Evaluation method: Cross Validation" 107 | print "Number of Folds: %d" % (num_fold) 108 | print "Classifier: %s" % (model.get_classifier_type()) 109 | 110 | if not cv: 111 | kfold = KFold(n_splits=num_fold, random_state=seed) 112 | else: 113 | kfold = cv 114 | 115 | for feature_extractor in feature_extractors: 116 | fe = copy.copy(feature_extractor) 117 | print "\nFeature Extractor: %s" % (fe.get_name()) 118 | 119 | # build our feature extractor from the dataset contents 120 | fe.set_dataset(training_contents) 121 | fe.build() 122 | training_contents = [tweet.split() for tweet in training_contents] 123 | training_features = fe.extract_features(training_contents) 124 | # obtain our classification results 125 | # measure is done by using macro F1 score 126 | scores = cross_val_score(model.classifier, X=training_features, 127 | y=training_labels, cv=kfold, n_jobs=n_job, 128 | scoring='f1_macro', verbose=verbose_level) 129 | 130 | # print each of the iteration scroe 131 | for i in xrange(0, len(scores)): 132 | print "Iteration %d = %f" % (i + 1, scores[i]) 133 | 134 | print "Average score: %f" % (scores.mean()) 135 | print "Standard Deviation: %f" % (scores.std()) 136 | print "Maximum F1-score: %f" % (np.amax(scores)) 137 | 138 | 139 | def create_evaluation_result(self, model, feature_extractors, training_set, num_fold=10, outfile="results_cv.csv", cv=None): 140 | if not isinstance(feature_extractors, list): 141 | return 142 | 143 | # if model 144 | training_contents = training_set.get_contents() 145 | training_labels = training_set.get_labels() 146 | 147 | # Print properties 148 | print "Evaluation method: Cross Validation" 149 | print "Number of Folds: %d" % (num_fold) 150 | print "Classifier: %s" % (model.get_classifier_type()) 151 | 152 | field_names = ["id", "content", "polarity"] 153 | fe_predictions = dict() 154 | 155 | if not cv: 156 | kfold = KFold(n_splits=num_fold, random_state=seed) 157 | else: 158 | kfold = cv 159 | 160 | for feature_extractor in feature_extractors: 161 | fe = copy.copy(feature_extractor) 162 | field_names.append(fe.get_name()) 163 | 164 | # build our feature extractor from the dataset contents 165 | fe.set_dataset(training_contents) 166 | fe.build() 167 | training_contents = [tweet.split() for tweet in training_contents] 168 | training_features = fe.extract_features(training_contents) 169 | # obtain our classification results 170 | # measure is done by using macro F1 score 171 | predictions = cross_val_predict(model.classifier, X=training_features, 172 | y=training_labels, cv=kfold, n_jobs=n_job, 173 | verbose=verbose_level,fit_params={}) 174 | fe_predictions[fe.get_name()] = predictions 175 | 176 | with open(outfile, "wb") as csvfile: 177 | writer = csv.DictWriter(csvfile, fieldnames=field_names) 178 | writer.writeheader() 179 | for i in xrange(len(training_contents)): 180 | row = { 181 | 'id': i + 1, 182 | 'content': training_contents[i], 183 | 'polarity': 
training_labels[i], 184 | } 185 | # append results 186 | for j in xrange(len(feature_extractors)): 187 | row[feature_extractors[j].get_name()] = fe_predictions[feature_extractors[j].get_name()][i] 188 | 189 | writer.writerow(row) 190 | 191 | return outfile 192 | -------------------------------------------------------------------------------- /classification/NeuralNets.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Dense 2 | from keras.optimizers import Adam 3 | from keras.models import Sequential 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | from sklearn.metrics import f1_score 6 | from sklearn.preprocessing import scale 7 | from tqdm import tqdm 8 | import numpy as np 9 | from sklearn.metrics import confusion_matrix 10 | 11 | 12 | def split_data(dataset): 13 | 14 | x_samples = dataset.get_contents() 15 | y_labels = dataset.get_labels() 16 | y = [] 17 | for x in y_labels: 18 | if x=="positive": 19 | y.append(1) 20 | elif x=="negative": 21 | y.append(-1) 22 | else: 23 | y.append(0) 24 | 25 | return x_samples,y 26 | 27 | 28 | def build_tfidf(x_train): 29 | print 'building tf-idf matrix ...' 30 | vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10) 31 | vectorizer.fit_transform([x.split() for x in x_train]) 32 | tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)) 33 | print 'vocab size :', len(tfidf) 34 | return tfidf 35 | 36 | 37 | class NeuralNets(object): 38 | def __init__(self, input_size=100,x_train=None, y_train=None, 39 | epochs=20, batch_size=32,x_test=None, y_test=None): 40 | self.inputdim = input_size 41 | self.xtrain = x_train 42 | self.ytrain = y_train 43 | self.epochs = epochs 44 | self.xtest = x_test 45 | self.ytest = y_test 46 | self.batchsize = batch_size 47 | 48 | 49 | def train_neural_nets(self): 50 | 51 | "*** Train Neural Networks model ***" 52 | 53 | model = Sequential() 54 | model.add(Dense(200, activation='relu', input_dim=self.inputdim)) 55 | #model.add(Dense(32, activation='softsign')) 56 | model.add(Dense(1, activation='sigmoid')) 57 | model.compile(optimizer=Adam(lr=0.01),loss='binary_crossentropy',metrics=['accuracy','mse','mae']) 58 | print("\n Training Neural Network Classifier with Training dataset") 59 | model.fit(self.xtrain, self.ytrain, epochs=self.epochs, batch_size=self.batchsize, verbose=2) 60 | print("\n Evaluating Neural Network Classifier on Test dataset") 61 | score = model.evaluate(self.xtest, self.ytest, batch_size=128, verbose=2) 62 | print("{} is {}".format("accuracy",score[1])) 63 | print("{} is {}".format("mse: ",score[2])) 64 | print("{} is {}".format("mae: ", score[3])) 65 | y_predictions = model.predict(self.xtest, batch_size=128, verbose=2) 66 | y_pred = np.around(y_predictions) 67 | y_pred = [int(x) for x in y_pred.flatten().tolist()] 68 | cnf_matrix = confusion_matrix(self.ytest, y_pred,labels=[1,-1,0]) 69 | print "Average F-measure: %f" % (f1_score(self.ytest, y_pred, average='macro')) 70 | print "\n Confusion Matrix:" 71 | print "\t\tPositive\tNegative\tNeutral (predicted labels)" 72 | print "Positive\t%d\t\t%d\t\t%d" % (cnf_matrix[0][0], cnf_matrix[0][1], cnf_matrix[0][2]) 73 | print "Negative\t%d\t\t%d\t\t%d" % (cnf_matrix[1][0], cnf_matrix[1][1], cnf_matrix[1][2]) 74 | print "Neutral \t%d\t\t%d\t\t%d" % (cnf_matrix[2][0], cnf_matrix[2][1], cnf_matrix[2][2]) 75 | print "(actual labels)\n" 76 | -------------------------------------------------------------------------------- /classification/__init__.py: 
-------------------------------------------------------------------------------- 1 | from Classifier import Classifier 2 | from Evaluator import Evaluator 3 | from NeuralNets import NeuralNets 4 | -------------------------------------------------------------------------------- /embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from embedding.sswe_extractor import * 2 | -------------------------------------------------------------------------------- /embedding/sswe_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import numpy as np 5 | from ConfigParser import ConfigParser 6 | from itertools import chain 7 | # local 8 | from deepnl import * 9 | from deepnl.extractors import * 10 | from deepnl.reader import TweetReader 11 | from deepnl.network import Network 12 | from deepnl.sentiwords import SentimentTrainer 13 | 14 | 15 | # ---------------------------------------------------------------------- 16 | # ---------------------------------------------------------------------- 17 | class sswe_model(object): 18 | def __init__(self, window=3, embeddings_size=50, epochs=100, learning_rate=0.001, 19 | eps=1e-8, ro=0.95, hidden=200, ngrams=2, textField=0, 20 | tagField=1, alpha=0.5, train=None, model=None, 21 | vocab=None, minOccurr=3, vocab_size=0, vectors=None, load=None, 22 | threads=5, variant=None, verbose=None, config_file=None): 23 | self.window = window 24 | self.embeddings_size = embeddings_size 25 | self.iterations = epochs 26 | self.learning_rate = learning_rate 27 | self.eps = eps 28 | self.ro = ro 29 | self.hidden = hidden 30 | self.ngrams = ngrams 31 | self.textField = textField 32 | self.tagField = tagField 33 | self.alpha = alpha 34 | self.train = train 35 | self.vocab = vocab 36 | self.minOccurr = minOccurr 37 | self.vocab_size = vocab_size 38 | self.vectors = vectors 39 | self.load = load 40 | self.variant = variant 41 | self.verbose = verbose 42 | self.model = model 43 | self.threads = threads 44 | self.config_file = config_file 45 | 46 | 47 | 48 | def create_sswe_model(train_filename, vocab_file, vector_file, train_model, save_model, size): 49 | """model parameters: you can customize other parameters in the class sswe_mode()""" 50 | emb_size = size # Number of features per word 51 | epochs = 100 # Number of training epochs 52 | l_r = 0.1 # Learning rate for network weights 53 | hidden = 200 # Number of hidden neurons 54 | ngrams = 2 # Length of ngrams 55 | text = 0 # field containing text 56 | tag = 1 # field containing polarity 57 | train = train_filename # File with text corpus for training 58 | model = save_model # File where to save the model 59 | vocab = vocab_file # Vocabulary file, either read and updated or created 60 | vectors = vector_file # Embeddings file, either read and updated or created 61 | load = train_model # Load previously saved model 62 | threads = 15 # Number of threads 63 | variant = None 64 | 65 | sswe = sswe_model(embeddings_size=emb_size, epochs=epochs, learning_rate=l_r, threads=threads, 66 | hidden=hidden, ngrams=ngrams, textField=text, tagField=tag, train=train, 67 | model=model, vocab=vocab, minOccurr=3, vectors=vectors, load=load, variant=variant) 68 | return sswe 69 | 70 | 71 | 72 | def sswe_trainer(model_parameters): 73 | # set the seed for replicability 74 | np.random.seed(42) 75 | # args = parser.parse_args() 76 | args = model_parameters 77 | log_format = 
'%(message)s' 78 | log_level = logging.DEBUG if args.verbose else logging.INFO 79 | log_level = logging.INFO 80 | logging.basicConfig(format=log_format, level=log_level) 81 | logger = logging.getLogger("Logger") 82 | 83 | config = ConfigParser() 84 | if args.config_file: 85 | config.read(args.config_file) 86 | # merge args with config 87 | reader = TweetReader(text_field=args.textField, label_field=args.tagField, ngrams=args.ngrams) 88 | reader.read(args.train) 89 | vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences, args.vocab_size, 90 | min_occurrences=args.minOccurr) 91 | #print("length vocab") 92 | #print(len(vocab)) 93 | if args.variant == 'word2vec' and os.path.exists(args.vectors): 94 | embeddings = Embeddings(vectors=args.vectors, variant=args.variant) 95 | embeddings.merge(vocab) 96 | logger.info("Saving vocabulary in %s" % args.vocab) 97 | embeddings.save_vocabulary(args.vocab) 98 | elif os.path.exists(args.vocab): 99 | # start with the given vocabulary 100 | b_vocab = reader.load_vocabulary(args.vocab) 101 | bound = len(b_vocab)-len(bigrams)-len(trigrams) 102 | base_vocab=b_vocab[:bound] 103 | #print("length base vocab :") 104 | #print(len(base_vocab)) 105 | if os.path.exists(args.vectors): 106 | # load embeddings 107 | embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab, variant=args.variant) 108 | else: 109 | # create embeddings 110 | embeddings = Embeddings(args.embeddings_size, vocab=base_vocab, variant=args.variant) 111 | # add the ngrams from the corpus 112 | embeddings.merge(vocab) 113 | logger.info("Overriding vocabulary in %s" % args.vocab) 114 | embeddings.save_vocabulary(args.vocab) 115 | else: 116 | embeddings = Embeddings(args.embeddings_size, vocab=vocab, variant=args.variant) 117 | logger.info("Saving vocabulary in %s" % args.vocab) 118 | embeddings.save_vocabulary(args.vocab) 119 | 120 | # Assume bigrams are prefix of trigrams, or else we should put a terminator 121 | # on trie 122 | trie = {} 123 | for b in chain(bigrams, trigrams): 124 | tmp = trie 125 | for w in b: 126 | tmp = tmp.setdefault(embeddings.dict[w], {}) 127 | 128 | converter = Converter() 129 | converter.add(embeddings) 130 | 131 | trainer = create_trainer(args, converter) 132 | 133 | report_intervals = max(args.iterations / 200, 1) 134 | report_intervals = 10000 # DEBUG 135 | 136 | logger.info("Starting training") 137 | 138 | # a generator expression (can be iterated several times) 139 | # It caches converted sentences, avoiding repeated conversions 140 | converted_sentences = converter.generator(reader.sentences, cache=True) 141 | trainer.train(converted_sentences, reader.polarities, trie, 142 | args.iterations, report_intervals) 143 | 144 | logger.info("Overriding vectors to %s" % args.vectors) 145 | embeddings.save_vectors(args.vectors, args.variant) 146 | if args.model: 147 | logger.info("Saving trained model to %s" % args.model) 148 | trainer.save(args.model) 149 | 150 | 151 | def create_trainer(args, converter): 152 | """ 153 | Creates or loads a neural network according to the specified args. 
154 | """ 155 | 156 | logger = logging.getLogger("Logger") 157 | 158 | if args.load: 159 | logger.info("Loading provided network...") 160 | trainer = SentimentTrainer.load(args.load) 161 | # change learning rate 162 | trainer.learning_rate = args.learning_rate 163 | else: 164 | logger.info('Creating new network...') 165 | # sum the number of features in all extractors' tables 166 | input_size = converter.size() * (args.window * 2 + 1) 167 | nn = Network(input_size, args.hidden, 2) 168 | options = { 169 | 'learning_rate': args.learning_rate, 170 | 'eps': args.eps, 171 | 'ro': args.ro, 172 | 'verbose': args.verbose, 173 | 'left_context': args.window, 174 | 'right_context': args.window, 175 | 'ngram_size': args.ngrams, 176 | 'alpha': args.alpha 177 | } 178 | trainer = SentimentTrainer(nn, converter, options) 179 | 180 | trainer.saver = saver(args.model, args.vectors) 181 | 182 | logger.info("... with the following parameters:") 183 | logger.info(trainer.nn.description()) 184 | 185 | return trainer 186 | 187 | 188 | def saver(model_file, vectors_file): 189 | """Function for saving model periodically""" 190 | 191 | def save(trainer): 192 | # save embeddings also separately 193 | if vectors_file: 194 | trainer.save_vectors(vectors_file) 195 | if model_file: 196 | trainer.save(model_file) 197 | 198 | return save 199 | 200 | 201 | def buildWordVector(tokens, size, tweet_w2v, tfidf): 202 | vec = np.zeros(size).reshape((1, size)) 203 | count = 0. 204 | for word in tokens: 205 | try: 206 | vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] 207 | count += 1. 208 | except KeyError: # handling the case where the token is not 209 | # in the corpus. useful for testing. 210 | continue 211 | if count != 0: 212 | vec /= count 213 | return vec 214 | 215 | 216 | def get_sswe_features(vocab_file, model_file): 217 | vocabs = [] 218 | models = [] 219 | with open(vocab_file, "rb") as vocablist: 220 | for vocab in vocablist: 221 | vocabs.append(vocab.rstrip()) 222 | 223 | with open(model_file, "rb") as modellist: 224 | for model in modellist: 225 | arr_model = model.split() 226 | models.append(np.array(map(float, arr_model))) 227 | # build our word embedding model vectorizer 228 | sswe_dict = dict(zip(vocabs, models)) 229 | return sswe_dict 230 | -------------------------------------------------------------------------------- /feature_extracting/SennaFeatureExtractor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from FeatureExtractor import FeatureExtractor 4 | from vectorizer.WordEmbeddingVectorizer import WordEmbeddingVectorizer 5 | 6 | from models.SentenceIterator import SentenceIterator 7 | 8 | 9 | class SennaFeatureExtractor(FeatureExtractor): 10 | """docstring for FeatureExtractor""" 11 | def __init__(self, dataset=None, infile=None, vocabfile=None, binary=False, dimen=100): 12 | self.model_file = infile 13 | self.vocab_file = vocabfile 14 | self.binary = binary 15 | self.dataset = dataset 16 | self.dimen = dimen 17 | 18 | def build(self): 19 | if self.model_file and self.vocab_file: 20 | vocabs = [] 21 | models = [] 22 | with open(self.vocab_file, "rb") as vocablist: 23 | for vocab in vocablist: 24 | vocabs.append(vocab.rstrip()) 25 | 26 | with open(self.model_file, "rb") as modellist: 27 | for model in modellist: 28 | arr_model = model.split() 29 | models.append(np.array(map(float, arr_model))) 30 | #modelss = models[:100] 31 | #vocabss = vocabs[:100] 32 | # build our word embedding model vectorizer 33 | senna_dict = dict(zip(vocabs, 
models))
34 |             sentences = SentenceIterator(self.dataset)
35 | 
36 |             self.vectorizer = WordEmbeddingVectorizer(senna_dict, self.dimen)
37 |             self.vectorizer.fit(sentences)
38 |         else:
39 |             pass
40 | 
41 |         return self
42 | 
43 |     def extract_existing_features(self, dataset):
44 |         return super(SennaFeatureExtractor, self).extract_features(dataset)
45 | 
46 |     def get_name(self):
47 |         return "SENNA C&W SSWE"
48 | 
--------------------------------------------------------------------------------
/feature_extracting/WordEmbeddingFeatureExtractor.py:
--------------------------------------------------------------------------------
1 | import gensim
2 | from FeatureExtractor import FeatureExtractor
3 | from vectorizer.WordEmbeddingVectorizer import WordEmbeddingVectorizer
4 | 
5 | from models.SentenceIterator import SentenceIterator
6 | 
7 | 
8 | class WordEmbeddingFeatureExtractor(FeatureExtractor):
9 |     """docstring for WordEmbeddingFeatureExtractor"""
10 |     def __init__(self, dataset=None, infile=None, binary=False, dimen=100, sswe=0):
11 |         super(WordEmbeddingFeatureExtractor, self).__init__(dataset)
12 |         self.model_file = infile
13 |         self.binary = binary
14 |         self.dimen = dimen
15 |         self.sswe = sswe
16 | 
17 |     def build(self):
18 |         if not self.model_file:
19 |             sentences = SentenceIterator(self.dataset)
20 |             w2v = gensim.models.Word2Vec(sentences, size=self.dimen, min_count=1)
21 |             word_vectors = w2v.wv
22 |             del w2v # free memory
23 |         else:
24 |             word_vectors = gensim.models.KeyedVectors.load_word2vec_format(self.model_file, binary=self.binary)
25 | 
26 |         # build our word embedding model vectorizer
27 |         # w2v_dict = dict(zip(w2v.index2word, w2v.syn0))
28 |         sentences = SentenceIterator(self.dataset)
29 | 
30 |         self.vectorizer = WordEmbeddingVectorizer(word_vectors, self.dimen)
31 |         self.vectorizer.fit(sentences)
32 | 
33 |         return self
34 | 
35 |     def extract_existing_features(self, dataset):
36 |         return super(WordEmbeddingFeatureExtractor, self).extract_features(dataset)
37 | 
38 |     def save_model_to_file(self, outfile, vocabfile=None, binary=True):
39 |         sentences = SentenceIterator(self.dataset)
40 |         w2v = gensim.models.Word2Vec(sentences, size=self.dimen, min_count=1, sg=1, workers=4, iter=10)
41 | 
42 |         w2v.wv.save_word2vec_format(outfile, fvocab=vocabfile, binary=binary)
43 | 
44 |     def get_name(self):
45 |         if self.sswe == 1:
46 |             return "SSWE + Word2Vec"
47 |         else:
48 |             return "Gensim Word2Vec"
49 | 
--------------------------------------------------------------------------------
/feature_extracting/__init__.py:
--------------------------------------------------------------------------------
1 | from feature_extracting.SennaFeatureExtractor import *
2 | from feature_extracting.WordEmbeddingFeatureExtractor import *
3 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from preprocessing.preprocesstweets import *
2 | from embedding.sswe_extractor import *
3 | from models import Dataset
4 | from classification.Classifier import Classifier
5 | from classification.NeuralNets import *
6 | from classification.Evaluator import Evaluator
7 | from feature_extracting import SennaFeatureExtractor
8 | from sklearn.model_selection import KFold
9 | from sklearn.preprocessing import scale
10 | from tqdm import tqdm
11 | import numpy as np
12 | 
13 | 
14 | def main():
15 |     """ Sentiment Specific Embedding for Twitter classification """
16 | 
17 |     embeddings_size = 50 # Embedding size for SSWE model
18 |     vocab_file = "Embedding/features/semeval_vocabs_200.txt" # path to the vocabulary file
19 |     vector_file = "Embedding/features/semeval_vectors_200.txt" # path to the vector file
20 |     stopwordsfile = "preprocess/stopwords.txt"
21 | 
22 |     """ Sentiment-Specific Word Embedding (SSWE) """
23 | 
24 |     if True:
25 |         # Load dataset
26 |         data_train = 'dataset/training1600000.csv' # training data set file path
27 |         pre_data_train = 'dataset/preprocessed_dataset1600000.csv' # file to save dataset after cleaning
28 | 
29 |         if True:
30 |             print("\n **** Dataset cleaning ****")
31 |             tweets_prepocess(data_train, pre_data_train, stopwordsfile)
32 | 
33 |         if True:
34 |             print("\n **** SSWE model Training ****")
35 |             train_model = None # path to an existing trained model file, if one is already available
36 |             save_model = "Embedding/models/SSWE_model_1600000_200" # path to the file where the model will be saved
37 |             sswe = create_sswe_model(pre_data_train, vocab_file, vector_file, train_model,
38 |                                      save_model, embeddings_size)
39 |             sswe_trainer(sswe)
40 | 
41 |     """ Embedding visualisation and Similarity computing """
42 | 
43 |     if True:
44 |         visualiser = Visualiser(sizeOfEmbedding=embeddings_size,
45 |                                 VocabsFname=vocab_file,
46 |                                 VectorsFname=vector_file,
47 |                                 WVFilename="Visualisation/data/w2vformat.txt",
48 |                                 visualizerHTMLfilename="Visualisation/data/embedding.html")
49 |         visualiser.visualize()
50 | 
51 |     """ Twitter Sentiment Classification """
52 | 
53 |     if True:
54 |         # Data pre-processing
55 | 
56 |         print("\n **** Training data cleaning ****")
57 |         pre_processing_train = "dataset/preprocessed_semeval_traindataset.csv"
58 |         # tweets_prepocess(train_set, pre_processing_train, stopwordsfile)
59 | 
60 |         print("\n **** Test data cleaning ****")
61 |         pre_processing_test = "dataset/preprocessed_semeval_testdataset.csv"
62 |         # tweets_prepocess(test_set, pre_processing_test, stopwordsfile)
63 | 
64 |         # LOAD TRAIN SET
65 |         dataset_train = Dataset.DatasetReview()
66 |         dataset_train.load_review_from_csv(pre_processing_train)
67 | 
68 |         # LOAD TEST SET
69 |         dataset_test = Dataset.DatasetReview()
70 |         dataset_test.load_review_from_csv(pre_processing_test)
71 | 
72 |         ################################### Neural Nets classifier ###########################
73 | 
74 |         # Extract Features
75 |         tweet2v = get_sswe_features(vocab_file, vector_file)
76 | 
77 |         # Extract samples and labels
78 |         x_train, y_train = split_data(dataset_train)
79 |         x_test, y_test = split_data(dataset_test)
80 | 
81 |         tfidf = build_tfidf(x_train)
82 | 
83 |         train_vecs_sswe = np.concatenate(
84 |             [buildWordVector(z.split(), embeddings_size,
85 |                              tweet2v, tfidf) for z in tqdm(map(lambda x: x, x_train))])
86 | 
87 |         train_vecs_sswe = scale(train_vecs_sswe)
88 | 
89 |         test_vecs_sswe = np.concatenate(
90 |             [buildWordVector(z.split(), embeddings_size,
91 |                              tweet2v, tfidf) for z in tqdm(map(lambda x: x, x_test))])
92 |         test_vecs_sswe = scale(test_vecs_sswe)
93 | 
94 |         # neural network model
95 |         neuralnets = NeuralNets(input_size=embeddings_size, x_train=train_vecs_sswe, y_train=y_train,
96 |                                 epochs=450, batch_size=32, x_test=test_vecs_sswe, y_test=y_test)
97 |         neuralnets.train_neural_nets()
98 | 
99 |         ##########################################################################################
100 |         ########
101 |         ######## Classical classifiers with sklearn
102 |         ########
103 |         ##########################################################################################
104 |         print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")
105 | 
106 |         fe_sswe =
SennaFeatureExtractor(infile=vector_file, vocabfile=vocab_file, dimen=embeddings_size) 107 | feature_extractors = [fe_sswe] 108 | ev = Evaluator() 109 | 110 | ################################# SVM ################################################### 111 | 112 | print ("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") 113 | model = Classifier(models="svm") 114 | kfold = KFold(n_splits=10) 115 | ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, 116 | training_set=dataset_train, num_fold=10, cv=kfold) 117 | ev.create_evaluation_result(model, feature_extractors=feature_extractors, 118 | training_set=dataset_train, num_fold=10, cv=kfold) 119 | 120 | print ("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") 121 | ev.eval_with_test_set(model, feature_extractors=feature_extractors, 122 | training_set=dataset_train, 123 | test_set=dataset_test) 124 | 125 | ################################### Naive bayes ########################################## 126 | 127 | print ("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") 128 | model = Classifier(models="multinomial") 129 | kfold = KFold(n_splits=10) 130 | ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, 131 | training_set=dataset_train, num_fold=10, cv=kfold) 132 | ev.create_evaluation_result(model, feature_extractors=feature_extractors, 133 | training_set=dataset_train, num_fold=10, cv=kfold) 134 | 135 | print ("\n**** TEST SET EVALUATION (CORPUS: DATASET) ****\n") 136 | ev.eval_with_test_set(model, feature_extractors=feature_extractors, 137 | training_set=dataset_train, 138 | test_set=dataset_test) 139 | 140 | ######################################### RandomForestClassifier ####################### 141 | 142 | print ("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") 143 | model = Classifier(models="rfc") 144 | kfold = KFold(n_splits=10) 145 | ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, 146 | training_set=dataset_train, num_fold=10, cv=kfold) 147 | ev.create_evaluation_result(model, feature_extractors=feature_extractors, 148 | training_set=dataset_train, num_fold=10, cv=kfold) 149 | 150 | print ("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") 151 | ev.eval_with_test_set(model, feature_extractors=feature_extractors, 152 | training_set=dataset_train, 153 | test_set=dataset_test) 154 | 155 | ######################################### MLPClassifier ####################### 156 | 157 | print ("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") 158 | model = Classifier(models="nn") 159 | kfold = KFold(n_splits=10) 160 | ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, 161 | training_set=dataset_train, num_fold=10, cv=kfold) 162 | ev.create_evaluation_result(model, feature_extractors=feature_extractors, 163 | training_set=dataset_train, num_fold=10, cv=kfold) 164 | 165 | print ("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") 166 | ev.eval_with_test_set(model, feature_extractors=feature_extractors, 167 | training_set=dataset_train, 168 | test_set=dataset_test) 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /models/Dataset.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import random 4 | from Review import Review 5 | from sklearn.model_selection import train_test_split 6 | 7 | class DatasetReview(): 8 | """docstring for 
Dataset""" 9 | 10 | def __init__(self): 11 | self.dataset = [] 12 | self.field_names = [] 13 | self.label_values = [] 14 | self.column_label = "" 15 | 16 | def load_review_from_csv(self, infile): 17 | with open(infile, "rb") as csvfile: 18 | reader = csv.DictReader(csvfile) 19 | 20 | # init field names & label column 21 | self.field_names = reader.fieldnames 22 | self.column_label = self.field_names[-1] 23 | 24 | for rows in reader: 25 | review = Review(rows[self.field_names[0]], rows[self.field_names[1]]) 26 | self.dataset.append(review) 27 | if self.label_values.count(rows[self.column_label]) == 0: 28 | self.label_values.append(rows[self.column_label]) 29 | 30 | return infile 31 | 32 | def dataset_from_array(self, dataset): 33 | n_dataset = DatasetReview() 34 | n_dataset.dataset = dataset 35 | n_dataset.field_names = self.field_names 36 | n_dataset.label_values = self.label_values 37 | n_dataset.column_label = self.column_label 38 | 39 | return n_dataset 40 | 41 | def dataset_from_contents_labels(self, contents, labels): 42 | arr_dataset = [] 43 | for i in xrange(len(contents)): 44 | dr = Review(contents[i], labels[i]) 45 | arr_dataset.append(dr) 46 | 47 | return self.dataset_from_array(arr_dataset) 48 | 49 | def get_dataset_size(self): 50 | return len(self.dataset) 51 | 52 | """get text content for datasets""" 53 | 54 | def get_contents(self): 55 | res = [] 56 | for data in self.dataset: 57 | res.append(data.content) 58 | 59 | return res 60 | 61 | """ get labels for all datasets """ 62 | 63 | def get_labels(self): 64 | res = [] 65 | for data in self.dataset: 66 | res.append(data.polarity) 67 | 68 | return res 69 | 70 | def get_label_enum(self): 71 | return self.label_values 72 | 73 | def get_dataset(self, idx): 74 | return self.dataset[idx] 75 | 76 | def get_formatted_dataset(self): 77 | res = [] 78 | for data in self.dataset: 79 | res.append(data.to_string()) 80 | 81 | return res 82 | 83 | def export_formatted_dataset(self, outfile): 84 | res = self.get_formatted_dataset() 85 | with open(outfile, "wb") as f: 86 | for row in res: 87 | f.write(row + "\n") 88 | 89 | return outfile 90 | 91 | def export_to_csv(self, outfile): 92 | with open(outfile, "wb") as csvfile: 93 | writer = csv.DictWriter(csvfile, fieldnames=self.field_names) 94 | writer.writeheader() 95 | for data in self.dataset: 96 | writer.writerow({ 97 | 'content': data.content, 98 | 'polarity': data.polarity 99 | }) 100 | 101 | return outfile 102 | 103 | def get_data_label_size(self, label): 104 | return sum(1 for x in self.dataset if x.polarity == label) 105 | 106 | def get_data_label(self, label): 107 | return [data for data in self.dataset if data.polarity == label] 108 | 109 | def get_sample_to_minority(self): 110 | if not self.dataset: 111 | return [] 112 | else: 113 | pos_sample = self.get_data_label_size("positive") 114 | neg_sample = self.get_data_label_size("negative") 115 | neu_sample = self.get_data_label_size("neutral") 116 | 117 | print "%d | %d | %d" % (pos_sample, neg_sample, neu_sample) 118 | t_dataset = [] 119 | if pos_sample > neg_sample: 120 | temp = self.get_data_label("positive") 121 | for x in xrange(0, neg_sample): 122 | idx = random.randint(0, len(temp) - 1) 123 | t_dataset.append(temp[idx]) 124 | 125 | # append the minority instance 126 | t_dataset.extend(self.get_data_label("negative")) 127 | m_dts = self.dataset_from_array(t_dataset) 128 | return m_dts 129 | 130 | elif neg_sample > pos_sample: 131 | temp = self.get_data_label("negative") 132 | for x in xrange(1, pos_sample): 133 | idx = 
random.randint(0, len(temp) - 1) 134 | t_dataset.append(temp[idx]) 135 | 136 | # append the minority instance 137 | t_dataset.extend(self.get_data_label("positive")) 138 | m_dts = self.dataset_from_array(t_dataset) 139 | return m_dts 140 | 141 | else: 142 | return self 143 | 144 | def split_to_ratio(self, ratio): 145 | X_train, X_test, y_train, y_test = train_test_split(self.get_contents(), self.get_labels(), test_size=ratio) 146 | 147 | dataset_train = self.dataset_from_contents_labels(X_train, y_train) 148 | dataset_test = self.dataset_from_contents_labels(X_test, y_test) 149 | 150 | return dataset_train, dataset_test 151 | 152 | def export_only_contents(self, outfile): 153 | with open(outfile, "wb") as ofile: 154 | for data in self.dataset: 155 | ofile.write(data.content + "\n") 156 | 157 | return outfile 158 | 159 | 160 | def main(infile): 161 | dataset = DatasetReview() 162 | dataset.load_review_from_csv(infile) 163 | print dataset.get_label_enum() 164 | dataset.export_formatted_dataset("formatted_dataset.txt") 165 | 166 | print "Positive instances: %d" % (dataset.get_data_label_size("positive")) 167 | print "Negative instances: %d" % (dataset.get_data_label_size("negative")) 168 | 169 | t_dataset = dataset.get_sample_to_minority() 170 | print "Positive instances: %d" % (t_dataset.get_data_label_size("positive")) 171 | print "Negative instances: %d" % (t_dataset.get_data_label_size("negative")) 172 | t_dataset.export_to_csv("sample.csv") 173 | 174 | 175 | if __name__ == '__main__': 176 | main(sys.argv[1]) 177 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from Dataset import DatasetReview 2 | 3 | -------------------------------------------------------------------------------- /preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from preprocesstweets import tweets_prepocess 2 | -------------------------------------------------------------------------------- /preprocessing/csv_header_change.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | 4 | inputFileName = "/root/PycharmProjects/project_text_mining/data/trained.csv" 5 | outputFileName = os.path.splitext(inputFileName)[0] + "_modified.csv" 6 | 7 | with open(inputFileName, 'rb') as inFile, open(outputFileName, 'wb') as outfile: 8 | r = csv.reader(inFile) 9 | w = csv.writer(outfile) 10 | 11 | next(r, None) # skip the first row from the reader, the old header 12 | # write new header 13 | w.writerow(['Polarity', 'ID', 'Date', 'Query','User','TWITTER_MESSAGE']) 14 | 15 | # copy the rest 16 | for row in r: 17 | 18 | 19 | w.writerow(row) -------------------------------------------------------------------------------- /preprocessing/csv_tsv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream 3 | 4 | from random import randint 5 | 6 | 7 | inputFileName = "/root/PycharmProjects/project_text_mining/data/trained.csv" 8 | 9 | with open(inputFileName) as csvfile : 10 | readCSV = csv.reader(csvfile, delimiter=',') 11 | 12 | 13 | 14 | 15 | for row in readCSV: 16 | 17 | 18 | if (row[0]=="0"): 19 | row[0]= "negative" 20 | elif (row[0]=="4"): 21 | row[0]= "positive" 22 | else : row [0]=="neutral" 23 | 24 | t1 = row[4] 25 | t2 = row[0] 26 | 27 | row[4] = row [0] 28 | 
row[0]=t1 29 | row[0]=row[1] 30 | row[1]=t1 31 | #print(row[1]) 32 | del(row[2]) 33 | del(row[2]) 34 | row[1]= str(randint(11111111, 99999999)) 35 | 36 | print '\t'.join(str(p) for p in row) 37 | #print "\t".join(row) 38 | 39 | 40 | -------------------------------------------------------------------------------- /preprocessing/preprocesstweets.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | 4 | 5 | def replaceTwoOrMore(s): 6 | # look for 2 or more repetitions of character 7 | patt = re.compile(r"(.)\1{1,}", re.DOTALL) 8 | return patt.sub(r"\1\1", s) 9 | 10 | 11 | def processTweet(tweet): 12 | # process the tweets 13 | # Convert to lower case 14 | tweet = tweet.lower() 15 | # Convert www.* or https?://* to URL 16 | tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'url', tweet) 17 | # Convert @username to AT_USER 18 | tweet = re.sub('@[^\s]+', 'at_user', tweet) 19 | # Remove additional white spaces 20 | tweet = re.sub('[\s]+', ' ', tweet) 21 | # Replace #word with word 22 | tweet = re.sub(r'#([^\s]+)', r'\1', tweet) 23 | # trim 24 | tweet = tweet.strip('\'"') 25 | return tweet 26 | 27 | 28 | def getStopWordList(fname): 29 | # read the stopwords 30 | stopWords = [] 31 | stopWords.append('rt') 32 | stopWords.append('url') 33 | stopWords.append('at_user') 34 | 35 | fp = open(fname, 'r') 36 | line = fp.readline() 37 | while line: 38 | word = line.strip() 39 | stopWords.append(word) 40 | line = fp.readline() 41 | fp.close() 42 | return stopWords 43 | 44 | 45 | def getFeatureVector(tweet, stopWords): 46 | featureVector = [] 47 | words = tweet.split() 48 | for w in words: 49 | # replace two or more with two occurrences 50 | w = replaceTwoOrMore(w) 51 | # strip punctuation 52 | w = w.strip('\'"?,.') 53 | # check if it consists of only words 54 | val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w) 55 | # ignore if it is a stopWord 56 | if (w in stopWords or val is None): 57 | continue 58 | else: 59 | featureVector.append(w.lower()) 60 | return featureVector 61 | 62 | 63 | def tweets_prepocess(inputfile, outputfile, stopwordfile): 64 | with open(inputfile, 'r') as f: 65 | readerCSV = csv.reader(f, delimiter=',', dialect='excel') 66 | #print(readerCSV) 67 | data = [] 68 | stopwords = getStopWordList(stopwordfile) 69 | for row in readerCSV: 70 | if len(row)>1: 71 | tweet = row[0] 72 | label = row[1] 73 | #print(tweet) 74 | tweet_processed = processTweet(replaceTwoOrMore(tweet)) 75 | tweet_features = getFeatureVector(tweet_processed, stopwords) 76 | #print(tweet) 77 | #print("tweet") 78 | #print(tweet_features) 79 | if len(tweet_features) > 3: 80 | tweet_tokens = ' '.join(tweet_features) 81 | #data.append(['" {} "'.format(tweet_tokens),'" {} "'.format(label)]) 82 | data.append(['" {} "'.format(tweet_tokens),label]) 83 | #data.append([tweet_tokens, label]) 84 | f.close() 85 | 86 | # preprocess and store in a new clean csv file 87 | with open(outputfile, "w") as fw: 88 | writer = csv.writer(fw, delimiter='\t', lineterminator='\n', dialect='excel', quoting=csv.QUOTE_NONE, 89 | quotechar=None) 90 | #writer = csv.writer(fw, delimiter='\t', lineterminator='\n', dialect='excel', quoting=csv.QUOTE_NONE, quotechar=None) 91 | #writer = csv.writer(fw, delimiter='\t', dialect='excel-tab',quoting=csv.QUOTE_NONE, quotechar=None) 92 | for x in data: 93 | if len(x[1]) != 0: 94 | writer.writerow(x) 95 | fw.close() 96 | -------------------------------------------------------------------------------- /preprocessing/process_json_file.py: 
-------------------------------------------------------------------------------- 1 | # Import the necessary package to process data in JSON format 2 | try: 3 | import json 4 | except ImportError: 5 | import simplejson as json 6 | import logging 7 | from pymongo import MongoClient 8 | 9 | logger = logging.Logger('catch_all') 10 | 11 | tweet_filename = "/root/PycharmProjects/project_text_mining/tweets.txt" 12 | tweet_file = open(tweet_filename,'r') 13 | keys_to_delete = ["favorited","contributors","truncated","possibly_sensitive","is_quote_status"\ 14 | ,"in_reply_to_status_id","filter_level","geo","favorite_count","extended_tweet"\ 15 | ,"entities","in_reply_to_user_id_str","retweeted","coordinates","timestamp_ms"\ 16 | ,"source","in_reply_to_status_id_str","in_reply_to_screen_name","display_text_range"\ 17 | ,"place","retweet_count","in_reply_to_user_id","user","_id"] 18 | 19 | client = MongoClient('localhost', 27017) 20 | db = client.twitter_db 21 | 22 | 23 | for line in tweet_file: 24 | try: 25 | tweet = json.loads(line.strip()) 26 | tweet["text"] = tweet["extended_tweet"]["full_text"].encode("utf-8") 27 | for key in keys_to_delete: 28 | if key in tweet: 29 | del tweet[key] 30 | 31 | posts = db.twitter_db 32 | posts.insert(tweet) 33 | print(tweet) 34 | 35 | except Exception as e: 36 | # read in a line is not in JSON format (sometimes error occured) 37 | logger.error(e, exc_info=True) 38 | print(e) 39 | tweet_file.close() 40 | 41 | 42 | 43 | """ 44 | 45 | try : 46 | tweet = json.loads(line.strip()) 47 | keys.append(tweet.keys()) 48 | new_dict = {"text": tweet["text"] for key in keys} 49 | print(new_dict) 50 | #dict_i_want = {key: tweet[key] for key in keys} 51 | #print dict_i_want 52 | #if 'extended_tweet' in tweet.keys() : 53 | #print tweet['extended_tweet']['full_text'].encode('utf-8') # content of the tweet 54 | except Exception as e: 55 | # read in a line is not in JSON format (sometimes error occured) 56 | logger.error(e, exc_info=True) 57 | print(e) 58 | tweet_file.close() 59 | """ -------------------------------------------------------------------------------- /preprocessing/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | across 5 | after 6 | afterwards 7 | again 8 | against 9 | all 10 | almost 11 | alone 12 | along 13 | already 14 | also 15 | although 16 | always 17 | am 18 | among 19 | amongst 20 | amoungst 21 | amount 22 | an 23 | and 24 | another 25 | any 26 | anyhow 27 | anyone 28 | anything 29 | anyway 30 | anywhere 31 | are 32 | around 33 | as 34 | at 35 | back 36 | be 37 | became 38 | because 39 | become 40 | becomes 41 | becoming 42 | been 43 | before 44 | beforehand 45 | behind 46 | being 47 | below 48 | beside 49 | besides 50 | between 51 | beyond 52 | bill 53 | both 54 | bottom 55 | but 56 | by 57 | call 58 | can 59 | cannot 60 | cant 61 | co 62 | computer 63 | con 64 | could 65 | couldnt 66 | cry 67 | de 68 | describe 69 | detail 70 | do 71 | done 72 | down 73 | due 74 | during 75 | each 76 | eg 77 | eight 78 | either 79 | eleven 80 | else 81 | elsewhere 82 | empty 83 | enough 84 | etc 85 | even 86 | ever 87 | every 88 | everyone 89 | everything 90 | everywhere 91 | except 92 | few 93 | fifteen 94 | fify 95 | fill 96 | find 97 | fire 98 | first 99 | five 100 | for 101 | former 102 | formerly 103 | forty 104 | found 105 | four 106 | from 107 | front 108 | full 109 | further 110 | get 111 | give 112 | go 113 | had 114 | has 115 | hasnt 116 | have 117 | he 118 | hence 119 | her 120 | here 121 | 
hereafter 122 | hereby 123 | herein 124 | hereupon 125 | hers 126 | herse" 127 | him 128 | himse" 129 | his 130 | how 131 | however 132 | hundred 133 | i 134 | ie 135 | if 136 | in 137 | inc 138 | indeed 139 | interest 140 | into 141 | is 142 | it 143 | its 144 | itse" 145 | keep 146 | last 147 | latter 148 | latterly 149 | least 150 | less 151 | ltd 152 | made 153 | many 154 | may 155 | me 156 | meanwhile 157 | might 158 | mill 159 | mine 160 | more 161 | moreover 162 | most 163 | mostly 164 | move 165 | much 166 | must 167 | my 168 | myse" 169 | name 170 | namely 171 | neither 172 | never 173 | nevertheless 174 | next 175 | nine 176 | no 177 | nobody 178 | none 179 | noone 180 | nor 181 | not 182 | nothing 183 | now 184 | nowhere 185 | of 186 | off 187 | often 188 | on 189 | once 190 | one 191 | only 192 | onto 193 | or 194 | other 195 | others 196 | otherwise 197 | our 198 | ours 199 | ourselves 200 | out 201 | over 202 | own 203 | part 204 | per 205 | perhaps 206 | please 207 | put 208 | rather 209 | re 210 | same 211 | see 212 | seem 213 | seemed 214 | seeming 215 | seems 216 | serious 217 | several 218 | she 219 | should 220 | show 221 | side 222 | since 223 | sincere 224 | six 225 | sixty 226 | so 227 | some 228 | somehow 229 | someone 230 | something 231 | sometime 232 | sometimes 233 | somewhere 234 | still 235 | such 236 | system 237 | take 238 | ten 239 | than 240 | that 241 | the 242 | their 243 | them 244 | themselves 245 | then 246 | thence 247 | there 248 | thereafter 249 | thereby 250 | therefore 251 | therein 252 | thereupon 253 | these 254 | they 255 | thick 256 | thin 257 | third 258 | this 259 | those 260 | though 261 | three 262 | through 263 | throughout 264 | thru 265 | thus 266 | to 267 | together 268 | too 269 | top 270 | toward 271 | towards 272 | twelve 273 | twenty 274 | two 275 | un 276 | under 277 | until 278 | up 279 | upon 280 | us 281 | very 282 | via 283 | was 284 | we 285 | well 286 | were 287 | what 288 | whatever 289 | when 290 | whence 291 | whenever 292 | where 293 | whereafter 294 | whereas 295 | whereby 296 | wherein 297 | whereupon 298 | wherever 299 | whether 300 | which 301 | while 302 | whither 303 | who 304 | whoever 305 | whole 306 | whom 307 | whose 308 | why 309 | will 310 | with 311 | within 312 | without 313 | would 314 | yet 315 | you 316 | your 317 | yours 318 | yourself 319 | yourselves 320 | , 321 | . 322 | http 323 | https 324 | @ 325 | ? 326 | ! 327 | - 328 | http... 
329 | : 330 | 1 331 | 2 332 | 3 333 | 4 334 | 5 335 | 6 336 | 7 337 | 8 338 | 9 339 | rt 340 | 's 341 | 't 342 | amp -------------------------------------------------------------------------------- /preprocessing/store_tweet_in_db.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import json 3 | try: 4 | import json 5 | except ImportError: 6 | import simplejson as json 7 | import logging 8 | logger = logging.Logger('catch_all') 9 | 10 | client = MongoClient('localhost', 27017) 11 | db = client.twitter_db 12 | 13 | tweet_filename = "/root/PycharmProjects/untitled1/full_tweet.txt" 14 | tweet_file = open(tweet_filename,'r') 15 | 16 | for line in tweet_file: 17 | try: 18 | tweet = json.loads(line.strip()) 19 | print(type(json)) 20 | #posts = db.posts 21 | #posts.insert(post) 22 | except Exception as e: 23 | # read in a line is not in JSON format (sometimes error occured) 24 | logger.error(e, exc_info=True) 25 | print(e) -------------------------------------------------------------------------------- /preprocessing/twitter_streaming.py: -------------------------------------------------------------------------------- 1 | 2 | # Import the necessary package to process data in JSON format 3 | try: 4 | import json 5 | except ImportError: 6 | import simplejson as json 7 | # Import the necessary methods from "twitter" library 8 | from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream 9 | 10 | ### Secret information for the authentication with the twitter API 11 | ACCESS_TOKEN = '172760940-WFyHDrF0gwA0aj7snvx8KFuaUgyG8L6hB3Wndi9b' 12 | ACCESS_SECRET = 'qW74TbTr6O98zEVn91tc2J8jkiSAAMRO7k91WzD4yx5MH' 13 | CONSUMER_KEY = 'U5gIP4mJsHVIB1GKTLglHeOoE' 14 | CONSUMER_SECRET = 'jm1lPsrRa7QYoQKW7thbN3stqWV8e6wl7ybM3AgqWJMoIMFAkq' 15 | 16 | oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET) 17 | twitter_stream = TwitterStream(auth=oauth) 18 | iterator = twitter_stream.statuses.sample(language="en") 19 | 20 | 21 | #specify the tweets number to collect 22 | tweet_count = 10 23 | keys = [] 24 | for tweet in iterator: 25 | if 'extended_tweet' in tweet.keys(): 26 | tweet_count -= 1 27 | # Twitter Python Tool wraps the data returned by Twitter 28 | # as a TwitterDictResponse object. 29 | # We convert it back to the JSON format to print/score 30 | print json.dumps(tweet) 31 | 32 | 33 | 34 | 35 | # The command below will do pretty printing for JSON data, try it out 36 | # print json.dumps(tweet, indent=4) 37 | 38 | if tweet_count <= 0: 39 | 40 | break 41 | 42 | -------------------------------------------------------------------------------- /text_mining_project_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AzizCode92/Learning-Sentiment-Specific-Word-embedding-for-Twitter-Sentiment-Classification/7deabbb5f532065ff4f598781dee2b2c58fb1464/text_mining_project_report.pdf --------------------------------------------------------------------------------