├── .gitignore ├── README.md ├── classification ├── Classifier.py ├── Evaluator.py ├── NeuralNets.py └── __init__.py ├── embedding ├── __init__.py └── sswe_extractor.py ├── feature_extracting ├── SennaFeatureExtractor.py ├── WordEmbeddingFeatureExtractor.py └── __init__.py ├── main.py ├── models ├── Dataset.py └── __init__.py ├── preprocessing ├── __init__.py ├── csv_header_change.py ├── csv_tsv.py ├── preprocesstweets.py ├── process_json_file.py ├── stopwords.txt ├── store_tweet_in_db.py └── twitter_streaming.py └── text_mining_project_report.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning-Sentiment-Specific-Word-embedding-for-Twitter-Sentiment-Classification 2 | -------------------------------------------------------------------------------- /classification/Classifier.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from sklearn import metrics 4 | 5 | from feature_extractor import FeatureExtractor 6 | from sklearn.naive_bayes import MultinomialNB 7 | from sklearn.neural_network import MLPClassifier 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.svm import SVC 10 | from sklearn.ensemble import RandomForestClassifier 11 | 12 | 13 | class Classifier(object): 14 | """docstring for Classifier""" 15 | 16 | def __init__(self, models="multinomial"): 17 | super(Classifier, self).__init__() 18 | self.models = models 19 | if models == "multinomial": 20 | self.classifier = MultinomialNB() 21 | elif models == "svm": 22 | self.classifier = SVC(kernel='linear') 23 | elif models == "rfc": 24 | self.classifier = RandomForestClassifier() 25 | elif models == "nn": 26 | 
            self.classifier = MLPClassifier()
27 | 
28 |     def classify(self, dataset):
29 |         contents = dataset.get_contents()
30 |         labels = dataset.get_labels()
31 |         return self.classify_raw(contents, labels)
32 | 
33 |     def classify_raw(self, dataset, labels):
34 |         self.classifier = self.classifier.fit(dataset, labels)
35 |         return self.classifier
36 | 
37 |     """Return predictions for dataset using Dataset class"""
38 | 
39 |     def test(self, dataset):
40 |         contents = dataset.get_contents()
41 |         return self.test_raw(contents)
42 | 
43 |     """Return predictions for dataset using raw array dataset"""
44 | 
45 |     def test_raw(self, dataset):
46 |         predictions = self.classifier.predict(dataset)
47 |         return predictions
48 | 
49 |     def get_classifier_type(self):
50 |         if self.models == "multinomial":
51 |             return "Multinomial Naive-Bayes"
52 |         elif self.models == "svm":
53 |             return "Support Vector Machine"
54 |         elif self.models == "rfc":
55 |             return "Random Forest Classifier"
56 |         elif self.models == "nn":
57 |             return "Multilayer Perceptron (Neural Network)"
58 |         else:
59 |             return "Unknown classifier"
60 | 
61 | 
62 | def main(filename):
63 |     fe = FeatureExtractor("tfidf", filename)
64 |     fe.load_dataset()
65 |     fe.load_labels()
66 | 
67 |     bow = fe.build_bag()
68 |     bag = fe.build_tfidf()
69 | 
70 |     print "** Using Multinomial NB Models **"
71 | 
72 |     # TFIDF
73 |     clf = Classifier(models="multinomial")
74 |     clf.classify_raw(bag, fe.raw_labels)
75 | 
76 |     preds = clf.test_raw(bag)
77 |     # for doc, cat in zip(fe.dataset, preds):
78 |     #     print "%r => %s" % (doc, cat)
79 | 
80 |     print "TFIDF accuracy score: %f" % (metrics.accuracy_score(fe.raw_labels, preds, normalize=True))
81 |     f1_pos = metrics.f1_score(fe.raw_labels, preds, pos_label='positive')
82 |     f1_neg = metrics.f1_score(fe.raw_labels, preds, pos_label='negative')
83 |     f1_neu = metrics.f1_score(fe.raw_labels, preds, pos_label='neutral')
84 |     print "TFIDF F1 score: %f" % f1_pos
85 |     print "TFIDF F1 negative score: %f" % f1_neg
86 |     print "TFIDF F1 neutral score: %f" % f1_neu
87 | 
88 |     print "\nAverage F-measure: %f" % ((f1_pos + f1_neg + f1_neu) / 3)
89 | 
90 |     # bag of words
91 |     clf = Classifier(models="multinomial")
92 |     clf.classify_raw(bow, fe.raw_labels)
93 |     preds = clf.test_raw(bow)
94 | 
95 |     print "BOW accuracy score: %f" % (metrics.accuracy_score(fe.raw_labels, preds, normalize=True))
96 |     print "BOW F1 score: %f" % (metrics.f1_score(fe.raw_labels, preds, pos_label='positive'))
97 | 
98 |     print "\n** Using SVM **"
99 | 
100 |     # TFIDF
101 |     clf = Classifier(models="svm")
102 |     clf.classify_raw(bag, fe.raw_labels)
103 | 
104 |     preds = clf.test_raw(bag)
105 |     # for doc, cat in zip(fe.dataset, preds):
106 |     #     print "%r => %s" % (doc, cat)
107 | 
108 |     print "TFIDF accuracy score: %f" % (metrics.accuracy_score(fe.raw_labels, preds, normalize=True))
109 | 
110 |     # bag of words
111 |     clf = Classifier(models="svm")
112 |     clf.classify_raw(bow, fe.raw_labels)
113 |     preds = clf.test_raw(bow)
114 | 
115 |     print "BOW accuracy score: %f" % (metrics.accuracy_score(fe.raw_labels, preds, normalize=True))
116 | 
117 |     X_train, X_test, y_train, y_test = train_test_split(bow, fe.raw_labels, test_size=0.4, random_state=0)
118 |     clf = Classifier(models="svm")
119 |     clf.classify_raw(X_train, y_train)
120 |     preds = clf.test_raw(X_test)
121 | 
122 |     print "Using 60/40, BOW accuracy: %f" % (metrics.accuracy_score(y_test, preds, normalize=True))
123 |     print "Using 60/40, BOW F1: %f" % (metrics.f1_score(y_test, preds, pos_label='positive'))
124 | 
125 | 
126 | if __name__ == '__main__':
127 |     main(sys.argv[1])
128 | 
-------------------------------------------------------------------------------- /classification/Evaluator.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import accuracy_score 2 | from sklearn.metrics import f1_score 3 | from sklearn.metrics import confusion_matrix 4 | from sklearn.model_selection import cross_val_score, cross_val_predict 5 | from sklearn.model_selection import KFold 6 | 7 | import copy 8 | import csv 9 | 10 | import numpy as np 11 | 12 | verbose_level = 0 # verbose level 13 | n_job = 3 # number of CPU used in evaluation 14 | seed = 7 # seed for our random state cross validation 15 | 16 | from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk. 17 | tokenizer = TweetTokenizer() 18 | from sklearn.preprocessing import scale 19 | 20 | class Evaluator(object): 21 | """docstring for Evaluator""" 22 | def __init__(self): 23 | super(Evaluator, self).__init__() 24 | 25 | def eval_with_test_set(self, model, feature_extractors, training_set, test_set, outfile="results_test.csv"): 26 | if not isinstance(feature_extractors, list): 27 | return 28 | 29 | training_contents = training_set.get_contents() 30 | training_labels = training_set.get_labels() 31 | 32 | # build training features 33 | 34 | # Print properties 35 | print "Evaluation method: Test Set" 36 | print "Classifier: %s" % (model.get_classifier_type()) 37 | 38 | test_contents = test_set.get_contents() 39 | test_labels = test_set.get_labels() 40 | 41 | field_names = ["id", "content", "polarity"] 42 | fe_predictions = dict() 43 | 44 | for feature_extractor in feature_extractors: 45 | fe = copy.copy(feature_extractor) 46 | print "\nFeature Extractor: %s" % (fe.get_name()) 47 | field_names.append(fe.get_name()) 48 | 49 | # build our feature extractor from the training dataset contents 50 | 51 | fe.set_dataset(training_contents) 52 | fe.build() 53 | training_contents = [tweet.split() for tweet in training_contents] 54 | 55 | training_features = fe.extract_features(training_contents) 56 | #print("training features :") 57 | #print(training_features) 58 | # build features for our test dataset 59 | test_contents = [tweet.split() for tweet in test_contents] 60 | test_features = fe.extract_existing_features(test_contents) 61 | #print("test features :") 62 | #print(test_features) 63 | # build training models 64 | model.classify_raw(training_features, training_labels) 65 | 66 | # start evaluating with test set 67 | test_predictions = model.test_raw(test_features) 68 | fe_predictions[fe.get_name()] = test_predictions 69 | 70 | # evaluate confusion matrix 71 | cnf_matrix = confusion_matrix(test_labels, test_predictions,labels=['positive', 'negative','neutral']) 72 | 73 | print "Average F-measure: %f" % (f1_score(test_labels, test_predictions, average='macro')) 74 | print "Average accuracy : %f" % (f1_score(test_labels, test_predictions, average='micro')) 75 | print "\nConfusion Matrix:" 76 | print "\t\tPositive\tNegative\tNeutral (predicted labels)" 77 | print "Positive\t%d\t\t%d\t\t%d" % (cnf_matrix[0][0], cnf_matrix[0][1],cnf_matrix[0][2]) 78 | print "Negative\t%d\t\t%d\t\t%d" % (cnf_matrix[1][0], cnf_matrix[1][1],cnf_matrix[1][2]) 79 | print "Neutral \t%d\t\t%d\t\t%d" % (cnf_matrix[2][0], cnf_matrix[2][1],cnf_matrix[2][2]) 80 | print "(actual labels)\n" 81 | 82 | with open(outfile, "wb") as csvfile: 83 | writer = csv.DictWriter(csvfile, fieldnames=field_names) 84 | writer.writeheader() 85 | for i in xrange(len(test_contents)): 86 | row = { 87 | 'id': i + 1, 88 | 
'content': test_contents[i], 89 | 'polarity': test_labels[i], 90 | } 91 | # append results 92 | for j in xrange(len(feature_extractors)): 93 | row[feature_extractors[j].get_name()] = fe_predictions[feature_extractors[j].get_name()][i] 94 | 95 | writer.writerow(row) 96 | 97 | def eval_with_cross_validation(self, model, feature_extractors, training_set, num_fold=10, cv=None): 98 | if not isinstance(feature_extractors, list): 99 | return 100 | 101 | # if model 102 | training_contents = training_set.get_contents() 103 | training_labels = training_set.get_labels() 104 | 105 | # Print properties 106 | print "Evaluation method: Cross Validation" 107 | print "Number of Folds: %d" % (num_fold) 108 | print "Classifier: %s" % (model.get_classifier_type()) 109 | 110 | if not cv: 111 | kfold = KFold(n_splits=num_fold, random_state=seed) 112 | else: 113 | kfold = cv 114 | 115 | for feature_extractor in feature_extractors: 116 | fe = copy.copy(feature_extractor) 117 | print "\nFeature Extractor: %s" % (fe.get_name()) 118 | 119 | # build our feature extractor from the dataset contents 120 | fe.set_dataset(training_contents) 121 | fe.build() 122 | training_contents = [tweet.split() for tweet in training_contents] 123 | training_features = fe.extract_features(training_contents) 124 | # obtain our classification results 125 | # measure is done by using macro F1 score 126 | scores = cross_val_score(model.classifier, X=training_features, 127 | y=training_labels, cv=kfold, n_jobs=n_job, 128 | scoring='f1_macro', verbose=verbose_level) 129 | 130 | # print each of the iteration scroe 131 | for i in xrange(0, len(scores)): 132 | print "Iteration %d = %f" % (i + 1, scores[i]) 133 | 134 | print "Average score: %f" % (scores.mean()) 135 | print "Standard Deviation: %f" % (scores.std()) 136 | print "Maximum F1-score: %f" % (np.amax(scores)) 137 | 138 | 139 | def create_evaluation_result(self, model, feature_extractors, training_set, num_fold=10, outfile="results_cv.csv", cv=None): 140 | if not isinstance(feature_extractors, list): 141 | return 142 | 143 | # if model 144 | training_contents = training_set.get_contents() 145 | training_labels = training_set.get_labels() 146 | 147 | # Print properties 148 | print "Evaluation method: Cross Validation" 149 | print "Number of Folds: %d" % (num_fold) 150 | print "Classifier: %s" % (model.get_classifier_type()) 151 | 152 | field_names = ["id", "content", "polarity"] 153 | fe_predictions = dict() 154 | 155 | if not cv: 156 | kfold = KFold(n_splits=num_fold, random_state=seed) 157 | else: 158 | kfold = cv 159 | 160 | for feature_extractor in feature_extractors: 161 | fe = copy.copy(feature_extractor) 162 | field_names.append(fe.get_name()) 163 | 164 | # build our feature extractor from the dataset contents 165 | fe.set_dataset(training_contents) 166 | fe.build() 167 | training_contents = [tweet.split() for tweet in training_contents] 168 | training_features = fe.extract_features(training_contents) 169 | # obtain our classification results 170 | # measure is done by using macro F1 score 171 | predictions = cross_val_predict(model.classifier, X=training_features, 172 | y=training_labels, cv=kfold, n_jobs=n_job, 173 | verbose=verbose_level,fit_params={}) 174 | fe_predictions[fe.get_name()] = predictions 175 | 176 | with open(outfile, "wb") as csvfile: 177 | writer = csv.DictWriter(csvfile, fieldnames=field_names) 178 | writer.writeheader() 179 | for i in xrange(len(training_contents)): 180 | row = { 181 | 'id': i + 1, 182 | 'content': training_contents[i], 183 | 'polarity': 
training_labels[i], 184 | } 185 | # append results 186 | for j in xrange(len(feature_extractors)): 187 | row[feature_extractors[j].get_name()] = fe_predictions[feature_extractors[j].get_name()][i] 188 | 189 | writer.writerow(row) 190 | 191 | return outfile 192 | -------------------------------------------------------------------------------- /classification/NeuralNets.py: -------------------------------------------------------------------------------- 1 | from keras.layers import Dense 2 | from keras.optimizers import Adam 3 | from keras.models import Sequential 4 | from sklearn.feature_extraction.text import TfidfVectorizer 5 | from sklearn.metrics import f1_score 6 | from sklearn.preprocessing import scale 7 | from tqdm import tqdm 8 | import numpy as np 9 | from sklearn.metrics import confusion_matrix 10 | 11 | 12 | def split_data(dataset): 13 | 14 | x_samples = dataset.get_contents() 15 | y_labels = dataset.get_labels() 16 | y = [] 17 | for x in y_labels: 18 | if x=="positive": 19 | y.append(1) 20 | elif x=="negative": 21 | y.append(-1) 22 | else: 23 | y.append(0) 24 | 25 | return x_samples,y 26 | 27 | 28 | def build_tfidf(x_train): 29 | print 'building tf-idf matrix ...' 30 | vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10) 31 | vectorizer.fit_transform([x.split() for x in x_train]) 32 | tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)) 33 | print 'vocab size :', len(tfidf) 34 | return tfidf 35 | 36 | 37 | class NeuralNets(object): 38 | def __init__(self, input_size=100,x_train=None, y_train=None, 39 | epochs=20, batch_size=32,x_test=None, y_test=None): 40 | self.inputdim = input_size 41 | self.xtrain = x_train 42 | self.ytrain = y_train 43 | self.epochs = epochs 44 | self.xtest = x_test 45 | self.ytest = y_test 46 | self.batchsize = batch_size 47 | 48 | 49 | def train_neural_nets(self): 50 | 51 | "*** Train Neural Networks model ***" 52 | 53 | model = Sequential() 54 | model.add(Dense(200, activation='relu', input_dim=self.inputdim)) 55 | #model.add(Dense(32, activation='softsign')) 56 | model.add(Dense(1, activation='sigmoid')) 57 | model.compile(optimizer=Adam(lr=0.01),loss='binary_crossentropy',metrics=['accuracy','mse','mae']) 58 | print("\n Training Neural Network Classifier with Training dataset") 59 | model.fit(self.xtrain, self.ytrain, epochs=self.epochs, batch_size=self.batchsize, verbose=2) 60 | print("\n Evaluating Neural Network Classifier on Test dataset") 61 | score = model.evaluate(self.xtest, self.ytest, batch_size=128, verbose=2) 62 | print("{} is {}".format("accuracy",score[1])) 63 | print("{} is {}".format("mse: ",score[2])) 64 | print("{} is {}".format("mae: ", score[3])) 65 | y_predictions = model.predict(self.xtest, batch_size=128, verbose=2) 66 | y_pred = np.around(y_predictions) 67 | y_pred = [int(x) for x in y_pred.flatten().tolist()] 68 | cnf_matrix = confusion_matrix(self.ytest, y_pred,labels=[1,-1,0]) 69 | print "Average F-measure: %f" % (f1_score(self.ytest, y_pred, average='macro')) 70 | print "\n Confusion Matrix:" 71 | print "\t\tPositive\tNegative\tNeutral (predicted labels)" 72 | print "Positive\t%d\t\t%d\t\t%d" % (cnf_matrix[0][0], cnf_matrix[0][1], cnf_matrix[0][2]) 73 | print "Negative\t%d\t\t%d\t\t%d" % (cnf_matrix[1][0], cnf_matrix[1][1], cnf_matrix[1][2]) 74 | print "Neutral \t%d\t\t%d\t\t%d" % (cnf_matrix[2][0], cnf_matrix[2][1], cnf_matrix[2][2]) 75 | print "(actual labels)\n" 76 | -------------------------------------------------------------------------------- /classification/__init__.py: 
-------------------------------------------------------------------------------- 1 | from Classifier import Classifier 2 | from Evaluator import Evaluator 3 | from NeuralNets import NeuralNets 4 | -------------------------------------------------------------------------------- /embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from embedding.sswe_extractor import * 2 | -------------------------------------------------------------------------------- /embedding/sswe_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | import logging 4 | import numpy as np 5 | from ConfigParser import ConfigParser 6 | from itertools import chain 7 | # local 8 | from deepnl import * 9 | from deepnl.extractors import * 10 | from deepnl.reader import TweetReader 11 | from deepnl.network import Network 12 | from deepnl.sentiwords import SentimentTrainer 13 | 14 | 15 | # ---------------------------------------------------------------------- 16 | # ---------------------------------------------------------------------- 17 | class sswe_model(object): 18 | def __init__(self, window=3, embeddings_size=50, epochs=100, learning_rate=0.001, 19 | eps=1e-8, ro=0.95, hidden=200, ngrams=2, textField=0, 20 | tagField=1, alpha=0.5, train=None, model=None, 21 | vocab=None, minOccurr=3, vocab_size=0, vectors=None, load=None, 22 | threads=5, variant=None, verbose=None, config_file=None): 23 | self.window = window 24 | self.embeddings_size = embeddings_size 25 | self.iterations = epochs 26 | self.learning_rate = learning_rate 27 | self.eps = eps 28 | self.ro = ro 29 | self.hidden = hidden 30 | self.ngrams = ngrams 31 | self.textField = textField 32 | self.tagField = tagField 33 | self.alpha = alpha 34 | self.train = train 35 | self.vocab = vocab 36 | self.minOccurr = minOccurr 37 | self.vocab_size = vocab_size 38 | self.vectors = vectors 39 | self.load = load 40 | self.variant = variant 41 | self.verbose = verbose 42 | self.model = model 43 | self.threads = threads 44 | self.config_file = config_file 45 | 46 | 47 | 48 | def create_sswe_model(train_filename, vocab_file, vector_file, train_model, save_model, size): 49 | """model parameters: you can customize other parameters in the class sswe_mode()""" 50 | emb_size = size # Number of features per word 51 | epochs = 100 # Number of training epochs 52 | l_r = 0.1 # Learning rate for network weights 53 | hidden = 200 # Number of hidden neurons 54 | ngrams = 2 # Length of ngrams 55 | text = 0 # field containing text 56 | tag = 1 # field containing polarity 57 | train = train_filename # File with text corpus for training 58 | model = save_model # File where to save the model 59 | vocab = vocab_file # Vocabulary file, either read and updated or created 60 | vectors = vector_file # Embeddings file, either read and updated or created 61 | load = train_model # Load previously saved model 62 | threads = 15 # Number of threads 63 | variant = None 64 | 65 | sswe = sswe_model(embeddings_size=emb_size, epochs=epochs, learning_rate=l_r, threads=threads, 66 | hidden=hidden, ngrams=ngrams, textField=text, tagField=tag, train=train, 67 | model=model, vocab=vocab, minOccurr=3, vectors=vectors, load=load, variant=variant) 68 | return sswe 69 | 70 | 71 | 72 | def sswe_trainer(model_parameters): 73 | # set the seed for replicability 74 | np.random.seed(42) 75 | # args = parser.parse_args() 76 | args = model_parameters 77 | log_format = 
'%(message)s' 78 | log_level = logging.DEBUG if args.verbose else logging.INFO 79 | log_level = logging.INFO 80 | logging.basicConfig(format=log_format, level=log_level) 81 | logger = logging.getLogger("Logger") 82 | 83 | config = ConfigParser() 84 | if args.config_file: 85 | config.read(args.config_file) 86 | # merge args with config 87 | reader = TweetReader(text_field=args.textField, label_field=args.tagField, ngrams=args.ngrams) 88 | reader.read(args.train) 89 | vocab, bigrams, trigrams = reader.create_vocabulary(reader.sentences, args.vocab_size, 90 | min_occurrences=args.minOccurr) 91 | #print("length vocab") 92 | #print(len(vocab)) 93 | if args.variant == 'word2vec' and os.path.exists(args.vectors): 94 | embeddings = Embeddings(vectors=args.vectors, variant=args.variant) 95 | embeddings.merge(vocab) 96 | logger.info("Saving vocabulary in %s" % args.vocab) 97 | embeddings.save_vocabulary(args.vocab) 98 | elif os.path.exists(args.vocab): 99 | # start with the given vocabulary 100 | b_vocab = reader.load_vocabulary(args.vocab) 101 | bound = len(b_vocab)-len(bigrams)-len(trigrams) 102 | base_vocab=b_vocab[:bound] 103 | #print("length base vocab :") 104 | #print(len(base_vocab)) 105 | if os.path.exists(args.vectors): 106 | # load embeddings 107 | embeddings = Embeddings(vectors=args.vectors, vocab=base_vocab, variant=args.variant) 108 | else: 109 | # create embeddings 110 | embeddings = Embeddings(args.embeddings_size, vocab=base_vocab, variant=args.variant) 111 | # add the ngrams from the corpus 112 | embeddings.merge(vocab) 113 | logger.info("Overriding vocabulary in %s" % args.vocab) 114 | embeddings.save_vocabulary(args.vocab) 115 | else: 116 | embeddings = Embeddings(args.embeddings_size, vocab=vocab, variant=args.variant) 117 | logger.info("Saving vocabulary in %s" % args.vocab) 118 | embeddings.save_vocabulary(args.vocab) 119 | 120 | # Assume bigrams are prefix of trigrams, or else we should put a terminator 121 | # on trie 122 | trie = {} 123 | for b in chain(bigrams, trigrams): 124 | tmp = trie 125 | for w in b: 126 | tmp = tmp.setdefault(embeddings.dict[w], {}) 127 | 128 | converter = Converter() 129 | converter.add(embeddings) 130 | 131 | trainer = create_trainer(args, converter) 132 | 133 | report_intervals = max(args.iterations / 200, 1) 134 | report_intervals = 10000 # DEBUG 135 | 136 | logger.info("Starting training") 137 | 138 | # a generator expression (can be iterated several times) 139 | # It caches converted sentences, avoiding repeated conversions 140 | converted_sentences = converter.generator(reader.sentences, cache=True) 141 | trainer.train(converted_sentences, reader.polarities, trie, 142 | args.iterations, report_intervals) 143 | 144 | logger.info("Overriding vectors to %s" % args.vectors) 145 | embeddings.save_vectors(args.vectors, args.variant) 146 | if args.model: 147 | logger.info("Saving trained model to %s" % args.model) 148 | trainer.save(args.model) 149 | 150 | 151 | def create_trainer(args, converter): 152 | """ 153 | Creates or loads a neural network according to the specified args. 
154 | """ 155 | 156 | logger = logging.getLogger("Logger") 157 | 158 | if args.load: 159 | logger.info("Loading provided network...") 160 | trainer = SentimentTrainer.load(args.load) 161 | # change learning rate 162 | trainer.learning_rate = args.learning_rate 163 | else: 164 | logger.info('Creating new network...') 165 | # sum the number of features in all extractors' tables 166 | input_size = converter.size() * (args.window * 2 + 1) 167 | nn = Network(input_size, args.hidden, 2) 168 | options = { 169 | 'learning_rate': args.learning_rate, 170 | 'eps': args.eps, 171 | 'ro': args.ro, 172 | 'verbose': args.verbose, 173 | 'left_context': args.window, 174 | 'right_context': args.window, 175 | 'ngram_size': args.ngrams, 176 | 'alpha': args.alpha 177 | } 178 | trainer = SentimentTrainer(nn, converter, options) 179 | 180 | trainer.saver = saver(args.model, args.vectors) 181 | 182 | logger.info("... with the following parameters:") 183 | logger.info(trainer.nn.description()) 184 | 185 | return trainer 186 | 187 | 188 | def saver(model_file, vectors_file): 189 | """Function for saving model periodically""" 190 | 191 | def save(trainer): 192 | # save embeddings also separately 193 | if vectors_file: 194 | trainer.save_vectors(vectors_file) 195 | if model_file: 196 | trainer.save(model_file) 197 | 198 | return save 199 | 200 | 201 | def buildWordVector(tokens, size, tweet_w2v, tfidf): 202 | vec = np.zeros(size).reshape((1, size)) 203 | count = 0. 204 | for word in tokens: 205 | try: 206 | vec += tweet_w2v[word].reshape((1, size)) * tfidf[word] 207 | count += 1. 208 | except KeyError: # handling the case where the token is not 209 | # in the corpus. useful for testing. 210 | continue 211 | if count != 0: 212 | vec /= count 213 | return vec 214 | 215 | 216 | def get_sswe_features(vocab_file, model_file): 217 | vocabs = [] 218 | models = [] 219 | with open(vocab_file, "rb") as vocablist: 220 | for vocab in vocablist: 221 | vocabs.append(vocab.rstrip()) 222 | 223 | with open(model_file, "rb") as modellist: 224 | for model in modellist: 225 | arr_model = model.split() 226 | models.append(np.array(map(float, arr_model))) 227 | # build our word embedding model vectorizer 228 | sswe_dict = dict(zip(vocabs, models)) 229 | return sswe_dict 230 | -------------------------------------------------------------------------------- /feature_extracting/SennaFeatureExtractor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from FeatureExtractor import FeatureExtractor 4 | from vectorizer.WordEmbeddingVectorizer import WordEmbeddingVectorizer 5 | 6 | from models.SentenceIterator import SentenceIterator 7 | 8 | 9 | class SennaFeatureExtractor(FeatureExtractor): 10 | """docstring for FeatureExtractor""" 11 | def __init__(self, dataset=None, infile=None, vocabfile=None, binary=False, dimen=100): 12 | self.model_file = infile 13 | self.vocab_file = vocabfile 14 | self.binary = binary 15 | self.dataset = dataset 16 | self.dimen = dimen 17 | 18 | def build(self): 19 | if self.model_file and self.vocab_file: 20 | vocabs = [] 21 | models = [] 22 | with open(self.vocab_file, "rb") as vocablist: 23 | for vocab in vocablist: 24 | vocabs.append(vocab.rstrip()) 25 | 26 | with open(self.model_file, "rb") as modellist: 27 | for model in modellist: 28 | arr_model = model.split() 29 | models.append(np.array(map(float, arr_model))) 30 | #modelss = models[:100] 31 | #vocabss = vocabs[:100] 32 | # build our word embedding model vectorizer 33 | senna_dict = dict(zip(vocabs, 
models))
34 |             sentences = SentenceIterator(self.dataset)
35 | 
36 |             self.vectorizer = WordEmbeddingVectorizer(senna_dict, self.dimen)
37 |             self.vectorizer.fit(sentences)
38 |         else:
39 |             pass
40 | 
41 |         return self
42 | 
43 |     def extract_existing_features(self, dataset):
44 |         return super(SennaFeatureExtractor, self).extract_features(dataset)
45 | 
46 |     def get_name(self):
47 |         return "SENNA C&W SSWE"
48 | 
--------------------------------------------------------------------------------
/feature_extracting/WordEmbeddingFeatureExtractor.py:
--------------------------------------------------------------------------------
1 | import gensim
2 | from FeatureExtractor import FeatureExtractor
3 | from vectorizer.WordEmbeddingVectorizer import WordEmbeddingVectorizer
4 | 
5 | from models.SentenceIterator import SentenceIterator
6 | 
7 | 
8 | class WordEmbeddingFeatureExtractor(FeatureExtractor):
9 |     """docstring for WordEmbeddingFeatureExtractor"""
10 |     def __init__(self, dataset=None, infile=None, binary=False, dimen=100, sswe=0):
11 |         super(WordEmbeddingFeatureExtractor, self).__init__(dataset)
12 |         self.model_file = infile
13 |         self.binary = binary
14 |         self.dimen = dimen
15 |         self.sswe = sswe
16 | 
17 |     def build(self):
18 |         if not self.model_file:
19 |             sentences = SentenceIterator(self.dataset)
20 |             w2v = gensim.models.Word2Vec(sentences, size=self.dimen, min_count=1)
21 |             word_vectors = w2v.wv
22 |             del w2v # free memory
23 |         else:
24 |             word_vectors = gensim.models.KeyedVectors.load_word2vec_format(self.model_file, binary=self.binary)
25 | 
26 |         # build our word embedding model vectorizer
27 |         # w2v_dict = dict(zip(w2v.index2word, w2v.syn0))
28 |         sentences = SentenceIterator(self.dataset)
29 | 
30 |         self.vectorizer = WordEmbeddingVectorizer(word_vectors, self.dimen)
31 |         self.vectorizer.fit(sentences)
32 | 
33 |         return self
34 | 
35 |     def extract_existing_features(self, dataset):
36 |         return super(WordEmbeddingFeatureExtractor, self).extract_features(dataset)
37 | 
38 |     def save_model_to_file(self, outfile, vocabfile=None, binary=True):
39 |         sentences = SentenceIterator(self.dataset)
40 |         w2v = gensim.models.Word2Vec(sentences, size=self.dimen, min_count=1, sg=1, workers=4, iter=10)
41 | 
42 |         w2v.wv.save_word2vec_format(outfile, fvocab=vocabfile, binary=binary)
43 | 
44 |     def get_name(self):
45 |         if self.sswe == 1:
46 |             return "SSWE + Word2Vec"
47 |         else:
48 |             return "Gensim Word2Vec"
49 | 
--------------------------------------------------------------------------------
/feature_extracting/__init__.py:
--------------------------------------------------------------------------------
1 | from feature_extracting.SennaFeatureExtractor import *
2 | from feature_extracting.WordEmbeddingFeatureExtractor import *
3 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from preprocessing.preprocesstweets import *
2 | from embedding.sswe_extractor import *
3 | from models import Dataset
4 | from classification.Classifier import Classifier
5 | from classification.NeuralNets import *
6 | from classification.Evaluator import Evaluator
7 | from feature_extracting import SennaFeatureExtractor
8 | from sklearn.model_selection import KFold
9 | from sklearn.preprocessing import scale
10 | from tqdm import tqdm
11 | import numpy as np
12 | 
13 | 
14 | def main():
15 |     """ Sentiment Specific Embedding for Twitter classification """
16 | 
17 |     embeddings_size = 50 # Embedding size for SSWE model
18 |     vocab_file = "Embedding/features/semeval_vocabs_200.txt" # path to the vocabulary file
19 |     vector_file = "Embedding/features/semeval_vectors_200.txt" # path to the vector file
20 |     stopwordsfile = "preprocess/stopwords.txt"
21 | 
22 |     """ Sentiment-Specific Word Embedding (SSWE) """
23 | 
24 |     if True:
25 |         # Load dataset
26 |         data_train = 'dataset/training1600000.csv' # training data set file path
27 |         pre_data_train = 'dataset/preprocessed_dataset1600000.csv' # file to save dataset after cleaning
28 | 
29 |         if True:
30 |             print("\n **** Dataset cleaning ****")
31 |             tweets_prepocess(data_train, pre_data_train, stopwordsfile)
32 | 
33 |         if True:
34 |             print("\n **** SSWE model Training ****")
35 |             train_model = None # path to an existing trained model file, if one is already available
36 |             save_model = "Embedding/models/SSWE_model_1600000_200" # path to the file where the model will be saved
37 |             sswe = create_sswe_model(pre_data_train, vocab_file, vector_file, train_model,
38 |                                      save_model, embeddings_size)
39 |             sswe_trainer(sswe)
40 | 
41 |     """ Embedding visualisation and Similarity computing """
42 | 
43 |     if True:
44 |         visualiser = Visualiser(sizeOfEmbedding=embeddings_size,
45 |                                 VocabsFname=vocab_file,
46 |                                 VectorsFname=vector_file,
47 |                                 WVFilename="Visualisation/data/w2vformat.txt",
48 |                                 visualizerHTMLfilename="Visualisation/data/embedding.html")
49 |         visualiser.visualize()
50 | 
51 |     """ Twitter Sentiment Classification """
52 | 
53 |     if True:
54 |         # Data pre-processing
55 | 
56 |         print("\n **** Training data cleaning ****")
57 |         pre_processing_train = "dataset/preprocessed_semeval_traindataset.csv"
58 |         # tweets_prepocess(train_set, pre_processing_train, stopwordsfile)
59 | 
60 |         print("\n **** Test data cleaning ****")
61 |         pre_processing_test = "dataset/preprocessed_semeval_testdataset.csv"
62 |         # tweets_prepocess(test_set, pre_processing_test, stopwordsfile)
63 | 
64 |         # LOAD TRAIN SET
65 |         dataset_train = Dataset.DatasetReview()
66 |         dataset_train.load_review_from_csv(pre_processing_train)
67 | 
68 |         # LOAD TEST SET
69 |         dataset_test = Dataset.DatasetReview()
70 |         dataset_test.load_review_from_csv(pre_processing_test)
71 | 
72 |         ################################### Neural Nets classifier ###########################
73 | 
74 |         # Extract Features
75 |         tweet2v = get_sswe_features(vocab_file, vector_file)
76 | 
77 |         # Extract samples and labels
78 |         x_train, y_train = split_data(dataset_train)
79 |         x_test, y_test = split_data(dataset_test)
80 | 
81 |         tfidf = build_tfidf(x_train)
82 | 
83 |         train_vecs_sswe = np.concatenate(
84 |             [buildWordVector(z.split(), embeddings_size,
85 |                              tweet2v, tfidf) for z in tqdm(map(lambda x: x, x_train))])
86 | 
87 |         train_vecs_sswe = scale(train_vecs_sswe)
88 | 
89 |         test_vecs_sswe = np.concatenate(
90 |             [buildWordVector(z.split(), embeddings_size,
91 |                              tweet2v, tfidf) for z in tqdm(map(lambda x: x, x_test))])
92 |         test_vecs_sswe = scale(test_vecs_sswe)
93 | 
94 |         # neural network model
95 |         neuralnets = NeuralNets(input_size=embeddings_size, x_train=train_vecs_sswe, y_train=y_train,
96 |                                 epochs=450, batch_size=32, x_test=test_vecs_sswe, y_test=y_test)
97 |         neuralnets.train_neural_nets()
98 | 
99 |         ##########################################################################################
100 |         ########
101 |         ######## Classical classifiers with sklearn
102 |         ########
103 |         ##########################################################################################
104 |         print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")
105 | 
106 |         fe_sswe =
SennaFeatureExtractor(infile=vector_file, vocabfile=vocab_file, dimen=embeddings_size) 107 | feature_extractors = [fe_sswe] 108 | ev = Evaluator() 109 | 110 | ################################# SVM ################################################### 111 | 112 | print ("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") 113 | model = Classifier(models="svm") 114 | kfold = KFold(n_splits=10) 115 | ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, 116 | training_set=dataset_train, num_fold=10, cv=kfold) 117 | ev.create_evaluation_result(model, feature_extractors=feature_extractors, 118 | training_set=dataset_train, num_fold=10, cv=kfold) 119 | 120 | print ("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") 121 | ev.eval_with_test_set(model, feature_extractors=feature_extractors, 122 | training_set=dataset_train, 123 | test_set=dataset_test) 124 | 125 | ################################### Naive bayes ########################################## 126 | 127 | print ("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") 128 | model = Classifier(models="multinomial") 129 | kfold = KFold(n_splits=10) 130 | ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, 131 | training_set=dataset_train, num_fold=10, cv=kfold) 132 | ev.create_evaluation_result(model, feature_extractors=feature_extractors, 133 | training_set=dataset_train, num_fold=10, cv=kfold) 134 | 135 | print ("\n**** TEST SET EVALUATION (CORPUS: DATASET) ****\n") 136 | ev.eval_with_test_set(model, feature_extractors=feature_extractors, 137 | training_set=dataset_train, 138 | test_set=dataset_test) 139 | 140 | ######################################### RandomForestClassifier ####################### 141 | 142 | print ("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") 143 | model = Classifier(models="rfc") 144 | kfold = KFold(n_splits=10) 145 | ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, 146 | training_set=dataset_train, num_fold=10, cv=kfold) 147 | ev.create_evaluation_result(model, feature_extractors=feature_extractors, 148 | training_set=dataset_train, num_fold=10, cv=kfold) 149 | 150 | print ("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") 151 | ev.eval_with_test_set(model, feature_extractors=feature_extractors, 152 | training_set=dataset_train, 153 | test_set=dataset_test) 154 | 155 | ######################################### MLPClassifier ####################### 156 | 157 | print ("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") 158 | model = Classifier(models="nn") 159 | kfold = KFold(n_splits=10) 160 | ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, 161 | training_set=dataset_train, num_fold=10, cv=kfold) 162 | ev.create_evaluation_result(model, feature_extractors=feature_extractors, 163 | training_set=dataset_train, num_fold=10, cv=kfold) 164 | 165 | print ("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") 166 | ev.eval_with_test_set(model, feature_extractors=feature_extractors, 167 | training_set=dataset_train, 168 | test_set=dataset_test) 169 | 170 | 171 | if __name__ == '__main__': 172 | main() 173 | -------------------------------------------------------------------------------- /models/Dataset.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import random 4 | from Review import Review 5 | from sklearn.model_selection import train_test_split 6 | 7 | class DatasetReview(): 8 | """docstring for 
Dataset""" 9 | 10 | def __init__(self): 11 | self.dataset = [] 12 | self.field_names = [] 13 | self.label_values = [] 14 | self.column_label = "" 15 | 16 | def load_review_from_csv(self, infile): 17 | with open(infile, "rb") as csvfile: 18 | reader = csv.DictReader(csvfile) 19 | 20 | # init field names & label column 21 | self.field_names = reader.fieldnames 22 | self.column_label = self.field_names[-1] 23 | 24 | for rows in reader: 25 | review = Review(rows[self.field_names[0]], rows[self.field_names[1]]) 26 | self.dataset.append(review) 27 | if self.label_values.count(rows[self.column_label]) == 0: 28 | self.label_values.append(rows[self.column_label]) 29 | 30 | return infile 31 | 32 | def dataset_from_array(self, dataset): 33 | n_dataset = DatasetReview() 34 | n_dataset.dataset = dataset 35 | n_dataset.field_names = self.field_names 36 | n_dataset.label_values = self.label_values 37 | n_dataset.column_label = self.column_label 38 | 39 | return n_dataset 40 | 41 | def dataset_from_contents_labels(self, contents, labels): 42 | arr_dataset = [] 43 | for i in xrange(len(contents)): 44 | dr = Review(contents[i], labels[i]) 45 | arr_dataset.append(dr) 46 | 47 | return self.dataset_from_array(arr_dataset) 48 | 49 | def get_dataset_size(self): 50 | return len(self.dataset) 51 | 52 | """get text content for datasets""" 53 | 54 | def get_contents(self): 55 | res = [] 56 | for data in self.dataset: 57 | res.append(data.content) 58 | 59 | return res 60 | 61 | """ get labels for all datasets """ 62 | 63 | def get_labels(self): 64 | res = [] 65 | for data in self.dataset: 66 | res.append(data.polarity) 67 | 68 | return res 69 | 70 | def get_label_enum(self): 71 | return self.label_values 72 | 73 | def get_dataset(self, idx): 74 | return self.dataset[idx] 75 | 76 | def get_formatted_dataset(self): 77 | res = [] 78 | for data in self.dataset: 79 | res.append(data.to_string()) 80 | 81 | return res 82 | 83 | def export_formatted_dataset(self, outfile): 84 | res = self.get_formatted_dataset() 85 | with open(outfile, "wb") as f: 86 | for row in res: 87 | f.write(row + "\n") 88 | 89 | return outfile 90 | 91 | def export_to_csv(self, outfile): 92 | with open(outfile, "wb") as csvfile: 93 | writer = csv.DictWriter(csvfile, fieldnames=self.field_names) 94 | writer.writeheader() 95 | for data in self.dataset: 96 | writer.writerow({ 97 | 'content': data.content, 98 | 'polarity': data.polarity 99 | }) 100 | 101 | return outfile 102 | 103 | def get_data_label_size(self, label): 104 | return sum(1 for x in self.dataset if x.polarity == label) 105 | 106 | def get_data_label(self, label): 107 | return [data for data in self.dataset if data.polarity == label] 108 | 109 | def get_sample_to_minority(self): 110 | if not self.dataset: 111 | return [] 112 | else: 113 | pos_sample = self.get_data_label_size("positive") 114 | neg_sample = self.get_data_label_size("negative") 115 | neu_sample = self.get_data_label_size("neutral") 116 | 117 | print "%d | %d | %d" % (pos_sample, neg_sample, neu_sample) 118 | t_dataset = [] 119 | if pos_sample > neg_sample: 120 | temp = self.get_data_label("positive") 121 | for x in xrange(0, neg_sample): 122 | idx = random.randint(0, len(temp) - 1) 123 | t_dataset.append(temp[idx]) 124 | 125 | # append the minority instance 126 | t_dataset.extend(self.get_data_label("negative")) 127 | m_dts = self.dataset_from_array(t_dataset) 128 | return m_dts 129 | 130 | elif neg_sample > pos_sample: 131 | temp = self.get_data_label("negative") 132 | for x in xrange(1, pos_sample): 133 | idx = 
random.randint(0, len(temp) - 1) 134 | t_dataset.append(temp[idx]) 135 | 136 | # append the minority instance 137 | t_dataset.extend(self.get_data_label("positive")) 138 | m_dts = self.dataset_from_array(t_dataset) 139 | return m_dts 140 | 141 | else: 142 | return self 143 | 144 | def split_to_ratio(self, ratio): 145 | X_train, X_test, y_train, y_test = train_test_split(self.get_contents(), self.get_labels(), test_size=ratio) 146 | 147 | dataset_train = self.dataset_from_contents_labels(X_train, y_train) 148 | dataset_test = self.dataset_from_contents_labels(X_test, y_test) 149 | 150 | return dataset_train, dataset_test 151 | 152 | def export_only_contents(self, outfile): 153 | with open(outfile, "wb") as ofile: 154 | for data in self.dataset: 155 | ofile.write(data.content + "\n") 156 | 157 | return outfile 158 | 159 | 160 | def main(infile): 161 | dataset = DatasetReview() 162 | dataset.load_review_from_csv(infile) 163 | print dataset.get_label_enum() 164 | dataset.export_formatted_dataset("formatted_dataset.txt") 165 | 166 | print "Positive instances: %d" % (dataset.get_data_label_size("positive")) 167 | print "Negative instances: %d" % (dataset.get_data_label_size("negative")) 168 | 169 | t_dataset = dataset.get_sample_to_minority() 170 | print "Positive instances: %d" % (t_dataset.get_data_label_size("positive")) 171 | print "Negative instances: %d" % (t_dataset.get_data_label_size("negative")) 172 | t_dataset.export_to_csv("sample.csv") 173 | 174 | 175 | if __name__ == '__main__': 176 | main(sys.argv[1]) 177 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from Dataset import DatasetReview 2 | 3 | -------------------------------------------------------------------------------- /preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from preprocesstweets import tweets_prepocess 2 | -------------------------------------------------------------------------------- /preprocessing/csv_header_change.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | 4 | inputFileName = "/root/PycharmProjects/project_text_mining/data/trained.csv" 5 | outputFileName = os.path.splitext(inputFileName)[0] + "_modified.csv" 6 | 7 | with open(inputFileName, 'rb') as inFile, open(outputFileName, 'wb') as outfile: 8 | r = csv.reader(inFile) 9 | w = csv.writer(outfile) 10 | 11 | next(r, None) # skip the first row from the reader, the old header 12 | # write new header 13 | w.writerow(['Polarity', 'ID', 'Date', 'Query','User','TWITTER_MESSAGE']) 14 | 15 | # copy the rest 16 | for row in r: 17 | 18 | 19 | w.writerow(row) -------------------------------------------------------------------------------- /preprocessing/csv_tsv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream 3 | 4 | from random import randint 5 | 6 | 7 | inputFileName = "/root/PycharmProjects/project_text_mining/data/trained.csv" 8 | 9 | with open(inputFileName) as csvfile : 10 | readCSV = csv.reader(csvfile, delimiter=',') 11 | 12 | 13 | 14 | 15 | for row in readCSV: 16 | 17 | 18 | if (row[0]=="0"): 19 | row[0]= "negative" 20 | elif (row[0]=="4"): 21 | row[0]= "positive" 22 | else : row [0]=="neutral" 23 | 24 | t1 = row[4] 25 | t2 = row[0] 26 | 27 | row[4] = row [0] 28 | 
row[0]=t1 29 | row[0]=row[1] 30 | row[1]=t1 31 | #print(row[1]) 32 | del(row[2]) 33 | del(row[2]) 34 | row[1]= str(randint(11111111, 99999999)) 35 | 36 | print '\t'.join(str(p) for p in row) 37 | #print "\t".join(row) 38 | 39 | 40 | -------------------------------------------------------------------------------- /preprocessing/preprocesstweets.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | 4 | 5 | def replaceTwoOrMore(s): 6 | # look for 2 or more repetitions of character 7 | patt = re.compile(r"(.)\1{1,}", re.DOTALL) 8 | return patt.sub(r"\1\1", s) 9 | 10 | 11 | def processTweet(tweet): 12 | # process the tweets 13 | # Convert to lower case 14 | tweet = tweet.lower() 15 | # Convert www.* or https?://* to URL 16 | tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'url', tweet) 17 | # Convert @username to AT_USER 18 | tweet = re.sub('@[^\s]+', 'at_user', tweet) 19 | # Remove additional white spaces 20 | tweet = re.sub('[\s]+', ' ', tweet) 21 | # Replace #word with word 22 | tweet = re.sub(r'#([^\s]+)', r'\1', tweet) 23 | # trim 24 | tweet = tweet.strip('\'"') 25 | return tweet 26 | 27 | 28 | def getStopWordList(fname): 29 | # read the stopwords 30 | stopWords = [] 31 | stopWords.append('rt') 32 | stopWords.append('url') 33 | stopWords.append('at_user') 34 | 35 | fp = open(fname, 'r') 36 | line = fp.readline() 37 | while line: 38 | word = line.strip() 39 | stopWords.append(word) 40 | line = fp.readline() 41 | fp.close() 42 | return stopWords 43 | 44 | 45 | def getFeatureVector(tweet, stopWords): 46 | featureVector = [] 47 | words = tweet.split() 48 | for w in words: 49 | # replace two or more with two occurrences 50 | w = replaceTwoOrMore(w) 51 | # strip punctuation 52 | w = w.strip('\'"?,.') 53 | # check if it consists of only words 54 | val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*[a-zA-Z]+[a-zA-Z0-9]*$", w) 55 | # ignore if it is a stopWord 56 | if (w in stopWords or val is None): 57 | continue 58 | else: 59 | featureVector.append(w.lower()) 60 | return featureVector 61 | 62 | 63 | def tweets_prepocess(inputfile, outputfile, stopwordfile): 64 | with open(inputfile, 'r') as f: 65 | readerCSV = csv.reader(f, delimiter=',', dialect='excel') 66 | #print(readerCSV) 67 | data = [] 68 | stopwords = getStopWordList(stopwordfile) 69 | for row in readerCSV: 70 | if len(row)>1: 71 | tweet = row[0] 72 | label = row[1] 73 | #print(tweet) 74 | tweet_processed = processTweet(replaceTwoOrMore(tweet)) 75 | tweet_features = getFeatureVector(tweet_processed, stopwords) 76 | #print(tweet) 77 | #print("tweet") 78 | #print(tweet_features) 79 | if len(tweet_features) > 3: 80 | tweet_tokens = ' '.join(tweet_features) 81 | #data.append(['" {} "'.format(tweet_tokens),'" {} "'.format(label)]) 82 | data.append(['" {} "'.format(tweet_tokens),label]) 83 | #data.append([tweet_tokens, label]) 84 | f.close() 85 | 86 | # preprocess and store in a new clean csv file 87 | with open(outputfile, "w") as fw: 88 | writer = csv.writer(fw, delimiter='\t', lineterminator='\n', dialect='excel', quoting=csv.QUOTE_NONE, 89 | quotechar=None) 90 | #writer = csv.writer(fw, delimiter='\t', lineterminator='\n', dialect='excel', quoting=csv.QUOTE_NONE, quotechar=None) 91 | #writer = csv.writer(fw, delimiter='\t', dialect='excel-tab',quoting=csv.QUOTE_NONE, quotechar=None) 92 | for x in data: 93 | if len(x[1]) != 0: 94 | writer.writerow(x) 95 | fw.close() 96 | -------------------------------------------------------------------------------- /preprocessing/process_json_file.py: 
-------------------------------------------------------------------------------- 1 | # Import the necessary package to process data in JSON format 2 | try: 3 | import json 4 | except ImportError: 5 | import simplejson as json 6 | import logging 7 | from pymongo import MongoClient 8 | 9 | logger = logging.Logger('catch_all') 10 | 11 | tweet_filename = "/root/PycharmProjects/project_text_mining/tweets.txt" 12 | tweet_file = open(tweet_filename,'r') 13 | keys_to_delete = ["favorited","contributors","truncated","possibly_sensitive","is_quote_status"\ 14 | ,"in_reply_to_status_id","filter_level","geo","favorite_count","extended_tweet"\ 15 | ,"entities","in_reply_to_user_id_str","retweeted","coordinates","timestamp_ms"\ 16 | ,"source","in_reply_to_status_id_str","in_reply_to_screen_name","display_text_range"\ 17 | ,"place","retweet_count","in_reply_to_user_id","user","_id"] 18 | 19 | client = MongoClient('localhost', 27017) 20 | db = client.twitter_db 21 | 22 | 23 | for line in tweet_file: 24 | try: 25 | tweet = json.loads(line.strip()) 26 | tweet["text"] = tweet["extended_tweet"]["full_text"].encode("utf-8") 27 | for key in keys_to_delete: 28 | if key in tweet: 29 | del tweet[key] 30 | 31 | posts = db.twitter_db 32 | posts.insert(tweet) 33 | print(tweet) 34 | 35 | except Exception as e: 36 | # read in a line is not in JSON format (sometimes error occured) 37 | logger.error(e, exc_info=True) 38 | print(e) 39 | tweet_file.close() 40 | 41 | 42 | 43 | """ 44 | 45 | try : 46 | tweet = json.loads(line.strip()) 47 | keys.append(tweet.keys()) 48 | new_dict = {"text": tweet["text"] for key in keys} 49 | print(new_dict) 50 | #dict_i_want = {key: tweet[key] for key in keys} 51 | #print dict_i_want 52 | #if 'extended_tweet' in tweet.keys() : 53 | #print tweet['extended_tweet']['full_text'].encode('utf-8') # content of the tweet 54 | except Exception as e: 55 | # read in a line is not in JSON format (sometimes error occured) 56 | logger.error(e, exc_info=True) 57 | print(e) 58 | tweet_file.close() 59 | """ -------------------------------------------------------------------------------- /preprocessing/stopwords.txt: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | across 5 | after 6 | afterwards 7 | again 8 | against 9 | all 10 | almost 11 | alone 12 | along 13 | already 14 | also 15 | although 16 | always 17 | am 18 | among 19 | amongst 20 | amoungst 21 | amount 22 | an 23 | and 24 | another 25 | any 26 | anyhow 27 | anyone 28 | anything 29 | anyway 30 | anywhere 31 | are 32 | around 33 | as 34 | at 35 | back 36 | be 37 | became 38 | because 39 | become 40 | becomes 41 | becoming 42 | been 43 | before 44 | beforehand 45 | behind 46 | being 47 | below 48 | beside 49 | besides 50 | between 51 | beyond 52 | bill 53 | both 54 | bottom 55 | but 56 | by 57 | call 58 | can 59 | cannot 60 | cant 61 | co 62 | computer 63 | con 64 | could 65 | couldnt 66 | cry 67 | de 68 | describe 69 | detail 70 | do 71 | done 72 | down 73 | due 74 | during 75 | each 76 | eg 77 | eight 78 | either 79 | eleven 80 | else 81 | elsewhere 82 | empty 83 | enough 84 | etc 85 | even 86 | ever 87 | every 88 | everyone 89 | everything 90 | everywhere 91 | except 92 | few 93 | fifteen 94 | fify 95 | fill 96 | find 97 | fire 98 | first 99 | five 100 | for 101 | former 102 | formerly 103 | forty 104 | found 105 | four 106 | from 107 | front 108 | full 109 | further 110 | get 111 | give 112 | go 113 | had 114 | has 115 | hasnt 116 | have 117 | he 118 | hence 119 | her 120 | here 121 | 
hereafter 122 | hereby 123 | herein 124 | hereupon 125 | hers 126 | herse" 127 | him 128 | himse" 129 | his 130 | how 131 | however 132 | hundred 133 | i 134 | ie 135 | if 136 | in 137 | inc 138 | indeed 139 | interest 140 | into 141 | is 142 | it 143 | its 144 | itse" 145 | keep 146 | last 147 | latter 148 | latterly 149 | least 150 | less 151 | ltd 152 | made 153 | many 154 | may 155 | me 156 | meanwhile 157 | might 158 | mill 159 | mine 160 | more 161 | moreover 162 | most 163 | mostly 164 | move 165 | much 166 | must 167 | my 168 | myse" 169 | name 170 | namely 171 | neither 172 | never 173 | nevertheless 174 | next 175 | nine 176 | no 177 | nobody 178 | none 179 | noone 180 | nor 181 | not 182 | nothing 183 | now 184 | nowhere 185 | of 186 | off 187 | often 188 | on 189 | once 190 | one 191 | only 192 | onto 193 | or 194 | other 195 | others 196 | otherwise 197 | our 198 | ours 199 | ourselves 200 | out 201 | over 202 | own 203 | part 204 | per 205 | perhaps 206 | please 207 | put 208 | rather 209 | re 210 | same 211 | see 212 | seem 213 | seemed 214 | seeming 215 | seems 216 | serious 217 | several 218 | she 219 | should 220 | show 221 | side 222 | since 223 | sincere 224 | six 225 | sixty 226 | so 227 | some 228 | somehow 229 | someone 230 | something 231 | sometime 232 | sometimes 233 | somewhere 234 | still 235 | such 236 | system 237 | take 238 | ten 239 | than 240 | that 241 | the 242 | their 243 | them 244 | themselves 245 | then 246 | thence 247 | there 248 | thereafter 249 | thereby 250 | therefore 251 | therein 252 | thereupon 253 | these 254 | they 255 | thick 256 | thin 257 | third 258 | this 259 | those 260 | though 261 | three 262 | through 263 | throughout 264 | thru 265 | thus 266 | to 267 | together 268 | too 269 | top 270 | toward 271 | towards 272 | twelve 273 | twenty 274 | two 275 | un 276 | under 277 | until 278 | up 279 | upon 280 | us 281 | very 282 | via 283 | was 284 | we 285 | well 286 | were 287 | what 288 | whatever 289 | when 290 | whence 291 | whenever 292 | where 293 | whereafter 294 | whereas 295 | whereby 296 | wherein 297 | whereupon 298 | wherever 299 | whether 300 | which 301 | while 302 | whither 303 | who 304 | whoever 305 | whole 306 | whom 307 | whose 308 | why 309 | will 310 | with 311 | within 312 | without 313 | would 314 | yet 315 | you 316 | your 317 | yours 318 | yourself 319 | yourselves 320 | , 321 | . 322 | http 323 | https 324 | @ 325 | ? 326 | ! 327 | - 328 | http... 
329 | : 330 | 1 331 | 2 332 | 3 333 | 4 334 | 5 335 | 6 336 | 7 337 | 8 338 | 9 339 | rt 340 | 's 341 | 't 342 | amp -------------------------------------------------------------------------------- /preprocessing/store_tweet_in_db.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import json 3 | try: 4 | import json 5 | except ImportError: 6 | import simplejson as json 7 | import logging 8 | logger = logging.Logger('catch_all') 9 | 10 | client = MongoClient('localhost', 27017) 11 | db = client.twitter_db 12 | 13 | tweet_filename = "/root/PycharmProjects/untitled1/full_tweet.txt" 14 | tweet_file = open(tweet_filename,'r') 15 | 16 | for line in tweet_file: 17 | try: 18 | tweet = json.loads(line.strip()) 19 | print(type(json)) 20 | #posts = db.posts 21 | #posts.insert(post) 22 | except Exception as e: 23 | # read in a line is not in JSON format (sometimes error occured) 24 | logger.error(e, exc_info=True) 25 | print(e) -------------------------------------------------------------------------------- /preprocessing/twitter_streaming.py: -------------------------------------------------------------------------------- 1 | 2 | # Import the necessary package to process data in JSON format 3 | try: 4 | import json 5 | except ImportError: 6 | import simplejson as json 7 | # Import the necessary methods from "twitter" library 8 | from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream 9 | 10 | ### Secret information for the authentication with the twitter API 11 | ACCESS_TOKEN = '172760940-WFyHDrF0gwA0aj7snvx8KFuaUgyG8L6hB3Wndi9b' 12 | ACCESS_SECRET = 'qW74TbTr6O98zEVn91tc2J8jkiSAAMRO7k91WzD4yx5MH' 13 | CONSUMER_KEY = 'U5gIP4mJsHVIB1GKTLglHeOoE' 14 | CONSUMER_SECRET = 'jm1lPsrRa7QYoQKW7thbN3stqWV8e6wl7ybM3AgqWJMoIMFAkq' 15 | 16 | oauth = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET) 17 | twitter_stream = TwitterStream(auth=oauth) 18 | iterator = twitter_stream.statuses.sample(language="en") 19 | 20 | 21 | #specify the tweets number to collect 22 | tweet_count = 10 23 | keys = [] 24 | for tweet in iterator: 25 | if 'extended_tweet' in tweet.keys(): 26 | tweet_count -= 1 27 | # Twitter Python Tool wraps the data returned by Twitter 28 | # as a TwitterDictResponse object. 29 | # We convert it back to the JSON format to print/score 30 | print json.dumps(tweet) 31 | 32 | 33 | 34 | 35 | # The command below will do pretty printing for JSON data, try it out 36 | # print json.dumps(tweet, indent=4) 37 | 38 | if tweet_count <= 0: 39 | 40 | break 41 | 42 | -------------------------------------------------------------------------------- /text_mining_project_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AzizCode92/Learning-Sentiment-Specific-Word-embedding-for-Twitter-Sentiment-Classification/7deabbb5f532065ff4f598781dee2b2c58fb1464/text_mining_project_report.pdf --------------------------------------------------------------------------------