├── .gitignore
├── LICENSE
├── README.md
└── sentiment
    ├── __init__.py
    └── sentiment.py

/.gitignore:
--------------------------------------------------------------------------------
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so

# Logs and databases #
######################
*.log
*.sql
*.sqlite

# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
Icon?
ehthumbs.db
Thumbs.db

# Misc #
########
# text edit temp files
*~
# Sublime text project files
*.sublime-workspace

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Romain Strock

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Sentiment Analysis
==================

Sentiment Analysis using logistic regression (via gradient descent).
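
Below is a minimal usage sketch. The two example documents are made up purely
for illustration; a real corpus is needed for meaningful results, and the NLTK
`stopwords`, `punkt` and `movie_reviews` data must be downloaded first (e.g.
via `nltk.download()`). It assumes you run Python from the repository root:

```python
from sentiment.sentiment import SentimentMachine

# One document per string, scored 0 (negative) or 1 (positive).
docs = ['A brilliant, moving and beautifully acted film.',
        'Dull, predictable and far too long.']
scores = [1, 0]

machine = SentimentMachine(docs, scores)
machine.train(speed=0.001, stochastic=False)  # learns the weight vector w
print machine.classify('What a wonderful movie!')  # 1 = positive, 0 = negative
```

Running `python sentiment/sentiment.py` trains and evaluates the model on the
NLTK movie reviews corpus, as shown in the usage example linked below.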

--> [Usage Example](https://github.com/srom/sentiment/blob/master/sentiment/sentiment.py#L390)

### Dependencies

- [Python 2.7](https://www.python.org/download/releases/2.7)
- [NLTK](http://www.nltk.org/)
- [Numpy](http://www.numpy.org/)

### License

[MIT License](https://github.com/srom/sentiment/blob/master/LICENSE)

--------------------------------------------------------------------------------
/sentiment/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/sentiment/sentiment.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import re
from nltk.corpus import movie_reviews, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist
import numpy as np


#==== General parameters
FEATURES_NUMBER = 2000
NGRAMS_NUMBER = 2
REGULARISATION = 10.0

#==== Gradient descent constants
SPEED = 0.001
MAX_ITERATIONS = 20
THRESHOLD_CONVERGENCE = 1  # in percentage

#==== Text processing constants
BLACKLIST_STOPWORDS = ['over', 'only', 'very', 'not', 'no']
ENGLISH_STOPWORDS = set(stopwords.words('english')) - set(BLACKLIST_STOPWORDS)
NEG_CONTRACTIONS = [
    (r'aren\'t', 'are not'),
    (r'can\'t', 'can not'),
    (r'couldn\'t', 'could not'),
    (r'daren\'t', 'dare not'),
    (r'didn\'t', 'did not'),
    (r'doesn\'t', 'does not'),
    (r'don\'t', 'do not'),
    (r'isn\'t', 'is not'),
    (r'hasn\'t', 'has not'),
    (r'haven\'t', 'have not'),
    (r'hadn\'t', 'had not'),
    (r'mayn\'t', 'may not'),
    (r'mightn\'t', 'might not'),
    (r'mustn\'t', 'must not'),
    (r'needn\'t', 'need not'),
    (r'oughtn\'t', 'ought not'),
    (r'shan\'t', 'shall not'),
    (r'shouldn\'t', 'should not'),
    (r'wasn\'t', 'was not'),
    (r'weren\'t', 'were not'),
    (r'won\'t', 'will not'),
    (r'wouldn\'t', 'would not'),
    (r'ain\'t', 'am not')  # not the only possible expansion, but the alternatives are stopwords anyway
]
OTHER_CONTRACTIONS = {
    "'m": 'am',
    "'ll": 'will',
    "'s": 'has',  # or 'is' but both are stopwords
    "'d": 'had'   # or 'would' but both are stopwords
}

class SentimentMachine(object):
    """
    This class trains a logistic regression model to analyse the sentiment
    of a document. Sentiment is either negative (0) or positive (1).
    """

    def __init__(self, training_set, score_set):
        """
        Init the SentimentMachine with the training set.

        Args:
            training_set: A list of documents (list of strings).
            score_set: A list of sentiment scores (list of numbers).

        len(training_set) and len(score_set) must be equal.
        """
        self.training_set = training_set
        self.score_set = score_set
        self.stemmer = PorterStemmer()
        # cache of the most common ngrams, keyed by n
        self._most_common_ngrams = {}
        # weight vector
        self.w = None

    def compute_ngrams(self, document, n):
        """
        Compute the ngrams of the document.

        Args:
            document: The document as a string.
            n: The number of grams. Must be a positive integer.

        Returns:
            A list of ngrams.
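
        Example (illustrative only; the exact output depends on the installed
        NLTK tokenizer, stopword list and stemmer, and recent NLTK versions
        return the ngrams lazily as a generator):

            compute_ngrams("I don't like this movie", 1)
            --> ['not', 'like', 'movi']
            compute_ngrams("I don't like this movie", 2)
            --> [('not', 'like'), ('like', 'movi')]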
        """
        # lowercase
        doc = document.lower()
        # TODO split by sentences for more accuracy
        # transform negative contractions (e.g. don't --> do not)
        for t in NEG_CONTRACTIONS:
            doc = re.sub(t[0], t[1], doc)
        # tokenize
        tokens = word_tokenize(doc)
        # transform other contractions (e.g. 'll --> will)
        tokens = [OTHER_CONTRACTIONS[token] if OTHER_CONTRACTIONS.get(token)
                  else token for token in tokens]
        # remove punctuation: keep only tokens that contain at least one letter
        r = r'[a-z]+'
        tokens = [word for word in tokens if re.search(r, word)]
        # remove irrelevant stop words
        tokens = [token for token in tokens if token not in ENGLISH_STOPWORDS]
        # stemming
        tokens = [self.stemmer.stem(token) for token in tokens]
        if n == 1:
            # return the list of words
            return tokens
        else:
            # return the list of ngrams
            return ngrams(tokens, n)

    def get_most_common_ngrams(self, n, nb_ngrams=None):
        """
        Compute and return the most common ngrams in the training documents.
        The result is cached inside the object.

        Args:
            n: The number of grams. Must be a positive integer.
            nb_ngrams: The number of ngrams to return, i.e. how many of the
                most common ngrams to keep. If None, all ngrams are returned.

        Returns:
            A list of the most common ngrams.
        """
        try:
            # return cached value
            return self._most_common_ngrams[n]
        except KeyError:
            pass

        # compute all ngrams
        all_ngrams = []
        for document in self.training_set:
            all_ngrams.extend(self.compute_ngrams(document, n))

        # compute the frequency distribution of the ngrams
        freq = FreqDist(ngram for ngram in all_ngrams)
        # store and return the nb_ngrams most common ngrams
        # NOTE: this relies on FreqDist.keys() being sorted by decreasing
        # frequency (NLTK 2.x behaviour); with NLTK 3 use freq.most_common().
        if nb_ngrams:
            self._most_common_ngrams[n] = freq.keys()[:nb_ngrams]
        else:
            self._most_common_ngrams[n] = freq.keys()
        return self._most_common_ngrams[n]

    def document_features(self, document):
        """
        Compute the binary feature vector of a given document.
        - most common words: 1 if the document contains this word, else 0
        - most common bigrams: 1 if the document contains this bigram, else 0

        Args:
            document: The document as a string.

        Returns:
            A list of binary features.
        """
        features = []

        # most common ngrams for n = 1 to NGRAMS_NUMBER
        nb_ngrams = NGRAMS_NUMBER
        nb_features = FEATURES_NUMBER // nb_ngrams
        for n in range(nb_ngrams):
            common_ngrams = []
            # get the ngrams of the document
            # (named doc_ngrams to avoid shadowing nltk.util.ngrams)
            doc_ngrams = set(self.compute_ngrams(document, n + 1))
            for ngram in self.get_most_common_ngrams(n + 1, nb_features):
                # if the document contains this common ngram then feature = 1 else 0
                common_ngrams.append(1 if ngram in doc_ngrams else 0)
            # add the new features
            features.extend(common_ngrams)

        return features


    def compute_features_matrix(self, train_set=None):
        """
        Build the N x M feature matrix X, where N equals the number of
        documents in the set and M equals the number of features.

        Args:
            train_set: A list of documents (list of strings).
                If None, self.training_set is used.

        Returns:
            A N x M matrix (numpy.array).
        """
        m = []
        for document in train_set or self.training_set:
            m.append(self.document_features(document))
        return np.array(m)


    def train(self, speed=0.001, stochastic=False):
        """
        Train the model via logistic regression (batch or stochastic
        gradient descent).

        Args:
            speed: Speed of the gradient descent.
            stochastic: If True, use stochastic gradient descent;
                otherwise use batch gradient descent.

        Returns:
            The learned weight vector (numpy.array).
        """
        # load training matrix
        print '==== Compute training set features...'
        x = self.compute_features_matrix()
        # the scores form a 1-D output vector
        y = np.array(self.score_set)
        print '==== Done'

        # shuffle
        [n, m] = x.shape
        print 'Number of features: %d' % m
        indices = np.random.permutation(n)
        x, y = x[indices, :], y[indices]

        # initial value for w
        w_zero = np.zeros(m)

        # train like a boss
        print '==== Start training...'
        method = 'Stochastic' if stochastic else 'Batch'
        print '==== (%s Gradient Descent)' % method
        self.w = gradient_descent(x, y, w_zero, speed=speed, stochastic=stochastic)
        print '==== Done'
        return self.w


    def classify(self, test_string):
        """
        Test the logistic model on the given string.

        Args:
            test_string: the test string.

        Returns:
            The predicted output value (0 or 1).
        """
        if self.w is None:
            raise ValueError('Looks like you forgot to .train() '
                + 'the model before .classify()-ing it!')

        # get the features vector
        x = np.array(self.document_features(test_string))

        # compute h(transpose(w) * x) and return the result according
        # to the decision boundary h(transpose(w) * x) = 0.5
        return 1 if sigmoid(np.dot(np.transpose(self.w), x)) >= 0.5 else 0


def sigmoid(z):
    """
    The sigmoid / logistic function.

    Args:
        z: any real number.

    Returns:
        A value between 0 and 1.
    """
    return 1.0 / (1.0 + np.exp(-1.0 * z))

def cost(w, x, y, h):
    """
    Cost function of the logistic regression: the average negative
    log-likelihood plus an L2 regularisation term.

    Args:
        w: weight vector (numpy.array)
        x: documents matrix (numpy.array)
        y: output vector (numpy.array)
        h: function of x and w

    Returns:
        The cost value (float).
    """
    [n, m] = x.shape
    val = 0
    # cost
    for i in xrange(n):
        val += (y[i] * np.log(h(x[i], w))
            + (1.0 - y[i]) * np.log(1.0 - h(x[i], w)))
    # regularisation
    reg = REGULARISATION * np.dot(np.transpose(w), w) / (2.0 * n)
    return -1.0 * (val / n) + reg


def batch_descent(w, x, y, h, speed):
    """
    Update the weight vector w (batch gradient descent).

    Args:
        w: weight vector (numpy.array)
        x: documents matrix (numpy.array)
        y: output vector (numpy.array)
        h: function of x and w

    Returns:
        Nothing; the weight vector w is updated in place.
    """
    [n, m] = x.shape
    for i in xrange(m):
        # L2 regularisation term, added to the gradient so that large
        # weights are penalised (matching the cost function above)
        reg = REGULARISATION * w[i] / n
        for j in xrange(n):
            w[i] = w[i] - speed * ((h(x[j], w) - y[j]) * x[j, i] + reg)


def stochastic_descent(w, x, y, h, speed):
    """
    Update the weight vector w (stochastic gradient descent).

    Args:
        w: weight vector (numpy.array)
        x: documents matrix (numpy.array)
        y: output vector (numpy.array)
        h: function of x and w

    Returns:
        Nothing; the weight vector w is updated in place.
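
    A sketch of the per-example update rule implemented below, with the L2
    regularisation term added to the data gradient (matching the cost
    function above):

        w[j] := w[j] - speed * ((h(x[i], w) - y[i]) * x[i, j]
                                + REGULARISATION * w[j] / n)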
    """
    [n, m] = x.shape
    for i in xrange(n):
        for j in xrange(m):
            # L2 regularisation term, added to the gradient so that large
            # weights are penalised (matching the cost function above)
            reg = REGULARISATION * w[j] / n
            w[j] = w[j] - speed * ((h(x[i], w) - y[i]) * x[i, j] + reg)

def gradient_descent(x, y, w_zero, speed=SPEED, stochastic=False,
        threshold=THRESHOLD_CONVERGENCE, max_iter=MAX_ITERATIONS):
    """
    Gradient descent (either batch or stochastic) finds a local minimum of a
    function f by iteratively subtracting a proportion of the gradient of f.

    Args:
        x: The train set (numpy.array).
        y: The training output vector (numpy.array).
        w_zero: initial value of the parameter (numpy.array).
        speed: The speed of the descent (float).
        stochastic: Batch or Stochastic gradient descent (Boolean).
        threshold: Convergence threshold for the difference between two
            consecutive cost function values (float, in percent).
        max_iter: Maximum number of iterations (integer).

    Returns:
        The weight vector which minimizes the logistic cost function (numpy.array).
    """
    # get the dimensions of the train set
    [n, m] = x.shape
    # init the weight vector
    w = w_zero
    # init variables
    iteration = 0
    diff = threshold + 1
    last_cost_val = 0
    # define h(x, w) as the sigmoid of the dot product x . w
    h = lambda a, b: sigmoid(np.dot(a, b))
    # gradient descent
    while (
        iteration < max_iter
        and diff > threshold
    ):
        iteration += 1
        print 'iteration %d...' % iteration

        # compute w
        if stochastic:
            # stochastic gradient descent
            stochastic_descent(w, x, y, h, speed)
        else:
            # batch gradient descent
            batch_descent(w, x, y, h, speed)

        # check convergence
        cost_val = cost(w, x, y, h)
        if iteration > 1:
            diff = abs(100 - (last_cost_val / cost_val) * 100)
        last_cost_val = cost_val
        valid = 0
        for i in xrange(n):
            v = 1 if sigmoid(np.dot(w, x[i])) >= 0.5 else 0
            valid += 1 if v == y[i] else 0
        percent = 100.0 * valid / n

        print ('Well-classified documents: {0} / {1} ({2}%)'
            .format(valid, n, percent))
        print 'Cost value: %.4f' % cost_val
        print 'DIFF: %.4f %%' % diff
        print

    return w


def main():
    """
    Sample training using the movie reviews corpus (Pang & Lee).
    """

    #== load inputs
    documents = np.array([movie_reviews.raw(review_id)
        for category in movie_reviews.categories()
        for review_id in movie_reviews.fileids(category)])

    sentiment_scores = np.array([0 if category == 'neg' else 1
        for category in movie_reviews.categories()
        for review_id in movie_reviews.fileids(category)])

    #== select random indices
    n = documents.shape[0]
    indices = np.random.permutation(n)
    threshold = int(np.floor(n * 0.8))  # 80% training set / 20% test set
    train_idx, test_idx = indices[:threshold], indices[threshold:]

    #== select training and test sets according to these indices
    # (documents and sentiment_scores are 1-D arrays, hence the single index)
    x_train, x_test = documents[train_idx], documents[test_idx]
    y_train, y_test = sentiment_scores[train_idx], sentiment_scores[test_idx]

    #== train the model
    print '===== Training the model...'
    sentiment = SentimentMachine(x_train.tolist(), y_train.tolist())
    w = sentiment.train(speed=0.001, stochastic=False)
    print '===== Model trained.'
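
    # (Optional) sanity-check sketch: classify a couple of hand-written
    # sentences with the freshly trained model. These sentences are made up
    # for illustration only; predictions depend on the training run.
    # print sentiment.classify('A brilliant, moving and beautifully acted film.')
    # print sentiment.classify('Dull, predictable and far too long.')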

    #== test the efficiency of the model
    print '===== Testing the model...'
    # compute the logistic cost on the test set
    h = lambda a, b: sigmoid(np.dot(a, b))
    x = sentiment.compute_features_matrix(x_test.tolist())
    test_cost = cost(w, x, y_test, h)
    # compute the number of valid classifications
    n_test = y_test.shape[0]
    valid = 0
    for i in xrange(n_test):
        valid += 1 if sentiment.classify(x_test[i]) == y_test[i] else 0
    percent = 100.0 * valid / n_test
    # print results
    print ('== Number of well-classified documents: {0} / {1} ({2}%)'
        .format(valid, n_test, percent))
    print '== Cost value on the test set: %.4f' % test_cost


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------