├── .gitignore
├── LICENSE
├── README.md
└── sentiment
    ├── __init__.py
    └── sentiment.py

/.gitignore:
--------------------------------------------------------------------------------
# Compiled source #
###################
*.com
*.class
*.dll
*.exe
*.o
*.so

# Logs and databases #
######################
*.log
*.sql
*.sqlite

# OS generated files #
######################
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
Icon?
ehthumbs.db
Thumbs.db

# Misc #
########
# text edit temp files
*~
# Sublime text project files
*.sublime-workspace

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Romain Strock

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Sentiment Analysis
==================

Sentiment Analysis using logistic regression (via gradient descent).
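
Below is a minimal usage sketch. The two example documents are made up purely
for illustration; a real corpus is needed for meaningful results, and the NLTK
`stopwords`, `punkt` and `movie_reviews` data must be downloaded first (e.g.
via `nltk.download()`). It assumes you run Python from the repository root:

```python
from sentiment.sentiment import SentimentMachine

# One document per string, scored 0 (negative) or 1 (positive).
docs = ['A brilliant, moving and beautifully acted film.',
        'Dull, predictable and far too long.']
scores = [1, 0]

machine = SentimentMachine(docs, scores)
machine.train(speed=0.001, stochastic=False)  # learns the weight vector w
print machine.classify('What a wonderful movie!')  # 1 = positive, 0 = negative
```

Running `python sentiment/sentiment.py` trains and evaluates the model on the
NLTK movie reviews corpus, as shown in the usage example linked below.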

--> [Usage Example](https://github.com/srom/sentiment/blob/master/sentiment/sentiment.py#L390)

### Dependencies

- [Python 2.7](https://www.python.org/download/releases/2.7)
- [NLTK](http://www.nltk.org/)
- [Numpy](http://www.numpy.org/)

### License

[MIT License](https://github.com/srom/sentiment/blob/master/LICENSE)

--------------------------------------------------------------------------------
/sentiment/__init__.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/sentiment/sentiment.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import re
from nltk.corpus import movie_reviews, stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.probability import FreqDist
import numpy as np


#==== General parameters
FEATURES_NUMBER = 2000
NGRAMS_NUMBER = 2
REGULARISATION = 10.0

#==== Gradient descent constants
SPEED = 0.001
MAX_ITERATIONS = 20
THRESHOLD_CONVERGENCE = 1  # in percentage

#==== Text processing constants
BLACKLIST_STOPWORDS = ['over', 'only', 'very', 'not', 'no']
ENGLISH_STOPWORDS = set(stopwords.words('english')) - set(BLACKLIST_STOPWORDS)
NEG_CONTRACTIONS = [
    (r'aren\'t', 'are not'),
    (r'can\'t', 'can not'),
    (r'couldn\'t', 'could not'),
    (r'daren\'t', 'dare not'),
    (r'didn\'t', 'did not'),
    (r'doesn\'t', 'does not'),
    (r'don\'t', 'do not'),
    (r'isn\'t', 'is not'),
    (r'hasn\'t', 'has not'),
    (r'haven\'t', 'have not'),
    (r'hadn\'t', 'had not'),
    (r'mayn\'t', 'may not'),
    (r'mightn\'t', 'might not'),
    (r'mustn\'t', 'must not'),
    (r'needn\'t', 'need not'),
    (r'oughtn\'t', 'ought not'),
    (r'shan\'t', 'shall not'),
    (r'shouldn\'t', 'should not'),
    (r'wasn\'t', 'was not'),
    (r'weren\'t', 'were not'),
    (r'won\'t', 'will not'),
    (r'wouldn\'t', 'would not'),
    (r'ain\'t', 'am not')  # not the only possible expansion, but the alternatives are stopwords anyway
]
OTHER_CONTRACTIONS = {
    "'m": 'am',
    "'ll": 'will',
    "'s": 'has',  # or 'is' but both are stopwords
    "'d": 'had'   # or 'would' but both are stopwords
}

class SentimentMachine(object):
    """
    This class trains a logistic regression model to analyse the sentiment
    of a document. Sentiment is either negative (0) or positive (1).
    """

    def __init__(self, training_set, score_set):
        """
        Init the SentimentMachine with the training set.

        Args:
            training_set: A list of documents (list of strings).
            score_set: A list of sentiment scores (list of numbers).

        len(training_set) and len(score_set) must be equal.
        """
        self.training_set = training_set
        self.score_set = score_set
        self.stemmer = PorterStemmer()
        # cache of the most common ngrams, keyed by n
        self._most_common_ngrams = {}
        # weight vector
        self.w = None

    def compute_ngrams(self, document, n):
        """
        Compute the ngrams of the document.

        Args:
            document: The document as a string.
            n: The number of grams. Must be a positive integer.

        Returns:
            A list of ngrams.
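
        Example (illustrative only; the exact output depends on the installed
        NLTK tokenizer, stopword list and stemmer, and recent NLTK versions
        return the ngrams lazily as a generator):

            compute_ngrams("I don't like this movie", 1)
            --> ['not', 'like', 'movi']
            compute_ngrams("I don't like this movie", 2)
            --> [('not', 'like'), ('like', 'movi')]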
        """
        # lowercase
        doc = document.lower()
        # TODO split by sentences for more accuracy
        # transform negative contractions (e.g. don't --> do not)
        for t in NEG_CONTRACTIONS:
            doc = re.sub(t[0], t[1], doc)
        # tokenize
        tokens = word_tokenize(doc)
        # transform other contractions (e.g. 'll --> will)
        tokens = [OTHER_CONTRACTIONS[token] if OTHER_CONTRACTIONS.get(token)
                  else token for token in tokens]
        # remove punctuation: keep only tokens that contain at least one letter
        r = r'[a-z]+'
        tokens = [word for word in tokens if re.search(r, word)]
        # remove irrelevant stop words
        tokens = [token for token in tokens if token not in ENGLISH_STOPWORDS]
        # stemming
        tokens = [self.stemmer.stem(token) for token in tokens]
        if n == 1:
            # return the list of words
            return tokens
        else:
            # return the list of ngrams
            return ngrams(tokens, n)

    def get_most_common_ngrams(self, n, nb_ngrams=None):
        """
        Compute and return the most common ngrams in the training documents.
        The result is cached inside the object.

        Args:
            n: The number of grams. Must be a positive integer.
            nb_ngrams: The number of ngrams to return, i.e. how many of the
                most common ngrams to keep. If None, all ngrams are returned.

        Returns:
            A list of the most common ngrams.
        """
        try:
            # return cached value
            return self._most_common_ngrams[n]
        except KeyError:
            pass

        # compute all ngrams
        all_ngrams = []
        for document in self.training_set:
            all_ngrams.extend(self.compute_ngrams(document, n))

        # compute the frequency distribution of the ngrams
        freq = FreqDist(ngram for ngram in all_ngrams)
        # store and return the nb_ngrams most common ngrams
        # NOTE: this relies on FreqDist.keys() being sorted by decreasing
        # frequency (NLTK 2.x behaviour); with NLTK 3 use freq.most_common().
        if nb_ngrams:
            self._most_common_ngrams[n] = freq.keys()[:nb_ngrams]
        else:
            self._most_common_ngrams[n] = freq.keys()
        return self._most_common_ngrams[n]

    def document_features(self, document):
        """
        Compute the binary feature vector of a given document.
        - most common words: 1 if the document contains this word, else 0
        - most common bigrams: 1 if the document contains this bigram, else 0

        Args:
            document: The document as a string.

        Returns:
            A list of binary features.
        """
        features = []

        # most common ngrams for n = 1 to NGRAMS_NUMBER
        nb_ngrams = NGRAMS_NUMBER
        nb_features = FEATURES_NUMBER // nb_ngrams
        for n in range(nb_ngrams):
            common_ngrams = []
            # get the ngrams of the document
            # (named doc_ngrams to avoid shadowing nltk.util.ngrams)
            doc_ngrams = set(self.compute_ngrams(document, n + 1))
            for ngram in self.get_most_common_ngrams(n + 1, nb_features):
                # if the document contains this common ngram then feature = 1 else 0
                common_ngrams.append(1 if ngram in doc_ngrams else 0)
            # add the new features
            features.extend(common_ngrams)

        return features


    def compute_features_matrix(self, train_set=None):
        """
        Build the N x M feature matrix X, where N equals the number of
        documents in the set and M equals the number of features.

        Args:
            train_set: A list of documents (list of strings).
                If None, self.training_set is used.

        Returns:
            A N x M matrix (numpy.array).
        """
        m = []
        for document in train_set or self.training_set:
            m.append(self.document_features(document))
        return np.array(m)


    def train(self, speed=0.001, stochastic=False):
        """
        Train the model via logistic regression (batch or stochastic
        gradient descent).

        Args:
            speed: Speed of the gradient descent.
            stochastic: If True, use stochastic gradient descent;
                otherwise use batch gradient descent.

        Returns:
            The learned weight vector (numpy.array).
        """
        # load training matrix
        print '==== Compute training set features...'
        x = self.compute_features_matrix()
        # the scores form a 1-D output vector
        y = np.array(self.score_set)
        print '==== Done'

        # shuffle
        [n, m] = x.shape
        print 'Number of features: %d' % m
        indices = np.random.permutation(n)
        x, y = x[indices, :], y[indices]

        # initial value for w
        w_zero = np.zeros(m)

        # train like a boss
        print '==== Start training...'
        method = 'Stochastic' if stochastic else 'Batch'
        print '==== (%s Gradient Descent)' % method
        self.w = gradient_descent(x, y, w_zero, speed=speed, stochastic=stochastic)
        print '==== Done'
        return self.w


    def classify(self, test_string):
        """
        Test the logistic model on the given string.

        Args:
            test_string: the test string.

        Returns:
            The predicted output value (0 or 1).
        """
        if self.w is None:
            raise ValueError('Looks like you forgot to .train() '
                + 'the model before .classify()-ing it!')

        # get the features vector
        x = np.array(self.document_features(test_string))

        # compute h(transpose(w) * x) and return the result according
        # to the decision boundary h(transpose(w) * x) = 0.5
        return 1 if sigmoid(np.dot(np.transpose(self.w), x)) >= 0.5 else 0


def sigmoid(z):
    """
    The sigmoid / logistic function.

    Args:
        z: any real number.

    Returns:
        A value between 0 and 1.
    """
    return 1.0 / (1.0 + np.exp(-1.0 * z))

def cost(w, x, y, h):
    """
    Cost function of the logistic regression: the average negative
    log-likelihood plus an L2 regularisation term.

    Args:
        w: weight vector (numpy.array)
        x: documents matrix (numpy.array)
        y: output vector (numpy.array)
        h: function of x and w

    Returns:
        The cost value (float).
    """
    [n, m] = x.shape
    val = 0
    # cost
    for i in xrange(n):
        val += (y[i] * np.log(h(x[i], w))
            + (1.0 - y[i]) * np.log(1.0 - h(x[i], w)))
    # regularisation
    reg = REGULARISATION * np.dot(np.transpose(w), w) / (2.0 * n)
    return -1.0 * (val / n) + reg


def batch_descent(w, x, y, h, speed):
    """
    Update the weight vector w (batch gradient descent).

    Args:
        w: weight vector (numpy.array)
        x: documents matrix (numpy.array)
        y: output vector (numpy.array)
        h: function of x and w

    Returns:
        Nothing; the weight vector w is updated in place.
    """
    [n, m] = x.shape
    for i in xrange(m):
        # L2 regularisation term, added to the gradient so that large
        # weights are penalised (matching the cost function above)
        reg = REGULARISATION * w[i] / n
        for j in xrange(n):
            w[i] = w[i] - speed * ((h(x[j], w) - y[j]) * x[j, i] + reg)


def stochastic_descent(w, x, y, h, speed):
    """
    Update the weight vector w (stochastic gradient descent).

    Args:
        w: weight vector (numpy.array)
        x: documents matrix (numpy.array)
        y: output vector (numpy.array)
        h: function of x and w

    Returns:
        Nothing; the weight vector w is updated in place.
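
    A sketch of the per-example update rule implemented below, with the L2
    regularisation term added to the data gradient (matching the cost
    function above):

        w[j] := w[j] - speed * ((h(x[i], w) - y[i]) * x[i, j]
                                + REGULARISATION * w[j] / n)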
    """
    [n, m] = x.shape
    for i in xrange(n):
        for j in xrange(m):
            # L2 regularisation term, added to the gradient so that large
            # weights are penalised (matching the cost function above)
            reg = REGULARISATION * w[j] / n
            w[j] = w[j] - speed * ((h(x[i], w) - y[i]) * x[i, j] + reg)

def gradient_descent(x, y, w_zero, speed=SPEED, stochastic=False,
        threshold=THRESHOLD_CONVERGENCE, max_iter=MAX_ITERATIONS):
    """
    Gradient descent (either batch or stochastic) finds a local minimum of a
    function f by iteratively subtracting a proportion of the gradient of f.

    Args:
        x: The train set (numpy.array).
        y: The training output vector (numpy.array).
        w_zero: initial value of the parameter (numpy.array).
        speed: The speed of the descent (float).
        stochastic: Batch or Stochastic gradient descent (Boolean).
        threshold: Convergence threshold for the difference between two
            consecutive cost function values (float, in percent).
        max_iter: Maximum number of iterations (integer).

    Returns:
        The weight vector which minimizes the logistic cost function (numpy.array).
    """
    # get the dimensions of the train set
    [n, m] = x.shape
    # init the weight vector
    w = w_zero
    # init variables
    iteration = 0
    diff = threshold + 1
    last_cost_val = 0
    # define h(x, w) as the sigmoid of the dot product x . w
    h = lambda a, b: sigmoid(np.dot(a, b))
    # gradient descent
    while (
        iteration < max_iter
        and diff > threshold
    ):
        iteration += 1
        print 'iteration %d...' % iteration

        # compute w
        if stochastic:
            # stochastic gradient descent
            stochastic_descent(w, x, y, h, speed)
        else:
            # batch gradient descent
            batch_descent(w, x, y, h, speed)

        # check convergence
        cost_val = cost(w, x, y, h)
        if iteration > 1:
            diff = abs(100 - (last_cost_val / cost_val) * 100)
        last_cost_val = cost_val
        valid = 0
        for i in xrange(n):
            v = 1 if sigmoid(np.dot(w, x[i])) >= 0.5 else 0
            valid += 1 if v == y[i] else 0
        percent = 100.0 * valid / n

        print ('Well-classified documents: {0} / {1} ({2}%)'
            .format(valid, n, percent))
        print 'Cost value: %.4f' % cost_val
        print 'DIFF: %.4f %%' % diff
        print

    return w


def main():
    """
    Sample training using the movie reviews corpus (Pang & Lee).
    """

    #== load inputs
    documents = np.array([movie_reviews.raw(review_id)
        for category in movie_reviews.categories()
        for review_id in movie_reviews.fileids(category)])

    sentiment_scores = np.array([0 if category == 'neg' else 1
        for category in movie_reviews.categories()
        for review_id in movie_reviews.fileids(category)])

    #== select random indices
    n = documents.shape[0]
    indices = np.random.permutation(n)
    threshold = int(np.floor(n * 0.8))  # 80% training set / 20% test set
    train_idx, test_idx = indices[:threshold], indices[threshold:]

    #== select training and test sets according to these indices
    # (documents and sentiment_scores are 1-D arrays, hence the single index)
    x_train, x_test = documents[train_idx], documents[test_idx]
    y_train, y_test = sentiment_scores[train_idx], sentiment_scores[test_idx]

    #== train the model
    print '===== Training the model...'
    sentiment = SentimentMachine(x_train.tolist(), y_train.tolist())
    w = sentiment.train(speed=0.001, stochastic=False)
    print '===== Model trained.'
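
    # (Optional) sanity-check sketch: classify a couple of hand-written
    # sentences with the freshly trained model. These sentences are made up
    # for illustration only; predictions depend on the training run.
    # print sentiment.classify('A brilliant, moving and beautifully acted film.')
    # print sentiment.classify('Dull, predictable and far too long.')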

    #== test the efficiency of the model
    print '===== Testing the model...'
    # compute the logistic cost on the test set
    h = lambda a, b: sigmoid(np.dot(a, b))
    x = sentiment.compute_features_matrix(x_test.tolist())
    test_cost = cost(w, x, y_test, h)
    # compute the number of valid classifications
    n_test = y_test.shape[0]
    valid = 0
    for i in xrange(n_test):
        valid += 1 if sentiment.classify(x_test[i]) == y_test[i] else 0
    percent = 100.0 * valid / n_test
    # print results
    print ('== Number of well-classified documents: {0} / {1} ({2}%)'
        .format(valid, n_test, percent))
    print '== Cost value on the test set: %.4f' % test_cost


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------