├── .gitignore ├── ReadMe.md └── sentiment.py /.gitignore: -------------------------------------------------------------------------------- 1 | SentiWordNet.txt 2 | .DS_Store 3 | *.pyc 4 | -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A simple dictionary-based tool for sentiment scoring a sentence based on SentiWordNet 3.0 4 | 5 | Sentiment scores are between -1 and 1, greater than 0 for positive and less than 0 for negative. 6 | 7 | Dictionary-based sentiment analysis does not perform as well as a trained classifier, 8 | but it is domain-independent, based on *a priori* knowledge of words' sentiment values. 9 | 10 | The class handles negations and multiword expressions. 11 | 12 | ## Dependencies 13 | 14 | nltk including tokenizers 15 | 16 | # Usage 17 | 18 | First download SentiWordNet 3.0 [here](http://sentiwordnet.isti.cnr.it/), and delete any header and footer lines so that the file contains only data, e.g. 19 | 20 | ``` 21 | a 00001740 0.125 0 able#1 (usually followed by 'to') having the necessary... 22 | ``` 23 | 24 | Initialize SentimentAnalysis with your SentiWordNet filename and choice of weighting across word senses. 25 | 26 | ```python 27 | s = SentimentAnalysis(filename='SentiWordNet.txt',weighting='geometric') 28 | 29 | >>> s.score('I love you!') 30 | 0.59375 31 | 32 | >>> s.score('Pants are the worst.') 33 | -0.125 34 | 35 | >>> s.score('I do not particularly enjoy this product.') 36 | -0.15885416666666666 37 | ``` 38 | 39 | The weighting can be 'average', 'geometric' or 'harmonic'. 40 | See Guerini et al. "Sentiment Analysis: How to Derive Prior Polarities from SentiWordNet". -------------------------------------------------------------------------------- /sentiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class to score sentiment of text. 
3 | 4 | Use domain-independent method of dictionary lookup of sentiment words, 5 | handling negations and multiword expressions. Based on SentiWordNet 3.0. 6 | 7 | """ 8 | 9 | import nltk 10 | import re 11 | 12 | 13 | class SentimentAnalysis(object): 14 | """Class to get sentiment score based on analyzer.""" 15 | 16 | def __init__(self, filename='SentiWordNet.txt', weighting='geometric'): 17 | """Initialize with filename and choice of weighting.""" 18 | if weighting not in ('geometric', 'harmonic', 'average'): 19 | raise ValueError( 20 | 'Allowed weighting options are geometric, harmonic, average') 21 | # parse file and build sentiwordnet dicts 22 | self.swn_pos = {'a': {}, 'v': {}, 'r': {}, 'n': {}} 23 | self.swn_all = {} 24 | self.build_swn(filename, weighting) 25 | 26 | def average(self, score_list): 27 | """Get arithmetic average of scores.""" 28 | if(score_list): 29 | return sum(score_list) / float(len(score_list)) 30 | else: 31 | return 0 32 | 33 | def geometric_weighted(self, score_list): 34 | """"Get geometric weighted sum of scores.""" 35 | weighted_sum = 0 36 | num = 1 37 | for el in score_list: 38 | weighted_sum += (el * (1 / float(2**num))) 39 | num += 1 40 | return weighted_sum 41 | 42 | # another possible weighting instead of average 43 | def harmonic_weighted(self, score_list): 44 | """Get harmonic weighted sum of scores.""" 45 | weighted_sum = 0 46 | num = 2 47 | for el in score_list: 48 | weighted_sum += (el * (1 / float(num))) 49 | num += 1 50 | return weighted_sum 51 | 52 | def build_swn(self, filename, weighting): 53 | """Build class's lookup based on SentiWordNet 3.0.""" 54 | records = [line.split('\t') for line in open(filename)] 55 | for rec in records: 56 | # has many words in 1 entry 57 | words = rec[4].split() 58 | pos = rec[0] 59 | for word_num in words: 60 | word = word_num.split('#')[0] 61 | sense_num = int(word_num.split('#')[1]) 62 | 63 | # build a dictionary key'ed by sense number 64 | if word not in self.swn_pos[pos]: 65 | 
self.swn_pos[pos][word] = {} 66 | self.swn_pos[pos][word][sense_num] = float( 67 | rec[2]) - float(rec[3]) 68 | if word not in self.swn_all: 69 | self.swn_all[word] = {} 70 | self.swn_all[word][sense_num] = float(rec[2]) - float(rec[3]) 71 | 72 | # convert innermost dicts to ordered lists of scores 73 | for pos in self.swn_pos.keys(): 74 | for word in self.swn_pos[pos].keys(): 75 | newlist = [self.swn_pos[pos][word][k] for k in sorted( 76 | self.swn_pos[pos][word].keys())] 77 | if weighting == 'average': 78 | self.swn_pos[pos][word] = self.average(newlist) 79 | if weighting == 'geometric': 80 | self.swn_pos[pos][word] = self.geometric_weighted(newlist) 81 | if weighting == 'harmonic': 82 | self.swn_pos[pos][word] = self.harmonic_weighted(newlist) 83 | 84 | for word in self.swn_all.keys(): 85 | newlist = [self.swn_all[word][k] for k in sorted( 86 | self.swn_all[word].keys())] 87 | if weighting == 'average': 88 | self.swn_all[word] = self.average(newlist) 89 | if weighting == 'geometric': 90 | self.swn_all[word] = self.geometric_weighted(newlist) 91 | if weighting == 'harmonic': 92 | self.swn_all[word] = self.harmonic_weighted(newlist) 93 | 94 | def pos_short(self, pos): 95 | """Convert NLTK POS tags to SWN's POS tags.""" 96 | if pos in set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']): 97 | return 'v' 98 | elif pos in set(['JJ', 'JJR', 'JJS']): 99 | return 'a' 100 | elif pos in set(['RB', 'RBR', 'RBS']): 101 | return 'r' 102 | elif pos in set(['NNS', 'NN', 'NNP', 'NNPS']): 103 | return 'n' 104 | else: 105 | return 'a' 106 | 107 | def score_word(self, word, pos): 108 | """Get sentiment score of word based on SWN and part of speech.""" 109 | try: 110 | return self.swn_pos[pos][word] 111 | except KeyError: 112 | try: 113 | return self.swn_all[word] 114 | except KeyError: 115 | return 0 116 | 117 | def score(self, sentence): 118 | """Sentiment score a sentence.""" 119 | # init sentiwordnet lookup/scoring tools 120 | impt = set(['NNS', 'NN', 'NNP', 'NNPS', 'JJ', 'JJR', 
'JJS', 121 | 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 122 | 'VBP', 'VBZ', 'unknown']) 123 | non_base = set(['VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NNS', 'NNPS']) 124 | negations = set(['not', 'n\'t', 'less', 'no', 'never', 125 | 'nothing', 'nowhere', 'hardly', 'barely', 126 | 'scarcely', 'nobody', 'none']) 127 | stopwords = nltk.corpus.stopwords.words('english') 128 | wnl = nltk.WordNetLemmatizer() 129 | 130 | scores = [] 131 | tokens = nltk.tokenize.word_tokenize(sentence) 132 | tagged = nltk.pos_tag(tokens) 133 | 134 | index = 0 135 | for el in tagged: 136 | 137 | pos = el[1] 138 | try: 139 | word = re.match('(\w+)', el[0]).group(0).lower() 140 | start = index - 5 141 | if start < 0: 142 | start = 0 143 | neighborhood = tokens[start:index] 144 | 145 | # look for trailing multiword expressions 146 | word_minus_one = tokens[index-1:index+1] 147 | word_minus_two = tokens[index-2:index+1] 148 | 149 | # if multiword expression, fold to one expression 150 | if(self.is_multiword(word_minus_two)): 151 | if len(scores) > 1: 152 | scores.pop() 153 | scores.pop() 154 | if len(neighborhood) > 1: 155 | neighborhood.pop() 156 | neighborhood.pop() 157 | word = '_'.join(word_minus_two) 158 | pos = 'unknown' 159 | 160 | elif(self.is_multiword(word_minus_one)): 161 | if len(scores) > 0: 162 | scores.pop() 163 | if len(neighborhood) > 0: 164 | neighborhood.pop() 165 | word = '_'.join(word_minus_one) 166 | pos = 'unknown' 167 | 168 | # perform lookup 169 | if (pos in impt) and (word not in stopwords): 170 | if pos in non_base: 171 | word = wnl.lemmatize(word, self.pos_short(pos)) 172 | score = self.score_word(word, self.pos_short(pos)) 173 | if len(negations.intersection(set(neighborhood))) > 0: 174 | score = -score 175 | scores.append(score) 176 | 177 | except AttributeError: 178 | pass 179 | 180 | index += 1 181 | 182 | if len(scores) > 0: 183 | return sum(scores) / float(len(scores)) 184 | else: 185 | return 0 186 | 187 | def is_multiword(self, words): 188 | """Test if a group 
of words is a multiword expression.""" 189 | joined = '_'.join(words) 190 | return joined in self.swn_all 191 | 192 | --------------------------------------------------------------------------------