├── .gitignore ├── ReadMe.md └── sentiment.py /.gitignore: -------------------------------------------------------------------------------- 1 | SentiWordNet.txt 2 | .DS_Store 3 | *.pyc 4 | -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | A simple dictionary-based tool for sentiment scoring a sentence based on SentiWordNet 3.0 4 | 5 | Sentiment scores are between -1 and 1, greater than 0 for positive and less than 0 for negative. 6 | 7 | Dictionary-based sentiment analysis does not perform as well as a trained classifier, 8 | but it is domain-independent, based on *a priori* knowledge of words' sentiment values. 9 | 10 | The class handles negations and multiword expressions. 11 | 12 | ## Dependencies 13 | 14 | nltk including tokenizers 15 | 16 | # Usage 17 | 18 | First download SentiWordNet 3.0 [here](http://sentiwordnet.isti.cnr.it/), and delete any header and footer lines so that the file contains only data, e.g. 19 | 20 | ``` 21 | a 00001740 0.125 0 able#1 (usually followed by 'to') having the necessary... 22 | ``` 23 | 24 | Initialize SentimentAnalysis with your SentiWordNet filename and choice of weighting across word senses. 25 | 26 | ```python 27 | s = SentimentAnalysis(filename='SentiWordNet.txt',weighting='geometric') 28 | 29 | >>> s.score('I love you!') 30 | 0.59375 31 | 32 | >>> s.score('Pants are the worst.') 33 | -0.125 34 | 35 | >>> s.score('I do not particularly enjoy this product.') 36 | -0.15885416666666666 37 | ``` 38 | 39 | The weighting can be 'average', 'geometric' or 'harmonic'. 40 | See Guerini et al. "Sentiment Analysis: How to Derive Prior Polarities from SentiWordNet". -------------------------------------------------------------------------------- /sentiment.py: -------------------------------------------------------------------------------- 1 | """ 2 | Class to score sentiment of text. 
3 | 4 | Use domain-independent method of dictionary lookup of sentiment words, 5 | handling negations and multiword expressions. Based on SentiWordNet 3.0. 6 | 7 | """ 8 | 9 | import nltk 10 | import re 11 | 12 | 13 | class SentimentAnalysis(object): 14 | """Class to get sentiment score based on analyzer.""" 15 | 16 | def __init__(self, filename='SentiWordNet.txt', weighting='geometric'): 17 | """Initialize with filename and choice of weighting.""" 18 | if weighting not in ('geometric', 'harmonic', 'average'): 19 | raise ValueError( 20 | 'Allowed weighting options are geometric, harmonic, average') 21 | # parse file and build sentiwordnet dicts 22 | self.swn_pos = {'a': {}, 'v': {}, 'r': {}, 'n': {}} 23 | self.swn_all = {} 24 | self.build_swn(filename, weighting) 25 | 26 | def average(self, score_list): 27 | """Get arithmetic average of scores.""" 28 | if(score_list): 29 | return sum(score_list) / float(len(score_list)) 30 | else: 31 | return 0 32 | 33 | def geometric_weighted(self, score_list): 34 | """"Get geometric weighted sum of scores.""" 35 | weighted_sum = 0 36 | num = 1 37 | for el in score_list: 38 | weighted_sum += (el * (1 / float(2**num))) 39 | num += 1 40 | return weighted_sum 41 | 42 | # another possible weighting instead of average 43 | def harmonic_weighted(self, score_list): 44 | """Get harmonic weighted sum of scores.""" 45 | weighted_sum = 0 46 | num = 2 47 | for el in score_list: 48 | weighted_sum += (el * (1 / float(num))) 49 | num += 1 50 | return weighted_sum 51 | 52 | def build_swn(self, filename, weighting): 53 | """Build class's lookup based on SentiWordNet 3.0.""" 54 | records = [line.split('\t') for line in open(filename)] 55 | for rec in records: 56 | # has many words in 1 entry 57 | words = rec[4].split() 58 | pos = rec[0] 59 | for word_num in words: 60 | word = word_num.split('#')[0] 61 | sense_num = int(word_num.split('#')[1]) 62 | 63 | # build a dictionary key'ed by sense number 64 | if word not in self.swn_pos[pos]: 65 | 
self.swn_pos[pos][word] = {} 66 | self.swn_pos[pos][word][sense_num] = float( 67 | rec[2]) - float(rec[3]) 68 | if word not in self.swn_all: 69 | self.swn_all[word] = {} 70 | self.swn_all[word][sense_num] = float(rec[2]) - float(rec[3]) 71 | 72 | # convert innermost dicts to ordered lists of scores 73 | for pos in self.swn_pos.keys(): 74 | for word in self.swn_pos[pos].keys(): 75 | newlist = [self.swn_pos[pos][word][k] for k in sorted( 76 | self.swn_pos[pos][word].keys())] 77 | if weighting == 'average': 78 | self.swn_pos[pos][word] = self.average(newlist) 79 | if weighting == 'geometric': 80 | self.swn_pos[pos][word] = self.geometric_weighted(newlist) 81 | if weighting == 'harmonic': 82 | self.swn_pos[pos][word] = self.harmonic_weighted(newlist) 83 | 84 | for word in self.swn_all.keys(): 85 | newlist = [self.swn_all[word][k] for k in sorted( 86 | self.swn_all[word].keys())] 87 | if weighting == 'average': 88 | self.swn_all[word] = self.average(newlist) 89 | if weighting == 'geometric': 90 | self.swn_all[word] = self.geometric_weighted(newlist) 91 | if weighting == 'harmonic': 92 | self.swn_all[word] = self.harmonic_weighted(newlist) 93 | 94 | def pos_short(self, pos): 95 | """Convert NLTK POS tags to SWN's POS tags.""" 96 | if pos in set(['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']): 97 | return 'v' 98 | elif pos in set(['JJ', 'JJR', 'JJS']): 99 | return 'a' 100 | elif pos in set(['RB', 'RBR', 'RBS']): 101 | return 'r' 102 | elif pos in set(['NNS', 'NN', 'NNP', 'NNPS']): 103 | return 'n' 104 | else: 105 | return 'a' 106 | 107 | def score_word(self, word, pos): 108 | """Get sentiment score of word based on SWN and part of speech.""" 109 | try: 110 | return self.swn_pos[pos][word] 111 | except KeyError: 112 | try: 113 | return self.swn_all[word] 114 | except KeyError: 115 | return 0 116 | 117 | def score(self, sentence): 118 | """Sentiment score a sentence.""" 119 | # init sentiwordnet lookup/scoring tools 120 | impt = set(['NNS', 'NN', 'NNP', 'NNPS', 'JJ', 'JJR', 
'JJS', 121 | 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 122 | 'VBP', 'VBZ', 'unknown']) 123 | non_base = set(['VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'NNS', 'NNPS']) 124 | negations = set(['not', 'n\'t', 'less', 'no', 'never', 125 | 'nothing', 'nowhere', 'hardly', 'barely', 126 | 'scarcely', 'nobody', 'none']) 127 | stopwords = nltk.corpus.stopwords.words('english') 128 | wnl = nltk.WordNetLemmatizer() 129 | 130 | scores = [] 131 | tokens = nltk.tokenize.word_tokenize(sentence) 132 | tagged = nltk.pos_tag(tokens) 133 | 134 | index = 0 135 | for el in tagged: 136 | 137 | pos = el[1] 138 | try: 139 | word = re.match('(\w+)', el[0]).group(0).lower() 140 | start = index - 5 141 | if start < 0: 142 | start = 0 143 | neighborhood = tokens[start:index] 144 | 145 | # look for trailing multiword expressions 146 | word_minus_one = tokens[index-1:index+1] 147 | word_minus_two = tokens[index-2:index+1] 148 | 149 | # if multiword expression, fold to one expression 150 | if(self.is_multiword(word_minus_two)): 151 | if len(scores) > 1: 152 | scores.pop() 153 | scores.pop() 154 | if len(neighborhood) > 1: 155 | neighborhood.pop() 156 | neighborhood.pop() 157 | word = '_'.join(word_minus_two) 158 | pos = 'unknown' 159 | 160 | elif(self.is_multiword(word_minus_one)): 161 | if len(scores) > 0: 162 | scores.pop() 163 | if len(neighborhood) > 0: 164 | neighborhood.pop() 165 | word = '_'.join(word_minus_one) 166 | pos = 'unknown' 167 | 168 | # perform lookup 169 | if (pos in impt) and (word not in stopwords): 170 | if pos in non_base: 171 | word = wnl.lemmatize(word, self.pos_short(pos)) 172 | score = self.score_word(word, self.pos_short(pos)) 173 | if len(negations.intersection(set(neighborhood))) > 0: 174 | score = -score 175 | scores.append(score) 176 | 177 | except AttributeError: 178 | pass 179 | 180 | index += 1 181 | 182 | if len(scores) > 0: 183 | return sum(scores) / float(len(scores)) 184 | else: 185 | return 0 186 | 187 | def is_multiword(self, words): 188 | """Test if a group 
of words is a multiword expression.""" 189 | joined = '_'.join(words) 190 | return joined in self.swn_all 191 | 192 | --------------------------------------------------------------------------------