├── dicts
│   ├── inv.yml
│   ├── dec.yml
│   ├── inc.yml
│   ├── positive.yml
│   └── negative.yml
├── README.md
├── .gitignore
└── basic_sentiment_analysis.py

--------------------------------------------------------------------------------
/dicts/inv.yml:
--------------------------------------------------------------------------------
lack of: [inv]
not: [inv]

--------------------------------------------------------------------------------
/dicts/dec.yml:
--------------------------------------------------------------------------------
barely: [dec]
little: [dec]

--------------------------------------------------------------------------------
/dicts/inc.yml:
--------------------------------------------------------------------------------
too: [inc]
very: [inc]
sorely: [inc]

--------------------------------------------------------------------------------
/dicts/positive.yml:
--------------------------------------------------------------------------------
nice: [positive]
awesome: [positive]
cool: [positive]
superb: [positive]

--------------------------------------------------------------------------------
/dicts/negative.yml:
--------------------------------------------------------------------------------
bad: [negative]
uninspired: [negative]
expensive: [negative]
disappointed: [negative]
recommend others to avoid: [negative]
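Each YAML dictionary above maps a literal expression to a list of sentiment tags.
A minimal sketch of what loading one of them yields, assuming PyYAML is installed
(this snippet is illustrative and not part of the repository):

    import yaml

    with open('dicts/inc.yml') as f:
        print(yaml.safe_load(f))
    # {'too': ['inc'], 'very': ['inc'], 'sorely': ['inc']}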
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
basic_sentiment_analysis
========================

Code for the blog post: http://fjavieralba.com/basic-sentiment-analysis-with-python.html

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg

--------------------------------------------------------------------------------
/basic_sentiment_analysis.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
basic_sentiment_analysis
~~~~~~~~~~~~~~~~~~~~~~~~

This module contains the code and examples described in
http://fjavieralba.com/basic-sentiment-analysis-with-python.html

"""

from pprint import pprint
import nltk
import yaml


class Splitter(object):

    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences


class POSTagger(object):

    def pos_tag(self, sentences):
        """
        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token has a
        form, a lemma, and a list of tags; this simple tagger reuses the word
        form as the lemma.
            e.g.: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                   [('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('another', 'another', ['DT']), ('one', 'one', ['CD'])]]
        """
        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        # adapt the nltk (word, tag) pairs to (form, lemma, [tags]) triples
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos


class DictionaryTagger(object):

    def __init__(self, dictionary_paths):
        self.dictionary = {}
        self.max_key_size = 0
        for path in dictionary_paths:
            with open(path, 'r') as dict_file:
                curr_dict = yaml.safe_load(dict_file)
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                # key size is measured in words, because matching below walks tokens
                self.max_key_size = max(self.max_key_size, len(key.split()))

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        The result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while i < N:
            j = min(i + self.max_key_size, N)  # do not look past the sentence end
            tagged = False
            while j > i:
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = list(self.dictionary[literal])  # copy, so extend() below cannot mutate the dictionary
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:
                        # if the tagged literal is a single token, conserve its previous taggings
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence
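# Note (illustrative, not in the original module): tag_sentence applies
# longest-match-first, left to right. With the dictionaries above in place,
# a hand-built token list shows the multi-word key winning over any shorter match:
#
#     tagger = DictionaryTagger(['dicts/negative.yml'])
#     tokens = [(w, w, []) for w in 'i recommend others to avoid'.split()]
#     tagger.tag_sentence(tokens)
#     # [('i', 'i', []),
#     #  ('recommend others to avoid', 'recommend others to avoid', ['negative'])]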
def value_of(sentiment):
    if sentiment == 'positive':
        return 1
    if sentiment == 'negative':
        return -1
    return 0


def sentence_score(sentence_tokens, previous_token, acum_score):
    if not sentence_tokens:
        return acum_score
    current_token = sentence_tokens[0]
    tags = current_token[2]
    token_score = sum([value_of(tag) for tag in tags])
    if previous_token is not None:
        previous_tags = previous_token[2]
        if 'inc' in previous_tags:
            token_score *= 2.0
        elif 'dec' in previous_tags:
            token_score /= 2.0
        elif 'inv' in previous_tags:
            token_score *= -1.0
    return sentence_score(sentence_tokens[1:], current_token, acum_score + token_score)


def sentiment_score(review):
    return sum([sentence_score(sentence, None, 0.0) for sentence in review])


if __name__ == "__main__":
    text = """What can I say about this place. The staff of the restaurant is
    nice and the eggplant is not bad. Apart from that, very uninspired food,
    lack of atmosphere and too expensive. I am a staunch vegetarian and was
    sorely disappointed with the veggie options on the menu. Will be the last
    time I visit, I recommend others to avoid."""

    splitter = Splitter()
    postagger = POSTagger()
    dicttagger = DictionaryTagger(['dicts/positive.yml', 'dicts/negative.yml',
                                   'dicts/inc.yml', 'dicts/dec.yml', 'dicts/inv.yml'])

    split_sentences = splitter.split(text)
    pprint(split_sentences)

    pos_tagged_sentences = postagger.pos_tag(split_sentences)
    pprint(pos_tagged_sentences)

    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
    pprint(dict_tagged_sentences)

    print("analyzing sentiment...")
    score = sentiment_score(dict_tagged_sentences)
    print(score)

--------------------------------------------------------------------------------
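For a quick sanity check of the modifier arithmetic in sentence_score, a
hand-built token list (illustrative, not part of the repository) behaves like this:

    tokens = [('too', 'too', ['inc']), ('expensive', 'expensive', ['negative'])]
    sentence_score(tokens, None, 0.0)
    # -2.0: 'expensive' contributes -1, and the preceding 'inc' tag on 'too' doubles it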