├── dicts
│   ├── inv.yml
│   ├── dec.yml
│   ├── inc.yml
│   ├── positive.yml
│   └── negative.yml
├── README.md
├── .gitignore
└── basic_sentiment_analysis.py

--------------------------------------------------------------------------------
/dicts/inv.yml:
--------------------------------------------------------------------------------
lack of: [inv]
not: [inv]

--------------------------------------------------------------------------------
/dicts/dec.yml:
--------------------------------------------------------------------------------
barely: [dec]
little: [dec]

--------------------------------------------------------------------------------
/dicts/inc.yml:
--------------------------------------------------------------------------------
too: [inc]
very: [inc]
sorely: [inc]

--------------------------------------------------------------------------------
/dicts/positive.yml:
--------------------------------------------------------------------------------
nice: [positive]
awesome: [positive]
cool: [positive]
superb: [positive]

--------------------------------------------------------------------------------
/dicts/negative.yml:
--------------------------------------------------------------------------------
bad: [negative]
uninspired: [negative]
expensive: [negative]
disappointed: [negative]
recommend others to avoid: [negative]
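Each YAML dictionary above maps a literal expression to a list of sentiment tags.
A minimal sketch of what loading one of them yields, assuming PyYAML is installed
(this snippet is illustrative and not part of the repository):

    import yaml

    with open('dicts/inc.yml') as f:
        print(yaml.safe_load(f))
    # {'too': ['inc'], 'very': ['inc'], 'sorely': ['inc']}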
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
basic_sentiment_analysis
========================

Code for the blog post: http://fjavieralba.com/basic-sentiment-analysis-with-python.html

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg

--------------------------------------------------------------------------------
/basic_sentiment_analysis.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
basic_sentiment_analysis
~~~~~~~~~~~~~~~~~~~~~~~~

This module contains the code and examples described in
http://fjavieralba.com/basic-sentiment-analysis-with-python.html

"""

from pprint import pprint
import nltk
import yaml


class Splitter(object):

    def __init__(self):
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences


class POSTagger(object):

    def pos_tag(self, sentences):
        """
        input format: list of lists of words
            e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token has a
        form, a lemma, and a list of tags; this simple tagger reuses the word
        form as the lemma.
            e.g.: [[('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
                   [('this', 'this', ['DT']), ('is', 'is', ['VBZ']), ('another', 'another', ['DT']), ('one', 'one', ['CD'])]]
        """
        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        # adapt the nltk (word, tag) pairs to (form, lemma, [tags]) triples
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos


class DictionaryTagger(object):

    def __init__(self, dictionary_paths):
        self.dictionary = {}
        self.max_key_size = 0
        for path in dictionary_paths:
            with open(path, 'r') as dict_file:
                curr_dict = yaml.safe_load(dict_file)
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                # key size is measured in words, because matching below walks tokens
                self.max_key_size = max(self.max_key_size, len(key.split()))

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        The result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
            - longest matches have higher priority
            - search is made from left to right
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while i < N:
            j = min(i + self.max_key_size, N)  # do not look past the sentence end
            tagged = False
            while j > i:
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = list(self.dictionary[literal])  # copy, so extend() below cannot mutate the dictionary
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:
                        # if the tagged literal is a single token, conserve its previous taggings
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence
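# Note (illustrative, not in the original module): tag_sentence applies
# longest-match-first, left to right. With the dictionaries above in place,
# a hand-built token list shows the multi-word key winning over any shorter match:
#
#     tagger = DictionaryTagger(['dicts/negative.yml'])
#     tokens = [(w, w, []) for w in 'i recommend others to avoid'.split()]
#     tagger.tag_sentence(tokens)
#     # [('i', 'i', []),
#     #  ('recommend others to avoid', 'recommend others to avoid', ['negative'])]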
def value_of(sentiment):
    if sentiment == 'positive':
        return 1
    if sentiment == 'negative':
        return -1
    return 0


def sentence_score(sentence_tokens, previous_token, acum_score):
    if not sentence_tokens:
        return acum_score
    current_token = sentence_tokens[0]
    tags = current_token[2]
    token_score = sum([value_of(tag) for tag in tags])
    if previous_token is not None:
        previous_tags = previous_token[2]
        if 'inc' in previous_tags:
            token_score *= 2.0
        elif 'dec' in previous_tags:
            token_score /= 2.0
        elif 'inv' in previous_tags:
            token_score *= -1.0
    return sentence_score(sentence_tokens[1:], current_token, acum_score + token_score)


def sentiment_score(review):
    return sum([sentence_score(sentence, None, 0.0) for sentence in review])


if __name__ == "__main__":
    text = """What can I say about this place. The staff of the restaurant is
    nice and the eggplant is not bad. Apart from that, very uninspired food,
    lack of atmosphere and too expensive. I am a staunch vegetarian and was
    sorely disappointed with the veggie options on the menu. Will be the last
    time I visit, I recommend others to avoid."""

    splitter = Splitter()
    postagger = POSTagger()
    dicttagger = DictionaryTagger(['dicts/positive.yml', 'dicts/negative.yml',
                                   'dicts/inc.yml', 'dicts/dec.yml', 'dicts/inv.yml'])

    split_sentences = splitter.split(text)
    pprint(split_sentences)

    pos_tagged_sentences = postagger.pos_tag(split_sentences)
    pprint(pos_tagged_sentences)

    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
    pprint(dict_tagged_sentences)

    print("analyzing sentiment...")
    score = sentiment_score(dict_tagged_sentences)
    print(score)

--------------------------------------------------------------------------------
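For a quick sanity check of the modifier arithmetic in sentence_score, a
hand-built token list (illustrative, not part of the repository) behaves like this:

    tokens = [('too', 'too', ['inc']), ('expensive', 'expensive', ['negative'])]
    sentence_score(tokens, None, 0.0)
    # -2.0: 'expensive' contributes -1, and the preceding 'inc' tag on 'too' doubles it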