├── polarityData
│   ├── rt-polaritydata.README.1.0.txt
│   └── rt-polaritydata
│       ├── rt-polarity-neg.txt
│       └── rt-polarity-pos.txt
├── readme.md
└── sentiment_analysis.py
/polarityData/rt-polaritydata.README.1.0.txt:
--------------------------------------------------------------------------------

=======

Introduction

This README v1.0 (June, 2005) for the v1.0 sentence polarity dataset comes
from the URL
http://www.cs.cornell.edu/people/pabo/movie-review-data .

=======

Citation Info

This data was first used in Bo Pang and Lillian Lee,
``Seeing stars: Exploiting class relationships for sentiment categorization
with respect to rating scales.'', Proceedings of the ACL, 2005.

@InProceedings{Pang+Lee:05a,
  author =    {Bo Pang and Lillian Lee},
  title =     {Seeing stars: Exploiting class relationships for sentiment
               categorization with respect to rating scales},
  booktitle = {Proceedings of the ACL},
  year =      2005
}

=======

Data Format Summary

- rt-polaritydata.tar.gz: contains this readme and two data files that
  were used in the experiments described in Pang/Lee ACL 2005.

  Specifically:
  * rt-polarity.pos contains 5331 positive snippets
  * rt-polarity.neg contains 5331 negative snippets

  Each line in these two files corresponds to a single snippet (usually
  containing roughly one single sentence); all snippets are down-cased.
  The snippets were labeled automatically, as described below (see
  section "Label Decision").

  Note: The original source files from which the data in
  rt-polaritydata.tar.gz was derived can be found in the subjective
  part (Rotten Tomatoes pages) of subjectivity_html.tar.gz (released
  with subjectivity dataset v1.0).

=======

Label Decision

We assumed snippets (from Rotten Tomatoes webpages) for reviews marked with
``fresh'' are positive, and those for reviews marked with ``rotten'' are
negative.
--------------------------------------------------------------------------------
/polarityData/rt-polaritydata/rt-polarity-neg.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abromberg/sentiment_analysis_python/bdae4288eb3c678d4af04d370934a07a7b2cd882/polarityData/rt-polaritydata/rt-polarity-neg.txt
--------------------------------------------------------------------------------
/polarityData/rt-polaritydata/rt-polarity-pos.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abromberg/sentiment_analysis_python/bdae4288eb3c678d4af04d370934a07a7b2cd882/polarityData/rt-polaritydata/rt-polarity-pos.txt
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
## README

### Documentation

For documentation, check out the blog post about this code [here](http://andybromberg.com/sentiment-analysis-python).

### Note

Because I developed this on Windows, there may be issues reading the polarity data files line by line with the code I provided (the line break characters are inconsistent). If this comes up, please [email me](mailto:hi@andybromberg.com)!
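If you hit that problem, one possible workaround (a minimal sketch, not part of the original script, assuming the same Python 2 setup as `sentiment_analysis.py`; the helper name `read_snippets` is made up for this example) is to open the files in universal-newline mode so Windows and Unix line endings split the same way:

```python
# Hypothetical helper for illustration: 'rU' enables universal newlines in
# Python 2, so '\r\n' and '\n' endings both yield one snippet per line.
def read_snippets(path):
    with open(path, 'rU') as f:
        return [line.rstrip() for line in f]

snippets = read_snippets('polarityData/rt-polaritydata/rt-polarity-pos.txt')
```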
--------------------------------------------------------------------------------
/sentiment_analysis.py:
--------------------------------------------------------------------------------
import re, math, collections, itertools, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist


POLARITY_DATA_DIR = os.path.join('polarityData', 'rt-polaritydata')
RT_POLARITY_POS_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-pos.txt')
RT_POLARITY_NEG_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-neg.txt')


#this function takes a feature selection mechanism and returns its performance in a variety of metrics
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #initializes referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts the correct labels in referenceSets and the predicted labels in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    #prints metrics to show how well the feature selection did
    print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print 'pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    print 'pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    print 'neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    print 'neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
    classifier.show_most_informative_features(10)

#creates a feature selection mechanism that uses all words
def make_full_dict(words):
    return dict([(word, True) for word in words])

#tries using all words as the feature selection mechanism
print 'using all words as features'
evaluate_features(make_full_dict)
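#[illustrative addition, not part of the original script] a minimal sketch of an
#alternative feature selection mechanism that drops common English stopwords
#before building the feature dict; it assumes the NLTK stopword corpus has been
#downloaded (nltk.download('stopwords')), and the name make_stopword_filtered_dict
#is made up for this example
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))

def make_stopword_filtered_dict(words):
    return dict([(word, True) for word in words if word not in stopset])

print 'using all words except stopwords as features'
evaluate_features(make_stopword_filtered_dict)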
#scores words based on chi-squared test to show information gain
#(http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/)
def create_word_scores():
    #creates lists of all positive and negative words
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    #builds a frequency distribution of all words, then frequency distributions of words within the positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores

#finds word scores
word_scores = create_word_scores()

#finds the best 'number' words based on word scores
def find_best_words(word_scores, number):
    best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
    best_words = set([w for w, s in best_vals])
    return best_words

#creates feature selection mechanism that only uses best words
def best_word_features(words):
    return dict([(word, True) for word in words if word in best_words])

#numbers of features to select
numbers_to_test = [10, 100, 1000, 10000, 15000]
#tries the best_word_features mechanism with each number of features in numbers_to_test
for num in numbers_to_test:
    print 'evaluating best %d word features' % (num)
    best_words = find_best_words(word_scores, num)
    evaluate_features(best_word_features)
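#[illustrative addition, not part of the original script] a quick sanity check on
#the chi-squared ranking: print a handful of the highest-scoring words so the
#ordering can be inspected by eye; the count of 10 is arbitrary
print 'ten highest-scoring words:', sorted(find_best_words(word_scores, 10))
--------------------------------------------------------------------------------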