├── polarityData
│   ├── rt-polaritydata.README.1.0.txt
│   └── rt-polaritydata
│       ├── rt-polarity-neg.txt
│       └── rt-polarity-pos.txt
├── readme.md
└── sentiment_analysis.py
/polarityData/rt-polaritydata.README.1.0.txt:
--------------------------------------------------------------------------------

=======

Introduction

This README v1.0 (June, 2005) for the v1.0 sentence polarity dataset comes
from the URL
http://www.cs.cornell.edu/people/pabo/movie-review-data .

=======

Citation Info

This data was first used in Bo Pang and Lillian Lee,
``Seeing stars: Exploiting class relationships for sentiment categorization
with respect to rating scales.'', Proceedings of the ACL, 2005.

@InProceedings{Pang+Lee:05a,
  author =    {Bo Pang and Lillian Lee},
  title =     {Seeing stars: Exploiting class relationships for sentiment
               categorization with respect to rating scales},
  booktitle = {Proceedings of the ACL},
  year =      2005
}

=======

Data Format Summary

- rt-polaritydata.tar.gz: contains this readme and two data files that
  were used in the experiments described in Pang/Lee ACL 2005.

  Specifically:
  * rt-polarity.pos contains 5331 positive snippets
  * rt-polarity.neg contains 5331 negative snippets

  Each line in these two files corresponds to a single snippet (usually
  containing roughly one single sentence); all snippets are down-cased.
  The snippets were labeled automatically, as described below (see
  section "Label Decision").

  Note: The original source files from which the data in
  rt-polaritydata.tar.gz was derived can be found in the subjective
  part (Rotten Tomatoes pages) of subjectivity_html.tar.gz (released
  with subjectivity dataset v1.0).

=======

Label Decision

We assumed snippets (from Rotten Tomatoes webpages) for reviews marked with
``fresh'' are positive, and those for reviews marked with ``rotten'' are
negative.
--------------------------------------------------------------------------------
/polarityData/rt-polaritydata/rt-polarity-neg.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abromberg/sentiment_analysis_python/bdae4288eb3c678d4af04d370934a07a7b2cd882/polarityData/rt-polaritydata/rt-polarity-neg.txt
--------------------------------------------------------------------------------
/polarityData/rt-polaritydata/rt-polarity-pos.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abromberg/sentiment_analysis_python/bdae4288eb3c678d4af04d370934a07a7b2cd882/polarityData/rt-polaritydata/rt-polarity-pos.txt
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
## README

### Documentation

For documentation, check out the blog post about this code [here](http://andybromberg.com/sentiment-analysis-python).

### Note

Because I developed this on Windows, there may be issues reading the polarity data files line by line with the code I provided (the line break characters are inconsistent). If this comes up, please [email me](mailto:hi@andybromberg.com)!
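If you hit that problem, one possible workaround (a minimal sketch, not part of the original script, assuming the same Python 2 setup as `sentiment_analysis.py`; the helper name `read_snippets` is made up for this example) is to open the files in universal-newline mode so Windows and Unix line endings split the same way:

```python
# Hypothetical helper for illustration: 'rU' enables universal newlines in
# Python 2, so '\r\n' and '\n' endings both yield one snippet per line.
def read_snippets(path):
    with open(path, 'rU') as f:
        return [line.rstrip() for line in f]

snippets = read_snippets('polarityData/rt-polaritydata/rt-polarity-pos.txt')
```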
--------------------------------------------------------------------------------
/sentiment_analysis.py:
--------------------------------------------------------------------------------
import re, math, collections, itertools, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist


POLARITY_DATA_DIR = os.path.join('polarityData', 'rt-polaritydata')
RT_POLARITY_POS_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-pos.txt')
RT_POLARITY_NEG_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-neg.txt')


#this function takes a feature selection mechanism and returns its performance in a variety of metrics
def evaluate_features(feature_select):
    posFeatures = []
    negFeatures = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the input mechanism) and appends 'pos' or 'neg' after each list
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords = [feature_select(posWords), 'pos']
            posFeatures.append(posWords)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords = [feature_select(negWords), 'neg']
            negFeatures.append(negWords)

    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]

    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)

    #initializes referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts the correct labels in referenceSets and the predicted labels in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    #prints metrics to show how well the feature selection did
    print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print 'pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos'])
    print 'pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos'])
    print 'neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg'])
    print 'neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg'])
    classifier.show_most_informative_features(10)

#creates a feature selection mechanism that uses all words
def make_full_dict(words):
    return dict([(word, True) for word in words])

#tries using all words as the feature selection mechanism
print 'using all words as features'
evaluate_features(make_full_dict)
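#[illustrative addition, not part of the original script] a minimal sketch of an
#alternative feature selection mechanism that drops common English stopwords
#before building the feature dict; it assumes the NLTK stopword corpus has been
#downloaded (nltk.download('stopwords')), and the name make_stopword_filtered_dict
#is made up for this example
from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))

def make_stopword_filtered_dict(words):
    return dict([(word, True) for word in words if word not in stopset])

print 'using all words except stopwords as features'
evaluate_features(make_stopword_filtered_dict)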
#scores words based on chi-squared test to show information gain
#(http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/)
def create_word_scores():
    #creates lists of all positive and negative words
    posWords = []
    negWords = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posWords.append(posWord)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    #builds a frequency distribution of all words, then frequency distributions of words within the positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1

    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    #builds dictionary of word scores based on chi-squared test
    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores

#finds word scores
word_scores = create_word_scores()

#finds the best 'number' words based on word scores
def find_best_words(word_scores, number):
    best_vals = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:number]
    best_words = set([w for w, s in best_vals])
    return best_words

#creates feature selection mechanism that only uses best words
def best_word_features(words):
    return dict([(word, True) for word in words if word in best_words])

#numbers of features to select
numbers_to_test = [10, 100, 1000, 10000, 15000]
#tries the best_word_features mechanism with each number of features in numbers_to_test
for num in numbers_to_test:
    print 'evaluating best %d word features' % (num)
    best_words = find_best_words(word_scores, num)
    evaluate_features(best_word_features)
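#[illustrative addition, not part of the original script] a quick sanity check on
#the chi-squared ranking: print a handful of the highest-scoring words so the
#ordering can be inspected by eye; the count of 10 is arbitrary
print 'ten highest-scoring words:', sorted(find_best_words(word_scores, 10))
--------------------------------------------------------------------------------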