├── Model_Creation.py
└── README.md
/Model_Creation.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on Sun Nov 22 17:26:01 2015
3 | Script will do sentiment analysis on restaurant reviews.
4 | @author: Ricky
5 | """
6 | from sklearn.feature_extraction.text import CountVectorizer
7 | from sklearn.feature_extraction.text import TfidfTransformer
8 | from sklearn.ensemble import VotingClassifier
9 | from sklearn.linear_model import LogisticRegression
10 | from sklearn.neighbors import KNeighborsClassifier
11 | from sklearn.naive_bayes import MultinomialNB
12 | from sklearn.pipeline import Pipeline
13 | from nltk.util import ngrams
14 | import re
15 | from sklearn import linear_model
16 |
# Output handle for the predictions. It is opened here, at import time, and the
# script writes to it and closes it once the predictions have been produced.
fileWriter = open('out.txt','w')

# English stop words. Built from whitespace-separated strings for readability;
# the result is a plain list with the words in the order shown.
mystopwords = (
    "i me my myself we our ours ourselves you your yours "
    "yourself yourselves he him his himself she her hers "
    "herself it its itself they them their theirs themselves "
    "what which who whom this that these those am is are "
    "was were be been being have has had having do does "
    "did doing a an the and but if or because as until "
    "while of at by for with about against between into "
    "through during before after above below to from up down "
    "in out on off over under again further then once here "
    "there when where why how all any both each few more "
    "most other some such no nor not only own same so "
    "than too very s t can will just don should now"
).split()
31 |
def loadData(fname):
    """Load labelled reviews from a tab-separated text file.

    Every non-blank line is expected to hold the review text, a TAB
    character, and an integer rating, in that order.

    Args:
        fname: Path of the file to read.

    Returns:
        A ``(reviews, labels)`` tuple of two parallel lists: the normalised,
        lower-cased review strings and their integer ratings.

    Raises:
        ValueError: If a non-blank line contains no TAB separator or its
            rating is not an integer.
    """
    reviews = []
    labels = []
    # The context manager closes the file even when a malformed line raises.
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. trailing newline at end of file) holds no record.
                continue
            # Split on the LAST tab so a tab inside the review text cannot
            # break the two-value unpacking.
            review, rating = line.rsplit('\t', 1)
            # Glue a negation onto the word that follows it
            # ("not good" -> "notgood"); presumably so the vectorizer sees the
            # negated phrase as a single token.
            review = re.sub('not ', 'not', review)
            review = re.sub('Not ', 'Not', review)
            # NOTE(review): this pattern was split across two physical lines in
            # the source as received; reconstructed as a newline -- confirm.
            review = re.sub('\n', ' ', review)
            # Collapse runs of spaces into one.
            review = re.sub(' +', ' ', review)
            reviews.append(review.lower())
            labels.append(int(rating))
    return reviews, labels
55 |
def loadTrainData(fname):
    """Load unlabelled reviews, one review per line.

    Despite the name, the function does not read labels: it returns only the
    review text. Every line of the file yields exactly one entry (a blank
    line yields an empty string), so the result stays aligned with the input
    lines.

    Args:
        fname: Path of the file to read.

    Returns:
        A list of normalised, lower-cased review strings, in file order.
    """
    reviews = []
    # The context manager closes the file even if reading fails part-way.
    with open(fname) as f:
        for line in f:
            review = line.strip()
            # Glue a negation onto the word that follows it
            # ("not good" -> "notgood"); presumably so the vectorizer sees the
            # negated phrase as a single token.
            review = re.sub('not ', 'not', review)
            review = re.sub('Not ', 'Not', review)
            # NOTE(review): this pattern was split across two physical lines in
            # the source as received; reconstructed as a newline -- confirm.
            review = re.sub('\n', ' ', review)
            # Collapse runs of spaces into one.
            review = re.sub(' +', ' ', review)
            reviews.append(review.lower())
    return reviews
74 |
# Labelled reviews used to fit the ensemble.
rev_train, labels_train = loadData('training.txt')
# Unlabelled reviews to classify. NOTE: despite its name, loadTrainData is the
# loader for text without labels, which here is the test set.
rev_test = loadTrainData('testing.txt')


# Multinomial Naive Bayes on unigram + bigram counts.
MNB_pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('clf', MultinomialNB(alpha=1.0, fit_prior=True)),
])

# k-nearest neighbours on unigram counts.
# NOTE(review): defined but not part of the ensemble below.
KNN_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', KNeighborsClassifier(n_neighbors=20)),
])

# Linear model with logistic loss trained by stochastic gradient descent.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss'; 'log'
# is kept for the version this script was written against -- confirm before
# upgrading.
SGD_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', linear_model.SGDClassifier(loss='log')),
])

# Logistic regression on TF-IDF weighted unigram counts.
LR_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True,
                               sublinear_tf=True)),
    ('clf', LogisticRegression(warm_start=True, random_state=1)),
])


# Soft voting combines the members' predicted class probabilities using the
# given weights (MNB=3, SGD=2, LR=3); it is not a majority vote over labels.
eclf = VotingClassifier(
    estimators=[('MNB', MNB_pipeline), ('SGD', SGD_pipeline), ('LR', LR_pipeline)],
    voting='soft',
    weights=[3, 2, 3],
)

eclf.fit(rev_train, labels_train)

# Predict one label per test review with the soft-voting ensemble.
pred = eclf.predict(rev_test)

# Write one prediction per line. The context manager closes the module-level
# output handle even if a write fails.
with fileWriter:
    for x in pred:
        fileWriter.write(str(x) + '\n')
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sentimental-Analysis-Python
2 | Builds a classifier that predicts whether a restaurant review is positive or negative, based on the review text only. It uses classification algorithms such as KNN, Logistic Regression and Naive Bayes to classify each review as either positive or negative. An accuracy of 85% was achieved using various combinations of the above algorithms.
3 |
--------------------------------------------------------------------------------