# Repository layout:
#   Model_Creation.py
#   README.md
#
# /Model_Creation.py:
# --------------------------------------------------------------------------------
"""
Created on Sun Nov 22 17:26:01 2015
Script will do sentiment analysis on restaurant reviews.

Reads labelled reviews from 'training.txt' (one ``review<TAB>rating`` pair per
line), fits a soft-voting ensemble of text classifiers on them, predicts a
label for every line of 'testing.txt' and writes the predictions, one per
line, to 'out.txt'.

@author: Ricky
"""
import re

import sklearn
from nltk.util import ngrams
from sklearn import linear_model
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# English stop words.  Nothing in this script consults the list any more (the
# only user was a trigram loop that had no effect); the name is kept so that
# anything importing it keeps working.
mystopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
               'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
               'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
               'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
               'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
               'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
               'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
               'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
               'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
               'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
               'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
               'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']

# NOTE(review): the literal originally passed to the third re.sub() call of the
# clean-up did not survive extraction of this file -- it shows up as a raw line
# break.  It was either a line-break control character or an HTML <br> tag;
# both are replaced by a space here so that either original is covered.
# TODO confirm against the upstream file.
_BREAK_RE = re.compile(r'<br\s*/?>|[\n\r\x0b\x0c\x1c\x1d\x1e]', re.IGNORECASE)
_SPACES_RE = re.compile(' +')


def _clean_review(review):
    """Return *review* normalised for vectorisation.

    Steps, in this order:
      1. glue a negation to the following word ('not good' -> 'notgood') so
         the pair becomes a single token.  NOTE(review): the match is a plain
         substring match, so any word ending in 'not' is glued as well.
      2. replace embedded line breaks / <br> tags with a space,
      3. collapse runs of spaces,
      4. lower-case the result.
    """
    review = review.replace('not ', 'not')
    review = review.replace('Not ', 'Not')
    review = _BREAK_RE.sub(' ', review)
    review = _SPACES_RE.sub(' ', review)
    return review.lower()


def loadData(fname):
    """Load labelled reviews from *fname*.

    Each non-blank line must look like ``<review text>\\t<integer rating>``.
    The line is split on its LAST tab, so a tab inside the review text does
    not break parsing; blank lines are skipped.

    Returns:
        (reviews, labels): the cleaned review strings and the ratings as ints,
        in file order.

    Raises:
        ValueError: if a non-blank line has no tab or a non-integer rating.
    """
    reviews = []
    labels = []
    with open(fname) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            review, rating = line.rsplit('\t', 1)
            reviews.append(_clean_review(review))
            labels.append(int(rating))
    return reviews, labels


def loadTrainData(fname):
    """Load unlabelled reviews, one per line, from *fname*.

    Despite its name, this script uses the function for the file to be
    predicted ('testing.txt').  Every input line yields exactly one entry,
    blank lines included, so predictions stay aligned with the input lines.

    Returns:
        The cleaned review strings in file order.
    """
    with open(fname) as f:
        return [_clean_review(line.strip()) for line in f]


def _sgd_log_loss_name():
    """Return the name of SGDClassifier's logistic loss for this scikit-learn.

    The loss was renamed from 'log' to 'log_loss' in scikit-learn 1.1 and the
    old spelling is rejected from 1.3 on.
    """
    found = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    if found and (int(found.group(1)), int(found.group(2))) >= (1, 1):
        return 'log_loss'
    return 'log'


rev_train, labels_train = loadData('training.txt')
rev_test = loadTrainData('testing.txt')


MNB_pipeline = Pipeline([('vect', CountVectorizer(ngram_range = (1, 2))),
                         ('clf', MultinomialNB(alpha = 1.0, fit_prior = True)),
                         ])

# Built but not part of the ensemble below.
KNN_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('clf', KNeighborsClassifier(n_neighbors = 20)),
                         ])

SGD_pipeline = Pipeline([('vect', CountVectorizer()),
                         ('clf', linear_model.SGDClassifier(loss=_sgd_log_loss_name())),
                         ])

LR_pipeline = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer(norm = 'l2', use_idf = True, smooth_idf = True, sublinear_tf = True)),
                        ('clf', LogisticRegression(warm_start = True, random_state = 1)),
                        ])


eclf = VotingClassifier(estimators=[('MNB', MNB_pipeline), ('SGD',SGD_pipeline), ('LR', LR_pipeline)], voting = 'soft', weights = [3,2,3])
#('KNN', KNN_pipeline),

eclf.fit(rev_train,labels_train)

# Soft voting: the predicted label is the one with the highest weighted
# average of the member classifiers' predicted probabilities.
pred=eclf.predict(rev_test)

# The output file is opened only once predictions exist, so a failed run does
# not leave a truncated 'out.txt' behind.
with open('out.txt','w') as fileWriter:
    fileWriter.writelines(str(x)+'\n' for x in pred)

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # Sentimental-Analysis-Python
# Build a classifier that predicts whether a restaurant review is positive or
# negative, based on the review text only. Used classification algorithms like
# KNN, Logistic Regression and Naive Bayes to classify the reviews either
# positive or negative.
# 85% accuracy was achieved using various combinations of the above algorithms.
# --------------------------------------------------------------------------------