├── Model_Creation.py
└── README.md


/Model_Creation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Created on Sun Nov 22 17:26:01 2015
  3 | Script will do sentiment analysis on restaurant reviews.
  4 | @author: Ricky
  5 | """
  6 | from sklearn.feature_extraction.text import CountVectorizer
  7 | from sklearn.feature_extraction.text import TfidfTransformer
  8 | from sklearn.ensemble import VotingClassifier
  9 | from sklearn.linear_model import LogisticRegression
 10 | from sklearn.neighbors import KNeighborsClassifier
 11 | from sklearn.naive_bayes import MultinomialNB
 12 | from sklearn.pipeline import Pipeline
 13 | from nltk.util import ngrams
 14 | import re
 15 | from sklearn import linear_model
 16 | 
 17 | fileWriter = open('out.txt','w')
 18 | 
 19 | mystopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
 20 | 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
 21 | 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
 22 | 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
 23 | 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
 24 | 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
 25 | 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
 26 | 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
 27 | 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
 28 | 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
 29 | 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
 30 | 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now']
 31 | 
 32 | def loadData(fname):
 33 |     reviews=[]
 34 |     labels=[]
 35 |    # count2 = 0
 36 |     f=open(fname)
 37 |     for line in f:
 38 |        # count2 = count2 + 1
 39 |         review,rating=line.strip().split('\t')
 40 |         review = re.sub('not ', 'not', review)
 41 |         review = re.sub('Not ', 'Not', review)
 42 |         review = re.sub('<br>', ' ',review)
 43 |         review = re.sub(' +', ' ',review)
 44 |        # review = re.sub('[^a-z\d]', ' ',review)
 45 |         terms = review.split()
 46 |         reviews.append(review.lower())    
 47 |         labels.append(int(rating))
 48 |     threegrams = ngrams(terms,3)
 49 |     for tg in threegrams:
 50 |         if tg[0] in mystopwords or tg[1] in mystopwords or tg[2] in mystopwords:
 51 |             continue
 52 |   #  print count2  
 53 |     f.close()
 54 |     return reviews,labels
 55 | 
 56 | def loadTrainData(fname):
 57 |     reviews=[]
 58 |     f=open(fname)
 59 |     for line in f:
 60 |         review=line.strip()
 61 |         review = re.sub('not ', 'not', review)
 62 |         review = re.sub('Not ', 'Not', review)
 63 |         review = re.sub('<br>', ' ',review)
 64 |         review = re.sub(' +', ' ',review)
 65 |        # review = re.sub('[^a-z\d]', ' ',review)
 66 |         terms = review.split()
 67 |         reviews.append(review.lower())
 68 |     threegrams = ngrams(terms,3)
 69 |     for tg in threegrams:
 70 |         if tg[0] in mystopwords or tg[1] in mystopwords or tg[2] in mystopwords:
 71 |             continue
 72 |     f.close()
 73 |     return reviews
 74 | 
 75 | rev_train,labels_train=loadData('training.txt')
 76 | rev_test=loadTrainData('testing.txt')
 77 | 
 78 | 
 79 | MNB_pipeline = Pipeline([('vect', CountVectorizer(ngram_range = (1, 2))), 
 80 |                          ('clf', MultinomialNB(alpha = 1.0, fit_prior = True)),
 81 |                         ])
 82 | 
 83 | KNN_pipeline = Pipeline([('vect', CountVectorizer()), 
 84 |                          ('clf', KNeighborsClassifier(n_neighbors = 20)),
 85 |                         ])
 86 |                         
 87 | SGD_pipeline = Pipeline([('vect', CountVectorizer()),
 88 |                         ('clf', linear_model.SGDClassifier(loss='log')),
 89 |                         ])
 90 |                         
 91 | LR_pipeline = Pipeline([('vect', CountVectorizer()), 
 92 |                         ('tfidf', TfidfTransformer(norm = 'l2', use_idf = True, smooth_idf = True, sublinear_tf = True)),
 93 |                         ('clf', LogisticRegression(warm_start = True, random_state = 1)),
 94 |                        ]) 
 95 |                      
 96 | 
 97 | eclf = VotingClassifier(estimators=[('MNB', MNB_pipeline), ('SGD',SGD_pipeline), ('LR', LR_pipeline)], voting = 'soft', weights = [3,2,3])
 98 | #('KNN', KNN_pipeline), 
 99 | 
100 | eclf.fit(rev_train,labels_train)
101 | 
102 | #use soft voting to predict (majority voting)
103 | pred=eclf.predict(rev_test)
104 | 
105 | for x in pred:
106 |     fileWriter.write(str(x)+'\n')
107 | fileWriter.close()
108 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sentimental-Analysis-Python
2 | Build a classifier that predicts whether a restaurant review is positive or negative, based on the review text only. Classification algorithms such as KNN, Logistic Regression and Naive Bayes are used to classify each review as either positive or negative. 85% accuracy was achieved using various combinations of the above algorithms.
3 | 


--------------------------------------------------------------------------------