├── README.md
└── sentiment analysis (1).ipynb


/README.md:
--------------------------------------------------------------------------------
1 | # Sentiment-Analysis-on-a-Book-Review
2 | This repository contains a dataset which comprises of reviews on a book that is sold on Amazon.
3 | 


--------------------------------------------------------------------------------
/sentiment analysis (1).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Sentiment Analysis of a book on sale at Amazon.com"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "## About the Dataset\n",
 15 |     "\n",
 16 |     "This dataset contains a set of 10,000 reviews on a particular book which is scrapped from the comment section of Amazon's online website.\n",
 17 |     "We aim to find out the type of sentiment a particular reader has towards this book by making use of his/her comments."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Importing the Json file and building the framework of our class which predicts the sentiment."
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 1,
 30 |    "metadata": {},
 31 |    "outputs": [
 32 |     {
 33 |      "data": {
 34 |       "text/plain": [
 35 |        "'POSITIVE'"
 36 |       ]
 37 |      },
 38 |      "execution_count": 1,
 39 |      "metadata": {},
 40 |      "output_type": "execute_result"
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "import json\n",
 45 |     "class Sentiment:\n",
 46 |     "    NEGATIVE=\"NEGATIVE\"\n",
 47 |     "    NEUTRAL=\"NEUTRAL\"\n",
 48 |     "    POSITIVE=\"POSITIVE\"\n",
 49 |     "    \n",
 50 |     "class Review:\n",
 51 |     "    def __init__(self,text,score):  \n",
 52 |     "        self.text =text\n",
 53 |     "        self.score=score\n",
 54 |     "        self.sentiment=self.get_sentiment()\n",
 55 |     "        \n",
 56 |     "    def get_sentiment(self):\n",
 57 |     "        if self.score<=2:\n",
 58 |     "            return Sentiment.NEGATIVE\n",
 59 |     "        elif self.score==3:\n",
 60 |     "            return Sentiment.NEUTRAL\n",
 61 |     "        else:\n",
 62 |     "            return Sentiment.POSITIVE\n",
 63 |     "        \n",
 64 |     "file_name='C://Users//Nishil07//Documents//Books_small_10000.json'\n",
 65 |     "reviews=[]\n",
 66 |     "with open(file_name) as f:\n",
 67 |     "    for line in f:\n",
 68 |     "        review=json.loads(line)\n",
 69 |     "        reviews.append(Review(review['reviewText'],review['overall']))\n",
 70 |     "reviews[5].sentiment"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "Splitting the train and test data."
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 2,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "import numpy as np\n",
 87 |     "from sklearn.model_selection import train_test_split\n",
 88 |     "training,test=train_test_split(reviews,test_size=0.33,random_state=42)"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 3,
 94 |    "metadata": {},
 95 |    "outputs": [
 96 |     {
 97 |      "name": "stdout",
 98 |      "output_type": "stream",
 99 |      "text": [
100 |       "POSITIVE\n"
101 |      ]
102 |     }
103 |    ],
104 |    "source": [
105 |     "print(training[0].sentiment)"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 4,
111 |    "metadata": {},
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "text/plain": [
116 |        "'POSITIVE'"
117 |       ]
118 |      },
119 |      "execution_count": 4,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "train_x=[x.text for x in training]\n",
126 |     "train_y=[x.sentiment for x in training]\n",
127 |     "test_x=[x.text for x in test]\n",
128 |     "test_y=[x.sentiment for x in test]\n",
129 |     "test_y[0]"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "markdown",
134 |    "metadata": {},
135 |    "source": [
136 |     "Converting the words to vectors and making use of the 'bag of words' concept."
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 33,
142 |    "metadata": {},
143 |    "outputs": [
144 |     {
145 |      "name": "stdout",
146 |      "output_type": "stream",
147 |      "text": [
148 |       "  (0, 932)\t1\n",
149 |       "  (0, 11573)\t1\n",
150 |       "  (0, 16680)\t2\n",
151 |       "  (0, 2155)\t2\n",
152 |       "  (0, 6681)\t4\n",
153 |       "  (0, 951)\t2\n",
154 |       "  (0, 8908)\t5\n",
155 |       "  (0, 8874)\t6\n",
156 |       "  (0, 16593)\t8\n",
157 |       "  (0, 16586)\t4\n",
158 |       "  (0, 18578)\t2\n",
159 |       "  (0, 16824)\t9\n",
160 |       "  (0, 8374)\t1\n",
161 |       "  (0, 7710)\t1\n",
162 |       "  (0, 11267)\t1\n",
163 |       "  (0, 6505)\t1\n",
164 |       "  (0, 7844)\t1\n",
165 |       "  (0, 11409)\t2\n",
166 |       "  (0, 15845)\t2\n",
167 |       "  (0, 7681)\t1\n",
168 |       "  (0, 3393)\t1\n",
169 |       "  (0, 9991)\t3\n",
170 |       "  (0, 15306)\t3\n",
171 |       "  (0, 2541)\t2\n",
172 |       "  (0, 6868)\t4\n",
173 |       "  :\t:\n",
174 |       "  (0, 5312)\t1\n",
175 |       "  (0, 6715)\t1\n",
176 |       "  (0, 12077)\t1\n",
177 |       "  (0, 7629)\t1\n",
178 |       "  (0, 666)\t1\n",
179 |       "  (0, 7768)\t2\n",
180 |       "  (0, 1785)\t1\n",
181 |       "  (0, 372)\t1\n",
182 |       "  (0, 18468)\t1\n",
183 |       "  (0, 5577)\t1\n",
184 |       "  (0, 13335)\t1\n",
185 |       "  (0, 16555)\t1\n",
186 |       "  (0, 877)\t1\n",
187 |       "  (0, 11306)\t1\n",
188 |       "  (0, 8985)\t1\n",
189 |       "  (0, 1443)\t1\n",
190 |       "  (0, 9696)\t1\n",
191 |       "  (0, 9628)\t1\n",
192 |       "  (0, 10389)\t1\n",
193 |       "  (0, 9754)\t1\n",
194 |       "  (0, 16737)\t1\n",
195 |       "  (0, 11834)\t1\n",
196 |       "  (0, 18401)\t1\n",
197 |       "  (0, 10269)\t1\n",
198 |       "  (0, 15765)\t1\n"
199 |      ]
200 |     }
201 |    ],
202 |    "source": [
203 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
204 |     "vectorizer=CountVectorizer()\n",
205 |     "train_x_vectors=vectorizer.fit_transform(train_x)\n",
206 |     "test_x_vectors=vectorizer.fit_transform(test_x)\n",
207 |     "#test_x_vectors=test_x_vectors.toarray()\n",
208 |     "#test_x_vectors=np.reshape(test_x_vectors,(-1,1))\n",
209 |     "print(test_x_vectors[10])"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {},
215 |    "source": [
216 |     "Building the prediction models and fitting the train data into them"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": 6,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "from sklearn import svm"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 7,
231 |    "metadata": {},
232 |    "outputs": [
233 |     {
234 |      "data": {
235 |       "text/plain": [
236 |        "array(['POSITIVE'], dtype='<U8')"
237 |       ]
238 |      },
239 |      "execution_count": 7,
240 |      "metadata": {},
241 |      "output_type": "execute_result"
242 |     }
243 |    ],
244 |    "source": [
245 |     "clf_svm=svm.SVC(kernel='linear')\n",
246 |     "#clf_svm.fit(train_x_vectors,train_y)\n",
247 |     "#print(test_x[0])\n",
248 |     "clf_svm.fit(test_x_vectors,test_y)\n",
249 |     "clf_svm.predict(test_x_vectors[0])"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": 8,
255 |    "metadata": {},
256 |    "outputs": [
257 |     {
258 |      "data": {
259 |       "text/plain": [
260 |        "array(['POSITIVE'], dtype='<U8')"
261 |       ]
262 |      },
263 |      "execution_count": 8,
264 |      "metadata": {},
265 |      "output_type": "execute_result"
266 |     }
267 |    ],
268 |    "source": [
269 |     "from sklearn.tree import DecisionTreeClassifier\n",
270 |     "clf_dec=DecisionTreeClassifier()\n",
271 |     "clf_dec.fit(test_x_vectors,test_y)\n",
272 |     "clf_dec.predict(test_x_vectors[0])"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 34,
278 |    "metadata": {},
279 |    "outputs": [
280 |     {
281 |      "name": "stderr",
282 |      "output_type": "stream",
283 |      "text": [
284 |       "C:\\Users\\Nishil07\\anaconda3\\lib\\site-packages\\sklearn\\linear_model\\_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
285 |       "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
286 |       "\n",
287 |       "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
288 |       "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
289 |       "Please also refer to the documentation for alternative solver options:\n",
290 |       "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
291 |       "  extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
292 |      ]
293 |     },
294 |     {
295 |      "data": {
296 |       "text/plain": [
297 |        "array(['POSITIVE'], dtype='<U8')"
298 |       ]
299 |      },
300 |      "execution_count": 34,
301 |      "metadata": {},
302 |      "output_type": "execute_result"
303 |     }
304 |    ],
305 |    "source": [
306 |     "from sklearn.linear_model import LogisticRegression\n",
307 |     "clf_lr=LogisticRegression()\n",
308 |     "clf_lr.fit(test_x_vectors,test_y)\n",
309 |     "clf_lr.predict(test_x_vectors[0])"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "markdown",
314 |    "metadata": {},
315 |    "source": [
316 |     "Applying various testing metrics to check how accurate our model is......"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 37,
322 |    "metadata": {},
323 |    "outputs": [
324 |     {
325 |      "name": "stdout",
326 |      "output_type": "stream",
327 |      "text": [
328 |       "1.0\n",
329 |       "1.0\n",
330 |       "0.9972727272727273\n"
331 |      ]
332 |     }
333 |    ],
334 |    "source": [
335 |     "#mean accuracy\n",
336 |     "print(clf_svm.score(test_x_vectors,test_y))\n",
337 |     "print(clf_dec.score(test_x_vectors,test_y))\n",
338 |     "print(clf_lr.score(test_x_vectors,test_y))"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 42,
344 |    "metadata": {},
345 |    "outputs": [
346 |     {
347 |      "data": {
348 |       "text/plain": [
349 |        "array([0.99837633, 0.98911353, 0.99516908])"
350 |       ]
351 |      },
352 |      "execution_count": 42,
353 |      "metadata": {},
354 |      "output_type": "execute_result"
355 |     }
356 |    ],
357 |    "source": [
358 |     "#F1-scores\n",
359 |     "from sklearn.metrics import f1_score\n",
360 |     "f1_score(test_y,clf_lr.predict(test_x_vectors),average=None,labels=[Sentiment.POSITIVE,Sentiment.NEUTRAL,Sentiment.NEGATIVE])"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "code",
365 |    "execution_count": null,
366 |    "metadata": {},
367 |    "outputs": [],
368 |    "source": []
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": null,
373 |    "metadata": {},
374 |    "outputs": [],
375 |    "source": []
376 |   }
377 |  ],
378 |  "metadata": {
379 |   "kernelspec": {
380 |    "display_name": "Python 3",
381 |    "language": "python",
382 |    "name": "python3"
383 |   },
384 |   "language_info": {
385 |    "codemirror_mode": {
386 |     "name": "ipython",
387 |     "version": 3
388 |    },
389 |    "file_extension": ".py",
390 |    "mimetype": "text/x-python",
391 |    "name": "python",
392 |    "nbconvert_exporter": "python",
393 |    "pygments_lexer": "ipython3",
394 |    "version": "3.7.6"
395 |   }
396 |  },
397 |  "nbformat": 4,
398 |  "nbformat_minor": 4
399 | }
400 | 


--------------------------------------------------------------------------------