├── README.md └── sentiment analysis (1).ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Sentiment-Analysis-on-a-Book-Review 2 | This repository contains a dataset which comprises reviews on a book that is sold on Amazon. 3 | -------------------------------------------------------------------------------- /sentiment analysis (1).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sentiment Analysis of a book on sale at Amazon.com" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## About the Dataset\n", 15 | "\n", 16 | "This dataset contains a set of 10,000 reviews on a particular book which was scraped from the comment section of Amazon's online website.\n", 17 | "We aim to find out the type of sentiment a particular reader has towards this book by making use of his/her comments." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Importing the JSON file and building the framework of our class which predicts the sentiment." 
import json


class Sentiment:
    """String constants naming the three sentiment classes."""
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"


class Review:
    """A single Amazon book review: raw text, star score, derived sentiment."""

    def __init__(self, text, score):
        self.text = text    # raw review text from the 'reviewText' field
        self.score = score  # star rating from the 'overall' field
        self.sentiment = self.get_sentiment()  # label derived from the score

    def get_sentiment(self):
        """Map the numeric star score to a Sentiment label.

        1-2 stars -> NEGATIVE, 3 stars -> NEUTRAL, anything above -> POSITIVE.
        """
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE


def load_reviews(path):
    """Read one JSON record per line from `path` and wrap each in a Review."""
    loaded = []
    with open(path) as f:
        for line in f:
            record = json.loads(line)
            loaded.append(Review(record['reviewText'], record['overall']))
    return loaded


# NOTE(review): hardcoded absolute local path — prefer a configurable DATA_DIR
# (e.g. pathlib.Path) so the notebook runs on other machines.
file_name = 'C://Users//Nishil07//Documents//Books_small_10000.json'
reviews = load_reviews(file_name)
reviews[5].sentiment
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out a third of the reviews for evaluation; fixed seed so the split
# is reproducible across runs.
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

print(training[0].sentiment)

# Separate the raw text (features) from the sentiment labels for each split.
train_x = [review.text for review in training]
train_y = [review.sentiment for review in training]
test_x = [review.text for review in test]
test_y = [review.sentiment for review in test]
test_y[0]
from sklearn.feature_extraction.text import CountVectorizer

# Learn the bag-of-words vocabulary from the TRAINING text only.
vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
# BUG FIX: the original called vectorizer.fit_transform(test_x), which
# re-fits the vocabulary on the test text. That yields test vectors whose
# columns (and width) do not line up with train_x_vectors, so any model
# trained on the training vectors sees garbage at prediction time.
# Encode the test set with the already-fitted training vocabulary instead.
test_x_vectors = vectorizer.transform(test_x)
print(test_x_vectors[10])
216 | "Building the prediction models and fitting the train data into them" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 6, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "from sklearn import svm" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 7, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "array(['POSITIVE'], dtype='