├── Aspect Based Sentiment Analysis.ipynb
├── README.md
├── tagged_text_list_test.pkl
└── tagged_text_list_train.pkl

/Aspect Based Sentiment Analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "from sklearn.externals import joblib\n",
13 | "from sklearn.feature_extraction.text import CountVectorizer\n",
14 | "from sklearn.feature_extraction import DictVectorizer\n",
15 | "from sklearn.linear_model import SGDClassifier\n",
16 | "from sklearn.multiclass import OneVsRestClassifier\n",
17 | "from sklearn.naive_bayes import MultinomialNB\n",
18 | "from sklearn import svm\n",
19 | "import xml.etree.ElementTree as ET\n",
20 | "from lxml import etree\n",
21 | "from scipy.sparse import hstack\n",
22 | "import numpy as np\n",
23 | "import warnings\n",
24 | "\n",
25 | "\n",
26 | "path_train = r'E:\Engineering\8th sem\nlp COMP 473\NLP projects\ABSA16_Laptops_Train_English_SB2.xml'\n",
27 | "path_test = r'E:\Engineering\8th sem\nlp COMP 473\NLP projects\EN_LAPT_SB2_TEST.xml'\n",
28 | "\n",
29 | "#For Stanford POS tagger\n",
30 | "home = r'C:\Users\THe_strOX\Anaconda3\stanford-postagger-full-2017-06-09'\n",
31 | "from nltk.tag.stanford import StanfordPOSTagger as POS_Tag\n",
32 | "from nltk import word_tokenize\n",
33 | "_path_to_model = home + '/models/english-bidirectional-distsim.tagger' \n",
34 | "_path_to_jar = home + '/stanford-postagger.jar'\n",
35 | "stanford_tag = POS_Tag(model_filename=_path_to_model, path_to_jar=_path_to_jar)"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 2,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": [
46 | "#XML parser\n",
47 | "def get_list(path):\n",
48 | " tree=ET.parse(path)\n",
49 | " root = tree.getroot()\n",
50 | " text_list = []\n",
51 | " opinion_list = []\n",
52 | " for review in root.findall('Review'):\n",
53 | " text_string=\"\"\n",
54 | " opinion_inner_list=[]\n",
55 | " for sent in review.findall('./sentences/sentence'):\n",
56 | " text_string= text_string+ \" \"+ sent.find('text').text\n",
57 | " text_list.append(text_string)\n",
58 | " for opinion in review.findall('./Opinions/Opinion'):\n",
59 | " opinion_dict = {\n",
60 | " opinion.get('category').replace('#','_'): opinion.get('polarity')\n",
61 | " }\n",
62 | " opinion_inner_list.append(opinion_dict)\n",
63 | " opinion_list.append(opinion_inner_list)\n",
64 | " return text_list,opinion_list"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "#Select only the 20 most common aspects.\n",
76 | "def get_most_common_aspect(opinion_list):\n",
77 | " import nltk\n",
78 | " opinion= []\n",
79 | " for inner_list in opinion_list:\n",
80 | " for _dict in inner_list:\n",
81 | " for key in _dict:\n",
82 | " opinion.append(key)\n",
83 | " most_common_aspect = [k for k,v in nltk.FreqDist(opinion).most_common(20)]\n",
84 | " return most_common_aspect"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 4,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "#Generate data frame\n",
96 | "def 
get_data_frame(text_list,opinion_list,most_common_aspect):\n", 97 | " data={'Review':text_list}\n", 98 | " df = pd.DataFrame(data)\n", 99 | " if opinion_list:\n", 100 | " for inner_list in opinion_list:\n", 101 | " for _dict in inner_list:\n", 102 | " for key in _dict:\n", 103 | " if key in most_common_aspect:\n", 104 | " df.loc[opinion_list.index(inner_list),key]=_dict[key]\n", 105 | " return df" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "#generate data frame for aspect extraction task\n", 117 | "def get_aspect_data_frame(df,most_common_aspect):\n", 118 | " for common_aspect in most_common_aspect:\n", 119 | " df[common_aspect]=df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])\n", 120 | " df = df.fillna(0)\n", 121 | " return df" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "def get_positive_data_frame(df,most_common_aspect):\n", 133 | " for common_aspect in most_common_aspect:\n", 134 | " df[common_aspect]=df[common_aspect].replace(['positive'],[1])\n", 135 | " df[common_aspect]=df[common_aspect].replace(['negative','neutral','conflict'],[0,0,0])\n", 136 | " df = df.fillna(0)\n", 137 | " return df" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "def get_negative_data_frame(df,most_common_aspect):\n", 149 | " for common_aspect in most_common_aspect:\n", 150 | " df[common_aspect]=df[common_aspect].replace(['negative'],[1])\n", 151 | " df[common_aspect]=df[common_aspect].replace(['positive','neutral','conflict'],[0,0,0])\n", 152 | " df = df.fillna(0)\n", 153 | " return df" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 8, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "def get_neutral_data_frame(df,most_common_aspect):\n", 165 | " for common_aspect in most_common_aspect:\n", 166 | " df[common_aspect]=df[common_aspect].replace(['neutral','conflict'],[1,1])\n", 167 | " df[common_aspect]=df[common_aspect].replace(['negative','positive'],[0,0])\n", 168 | " df = df.fillna(0)\n", 169 | " return df" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 9, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "#To tag using stanford pos tagger\n", 181 | "def posTag(review):\n", 182 | " tagged_text_list=[]\n", 183 | " for text in review:\n", 184 | " tagged_text_list.append(stanford_tag.tag(word_tokenize(text)))\n", 185 | " return tagged_text_list\n", 186 | "#posTag(\"this is random text\")" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 10, 192 | "metadata": { 193 | "collapsed": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "#Filter the word with tag- noun,adjective,verb,adverb\n", 198 | "def filterTag(tagged_review):\n", 199 | " final_text_list=[]\n", 200 | " for text_list in tagged_review:\n", 201 | " final_text=[]\n", 202 | " for word,tag in text_list:\n", 203 | " if tag in ['NN','NNS','NNP','NNPS','RB','RBR','RBS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']:\n", 204 | " final_text.append(word)\n", 205 | " final_text_list.append(' '.join(final_text))\n", 206 | " return 
final_text_list" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 11, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "def get_dict_aspect(y,most_common_aspect):\n", 218 | " position=[]\n", 219 | " for innerlist in y:\n", 220 | " position.append([i for i, j in enumerate(innerlist) if j == 1])\n", 221 | " sorted_common=sorted(most_common_aspect)\n", 222 | " dict_aspect=[]\n", 223 | " for innerlist in position:\n", 224 | " inner_dict={}\n", 225 | " for word in sorted_common:\n", 226 | " if sorted_common.index(word) in innerlist:\n", 227 | " inner_dict[word]= 5\n", 228 | " else:\n", 229 | " inner_dict[word]=0\n", 230 | " dict_aspect.append(inner_dict)\n", 231 | " return dict_aspect" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 12, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "#Stage 1:\n", 243 | "#Making list to train\n", 244 | "train_text_list,train_opinion_list = get_list(path_train)\n", 245 | "most_common_aspect = get_most_common_aspect(train_opinion_list)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 13, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "#This takes time to tag. Already tagged and saved. So, loading file ...\n", 257 | "#tagged_text_list_train=posTag(train_text_list)\n", 258 | "#joblib.dump(tagged_text_list_train, 'tagged_text_list_train.pkl')\n", 259 | "tagged_text_list_train=joblib.load('tagged_text_list_train.pkl')" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 14, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "#train list after filter\n", 271 | "final_train_text_list=filterTag(tagged_text_list_train)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 15, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "#get data frame\n", 283 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 284 | "df_train_aspect = get_aspect_data_frame(df_train,most_common_aspect)\n", 285 | "df_train_aspect = df_train_aspect.reindex_axis(sorted(df_train_aspect.columns), axis=1)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 16, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "#Similar for test list\n", 297 | "test_text_list,test_opinion_list = get_list(path_test)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 17, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "#tagged_text_list_test=posTag(test_text_list)\n", 309 | "#joblib.dump(tagged_text_list_test, 'tagged_text_list_test.pkl')\n", 310 | "tagged_text_list_test=joblib.load('tagged_text_list_test.pkl')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 18, 316 | "metadata": { 317 | "collapsed": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "final_test_text_list=filterTag(tagged_text_list_test)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 19, 327 | "metadata": { 328 | "collapsed": false 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n", 333 | "df_test_aspect = 
get_aspect_data_frame(df_test,most_common_aspect)\n",
334 | "df_test_aspect = df_test_aspect.reindex_axis(sorted(df_test_aspect.columns), axis=1)"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 20,
340 | "metadata": {
341 | "collapsed": false
342 | },
343 | "outputs": [],
344 | "source": [
345 | "#Sort the data frame according to aspect names and separate data (X) and target (y)\n",
346 | "#df_train_aspect = df_train_aspect.sample(frac=1).reset_index(drop=True) #For random shuffling\n",
347 | "X_train= df_train_aspect.Review\n",
348 | "y_train = df_train_aspect.drop('Review',1)\n",
349 | "\n",
350 | "#df_test_aspect = df_test_aspect.sample(frac=1).reset_index(drop=True) #For random shuffling\n",
351 | "X_test = df_test_aspect.Review\n",
352 | "y_test = df_test_aspect.drop('Review',1)"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 21,
358 | "metadata": {
359 | "collapsed": false
360 | },
361 | "outputs": [],
362 | "source": [
363 | "#Change y_train and y_test to numpy arrays\n",
364 | "import numpy as np\n",
365 | "y_train = np.asarray(y_train, dtype=np.int64)\n",
366 | "y_test = np.asarray(y_test, dtype=np.int64)"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 22,
372 | "metadata": {
373 | "collapsed": false
374 | },
375 | "outputs": [],
376 | "source": [
377 | "#Generate word vectors using CountVectorizer\n",
378 | "from sklearn.feature_extraction.text import CountVectorizer\n",
379 | "from nltk import word_tokenize \n",
380 | "from nltk.stem import WordNetLemmatizer \n",
381 | "vect = CountVectorizer(max_df=1.0,stop_words='english') \n",
382 | "X_train_dtm = vect.fit_transform(X_train)\n",
383 | "X_test_dtm = vect.transform(X_test)"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 23,
389 | "metadata": {
390 | "collapsed": false,
391 | "scrolled": true
392 | },
393 | "outputs": [],
394 | "source": [
395 | "#Create various models. These are multi-label models.\n",
396 | "nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)\n",
397 | "C = 1.0 #SVM regularization parameter\n",
398 | "svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)\n",
399 | "lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)\n",
400 | "sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train_dtm,y_train)"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 24,
406 | "metadata": {
407 | "collapsed": false
408 | },
409 | "outputs": [],
410 | "source": [
411 | "#Predict the test data using the classifiers\n",
412 | "y_pred_class = nb_classif.predict(X_test_dtm)\n",
413 | "y_pred_class_svc = svc.predict(X_test_dtm)\n",
414 | "y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)\n",
415 | "y_pred_class_sgd = sgd.predict(X_test_dtm)"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 25,
421 | "metadata": {
422 | "collapsed": true
423 | },
424 | "outputs": [],
425 | "source": [
426 | "#The following code tests metrics of all aspect extraction classifiers\n",
427 | "from sklearn import metrics"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 26,
433 | "metadata": {
434 | "collapsed": false
435 | },
436 | "outputs": [
437 | {
438 | "name": "stdout",
439 | "output_type": "stream",
440 | "text": [
441 | "0.025\n",
442 | "0.05\n",
443 | "0.05\n",
444 | "0.0375\n"
445 | ]
446 | }
447 | ],
448 | "source": [
449 | "print(metrics.accuracy_score(y_test,y_pred_class))\n",
450 | "print(metrics.accuracy_score(y_test,y_pred_class_svc))\n",
451 | "print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))\n",
452 | "print(metrics.accuracy_score(y_test,y_pred_class_sgd))"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": 27,
458 | "metadata": {
459 | "collapsed": false
460 | },
461 | "outputs": [
462 | {
463 | "name": "stdout",
464 | "output_type": "stream",
465 | "text": [
466 | "0.75\n",
467 | "0.711229946524\n",
468 | "0.732193732194\n",
469 | "0.700657894737\n"
470 | ]
471 | }
472 | ],
473 | "source": [
474 | "print(metrics.precision_score(y_test,y_pred_class,average='micro'))\n",
475 | "print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))\n",
476 | "print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
477 | "print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 28,
483 | "metadata": {
484 | "collapsed": false
485 | },
486 | "outputs": [
487 | {
488 | "name": "stdout",
489 | "output_type": "stream",
490 | "text": [
491 | "0.457627118644\n",
492 | "0.64406779661\n",
493 | "0.622276029056\n",
494 | "0.515738498789\n"
495 | ]
496 | }
497 | ],
498 | "source": [
499 | "print(metrics.recall_score(y_test,y_pred_class,average='micro'))\n",
500 | "print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))\n",
501 | "print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
502 | "print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 29,
508 | "metadata": {
509 | "collapsed": false
510 | },
511 | "outputs": [
512 | {
513 | "name": "stdout",
514 | "output_type": "stream",
515 | "text": [
516 | "0.568421052632\n",
517 | "0.675984752224\n",
518 | "0.67277486911\n",
519 | "0.594142259414\n"
520 | ]
521 | }
522 | ],
523 | "source": [
524 |
"print(metrics.f1_score(y_test,y_pred_class,average='micro'))\n", 525 | "print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))\n", 526 | "print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 527 | "print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 30, 533 | "metadata": { 534 | "collapsed": false, 535 | "scrolled": true 536 | }, 537 | "outputs": [ 538 | { 539 | "name": "stdout", 540 | "output_type": "stream", 541 | "text": [ 542 | " precision recall f1-score support\n", 543 | "\n", 544 | " 0 0.67 0.14 0.24 14\n", 545 | " 1 0.71 0.50 0.59 24\n", 546 | " 2 0.00 0.00 0.00 12\n", 547 | " 3 0.00 0.00 0.00 4\n", 548 | " 4 0.00 0.00 0.00 21\n", 549 | " 5 0.00 0.00 0.00 8\n", 550 | " 6 0.00 0.00 0.00 7\n", 551 | " 7 0.76 0.64 0.69 39\n", 552 | " 8 1.00 1.00 1.00 80\n", 553 | " 9 0.44 0.17 0.24 24\n", 554 | " 10 0.62 0.70 0.65 46\n", 555 | " 11 0.00 0.00 0.00 5\n", 556 | " 12 0.57 0.30 0.39 27\n", 557 | " 13 0.57 0.45 0.50 29\n", 558 | " 14 0.77 0.33 0.47 30\n", 559 | " 15 0.00 0.00 0.00 4\n", 560 | " 16 0.00 0.00 0.00 9\n", 561 | " 17 0.00 0.00 0.00 15\n", 562 | " 18 0.00 0.00 0.00 4\n", 563 | " 19 0.60 0.27 0.37 11\n", 564 | "\n", 565 | "avg / total 0.57 0.46 0.49 413\n", 566 | "\n", 567 | " precision recall f1-score support\n", 568 | "\n", 569 | " 0 0.78 1.00 0.88 14\n", 570 | " 1 0.68 0.71 0.69 24\n", 571 | " 2 0.86 0.50 0.63 12\n", 572 | " 3 0.12 0.25 0.17 4\n", 573 | " 4 0.56 0.43 0.49 21\n", 574 | " 5 0.75 0.38 0.50 8\n", 575 | " 6 0.20 0.14 0.17 7\n", 576 | " 7 0.74 0.67 0.70 39\n", 577 | " 8 1.00 0.97 0.99 80\n", 578 | " 9 0.63 0.50 0.56 24\n", 579 | " 10 0.68 0.74 0.71 46\n", 580 | " 11 0.33 0.40 0.36 5\n", 581 | " 12 0.83 0.74 0.78 27\n", 582 | " 13 0.56 0.66 0.60 29\n", 583 | " 14 0.60 0.40 0.48 30\n", 584 | " 15 0.67 0.50 0.57 4\n", 585 | " 16 0.18 0.22 0.20 9\n", 586 | " 17 0.80 0.27 0.40 15\n", 587 | " 18 1.00 0.25 0.40 4\n", 588 | " 19 0.60 0.27 0.37 11\n", 589 | "\n", 590 | "avg / total 0.72 0.64 0.67 413\n", 591 | "\n", 592 | " precision recall f1-score support\n", 593 | "\n", 594 | " 0 0.78 1.00 0.88 14\n", 595 | " 1 0.65 0.71 0.68 24\n", 596 | " 2 1.00 0.42 0.59 12\n", 597 | " 3 0.17 0.25 0.20 4\n", 598 | " 4 0.64 0.43 0.51 21\n", 599 | " 5 1.00 0.25 0.40 8\n", 600 | " 6 0.33 0.14 0.20 7\n", 601 | " 7 0.74 0.67 0.70 39\n", 602 | " 8 1.00 0.96 0.98 80\n", 603 | " 9 0.61 0.46 0.52 24\n", 604 | " 10 0.70 0.72 0.71 46\n", 605 | " 11 0.25 0.20 0.22 5\n", 606 | " 12 0.83 0.74 0.78 27\n", 607 | " 13 0.54 0.66 0.59 29\n", 608 | " 14 0.61 0.37 0.46 30\n", 609 | " 15 1.00 0.25 0.40 4\n", 610 | " 16 0.20 0.22 0.21 9\n", 611 | " 17 1.00 0.20 0.33 15\n", 612 | " 18 1.00 0.25 0.40 4\n", 613 | " 19 0.75 0.27 0.40 11\n", 614 | "\n", 615 | "avg / total 0.75 0.62 0.66 413\n", 616 | "\n", 617 | " precision recall f1-score support\n", 618 | "\n", 619 | " 0 0.75 0.64 0.69 14\n", 620 | " 1 0.60 0.75 0.67 24\n", 621 | " 2 0.00 0.00 0.00 12\n", 622 | " 3 0.00 0.00 0.00 4\n", 623 | " 4 0.50 0.19 0.28 21\n", 624 | " 5 0.00 0.00 0.00 8\n", 625 | " 6 0.00 0.00 0.00 7\n", 626 | " 7 0.63 0.62 0.62 39\n", 627 | " 8 1.00 0.99 0.99 80\n", 628 | " 9 0.67 0.33 0.44 24\n", 629 | " 10 0.65 0.61 0.63 46\n", 630 | " 11 0.29 0.40 0.33 5\n", 631 | " 12 0.71 0.37 0.49 27\n", 632 | " 13 0.58 0.48 0.53 29\n", 633 | " 14 0.71 0.33 0.45 30\n", 634 | " 15 0.00 0.00 0.00 4\n", 635 | " 16 0.20 0.11 0.14 9\n", 636 | " 17 0.67 0.13 0.22 15\n", 637 | " 18 0.33 0.25 0.29 4\n", 638 | " 19 0.43 0.27 0.33 11\n", 
639 | "\n",
640 | "avg / total 0.64 0.52 0.55 413\n",
641 | "\n"
642 | ]
643 | }
644 | ],
645 | "source": [
646 | "with warnings.catch_warnings():\n",
647 | " warnings.simplefilter(\"ignore\")\n",
648 | " print(metrics.classification_report(y_test, y_pred_class))\n",
649 | " print(metrics.classification_report(y_test, y_pred_class_svc))\n",
650 | " print(metrics.classification_report(y_test, y_pred_class_lin_svc))\n",
651 | " print(metrics.classification_report(y_test, y_pred_class_sgd))"
652 | ]
653 | },
654 | {
655 | "cell_type": "code",
656 | "execution_count": 31,
657 | "metadata": {
658 | "collapsed": false
659 | },
660 | "outputs": [],
661 | "source": [
662 | "#Stage 2:\n",
663 | "#Generating an extra feature that indicates which aspect categories are present in the review\n",
664 | "train_dict_aspect=get_dict_aspect(y_train, most_common_aspect)\n",
665 | "d_train=DictVectorizer() \n",
666 | "X_train_aspect_dtm = d_train.fit_transform(train_dict_aspect)\n",
667 | "\n",
668 | "#y_test is used to generate the extra feature in order to test the performance of the 2nd classifier.\n",
669 | "#Use y_pred_class_svc (highest performer for aspect classification) as input for the extra feature to test the overall performance.\n",
670 | "test_dict_aspect=get_dict_aspect(y_test,most_common_aspect)\n",
671 | "d_test=DictVectorizer() \n",
672 | "X_test_aspect_dtm = d_test.fit_transform(test_dict_aspect)"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": 32,
678 | "metadata": {
679 | "collapsed": false
680 | },
681 | "outputs": [],
682 | "source": [
683 | "#Function for classifying positive, negative or neutral sentiment for all the aspects\n",
684 | "def classify_sentiment(df_train,df_test,X_train_aspect_dtm,X_test_aspect_dtm):\n",
685 | " \n",
686 | " df_train = df_train.reindex_axis(sorted(df_train.columns), axis=1)\n",
687 | " df_test = df_test.reindex_axis(sorted(df_test.columns), axis=1)\n",
688 | "\n",
689 | " import numpy as np\n",
690 | " X_train = df_train.Review\n",
691 | " y_train = df_train.drop('Review',1)\n",
692 | " y_train = np.asarray(y_train, dtype=np.int64)\n",
693 | "\n",
694 | " X_test = df_test.Review\n",
695 | " y_test = df_test.drop('Review',1)\n",
696 | " y_test = np.asarray(y_test, dtype=np.int64)\n",
697 | "\n",
698 | " vect_sen = CountVectorizer(stop_words='english',ngram_range=(1,2)) \n",
699 | " X_train_dtm = vect_sen.fit_transform(X_train)\n",
700 | " X_test_dtm = vect_sen.transform(X_test)\n",
701 | "\n",
702 | " #Combining word vectors with the extra feature.\n",
703 | " from scipy.sparse import hstack\n",
704 | " X_train_dtm=hstack((X_train_dtm, X_train_aspect_dtm))\n",
705 | " X_test_dtm=hstack((X_test_dtm, X_test_aspect_dtm))\n",
706 | "\n",
707 | " C = 1.0 #SVM regularization parameter\n",
708 | " nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)\n",
709 | " svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)\n",
710 | " lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)\n",
711 | " sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train_dtm,y_train)\n",
712 | "\n",
713 | " y_pred_class= nb_classif.predict(X_test_dtm)\n",
714 | " y_pred_class_svc = svc.predict(X_test_dtm)\n",
715 | " y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)\n",
716 | " y_pred_class_sgd = sgd.predict(X_test_dtm)\n",
717 | " return (y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd)"
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": 33,
723
| "metadata": { 724 | "collapsed": true 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "def print_metrices(y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd):\n", 729 | " print(\"Accuracy:\")\n", 730 | " print(metrics.accuracy_score(y_test,y_pred_class))\n", 731 | " print(metrics.accuracy_score(y_test,y_pred_class_svc))\n", 732 | " print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))\n", 733 | " print(metrics.accuracy_score(y_test,y_pred_class_sgd))\n", 734 | "\n", 735 | " print(\"\\nAverage precision:\")\n", 736 | " print(metrics.precision_score(y_test,y_pred_class,average='micro'))\n", 737 | " print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))\n", 738 | " print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 739 | " print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))\n", 740 | "\n", 741 | " print(\"\\nAverage recall:\")\n", 742 | " print(metrics.recall_score(y_test,y_pred_class,average='micro'))\n", 743 | " print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))\n", 744 | " print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 745 | " print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))\n", 746 | " \n", 747 | " print(\"\\nAverage f1:\")\n", 748 | " print(metrics.f1_score(y_test,y_pred_class,average='micro'))\n", 749 | " print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))\n", 750 | " print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 751 | " print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))\n", 752 | "\n", 753 | " print(\"\\nClassification report:\")\n", 754 | " print(metrics.classification_report(y_test, y_pred_class))\n", 755 | " print(metrics.classification_report(y_test, y_pred_class_svc))\n", 756 | " print(metrics.classification_report(y_test, y_pred_class_lin_svc))\n", 757 | " print(metrics.classification_report(y_test, y_pred_class_sgd))" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 34, 763 | "metadata": { 764 | "collapsed": false 765 | }, 766 | "outputs": [ 767 | { 768 | "name": "stdout", 769 | "output_type": "stream", 770 | "text": [ 771 | "Accuracy:\n", 772 | "0.15\n", 773 | "0.35\n", 774 | "0.3125\n", 775 | "0.125\n", 776 | "\n", 777 | "Average precision:\n", 778 | "0.857142857143\n", 779 | "0.745762711864\n", 780 | "0.756097560976\n", 781 | "0.704545454545\n", 782 | "\n", 783 | "Average recall:\n", 784 | "0.260869565217\n", 785 | "0.797101449275\n", 786 | "0.673913043478\n", 787 | "0.673913043478\n", 788 | "\n", 789 | "Average f1:\n", 790 | "0.4\n", 791 | "0.77057793345\n", 792 | "0.712643678161\n", 793 | "0.688888888889\n", 794 | "\n", 795 | "Classification report:\n", 796 | " precision recall f1-score support\n", 797 | "\n", 798 | " 0 0.00 0.00 0.00 12\n", 799 | " 1 0.00 0.00 0.00 14\n", 800 | " 2 0.00 0.00 0.00 6\n", 801 | " 3 0.00 0.00 0.00 3\n", 802 | " 4 0.00 0.00 0.00 15\n", 803 | " 5 0.00 0.00 0.00 7\n", 804 | " 6 0.00 0.00 0.00 4\n", 805 | " 7 0.00 0.00 0.00 26\n", 806 | " 8 0.83 0.95 0.89 61\n", 807 | " 9 0.00 0.00 0.00 11\n", 808 | " 10 1.00 0.39 0.56 36\n", 809 | " 11 0.00 0.00 0.00 4\n", 810 | " 12 0.00 0.00 0.00 12\n", 811 | " 13 0.00 0.00 0.00 18\n", 812 | " 14 0.00 0.00 0.00 23\n", 813 | " 15 0.00 0.00 0.00 3\n", 814 | " 16 0.00 0.00 0.00 7\n", 815 | " 17 0.00 0.00 0.00 8\n", 816 | " 18 0.00 0.00 0.00 2\n", 817 | " 19 0.00 0.00 0.00 4\n", 818 | "\n", 819 | "avg / total 0.31 0.26 0.27 276\n", 820 | "\n", 821 | " precision 
recall f1-score support\n", 822 | "\n", 823 | " 0 0.91 0.83 0.87 12\n", 824 | " 1 0.67 0.71 0.69 14\n", 825 | " 2 0.50 1.00 0.67 6\n", 826 | " 3 0.75 1.00 0.86 3\n", 827 | " 4 0.65 0.73 0.69 15\n", 828 | " 5 0.75 0.43 0.55 7\n", 829 | " 6 0.67 0.50 0.57 4\n", 830 | " 7 0.71 0.77 0.74 26\n", 831 | " 8 0.89 0.93 0.91 61\n", 832 | " 9 0.47 0.82 0.60 11\n", 833 | " 10 0.89 0.89 0.89 36\n", 834 | " 11 0.75 0.75 0.75 4\n", 835 | " 12 0.60 0.75 0.67 12\n", 836 | " 13 0.72 0.72 0.72 18\n", 837 | " 14 0.75 0.91 0.82 23\n", 838 | " 15 0.67 0.67 0.67 3\n", 839 | " 16 0.25 0.14 0.18 7\n", 840 | " 17 0.80 0.50 0.62 8\n", 841 | " 18 0.67 1.00 0.80 2\n", 842 | " 19 1.00 0.50 0.67 4\n", 843 | "\n", 844 | "avg / total 0.76 0.80 0.77 276\n", 845 | "\n", 846 | " precision recall f1-score support\n", 847 | "\n", 848 | " 0 1.00 0.75 0.86 12\n", 849 | " 1 0.64 0.64 0.64 14\n", 850 | " 2 0.60 0.50 0.55 6\n", 851 | " 3 0.75 1.00 0.86 3\n", 852 | " 4 0.69 0.60 0.64 15\n", 853 | " 5 1.00 0.14 0.25 7\n", 854 | " 6 0.00 0.00 0.00 4\n", 855 | " 7 0.68 0.65 0.67 26\n", 856 | " 8 0.88 0.92 0.90 61\n", 857 | " 9 0.44 0.73 0.55 11\n", 858 | " 10 0.89 0.86 0.87 36\n", 859 | " 11 1.00 0.50 0.67 4\n", 860 | " 12 0.45 0.42 0.43 12\n", 861 | " 13 0.73 0.61 0.67 18\n", 862 | " 14 0.71 0.65 0.68 23\n", 863 | " 15 1.00 0.33 0.50 3\n", 864 | " 16 0.00 0.00 0.00 7\n", 865 | " 17 0.75 0.38 0.50 8\n", 866 | " 18 0.67 1.00 0.80 2\n", 867 | " 19 1.00 0.25 0.40 4\n", 868 | "\n", 869 | "avg / total 0.74 0.67 0.69 276\n", 870 | "\n", 871 | " precision recall f1-score support\n", 872 | "\n", 873 | " 0 1.00 0.58 0.74 12\n", 874 | " 1 0.64 1.00 0.78 14\n", 875 | " 2 0.50 0.83 0.62 6\n", 876 | " 3 0.75 1.00 0.86 3\n", 877 | " 4 0.72 0.87 0.79 15\n", 878 | " 5 0.50 0.14 0.22 7\n", 879 | " 6 0.75 0.75 0.75 4\n", 880 | " 7 0.53 0.31 0.39 26\n", 881 | " 8 0.88 0.85 0.87 61\n", 882 | " 9 0.38 0.55 0.44 11\n", 883 | " 10 0.78 0.97 0.86 36\n", 884 | " 11 0.80 1.00 0.89 4\n", 885 | " 12 0.67 0.17 0.27 12\n", 886 | " 13 0.73 0.44 0.55 18\n", 887 | " 14 0.73 0.83 0.78 23\n", 888 | " 15 0.00 0.00 0.00 3\n", 889 | " 16 0.00 0.00 0.00 7\n", 890 | " 17 0.33 0.12 0.18 8\n", 891 | " 18 0.67 1.00 0.80 2\n", 892 | " 19 0.30 0.75 0.43 4\n", 893 | "\n", 894 | "avg / total 0.69 0.67 0.65 276\n", 895 | "\n" 896 | ] 897 | } 898 | ], 899 | "source": [ 900 | "#For positive sentiment classifier\n", 901 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 902 | "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n", 903 | "\n", 904 | "df_train_positive = get_positive_data_frame(df_train,most_common_aspect)\n", 905 | "df_test_positive = get_positive_data_frame(df_test,most_common_aspect)\n", 906 | "y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos=classify_sentiment(df_train_positive,df_test_positive,X_train_aspect_dtm,X_test_aspect_dtm)\n", 907 | "with warnings.catch_warnings():\n", 908 | " warnings.simplefilter(\"ignore\")\n", 909 | " print_metrices(y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos)" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 35, 915 | "metadata": { 916 | "collapsed": false 917 | }, 918 | "outputs": [ 919 | { 920 | "name": "stdout", 921 | "output_type": "stream", 922 | "text": [ 923 | "Accuracy:\n", 924 | "0.4875\n", 925 | "0.4875\n", 926 | "0.4625\n", 927 | "0.3375\n", 928 | "\n", 929 | "Average precision:\n", 930 | "0.7\n", 931 | "0.625\n", 932 | 
"0.666666666667\n", 933 | "0.449438202247\n", 934 | "\n", 935 | "Average recall:\n", 936 | "0.0642201834862\n", 937 | "0.412844036697\n", 938 | "0.330275229358\n", 939 | "0.366972477064\n", 940 | "\n", 941 | "Average f1:\n", 942 | "0.117647058824\n", 943 | "0.497237569061\n", 944 | "0.441717791411\n", 945 | "0.40404040404\n", 946 | "\n", 947 | "Classification report:\n", 948 | " precision recall f1-score support\n", 949 | "\n", 950 | " 0 0.00 0.00 0.00 2\n", 951 | " 1 0.00 0.00 0.00 10\n", 952 | " 2 0.00 0.00 0.00 2\n", 953 | " 3 0.00 0.00 0.00 1\n", 954 | " 4 0.00 0.00 0.00 4\n", 955 | " 5 0.00 0.00 0.00 1\n", 956 | " 6 0.00 0.00 0.00 3\n", 957 | " 7 0.00 0.00 0.00 3\n", 958 | " 8 0.67 0.33 0.44 18\n", 959 | " 9 0.00 0.00 0.00 9\n", 960 | " 10 0.00 0.00 0.00 10\n", 961 | " 11 0.00 0.00 0.00 1\n", 962 | " 12 0.00 0.00 0.00 12\n", 963 | " 13 1.00 0.09 0.17 11\n", 964 | " 14 0.00 0.00 0.00 6\n", 965 | " 15 0.00 0.00 0.00 0\n", 966 | " 16 0.00 0.00 0.00 2\n", 967 | " 17 0.00 0.00 0.00 5\n", 968 | " 18 0.00 0.00 0.00 2\n", 969 | " 19 0.00 0.00 0.00 7\n", 970 | "\n", 971 | "avg / total 0.21 0.06 0.09 109\n", 972 | "\n", 973 | " precision recall f1-score support\n", 974 | "\n", 975 | " 0 0.50 0.50 0.50 2\n", 976 | " 1 0.67 0.60 0.63 10\n", 977 | " 2 0.00 0.00 0.00 2\n", 978 | " 3 0.00 0.00 0.00 1\n", 979 | " 4 0.50 0.25 0.33 4\n", 980 | " 5 0.00 0.00 0.00 1\n", 981 | " 6 1.00 0.67 0.80 3\n", 982 | " 7 0.00 0.00 0.00 3\n", 983 | " 8 0.77 0.56 0.65 18\n", 984 | " 9 1.00 0.22 0.36 9\n", 985 | " 10 0.78 0.70 0.74 10\n", 986 | " 11 0.00 0.00 0.00 1\n", 987 | " 12 0.75 0.25 0.38 12\n", 988 | " 13 0.70 0.64 0.67 11\n", 989 | " 14 0.00 0.00 0.00 6\n", 990 | " 15 0.00 0.00 0.00 0\n", 991 | " 16 0.00 0.00 0.00 2\n", 992 | " 17 0.00 0.00 0.00 5\n", 993 | " 18 0.00 0.00 0.00 2\n", 994 | " 19 0.75 0.86 0.80 7\n", 995 | "\n", 996 | "avg / total 0.60 0.41 0.47 109\n", 997 | "\n", 998 | " precision recall f1-score support\n", 999 | "\n", 1000 | " 0 1.00 0.50 0.67 2\n", 1001 | " 1 0.67 0.40 0.50 10\n", 1002 | " 2 0.00 0.00 0.00 2\n", 1003 | " 3 0.00 0.00 0.00 1\n", 1004 | " 4 0.33 0.25 0.29 4\n", 1005 | " 5 0.00 0.00 0.00 1\n", 1006 | " 6 1.00 0.33 0.50 3\n", 1007 | " 7 0.00 0.00 0.00 3\n", 1008 | " 8 0.77 0.56 0.65 18\n", 1009 | " 9 0.00 0.00 0.00 9\n", 1010 | " 10 0.86 0.60 0.71 10\n", 1011 | " 11 0.00 0.00 0.00 1\n", 1012 | " 12 1.00 0.08 0.15 12\n", 1013 | " 13 0.70 0.64 0.67 11\n", 1014 | " 14 0.00 0.00 0.00 6\n", 1015 | " 15 0.00 0.00 0.00 0\n", 1016 | " 16 0.00 0.00 0.00 2\n", 1017 | " 17 0.00 0.00 0.00 5\n", 1018 | " 18 0.00 0.00 0.00 2\n", 1019 | " 19 0.83 0.71 0.77 7\n", 1020 | "\n", 1021 | "avg / total 0.56 0.33 0.39 109\n", 1022 | "\n", 1023 | " precision recall f1-score support\n", 1024 | "\n", 1025 | " 0 1.00 0.50 0.67 2\n", 1026 | " 1 0.71 0.50 0.59 10\n", 1027 | " 2 0.00 0.00 0.00 2\n", 1028 | " 3 0.00 0.00 0.00 1\n", 1029 | " 4 1.00 0.25 0.40 4\n", 1030 | " 5 0.00 0.00 0.00 1\n", 1031 | " 6 0.67 0.67 0.67 3\n", 1032 | " 7 0.13 0.67 0.22 3\n", 1033 | " 8 0.38 0.67 0.48 18\n", 1034 | " 9 1.00 0.11 0.20 9\n", 1035 | " 10 0.50 0.70 0.58 10\n", 1036 | " 11 0.00 0.00 0.00 1\n", 1037 | " 12 0.00 0.00 0.00 12\n", 1038 | " 13 0.75 0.27 0.40 11\n", 1039 | " 14 0.00 0.00 0.00 6\n", 1040 | " 15 0.00 0.00 0.00 0\n", 1041 | " 16 0.00 0.00 0.00 2\n", 1042 | " 17 0.00 0.00 0.00 5\n", 1043 | " 18 0.00 0.00 0.00 2\n", 1044 | " 19 0.86 0.86 0.86 7\n", 1045 | "\n", 1046 | "avg / total 0.46 0.37 0.35 109\n", 1047 | "\n" 1048 | ] 1049 | } 1050 | ], 1051 | "source": [ 1052 | "#For negative sentiment classifier\n", 
1053 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 1054 | "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n", 1055 | "\n", 1056 | "df_train_neg = get_negative_data_frame(df_train,most_common_aspect)\n", 1057 | "df_test_neg = get_negative_data_frame(df_test,most_common_aspect)\n", 1058 | "\n", 1059 | "y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg=classify_sentiment(df_train_neg,df_test_neg,X_train_aspect_dtm,X_test_aspect_dtm)\n", 1060 | "with warnings.catch_warnings():\n", 1061 | " warnings.simplefilter(\"ignore\")\n", 1062 | " print_metrices(y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg)" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": 36, 1068 | "metadata": { 1069 | "collapsed": false 1070 | }, 1071 | "outputs": [ 1072 | { 1073 | "name": "stdout", 1074 | "output_type": "stream", 1075 | "text": [ 1076 | "Accuracy:\n", 1077 | "0.7375\n", 1078 | "0.725\n", 1079 | "0.7375\n", 1080 | "0.5875\n", 1081 | "\n", 1082 | "Average precision:\n", 1083 | "0.0\n", 1084 | "0.153846153846\n", 1085 | "0.333333333333\n", 1086 | "0.105263157895\n", 1087 | "\n", 1088 | "Average recall:\n", 1089 | "0.0\n", 1090 | "0.0714285714286\n", 1091 | "0.0357142857143\n", 1092 | "0.0714285714286\n", 1093 | "\n", 1094 | "Average f1:\n", 1095 | "0.0\n", 1096 | "0.0975609756098\n", 1097 | "0.0645161290323\n", 1098 | "0.0851063829787\n", 1099 | "\n", 1100 | "Classification report:\n", 1101 | " precision recall f1-score support\n", 1102 | "\n", 1103 | " 0 0.00 0.00 0.00 0\n", 1104 | " 1 0.00 0.00 0.00 0\n", 1105 | " 2 0.00 0.00 0.00 4\n", 1106 | " 3 0.00 0.00 0.00 0\n", 1107 | " 4 0.00 0.00 0.00 2\n", 1108 | " 5 0.00 0.00 0.00 0\n", 1109 | " 6 0.00 0.00 0.00 0\n", 1110 | " 7 0.00 0.00 0.00 10\n", 1111 | " 8 0.00 0.00 0.00 1\n", 1112 | " 9 0.00 0.00 0.00 4\n", 1113 | " 10 0.00 0.00 0.00 0\n", 1114 | " 11 0.00 0.00 0.00 0\n", 1115 | " 12 0.00 0.00 0.00 3\n", 1116 | " 13 0.00 0.00 0.00 0\n", 1117 | " 14 0.00 0.00 0.00 1\n", 1118 | " 15 0.00 0.00 0.00 1\n", 1119 | " 16 0.00 0.00 0.00 0\n", 1120 | " 17 0.00 0.00 0.00 2\n", 1121 | " 18 0.00 0.00 0.00 0\n", 1122 | " 19 0.00 0.00 0.00 0\n", 1123 | "\n", 1124 | "avg / total 0.00 0.00 0.00 28\n", 1125 | "\n", 1126 | " precision recall f1-score support\n", 1127 | "\n", 1128 | " 0 0.00 0.00 0.00 0\n", 1129 | " 1 0.00 0.00 0.00 0\n", 1130 | " 2 0.00 0.00 0.00 4\n", 1131 | " 3 0.00 0.00 0.00 0\n", 1132 | " 4 0.00 0.00 0.00 2\n", 1133 | " 5 0.00 0.00 0.00 0\n", 1134 | " 6 0.00 0.00 0.00 0\n", 1135 | " 7 0.29 0.20 0.24 10\n", 1136 | " 8 0.00 0.00 0.00 1\n", 1137 | " 9 0.00 0.00 0.00 4\n", 1138 | " 10 0.00 0.00 0.00 0\n", 1139 | " 11 0.00 0.00 0.00 0\n", 1140 | " 12 0.00 0.00 0.00 3\n", 1141 | " 13 0.00 0.00 0.00 0\n", 1142 | " 14 0.00 0.00 0.00 1\n", 1143 | " 15 0.00 0.00 0.00 1\n", 1144 | " 16 0.00 0.00 0.00 0\n", 1145 | " 17 0.00 0.00 0.00 2\n", 1146 | " 18 0.00 0.00 0.00 0\n", 1147 | " 19 0.00 0.00 0.00 0\n", 1148 | "\n", 1149 | "avg / total 0.10 0.07 0.08 28\n", 1150 | "\n", 1151 | " precision recall f1-score support\n", 1152 | "\n", 1153 | " 0 0.00 0.00 0.00 0\n", 1154 | " 1 0.00 0.00 0.00 0\n", 1155 | " 2 0.00 0.00 0.00 4\n", 1156 | " 3 0.00 0.00 0.00 0\n", 1157 | " 4 0.00 0.00 0.00 2\n", 1158 | " 5 0.00 0.00 0.00 0\n", 1159 | " 6 0.00 0.00 0.00 0\n", 1160 | " 7 0.33 0.10 0.15 10\n", 1161 | " 8 0.00 0.00 0.00 1\n", 1162 | " 9 0.00 0.00 0.00 4\n", 1163 | " 10 0.00 0.00 
0.00 0\n", 1164 | " 11 0.00 0.00 0.00 0\n", 1165 | " 12 0.00 0.00 0.00 3\n", 1166 | " 13 0.00 0.00 0.00 0\n", 1167 | " 14 0.00 0.00 0.00 1\n", 1168 | " 15 0.00 0.00 0.00 1\n", 1169 | " 16 0.00 0.00 0.00 0\n", 1170 | " 17 0.00 0.00 0.00 2\n", 1171 | " 18 0.00 0.00 0.00 0\n", 1172 | " 19 0.00 0.00 0.00 0\n", 1173 | "\n", 1174 | "avg / total 0.12 0.04 0.05 28\n", 1175 | "\n", 1176 | " precision recall f1-score support\n", 1177 | "\n", 1178 | " 0 0.00 0.00 0.00 0\n", 1179 | " 1 0.00 0.00 0.00 0\n", 1180 | " 2 0.00 0.00 0.00 4\n", 1181 | " 3 0.00 0.00 0.00 0\n", 1182 | " 4 0.00 0.00 0.00 2\n", 1183 | " 5 0.00 0.00 0.00 0\n", 1184 | " 6 0.00 0.00 0.00 0\n", 1185 | " 7 0.00 0.00 0.00 10\n", 1186 | " 8 0.00 0.00 0.00 1\n", 1187 | " 9 0.00 0.00 0.00 4\n", 1188 | " 10 0.00 0.00 0.00 0\n", 1189 | " 11 0.00 0.00 0.00 0\n", 1190 | " 12 0.18 0.67 0.29 3\n", 1191 | " 13 0.00 0.00 0.00 0\n", 1192 | " 14 0.00 0.00 0.00 1\n", 1193 | " 15 0.00 0.00 0.00 1\n", 1194 | " 16 0.00 0.00 0.00 0\n", 1195 | " 17 0.00 0.00 0.00 2\n", 1196 | " 18 0.00 0.00 0.00 0\n", 1197 | " 19 0.00 0.00 0.00 0\n", 1198 | "\n", 1199 | "avg / total 0.02 0.07 0.03 28\n", 1200 | "\n" 1201 | ] 1202 | } 1203 | ], 1204 | "source": [ 1205 | "#For neutral or conflict sentiment classifier\n", 1206 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 1207 | "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n", 1208 | "\n", 1209 | "df_train_neu = get_neutral_data_frame(df_train,most_common_aspect)\n", 1210 | "df_test_neu = get_neutral_data_frame(df_test,most_common_aspect)\n", 1211 | "\n", 1212 | "y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu=classify_sentiment(df_train_neu,df_test_neu,X_train_aspect_dtm,X_test_aspect_dtm)\n", 1213 | "with warnings.catch_warnings():\n", 1214 | " warnings.simplefilter(\"ignore\")\n", 1215 | " print_metrices(y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu)" 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "code", 1220 | "execution_count": 37, 1221 | "metadata": { 1222 | "collapsed": false, 1223 | "scrolled": false 1224 | }, 1225 | "outputs": [ 1226 | { 1227 | "name": "stdout", 1228 | "output_type": "stream", 1229 | "text": [ 1230 | "Enter a laptop review:\n", 1231 | "\n", 1232 | "This is my first asus laptop. So far i am really enjoying this laptop. 512GB SSD is super fast. Battery life is also good and can last very long. I have no complain on screen quality too as display supports 4k videos. Maybe that is why it costs a lot. This is an expensive laptop and it's price is very high compared to other laptops of similar specs. 
So, if you have no trouble paying for this laptop, it is pretty good.\n"
1233 | ]
1234 | },
1235 | {
1236 | "data": {
1237 | "text/plain": [
1238 | "array([[1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]])"
1239 | ]
1240 | },
1241 | "execution_count": 37,
1242 | "metadata": {},
1243 | "output_type": "execute_result"
1244 | }
1245 | ],
1246 | "source": [
1247 | "#Aspect Based Sentiment Analysis of the user's input.\n",
1248 | "user_input=input(\"Enter a laptop review:\\n\\n\")\n",
1249 | "#Preprocessing and vectorizing\n",
1250 | "tagged_user_input = posTag([user_input])\n",
1251 | "filter_tagged_user_input = filterTag(tagged_user_input)\n",
1252 | "\n",
1253 | "user_input_series=pd.Series(filter_tagged_user_input)\n",
1254 | "user_input_series_dtm=vect.transform(user_input_series)\n",
1255 | "\n",
1256 | "predict_aspect= svc.predict(user_input_series_dtm)\n",
1257 | "extra_feature=get_dict_aspect(predict_aspect, most_common_aspect)\n",
1258 | "extra_feature_dtm=DictVectorizer().fit_transform(extra_feature)\n",
1259 | "predict_aspect"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 38,
1265 | "metadata": {
1266 | "collapsed": false
1267 | },
1268 | "outputs": [
1269 | {
1270 | "data": {
1271 | "text/plain": [
1272 | "array([[1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]])"
1273 | ]
1274 | },
1275 | "execution_count": 38,
1276 | "metadata": {},
1277 | "output_type": "execute_result"
1278 | }
1279 | ],
1280 | "source": [
1281 | "#Predicting whether the detected aspect is positive or not\n",
1282 | "test_opinion_list=[]\n",
1283 | "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1284 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1285 | "\n",
1286 | "df_train_positive = get_positive_data_frame(df_train,most_common_aspect)\n",
1287 | "y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos=classify_sentiment(df_train_positive,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1288 | "\n",
1289 | "y_pred_class_svc_pos"
1290 | ]
1291 | },
1292 | {
1293 | "cell_type": "code",
1294 | "execution_count": 39,
1295 | "metadata": {
1296 | "collapsed": false
1297 | },
1298 | "outputs": [
1299 | {
1300 | "data": {
1301 | "text/plain": [
1302 | "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])"
1303 | ]
1304 | },
1305 | "execution_count": 39,
1306 | "metadata": {},
1307 | "output_type": "execute_result"
1308 | }
1309 | ],
1310 | "source": [
1311 | "#Predicting whether the detected aspect is negative or not\n",
1312 | "test_opinion_list=[]\n",
1313 | "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1314 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1315 | "\n",
1316 | "df_train_negative = get_negative_data_frame(df_train,most_common_aspect)\n",
1317 | "y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg=classify_sentiment(df_train_negative,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1318 | "\n",
1319 | "y_pred_class_svc_neg"
1320 | ]
1321 | },
1322 | {
1323 | "cell_type": "code",
1324 | "execution_count": 40,
1325 | "metadata": {
1326 | "collapsed": false
1327 | },
1328 | "outputs": [
1329 | {
1330 | "data": {
1331 | "text/plain": [
1332 | "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])"
1333 | ]
1334 | },
1335 | "execution_count": 40,
1336 | "metadata": {},
1337 | "output_type": "execute_result"
1338 | }
1339 | ],
1340 | "source": [
1341 | "#Predicting whether the detected aspect is neutral/conflict or not\n",
1342 | "test_opinion_list=[]\n",
1343 | "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1344 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1345 | "\n",
1346 | "df_train_neutral = get_neutral_data_frame(df_train,most_common_aspect)\n",
1347 | "y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu=classify_sentiment(df_train_neutral,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1348 | "\n",
1349 | "y_pred_class_svc_neu"
1350 | ]
1351 | },
1352 | {
1353 | "cell_type": "code",
1354 | "execution_count": 41,
1355 | "metadata": {
1356 | "collapsed": false
1357 | },
1358 | "outputs": [
1359 | {
1360 | "data": {
1361 | "text/plain": [
1362 | "[0, 3, 8, 10, 13]"
1363 | ]
1364 | },
1365 | "execution_count": 41,
1366 | "metadata": {},
1367 | "output_type": "execute_result"
1368 | }
1369 | ],
1370 | "source": [
1371 | "#Finding the aspects that are positive\n",
1372 | "index_positive=[]\n",
1373 | "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_pos.tolist()[0])):\n",
1374 | " if a ==1 and b==1:\n",
1375 | " index_positive.append(i)\n",
1376 | "index_positive "
1377 | ]
1378 | },
1379 | {
1380 | "cell_type": "code",
1381 | "execution_count": 42,
1382 | "metadata": {
1383 | "collapsed": false
1384 | },
1385 | "outputs": [
1386 | {
1387 | "data": {
1388 | "text/plain": [
1389 | "[]"
1390 | ]
1391 | },
1392 | "execution_count": 42,
1393 | "metadata": {},
1394 | "output_type": "execute_result"
1395 | }
1396 | ],
1397 | "source": [
1398 | "#Finding the aspects that are negative\n",
1399 | "index_negative=[]\n",
1400 | "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_neg.tolist()[0])):\n",
1401 | " if a ==1 and b==1:\n",
1402 | " index_negative.append(i)\n",
1403 | "index_negative "
1404 | ]
1405 | },
1406 | {
1407 | "cell_type": "code",
1408 | "execution_count": 43,
1409 | "metadata": {
1410 | "collapsed": false
1411 | },
1412 | "outputs": [
1413 | {
1414 | "data": {
1415 | "text/plain": [
1416 | "[12]"
1417 | ]
1418 | },
1419 | "execution_count": 43,
1420 | "metadata": {},
1421 | "output_type": "execute_result"
1422 | }
1423 | ],
1424 | "source": [
1425 | "#Finding the aspects that are neutral\n",
1426 | "index_neutral=[]\n",
1427 | "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_neu.tolist()[0])):\n",
1428 | " if a ==1 and b==1:\n",
1429 | " index_neutral.append(i)\n",
1430 | "index_neutral "
1431 | ]
1432 | },
1433 | {
1434 | "cell_type": "code",
1435 | "execution_count": 44,
1436 | "metadata": {
1437 | "collapsed": true
1438 | },
1439 | "outputs": [],
1440 | "source": [
1441 | "output=[]"
1442 | ]
1443 | },
1444 | {
1445 | "cell_type": "code",
1446 | "execution_count": 45,
1447 | "metadata": {
1448 | "collapsed": false
1449 | },
1450 | "outputs": [],
1451 | "source": [
1452 | "if index_positive:\n",
1453 | " for index in index_positive:\n",
1454 | " output.append(sorted(most_common_aspect)[index]+\": positive\")"
1455 | ]
1456 | },
1457 | {
1458 | "cell_type": "code",
1459 | "execution_count": 46,
1460 | "metadata": {
1461 | "collapsed": false
1462 | },
1463 | "outputs": [],
1464 | "source": [
1465 | "if index_negative:\n",
1466 | " for index in index_negative:\n",
1467 | " output.append(sorted(most_common_aspect)[index]+\": negative\")"
1468 | ]
1469 | },
1470 | {
1471 | "cell_type": "code",
1472 | "execution_count": 47,
1473 | "metadata": {
1474 | "collapsed": false
1475 | },
1476 | "outputs": [],
1477 | "source": [
1478 | "if index_neutral:\n",
1479 | " for index in index_neutral:\n",
1480 | " output.append(sorted(most_common_aspect)[index]+\": neutral or conflict\")"
1481 | ]
1482 | },
1483 | {
1484 | "cell_type": "code",
1485 | "execution_count": 48,
1486 | "metadata": {
1487 | "collapsed": false
1488 | },
1489 | "outputs": [
1490 | {
1491 | "data": {
1492 | "text/plain": [
1493 | "['BATTERY_OPERATION_PERFORMANCE: positive',\n",
1494 | " 'DISPLAY_GENERAL: positive',\n",
1495 | " 'LAPTOP_GENERAL: positive',\n",
1496 | " 'LAPTOP_OPERATION_PERFORMANCE: positive',\n",
1497 | " 'LAPTOP_QUALITY: positive',\n",
1498 | " 'LAPTOP_PRICE: neutral or conflict']"
1499 | ]
1500 | },
1501 | "execution_count": 48,
1502 | "metadata": {},
1503 | "output_type": "execute_result"
1504 | }
1505 | ],
1506 | "source": [
1507 | "#Prediction of Aspect Based Sentiment Analysis for the user's input\n",
1508 | "output"
1509 | ]
1510 | }
1511 | ],
1512 | "metadata": {
1513 | "kernelspec": {
1514 | "display_name": "Python 3",
1515 | "language": "python",
1516 | "name": "python3"
1517 | },
1518 | "language_info": {
1519 | "codemirror_mode": {
1520 | "name": "ipython",
1521 | "version": 3
1522 | },
1523 | "file_extension": ".py",
1524 | "mimetype": "text/x-python",
1525 | "name": "python",
1526 | "nbconvert_exporter": "python",
1527 | "pygments_lexer": "ipython3",
1528 | "version": "3.6.0"
1529 | }
1530 | },
1531 | "nbformat": 4,
1532 | "nbformat_minor": 2
1533 | }
1534 |
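
The notebook interleaves parsing, tagging, and evaluation, which can obscure the core two-stage design. The sketch below restates just that design on toy data: stage 1 detects which aspects a review mentions (multi-label, one-vs-rest), and stage 2 stacks those detections, encoded the same way as get_dict_aspect (present aspect = 5, absent = 0), onto the word vectors before classifying polarity. The toy reviews, aspect names, and the LinearSVC-only choice are illustrative assumptions, not the project's data or exact models.

# Minimal sketch of the two-stage ABSA pipeline; toy data, not the SemEval set.
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

reviews = [
    "battery lasts long and the screen is sharp",     # hypothetical review
    "battery died fast and support was useless",      # hypothetical review
    "great screen and surprisingly helpful support",  # hypothetical review
]
aspects = ["BATTERY", "DISPLAY", "SUPPORT"]           # stand-ins for the 20 categories

# Stage 1 targets: one binary column per aspect (mentioned or not).
y_aspect = np.array([[1, 1, 0],
                     [1, 0, 1],
                     [0, 1, 1]])
# Stage 2 targets: one binary column per aspect (expressed positively or not).
y_positive = np.array([[1, 1, 0],
                       [0, 0, 0],
                       [0, 1, 1]])

vect = CountVectorizer(stop_words='english')
X_words = vect.fit_transform(reviews)

# Stage 1: multi-label aspect detection, one binary classifier per aspect.
aspect_clf = OneVsRestClassifier(LinearSVC()).fit(X_words, y_aspect)
pred_aspects = aspect_clf.predict(X_words)

# Stage 2: re-encode the detected aspects as an extra feature block (present=5,
# absent=0, mirroring get_dict_aspect) and stack it onto the word vectors.
dicts = [{a: (5 if row[i] else 0) for i, a in enumerate(aspects)} for row in pred_aspects]
X_extra = DictVectorizer().fit_transform(dicts)
X_combined = hstack((X_words, X_extra))

sentiment_clf = OneVsRestClassifier(LinearSVC()).fit(X_combined, y_positive)
print(sentiment_clf.predict(X_combined))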
| "def get_list(path):\n", 48 | " tree=ET.parse(path)\n", 49 | " root = tree.getroot()\n", 50 | " text_list = []\n", 51 | " opinion_list = []\n", 52 | " for review in root.findall('Review'):\n", 53 | " text_string=\"\"\n", 54 | " opinion_inner_list=[]\n", 55 | " for sent in review.findall('./sentences/sentence'):\n", 56 | " text_string= text_string+ \" \"+ sent.find('text').text\n", 57 | " text_list.append(text_string)\n", 58 | " for opinion in review.findall('./Opinions/Opinion'):\n", 59 | " opinion_dict = {\n", 60 | " opinion.get('category').replace('#','_'): opinion.get('polarity')\n", 61 | " }\n", 62 | " opinion_inner_list.append(opinion_dict)\n", 63 | " opinion_list.append(opinion_inner_list)\n", 64 | " return text_list,opinion_list" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "#Selecting only 20 most common aspect.\n", 76 | "def get_most_common_aspect(opinion_list):\n", 77 | " import nltk\n", 78 | " opinion= []\n", 79 | " for inner_list in opinion_list:\n", 80 | " for _dict in inner_list:\n", 81 | " for key in _dict:\n", 82 | " opinion.append(key)\n", 83 | " most_common_aspect = [k for k,v in nltk.FreqDist(opinion).most_common(20)]\n", 84 | " return most_common_aspect" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 4, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "#generate data frame\n", 96 | "def get_data_frame(text_list,opinion_list,most_common_aspect):\n", 97 | " data={'Review':text_list}\n", 98 | " df = pd.DataFrame(data)\n", 99 | " if opinion_list:\n", 100 | " for inner_list in opinion_list:\n", 101 | " for _dict in inner_list:\n", 102 | " for key in _dict:\n", 103 | " if key in most_common_aspect:\n", 104 | " df.loc[opinion_list.index(inner_list),key]=_dict[key]\n", 105 | " return df" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "#generate data frame for aspect extraction task\n", 117 | "def get_aspect_data_frame(df,most_common_aspect):\n", 118 | " for common_aspect in most_common_aspect:\n", 119 | " df[common_aspect]=df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])\n", 120 | " df = df.fillna(0)\n", 121 | " return df" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 6, 127 | "metadata": { 128 | "collapsed": true 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "def get_positive_data_frame(df,most_common_aspect):\n", 133 | " for common_aspect in most_common_aspect:\n", 134 | " df[common_aspect]=df[common_aspect].replace(['positive'],[1])\n", 135 | " df[common_aspect]=df[common_aspect].replace(['negative','neutral','conflict'],[0,0,0])\n", 136 | " df = df.fillna(0)\n", 137 | " return df" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "def get_negative_data_frame(df,most_common_aspect):\n", 149 | " for common_aspect in most_common_aspect:\n", 150 | " df[common_aspect]=df[common_aspect].replace(['negative'],[1])\n", 151 | " df[common_aspect]=df[common_aspect].replace(['positive','neutral','conflict'],[0,0,0])\n", 152 | " df = df.fillna(0)\n", 153 | " return df" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 8, 159 | "metadata": { 160 | 
"collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "def get_neutral_data_frame(df,most_common_aspect):\n", 165 | " for common_aspect in most_common_aspect:\n", 166 | " df[common_aspect]=df[common_aspect].replace(['neutral','conflict'],[1,1])\n", 167 | " df[common_aspect]=df[common_aspect].replace(['negative','positive'],[0,0])\n", 168 | " df = df.fillna(0)\n", 169 | " return df" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 9, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "#To tag using stanford pos tagger\n", 181 | "def posTag(review):\n", 182 | " tagged_text_list=[]\n", 183 | " for text in review:\n", 184 | " tagged_text_list.append(stanford_tag.tag(word_tokenize(text)))\n", 185 | " return tagged_text_list\n", 186 | "#posTag(\"this is random text\")" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 10, 192 | "metadata": { 193 | "collapsed": true 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "#Filter the word with tag- noun,adjective,verb,adverb\n", 198 | "def filterTag(tagged_review):\n", 199 | " final_text_list=[]\n", 200 | " for text_list in tagged_review:\n", 201 | " final_text=[]\n", 202 | " for word,tag in text_list:\n", 203 | " if tag in ['NN','NNS','NNP','NNPS','RB','RBR','RBS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']:\n", 204 | " final_text.append(word)\n", 205 | " final_text_list.append(' '.join(final_text))\n", 206 | " return final_text_list" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 11, 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "def get_dict_aspect(y,most_common_aspect):\n", 218 | " position=[]\n", 219 | " for innerlist in y:\n", 220 | " position.append([i for i, j in enumerate(innerlist) if j == 1])\n", 221 | " sorted_common=sorted(most_common_aspect)\n", 222 | " dict_aspect=[]\n", 223 | " for innerlist in position:\n", 224 | " inner_dict={}\n", 225 | " for word in sorted_common:\n", 226 | " if sorted_common.index(word) in innerlist:\n", 227 | " inner_dict[word]= 5\n", 228 | " else:\n", 229 | " inner_dict[word]=0\n", 230 | " dict_aspect.append(inner_dict)\n", 231 | " return dict_aspect" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 12, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "#Stage 1:\n", 243 | "#Making list to train\n", 244 | "train_text_list,train_opinion_list = get_list(path_train)\n", 245 | "most_common_aspect = get_most_common_aspect(train_opinion_list)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 13, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [], 255 | "source": [ 256 | "#This takes time to tag. Already tagged and saved. 
So, loading file ...\n", 257 | "#tagged_text_list_train=posTag(train_text_list)\n", 258 | "#joblib.dump(tagged_text_list_train, 'tagged_text_list_train.pkl')\n", 259 | "tagged_text_list_train=joblib.load('tagged_text_list_train.pkl')" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 14, 265 | "metadata": { 266 | "collapsed": true 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "#Train list after filtering\n", 271 | "final_train_text_list=filterTag(tagged_text_list_train)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 15, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "#Get the data frame\n", 283 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 284 | "df_train_aspect = get_aspect_data_frame(df_train,most_common_aspect)\n", 285 | "df_train_aspect = df_train_aspect.reindex_axis(sorted(df_train_aspect.columns), axis=1)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 16, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "#Similarly for the test list\n", 297 | "test_text_list,test_opinion_list = get_list(path_test)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 17, 303 | "metadata": { 304 | "collapsed": false 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "#tagged_text_list_test=posTag(test_text_list)\n", 309 | "#joblib.dump(tagged_text_list_test, 'tagged_text_list_test.pkl')\n", 310 | "tagged_text_list_test=joblib.load('tagged_text_list_test.pkl')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 18, 316 | "metadata": { 317 | "collapsed": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "final_test_text_list=filterTag(tagged_text_list_test)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 19, 327 | "metadata": { 328 | "collapsed": false 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n", 333 | "df_test_aspect = get_aspect_data_frame(df_test,most_common_aspect)\n", 334 | "df_test_aspect = df_test_aspect.reindex_axis(sorted(df_test_aspect.columns), axis=1)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 20, 340 | "metadata": { 341 | "collapsed": false 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "#Sort the data frame according to the aspects' names and separate data (X) and target (y)\n", 346 | "#df_train_aspect = df_train_aspect.sample(frac=1).reset_index(drop=True) #For random shuffling\n", 347 | "X_train= df_train_aspect.Review\n", 348 | "y_train = df_train_aspect.drop('Review',1)\n", 349 | "\n", 350 | "#df_test_aspect = df_test_aspect.sample(frac=1).reset_index(drop=True) #For random shuffling\n", 351 | "X_test = df_test_aspect.Review\n", 352 | "y_test = df_test_aspect.drop('Review',1)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 21, 358 | "metadata": { 359 | "collapsed": false 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "#Change y_train and y_test to numpy arrays\n", 364 | "import numpy as np\n", 365 | "y_train = np.asarray(y_train, dtype=np.int64)\n", 366 | "y_test = np.asarray(y_test, dtype=np.int64)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 22, 372 | "metadata": { 373 | "collapsed": false 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "#Generate word vectors using 
CountVectorizer\n", 378 | "from sklearn.feature_extraction.text import CountVectorizer\n", 379 | "from nltk import word_tokenize \n", 380 | "from nltk.stem import WordNetLemmatizer \n", 381 | "vect = CountVectorizer(max_df=1.0,stop_words='english') \n", 382 | "X_train_dtm = vect.fit_transform(X_train)\n", 383 | "X_test_dtm = vect.transform(X_test)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 23, 389 | "metadata": { 390 | "collapsed": false, 391 | "scrolled": true 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "#Create various models. These are multi-label models.\n", 396 | "nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)\n", 397 | "C = 1.0 #SVregularization parameter\n", 398 | "svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)\n", 399 | "lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)\n", 400 | "sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train_dtm,y_train)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 24, 406 | "metadata": { 407 | "collapsed": false 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "#Predict the test data using classifiers\n", 412 | "y_pred_class = nb_classif.predict(X_test_dtm)\n", 413 | "y_pred_class_svc = svc.predict(X_test_dtm)\n", 414 | "y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)\n", 415 | "y_pred_class_sgd = sgd.predict(X_test_dtm)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 25, 421 | "metadata": { 422 | "collapsed": true 423 | }, 424 | "outputs": [], 425 | "source": [ 426 | "#Following code to test metrics of all aspect extraction classifiers\n", 427 | "from sklearn import metrics" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 26, 433 | "metadata": { 434 | "collapsed": false 435 | }, 436 | "outputs": [ 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "0.025\n", 442 | "0.05\n", 443 | "0.05\n", 444 | "0.0375\n" 445 | ] 446 | } 447 | ], 448 | "source": [ 449 | "print(metrics.accuracy_score(y_test,y_pred_class))\n", 450 | "print(metrics.accuracy_score(y_test,y_pred_class_svc))\n", 451 | "print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))\n", 452 | "print(metrics.accuracy_score(y_test,y_pred_class_sgd))" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 27, 458 | "metadata": { 459 | "collapsed": false 460 | }, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "0.75\n", 467 | "0.711229946524\n", 468 | "0.732193732194\n", 469 | "0.700657894737\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "print(metrics.precision_score(y_test,y_pred_class,average='micro'))\n", 475 | "print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))\n", 476 | "print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 477 | "print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 28, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "0.457627118644\n", 492 | "0.64406779661\n", 493 | "0.622276029056\n", 494 | "0.515738498789\n" 495 | ] 496 | } 497 | ], 498 | "source": [ 499 | "print(metrics.recall_score(y_test,y_pred_class,average='micro'))\n", 500 | 
"print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))\n", 501 | "print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 502 | "print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 29, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "0.568421052632\n", 517 | "0.675984752224\n", 518 | "0.67277486911\n", 519 | "0.594142259414\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "print(metrics.f1_score(y_test,y_pred_class,average='micro'))\n", 525 | "print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))\n", 526 | "print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 527 | "print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 30, 533 | "metadata": { 534 | "collapsed": false, 535 | "scrolled": true 536 | }, 537 | "outputs": [ 538 | { 539 | "name": "stdout", 540 | "output_type": "stream", 541 | "text": [ 542 | " precision recall f1-score support\n", 543 | "\n", 544 | " 0 0.67 0.14 0.24 14\n", 545 | " 1 0.71 0.50 0.59 24\n", 546 | " 2 0.00 0.00 0.00 12\n", 547 | " 3 0.00 0.00 0.00 4\n", 548 | " 4 0.00 0.00 0.00 21\n", 549 | " 5 0.00 0.00 0.00 8\n", 550 | " 6 0.00 0.00 0.00 7\n", 551 | " 7 0.76 0.64 0.69 39\n", 552 | " 8 1.00 1.00 1.00 80\n", 553 | " 9 0.44 0.17 0.24 24\n", 554 | " 10 0.62 0.70 0.65 46\n", 555 | " 11 0.00 0.00 0.00 5\n", 556 | " 12 0.57 0.30 0.39 27\n", 557 | " 13 0.57 0.45 0.50 29\n", 558 | " 14 0.77 0.33 0.47 30\n", 559 | " 15 0.00 0.00 0.00 4\n", 560 | " 16 0.00 0.00 0.00 9\n", 561 | " 17 0.00 0.00 0.00 15\n", 562 | " 18 0.00 0.00 0.00 4\n", 563 | " 19 0.60 0.27 0.37 11\n", 564 | "\n", 565 | "avg / total 0.57 0.46 0.49 413\n", 566 | "\n", 567 | " precision recall f1-score support\n", 568 | "\n", 569 | " 0 0.78 1.00 0.88 14\n", 570 | " 1 0.68 0.71 0.69 24\n", 571 | " 2 0.86 0.50 0.63 12\n", 572 | " 3 0.12 0.25 0.17 4\n", 573 | " 4 0.56 0.43 0.49 21\n", 574 | " 5 0.75 0.38 0.50 8\n", 575 | " 6 0.20 0.14 0.17 7\n", 576 | " 7 0.74 0.67 0.70 39\n", 577 | " 8 1.00 0.97 0.99 80\n", 578 | " 9 0.63 0.50 0.56 24\n", 579 | " 10 0.68 0.74 0.71 46\n", 580 | " 11 0.33 0.40 0.36 5\n", 581 | " 12 0.83 0.74 0.78 27\n", 582 | " 13 0.56 0.66 0.60 29\n", 583 | " 14 0.60 0.40 0.48 30\n", 584 | " 15 0.67 0.50 0.57 4\n", 585 | " 16 0.18 0.22 0.20 9\n", 586 | " 17 0.80 0.27 0.40 15\n", 587 | " 18 1.00 0.25 0.40 4\n", 588 | " 19 0.60 0.27 0.37 11\n", 589 | "\n", 590 | "avg / total 0.72 0.64 0.67 413\n", 591 | "\n", 592 | " precision recall f1-score support\n", 593 | "\n", 594 | " 0 0.78 1.00 0.88 14\n", 595 | " 1 0.65 0.71 0.68 24\n", 596 | " 2 1.00 0.42 0.59 12\n", 597 | " 3 0.17 0.25 0.20 4\n", 598 | " 4 0.64 0.43 0.51 21\n", 599 | " 5 1.00 0.25 0.40 8\n", 600 | " 6 0.33 0.14 0.20 7\n", 601 | " 7 0.74 0.67 0.70 39\n", 602 | " 8 1.00 0.96 0.98 80\n", 603 | " 9 0.61 0.46 0.52 24\n", 604 | " 10 0.70 0.72 0.71 46\n", 605 | " 11 0.25 0.20 0.22 5\n", 606 | " 12 0.83 0.74 0.78 27\n", 607 | " 13 0.54 0.66 0.59 29\n", 608 | " 14 0.61 0.37 0.46 30\n", 609 | " 15 1.00 0.25 0.40 4\n", 610 | " 16 0.20 0.22 0.21 9\n", 611 | " 17 1.00 0.20 0.33 15\n", 612 | " 18 1.00 0.25 0.40 4\n", 613 | " 19 0.75 0.27 0.40 11\n", 614 | "\n", 615 | "avg / total 0.75 0.62 0.66 413\n", 616 | "\n", 617 | " precision recall f1-score support\n", 618 | "\n", 619 | " 0 0.75 0.64 0.69 
14\n", 620 | " 1 0.60 0.75 0.67 24\n", 621 | " 2 0.00 0.00 0.00 12\n", 622 | " 3 0.00 0.00 0.00 4\n", 623 | " 4 0.50 0.19 0.28 21\n", 624 | " 5 0.00 0.00 0.00 8\n", 625 | " 6 0.00 0.00 0.00 7\n", 626 | " 7 0.63 0.62 0.62 39\n", 627 | " 8 1.00 0.99 0.99 80\n", 628 | " 9 0.67 0.33 0.44 24\n", 629 | " 10 0.65 0.61 0.63 46\n", 630 | " 11 0.29 0.40 0.33 5\n", 631 | " 12 0.71 0.37 0.49 27\n", 632 | " 13 0.58 0.48 0.53 29\n", 633 | " 14 0.71 0.33 0.45 30\n", 634 | " 15 0.00 0.00 0.00 4\n", 635 | " 16 0.20 0.11 0.14 9\n", 636 | " 17 0.67 0.13 0.22 15\n", 637 | " 18 0.33 0.25 0.29 4\n", 638 | " 19 0.43 0.27 0.33 11\n", 639 | "\n", 640 | "avg / total 0.64 0.52 0.55 413\n", 641 | "\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "with warnings.catch_warnings():\n", 647 | " warnings.simplefilter(\"ignore\")\n", 648 | " print(metrics.classification_report(y_test, y_pred_class))\n", 649 | " print(metrics.classification_report(y_test, y_pred_class_svc))\n", 650 | " print(metrics.classification_report(y_test, y_pred_class_lin_svc))\n", 651 | " print(metrics.classification_report(y_test, y_pred_class_sgd))" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 31, 657 | "metadata": { 658 | "collapsed": false 659 | }, 660 | "outputs": [], 661 | "source": [ 662 | "#Stage 2:\n", 663 | "#Generating extra feature that indicates which aspect category is present in the review\n", 664 | "train_dict_aspect=get_dict_aspect(y_train, most_common_aspect)\n", 665 | "d_train=DictVectorizer() \n", 666 | "X_train_aspect_dtm = d_train.fit_transform(train_dict_aspect)\n", 667 | "\n", 668 | "#y_test is used to generated extra feature in order to test the performance of 2nd classifer.\n", 669 | "#Use y_pred_class_svc(Highest performer for aspect classification) as input for extra feature to test the overall performace.\n", 670 | "test_dict_aspect=get_dict_aspect(y_test,most_common_aspect)\n", 671 | "d_test=DictVectorizer() \n", 672 | "X_test_aspect_dtm = d_test.fit_transform(test_dict_aspect)" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 32, 678 | "metadata": { 679 | "collapsed": false 680 | }, 681 | "outputs": [], 682 | "source": [ 683 | "#Function for classiflying positive,negative or neutral sentiment of all the aspects\n", 684 | "def classify_sentiment(df_train,df_test,X_train_aspect_dtm,X_test_aspect_dtm):\n", 685 | " \n", 686 | " df_train = df_train.reindex_axis(sorted(df_train_positive.columns), axis=1)\n", 687 | " df_test = df_test.reindex_axis(sorted(df_test_positive.columns), axis=1)\n", 688 | "\n", 689 | " import numpy as np\n", 690 | " X_train = df_train.Review\n", 691 | " y_train = df_train.drop('Review',1)\n", 692 | " y_train = np.asarray(y_train, dtype=np.int64)\n", 693 | "\n", 694 | " X_test = df_test.Review\n", 695 | " y_test = df_test.drop('Review',1)\n", 696 | " y_test = np.asarray(y_test, dtype=np.int64)\n", 697 | "\n", 698 | " vect_sen = CountVectorizer(stop_words='english',ngram_range=(1,2)) \n", 699 | " X_train_dtm = vect_sen.fit_transform(X_train)\n", 700 | " X_test_dtm = vect_sen.transform(X_test)\n", 701 | "\n", 702 | " #ombining word vector with extra feature.\n", 703 | " from scipy.sparse import hstack\n", 704 | " X_train_dtm=hstack((X_train_dtm, X_train_aspect_dtm))\n", 705 | " X_test_dtm=hstack((X_test_dtm, X_test_aspect_dtm))\n", 706 | "\n", 707 | " C = 1.0 #SVregularization parameter\n", 708 | " nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)\n", 709 | " svc = OneVsRestClassifier(svm.SVC(kernel='linear', 
C=C)).fit(X_train_dtm, y_train)\n", 710 | " lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)\n", 711 | " sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train_dtm,y_train)\n", 712 | "\n", 713 | " y_pred_class= nb_classif.predict(X_test_dtm)\n", 714 | " y_pred_class_svc = svc.predict(X_test_dtm)\n", 715 | " y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)\n", 716 | " y_pred_class_sgd = sgd.predict(X_test_dtm)\n", 717 | " return (y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 33, 723 | "metadata": { 724 | "collapsed": true 725 | }, 726 | "outputs": [], 727 | "source": [ 728 | "def print_metrices(y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd):\n", 729 | " print(\"Accuracy:\")\n", 730 | " print(metrics.accuracy_score(y_test,y_pred_class))\n", 731 | " print(metrics.accuracy_score(y_test,y_pred_class_svc))\n", 732 | " print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))\n", 733 | " print(metrics.accuracy_score(y_test,y_pred_class_sgd))\n", 734 | "\n", 735 | " print(\"\\nAverage precision:\")\n", 736 | " print(metrics.precision_score(y_test,y_pred_class,average='micro'))\n", 737 | " print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))\n", 738 | " print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 739 | " print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))\n", 740 | "\n", 741 | " print(\"\\nAverage recall:\")\n", 742 | " print(metrics.recall_score(y_test,y_pred_class,average='micro'))\n", 743 | " print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))\n", 744 | " print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 745 | " print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))\n", 746 | " \n", 747 | " print(\"\\nAverage f1:\")\n", 748 | " print(metrics.f1_score(y_test,y_pred_class,average='micro'))\n", 749 | " print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))\n", 750 | " print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))\n", 751 | " print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))\n", 752 | "\n", 753 | " print(\"\\nClassification report:\")\n", 754 | " print(metrics.classification_report(y_test, y_pred_class))\n", 755 | " print(metrics.classification_report(y_test, y_pred_class_svc))\n", 756 | " print(metrics.classification_report(y_test, y_pred_class_lin_svc))\n", 757 | " print(metrics.classification_report(y_test, y_pred_class_sgd))" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 34, 763 | "metadata": { 764 | "collapsed": false 765 | }, 766 | "outputs": [ 767 | { 768 | "name": "stdout", 769 | "output_type": "stream", 770 | "text": [ 771 | "Accuracy:\n", 772 | "0.15\n", 773 | "0.35\n", 774 | "0.3125\n", 775 | "0.125\n", 776 | "\n", 777 | "Average precision:\n", 778 | "0.857142857143\n", 779 | "0.745762711864\n", 780 | "0.756097560976\n", 781 | "0.704545454545\n", 782 | "\n", 783 | "Average recall:\n", 784 | "0.260869565217\n", 785 | "0.797101449275\n", 786 | "0.673913043478\n", 787 | "0.673913043478\n", 788 | "\n", 789 | "Average f1:\n", 790 | "0.4\n", 791 | "0.77057793345\n", 792 | "0.712643678161\n", 793 | "0.688888888889\n", 794 | "\n", 795 | "Classification report:\n", 796 | " precision recall f1-score support\n", 797 | "\n", 798 | " 0 0.00 0.00 0.00 12\n", 799 | " 1 0.00 0.00 0.00 14\n", 800 | " 2 0.00 0.00 0.00 6\n", 801 | " 3 
0.00 0.00 0.00 3\n", 802 | " 4 0.00 0.00 0.00 15\n", 803 | " 5 0.00 0.00 0.00 7\n", 804 | " 6 0.00 0.00 0.00 4\n", 805 | " 7 0.00 0.00 0.00 26\n", 806 | " 8 0.83 0.95 0.89 61\n", 807 | " 9 0.00 0.00 0.00 11\n", 808 | " 10 1.00 0.39 0.56 36\n", 809 | " 11 0.00 0.00 0.00 4\n", 810 | " 12 0.00 0.00 0.00 12\n", 811 | " 13 0.00 0.00 0.00 18\n", 812 | " 14 0.00 0.00 0.00 23\n", 813 | " 15 0.00 0.00 0.00 3\n", 814 | " 16 0.00 0.00 0.00 7\n", 815 | " 17 0.00 0.00 0.00 8\n", 816 | " 18 0.00 0.00 0.00 2\n", 817 | " 19 0.00 0.00 0.00 4\n", 818 | "\n", 819 | "avg / total 0.31 0.26 0.27 276\n", 820 | "\n", 821 | " precision recall f1-score support\n", 822 | "\n", 823 | " 0 0.91 0.83 0.87 12\n", 824 | " 1 0.67 0.71 0.69 14\n", 825 | " 2 0.50 1.00 0.67 6\n", 826 | " 3 0.75 1.00 0.86 3\n", 827 | " 4 0.65 0.73 0.69 15\n", 828 | " 5 0.75 0.43 0.55 7\n", 829 | " 6 0.67 0.50 0.57 4\n", 830 | " 7 0.71 0.77 0.74 26\n", 831 | " 8 0.89 0.93 0.91 61\n", 832 | " 9 0.47 0.82 0.60 11\n", 833 | " 10 0.89 0.89 0.89 36\n", 834 | " 11 0.75 0.75 0.75 4\n", 835 | " 12 0.60 0.75 0.67 12\n", 836 | " 13 0.72 0.72 0.72 18\n", 837 | " 14 0.75 0.91 0.82 23\n", 838 | " 15 0.67 0.67 0.67 3\n", 839 | " 16 0.25 0.14 0.18 7\n", 840 | " 17 0.80 0.50 0.62 8\n", 841 | " 18 0.67 1.00 0.80 2\n", 842 | " 19 1.00 0.50 0.67 4\n", 843 | "\n", 844 | "avg / total 0.76 0.80 0.77 276\n", 845 | "\n", 846 | " precision recall f1-score support\n", 847 | "\n", 848 | " 0 1.00 0.75 0.86 12\n", 849 | " 1 0.64 0.64 0.64 14\n", 850 | " 2 0.60 0.50 0.55 6\n", 851 | " 3 0.75 1.00 0.86 3\n", 852 | " 4 0.69 0.60 0.64 15\n", 853 | " 5 1.00 0.14 0.25 7\n", 854 | " 6 0.00 0.00 0.00 4\n", 855 | " 7 0.68 0.65 0.67 26\n", 856 | " 8 0.88 0.92 0.90 61\n", 857 | " 9 0.44 0.73 0.55 11\n", 858 | " 10 0.89 0.86 0.87 36\n", 859 | " 11 1.00 0.50 0.67 4\n", 860 | " 12 0.45 0.42 0.43 12\n", 861 | " 13 0.73 0.61 0.67 18\n", 862 | " 14 0.71 0.65 0.68 23\n", 863 | " 15 1.00 0.33 0.50 3\n", 864 | " 16 0.00 0.00 0.00 7\n", 865 | " 17 0.75 0.38 0.50 8\n", 866 | " 18 0.67 1.00 0.80 2\n", 867 | " 19 1.00 0.25 0.40 4\n", 868 | "\n", 869 | "avg / total 0.74 0.67 0.69 276\n", 870 | "\n", 871 | " precision recall f1-score support\n", 872 | "\n", 873 | " 0 1.00 0.58 0.74 12\n", 874 | " 1 0.64 1.00 0.78 14\n", 875 | " 2 0.50 0.83 0.62 6\n", 876 | " 3 0.75 1.00 0.86 3\n", 877 | " 4 0.72 0.87 0.79 15\n", 878 | " 5 0.50 0.14 0.22 7\n", 879 | " 6 0.75 0.75 0.75 4\n", 880 | " 7 0.53 0.31 0.39 26\n", 881 | " 8 0.88 0.85 0.87 61\n", 882 | " 9 0.38 0.55 0.44 11\n", 883 | " 10 0.78 0.97 0.86 36\n", 884 | " 11 0.80 1.00 0.89 4\n", 885 | " 12 0.67 0.17 0.27 12\n", 886 | " 13 0.73 0.44 0.55 18\n", 887 | " 14 0.73 0.83 0.78 23\n", 888 | " 15 0.00 0.00 0.00 3\n", 889 | " 16 0.00 0.00 0.00 7\n", 890 | " 17 0.33 0.12 0.18 8\n", 891 | " 18 0.67 1.00 0.80 2\n", 892 | " 19 0.30 0.75 0.43 4\n", 893 | "\n", 894 | "avg / total 0.69 0.67 0.65 276\n", 895 | "\n" 896 | ] 897 | } 898 | ], 899 | "source": [ 900 | "#For positive sentiment classifier\n", 901 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 902 | "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n", 903 | "\n", 904 | "df_train_positive = get_positive_data_frame(df_train,most_common_aspect)\n", 905 | "df_test_positive = get_positive_data_frame(df_test,most_common_aspect)\n", 906 | "y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos=classify_sentiment(df_train_positive,df_test_positive,X_train_aspect_dtm,X_test_aspect_dtm)\n", 907 
| "with warnings.catch_warnings():\n", 908 | " warnings.simplefilter(\"ignore\")\n", 909 | " print_metrices(y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos)" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 35, 915 | "metadata": { 916 | "collapsed": false 917 | }, 918 | "outputs": [ 919 | { 920 | "name": "stdout", 921 | "output_type": "stream", 922 | "text": [ 923 | "Accuracy:\n", 924 | "0.4875\n", 925 | "0.4875\n", 926 | "0.4625\n", 927 | "0.3375\n", 928 | "\n", 929 | "Average precision:\n", 930 | "0.7\n", 931 | "0.625\n", 932 | "0.666666666667\n", 933 | "0.449438202247\n", 934 | "\n", 935 | "Average recall:\n", 936 | "0.0642201834862\n", 937 | "0.412844036697\n", 938 | "0.330275229358\n", 939 | "0.366972477064\n", 940 | "\n", 941 | "Average f1:\n", 942 | "0.117647058824\n", 943 | "0.497237569061\n", 944 | "0.441717791411\n", 945 | "0.40404040404\n", 946 | "\n", 947 | "Classification report:\n", 948 | " precision recall f1-score support\n", 949 | "\n", 950 | " 0 0.00 0.00 0.00 2\n", 951 | " 1 0.00 0.00 0.00 10\n", 952 | " 2 0.00 0.00 0.00 2\n", 953 | " 3 0.00 0.00 0.00 1\n", 954 | " 4 0.00 0.00 0.00 4\n", 955 | " 5 0.00 0.00 0.00 1\n", 956 | " 6 0.00 0.00 0.00 3\n", 957 | " 7 0.00 0.00 0.00 3\n", 958 | " 8 0.67 0.33 0.44 18\n", 959 | " 9 0.00 0.00 0.00 9\n", 960 | " 10 0.00 0.00 0.00 10\n", 961 | " 11 0.00 0.00 0.00 1\n", 962 | " 12 0.00 0.00 0.00 12\n", 963 | " 13 1.00 0.09 0.17 11\n", 964 | " 14 0.00 0.00 0.00 6\n", 965 | " 15 0.00 0.00 0.00 0\n", 966 | " 16 0.00 0.00 0.00 2\n", 967 | " 17 0.00 0.00 0.00 5\n", 968 | " 18 0.00 0.00 0.00 2\n", 969 | " 19 0.00 0.00 0.00 7\n", 970 | "\n", 971 | "avg / total 0.21 0.06 0.09 109\n", 972 | "\n", 973 | " precision recall f1-score support\n", 974 | "\n", 975 | " 0 0.50 0.50 0.50 2\n", 976 | " 1 0.67 0.60 0.63 10\n", 977 | " 2 0.00 0.00 0.00 2\n", 978 | " 3 0.00 0.00 0.00 1\n", 979 | " 4 0.50 0.25 0.33 4\n", 980 | " 5 0.00 0.00 0.00 1\n", 981 | " 6 1.00 0.67 0.80 3\n", 982 | " 7 0.00 0.00 0.00 3\n", 983 | " 8 0.77 0.56 0.65 18\n", 984 | " 9 1.00 0.22 0.36 9\n", 985 | " 10 0.78 0.70 0.74 10\n", 986 | " 11 0.00 0.00 0.00 1\n", 987 | " 12 0.75 0.25 0.38 12\n", 988 | " 13 0.70 0.64 0.67 11\n", 989 | " 14 0.00 0.00 0.00 6\n", 990 | " 15 0.00 0.00 0.00 0\n", 991 | " 16 0.00 0.00 0.00 2\n", 992 | " 17 0.00 0.00 0.00 5\n", 993 | " 18 0.00 0.00 0.00 2\n", 994 | " 19 0.75 0.86 0.80 7\n", 995 | "\n", 996 | "avg / total 0.60 0.41 0.47 109\n", 997 | "\n", 998 | " precision recall f1-score support\n", 999 | "\n", 1000 | " 0 1.00 0.50 0.67 2\n", 1001 | " 1 0.67 0.40 0.50 10\n", 1002 | " 2 0.00 0.00 0.00 2\n", 1003 | " 3 0.00 0.00 0.00 1\n", 1004 | " 4 0.33 0.25 0.29 4\n", 1005 | " 5 0.00 0.00 0.00 1\n", 1006 | " 6 1.00 0.33 0.50 3\n", 1007 | " 7 0.00 0.00 0.00 3\n", 1008 | " 8 0.77 0.56 0.65 18\n", 1009 | " 9 0.00 0.00 0.00 9\n", 1010 | " 10 0.86 0.60 0.71 10\n", 1011 | " 11 0.00 0.00 0.00 1\n", 1012 | " 12 1.00 0.08 0.15 12\n", 1013 | " 13 0.70 0.64 0.67 11\n", 1014 | " 14 0.00 0.00 0.00 6\n", 1015 | " 15 0.00 0.00 0.00 0\n", 1016 | " 16 0.00 0.00 0.00 2\n", 1017 | " 17 0.00 0.00 0.00 5\n", 1018 | " 18 0.00 0.00 0.00 2\n", 1019 | " 19 0.83 0.71 0.77 7\n", 1020 | "\n", 1021 | "avg / total 0.56 0.33 0.39 109\n", 1022 | "\n", 1023 | " precision recall f1-score support\n", 1024 | "\n", 1025 | " 0 1.00 0.50 0.67 2\n", 1026 | " 1 0.71 0.50 0.59 10\n", 1027 | " 2 0.00 0.00 0.00 2\n", 1028 | " 3 0.00 0.00 0.00 1\n", 1029 | " 4 1.00 0.25 0.40 4\n", 1030 | " 5 0.00 0.00 0.00 1\n", 1031 | " 6 0.67 
0.67 0.67 3\n", 1032 | " 7 0.13 0.67 0.22 3\n", 1033 | " 8 0.38 0.67 0.48 18\n", 1034 | " 9 1.00 0.11 0.20 9\n", 1035 | " 10 0.50 0.70 0.58 10\n", 1036 | " 11 0.00 0.00 0.00 1\n", 1037 | " 12 0.00 0.00 0.00 12\n", 1038 | " 13 0.75 0.27 0.40 11\n", 1039 | " 14 0.00 0.00 0.00 6\n", 1040 | " 15 0.00 0.00 0.00 0\n", 1041 | " 16 0.00 0.00 0.00 2\n", 1042 | " 17 0.00 0.00 0.00 5\n", 1043 | " 18 0.00 0.00 0.00 2\n", 1044 | " 19 0.86 0.86 0.86 7\n", 1045 | "\n", 1046 | "avg / total 0.46 0.37 0.35 109\n", 1047 | "\n" 1048 | ] 1049 | } 1050 | ], 1051 | "source": [ 1052 | "#For negative sentiment classifier\n", 1053 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 1054 | "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n", 1055 | "\n", 1056 | "df_train_neg = get_negative_data_frame(df_train,most_common_aspect)\n", 1057 | "df_test_neg = get_negative_data_frame(df_test,most_common_aspect)\n", 1058 | "\n", 1059 | "y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg=classify_sentiment(df_train_neg,df_test_neg,X_train_aspect_dtm,X_test_aspect_dtm)\n", 1060 | "with warnings.catch_warnings():\n", 1061 | " warnings.simplefilter(\"ignore\")\n", 1062 | " print_metrices(y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg)" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": 36, 1068 | "metadata": { 1069 | "collapsed": false 1070 | }, 1071 | "outputs": [ 1072 | { 1073 | "name": "stdout", 1074 | "output_type": "stream", 1075 | "text": [ 1076 | "Accuracy:\n", 1077 | "0.7375\n", 1078 | "0.725\n", 1079 | "0.7375\n", 1080 | "0.5875\n", 1081 | "\n", 1082 | "Average precision:\n", 1083 | "0.0\n", 1084 | "0.153846153846\n", 1085 | "0.333333333333\n", 1086 | "0.105263157895\n", 1087 | "\n", 1088 | "Average recall:\n", 1089 | "0.0\n", 1090 | "0.0714285714286\n", 1091 | "0.0357142857143\n", 1092 | "0.0714285714286\n", 1093 | "\n", 1094 | "Average f1:\n", 1095 | "0.0\n", 1096 | "0.0975609756098\n", 1097 | "0.0645161290323\n", 1098 | "0.0851063829787\n", 1099 | "\n", 1100 | "Classification report:\n", 1101 | " precision recall f1-score support\n", 1102 | "\n", 1103 | " 0 0.00 0.00 0.00 0\n", 1104 | " 1 0.00 0.00 0.00 0\n", 1105 | " 2 0.00 0.00 0.00 4\n", 1106 | " 3 0.00 0.00 0.00 0\n", 1107 | " 4 0.00 0.00 0.00 2\n", 1108 | " 5 0.00 0.00 0.00 0\n", 1109 | " 6 0.00 0.00 0.00 0\n", 1110 | " 7 0.00 0.00 0.00 10\n", 1111 | " 8 0.00 0.00 0.00 1\n", 1112 | " 9 0.00 0.00 0.00 4\n", 1113 | " 10 0.00 0.00 0.00 0\n", 1114 | " 11 0.00 0.00 0.00 0\n", 1115 | " 12 0.00 0.00 0.00 3\n", 1116 | " 13 0.00 0.00 0.00 0\n", 1117 | " 14 0.00 0.00 0.00 1\n", 1118 | " 15 0.00 0.00 0.00 1\n", 1119 | " 16 0.00 0.00 0.00 0\n", 1120 | " 17 0.00 0.00 0.00 2\n", 1121 | " 18 0.00 0.00 0.00 0\n", 1122 | " 19 0.00 0.00 0.00 0\n", 1123 | "\n", 1124 | "avg / total 0.00 0.00 0.00 28\n", 1125 | "\n", 1126 | " precision recall f1-score support\n", 1127 | "\n", 1128 | " 0 0.00 0.00 0.00 0\n", 1129 | " 1 0.00 0.00 0.00 0\n", 1130 | " 2 0.00 0.00 0.00 4\n", 1131 | " 3 0.00 0.00 0.00 0\n", 1132 | " 4 0.00 0.00 0.00 2\n", 1133 | " 5 0.00 0.00 0.00 0\n", 1134 | " 6 0.00 0.00 0.00 0\n", 1135 | " 7 0.29 0.20 0.24 10\n", 1136 | " 8 0.00 0.00 0.00 1\n", 1137 | " 9 0.00 0.00 0.00 4\n", 1138 | " 10 0.00 0.00 0.00 0\n", 1139 | " 11 0.00 0.00 0.00 0\n", 1140 | " 12 0.00 0.00 0.00 3\n", 1141 | " 13 0.00 0.00 0.00 0\n", 1142 | " 14 0.00 0.00 0.00 1\n", 1143 | " 15 0.00 0.00 0.00 
1\n", 1144 | " 16 0.00 0.00 0.00 0\n", 1145 | " 17 0.00 0.00 0.00 2\n", 1146 | " 18 0.00 0.00 0.00 0\n", 1147 | " 19 0.00 0.00 0.00 0\n", 1148 | "\n", 1149 | "avg / total 0.10 0.07 0.08 28\n", 1150 | "\n", 1151 | " precision recall f1-score support\n", 1152 | "\n", 1153 | " 0 0.00 0.00 0.00 0\n", 1154 | " 1 0.00 0.00 0.00 0\n", 1155 | " 2 0.00 0.00 0.00 4\n", 1156 | " 3 0.00 0.00 0.00 0\n", 1157 | " 4 0.00 0.00 0.00 2\n", 1158 | " 5 0.00 0.00 0.00 0\n", 1159 | " 6 0.00 0.00 0.00 0\n", 1160 | " 7 0.33 0.10 0.15 10\n", 1161 | " 8 0.00 0.00 0.00 1\n", 1162 | " 9 0.00 0.00 0.00 4\n", 1163 | " 10 0.00 0.00 0.00 0\n", 1164 | " 11 0.00 0.00 0.00 0\n", 1165 | " 12 0.00 0.00 0.00 3\n", 1166 | " 13 0.00 0.00 0.00 0\n", 1167 | " 14 0.00 0.00 0.00 1\n", 1168 | " 15 0.00 0.00 0.00 1\n", 1169 | " 16 0.00 0.00 0.00 0\n", 1170 | " 17 0.00 0.00 0.00 2\n", 1171 | " 18 0.00 0.00 0.00 0\n", 1172 | " 19 0.00 0.00 0.00 0\n", 1173 | "\n", 1174 | "avg / total 0.12 0.04 0.05 28\n", 1175 | "\n", 1176 | " precision recall f1-score support\n", 1177 | "\n", 1178 | " 0 0.00 0.00 0.00 0\n", 1179 | " 1 0.00 0.00 0.00 0\n", 1180 | " 2 0.00 0.00 0.00 4\n", 1181 | " 3 0.00 0.00 0.00 0\n", 1182 | " 4 0.00 0.00 0.00 2\n", 1183 | " 5 0.00 0.00 0.00 0\n", 1184 | " 6 0.00 0.00 0.00 0\n", 1185 | " 7 0.00 0.00 0.00 10\n", 1186 | " 8 0.00 0.00 0.00 1\n", 1187 | " 9 0.00 0.00 0.00 4\n", 1188 | " 10 0.00 0.00 0.00 0\n", 1189 | " 11 0.00 0.00 0.00 0\n", 1190 | " 12 0.18 0.67 0.29 3\n", 1191 | " 13 0.00 0.00 0.00 0\n", 1192 | " 14 0.00 0.00 0.00 1\n", 1193 | " 15 0.00 0.00 0.00 1\n", 1194 | " 16 0.00 0.00 0.00 0\n", 1195 | " 17 0.00 0.00 0.00 2\n", 1196 | " 18 0.00 0.00 0.00 0\n", 1197 | " 19 0.00 0.00 0.00 0\n", 1198 | "\n", 1199 | "avg / total 0.02 0.07 0.03 28\n", 1200 | "\n" 1201 | ] 1202 | } 1203 | ], 1204 | "source": [ 1205 | "#For neutral or conflict sentiment classifier\n", 1206 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 1207 | "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n", 1208 | "\n", 1209 | "df_train_neu = get_neutral_data_frame(df_train,most_common_aspect)\n", 1210 | "df_test_neu = get_neutral_data_frame(df_test,most_common_aspect)\n", 1211 | "\n", 1212 | "y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu=classify_sentiment(df_train_neu,df_test_neu,X_train_aspect_dtm,X_test_aspect_dtm)\n", 1213 | "with warnings.catch_warnings():\n", 1214 | " warnings.simplefilter(\"ignore\")\n", 1215 | " print_metrices(y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu)" 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "code", 1220 | "execution_count": 37, 1221 | "metadata": { 1222 | "collapsed": false, 1223 | "scrolled": false 1224 | }, 1225 | "outputs": [ 1226 | { 1227 | "name": "stdout", 1228 | "output_type": "stream", 1229 | "text": [ 1230 | "Enter a laptop review:\n", 1231 | "\n", 1232 | "This is my first asus laptop. So far i am really enjoying this laptop. 512GB SSD is super fast. Battery life is also good and can last very long. I have no complain on screen quality too as display supports 4k videos. Maybe that is why it costs a lot. This is an expensive laptop and it's price is very high compared to other laptops of similar specs. 
So, if you have no trouble paying for this laptop, it is pretty good.\n" 1233 | ] 1234 | }, 1235 | { 1236 | "data": { 1237 | "text/plain": [ 1238 | "array([[1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]])" 1239 | ] 1240 | }, 1241 | "execution_count": 37, 1242 | "metadata": {}, 1243 | "output_type": "execute_result" 1244 | } 1245 | ], 1246 | "source": [ 1247 | "#Aspect Based Sentiment analysis of the user's input.\n", 1248 | "user_input=input(\"Enter a laptop review:\\n\\n\")\n", 1249 | "#Preprocessing and vectorizing\n", 1250 | "tagged_user_input = posTag([user_input])\n", 1251 | "filter_tagged_user_input = filterTag(tagged_user_input)\n", 1252 | "\n", 1253 | "user_input_series=pd.Series(filter_tagged_user_input)\n", 1254 | "user_input_series_dtm=vect.transform(user_input_series)\n", 1255 | "\n", 1256 | "predict_aspect= svc.predict(user_input_series_dtm)\n", 1257 | "extra_feature=get_dict_aspect(predict_aspect, most_common_aspect)\n", 1258 | "extra_feature_dtm=DictVectorizer().fit_transform(extra_feature)\n", 1259 | "predict_aspect" 1260 | ] 1261 | }, 1262 | { 1263 | "cell_type": "code", 1264 | "execution_count": 38, 1265 | "metadata": { 1266 | "collapsed": false 1267 | }, 1268 | "outputs": [ 1269 | { 1270 | "data": { 1271 | "text/plain": [ 1272 | "array([[1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]])" 1273 | ] 1274 | }, 1275 | "execution_count": 38, 1276 | "metadata": {}, 1277 | "output_type": "execute_result" 1278 | } 1279 | ], 1280 | "source": [ 1281 | "#Predicting whether each detected aspect is positive or not\n", 1282 | "test_opinion_list=[]\n", 1283 | "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n", 1284 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 1285 | "\n", 1286 | "df_train_positive = get_positive_data_frame(df_train,most_common_aspect)\n", 1287 | "y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos=classify_sentiment(df_train_positive,df_test,X_train_aspect_dtm,extra_feature_dtm)\n", 1288 | "\n", 1289 | "y_pred_class_svc_pos" 1290 | ] 1291 | }, 1292 | { 1293 | "cell_type": "code", 1294 | "execution_count": 39, 1295 | "metadata": { 1296 | "collapsed": false 1297 | }, 1298 | "outputs": [ 1299 | { 1300 | "data": { 1301 | "text/plain": [ 1302 | "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])" 1303 | ] 1304 | }, 1305 | "execution_count": 39, 1306 | "metadata": {}, 1307 | "output_type": "execute_result" 1308 | } 1309 | ], 1310 | "source": [ 1311 | "#Predicting whether each detected aspect is negative or not\n", 1312 | "test_opinion_list=[]\n", 1313 | "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n", 1314 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 1315 | "\n", 1316 | "df_train_negative = get_negative_data_frame(df_train,most_common_aspect)\n", 1317 | "y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg=classify_sentiment(df_train_negative,df_test,X_train_aspect_dtm,extra_feature_dtm)\n", 1318 | "\n", 1319 | "y_pred_class_svc_neg" 1320 | ] 1321 | }, 1322 | { 1323 | "cell_type": "code", 1324 | "execution_count": 40, 1325 | "metadata": { 1326 | "collapsed": false 1327 | }, 1328 | "outputs": [ 1329 | { 1330 | "data": { 1331 | "text/plain": [ 1332 | "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])" 1333 | ] 1334 | }, 1335 | "execution_count": 40, 1336 | 
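# Aside -- a condensed sketch (not a cell from the notebook) of what the
# user-input cells above do end to end; posTag, filterTag, and the fitted
# vect/svc/most_common_aspect objects are assumed to be in scope from earlier.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

def predict_user_aspects(review_text):
    filtered = filterTag(posTag([review_text]))   # POS-tag, keep nouns/verbs/adjectives/adverbs
    dtm = vect.transform(pd.Series(filtered))     # reuse the stage-1 vocabulary
    aspect_pred = svc.predict(dtm)                # a 1 marks a detected aspect category
    extra = get_dict_aspect(aspect_pred, most_common_aspect)
    # Fitting a fresh DictVectorizer mirrors the notebook; the columns match
    # training only because get_dict_aspect always emits the full sorted key set.
    return aspect_pred, DictVectorizer().fit_transform(extra)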
"metadata": {}, 1337 | "output_type": "execute_result" 1338 | } 1339 | ], 1340 | "source": [ 1341 | "#predicting weather the dectected aspect is neutral or coflict or not\n", 1342 | "test_opinion_list=[]\n", 1343 | "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n", 1344 | "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n", 1345 | "\n", 1346 | "df_train_neutral = get_neutral_data_frame(df_train,most_common_aspect)\n", 1347 | "y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu=classify_sentiment(df_train_neutral,df_test,X_train_aspect_dtm,extra_feature_dtm)\n", 1348 | "\n", 1349 | "y_pred_class_svc_neu" 1350 | ] 1351 | }, 1352 | { 1353 | "cell_type": "code", 1354 | "execution_count": 41, 1355 | "metadata": { 1356 | "collapsed": false 1357 | }, 1358 | "outputs": [ 1359 | { 1360 | "data": { 1361 | "text/plain": [ 1362 | "[0, 3, 8, 10, 13]" 1363 | ] 1364 | }, 1365 | "execution_count": 41, 1366 | "metadata": {}, 1367 | "output_type": "execute_result" 1368 | } 1369 | ], 1370 | "source": [ 1371 | "#Finding the aspect that is positive\n", 1372 | "index_positive=[]\n", 1373 | "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_pos.tolist()[0])):\n", 1374 | " if a ==1 and b==1:\n", 1375 | " index_positive.append(i)\n", 1376 | "index_positive " 1377 | ] 1378 | }, 1379 | { 1380 | "cell_type": "code", 1381 | "execution_count": 42, 1382 | "metadata": { 1383 | "collapsed": false 1384 | }, 1385 | "outputs": [ 1386 | { 1387 | "data": { 1388 | "text/plain": [ 1389 | "[]" 1390 | ] 1391 | }, 1392 | "execution_count": 42, 1393 | "metadata": {}, 1394 | "output_type": "execute_result" 1395 | } 1396 | ], 1397 | "source": [ 1398 | "#Finding the aspect that is negative\n", 1399 | "index_negative=[]\n", 1400 | "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_neg.tolist()[0])):\n", 1401 | " if a ==1 and b==1:\n", 1402 | " index_negative.append(i)\n", 1403 | "index_negative " 1404 | ] 1405 | }, 1406 | { 1407 | "cell_type": "code", 1408 | "execution_count": 43, 1409 | "metadata": { 1410 | "collapsed": false 1411 | }, 1412 | "outputs": [ 1413 | { 1414 | "data": { 1415 | "text/plain": [ 1416 | "[12]" 1417 | ] 1418 | }, 1419 | "execution_count": 43, 1420 | "metadata": {}, 1421 | "output_type": "execute_result" 1422 | } 1423 | ], 1424 | "source": [ 1425 | "#Finding the aspect that is neutral\n", 1426 | "index_neutral=[]\n", 1427 | "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_neu.tolist()[0])):\n", 1428 | " if a ==1 and b==1:\n", 1429 | " index_neutral.append(i)\n", 1430 | "index_neutral " 1431 | ] 1432 | }, 1433 | { 1434 | "cell_type": "code", 1435 | "execution_count": 44, 1436 | "metadata": { 1437 | "collapsed": true 1438 | }, 1439 | "outputs": [], 1440 | "source": [ 1441 | "output=[]" 1442 | ] 1443 | }, 1444 | { 1445 | "cell_type": "code", 1446 | "execution_count": 45, 1447 | "metadata": { 1448 | "collapsed": false 1449 | }, 1450 | "outputs": [], 1451 | "source": [ 1452 | "if index_positive:\n", 1453 | " for index in index_positive:\n", 1454 | " output.append(sorted(most_common_aspect)[index]+\": positive\")" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "code", 1459 | "execution_count": 46, 1460 | "metadata": { 1461 | "collapsed": false 1462 | }, 1463 | "outputs": [], 1464 | "source": [ 1465 | "if index_negative:\n", 1466 | " for index in index_negative:\n", 1467 | " 
output.append(sorted(most_common_aspect)[index]+\": negative\")" 1468 | ] 1469 | }, 1470 | { 1471 | "cell_type": "code", 1472 | "execution_count": 47, 1473 | "metadata": { 1474 | "collapsed": false 1475 | }, 1476 | "outputs": [], 1477 | "source": [ 1478 | "if index_neutral:\n", 1479 | "    for index in index_neutral:\n", 1480 | "        output.append(sorted(most_common_aspect)[index]+\": neutral or conflict\")" 1481 | ] 1482 | }, 1483 | { 1484 | "cell_type": "code", 1485 | "execution_count": 48, 1486 | "metadata": { 1487 | "collapsed": false 1488 | }, 1489 | "outputs": [ 1490 | { 1491 | "data": { 1492 | "text/plain": [ 1493 | "['BATTERY_OPERATION_PERFORMANCE: positive',\n", 1494 | " 'DISPLAY_GENERAL: positive',\n", 1495 | " 'LAPTOP_GENERAL: positive',\n", 1496 | " 'LAPTOP_OPERATION_PERFORMANCE: positive',\n", 1497 | " 'LAPTOP_QUALITY: positive',\n", 1498 | " 'LAPTOP_PRICE: neutral or conflict']" 1499 | ] 1500 | }, 1501 | "execution_count": 48, 1502 | "metadata": {}, 1503 | "output_type": "execute_result" 1504 | } 1505 | ], 1506 | "source": [ 1507 | "#Prediction of Aspect Based Sentiment Analysis for the user's input\n", 1508 | "output" 1509 | ] 1510 | } 1511 | ], 1512 | "metadata": { 1513 | "kernelspec": { 1514 | "display_name": "Python 3", 1515 | "language": "python", 1516 | "name": "python3" 1517 | }, 1518 | "language_info": { 1519 | "codemirror_mode": { 1520 | "name": "ipython", 1521 | "version": 3 1522 | }, 1523 | "file_extension": ".py", 1524 | "mimetype": "text/x-python", 1525 | "name": "python", 1526 | "nbconvert_exporter": "python", 1527 | "pygments_lexer": "ipython3", 1528 | "version": "3.6.0" 1529 | } 1530 | }, 1531 | "nbformat": 4, 1532 | "nbformat_minor": 2 1533 | } 1534 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Aspect-Based-Sentiment-Analysis 2 | 3 | The Aspect Based Sentiment Analysis task focuses on the recognition of aspect categories and the classification of sentiment (positive, negative, neutral) in text. 4 | The aim of Aspect Based Sentiment Analysis is to determine the sentiment polarity expressed towards a certain aspect. 5 | This system is based on supervised learning using Support Vector Machines (SVM). 6 | A multi-label SVC classifier is used to classify the laptop reviews into 20 aspect categories and also to classify each review as positive, negative or neutral. For each sentiment, a separate multi-label SVC classifier is used. The performance of the system is also compared with that of other classifiers. 7 | 8 | # Dataset 9 | [The dataset is available on the SemEval website.](http://alt.qcri.org/semeval2016/task5/index.php?id=data-and-tools) 10 | 11 | # Required libraries 12 | * scikit-learn 13 | * NLTK 14 | * pandas 15 | * numpy 16 | 17 | P.S. This is a mini-project for a 2-month crash course (NLP) in the final year at the Department of Computer Science and Engineering, Kathmandu University. 
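The last cells of the notebook merge the stage-1 aspect predictions with the three per-sentiment predictions into the final per-aspect labels. A condensed sketch of that combination logic (the function name is ours; the notebook inlines these loops, passing one flattened prediction row per classifier):

```python
def combine_predictions(aspect_pred, pos_pred, neg_pred, neu_pred, aspect_names):
    """Mirrors the notebook's index_positive/index_negative/index_neutral cells."""
    output = []
    for i, name in enumerate(sorted(aspect_names)):
        if aspect_pred[i] != 1:
            continue                                   # aspect not detected at all
        if pos_pred[i] == 1:
            output.append(name + ": positive")
        if neg_pred[i] == 1:
            output.append(name + ": negative")
        if neu_pred[i] == 1:
            output.append(name + ": neutral or conflict")
    return output
```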
18 | -------------------------------------------------------------------------------- /tagged_text_list_test.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thestrox/Aspect-Based-Sentiment-Analysis/965a97fbfd933d0b7e94885da6d6cfaa5a0e714d/tagged_text_list_test.pkl -------------------------------------------------------------------------------- /tagged_text_list_train.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thestrox/Aspect-Based-Sentiment-Analysis/965a97fbfd933d0b7e94885da6d6cfaa5a0e714d/tagged_text_list_train.pkl --------------------------------------------------------------------------------
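The two `.pkl` files above are joblib caches of the Stanford-POS-tagged train and test reviews: tagging is by far the slowest step, so the notebook dumps the tagged lists once and reloads them on later runs. A small sketch of that caching pattern (the helper name and path argument are illustrative; note that recent scikit-learn releases removed `sklearn.externals.joblib`, so the standalone `joblib` package is the safer import today):

```python
import os
import joblib                      # standalone package; sklearn.externals.joblib is deprecated/removed
from nltk import word_tokenize

def cached_pos_tag(texts, cache_path, tagger):
    """POS-tag a list of texts, caching the result on disk with joblib."""
    if os.path.exists(cache_path):
        return joblib.load(cache_path)                 # fast path: reuse an earlier run
    tagged = [tagger.tag(word_tokenize(t)) for t in texts]
    joblib.dump(tagged, cache_path)                    # slow path: tag once, then cache
    return tagged
```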