├── .ipynb_checkpoints
    └── Aspect Based Sentiment Analysis-checkpoint.ipynb
├── Aspect Based Sentiment Analysis.ipynb
├── README.md
├── tagged_text_list_test.pkl
└── tagged_text_list_train.pkl


/.ipynb_checkpoints/Aspect Based Sentiment Analysis-checkpoint.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 1,
   6 |    "metadata": {
   7 |     "collapsed": true
   8 |    },
   9 |    "outputs": [],
  10 |    "source": [
  11 |     "import pandas as pd\n",
  12 |     "from sklearn.externals import joblib\n",
  13 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
  14 |     "from sklearn.feature_extraction import DictVectorizer\n",
  15 |     "from sklearn.linear_model import SGDClassifier\n",
  16 |     "from sklearn.multiclass import OneVsRestClassifier\n",
  17 |     "from sklearn.naive_bayes import MultinomialNB\n",
  18 |     "from sklearn import svm\n",
  19 |     "import xml.etree.ElementTree as ET\n",
  20 |     "from lxml import etree\n",
  21 |     "from scipy.sparse import hstack\n",
  22 |     "import numpy as np\n",
  23 |     "import warnings\n",
  24 |     "\n",
  25 |     "\n",
  26 |     "path_train = r'E:\\Engineering\\8th sem\\nlp COMP 473\\NLP projects\\ABSA16_Laptops_Train_English_SB2.xml'\n",
  27 |     "path_test = r'E:\\Engineering\\8th sem\\nlp COMP 473\\NLP projects\\EN_LAPT_SB2_TEST.xml'\n",
  28 |     "\n",
  29 |     "#For stanford POS Tagger\n",
  30 |     "home = r'C:\\Users\\THe_strOX\\Anaconda3\\stanford-postagger-full-2017-06-09'\n",
  31 |     "from nltk.tag.stanford import StanfordPOSTagger as POS_Tag\n",
  32 |     "from nltk import word_tokenize\n",
  33 |     "_path_to_model = home + '/models/english-bidirectional-distsim.tagger' \n",
  34 |     "_path_to_jar = home + '/stanford-postagger.jar'\n",
  35 |     "stanford_tag = POS_Tag(model_filename=_path_to_model, path_to_jar=_path_to_jar)"
  36 |    ]
  37 |   },
  38 |   {
  39 |    "cell_type": "code",
  40 |    "execution_count": 2,
  41 |    "metadata": {
  42 |     "collapsed": true
  43 |    },
  44 |    "outputs": [],
  45 |    "source": [
  46 |     "#xml parser\n",
  47 |     "def get_list(path):\n",
  48 |     "    tree=ET.parse(path)\n",
  49 |     "    root = tree.getroot()\n",
  50 |     "    text_list = []\n",
  51 |     "    opinion_list = []\n",
  52 |     "    for review in root.findall('Review'):\n",
  53 |     "        text_string=\"\"\n",
  54 |     "        opinion_inner_list=[]\n",
  55 |     "        for sent in review.findall('./sentences/sentence'):\n",
  56 |     "            text_string= text_string+ \" \"+ sent.find('text').text\n",
  57 |     "        text_list.append(text_string)\n",
  58 |     "        for opinion in review.findall('./Opinions/Opinion'):\n",
  59 |     "            opinion_dict = {\n",
  60 |     "                opinion.get('category').replace('#','_'): opinion.get('polarity')\n",
  61 |     "            }\n",
  62 |     "            opinion_inner_list.append(opinion_dict)\n",
  63 |     "        opinion_list.append(opinion_inner_list)\n",
  64 |     "    return text_list,opinion_list"
  65 |    ]
  66 |   },
  67 |   {
  68 |    "cell_type": "code",
  69 |    "execution_count": 3,
  70 |    "metadata": {
  71 |     "collapsed": true
  72 |    },
  73 |    "outputs": [],
  74 |    "source": [
  75 |     "#Selecting only 20 most common aspect.\n",
  76 |     "def get_most_common_aspect(opinion_list):\n",
  77 |     "    import nltk\n",
  78 |     "    opinion= []\n",
  79 |     "    for inner_list in opinion_list:\n",
  80 |     "        for _dict in inner_list:\n",
  81 |     "            for key in _dict:\n",
  82 |     "                opinion.append(key)\n",
  83 |     "    most_common_aspect = [k for k,v in nltk.FreqDist(opinion).most_common(20)]\n",
  84 |     "    return most_common_aspect"
  85 |    ]
  86 |   },
  87 |   {
  88 |    "cell_type": "code",
  89 |    "execution_count": 4,
  90 |    "metadata": {
  91 |     "collapsed": true
  92 |    },
  93 |    "outputs": [],
  94 |    "source": [
  95 |     "#generate data frame\n",
  96 |     "def get_data_frame(text_list,opinion_list,most_common_aspect):\n",
  97 |     "    data={'Review':text_list}\n",
  98 |     "    df = pd.DataFrame(data)\n",
  99 |     "    if opinion_list:\n",
 100 |     "        for inner_list in opinion_list:\n",
 101 |     "            for _dict in inner_list:\n",
 102 |     "                for key in _dict:\n",
 103 |     "                    if key in most_common_aspect:\n",
 104 |     "                        df.loc[opinion_list.index(inner_list),key]=_dict[key]\n",
 105 |     "    return df"
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "code",
 110 |    "execution_count": 5,
 111 |    "metadata": {
 112 |     "collapsed": true
 113 |    },
 114 |    "outputs": [],
 115 |    "source": [
 116 |     "#generate data frame for aspect extraction task\n",
 117 |     "def get_aspect_data_frame(df,most_common_aspect):\n",
 118 |     "    for common_aspect in most_common_aspect:\n",
 119 |     "        df[common_aspect]=df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])\n",
 120 |     "    df = df.fillna(0)\n",
 121 |     "    return df"
 122 |    ]
 123 |   },
 124 |   {
 125 |    "cell_type": "code",
 126 |    "execution_count": 6,
 127 |    "metadata": {
 128 |     "collapsed": true
 129 |    },
 130 |    "outputs": [],
 131 |    "source": [
 132 |     "def get_positive_data_frame(df,most_common_aspect):\n",
 133 |     "    for common_aspect in most_common_aspect:\n",
 134 |     "        df[common_aspect]=df[common_aspect].replace(['positive'],[1])\n",
 135 |     "        df[common_aspect]=df[common_aspect].replace(['negative','neutral','conflict'],[0,0,0])\n",
 136 |     "    df = df.fillna(0)\n",
 137 |     "    return df"
 138 |    ]
 139 |   },
 140 |   {
 141 |    "cell_type": "code",
 142 |    "execution_count": 7,
 143 |    "metadata": {
 144 |     "collapsed": true
 145 |    },
 146 |    "outputs": [],
 147 |    "source": [
 148 |     "def get_negative_data_frame(df,most_common_aspect):\n",
 149 |     "    for common_aspect in most_common_aspect:\n",
 150 |     "        df[common_aspect]=df[common_aspect].replace(['negative'],[1])\n",
 151 |     "        df[common_aspect]=df[common_aspect].replace(['positive','neutral','conflict'],[0,0,0])\n",
 152 |     "    df = df.fillna(0)\n",
 153 |     "    return df"
 154 |    ]
 155 |   },
 156 |   {
 157 |    "cell_type": "code",
 158 |    "execution_count": 8,
 159 |    "metadata": {
 160 |     "collapsed": true
 161 |    },
 162 |    "outputs": [],
 163 |    "source": [
 164 |     "def get_neutral_data_frame(df,most_common_aspect):\n",
 165 |     "    for common_aspect in most_common_aspect:\n",
 166 |     "        df[common_aspect]=df[common_aspect].replace(['neutral','conflict'],[1,1])\n",
 167 |     "        df[common_aspect]=df[common_aspect].replace(['negative','positive'],[0,0])\n",
 168 |     "    df = df.fillna(0)\n",
 169 |     "    return df"
 170 |    ]
 171 |   },
 172 |   {
 173 |    "cell_type": "code",
 174 |    "execution_count": 9,
 175 |    "metadata": {
 176 |     "collapsed": false
 177 |    },
 178 |    "outputs": [],
 179 |    "source": [
 180 |     "#To tag using stanford pos tagger\n",
 181 |     "def posTag(review):\n",
 182 |     "    tagged_text_list=[]\n",
 183 |     "    for text in review:\n",
 184 |     "        tagged_text_list.append(stanford_tag.tag(word_tokenize(text)))\n",
 185 |     "    return tagged_text_list\n",
 186 |     "#posTag(\"this is random text\")"
 187 |    ]
 188 |   },
 189 |   {
 190 |    "cell_type": "code",
 191 |    "execution_count": 10,
 192 |    "metadata": {
 193 |     "collapsed": true
 194 |    },
 195 |    "outputs": [],
 196 |    "source": [
 197 |     "#Filter the word with tag- noun,adjective,verb,adverb\n",
 198 |     "def filterTag(tagged_review):\n",
 199 |     "    final_text_list=[]\n",
 200 |     "    for text_list in tagged_review:\n",
 201 |     "        final_text=[]\n",
 202 |     "        for word,tag in text_list:\n",
 203 |     "            if tag in ['NN','NNS','NNP','NNPS','RB','RBR','RBS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']:\n",
 204 |     "                final_text.append(word)\n",
 205 |     "        final_text_list.append(' '.join(final_text))\n",
 206 |     "    return final_text_list"
 207 |    ]
 208 |   },
 209 |   {
 210 |    "cell_type": "code",
 211 |    "execution_count": 11,
 212 |    "metadata": {
 213 |     "collapsed": false
 214 |    },
 215 |    "outputs": [],
 216 |    "source": [
 217 |     "def get_dict_aspect(y,most_common_aspect):\n",
 218 |     "    position=[]\n",
 219 |     "    for innerlist in y:\n",
 220 |     "        position.append([i for i, j in enumerate(innerlist) if j == 1])\n",
 221 |     "    sorted_common=sorted(most_common_aspect)\n",
 222 |     "    dict_aspect=[]\n",
 223 |     "    for innerlist in position:\n",
 224 |     "        inner_dict={}\n",
 225 |     "        for word in sorted_common:\n",
 226 |     "            if sorted_common.index(word) in innerlist:\n",
 227 |     "                inner_dict[word]= 5\n",
 228 |     "            else:\n",
 229 |     "                inner_dict[word]=0\n",
 230 |     "        dict_aspect.append(inner_dict)\n",
 231 |     "    return dict_aspect"
 232 |    ]
 233 |   },
 234 |   {
 235 |    "cell_type": "code",
 236 |    "execution_count": 12,
 237 |    "metadata": {
 238 |     "collapsed": false
 239 |    },
 240 |    "outputs": [],
 241 |    "source": [
 242 |     "#Stage 1:\n",
 243 |     "#Making list to train\n",
 244 |     "train_text_list,train_opinion_list = get_list(path_train)\n",
 245 |     "most_common_aspect = get_most_common_aspect(train_opinion_list)"
 246 |    ]
 247 |   },
 248 |   {
 249 |    "cell_type": "code",
 250 |    "execution_count": 13,
 251 |    "metadata": {
 252 |     "collapsed": false
 253 |    },
 254 |    "outputs": [],
 255 |    "source": [
 256 |     "#This takes time to tag. Already tagged and saved. So, loading file ...\n",
 257 |     "#tagged_text_list_train=posTag(train_text_list)\n",
 258 |     "#joblib.dump(tagged_text_list_train, 'tagged_text_list_train.pkl')\n",
 259 |     "tagged_text_list_train=joblib.load('tagged_text_list_train.pkl')"
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "code",
 264 |    "execution_count": 14,
 265 |    "metadata": {
 266 |     "collapsed": true
 267 |    },
 268 |    "outputs": [],
 269 |    "source": [
 270 |     "#train list after filter\n",
 271 |     "final_train_text_list=filterTag(tagged_text_list_train)"
 272 |    ]
 273 |   },
 274 |   {
 275 |    "cell_type": "code",
 276 |    "execution_count": 15,
 277 |    "metadata": {
 278 |     "collapsed": false
 279 |    },
 280 |    "outputs": [],
 281 |    "source": [
 282 |     "#get data frame\n",
 283 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
 284 |     "df_train_aspect = get_aspect_data_frame(df_train,most_common_aspect)\n",
 285 |     "df_train_aspect = df_train_aspect.reindex_axis(sorted(df_train_aspect.columns), axis=1)"
 286 |    ]
 287 |   },
 288 |   {
 289 |    "cell_type": "code",
 290 |    "execution_count": 16,
 291 |    "metadata": {
 292 |     "collapsed": false
 293 |    },
 294 |    "outputs": [],
 295 |    "source": [
 296 |     "#Similar for test list\n",
 297 |     "test_text_list,test_opinion_list = get_list(path_test)"
 298 |    ]
 299 |   },
 300 |   {
 301 |    "cell_type": "code",
 302 |    "execution_count": 17,
 303 |    "metadata": {
 304 |     "collapsed": false
 305 |    },
 306 |    "outputs": [],
 307 |    "source": [
 308 |     "#tagged_text_list_test=posTag(test_text_list)\n",
 309 |     "#joblib.dump(tagged_text_list_test, 'tagged_text_list_test.pkl')\n",
 310 |     "tagged_text_list_test=joblib.load('tagged_text_list_test.pkl')"
 311 |    ]
 312 |   },
 313 |   {
 314 |    "cell_type": "code",
 315 |    "execution_count": 18,
 316 |    "metadata": {
 317 |     "collapsed": true
 318 |    },
 319 |    "outputs": [],
 320 |    "source": [
 321 |     "final_test_text_list=filterTag(tagged_text_list_test)"
 322 |    ]
 323 |   },
 324 |   {
 325 |    "cell_type": "code",
 326 |    "execution_count": 19,
 327 |    "metadata": {
 328 |     "collapsed": false
 329 |    },
 330 |    "outputs": [],
 331 |    "source": [
 332 |     "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n",
 333 |     "df_test_aspect = get_aspect_data_frame(df_test,most_common_aspect)\n",
 334 |     "df_test_aspect = df_test_aspect.reindex_axis(sorted(df_test_aspect.columns), axis=1)"
 335 |    ]
 336 |   },
 337 |   {
 338 |    "cell_type": "code",
 339 |    "execution_count": 20,
 340 |    "metadata": {
 341 |     "collapsed": false
 342 |    },
 343 |    "outputs": [],
 344 |    "source": [
 345 |     "#Sort the data frame according to aspect's name and separate data(X) and target(y)\n",
 346 |     "#df_train_aspect = df_train_aspect.sample(frac=1).reset_index(drop=True) #For randoming\n",
 347 |     "X_train= df_train_aspect.Review\n",
 348 |     "y_train = df_train_aspect.drop('Review',1)\n",
 349 |     "\n",
 350 |     "#df_test_aspect = df_test_aspect.sample(frac=1).reset_index(drop=True) #For randoming\n",
 351 |     "X_test = df_test_aspect.Review\n",
 352 |     "y_test = df_test_aspect.drop('Review',1)"
 353 |    ]
 354 |   },
 355 |   {
 356 |    "cell_type": "code",
 357 |    "execution_count": 21,
 358 |    "metadata": {
 359 |     "collapsed": false
 360 |    },
 361 |    "outputs": [],
 362 |    "source": [
 363 |     "#Change y_train to numpy array\n",
 364 |     "import numpy as np\n",
 365 |     "y_train = np.asarray(y_train, dtype=np.int64)\n",
 366 |     "y_test = np.asarray(y_test, dtype=np.int64)"
 367 |    ]
 368 |   },
 369 |   {
 370 |    "cell_type": "code",
 371 |    "execution_count": 22,
 372 |    "metadata": {
 373 |     "collapsed": false
 374 |    },
 375 |    "outputs": [],
 376 |    "source": [
 377 |     "#Generate word vecotors using CountVectorizer\n",
 378 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
 379 |     "from nltk import word_tokenize          \n",
 380 |     "from nltk.stem import WordNetLemmatizer \n",
 381 |     "vect = CountVectorizer(max_df=1.0,stop_words='english')  \n",
 382 |     "X_train_dtm = vect.fit_transform(X_train)\n",
 383 |     "X_test_dtm = vect.transform(X_test)"
 384 |    ]
 385 |   },
 386 |   {
 387 |    "cell_type": "code",
 388 |    "execution_count": 23,
 389 |    "metadata": {
 390 |     "collapsed": false,
 391 |     "scrolled": true
 392 |    },
 393 |    "outputs": [],
 394 |    "source": [
 395 |     "#Create various models. These are multi-label models.\n",
 396 |     "nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)\n",
 397 |     "C = 1.0 #SVregularization parameter\n",
 398 |     "svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)\n",
 399 |     "lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)\n",
 400 |     "sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train_dtm,y_train)"
 401 |    ]
 402 |   },
 403 |   {
 404 |    "cell_type": "code",
 405 |    "execution_count": 24,
 406 |    "metadata": {
 407 |     "collapsed": false
 408 |    },
 409 |    "outputs": [],
 410 |    "source": [
 411 |     "#Predict the test data using classifiers\n",
 412 |     "y_pred_class = nb_classif.predict(X_test_dtm)\n",
 413 |     "y_pred_class_svc = svc.predict(X_test_dtm)\n",
 414 |     "y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)\n",
 415 |     "y_pred_class_sgd = sgd.predict(X_test_dtm)"
 416 |    ]
 417 |   },
 418 |   {
 419 |    "cell_type": "code",
 420 |    "execution_count": 25,
 421 |    "metadata": {
 422 |     "collapsed": true
 423 |    },
 424 |    "outputs": [],
 425 |    "source": [
 426 |     "#Following code to test metrics of all aspect extraction classifiers\n",
 427 |     "from sklearn import metrics"
 428 |    ]
 429 |   },
 430 |   {
 431 |    "cell_type": "code",
 432 |    "execution_count": 26,
 433 |    "metadata": {
 434 |     "collapsed": false
 435 |    },
 436 |    "outputs": [
 437 |     {
 438 |      "name": "stdout",
 439 |      "output_type": "stream",
 440 |      "text": [
 441 |       "0.025\n",
 442 |       "0.05\n",
 443 |       "0.05\n",
 444 |       "0.0375\n"
 445 |      ]
 446 |     }
 447 |    ],
 448 |    "source": [
 449 |     "print(metrics.accuracy_score(y_test,y_pred_class))\n",
 450 |     "print(metrics.accuracy_score(y_test,y_pred_class_svc))\n",
 451 |     "print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))\n",
 452 |     "print(metrics.accuracy_score(y_test,y_pred_class_sgd))"
 453 |    ]
 454 |   },
 455 |   {
 456 |    "cell_type": "code",
 457 |    "execution_count": 27,
 458 |    "metadata": {
 459 |     "collapsed": false
 460 |    },
 461 |    "outputs": [
 462 |     {
 463 |      "name": "stdout",
 464 |      "output_type": "stream",
 465 |      "text": [
 466 |       "0.75\n",
 467 |       "0.711229946524\n",
 468 |       "0.732193732194\n",
 469 |       "0.700657894737\n"
 470 |      ]
 471 |     }
 472 |    ],
 473 |    "source": [
 474 |     "print(metrics.precision_score(y_test,y_pred_class,average='micro'))\n",
 475 |     "print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))\n",
 476 |     "print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 477 |     "print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))"
 478 |    ]
 479 |   },
 480 |   {
 481 |    "cell_type": "code",
 482 |    "execution_count": 28,
 483 |    "metadata": {
 484 |     "collapsed": false
 485 |    },
 486 |    "outputs": [
 487 |     {
 488 |      "name": "stdout",
 489 |      "output_type": "stream",
 490 |      "text": [
 491 |       "0.457627118644\n",
 492 |       "0.64406779661\n",
 493 |       "0.622276029056\n",
 494 |       "0.515738498789\n"
 495 |      ]
 496 |     }
 497 |    ],
 498 |    "source": [
 499 |     "print(metrics.recall_score(y_test,y_pred_class,average='micro'))\n",
 500 |     "print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))\n",
 501 |     "print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 502 |     "print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))"
 503 |    ]
 504 |   },
 505 |   {
 506 |    "cell_type": "code",
 507 |    "execution_count": 29,
 508 |    "metadata": {
 509 |     "collapsed": false
 510 |    },
 511 |    "outputs": [
 512 |     {
 513 |      "name": "stdout",
 514 |      "output_type": "stream",
 515 |      "text": [
 516 |       "0.568421052632\n",
 517 |       "0.675984752224\n",
 518 |       "0.67277486911\n",
 519 |       "0.594142259414\n"
 520 |      ]
 521 |     }
 522 |    ],
 523 |    "source": [
 524 |     "print(metrics.f1_score(y_test,y_pred_class,average='micro'))\n",
 525 |     "print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))\n",
 526 |     "print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 527 |     "print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))"
 528 |    ]
 529 |   },
 530 |   {
 531 |    "cell_type": "code",
 532 |    "execution_count": 30,
 533 |    "metadata": {
 534 |     "collapsed": false,
 535 |     "scrolled": true
 536 |    },
 537 |    "outputs": [
 538 |     {
 539 |      "name": "stdout",
 540 |      "output_type": "stream",
 541 |      "text": [
 542 |       "             precision    recall  f1-score   support\n",
 543 |       "\n",
 544 |       "          0       0.67      0.14      0.24        14\n",
 545 |       "          1       0.71      0.50      0.59        24\n",
 546 |       "          2       0.00      0.00      0.00        12\n",
 547 |       "          3       0.00      0.00      0.00         4\n",
 548 |       "          4       0.00      0.00      0.00        21\n",
 549 |       "          5       0.00      0.00      0.00         8\n",
 550 |       "          6       0.00      0.00      0.00         7\n",
 551 |       "          7       0.76      0.64      0.69        39\n",
 552 |       "          8       1.00      1.00      1.00        80\n",
 553 |       "          9       0.44      0.17      0.24        24\n",
 554 |       "         10       0.62      0.70      0.65        46\n",
 555 |       "         11       0.00      0.00      0.00         5\n",
 556 |       "         12       0.57      0.30      0.39        27\n",
 557 |       "         13       0.57      0.45      0.50        29\n",
 558 |       "         14       0.77      0.33      0.47        30\n",
 559 |       "         15       0.00      0.00      0.00         4\n",
 560 |       "         16       0.00      0.00      0.00         9\n",
 561 |       "         17       0.00      0.00      0.00        15\n",
 562 |       "         18       0.00      0.00      0.00         4\n",
 563 |       "         19       0.60      0.27      0.37        11\n",
 564 |       "\n",
 565 |       "avg / total       0.57      0.46      0.49       413\n",
 566 |       "\n",
 567 |       "             precision    recall  f1-score   support\n",
 568 |       "\n",
 569 |       "          0       0.78      1.00      0.88        14\n",
 570 |       "          1       0.68      0.71      0.69        24\n",
 571 |       "          2       0.86      0.50      0.63        12\n",
 572 |       "          3       0.12      0.25      0.17         4\n",
 573 |       "          4       0.56      0.43      0.49        21\n",
 574 |       "          5       0.75      0.38      0.50         8\n",
 575 |       "          6       0.20      0.14      0.17         7\n",
 576 |       "          7       0.74      0.67      0.70        39\n",
 577 |       "          8       1.00      0.97      0.99        80\n",
 578 |       "          9       0.63      0.50      0.56        24\n",
 579 |       "         10       0.68      0.74      0.71        46\n",
 580 |       "         11       0.33      0.40      0.36         5\n",
 581 |       "         12       0.83      0.74      0.78        27\n",
 582 |       "         13       0.56      0.66      0.60        29\n",
 583 |       "         14       0.60      0.40      0.48        30\n",
 584 |       "         15       0.67      0.50      0.57         4\n",
 585 |       "         16       0.18      0.22      0.20         9\n",
 586 |       "         17       0.80      0.27      0.40        15\n",
 587 |       "         18       1.00      0.25      0.40         4\n",
 588 |       "         19       0.60      0.27      0.37        11\n",
 589 |       "\n",
 590 |       "avg / total       0.72      0.64      0.67       413\n",
 591 |       "\n",
 592 |       "             precision    recall  f1-score   support\n",
 593 |       "\n",
 594 |       "          0       0.78      1.00      0.88        14\n",
 595 |       "          1       0.65      0.71      0.68        24\n",
 596 |       "          2       1.00      0.42      0.59        12\n",
 597 |       "          3       0.17      0.25      0.20         4\n",
 598 |       "          4       0.64      0.43      0.51        21\n",
 599 |       "          5       1.00      0.25      0.40         8\n",
 600 |       "          6       0.33      0.14      0.20         7\n",
 601 |       "          7       0.74      0.67      0.70        39\n",
 602 |       "          8       1.00      0.96      0.98        80\n",
 603 |       "          9       0.61      0.46      0.52        24\n",
 604 |       "         10       0.70      0.72      0.71        46\n",
 605 |       "         11       0.25      0.20      0.22         5\n",
 606 |       "         12       0.83      0.74      0.78        27\n",
 607 |       "         13       0.54      0.66      0.59        29\n",
 608 |       "         14       0.61      0.37      0.46        30\n",
 609 |       "         15       1.00      0.25      0.40         4\n",
 610 |       "         16       0.20      0.22      0.21         9\n",
 611 |       "         17       1.00      0.20      0.33        15\n",
 612 |       "         18       1.00      0.25      0.40         4\n",
 613 |       "         19       0.75      0.27      0.40        11\n",
 614 |       "\n",
 615 |       "avg / total       0.75      0.62      0.66       413\n",
 616 |       "\n",
 617 |       "             precision    recall  f1-score   support\n",
 618 |       "\n",
 619 |       "          0       0.75      0.64      0.69        14\n",
 620 |       "          1       0.60      0.75      0.67        24\n",
 621 |       "          2       0.00      0.00      0.00        12\n",
 622 |       "          3       0.00      0.00      0.00         4\n",
 623 |       "          4       0.50      0.19      0.28        21\n",
 624 |       "          5       0.00      0.00      0.00         8\n",
 625 |       "          6       0.00      0.00      0.00         7\n",
 626 |       "          7       0.63      0.62      0.62        39\n",
 627 |       "          8       1.00      0.99      0.99        80\n",
 628 |       "          9       0.67      0.33      0.44        24\n",
 629 |       "         10       0.65      0.61      0.63        46\n",
 630 |       "         11       0.29      0.40      0.33         5\n",
 631 |       "         12       0.71      0.37      0.49        27\n",
 632 |       "         13       0.58      0.48      0.53        29\n",
 633 |       "         14       0.71      0.33      0.45        30\n",
 634 |       "         15       0.00      0.00      0.00         4\n",
 635 |       "         16       0.20      0.11      0.14         9\n",
 636 |       "         17       0.67      0.13      0.22        15\n",
 637 |       "         18       0.33      0.25      0.29         4\n",
 638 |       "         19       0.43      0.27      0.33        11\n",
 639 |       "\n",
 640 |       "avg / total       0.64      0.52      0.55       413\n",
 641 |       "\n"
 642 |      ]
 643 |     }
 644 |    ],
 645 |    "source": [
 646 |     "with warnings.catch_warnings():\n",
 647 |     "    warnings.simplefilter(\"ignore\")\n",
 648 |     "    print(metrics.classification_report(y_test, y_pred_class))\n",
 649 |     "    print(metrics.classification_report(y_test, y_pred_class_svc))\n",
 650 |     "    print(metrics.classification_report(y_test, y_pred_class_lin_svc))\n",
 651 |     "    print(metrics.classification_report(y_test, y_pred_class_sgd))"
 652 |    ]
 653 |   },
 654 |   {
 655 |    "cell_type": "code",
 656 |    "execution_count": 31,
 657 |    "metadata": {
 658 |     "collapsed": false
 659 |    },
 660 |    "outputs": [],
 661 |    "source": [
 662 |     "#Stage 2:\n",
 663 |     "#Generating extra feature that indicates which aspect category is present in the review\n",
 664 |     "train_dict_aspect=get_dict_aspect(y_train, most_common_aspect)\n",
 665 |     "d_train=DictVectorizer() \n",
 666 |     "X_train_aspect_dtm = d_train.fit_transform(train_dict_aspect)\n",
 667 |     "\n",
 668 |     "#y_test is used to generated extra feature in order to test the performance of 2nd classifer.\n",
 669 |     "#Use y_pred_class_svc(Highest performer for aspect classification) as input for extra feature to test the overall performace.\n",
 670 |     "test_dict_aspect=get_dict_aspect(y_test,most_common_aspect)\n",
 671 |     "d_test=DictVectorizer() \n",
 672 |     "X_test_aspect_dtm = d_test.fit_transform(test_dict_aspect)"
 673 |    ]
 674 |   },
 675 |   {
 676 |    "cell_type": "code",
 677 |    "execution_count": 32,
 678 |    "metadata": {
 679 |     "collapsed": false
 680 |    },
 681 |    "outputs": [],
 682 |    "source": [
 683 |     "#Function for classiflying positive,negative or neutral sentiment of all the aspects\n",
 684 |     "def classify_sentiment(df_train,df_test,X_train_aspect_dtm,X_test_aspect_dtm):\n",
 685 |     "    \n",
 686 |     "    df_train = df_train.reindex_axis(sorted(df_train_positive.columns), axis=1)\n",
 687 |     "    df_test = df_test.reindex_axis(sorted(df_test_positive.columns), axis=1)\n",
 688 |     "\n",
 689 |     "    import numpy as np\n",
 690 |     "    X_train = df_train.Review\n",
 691 |     "    y_train = df_train.drop('Review',1)\n",
 692 |     "    y_train = np.asarray(y_train, dtype=np.int64)\n",
 693 |     "\n",
 694 |     "    X_test = df_test.Review\n",
 695 |     "    y_test = df_test.drop('Review',1)\n",
 696 |     "    y_test = np.asarray(y_test, dtype=np.int64)\n",
 697 |     "\n",
 698 |     "    vect_sen = CountVectorizer(stop_words='english',ngram_range=(1,2))  \n",
 699 |     "    X_train_dtm = vect_sen.fit_transform(X_train)\n",
 700 |     "    X_test_dtm = vect_sen.transform(X_test)\n",
 701 |     "\n",
 702 |     "    #ombining word vector with extra feature.\n",
 703 |     "    from scipy.sparse import hstack\n",
 704 |     "    X_train_dtm=hstack((X_train_dtm, X_train_aspect_dtm))\n",
 705 |     "    X_test_dtm=hstack((X_test_dtm, X_test_aspect_dtm))\n",
 706 |     "\n",
 707 |     "    C = 1.0 #SVregularization parameter\n",
 708 |     "    nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)\n",
 709 |     "    svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)\n",
 710 |     "    lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)\n",
 711 |     "    sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train_dtm,y_train)\n",
 712 |     "\n",
 713 |     "    y_pred_class= nb_classif.predict(X_test_dtm)\n",
 714 |     "    y_pred_class_svc = svc.predict(X_test_dtm)\n",
 715 |     "    y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)\n",
 716 |     "    y_pred_class_sgd = sgd.predict(X_test_dtm)\n",
 717 |     "    return (y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd)"
 718 |    ]
 719 |   },
 720 |   {
 721 |    "cell_type": "code",
 722 |    "execution_count": 33,
 723 |    "metadata": {
 724 |     "collapsed": true
 725 |    },
 726 |    "outputs": [],
 727 |    "source": [
 728 |     "def print_metrices(y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd):\n",
 729 |     "    print(\"Accuracy:\")\n",
 730 |     "    print(metrics.accuracy_score(y_test,y_pred_class))\n",
 731 |     "    print(metrics.accuracy_score(y_test,y_pred_class_svc))\n",
 732 |     "    print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))\n",
 733 |     "    print(metrics.accuracy_score(y_test,y_pred_class_sgd))\n",
 734 |     "\n",
 735 |     "    print(\"\\nAverage precision:\")\n",
 736 |     "    print(metrics.precision_score(y_test,y_pred_class,average='micro'))\n",
 737 |     "    print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))\n",
 738 |     "    print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 739 |     "    print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))\n",
 740 |     "\n",
 741 |     "    print(\"\\nAverage recall:\")\n",
 742 |     "    print(metrics.recall_score(y_test,y_pred_class,average='micro'))\n",
 743 |     "    print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))\n",
 744 |     "    print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 745 |     "    print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))\n",
 746 |     "    \n",
 747 |     "    print(\"\\nAverage f1:\")\n",
 748 |     "    print(metrics.f1_score(y_test,y_pred_class,average='micro'))\n",
 749 |     "    print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))\n",
 750 |     "    print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 751 |     "    print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))\n",
 752 |     "\n",
 753 |     "    print(\"\\nClassification report:\")\n",
 754 |     "    print(metrics.classification_report(y_test, y_pred_class))\n",
 755 |     "    print(metrics.classification_report(y_test, y_pred_class_svc))\n",
 756 |     "    print(metrics.classification_report(y_test, y_pred_class_lin_svc))\n",
 757 |     "    print(metrics.classification_report(y_test, y_pred_class_sgd))"
 758 |    ]
 759 |   },
 760 |   {
 761 |    "cell_type": "code",
 762 |    "execution_count": 34,
 763 |    "metadata": {
 764 |     "collapsed": false
 765 |    },
 766 |    "outputs": [
 767 |     {
 768 |      "name": "stdout",
 769 |      "output_type": "stream",
 770 |      "text": [
 771 |       "Accuracy:\n",
 772 |       "0.15\n",
 773 |       "0.35\n",
 774 |       "0.3125\n",
 775 |       "0.125\n",
 776 |       "\n",
 777 |       "Average precision:\n",
 778 |       "0.857142857143\n",
 779 |       "0.745762711864\n",
 780 |       "0.756097560976\n",
 781 |       "0.704545454545\n",
 782 |       "\n",
 783 |       "Average recall:\n",
 784 |       "0.260869565217\n",
 785 |       "0.797101449275\n",
 786 |       "0.673913043478\n",
 787 |       "0.673913043478\n",
 788 |       "\n",
 789 |       "Average f1:\n",
 790 |       "0.4\n",
 791 |       "0.77057793345\n",
 792 |       "0.712643678161\n",
 793 |       "0.688888888889\n",
 794 |       "\n",
 795 |       "Classification report:\n",
 796 |       "             precision    recall  f1-score   support\n",
 797 |       "\n",
 798 |       "          0       0.00      0.00      0.00        12\n",
 799 |       "          1       0.00      0.00      0.00        14\n",
 800 |       "          2       0.00      0.00      0.00         6\n",
 801 |       "          3       0.00      0.00      0.00         3\n",
 802 |       "          4       0.00      0.00      0.00        15\n",
 803 |       "          5       0.00      0.00      0.00         7\n",
 804 |       "          6       0.00      0.00      0.00         4\n",
 805 |       "          7       0.00      0.00      0.00        26\n",
 806 |       "          8       0.83      0.95      0.89        61\n",
 807 |       "          9       0.00      0.00      0.00        11\n",
 808 |       "         10       1.00      0.39      0.56        36\n",
 809 |       "         11       0.00      0.00      0.00         4\n",
 810 |       "         12       0.00      0.00      0.00        12\n",
 811 |       "         13       0.00      0.00      0.00        18\n",
 812 |       "         14       0.00      0.00      0.00        23\n",
 813 |       "         15       0.00      0.00      0.00         3\n",
 814 |       "         16       0.00      0.00      0.00         7\n",
 815 |       "         17       0.00      0.00      0.00         8\n",
 816 |       "         18       0.00      0.00      0.00         2\n",
 817 |       "         19       0.00      0.00      0.00         4\n",
 818 |       "\n",
 819 |       "avg / total       0.31      0.26      0.27       276\n",
 820 |       "\n",
 821 |       "             precision    recall  f1-score   support\n",
 822 |       "\n",
 823 |       "          0       0.91      0.83      0.87        12\n",
 824 |       "          1       0.67      0.71      0.69        14\n",
 825 |       "          2       0.50      1.00      0.67         6\n",
 826 |       "          3       0.75      1.00      0.86         3\n",
 827 |       "          4       0.65      0.73      0.69        15\n",
 828 |       "          5       0.75      0.43      0.55         7\n",
 829 |       "          6       0.67      0.50      0.57         4\n",
 830 |       "          7       0.71      0.77      0.74        26\n",
 831 |       "          8       0.89      0.93      0.91        61\n",
 832 |       "          9       0.47      0.82      0.60        11\n",
 833 |       "         10       0.89      0.89      0.89        36\n",
 834 |       "         11       0.75      0.75      0.75         4\n",
 835 |       "         12       0.60      0.75      0.67        12\n",
 836 |       "         13       0.72      0.72      0.72        18\n",
 837 |       "         14       0.75      0.91      0.82        23\n",
 838 |       "         15       0.67      0.67      0.67         3\n",
 839 |       "         16       0.25      0.14      0.18         7\n",
 840 |       "         17       0.80      0.50      0.62         8\n",
 841 |       "         18       0.67      1.00      0.80         2\n",
 842 |       "         19       1.00      0.50      0.67         4\n",
 843 |       "\n",
 844 |       "avg / total       0.76      0.80      0.77       276\n",
 845 |       "\n",
 846 |       "             precision    recall  f1-score   support\n",
 847 |       "\n",
 848 |       "          0       1.00      0.75      0.86        12\n",
 849 |       "          1       0.64      0.64      0.64        14\n",
 850 |       "          2       0.60      0.50      0.55         6\n",
 851 |       "          3       0.75      1.00      0.86         3\n",
 852 |       "          4       0.69      0.60      0.64        15\n",
 853 |       "          5       1.00      0.14      0.25         7\n",
 854 |       "          6       0.00      0.00      0.00         4\n",
 855 |       "          7       0.68      0.65      0.67        26\n",
 856 |       "          8       0.88      0.92      0.90        61\n",
 857 |       "          9       0.44      0.73      0.55        11\n",
 858 |       "         10       0.89      0.86      0.87        36\n",
 859 |       "         11       1.00      0.50      0.67         4\n",
 860 |       "         12       0.45      0.42      0.43        12\n",
 861 |       "         13       0.73      0.61      0.67        18\n",
 862 |       "         14       0.71      0.65      0.68        23\n",
 863 |       "         15       1.00      0.33      0.50         3\n",
 864 |       "         16       0.00      0.00      0.00         7\n",
 865 |       "         17       0.75      0.38      0.50         8\n",
 866 |       "         18       0.67      1.00      0.80         2\n",
 867 |       "         19       1.00      0.25      0.40         4\n",
 868 |       "\n",
 869 |       "avg / total       0.74      0.67      0.69       276\n",
 870 |       "\n",
 871 |       "             precision    recall  f1-score   support\n",
 872 |       "\n",
 873 |       "          0       1.00      0.58      0.74        12\n",
 874 |       "          1       0.64      1.00      0.78        14\n",
 875 |       "          2       0.50      0.83      0.62         6\n",
 876 |       "          3       0.75      1.00      0.86         3\n",
 877 |       "          4       0.72      0.87      0.79        15\n",
 878 |       "          5       0.50      0.14      0.22         7\n",
 879 |       "          6       0.75      0.75      0.75         4\n",
 880 |       "          7       0.53      0.31      0.39        26\n",
 881 |       "          8       0.88      0.85      0.87        61\n",
 882 |       "          9       0.38      0.55      0.44        11\n",
 883 |       "         10       0.78      0.97      0.86        36\n",
 884 |       "         11       0.80      1.00      0.89         4\n",
 885 |       "         12       0.67      0.17      0.27        12\n",
 886 |       "         13       0.73      0.44      0.55        18\n",
 887 |       "         14       0.73      0.83      0.78        23\n",
 888 |       "         15       0.00      0.00      0.00         3\n",
 889 |       "         16       0.00      0.00      0.00         7\n",
 890 |       "         17       0.33      0.12      0.18         8\n",
 891 |       "         18       0.67      1.00      0.80         2\n",
 892 |       "         19       0.30      0.75      0.43         4\n",
 893 |       "\n",
 894 |       "avg / total       0.69      0.67      0.65       276\n",
 895 |       "\n"
 896 |      ]
 897 |     }
 898 |    ],
 899 |    "source": [
 900 |     "#For positive sentiment classifier\n",
 901 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
 902 |     "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n",
 903 |     "\n",
 904 |     "df_train_positive = get_positive_data_frame(df_train,most_common_aspect)\n",
 905 |     "df_test_positive = get_positive_data_frame(df_test,most_common_aspect)\n",
 906 |     "y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos=classify_sentiment(df_train_positive,df_test_positive,X_train_aspect_dtm,X_test_aspect_dtm)\n",
 907 |     "with warnings.catch_warnings():\n",
 908 |     "    warnings.simplefilter(\"ignore\")\n",
 909 |     "    print_metrices(y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos)"
 910 |    ]
 911 |   },
 912 |   {
 913 |    "cell_type": "code",
 914 |    "execution_count": 35,
 915 |    "metadata": {
 916 |     "collapsed": false
 917 |    },
 918 |    "outputs": [
 919 |     {
 920 |      "name": "stdout",
 921 |      "output_type": "stream",
 922 |      "text": [
 923 |       "Accuracy:\n",
 924 |       "0.4875\n",
 925 |       "0.4875\n",
 926 |       "0.4625\n",
 927 |       "0.3375\n",
 928 |       "\n",
 929 |       "Average precision:\n",
 930 |       "0.7\n",
 931 |       "0.625\n",
 932 |       "0.666666666667\n",
 933 |       "0.449438202247\n",
 934 |       "\n",
 935 |       "Average recall:\n",
 936 |       "0.0642201834862\n",
 937 |       "0.412844036697\n",
 938 |       "0.330275229358\n",
 939 |       "0.366972477064\n",
 940 |       "\n",
 941 |       "Average f1:\n",
 942 |       "0.117647058824\n",
 943 |       "0.497237569061\n",
 944 |       "0.441717791411\n",
 945 |       "0.40404040404\n",
 946 |       "\n",
 947 |       "Classification report:\n",
 948 |       "             precision    recall  f1-score   support\n",
 949 |       "\n",
 950 |       "          0       0.00      0.00      0.00         2\n",
 951 |       "          1       0.00      0.00      0.00        10\n",
 952 |       "          2       0.00      0.00      0.00         2\n",
 953 |       "          3       0.00      0.00      0.00         1\n",
 954 |       "          4       0.00      0.00      0.00         4\n",
 955 |       "          5       0.00      0.00      0.00         1\n",
 956 |       "          6       0.00      0.00      0.00         3\n",
 957 |       "          7       0.00      0.00      0.00         3\n",
 958 |       "          8       0.67      0.33      0.44        18\n",
 959 |       "          9       0.00      0.00      0.00         9\n",
 960 |       "         10       0.00      0.00      0.00        10\n",
 961 |       "         11       0.00      0.00      0.00         1\n",
 962 |       "         12       0.00      0.00      0.00        12\n",
 963 |       "         13       1.00      0.09      0.17        11\n",
 964 |       "         14       0.00      0.00      0.00         6\n",
 965 |       "         15       0.00      0.00      0.00         0\n",
 966 |       "         16       0.00      0.00      0.00         2\n",
 967 |       "         17       0.00      0.00      0.00         5\n",
 968 |       "         18       0.00      0.00      0.00         2\n",
 969 |       "         19       0.00      0.00      0.00         7\n",
 970 |       "\n",
 971 |       "avg / total       0.21      0.06      0.09       109\n",
 972 |       "\n",
 973 |       "             precision    recall  f1-score   support\n",
 974 |       "\n",
 975 |       "          0       0.50      0.50      0.50         2\n",
 976 |       "          1       0.67      0.60      0.63        10\n",
 977 |       "          2       0.00      0.00      0.00         2\n",
 978 |       "          3       0.00      0.00      0.00         1\n",
 979 |       "          4       0.50      0.25      0.33         4\n",
 980 |       "          5       0.00      0.00      0.00         1\n",
 981 |       "          6       1.00      0.67      0.80         3\n",
 982 |       "          7       0.00      0.00      0.00         3\n",
 983 |       "          8       0.77      0.56      0.65        18\n",
 984 |       "          9       1.00      0.22      0.36         9\n",
 985 |       "         10       0.78      0.70      0.74        10\n",
 986 |       "         11       0.00      0.00      0.00         1\n",
 987 |       "         12       0.75      0.25      0.38        12\n",
 988 |       "         13       0.70      0.64      0.67        11\n",
 989 |       "         14       0.00      0.00      0.00         6\n",
 990 |       "         15       0.00      0.00      0.00         0\n",
 991 |       "         16       0.00      0.00      0.00         2\n",
 992 |       "         17       0.00      0.00      0.00         5\n",
 993 |       "         18       0.00      0.00      0.00         2\n",
 994 |       "         19       0.75      0.86      0.80         7\n",
 995 |       "\n",
 996 |       "avg / total       0.60      0.41      0.47       109\n",
 997 |       "\n",
 998 |       "             precision    recall  f1-score   support\n",
 999 |       "\n",
1000 |       "          0       1.00      0.50      0.67         2\n",
1001 |       "          1       0.67      0.40      0.50        10\n",
1002 |       "          2       0.00      0.00      0.00         2\n",
1003 |       "          3       0.00      0.00      0.00         1\n",
1004 |       "          4       0.33      0.25      0.29         4\n",
1005 |       "          5       0.00      0.00      0.00         1\n",
1006 |       "          6       1.00      0.33      0.50         3\n",
1007 |       "          7       0.00      0.00      0.00         3\n",
1008 |       "          8       0.77      0.56      0.65        18\n",
1009 |       "          9       0.00      0.00      0.00         9\n",
1010 |       "         10       0.86      0.60      0.71        10\n",
1011 |       "         11       0.00      0.00      0.00         1\n",
1012 |       "         12       1.00      0.08      0.15        12\n",
1013 |       "         13       0.70      0.64      0.67        11\n",
1014 |       "         14       0.00      0.00      0.00         6\n",
1015 |       "         15       0.00      0.00      0.00         0\n",
1016 |       "         16       0.00      0.00      0.00         2\n",
1017 |       "         17       0.00      0.00      0.00         5\n",
1018 |       "         18       0.00      0.00      0.00         2\n",
1019 |       "         19       0.83      0.71      0.77         7\n",
1020 |       "\n",
1021 |       "avg / total       0.56      0.33      0.39       109\n",
1022 |       "\n",
1023 |       "             precision    recall  f1-score   support\n",
1024 |       "\n",
1025 |       "          0       1.00      0.50      0.67         2\n",
1026 |       "          1       0.71      0.50      0.59        10\n",
1027 |       "          2       0.00      0.00      0.00         2\n",
1028 |       "          3       0.00      0.00      0.00         1\n",
1029 |       "          4       1.00      0.25      0.40         4\n",
1030 |       "          5       0.00      0.00      0.00         1\n",
1031 |       "          6       0.67      0.67      0.67         3\n",
1032 |       "          7       0.13      0.67      0.22         3\n",
1033 |       "          8       0.38      0.67      0.48        18\n",
1034 |       "          9       1.00      0.11      0.20         9\n",
1035 |       "         10       0.50      0.70      0.58        10\n",
1036 |       "         11       0.00      0.00      0.00         1\n",
1037 |       "         12       0.00      0.00      0.00        12\n",
1038 |       "         13       0.75      0.27      0.40        11\n",
1039 |       "         14       0.00      0.00      0.00         6\n",
1040 |       "         15       0.00      0.00      0.00         0\n",
1041 |       "         16       0.00      0.00      0.00         2\n",
1042 |       "         17       0.00      0.00      0.00         5\n",
1043 |       "         18       0.00      0.00      0.00         2\n",
1044 |       "         19       0.86      0.86      0.86         7\n",
1045 |       "\n",
1046 |       "avg / total       0.46      0.37      0.35       109\n",
1047 |       "\n"
1048 |      ]
1049 |     }
1050 |    ],
1051 |    "source": [
1052 |     "#For negative sentiment classifier\n",
1053 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1054 |     "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n",
1055 |     "\n",
1056 |     "df_train_neg = get_negative_data_frame(df_train,most_common_aspect)\n",
1057 |     "df_test_neg = get_negative_data_frame(df_test,most_common_aspect)\n",
1058 |     "\n",
1059 |     "y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg=classify_sentiment(df_train_neg,df_test_neg,X_train_aspect_dtm,X_test_aspect_dtm)\n",
1060 |     "with warnings.catch_warnings():\n",
1061 |     "    warnings.simplefilter(\"ignore\")\n",
1062 |     "    print_metrices(y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg)"
1063 |    ]
1064 |   },
1065 |   {
1066 |    "cell_type": "code",
1067 |    "execution_count": 36,
1068 |    "metadata": {
1069 |     "collapsed": false
1070 |    },
1071 |    "outputs": [
1072 |     {
1073 |      "name": "stdout",
1074 |      "output_type": "stream",
1075 |      "text": [
1076 |       "Accuracy:\n",
1077 |       "0.7375\n",
1078 |       "0.725\n",
1079 |       "0.7375\n",
1080 |       "0.5875\n",
1081 |       "\n",
1082 |       "Average precision:\n",
1083 |       "0.0\n",
1084 |       "0.153846153846\n",
1085 |       "0.333333333333\n",
1086 |       "0.105263157895\n",
1087 |       "\n",
1088 |       "Average recall:\n",
1089 |       "0.0\n",
1090 |       "0.0714285714286\n",
1091 |       "0.0357142857143\n",
1092 |       "0.0714285714286\n",
1093 |       "\n",
1094 |       "Average f1:\n",
1095 |       "0.0\n",
1096 |       "0.0975609756098\n",
1097 |       "0.0645161290323\n",
1098 |       "0.0851063829787\n",
1099 |       "\n",
1100 |       "Classification report:\n",
1101 |       "             precision    recall  f1-score   support\n",
1102 |       "\n",
1103 |       "          0       0.00      0.00      0.00         0\n",
1104 |       "          1       0.00      0.00      0.00         0\n",
1105 |       "          2       0.00      0.00      0.00         4\n",
1106 |       "          3       0.00      0.00      0.00         0\n",
1107 |       "          4       0.00      0.00      0.00         2\n",
1108 |       "          5       0.00      0.00      0.00         0\n",
1109 |       "          6       0.00      0.00      0.00         0\n",
1110 |       "          7       0.00      0.00      0.00        10\n",
1111 |       "          8       0.00      0.00      0.00         1\n",
1112 |       "          9       0.00      0.00      0.00         4\n",
1113 |       "         10       0.00      0.00      0.00         0\n",
1114 |       "         11       0.00      0.00      0.00         0\n",
1115 |       "         12       0.00      0.00      0.00         3\n",
1116 |       "         13       0.00      0.00      0.00         0\n",
1117 |       "         14       0.00      0.00      0.00         1\n",
1118 |       "         15       0.00      0.00      0.00         1\n",
1119 |       "         16       0.00      0.00      0.00         0\n",
1120 |       "         17       0.00      0.00      0.00         2\n",
1121 |       "         18       0.00      0.00      0.00         0\n",
1122 |       "         19       0.00      0.00      0.00         0\n",
1123 |       "\n",
1124 |       "avg / total       0.00      0.00      0.00        28\n",
1125 |       "\n",
1126 |       "             precision    recall  f1-score   support\n",
1127 |       "\n",
1128 |       "          0       0.00      0.00      0.00         0\n",
1129 |       "          1       0.00      0.00      0.00         0\n",
1130 |       "          2       0.00      0.00      0.00         4\n",
1131 |       "          3       0.00      0.00      0.00         0\n",
1132 |       "          4       0.00      0.00      0.00         2\n",
1133 |       "          5       0.00      0.00      0.00         0\n",
1134 |       "          6       0.00      0.00      0.00         0\n",
1135 |       "          7       0.29      0.20      0.24        10\n",
1136 |       "          8       0.00      0.00      0.00         1\n",
1137 |       "          9       0.00      0.00      0.00         4\n",
1138 |       "         10       0.00      0.00      0.00         0\n",
1139 |       "         11       0.00      0.00      0.00         0\n",
1140 |       "         12       0.00      0.00      0.00         3\n",
1141 |       "         13       0.00      0.00      0.00         0\n",
1142 |       "         14       0.00      0.00      0.00         1\n",
1143 |       "         15       0.00      0.00      0.00         1\n",
1144 |       "         16       0.00      0.00      0.00         0\n",
1145 |       "         17       0.00      0.00      0.00         2\n",
1146 |       "         18       0.00      0.00      0.00         0\n",
1147 |       "         19       0.00      0.00      0.00         0\n",
1148 |       "\n",
1149 |       "avg / total       0.10      0.07      0.08        28\n",
1150 |       "\n",
1151 |       "             precision    recall  f1-score   support\n",
1152 |       "\n",
1153 |       "          0       0.00      0.00      0.00         0\n",
1154 |       "          1       0.00      0.00      0.00         0\n",
1155 |       "          2       0.00      0.00      0.00         4\n",
1156 |       "          3       0.00      0.00      0.00         0\n",
1157 |       "          4       0.00      0.00      0.00         2\n",
1158 |       "          5       0.00      0.00      0.00         0\n",
1159 |       "          6       0.00      0.00      0.00         0\n",
1160 |       "          7       0.33      0.10      0.15        10\n",
1161 |       "          8       0.00      0.00      0.00         1\n",
1162 |       "          9       0.00      0.00      0.00         4\n",
1163 |       "         10       0.00      0.00      0.00         0\n",
1164 |       "         11       0.00      0.00      0.00         0\n",
1165 |       "         12       0.00      0.00      0.00         3\n",
1166 |       "         13       0.00      0.00      0.00         0\n",
1167 |       "         14       0.00      0.00      0.00         1\n",
1168 |       "         15       0.00      0.00      0.00         1\n",
1169 |       "         16       0.00      0.00      0.00         0\n",
1170 |       "         17       0.00      0.00      0.00         2\n",
1171 |       "         18       0.00      0.00      0.00         0\n",
1172 |       "         19       0.00      0.00      0.00         0\n",
1173 |       "\n",
1174 |       "avg / total       0.12      0.04      0.05        28\n",
1175 |       "\n",
1176 |       "             precision    recall  f1-score   support\n",
1177 |       "\n",
1178 |       "          0       0.00      0.00      0.00         0\n",
1179 |       "          1       0.00      0.00      0.00         0\n",
1180 |       "          2       0.00      0.00      0.00         4\n",
1181 |       "          3       0.00      0.00      0.00         0\n",
1182 |       "          4       0.00      0.00      0.00         2\n",
1183 |       "          5       0.00      0.00      0.00         0\n",
1184 |       "          6       0.00      0.00      0.00         0\n",
1185 |       "          7       0.00      0.00      0.00        10\n",
1186 |       "          8       0.00      0.00      0.00         1\n",
1187 |       "          9       0.00      0.00      0.00         4\n",
1188 |       "         10       0.00      0.00      0.00         0\n",
1189 |       "         11       0.00      0.00      0.00         0\n",
1190 |       "         12       0.18      0.67      0.29         3\n",
1191 |       "         13       0.00      0.00      0.00         0\n",
1192 |       "         14       0.00      0.00      0.00         1\n",
1193 |       "         15       0.00      0.00      0.00         1\n",
1194 |       "         16       0.00      0.00      0.00         0\n",
1195 |       "         17       0.00      0.00      0.00         2\n",
1196 |       "         18       0.00      0.00      0.00         0\n",
1197 |       "         19       0.00      0.00      0.00         0\n",
1198 |       "\n",
1199 |       "avg / total       0.02      0.07      0.03        28\n",
1200 |       "\n"
1201 |      ]
1202 |     }
1203 |    ],
1204 |    "source": [
1205 |     "#For neutral or conflict sentiment classifier\n",
1206 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1207 |     "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n",
1208 |     "\n",
1209 |     "df_train_neu = get_neutral_data_frame(df_train,most_common_aspect)\n",
1210 |     "df_test_neu = get_neutral_data_frame(df_test,most_common_aspect)\n",
1211 |     "\n",
1212 |     "y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu=classify_sentiment(df_train_neu,df_test_neu,X_train_aspect_dtm,X_test_aspect_dtm)\n",
1213 |     "with warnings.catch_warnings():\n",
1214 |     "    warnings.simplefilter(\"ignore\")\n",
1215 |     "    print_metrices(y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu)"
1216 |    ]
1217 |   },
1218 |   {
1219 |    "cell_type": "code",
1220 |    "execution_count": 37,
1221 |    "metadata": {
1222 |     "collapsed": false,
1223 |     "scrolled": false
1224 |    },
1225 |    "outputs": [
1226 |     {
1227 |      "name": "stdout",
1228 |      "output_type": "stream",
1229 |      "text": [
1230 |       "Enter a laptop review:\n",
1231 |       "\n",
1232 |       "This is my first asus laptop. So far i am really enjoying this laptop. 512GB SSD is super fast. Battery life is also good and can last very long. I have no complain on screen quality too as display supports 4k videos. Maybe that is why it costs a lot. This is an expensive laptop and it's price is very high compared to other laptops of similar specs. So, if you have no trouble paying for this laptop, it is pretty good.\n"
1233 |      ]
1234 |     },
1235 |     {
1236 |      "data": {
1237 |       "text/plain": [
1238 |        "array([[1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]])"
1239 |       ]
1240 |      },
1241 |      "execution_count": 37,
1242 |      "metadata": {},
1243 |      "output_type": "execute_result"
1244 |     }
1245 |    ],
1246 |    "source": [
1247 |     "#Aspect Based Sentiment analyis of user's input.\n",
1248 |     "user_input=input(\"Enter a laptop review:\\n\\n\")\n",
1249 |     "#Preprocessing and vectorizing\n",
1250 |     "tagged_user_input = posTag([user_input])\n",
1251 |     "filter_tagged_user_input = filterTag(tagged_user_input)\n",
1252 |     "\n",
1253 |     "user_input_series=pd.Series(filter_tagged_user_input)\n",
1254 |     "user_input_series_dtm=vect.transform(user_input_series)\n",
1255 |     "\n",
1256 |     "predict_aspect= svc.predict(user_input_series_dtm)\n",
1257 |     "extra_feature=get_dict_aspect(predict_aspect, most_common_aspect)\n",
1258 |     "extra_feature_dtm=DictVectorizer().fit_transform(extra_feature)\n",
1259 |     "predict_aspect"
1260 |    ]
1261 |   },
1262 |   {
1263 |    "cell_type": "code",
1264 |    "execution_count": 38,
1265 |    "metadata": {
1266 |     "collapsed": false
1267 |    },
1268 |    "outputs": [
1269 |     {
1270 |      "data": {
1271 |       "text/plain": [
1272 |        "array([[1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]])"
1273 |       ]
1274 |      },
1275 |      "execution_count": 38,
1276 |      "metadata": {},
1277 |      "output_type": "execute_result"
1278 |     }
1279 |    ],
1280 |    "source": [
1281 |     "#predicting weather the dectected aspect is positive or not\n",
1282 |     "test_opinion_list=[]\n",
1283 |     "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1284 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1285 |     "\n",
1286 |     "df_train_positive = get_positive_data_frame(df_train,most_common_aspect)\n",
1287 |     "y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos=classify_sentiment(df_train_positive,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1288 |     "\n",
1289 |     "y_pred_class_svc_pos"
1290 |    ]
1291 |   },
1292 |   {
1293 |    "cell_type": "code",
1294 |    "execution_count": 39,
1295 |    "metadata": {
1296 |     "collapsed": false
1297 |    },
1298 |    "outputs": [
1299 |     {
1300 |      "data": {
1301 |       "text/plain": [
1302 |        "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])"
1303 |       ]
1304 |      },
1305 |      "execution_count": 39,
1306 |      "metadata": {},
1307 |      "output_type": "execute_result"
1308 |     }
1309 |    ],
1310 |    "source": [
1311 |     "#predicting weather the dectected aspect is negative or not\n",
1312 |     "test_opinion_list=[]\n",
1313 |     "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1314 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1315 |     "\n",
1316 |     "df_train_negative = get_negative_data_frame(df_train,most_common_aspect)\n",
1317 |     "y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg=classify_sentiment(df_train_negative,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1318 |     "\n",
1319 |     "y_pred_class_svc_neg"
1320 |    ]
1321 |   },
1322 |   {
1323 |    "cell_type": "code",
1324 |    "execution_count": 40,
1325 |    "metadata": {
1326 |     "collapsed": false
1327 |    },
1328 |    "outputs": [
1329 |     {
1330 |      "data": {
1331 |       "text/plain": [
1332 |        "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])"
1333 |       ]
1334 |      },
1335 |      "execution_count": 40,
1336 |      "metadata": {},
1337 |      "output_type": "execute_result"
1338 |     }
1339 |    ],
1340 |    "source": [
1341 |     "#predicting weather the dectected aspect is neutral or coflict or not\n",
1342 |     "test_opinion_list=[]\n",
1343 |     "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1344 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1345 |     "\n",
1346 |     "df_train_neutral = get_neutral_data_frame(df_train,most_common_aspect)\n",
1347 |     "y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu=classify_sentiment(df_train_neutral,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1348 |     "\n",
1349 |     "y_pred_class_svc_neu"
1350 |    ]
1351 |   },
1352 |   {
1353 |    "cell_type": "code",
1354 |    "execution_count": 41,
1355 |    "metadata": {
1356 |     "collapsed": false
1357 |    },
1358 |    "outputs": [
1359 |     {
1360 |      "data": {
1361 |       "text/plain": [
1362 |        "[0, 3, 8, 10, 13]"
1363 |       ]
1364 |      },
1365 |      "execution_count": 41,
1366 |      "metadata": {},
1367 |      "output_type": "execute_result"
1368 |     }
1369 |    ],
1370 |    "source": [
1371 |     "#Finding the aspect that is positive\n",
1372 |     "index_positive=[]\n",
1373 |     "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_pos.tolist()[0])):\n",
1374 |     "    if a ==1 and b==1:\n",
1375 |     "        index_positive.append(i)\n",
1376 |     "index_positive         "
1377 |    ]
1378 |   },
1379 |   {
1380 |    "cell_type": "code",
1381 |    "execution_count": 42,
1382 |    "metadata": {
1383 |     "collapsed": false
1384 |    },
1385 |    "outputs": [
1386 |     {
1387 |      "data": {
1388 |       "text/plain": [
1389 |        "[]"
1390 |       ]
1391 |      },
1392 |      "execution_count": 42,
1393 |      "metadata": {},
1394 |      "output_type": "execute_result"
1395 |     }
1396 |    ],
1397 |    "source": [
1398 |     "#Finding the aspect that is negative\n",
1399 |     "index_negative=[]\n",
1400 |     "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_neg.tolist()[0])):\n",
1401 |     "    if a ==1 and b==1:\n",
1402 |     "        index_negative.append(i)\n",
1403 |     "index_negative         "
1404 |    ]
1405 |   },
1406 |   {
1407 |    "cell_type": "code",
1408 |    "execution_count": 43,
1409 |    "metadata": {
1410 |     "collapsed": false
1411 |    },
1412 |    "outputs": [
1413 |     {
1414 |      "data": {
1415 |       "text/plain": [
1416 |        "[12]"
1417 |       ]
1418 |      },
1419 |      "execution_count": 43,
1420 |      "metadata": {},
1421 |      "output_type": "execute_result"
1422 |     }
1423 |    ],
1424 |    "source": [
1425 |     "#Finding the aspect that is neutral\n",
1426 |     "index_neutral=[]\n",
1427 |     "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_neu.tolist()[0])):\n",
1428 |     "    if a ==1 and b==1:\n",
1429 |     "        index_neutral.append(i)\n",
1430 |     "index_neutral         "
1431 |    ]
1432 |   },
1433 |   {
1434 |    "cell_type": "code",
1435 |    "execution_count": 44,
1436 |    "metadata": {
1437 |     "collapsed": true
1438 |    },
1439 |    "outputs": [],
1440 |    "source": [
1441 |     "output=[]"
1442 |    ]
1443 |   },
1444 |   {
1445 |    "cell_type": "code",
1446 |    "execution_count": 45,
1447 |    "metadata": {
1448 |     "collapsed": false
1449 |    },
1450 |    "outputs": [],
1451 |    "source": [
1452 |     "if index_positive:\n",
1453 |     "    for index in index_positive:\n",
1454 |     "        output.append(sorted(most_common_aspect)[index]+\": positive\")"
1455 |    ]
1456 |   },
1457 |   {
1458 |    "cell_type": "code",
1459 |    "execution_count": 46,
1460 |    "metadata": {
1461 |     "collapsed": false
1462 |    },
1463 |    "outputs": [],
1464 |    "source": [
1465 |     "if index_negative:\n",
1466 |     "    for index in index_negative:\n",
1467 |     "        output.append(sorted(most_common_aspect)[index]+\": negative\")"
1468 |    ]
1469 |   },
1470 |   {
1471 |    "cell_type": "code",
1472 |    "execution_count": 47,
1473 |    "metadata": {
1474 |     "collapsed": false
1475 |    },
1476 |    "outputs": [],
1477 |    "source": [
1478 |     "if index_neutral:\n",
1479 |     "    for index in index_neutral:\n",
1480 |     "        output.append(sorted(most_common_aspect)[index]+\": neutral or conflict\")"
1481 |    ]
1482 |   },
1483 |   {
1484 |    "cell_type": "code",
1485 |    "execution_count": 48,
1486 |    "metadata": {
1487 |     "collapsed": false
1488 |    },
1489 |    "outputs": [
1490 |     {
1491 |      "data": {
1492 |       "text/plain": [
1493 |        "['BATTERY_OPERATION_PERFORMANCE: positive',\n",
1494 |        " 'DISPLAY_GENERAL: positive',\n",
1495 |        " 'LAPTOP_GENERAL: positive',\n",
1496 |        " 'LAPTOP_OPERATION_PERFORMANCE: positive',\n",
1497 |        " 'LAPTOP_QUALITY: positive',\n",
1498 |        " 'LAPTOP_PRICE: neutral or conflict']"
1499 |       ]
1500 |      },
1501 |      "execution_count": 48,
1502 |      "metadata": {},
1503 |      "output_type": "execute_result"
1504 |     }
1505 |    ],
1506 |    "source": [
1507 |     "#Prediction of Aspect Based Sentiment Analaysis for user's input\n",
1508 |     "output"
1509 |    ]
1510 |   }
1511 |  ],
1512 |  "metadata": {
1513 |   "kernelspec": {
1514 |    "display_name": "Python 3",
1515 |    "language": "python",
1516 |    "name": "python3"
1517 |   },
1518 |   "language_info": {
1519 |    "codemirror_mode": {
1520 |     "name": "ipython",
1521 |     "version": 3
1522 |    },
1523 |    "file_extension": ".py",
1524 |    "mimetype": "text/x-python",
1525 |    "name": "python",
1526 |    "nbconvert_exporter": "python",
1527 |    "pygments_lexer": "ipython3",
1528 |    "version": "3.6.0"
1529 |   }
1530 |  },
1531 |  "nbformat": 4,
1532 |  "nbformat_minor": 2
1533 | }
1534 | 


--------------------------------------------------------------------------------
/Aspect Based Sentiment Analysis.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 1,
   6 |    "metadata": {
   7 |     "collapsed": true
   8 |    },
   9 |    "outputs": [],
  10 |    "source": [
  11 |     "import pandas as pd\n",
  12 |     "from sklearn.externals import joblib\n",
  13 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
  14 |     "from sklearn.feature_extraction import DictVectorizer\n",
  15 |     "from sklearn.linear_model import SGDClassifier\n",
  16 |     "from sklearn.multiclass import OneVsRestClassifier\n",
  17 |     "from sklearn.naive_bayes import MultinomialNB\n",
  18 |     "from sklearn import svm\n",
  19 |     "import xml.etree.ElementTree as ET\n",
  20 |     "from lxml import etree\n",
  21 |     "from scipy.sparse import hstack\n",
  22 |     "import numpy as np\n",
  23 |     "import warnings\n",
  24 |     "\n",
  25 |     "\n",
  26 |     "path_train = r'E:\\Engineering\\8th sem\\nlp COMP 473\\NLP projects\\ABSA16_Laptops_Train_English_SB2.xml'\n",
  27 |     "path_test = r'E:\\Engineering\\8th sem\\nlp COMP 473\\NLP projects\\EN_LAPT_SB2_TEST.xml'\n",
  28 |     "\n",
  29 |     "#For stanford POS Tagger\n",
  30 |     "home = r'C:\\Users\\THe_strOX\\Anaconda3\\stanford-postagger-full-2017-06-09'\n",
  31 |     "from nltk.tag.stanford import StanfordPOSTagger as POS_Tag\n",
  32 |     "from nltk import word_tokenize\n",
  33 |     "_path_to_model = home + '/models/english-bidirectional-distsim.tagger' \n",
  34 |     "_path_to_jar = home + '/stanford-postagger.jar'\n",
  35 |     "stanford_tag = POS_Tag(model_filename=_path_to_model, path_to_jar=_path_to_jar)"
  36 |    ]
  37 |   },
  38 |   {
  39 |    "cell_type": "code",
  40 |    "execution_count": 2,
  41 |    "metadata": {
  42 |     "collapsed": true
  43 |    },
  44 |    "outputs": [],
  45 |    "source": [
  46 |     "#xml parser\n",
  47 |     "def get_list(path):\n",
  48 |     "    tree=ET.parse(path)\n",
  49 |     "    root = tree.getroot()\n",
  50 |     "    text_list = []\n",
  51 |     "    opinion_list = []\n",
  52 |     "    for review in root.findall('Review'):\n",
  53 |     "        text_string=\"\"\n",
  54 |     "        opinion_inner_list=[]\n",
  55 |     "        for sent in review.findall('./sentences/sentence'):\n",
  56 |     "            text_string= text_string+ \" \"+ sent.find('text').text\n",
  57 |     "        text_list.append(text_string)\n",
  58 |     "        for opinion in review.findall('./Opinions/Opinion'):\n",
  59 |     "            opinion_dict = {\n",
  60 |     "                opinion.get('category').replace('#','_'): opinion.get('polarity')\n",
  61 |     "            }\n",
  62 |     "            opinion_inner_list.append(opinion_dict)\n",
  63 |     "        opinion_list.append(opinion_inner_list)\n",
  64 |     "    return text_list,opinion_list"
  65 |    ]
  66 |   },
  67 |   {
  68 |    "cell_type": "code",
  69 |    "execution_count": 3,
  70 |    "metadata": {
  71 |     "collapsed": true
  72 |    },
  73 |    "outputs": [],
  74 |    "source": [
  75 |     "#Selecting only 20 most common aspect.\n",
  76 |     "def get_most_common_aspect(opinion_list):\n",
  77 |     "    import nltk\n",
  78 |     "    opinion= []\n",
  79 |     "    for inner_list in opinion_list:\n",
  80 |     "        for _dict in inner_list:\n",
  81 |     "            for key in _dict:\n",
  82 |     "                opinion.append(key)\n",
  83 |     "    most_common_aspect = [k for k,v in nltk.FreqDist(opinion).most_common(20)]\n",
  84 |     "    return most_common_aspect"
  85 |    ]
  86 |   },
  87 |   {
  88 |    "cell_type": "code",
  89 |    "execution_count": 4,
  90 |    "metadata": {
  91 |     "collapsed": true
  92 |    },
  93 |    "outputs": [],
  94 |    "source": [
  95 |     "#generate data frame\n",
  96 |     "def get_data_frame(text_list,opinion_list,most_common_aspect):\n",
  97 |     "    data={'Review':text_list}\n",
  98 |     "    df = pd.DataFrame(data)\n",
  99 |     "    if opinion_list:\n",
 100 |     "        for inner_list in opinion_list:\n",
 101 |     "            for _dict in inner_list:\n",
 102 |     "                for key in _dict:\n",
 103 |     "                    if key in most_common_aspect:\n",
 104 |     "                        df.loc[opinion_list.index(inner_list),key]=_dict[key]\n",
 105 |     "    return df"
 106 |    ]
 107 |   },
 108 |   {
 109 |    "cell_type": "code",
 110 |    "execution_count": 5,
 111 |    "metadata": {
 112 |     "collapsed": true
 113 |    },
 114 |    "outputs": [],
 115 |    "source": [
 116 |     "#generate data frame for aspect extraction task\n",
 117 |     "def get_aspect_data_frame(df,most_common_aspect):\n",
 118 |     "    for common_aspect in most_common_aspect:\n",
 119 |     "        df[common_aspect]=df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])\n",
 120 |     "    df = df.fillna(0)\n",
 121 |     "    return df"
 122 |    ]
 123 |   },
 124 |   {
 125 |    "cell_type": "code",
 126 |    "execution_count": 6,
 127 |    "metadata": {
 128 |     "collapsed": true
 129 |    },
 130 |    "outputs": [],
 131 |    "source": [
 132 |     "def get_positive_data_frame(df,most_common_aspect):\n",
 133 |     "    for common_aspect in most_common_aspect:\n",
 134 |     "        df[common_aspect]=df[common_aspect].replace(['positive'],[1])\n",
 135 |     "        df[common_aspect]=df[common_aspect].replace(['negative','neutral','conflict'],[0,0,0])\n",
 136 |     "    df = df.fillna(0)\n",
 137 |     "    return df"
 138 |    ]
 139 |   },
 140 |   {
 141 |    "cell_type": "code",
 142 |    "execution_count": 7,
 143 |    "metadata": {
 144 |     "collapsed": true
 145 |    },
 146 |    "outputs": [],
 147 |    "source": [
 148 |     "def get_negative_data_frame(df,most_common_aspect):\n",
 149 |     "    for common_aspect in most_common_aspect:\n",
 150 |     "        df[common_aspect]=df[common_aspect].replace(['negative'],[1])\n",
 151 |     "        df[common_aspect]=df[common_aspect].replace(['positive','neutral','conflict'],[0,0,0])\n",
 152 |     "    df = df.fillna(0)\n",
 153 |     "    return df"
 154 |    ]
 155 |   },
 156 |   {
 157 |    "cell_type": "code",
 158 |    "execution_count": 8,
 159 |    "metadata": {
 160 |     "collapsed": true
 161 |    },
 162 |    "outputs": [],
 163 |    "source": [
 164 |     "def get_neutral_data_frame(df,most_common_aspect):\n",
 165 |     "    for common_aspect in most_common_aspect:\n",
 166 |     "        df[common_aspect]=df[common_aspect].replace(['neutral','conflict'],[1,1])\n",
 167 |     "        df[common_aspect]=df[common_aspect].replace(['negative','positive'],[0,0])\n",
 168 |     "    df = df.fillna(0)\n",
 169 |     "    return df"
 170 |    ]
 171 |   },
 172 |   {
 173 |    "cell_type": "code",
 174 |    "execution_count": 9,
 175 |    "metadata": {
 176 |     "collapsed": false
 177 |    },
 178 |    "outputs": [],
 179 |    "source": [
 180 |     "#To tag using stanford pos tagger\n",
 181 |     "def posTag(review):\n",
 182 |     "    tagged_text_list=[]\n",
 183 |     "    for text in review:\n",
 184 |     "        tagged_text_list.append(stanford_tag.tag(word_tokenize(text)))\n",
 185 |     "    return tagged_text_list\n",
 186 |     "#posTag(\"this is random text\")"
 187 |    ]
 188 |   },
 189 |   {
 190 |    "cell_type": "code",
 191 |    "execution_count": 10,
 192 |    "metadata": {
 193 |     "collapsed": true
 194 |    },
 195 |    "outputs": [],
 196 |    "source": [
 197 |     "#Filter the word with tag- noun,adjective,verb,adverb\n",
 198 |     "def filterTag(tagged_review):\n",
 199 |     "    final_text_list=[]\n",
 200 |     "    for text_list in tagged_review:\n",
 201 |     "        final_text=[]\n",
 202 |     "        for word,tag in text_list:\n",
 203 |     "            if tag in ['NN','NNS','NNP','NNPS','RB','RBR','RBS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']:\n",
 204 |     "                final_text.append(word)\n",
 205 |     "        final_text_list.append(' '.join(final_text))\n",
 206 |     "    return final_text_list"
 207 |    ]
 208 |   },
 209 |   {
 210 |    "cell_type": "code",
 211 |    "execution_count": 11,
 212 |    "metadata": {
 213 |     "collapsed": false
 214 |    },
 215 |    "outputs": [],
 216 |    "source": [
 217 |     "def get_dict_aspect(y,most_common_aspect):\n",
 218 |     "    position=[]\n",
 219 |     "    for innerlist in y:\n",
 220 |     "        position.append([i for i, j in enumerate(innerlist) if j == 1])\n",
 221 |     "    sorted_common=sorted(most_common_aspect)\n",
 222 |     "    dict_aspect=[]\n",
 223 |     "    for innerlist in position:\n",
 224 |     "        inner_dict={}\n",
 225 |     "        for word in sorted_common:\n",
 226 |     "            if sorted_common.index(word) in innerlist:\n",
 227 |     "                inner_dict[word]= 5\n",
 228 |     "            else:\n",
 229 |     "                inner_dict[word]=0\n",
 230 |     "        dict_aspect.append(inner_dict)\n",
 231 |     "    return dict_aspect"
 232 |    ]
 233 |   },
 234 |   {
 235 |    "cell_type": "code",
 236 |    "execution_count": 12,
 237 |    "metadata": {
 238 |     "collapsed": false
 239 |    },
 240 |    "outputs": [],
 241 |    "source": [
 242 |     "#Stage 1:\n",
 243 |     "#Making list to train\n",
 244 |     "train_text_list,train_opinion_list = get_list(path_train)\n",
 245 |     "most_common_aspect = get_most_common_aspect(train_opinion_list)"
 246 |    ]
 247 |   },
 248 |   {
 249 |    "cell_type": "code",
 250 |    "execution_count": 13,
 251 |    "metadata": {
 252 |     "collapsed": false
 253 |    },
 254 |    "outputs": [],
 255 |    "source": [
 256 |     "#This takes time to tag. Already tagged and saved. So, loading file ...\n",
 257 |     "#tagged_text_list_train=posTag(train_text_list)\n",
 258 |     "#joblib.dump(tagged_text_list_train, 'tagged_text_list_train.pkl')\n",
 259 |     "tagged_text_list_train=joblib.load('tagged_text_list_train.pkl')"
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "code",
 264 |    "execution_count": 14,
 265 |    "metadata": {
 266 |     "collapsed": true
 267 |    },
 268 |    "outputs": [],
 269 |    "source": [
 270 |     "#train list after filter\n",
 271 |     "final_train_text_list=filterTag(tagged_text_list_train)"
 272 |    ]
 273 |   },
 274 |   {
 275 |    "cell_type": "code",
 276 |    "execution_count": 15,
 277 |    "metadata": {
 278 |     "collapsed": false
 279 |    },
 280 |    "outputs": [],
 281 |    "source": [
 282 |     "#get data frame\n",
 283 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
 284 |     "df_train_aspect = get_aspect_data_frame(df_train,most_common_aspect)\n",
 285 |     "df_train_aspect = df_train_aspect.reindex_axis(sorted(df_train_aspect.columns), axis=1)"
 286 |    ]
 287 |   },
 288 |   {
 289 |    "cell_type": "code",
 290 |    "execution_count": 16,
 291 |    "metadata": {
 292 |     "collapsed": false
 293 |    },
 294 |    "outputs": [],
 295 |    "source": [
 296 |     "#Similar for test list\n",
 297 |     "test_text_list,test_opinion_list = get_list(path_test)"
 298 |    ]
 299 |   },
 300 |   {
 301 |    "cell_type": "code",
 302 |    "execution_count": 17,
 303 |    "metadata": {
 304 |     "collapsed": false
 305 |    },
 306 |    "outputs": [],
 307 |    "source": [
 308 |     "#tagged_text_list_test=posTag(test_text_list)\n",
 309 |     "#joblib.dump(tagged_text_list_test, 'tagged_text_list_test.pkl')\n",
 310 |     "tagged_text_list_test=joblib.load('tagged_text_list_test.pkl')"
 311 |    ]
 312 |   },
 313 |   {
 314 |    "cell_type": "code",
 315 |    "execution_count": 18,
 316 |    "metadata": {
 317 |     "collapsed": true
 318 |    },
 319 |    "outputs": [],
 320 |    "source": [
 321 |     "final_test_text_list=filterTag(tagged_text_list_test)"
 322 |    ]
 323 |   },
 324 |   {
 325 |    "cell_type": "code",
 326 |    "execution_count": 19,
 327 |    "metadata": {
 328 |     "collapsed": false
 329 |    },
 330 |    "outputs": [],
 331 |    "source": [
 332 |     "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n",
 333 |     "df_test_aspect = get_aspect_data_frame(df_test,most_common_aspect)\n",
 334 |     "df_test_aspect = df_test_aspect.reindex_axis(sorted(df_test_aspect.columns), axis=1)"
 335 |    ]
 336 |   },
 337 |   {
 338 |    "cell_type": "code",
 339 |    "execution_count": 20,
 340 |    "metadata": {
 341 |     "collapsed": false
 342 |    },
 343 |    "outputs": [],
 344 |    "source": [
 345 |     "#Sort the data frame according to aspect's name and separate data(X) and target(y)\n",
 346 |     "#df_train_aspect = df_train_aspect.sample(frac=1).reset_index(drop=True) #For randoming\n",
 347 |     "X_train= df_train_aspect.Review\n",
 348 |     "y_train = df_train_aspect.drop('Review',1)\n",
 349 |     "\n",
 350 |     "#df_test_aspect = df_test_aspect.sample(frac=1).reset_index(drop=True) #For randoming\n",
 351 |     "X_test = df_test_aspect.Review\n",
 352 |     "y_test = df_test_aspect.drop('Review',1)"
 353 |    ]
 354 |   },
 355 |   {
 356 |    "cell_type": "code",
 357 |    "execution_count": 21,
 358 |    "metadata": {
 359 |     "collapsed": false
 360 |    },
 361 |    "outputs": [],
 362 |    "source": [
 363 |     "#Change y_train to numpy array\n",
 364 |     "import numpy as np\n",
 365 |     "y_train = np.asarray(y_train, dtype=np.int64)\n",
 366 |     "y_test = np.asarray(y_test, dtype=np.int64)"
 367 |    ]
 368 |   },
 369 |   {
 370 |    "cell_type": "code",
 371 |    "execution_count": 22,
 372 |    "metadata": {
 373 |     "collapsed": false
 374 |    },
 375 |    "outputs": [],
 376 |    "source": [
 377 |     "#Generate word vecotors using CountVectorizer\n",
 378 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
 379 |     "from nltk import word_tokenize          \n",
 380 |     "from nltk.stem import WordNetLemmatizer \n",
 381 |     "vect = CountVectorizer(max_df=1.0,stop_words='english')  \n",
 382 |     "X_train_dtm = vect.fit_transform(X_train)\n",
 383 |     "X_test_dtm = vect.transform(X_test)"
 384 |    ]
 385 |   },
 386 |   {
 387 |    "cell_type": "code",
 388 |    "execution_count": 23,
 389 |    "metadata": {
 390 |     "collapsed": false,
 391 |     "scrolled": true
 392 |    },
 393 |    "outputs": [],
 394 |    "source": [
 395 |     "#Create various models. These are multi-label models.\n",
 396 |     "nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)\n",
 397 |     "C = 1.0 #SVregularization parameter\n",
 398 |     "svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)\n",
 399 |     "lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)\n",
 400 |     "sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train_dtm,y_train)"
 401 |    ]
 402 |   },
 403 |   {
 404 |    "cell_type": "code",
 405 |    "execution_count": 24,
 406 |    "metadata": {
 407 |     "collapsed": false
 408 |    },
 409 |    "outputs": [],
 410 |    "source": [
 411 |     "#Predict the test data using classifiers\n",
 412 |     "y_pred_class = nb_classif.predict(X_test_dtm)\n",
 413 |     "y_pred_class_svc = svc.predict(X_test_dtm)\n",
 414 |     "y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)\n",
 415 |     "y_pred_class_sgd = sgd.predict(X_test_dtm)"
 416 |    ]
 417 |   },
 418 |   {
 419 |    "cell_type": "code",
 420 |    "execution_count": 25,
 421 |    "metadata": {
 422 |     "collapsed": true
 423 |    },
 424 |    "outputs": [],
 425 |    "source": [
 426 |     "#Following code to test metrics of all aspect extraction classifiers\n",
 427 |     "from sklearn import metrics"
 428 |    ]
 429 |   },
 430 |   {
 431 |    "cell_type": "code",
 432 |    "execution_count": 26,
 433 |    "metadata": {
 434 |     "collapsed": false
 435 |    },
 436 |    "outputs": [
 437 |     {
 438 |      "name": "stdout",
 439 |      "output_type": "stream",
 440 |      "text": [
 441 |       "0.025\n",
 442 |       "0.05\n",
 443 |       "0.05\n",
 444 |       "0.0375\n"
 445 |      ]
 446 |     }
 447 |    ],
 448 |    "source": [
 449 |     "print(metrics.accuracy_score(y_test,y_pred_class))\n",
 450 |     "print(metrics.accuracy_score(y_test,y_pred_class_svc))\n",
 451 |     "print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))\n",
 452 |     "print(metrics.accuracy_score(y_test,y_pred_class_sgd))"
 453 |    ]
 454 |   },
 455 |   {
 456 |    "cell_type": "code",
 457 |    "execution_count": 27,
 458 |    "metadata": {
 459 |     "collapsed": false
 460 |    },
 461 |    "outputs": [
 462 |     {
 463 |      "name": "stdout",
 464 |      "output_type": "stream",
 465 |      "text": [
 466 |       "0.75\n",
 467 |       "0.711229946524\n",
 468 |       "0.732193732194\n",
 469 |       "0.700657894737\n"
 470 |      ]
 471 |     }
 472 |    ],
 473 |    "source": [
 474 |     "print(metrics.precision_score(y_test,y_pred_class,average='micro'))\n",
 475 |     "print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))\n",
 476 |     "print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 477 |     "print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))"
 478 |    ]
 479 |   },
 480 |   {
 481 |    "cell_type": "code",
 482 |    "execution_count": 28,
 483 |    "metadata": {
 484 |     "collapsed": false
 485 |    },
 486 |    "outputs": [
 487 |     {
 488 |      "name": "stdout",
 489 |      "output_type": "stream",
 490 |      "text": [
 491 |       "0.457627118644\n",
 492 |       "0.64406779661\n",
 493 |       "0.622276029056\n",
 494 |       "0.515738498789\n"
 495 |      ]
 496 |     }
 497 |    ],
 498 |    "source": [
 499 |     "print(metrics.recall_score(y_test,y_pred_class,average='micro'))\n",
 500 |     "print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))\n",
 501 |     "print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 502 |     "print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))"
 503 |    ]
 504 |   },
 505 |   {
 506 |    "cell_type": "code",
 507 |    "execution_count": 29,
 508 |    "metadata": {
 509 |     "collapsed": false
 510 |    },
 511 |    "outputs": [
 512 |     {
 513 |      "name": "stdout",
 514 |      "output_type": "stream",
 515 |      "text": [
 516 |       "0.568421052632\n",
 517 |       "0.675984752224\n",
 518 |       "0.67277486911\n",
 519 |       "0.594142259414\n"
 520 |      ]
 521 |     }
 522 |    ],
 523 |    "source": [
 524 |     "print(metrics.f1_score(y_test,y_pred_class,average='micro'))\n",
 525 |     "print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))\n",
 526 |     "print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 527 |     "print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))"
 528 |    ]
 529 |   },
 530 |   {
 531 |    "cell_type": "code",
 532 |    "execution_count": 30,
 533 |    "metadata": {
 534 |     "collapsed": false,
 535 |     "scrolled": true
 536 |    },
 537 |    "outputs": [
 538 |     {
 539 |      "name": "stdout",
 540 |      "output_type": "stream",
 541 |      "text": [
 542 |       "             precision    recall  f1-score   support\n",
 543 |       "\n",
 544 |       "          0       0.67      0.14      0.24        14\n",
 545 |       "          1       0.71      0.50      0.59        24\n",
 546 |       "          2       0.00      0.00      0.00        12\n",
 547 |       "          3       0.00      0.00      0.00         4\n",
 548 |       "          4       0.00      0.00      0.00        21\n",
 549 |       "          5       0.00      0.00      0.00         8\n",
 550 |       "          6       0.00      0.00      0.00         7\n",
 551 |       "          7       0.76      0.64      0.69        39\n",
 552 |       "          8       1.00      1.00      1.00        80\n",
 553 |       "          9       0.44      0.17      0.24        24\n",
 554 |       "         10       0.62      0.70      0.65        46\n",
 555 |       "         11       0.00      0.00      0.00         5\n",
 556 |       "         12       0.57      0.30      0.39        27\n",
 557 |       "         13       0.57      0.45      0.50        29\n",
 558 |       "         14       0.77      0.33      0.47        30\n",
 559 |       "         15       0.00      0.00      0.00         4\n",
 560 |       "         16       0.00      0.00      0.00         9\n",
 561 |       "         17       0.00      0.00      0.00        15\n",
 562 |       "         18       0.00      0.00      0.00         4\n",
 563 |       "         19       0.60      0.27      0.37        11\n",
 564 |       "\n",
 565 |       "avg / total       0.57      0.46      0.49       413\n",
 566 |       "\n",
 567 |       "             precision    recall  f1-score   support\n",
 568 |       "\n",
 569 |       "          0       0.78      1.00      0.88        14\n",
 570 |       "          1       0.68      0.71      0.69        24\n",
 571 |       "          2       0.86      0.50      0.63        12\n",
 572 |       "          3       0.12      0.25      0.17         4\n",
 573 |       "          4       0.56      0.43      0.49        21\n",
 574 |       "          5       0.75      0.38      0.50         8\n",
 575 |       "          6       0.20      0.14      0.17         7\n",
 576 |       "          7       0.74      0.67      0.70        39\n",
 577 |       "          8       1.00      0.97      0.99        80\n",
 578 |       "          9       0.63      0.50      0.56        24\n",
 579 |       "         10       0.68      0.74      0.71        46\n",
 580 |       "         11       0.33      0.40      0.36         5\n",
 581 |       "         12       0.83      0.74      0.78        27\n",
 582 |       "         13       0.56      0.66      0.60        29\n",
 583 |       "         14       0.60      0.40      0.48        30\n",
 584 |       "         15       0.67      0.50      0.57         4\n",
 585 |       "         16       0.18      0.22      0.20         9\n",
 586 |       "         17       0.80      0.27      0.40        15\n",
 587 |       "         18       1.00      0.25      0.40         4\n",
 588 |       "         19       0.60      0.27      0.37        11\n",
 589 |       "\n",
 590 |       "avg / total       0.72      0.64      0.67       413\n",
 591 |       "\n",
 592 |       "             precision    recall  f1-score   support\n",
 593 |       "\n",
 594 |       "          0       0.78      1.00      0.88        14\n",
 595 |       "          1       0.65      0.71      0.68        24\n",
 596 |       "          2       1.00      0.42      0.59        12\n",
 597 |       "          3       0.17      0.25      0.20         4\n",
 598 |       "          4       0.64      0.43      0.51        21\n",
 599 |       "          5       1.00      0.25      0.40         8\n",
 600 |       "          6       0.33      0.14      0.20         7\n",
 601 |       "          7       0.74      0.67      0.70        39\n",
 602 |       "          8       1.00      0.96      0.98        80\n",
 603 |       "          9       0.61      0.46      0.52        24\n",
 604 |       "         10       0.70      0.72      0.71        46\n",
 605 |       "         11       0.25      0.20      0.22         5\n",
 606 |       "         12       0.83      0.74      0.78        27\n",
 607 |       "         13       0.54      0.66      0.59        29\n",
 608 |       "         14       0.61      0.37      0.46        30\n",
 609 |       "         15       1.00      0.25      0.40         4\n",
 610 |       "         16       0.20      0.22      0.21         9\n",
 611 |       "         17       1.00      0.20      0.33        15\n",
 612 |       "         18       1.00      0.25      0.40         4\n",
 613 |       "         19       0.75      0.27      0.40        11\n",
 614 |       "\n",
 615 |       "avg / total       0.75      0.62      0.66       413\n",
 616 |       "\n",
 617 |       "             precision    recall  f1-score   support\n",
 618 |       "\n",
 619 |       "          0       0.75      0.64      0.69        14\n",
 620 |       "          1       0.60      0.75      0.67        24\n",
 621 |       "          2       0.00      0.00      0.00        12\n",
 622 |       "          3       0.00      0.00      0.00         4\n",
 623 |       "          4       0.50      0.19      0.28        21\n",
 624 |       "          5       0.00      0.00      0.00         8\n",
 625 |       "          6       0.00      0.00      0.00         7\n",
 626 |       "          7       0.63      0.62      0.62        39\n",
 627 |       "          8       1.00      0.99      0.99        80\n",
 628 |       "          9       0.67      0.33      0.44        24\n",
 629 |       "         10       0.65      0.61      0.63        46\n",
 630 |       "         11       0.29      0.40      0.33         5\n",
 631 |       "         12       0.71      0.37      0.49        27\n",
 632 |       "         13       0.58      0.48      0.53        29\n",
 633 |       "         14       0.71      0.33      0.45        30\n",
 634 |       "         15       0.00      0.00      0.00         4\n",
 635 |       "         16       0.20      0.11      0.14         9\n",
 636 |       "         17       0.67      0.13      0.22        15\n",
 637 |       "         18       0.33      0.25      0.29         4\n",
 638 |       "         19       0.43      0.27      0.33        11\n",
 639 |       "\n",
 640 |       "avg / total       0.64      0.52      0.55       413\n",
 641 |       "\n"
 642 |      ]
 643 |     }
 644 |    ],
 645 |    "source": [
 646 |     "with warnings.catch_warnings():\n",
 647 |     "    warnings.simplefilter(\"ignore\")\n",
 648 |     "    print(metrics.classification_report(y_test, y_pred_class))\n",
 649 |     "    print(metrics.classification_report(y_test, y_pred_class_svc))\n",
 650 |     "    print(metrics.classification_report(y_test, y_pred_class_lin_svc))\n",
 651 |     "    print(metrics.classification_report(y_test, y_pred_class_sgd))"
 652 |    ]
 653 |   },
 654 |   {
 655 |    "cell_type": "code",
 656 |    "execution_count": 31,
 657 |    "metadata": {
 658 |     "collapsed": false
 659 |    },
 660 |    "outputs": [],
 661 |    "source": [
 662 |     "#Stage 2:\n",
 663 |     "#Generating extra feature that indicates which aspect category is present in the review\n",
 664 |     "train_dict_aspect=get_dict_aspect(y_train, most_common_aspect)\n",
 665 |     "d_train=DictVectorizer() \n",
 666 |     "X_train_aspect_dtm = d_train.fit_transform(train_dict_aspect)\n",
 667 |     "\n",
 668 |     "#y_test is used to generated extra feature in order to test the performance of 2nd classifer.\n",
 669 |     "#Use y_pred_class_svc(Highest performer for aspect classification) as input for extra feature to test the overall performace.\n",
 670 |     "test_dict_aspect=get_dict_aspect(y_test,most_common_aspect)\n",
 671 |     "d_test=DictVectorizer() \n",
 672 |     "X_test_aspect_dtm = d_test.fit_transform(test_dict_aspect)"
 673 |    ]
 674 |   },
 675 |   {
 676 |    "cell_type": "code",
 677 |    "execution_count": 32,
 678 |    "metadata": {
 679 |     "collapsed": false
 680 |    },
 681 |    "outputs": [],
 682 |    "source": [
 683 |     "#Function for classiflying positive,negative or neutral sentiment of all the aspects\n",
 684 |     "def classify_sentiment(df_train,df_test,X_train_aspect_dtm,X_test_aspect_dtm):\n",
 685 |     "    \n",
 686 |     "    df_train = df_train.reindex_axis(sorted(df_train_positive.columns), axis=1)\n",
 687 |     "    df_test = df_test.reindex_axis(sorted(df_test_positive.columns), axis=1)\n",
 688 |     "\n",
 689 |     "    import numpy as np\n",
 690 |     "    X_train = df_train.Review\n",
 691 |     "    y_train = df_train.drop('Review',1)\n",
 692 |     "    y_train = np.asarray(y_train, dtype=np.int64)\n",
 693 |     "\n",
 694 |     "    X_test = df_test.Review\n",
 695 |     "    y_test = df_test.drop('Review',1)\n",
 696 |     "    y_test = np.asarray(y_test, dtype=np.int64)\n",
 697 |     "\n",
 698 |     "    vect_sen = CountVectorizer(stop_words='english',ngram_range=(1,2))  \n",
 699 |     "    X_train_dtm = vect_sen.fit_transform(X_train)\n",
 700 |     "    X_test_dtm = vect_sen.transform(X_test)\n",
 701 |     "\n",
 702 |     "    #ombining word vector with extra feature.\n",
 703 |     "    from scipy.sparse import hstack\n",
 704 |     "    X_train_dtm=hstack((X_train_dtm, X_train_aspect_dtm))\n",
 705 |     "    X_test_dtm=hstack((X_test_dtm, X_test_aspect_dtm))\n",
 706 |     "\n",
 707 |     "    C = 1.0 #SVregularization parameter\n",
 708 |     "    nb_classif = OneVsRestClassifier(MultinomialNB()).fit(X_train_dtm, y_train)\n",
 709 |     "    svc = OneVsRestClassifier(svm.SVC(kernel='linear', C=C)).fit(X_train_dtm, y_train)\n",
 710 |     "    lin_svc = OneVsRestClassifier(svm.LinearSVC(C=C)).fit(X_train_dtm, y_train)\n",
 711 |     "    sgd = OneVsRestClassifier(SGDClassifier()).fit(X_train_dtm,y_train)\n",
 712 |     "\n",
 713 |     "    y_pred_class= nb_classif.predict(X_test_dtm)\n",
 714 |     "    y_pred_class_svc = svc.predict(X_test_dtm)\n",
 715 |     "    y_pred_class_lin_svc = lin_svc.predict(X_test_dtm)\n",
 716 |     "    y_pred_class_sgd = sgd.predict(X_test_dtm)\n",
 717 |     "    return (y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd)"
 718 |    ]
 719 |   },
 720 |   {
 721 |    "cell_type": "code",
 722 |    "execution_count": 33,
 723 |    "metadata": {
 724 |     "collapsed": true
 725 |    },
 726 |    "outputs": [],
 727 |    "source": [
 728 |     "def print_metrices(y_test,y_pred_class,y_pred_class_svc,y_pred_class_lin_svc,y_pred_class_sgd):\n",
 729 |     "    print(\"Accuracy:\")\n",
 730 |     "    print(metrics.accuracy_score(y_test,y_pred_class))\n",
 731 |     "    print(metrics.accuracy_score(y_test,y_pred_class_svc))\n",
 732 |     "    print(metrics.accuracy_score(y_test,y_pred_class_lin_svc))\n",
 733 |     "    print(metrics.accuracy_score(y_test,y_pred_class_sgd))\n",
 734 |     "\n",
 735 |     "    print(\"\\nAverage precision:\")\n",
 736 |     "    print(metrics.precision_score(y_test,y_pred_class,average='micro'))\n",
 737 |     "    print(metrics.precision_score(y_test,y_pred_class_svc,average='micro'))\n",
 738 |     "    print(metrics.precision_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 739 |     "    print(metrics.precision_score(y_test,y_pred_class_sgd,average='micro'))\n",
 740 |     "\n",
 741 |     "    print(\"\\nAverage recall:\")\n",
 742 |     "    print(metrics.recall_score(y_test,y_pred_class,average='micro'))\n",
 743 |     "    print(metrics.recall_score(y_test,y_pred_class_svc,average='micro'))\n",
 744 |     "    print(metrics.recall_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 745 |     "    print(metrics.recall_score(y_test,y_pred_class_sgd,average='micro'))\n",
 746 |     "    \n",
 747 |     "    print(\"\\nAverage f1:\")\n",
 748 |     "    print(metrics.f1_score(y_test,y_pred_class,average='micro'))\n",
 749 |     "    print(metrics.f1_score(y_test,y_pred_class_svc,average='micro'))\n",
 750 |     "    print(metrics.f1_score(y_test,y_pred_class_lin_svc,average='micro'))\n",
 751 |     "    print(metrics.f1_score(y_test,y_pred_class_sgd,average='micro'))\n",
 752 |     "\n",
 753 |     "    print(\"\\nClassification report:\")\n",
 754 |     "    print(metrics.classification_report(y_test, y_pred_class))\n",
 755 |     "    print(metrics.classification_report(y_test, y_pred_class_svc))\n",
 756 |     "    print(metrics.classification_report(y_test, y_pred_class_lin_svc))\n",
 757 |     "    print(metrics.classification_report(y_test, y_pred_class_sgd))"
 758 |    ]
 759 |   },
 760 |   {
 761 |    "cell_type": "code",
 762 |    "execution_count": 34,
 763 |    "metadata": {
 764 |     "collapsed": false
 765 |    },
 766 |    "outputs": [
 767 |     {
 768 |      "name": "stdout",
 769 |      "output_type": "stream",
 770 |      "text": [
 771 |       "Accuracy:\n",
 772 |       "0.15\n",
 773 |       "0.35\n",
 774 |       "0.3125\n",
 775 |       "0.125\n",
 776 |       "\n",
 777 |       "Average precision:\n",
 778 |       "0.857142857143\n",
 779 |       "0.745762711864\n",
 780 |       "0.756097560976\n",
 781 |       "0.704545454545\n",
 782 |       "\n",
 783 |       "Average recall:\n",
 784 |       "0.260869565217\n",
 785 |       "0.797101449275\n",
 786 |       "0.673913043478\n",
 787 |       "0.673913043478\n",
 788 |       "\n",
 789 |       "Average f1:\n",
 790 |       "0.4\n",
 791 |       "0.77057793345\n",
 792 |       "0.712643678161\n",
 793 |       "0.688888888889\n",
 794 |       "\n",
 795 |       "Classification report:\n",
 796 |       "             precision    recall  f1-score   support\n",
 797 |       "\n",
 798 |       "          0       0.00      0.00      0.00        12\n",
 799 |       "          1       0.00      0.00      0.00        14\n",
 800 |       "          2       0.00      0.00      0.00         6\n",
 801 |       "          3       0.00      0.00      0.00         3\n",
 802 |       "          4       0.00      0.00      0.00        15\n",
 803 |       "          5       0.00      0.00      0.00         7\n",
 804 |       "          6       0.00      0.00      0.00         4\n",
 805 |       "          7       0.00      0.00      0.00        26\n",
 806 |       "          8       0.83      0.95      0.89        61\n",
 807 |       "          9       0.00      0.00      0.00        11\n",
 808 |       "         10       1.00      0.39      0.56        36\n",
 809 |       "         11       0.00      0.00      0.00         4\n",
 810 |       "         12       0.00      0.00      0.00        12\n",
 811 |       "         13       0.00      0.00      0.00        18\n",
 812 |       "         14       0.00      0.00      0.00        23\n",
 813 |       "         15       0.00      0.00      0.00         3\n",
 814 |       "         16       0.00      0.00      0.00         7\n",
 815 |       "         17       0.00      0.00      0.00         8\n",
 816 |       "         18       0.00      0.00      0.00         2\n",
 817 |       "         19       0.00      0.00      0.00         4\n",
 818 |       "\n",
 819 |       "avg / total       0.31      0.26      0.27       276\n",
 820 |       "\n",
 821 |       "             precision    recall  f1-score   support\n",
 822 |       "\n",
 823 |       "          0       0.91      0.83      0.87        12\n",
 824 |       "          1       0.67      0.71      0.69        14\n",
 825 |       "          2       0.50      1.00      0.67         6\n",
 826 |       "          3       0.75      1.00      0.86         3\n",
 827 |       "          4       0.65      0.73      0.69        15\n",
 828 |       "          5       0.75      0.43      0.55         7\n",
 829 |       "          6       0.67      0.50      0.57         4\n",
 830 |       "          7       0.71      0.77      0.74        26\n",
 831 |       "          8       0.89      0.93      0.91        61\n",
 832 |       "          9       0.47      0.82      0.60        11\n",
 833 |       "         10       0.89      0.89      0.89        36\n",
 834 |       "         11       0.75      0.75      0.75         4\n",
 835 |       "         12       0.60      0.75      0.67        12\n",
 836 |       "         13       0.72      0.72      0.72        18\n",
 837 |       "         14       0.75      0.91      0.82        23\n",
 838 |       "         15       0.67      0.67      0.67         3\n",
 839 |       "         16       0.25      0.14      0.18         7\n",
 840 |       "         17       0.80      0.50      0.62         8\n",
 841 |       "         18       0.67      1.00      0.80         2\n",
 842 |       "         19       1.00      0.50      0.67         4\n",
 843 |       "\n",
 844 |       "avg / total       0.76      0.80      0.77       276\n",
 845 |       "\n",
 846 |       "             precision    recall  f1-score   support\n",
 847 |       "\n",
 848 |       "          0       1.00      0.75      0.86        12\n",
 849 |       "          1       0.64      0.64      0.64        14\n",
 850 |       "          2       0.60      0.50      0.55         6\n",
 851 |       "          3       0.75      1.00      0.86         3\n",
 852 |       "          4       0.69      0.60      0.64        15\n",
 853 |       "          5       1.00      0.14      0.25         7\n",
 854 |       "          6       0.00      0.00      0.00         4\n",
 855 |       "          7       0.68      0.65      0.67        26\n",
 856 |       "          8       0.88      0.92      0.90        61\n",
 857 |       "          9       0.44      0.73      0.55        11\n",
 858 |       "         10       0.89      0.86      0.87        36\n",
 859 |       "         11       1.00      0.50      0.67         4\n",
 860 |       "         12       0.45      0.42      0.43        12\n",
 861 |       "         13       0.73      0.61      0.67        18\n",
 862 |       "         14       0.71      0.65      0.68        23\n",
 863 |       "         15       1.00      0.33      0.50         3\n",
 864 |       "         16       0.00      0.00      0.00         7\n",
 865 |       "         17       0.75      0.38      0.50         8\n",
 866 |       "         18       0.67      1.00      0.80         2\n",
 867 |       "         19       1.00      0.25      0.40         4\n",
 868 |       "\n",
 869 |       "avg / total       0.74      0.67      0.69       276\n",
 870 |       "\n",
 871 |       "             precision    recall  f1-score   support\n",
 872 |       "\n",
 873 |       "          0       1.00      0.58      0.74        12\n",
 874 |       "          1       0.64      1.00      0.78        14\n",
 875 |       "          2       0.50      0.83      0.62         6\n",
 876 |       "          3       0.75      1.00      0.86         3\n",
 877 |       "          4       0.72      0.87      0.79        15\n",
 878 |       "          5       0.50      0.14      0.22         7\n",
 879 |       "          6       0.75      0.75      0.75         4\n",
 880 |       "          7       0.53      0.31      0.39        26\n",
 881 |       "          8       0.88      0.85      0.87        61\n",
 882 |       "          9       0.38      0.55      0.44        11\n",
 883 |       "         10       0.78      0.97      0.86        36\n",
 884 |       "         11       0.80      1.00      0.89         4\n",
 885 |       "         12       0.67      0.17      0.27        12\n",
 886 |       "         13       0.73      0.44      0.55        18\n",
 887 |       "         14       0.73      0.83      0.78        23\n",
 888 |       "         15       0.00      0.00      0.00         3\n",
 889 |       "         16       0.00      0.00      0.00         7\n",
 890 |       "         17       0.33      0.12      0.18         8\n",
 891 |       "         18       0.67      1.00      0.80         2\n",
 892 |       "         19       0.30      0.75      0.43         4\n",
 893 |       "\n",
 894 |       "avg / total       0.69      0.67      0.65       276\n",
 895 |       "\n"
 896 |      ]
 897 |     }
 898 |    ],
 899 |    "source": [
 900 |     "#For positive sentiment classifier\n",
 901 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
 902 |     "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n",
 903 |     "\n",
 904 |     "df_train_positive = get_positive_data_frame(df_train,most_common_aspect)\n",
 905 |     "df_test_positive = get_positive_data_frame(df_test,most_common_aspect)\n",
 906 |     "y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos=classify_sentiment(df_train_positive,df_test_positive,X_train_aspect_dtm,X_test_aspect_dtm)\n",
 907 |     "with warnings.catch_warnings():\n",
 908 |     "    warnings.simplefilter(\"ignore\")\n",
 909 |     "    print_metrices(y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos)"
 910 |    ]
 911 |   },
 912 |   {
 913 |    "cell_type": "code",
 914 |    "execution_count": 35,
 915 |    "metadata": {
 916 |     "collapsed": false
 917 |    },
 918 |    "outputs": [
 919 |     {
 920 |      "name": "stdout",
 921 |      "output_type": "stream",
 922 |      "text": [
 923 |       "Accuracy:\n",
 924 |       "0.4875\n",
 925 |       "0.4875\n",
 926 |       "0.4625\n",
 927 |       "0.3375\n",
 928 |       "\n",
 929 |       "Average precision:\n",
 930 |       "0.7\n",
 931 |       "0.625\n",
 932 |       "0.666666666667\n",
 933 |       "0.449438202247\n",
 934 |       "\n",
 935 |       "Average recall:\n",
 936 |       "0.0642201834862\n",
 937 |       "0.412844036697\n",
 938 |       "0.330275229358\n",
 939 |       "0.366972477064\n",
 940 |       "\n",
 941 |       "Average f1:\n",
 942 |       "0.117647058824\n",
 943 |       "0.497237569061\n",
 944 |       "0.441717791411\n",
 945 |       "0.40404040404\n",
 946 |       "\n",
 947 |       "Classification report:\n",
 948 |       "             precision    recall  f1-score   support\n",
 949 |       "\n",
 950 |       "          0       0.00      0.00      0.00         2\n",
 951 |       "          1       0.00      0.00      0.00        10\n",
 952 |       "          2       0.00      0.00      0.00         2\n",
 953 |       "          3       0.00      0.00      0.00         1\n",
 954 |       "          4       0.00      0.00      0.00         4\n",
 955 |       "          5       0.00      0.00      0.00         1\n",
 956 |       "          6       0.00      0.00      0.00         3\n",
 957 |       "          7       0.00      0.00      0.00         3\n",
 958 |       "          8       0.67      0.33      0.44        18\n",
 959 |       "          9       0.00      0.00      0.00         9\n",
 960 |       "         10       0.00      0.00      0.00        10\n",
 961 |       "         11       0.00      0.00      0.00         1\n",
 962 |       "         12       0.00      0.00      0.00        12\n",
 963 |       "         13       1.00      0.09      0.17        11\n",
 964 |       "         14       0.00      0.00      0.00         6\n",
 965 |       "         15       0.00      0.00      0.00         0\n",
 966 |       "         16       0.00      0.00      0.00         2\n",
 967 |       "         17       0.00      0.00      0.00         5\n",
 968 |       "         18       0.00      0.00      0.00         2\n",
 969 |       "         19       0.00      0.00      0.00         7\n",
 970 |       "\n",
 971 |       "avg / total       0.21      0.06      0.09       109\n",
 972 |       "\n",
 973 |       "             precision    recall  f1-score   support\n",
 974 |       "\n",
 975 |       "          0       0.50      0.50      0.50         2\n",
 976 |       "          1       0.67      0.60      0.63        10\n",
 977 |       "          2       0.00      0.00      0.00         2\n",
 978 |       "          3       0.00      0.00      0.00         1\n",
 979 |       "          4       0.50      0.25      0.33         4\n",
 980 |       "          5       0.00      0.00      0.00         1\n",
 981 |       "          6       1.00      0.67      0.80         3\n",
 982 |       "          7       0.00      0.00      0.00         3\n",
 983 |       "          8       0.77      0.56      0.65        18\n",
 984 |       "          9       1.00      0.22      0.36         9\n",
 985 |       "         10       0.78      0.70      0.74        10\n",
 986 |       "         11       0.00      0.00      0.00         1\n",
 987 |       "         12       0.75      0.25      0.38        12\n",
 988 |       "         13       0.70      0.64      0.67        11\n",
 989 |       "         14       0.00      0.00      0.00         6\n",
 990 |       "         15       0.00      0.00      0.00         0\n",
 991 |       "         16       0.00      0.00      0.00         2\n",
 992 |       "         17       0.00      0.00      0.00         5\n",
 993 |       "         18       0.00      0.00      0.00         2\n",
 994 |       "         19       0.75      0.86      0.80         7\n",
 995 |       "\n",
 996 |       "avg / total       0.60      0.41      0.47       109\n",
 997 |       "\n",
 998 |       "             precision    recall  f1-score   support\n",
 999 |       "\n",
1000 |       "          0       1.00      0.50      0.67         2\n",
1001 |       "          1       0.67      0.40      0.50        10\n",
1002 |       "          2       0.00      0.00      0.00         2\n",
1003 |       "          3       0.00      0.00      0.00         1\n",
1004 |       "          4       0.33      0.25      0.29         4\n",
1005 |       "          5       0.00      0.00      0.00         1\n",
1006 |       "          6       1.00      0.33      0.50         3\n",
1007 |       "          7       0.00      0.00      0.00         3\n",
1008 |       "          8       0.77      0.56      0.65        18\n",
1009 |       "          9       0.00      0.00      0.00         9\n",
1010 |       "         10       0.86      0.60      0.71        10\n",
1011 |       "         11       0.00      0.00      0.00         1\n",
1012 |       "         12       1.00      0.08      0.15        12\n",
1013 |       "         13       0.70      0.64      0.67        11\n",
1014 |       "         14       0.00      0.00      0.00         6\n",
1015 |       "         15       0.00      0.00      0.00         0\n",
1016 |       "         16       0.00      0.00      0.00         2\n",
1017 |       "         17       0.00      0.00      0.00         5\n",
1018 |       "         18       0.00      0.00      0.00         2\n",
1019 |       "         19       0.83      0.71      0.77         7\n",
1020 |       "\n",
1021 |       "avg / total       0.56      0.33      0.39       109\n",
1022 |       "\n",
1023 |       "             precision    recall  f1-score   support\n",
1024 |       "\n",
1025 |       "          0       1.00      0.50      0.67         2\n",
1026 |       "          1       0.71      0.50      0.59        10\n",
1027 |       "          2       0.00      0.00      0.00         2\n",
1028 |       "          3       0.00      0.00      0.00         1\n",
1029 |       "          4       1.00      0.25      0.40         4\n",
1030 |       "          5       0.00      0.00      0.00         1\n",
1031 |       "          6       0.67      0.67      0.67         3\n",
1032 |       "          7       0.13      0.67      0.22         3\n",
1033 |       "          8       0.38      0.67      0.48        18\n",
1034 |       "          9       1.00      0.11      0.20         9\n",
1035 |       "         10       0.50      0.70      0.58        10\n",
1036 |       "         11       0.00      0.00      0.00         1\n",
1037 |       "         12       0.00      0.00      0.00        12\n",
1038 |       "         13       0.75      0.27      0.40        11\n",
1039 |       "         14       0.00      0.00      0.00         6\n",
1040 |       "         15       0.00      0.00      0.00         0\n",
1041 |       "         16       0.00      0.00      0.00         2\n",
1042 |       "         17       0.00      0.00      0.00         5\n",
1043 |       "         18       0.00      0.00      0.00         2\n",
1044 |       "         19       0.86      0.86      0.86         7\n",
1045 |       "\n",
1046 |       "avg / total       0.46      0.37      0.35       109\n",
1047 |       "\n"
1048 |      ]
1049 |     }
1050 |    ],
1051 |    "source": [
1052 |     "#For negative sentiment classifier\n",
1053 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1054 |     "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n",
1055 |     "\n",
1056 |     "df_train_neg = get_negative_data_frame(df_train,most_common_aspect)\n",
1057 |     "df_test_neg = get_negative_data_frame(df_test,most_common_aspect)\n",
1058 |     "\n",
1059 |     "y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg=classify_sentiment(df_train_neg,df_test_neg,X_train_aspect_dtm,X_test_aspect_dtm)\n",
1060 |     "with warnings.catch_warnings():\n",
1061 |     "    warnings.simplefilter(\"ignore\")\n",
1062 |     "    print_metrices(y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg)"
1063 |    ]
1064 |   },
1065 |   {
1066 |    "cell_type": "code",
1067 |    "execution_count": 36,
1068 |    "metadata": {
1069 |     "collapsed": false
1070 |    },
1071 |    "outputs": [
1072 |     {
1073 |      "name": "stdout",
1074 |      "output_type": "stream",
1075 |      "text": [
1076 |       "Accuracy:\n",
1077 |       "0.7375\n",
1078 |       "0.725\n",
1079 |       "0.7375\n",
1080 |       "0.5875\n",
1081 |       "\n",
1082 |       "Average precision:\n",
1083 |       "0.0\n",
1084 |       "0.153846153846\n",
1085 |       "0.333333333333\n",
1086 |       "0.105263157895\n",
1087 |       "\n",
1088 |       "Average recall:\n",
1089 |       "0.0\n",
1090 |       "0.0714285714286\n",
1091 |       "0.0357142857143\n",
1092 |       "0.0714285714286\n",
1093 |       "\n",
1094 |       "Average f1:\n",
1095 |       "0.0\n",
1096 |       "0.0975609756098\n",
1097 |       "0.0645161290323\n",
1098 |       "0.0851063829787\n",
1099 |       "\n",
1100 |       "Classification report:\n",
1101 |       "             precision    recall  f1-score   support\n",
1102 |       "\n",
1103 |       "          0       0.00      0.00      0.00         0\n",
1104 |       "          1       0.00      0.00      0.00         0\n",
1105 |       "          2       0.00      0.00      0.00         4\n",
1106 |       "          3       0.00      0.00      0.00         0\n",
1107 |       "          4       0.00      0.00      0.00         2\n",
1108 |       "          5       0.00      0.00      0.00         0\n",
1109 |       "          6       0.00      0.00      0.00         0\n",
1110 |       "          7       0.00      0.00      0.00        10\n",
1111 |       "          8       0.00      0.00      0.00         1\n",
1112 |       "          9       0.00      0.00      0.00         4\n",
1113 |       "         10       0.00      0.00      0.00         0\n",
1114 |       "         11       0.00      0.00      0.00         0\n",
1115 |       "         12       0.00      0.00      0.00         3\n",
1116 |       "         13       0.00      0.00      0.00         0\n",
1117 |       "         14       0.00      0.00      0.00         1\n",
1118 |       "         15       0.00      0.00      0.00         1\n",
1119 |       "         16       0.00      0.00      0.00         0\n",
1120 |       "         17       0.00      0.00      0.00         2\n",
1121 |       "         18       0.00      0.00      0.00         0\n",
1122 |       "         19       0.00      0.00      0.00         0\n",
1123 |       "\n",
1124 |       "avg / total       0.00      0.00      0.00        28\n",
1125 |       "\n",
1126 |       "             precision    recall  f1-score   support\n",
1127 |       "\n",
1128 |       "          0       0.00      0.00      0.00         0\n",
1129 |       "          1       0.00      0.00      0.00         0\n",
1130 |       "          2       0.00      0.00      0.00         4\n",
1131 |       "          3       0.00      0.00      0.00         0\n",
1132 |       "          4       0.00      0.00      0.00         2\n",
1133 |       "          5       0.00      0.00      0.00         0\n",
1134 |       "          6       0.00      0.00      0.00         0\n",
1135 |       "          7       0.29      0.20      0.24        10\n",
1136 |       "          8       0.00      0.00      0.00         1\n",
1137 |       "          9       0.00      0.00      0.00         4\n",
1138 |       "         10       0.00      0.00      0.00         0\n",
1139 |       "         11       0.00      0.00      0.00         0\n",
1140 |       "         12       0.00      0.00      0.00         3\n",
1141 |       "         13       0.00      0.00      0.00         0\n",
1142 |       "         14       0.00      0.00      0.00         1\n",
1143 |       "         15       0.00      0.00      0.00         1\n",
1144 |       "         16       0.00      0.00      0.00         0\n",
1145 |       "         17       0.00      0.00      0.00         2\n",
1146 |       "         18       0.00      0.00      0.00         0\n",
1147 |       "         19       0.00      0.00      0.00         0\n",
1148 |       "\n",
1149 |       "avg / total       0.10      0.07      0.08        28\n",
1150 |       "\n",
1151 |       "             precision    recall  f1-score   support\n",
1152 |       "\n",
1153 |       "          0       0.00      0.00      0.00         0\n",
1154 |       "          1       0.00      0.00      0.00         0\n",
1155 |       "          2       0.00      0.00      0.00         4\n",
1156 |       "          3       0.00      0.00      0.00         0\n",
1157 |       "          4       0.00      0.00      0.00         2\n",
1158 |       "          5       0.00      0.00      0.00         0\n",
1159 |       "          6       0.00      0.00      0.00         0\n",
1160 |       "          7       0.33      0.10      0.15        10\n",
1161 |       "          8       0.00      0.00      0.00         1\n",
1162 |       "          9       0.00      0.00      0.00         4\n",
1163 |       "         10       0.00      0.00      0.00         0\n",
1164 |       "         11       0.00      0.00      0.00         0\n",
1165 |       "         12       0.00      0.00      0.00         3\n",
1166 |       "         13       0.00      0.00      0.00         0\n",
1167 |       "         14       0.00      0.00      0.00         1\n",
1168 |       "         15       0.00      0.00      0.00         1\n",
1169 |       "         16       0.00      0.00      0.00         0\n",
1170 |       "         17       0.00      0.00      0.00         2\n",
1171 |       "         18       0.00      0.00      0.00         0\n",
1172 |       "         19       0.00      0.00      0.00         0\n",
1173 |       "\n",
1174 |       "avg / total       0.12      0.04      0.05        28\n",
1175 |       "\n",
1176 |       "             precision    recall  f1-score   support\n",
1177 |       "\n",
1178 |       "          0       0.00      0.00      0.00         0\n",
1179 |       "          1       0.00      0.00      0.00         0\n",
1180 |       "          2       0.00      0.00      0.00         4\n",
1181 |       "          3       0.00      0.00      0.00         0\n",
1182 |       "          4       0.00      0.00      0.00         2\n",
1183 |       "          5       0.00      0.00      0.00         0\n",
1184 |       "          6       0.00      0.00      0.00         0\n",
1185 |       "          7       0.00      0.00      0.00        10\n",
1186 |       "          8       0.00      0.00      0.00         1\n",
1187 |       "          9       0.00      0.00      0.00         4\n",
1188 |       "         10       0.00      0.00      0.00         0\n",
1189 |       "         11       0.00      0.00      0.00         0\n",
1190 |       "         12       0.18      0.67      0.29         3\n",
1191 |       "         13       0.00      0.00      0.00         0\n",
1192 |       "         14       0.00      0.00      0.00         1\n",
1193 |       "         15       0.00      0.00      0.00         1\n",
1194 |       "         16       0.00      0.00      0.00         0\n",
1195 |       "         17       0.00      0.00      0.00         2\n",
1196 |       "         18       0.00      0.00      0.00         0\n",
1197 |       "         19       0.00      0.00      0.00         0\n",
1198 |       "\n",
1199 |       "avg / total       0.02      0.07      0.03        28\n",
1200 |       "\n"
1201 |      ]
1202 |     }
1203 |    ],
1204 |    "source": [
1205 |     "#For neutral or conflict sentiment classifier\n",
1206 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1207 |     "df_test = get_data_frame(final_test_text_list,test_opinion_list,most_common_aspect)\n",
1208 |     "\n",
1209 |     "df_train_neu = get_neutral_data_frame(df_train,most_common_aspect)\n",
1210 |     "df_test_neu = get_neutral_data_frame(df_test,most_common_aspect)\n",
1211 |     "\n",
1212 |     "y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu=classify_sentiment(df_train_neu,df_test_neu,X_train_aspect_dtm,X_test_aspect_dtm)\n",
1213 |     "with warnings.catch_warnings():\n",
1214 |     "    warnings.simplefilter(\"ignore\")\n",
1215 |     "    print_metrices(y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu)"
1216 |    ]
1217 |   },
1218 |   {
1219 |    "cell_type": "code",
1220 |    "execution_count": 37,
1221 |    "metadata": {
1222 |     "collapsed": false,
1223 |     "scrolled": false
1224 |    },
1225 |    "outputs": [
1226 |     {
1227 |      "name": "stdout",
1228 |      "output_type": "stream",
1229 |      "text": [
1230 |       "Enter a laptop review:\n",
1231 |       "\n",
1232 |       "This is my first asus laptop. So far i am really enjoying this laptop. 512GB SSD is super fast. Battery life is also good and can last very long. I have no complain on screen quality too as display supports 4k videos. Maybe that is why it costs a lot. This is an expensive laptop and it's price is very high compared to other laptops of similar specs. So, if you have no trouble paying for this laptop, it is pretty good.\n"
1233 |      ]
1234 |     },
1235 |     {
1236 |      "data": {
1237 |       "text/plain": [
1238 |        "array([[1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]])"
1239 |       ]
1240 |      },
1241 |      "execution_count": 37,
1242 |      "metadata": {},
1243 |      "output_type": "execute_result"
1244 |     }
1245 |    ],
1246 |    "source": [
1247 |     "#Aspect Based Sentiment analyis of user's input.\n",
1248 |     "user_input=input(\"Enter a laptop review:\\n\\n\")\n",
1249 |     "#Preprocessing and vectorizing\n",
1250 |     "tagged_user_input = posTag([user_input])\n",
1251 |     "filter_tagged_user_input = filterTag(tagged_user_input)\n",
1252 |     "\n",
1253 |     "user_input_series=pd.Series(filter_tagged_user_input)\n",
1254 |     "user_input_series_dtm=vect.transform(user_input_series)\n",
1255 |     "\n",
1256 |     "predict_aspect= svc.predict(user_input_series_dtm)\n",
1257 |     "extra_feature=get_dict_aspect(predict_aspect, most_common_aspect)\n",
1258 |     "extra_feature_dtm=DictVectorizer().fit_transform(extra_feature)\n",
1259 |     "predict_aspect"
1260 |    ]
1261 |   },
1262 |   {
1263 |    "cell_type": "code",
1264 |    "execution_count": 38,
1265 |    "metadata": {
1266 |     "collapsed": false
1267 |    },
1268 |    "outputs": [
1269 |     {
1270 |      "data": {
1271 |       "text/plain": [
1272 |        "array([[1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0]])"
1273 |       ]
1274 |      },
1275 |      "execution_count": 38,
1276 |      "metadata": {},
1277 |      "output_type": "execute_result"
1278 |     }
1279 |    ],
1280 |    "source": [
1281 |     "#predicting weather the dectected aspect is positive or not\n",
1282 |     "test_opinion_list=[]\n",
1283 |     "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1284 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1285 |     "\n",
1286 |     "df_train_positive = get_positive_data_frame(df_train,most_common_aspect)\n",
1287 |     "y_test_pos,y_pred_class_pos,y_pred_class_svc_pos,y_pred_class_lin_svc_pos,y_pred_class_sgd_pos=classify_sentiment(df_train_positive,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1288 |     "\n",
1289 |     "y_pred_class_svc_pos"
1290 |    ]
1291 |   },
1292 |   {
1293 |    "cell_type": "code",
1294 |    "execution_count": 39,
1295 |    "metadata": {
1296 |     "collapsed": false
1297 |    },
1298 |    "outputs": [
1299 |     {
1300 |      "data": {
1301 |       "text/plain": [
1302 |        "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])"
1303 |       ]
1304 |      },
1305 |      "execution_count": 39,
1306 |      "metadata": {},
1307 |      "output_type": "execute_result"
1308 |     }
1309 |    ],
1310 |    "source": [
1311 |     "#predicting weather the dectected aspect is negative or not\n",
1312 |     "test_opinion_list=[]\n",
1313 |     "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1314 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1315 |     "\n",
1316 |     "df_train_negative = get_negative_data_frame(df_train,most_common_aspect)\n",
1317 |     "y_test_neg,y_pred_class_neg,y_pred_class_svc_neg,y_pred_class_lin_svc_neg,y_pred_class_sgd_neg=classify_sentiment(df_train_negative,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1318 |     "\n",
1319 |     "y_pred_class_svc_neg"
1320 |    ]
1321 |   },
1322 |   {
1323 |    "cell_type": "code",
1324 |    "execution_count": 40,
1325 |    "metadata": {
1326 |     "collapsed": false
1327 |    },
1328 |    "outputs": [
1329 |     {
1330 |      "data": {
1331 |       "text/plain": [
1332 |        "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])"
1333 |       ]
1334 |      },
1335 |      "execution_count": 40,
1336 |      "metadata": {},
1337 |      "output_type": "execute_result"
1338 |     }
1339 |    ],
1340 |    "source": [
1341 |     "#predicting weather the dectected aspect is neutral or coflict or not\n",
1342 |     "test_opinion_list=[]\n",
1343 |     "df_test = get_data_frame(filter_tagged_user_input,test_opinion_list,most_common_aspect)\n",
1344 |     "df_train = get_data_frame(final_train_text_list,train_opinion_list,most_common_aspect)\n",
1345 |     "\n",
1346 |     "df_train_neutral = get_neutral_data_frame(df_train,most_common_aspect)\n",
1347 |     "y_test_neu,y_pred_class_neu,y_pred_class_svc_neu,y_pred_class_lin_svc_neu,y_pred_class_sgd_neu=classify_sentiment(df_train_neutral,df_test,X_train_aspect_dtm,extra_feature_dtm)\n",
1348 |     "\n",
1349 |     "y_pred_class_svc_neu"
1350 |    ]
1351 |   },
1352 |   {
1353 |    "cell_type": "code",
1354 |    "execution_count": 41,
1355 |    "metadata": {
1356 |     "collapsed": false
1357 |    },
1358 |    "outputs": [
1359 |     {
1360 |      "data": {
1361 |       "text/plain": [
1362 |        "[0, 3, 8, 10, 13]"
1363 |       ]
1364 |      },
1365 |      "execution_count": 41,
1366 |      "metadata": {},
1367 |      "output_type": "execute_result"
1368 |     }
1369 |    ],
1370 |    "source": [
1371 |     "#Finding the aspect that is positive\n",
1372 |     "index_positive=[]\n",
1373 |     "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_pos.tolist()[0])):\n",
1374 |     "    if a ==1 and b==1:\n",
1375 |     "        index_positive.append(i)\n",
1376 |     "index_positive         "
1377 |    ]
1378 |   },
1379 |   {
1380 |    "cell_type": "code",
1381 |    "execution_count": 42,
1382 |    "metadata": {
1383 |     "collapsed": false
1384 |    },
1385 |    "outputs": [
1386 |     {
1387 |      "data": {
1388 |       "text/plain": [
1389 |        "[]"
1390 |       ]
1391 |      },
1392 |      "execution_count": 42,
1393 |      "metadata": {},
1394 |      "output_type": "execute_result"
1395 |     }
1396 |    ],
1397 |    "source": [
1398 |     "#Finding the aspect that is negative\n",
1399 |     "index_negative=[]\n",
1400 |     "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_neg.tolist()[0])):\n",
1401 |     "    if a ==1 and b==1:\n",
1402 |     "        index_negative.append(i)\n",
1403 |     "index_negative         "
1404 |    ]
1405 |   },
1406 |   {
1407 |    "cell_type": "code",
1408 |    "execution_count": 43,
1409 |    "metadata": {
1410 |     "collapsed": false
1411 |    },
1412 |    "outputs": [
1413 |     {
1414 |      "data": {
1415 |       "text/plain": [
1416 |        "[12]"
1417 |       ]
1418 |      },
1419 |      "execution_count": 43,
1420 |      "metadata": {},
1421 |      "output_type": "execute_result"
1422 |     }
1423 |    ],
1424 |    "source": [
1425 |     "#Finding the aspect that is neutral\n",
1426 |     "index_neutral=[]\n",
1427 |     "for i, (a, b) in enumerate(zip(predict_aspect.tolist()[0], y_pred_class_svc_neu.tolist()[0])):\n",
1428 |     "    if a ==1 and b==1:\n",
1429 |     "        index_neutral.append(i)\n",
1430 |     "index_neutral         "
1431 |    ]
1432 |   },
1433 |   {
1434 |    "cell_type": "code",
1435 |    "execution_count": 44,
1436 |    "metadata": {
1437 |     "collapsed": true
1438 |    },
1439 |    "outputs": [],
1440 |    "source": [
1441 |     "output=[]"
1442 |    ]
1443 |   },
1444 |   {
1445 |    "cell_type": "code",
1446 |    "execution_count": 45,
1447 |    "metadata": {
1448 |     "collapsed": false
1449 |    },
1450 |    "outputs": [],
1451 |    "source": [
1452 |     "if index_positive:\n",
1453 |     "    for index in index_positive:\n",
1454 |     "        output.append(sorted(most_common_aspect)[index]+\": positive\")"
1455 |    ]
1456 |   },
1457 |   {
1458 |    "cell_type": "code",
1459 |    "execution_count": 46,
1460 |    "metadata": {
1461 |     "collapsed": false
1462 |    },
1463 |    "outputs": [],
1464 |    "source": [
1465 |     "if index_negative:\n",
1466 |     "    for index in index_negative:\n",
1467 |     "        output.append(sorted(most_common_aspect)[index]+\": negative\")"
1468 |    ]
1469 |   },
1470 |   {
1471 |    "cell_type": "code",
1472 |    "execution_count": 47,
1473 |    "metadata": {
1474 |     "collapsed": false
1475 |    },
1476 |    "outputs": [],
1477 |    "source": [
1478 |     "if index_neutral:\n",
1479 |     "    for index in index_neutral:\n",
1480 |     "        output.append(sorted(most_common_aspect)[index]+\": neutral or conflict\")"
1481 |    ]
1482 |   },
1483 |   {
1484 |    "cell_type": "code",
1485 |    "execution_count": 48,
1486 |    "metadata": {
1487 |     "collapsed": false
1488 |    },
1489 |    "outputs": [
1490 |     {
1491 |      "data": {
1492 |       "text/plain": [
1493 |        "['BATTERY_OPERATION_PERFORMANCE: positive',\n",
1494 |        " 'DISPLAY_GENERAL: positive',\n",
1495 |        " 'LAPTOP_GENERAL: positive',\n",
1496 |        " 'LAPTOP_OPERATION_PERFORMANCE: positive',\n",
1497 |        " 'LAPTOP_QUALITY: positive',\n",
1498 |        " 'LAPTOP_PRICE: neutral or conflict']"
1499 |       ]
1500 |      },
1501 |      "execution_count": 48,
1502 |      "metadata": {},
1503 |      "output_type": "execute_result"
1504 |     }
1505 |    ],
1506 |    "source": [
1507 |     "#Prediction of Aspect Based Sentiment Analaysis for user's input\n",
1508 |     "output"
1509 |    ]
1510 |   }
1511 |  ],
1512 |  "metadata": {
1513 |   "kernelspec": {
1514 |    "display_name": "Python 3",
1515 |    "language": "python",
1516 |    "name": "python3"
1517 |   },
1518 |   "language_info": {
1519 |    "codemirror_mode": {
1520 |     "name": "ipython",
1521 |     "version": 3
1522 |    },
1523 |    "file_extension": ".py",
1524 |    "mimetype": "text/x-python",
1525 |    "name": "python",
1526 |    "nbconvert_exporter": "python",
1527 |    "pygments_lexer": "ipython3",
1528 |    "version": "3.6.0"
1529 |   }
1530 |  },
1531 |  "nbformat": 4,
1532 |  "nbformat_minor": 2
1533 | }
1534 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Aspect-Based-Sentiment-Analysis
 2 | 
 3 | Aspect Based Sentiment Analysis task focuses on the recognition of aspect category and classification of emotions (positive, negative, neutral) in text. 
 4 | The aim of Aspect Based Sentiment Analysis is to determine the sentiment polarity expressed towards certain aspect. 
 5 | This system is based on supervised learning using Support Vector Machine (SVM). 
 6 | Multi label svc classifier is used to classify the laptop reviews into 20 aspect categories and also to classify the review as positive, negative or neutral. For each sentiment, a multi label svc classifier is used. Performance of the system is also compared with other classifiers. 
 7 | 
 8 | # Dataset
 9 | [Dataset is avaibale at SemEval website.](http://alt.qcri.org/semeval2016/task5/index.php?id=data-and-tools)
10 | 
11 | # Required libraries
12 |   * Scikit Learn
13 |   * Nltk
14 |   * pandas
15 |   * numpy
16 | 
17 | P.S. This is a Mini-Project for a 2 month crash course (NLP) in the Final Year at the Department of Computer Science and Engineering, Kathmandu University.
18 | 


--------------------------------------------------------------------------------
/tagged_text_list_test.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thestrox/Aspect-Based-Sentiment-Analysis/965a97fbfd933d0b7e94885da6d6cfaa5a0e714d/tagged_text_list_test.pkl


--------------------------------------------------------------------------------
/tagged_text_list_train.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thestrox/Aspect-Based-Sentiment-Analysis/965a97fbfd933d0b7e94885da6d6cfaa5a0e714d/tagged_text_list_train.pkl


--------------------------------------------------------------------------------