├── README.md
└── mtl_sentiment_analysis.ipynb

/README.md:
--------------------------------------------------------------------------------
# MultiTask-Sentiment-Analysis

Draft implementation for our SIGIR 2017 paper: "Multitask Learning for Fine-Grained Twitter Sentiment Analysis". If you find this code useful in your research, please consider citing:

    @inproceedings{Balikas:2017:MLF:3077136.3080702,
     author = {Balikas, Georgios and Moura, Simon and Amini, Massih-Reza},
     title = {Multitask Learning for Fine-Grained Twitter Sentiment Analysis},
     booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
     series = {SIGIR '17},
     year = {2017},
     isbn = {978-1-4503-5022-8},
     location = {Shinjuku, Tokyo, Japan},
     pages = {1005--1008},
     numpages = {4},
     url = {http://doi.acm.org/10.1145/3077136.3080702},
     doi = {10.1145/3077136.3080702},
     acmid = {3080702},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {bilstm, deep learning, multitask learning, sentiment analysis, text classification, text mining, twitter analysis},
    }

--------------------------------------------------------------------------------
/mtl_sentiment_analysis.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using Theano backend.\n"
     ]
    }
   ],
   "source": [
    "from functions import *\n",
    "from twitterTokenizer import Tokenizer\n",
    "import numpy as np, random\n",
    "import subprocess\n",
    "np.random.seed(1337)  # for reproducibility\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "from keras.preprocessing import sequence\n",
    "from keras.utils import np_utils\n",
    "from keras.models import Sequential, Model\n",
    "from keras.layers import Dense, Dropout, Activation, Embedding, Bidirectional, LSTM, Input, merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train, y_train = load_SemEval_from_file('./data/subtaskCE.train_dev.tsv')\n",
    "X_dev, y_dev = load_SemEval_from_file('./data/subtaskCE.devtest.tsv')\n",
    "X_test, y_test = load_SemEval_SubTaskCE_Test('./data/SemEval2016-task4-test.subtask-BCDE.txt', './data/SemEval2016_task4_subtaskC_test_gold.txt')\n",
    "X_train_ternary, y_train_ternary = load_SemEval_subtaskA('./data/subtaskA.downloaded.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((7292, 1368), (1778, 1368), (20632, 1368), (5500, 1368))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_additional = load_sparse_csr('./additional_features/X_train_additional.npz')\n",
    "X_dev_additional = load_sparse_csr('./additional_features/X_dev_additional.npz')\n",
    "X_test_additional = load_sparse_csr('./additional_features/X_test_additional.npz')\n",
    "X_ternary_additional = load_sparse_csr('./additional_features/X_ternary_additional.npz')\n",
68 | "X_train_additional.shape, X_dev_additional.shape, X_test_additional.shape, X_ternary_additional.shape" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "('Train shape', (7292, 14356), 'Dev shape', (1778, 14356), 'Test shape', (20632, 14356), '14356 vocabulary terms found')\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "from sklearn.feature_extraction.text import CountVectorizer\n", 88 | "MAX_FEATURES, MAX_LEN, BATCH_SIZE = 11000, 30, 64\n", 89 | "\n", 90 | "tokenizer = Tokenizer(preserve_case=False)\n", 91 | "\n", 92 | "vec = CountVectorizer( ngram_range=(1,1), analyzer='word', tokenizer=tokenizer.tokenize, stop_words=None)\n", 93 | "vec.fit(X_train+X_train_ternary)\n", 94 | "\n", 95 | "x_train = vec.transform(X_train)\n", 96 | "x_train_ternary = vec.transform(X_train_ternary)\n", 97 | "x_dev = vec.transform(X_dev)\n", 98 | "x_test = vec.transform(X_test)\n", 99 | "\n", 100 | "print(\"Train shape\", x_train.shape, \"Dev shape\", x_dev.shape, \"Test shape\", x_test.shape, \"%d vocabulary terms found\"%len(vec.vocabulary_))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "x_train_nn = np.split(x_train.indices, x_train.indptr[1:-1])\n", 112 | "x_train_ternary_nn = np.split(x_train_ternary.indices, x_train_ternary.indptr[1:-1])\n", 113 | "x_dev_nn = np.split(x_dev.indices, x_dev.indptr[1:-1])\n", 114 | "x_test_nn = np.split(x_test.indices, x_test.indptr[1:-1])" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "Pad sequences (samples x time)\n", 129 | "('X_train shape:', (7292, 30))\n", 130 | "('X_ternary shape:', (5500, 30))\n", 131 | "('X_dev shape:', (1778, 30))\n", 132 | "('X_test shape:', (20632, 30))\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "print('Pad sequences (samples x time)')\n", 138 | "x_train_nn = sequence.pad_sequences(x_train_nn, maxlen=MAX_LEN)\n", 139 | "x_train_ternary_nn = sequence.pad_sequences(x_train_ternary_nn, maxlen=MAX_LEN)\n", 140 | "x_dev_nn = sequence.pad_sequences(x_dev_nn, maxlen=MAX_LEN)\n", 141 | "x_test_nn = sequence.pad_sequences(x_test_nn, maxlen=MAX_LEN)\n", 142 | "print('X_train shape:', x_train_nn.shape)\n", 143 | "print('X_ternary shape:', x_train_ternary_nn.shape)\n", 144 | "print('X_dev shape:', x_dev_nn.shape)\n", 145 | "print('X_test shape:', x_test_nn.shape)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "Found 1193514 word vectors.\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "import os, sys\n", 165 | "EMBEDDING_DIM = 50\n", 166 | "\n", 167 | "embeddings_index = {}\n", 168 | "f = open(os.path.join(\"./data/\", 'glove.twitter.27B.50d.txt'))\n", 169 | "for line in f:\n", 170 | " values = line.split()\n", 171 | " word = values[0]\n", 172 | " coefs = np.asarray(values[1:], dtype='float32')\n", 173 | " embeddings_index[word] = coefs\n", 174 | "f.close()\n", 175 | "\n", 176 | "embedding_matrix = np.zeros((len(vec.vocabulary_) + 1, EMBEDDING_DIM))\n", 177 | 
"for key,val in vec.vocabulary_.iteritems():\n", 178 | " embedding_vector = embeddings_index.get(key)\n", 179 | " if embedding_vector is not None:\n", 180 | " # words not found in embedding index will be all-zeros.\n", 181 | " embedding_matrix[val] = embedding_vector\n", 182 | "print('Found %s word vectors.' % len(embeddings_index))" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 8, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 194 | "mlb = MultiLabelBinarizer(classes=[-2, -1, 0 , 1, 2])\n", 195 | "y_train_nn = mlb.fit_transform([[y] for y in y_train])\n", 196 | "y_test_nn = mlb.transform([[y] for y in y_test])\n", 197 | "\n", 198 | "mlb2 = MultiLabelBinarizer(classes=[-1, 0, 1])\n", 199 | "y_train_nn_ternary = mlb2.fit_transform([[y] for y in y_train_ternary])\n", 200 | "# y_test_nn = mlb.transform([[y] for y in y_train_task2])" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 9, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "from sklearn import utils \n", 212 | "class_weights = utils.compute_class_weight('balanced', [-2, -1, 0, 1, 2], y_train)\n", 213 | "class_weights= {class_id:class_weight for class_id, class_weight in zip(range(5), class_weights)}" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 12, 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "Build models...\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "print('Build models...')\n", 233 | "\n", 234 | "\n", 235 | "main_input = Input(shape=(MAX_LEN,), dtype='int32', name='main_input')\n", 236 | "\n", 237 | "x = Embedding(input_dim = len(vec.vocabulary_)+1, output_dim = EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_LEN, trainable=True, dropout=0.3)(main_input)\n", 238 | "x = BatchNormalization()(x)\n", 239 | "\n", 240 | "lstm_out = Bidirectional(LSTM(output_dim = 50, input_dim = EMBEDDING_DIM, dropout_W=0.3, dropout_U=0.3) )(x)\n", 241 | "\n", 242 | "\n", 243 | "auxiliary_input = Input(shape=(1368,), name='aux_input')\n", 244 | "t_auxiliary_input = Dense(256, activation='tanh')(auxiliary_input)\n", 245 | "t_auxiliary_input = Dropout(0.5)(t_auxiliary_input)\n", 246 | "\n", 247 | "x = merge([lstm_out, t_auxiliary_input], mode='concat')\n", 248 | "\n", 249 | "\n", 250 | "x = Dense(30, activation='tanh', )(x)\n", 251 | "x = Dropout(0.5)(x)\n", 252 | "\n", 253 | "task1_output = Dense(5, activation='softmax', name='main_output')(x)\n", 254 | "task2_output = Dense(3, activation='softmax', name='aux_output')(x)\n", 255 | "\n", 256 | "\n", 257 | "model_task1 = Model(input=[main_input, auxiliary_input], output=[task1_output])\n", 258 | "model_task2 = Model(input=[main_input, auxiliary_input], output=[task2_output])\n", 259 | "\n", 260 | "model_task1.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])\n", 261 | "model_task2.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])\n", 262 | "#model_task1.summary()\n", 263 | "#model_task2.summary()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 13, 269 | "metadata": { 270 | "collapsed": false, 271 | "scrolled": true 272 | }, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | 
"Iteration: 1 \tDEV: 1.92295925391 \tTEST: 1.93709559081\n", 279 | "Iteration: 2 \tDEV: 1.18776405748 \tTEST: 1.05533295264\n", 280 | "Iteration: 3 \tDEV: 0.895966946297 \tTEST: 0.816812569005\n", 281 | "Iteration: 4 \tDEV: 1.02211077479 \tTEST: 0.892998712215\n", 282 | "Iteration: 5 \tDEV: 0.954160935923 \tTEST: 0.826061924628\n", 283 | "Iteration: 6 \tDEV: 0.924547925944 \tTEST: 0.786129369759\n", 284 | "Iteration: 7 \tDEV: 0.854076047407 \tTEST: 0.735604932049\n", 285 | "Iteration: 8 \tDEV: 0.799235732629 \tTEST: 0.657763888703\n", 286 | "Iteration: 9 \tDEV: 0.833436881142 \tTEST: 0.726908912671\n", 287 | "Iteration: 10 \tDEV: 0.784366450381 \tTEST: 0.682021796296\n", 288 | "Iteration: 11 \tDEV: 0.755167897014 \tTEST: 0.665676846315\n", 289 | "Iteration: 12 \tDEV: 0.81856971338 \tTEST: 0.71937841112\n", 290 | "Iteration: 13 \tDEV: 0.762962818998 \tTEST: 0.658635105364\n", 291 | "Iteration: 14 \tDEV: 0.773821181079 \tTEST: 0.708637987154\n", 292 | "Iteration: 15 \tDEV: 0.775601371349 \tTEST: 0.665637346606\n", 293 | "Iteration: 16 \tDEV: 0.759631361236 \tTEST: 0.703419706797\n", 294 | "Iteration: 17 \tDEV: 0.787738085984 \tTEST: 0.751457377915\n", 295 | "Iteration: 18 \tDEV: 0.804429359786 \tTEST: 0.747156926376\n", 296 | "Iteration: 19 \tDEV: 0.815791954936 \tTEST: 0.683381326876\n", 297 | "Iteration: 20 \tDEV: 0.850576735275 \tTEST: 0.756387118012\n", 298 | "Iteration: 21 \tDEV: 0.790620373789 \tTEST: 0.739320606845\n", 299 | "Iteration: 22 \tDEV: 0.764482852452 \tTEST: 0.683031976188\n", 300 | "Iteration: 23 \tDEV: 0.803140795212 \tTEST: 0.725999022617\n", 301 | "Iteration: 24 \tDEV: 0.808647040185 \tTEST: 0.744529571075\n", 302 | "Iteration: 25 \tDEV: 0.817937194796 \tTEST: 0.706654354228\n", 303 | "Iteration: 26 \tDEV: 0.807327637535 \tTEST: 0.768356525751\n", 304 | "Iteration: 27 \tDEV: 0.861871030075 \tTEST: 0.756031051972\n", 305 | "Iteration: 28 \tDEV: 0.854804707783 \tTEST: 0.821395129217\n", 306 | "Iteration: 29 \tDEV: 0.823053542862 \tTEST: 0.783426924025\n", 307 | "Iteration: 30 \tDEV: 0.844449656178 \tTEST: 0.747247759193\n", 308 | "Iteration: 31 \tDEV: 0.824878779691 \tTEST: 0.797660345422\n", 309 | "Iteration: 32 \tDEV: 0.825061301588 \tTEST: 0.767186215062\n", 310 | "Iteration: 33 \tDEV: 0.83082769645 \tTEST: 0.7776032457\n", 311 | "Iteration: 34 \tDEV: 0.80440427363 \tTEST: 0.742673350311\n", 312 | "Iteration: 35 \tDEV: 0.828627354066 \tTEST: 0.744338298501\n", 313 | "Iteration: 36 \tDEV: 0.819334882033 \tTEST: 0.729371547829\n", 314 | "Iteration: 37 \tDEV: 0.901746496735 \tTEST: 0.783955756206\n", 315 | "Iteration: 38 \tDEV: 0.903028631744 \tTEST: 0.752098017949\n", 316 | "Iteration: 39 \tDEV: 0.795337867354 \tTEST: 0.722056277475\n", 317 | "Iteration: 40 \tDEV: 0.844048296355 \tTEST: 0.768640202983\n", 318 | "Iteration: 41 \tDEV: 0.780133585328 \tTEST: 0.741161528006\n", 319 | "Iteration: 42 \tDEV: 0.803202129502 \tTEST: 0.766803837581\n", 320 | "Iteration: 43 \tDEV: 0.825221027768 \tTEST: 0.769970364866\n", 321 | "Iteration: 44 \tDEV: 0.84310632542 \tTEST: 0.787748529746\n", 322 | "Iteration: 45 \tDEV: 0.829183898124 \tTEST: 0.773112306279\n", 323 | "Iteration: 46 \tDEV: 0.795794030928 \tTEST: 0.74658846479\n", 324 | "Iteration: 47 \tDEV: 0.775570256578 \tTEST: 0.737646257549\n", 325 | "Iteration: 48 \tDEV: 0.817065613571 \tTEST: 0.746390413085\n", 326 | "Iteration: 49 \tDEV: 0.903140564291 \tTEST: 0.811813157035\n", 327 | "Iteration: 50 \tDEV: 0.838185184101 \tTEST: 0.750441266774\n", 328 | "Iteration: 51 \tDEV: 0.871796566588 \tTEST: 0.759380888795\n", 329 | 
"Iteration: 52 \tDEV: 0.811734337629 \tTEST: 0.742526685269\n", 330 | "Iteration: 53 \tDEV: 0.804882941058 \tTEST: 0.746097919402\n", 331 | "[0.75516789701430687, 0.66567684631518431]\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "BATCH_SIZE = 128\n", 337 | "results = []\n", 338 | "for batch in range(600*5):\n", 339 | " nb_rand = \n", 340 | " if random.random() < 1.0:\n", 341 | " sample = np.random.randint(0, len(x_train_nn), BATCH_SIZE)\n", 342 | " x_sampled, y_sampled, x_aux = x_train_nn[sample], y_train_nn[sample], X_train_additional[sample]\n", 343 | " model_task1.train_on_batch({'main_input': x_sampled, 'aux_input': x_aux.todense() }, [y_sampled], class_weight=class_weights, sample_weight=None)\n", 344 | " else:\n", 345 | " sample = np.random.randint(0, len(x_train_ternary_nn), BATCH_SIZE)\n", 346 | " x_sampled, y_sampled, x_aux = x_train_ternary_nn[sample], y_train_nn_ternary[sample], X_ternary_additional[sample]\n", 347 | " model_task2.train_on_batch({'main_input': x_sampled, 'aux_input': x_aux.todense()}, [y_sampled], class_weight=None, sample_weight=None)\n", 348 | " \n", 349 | " if batch%57==0:\n", 350 | " dev_preds = np.argmax(model_task1.predict({'main_input': x_dev_nn,'aux_input': X_dev_additional.todense() }, batch_size=BATCH_SIZE, verbose=0), axis=1)\n", 351 | " test_preds = np.argmax(model_task1.predict({'main_input': x_test_nn, 'aux_input': X_test_additional.todense()}, batch_size=BATCH_SIZE, verbose=0), axis=1)\n", 352 | " results.append([macroMAE(y_dev, dev_preds-2), macroMAE(y_test, test_preds-2)])\n", 353 | " print \"Iteration:\", int(batch/57)+1, \"\\tDEV:\", results[-1][0], \"\\tTEST:\", results[-1][1]\n", 354 | " \n", 355 | "#68 0.775022887016 0.778202313573 <- without extra features best\n", 356 | "#probably need to do some cross-val or increase the size of the validation set.. Increased dropout, helped. !Success!!!\n", 357 | "best_run = np.argmin(np.asarray(results)[:,0])\n", 358 | "print results[best_run]" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "collapsed": true 366 | }, 367 | "outputs": [], 368 | "source": [] 369 | } 370 | ], 371 | "metadata": { 372 | "anaconda-cloud": {}, 373 | "kernelspec": { 374 | "display_name": "Python 2", 375 | "language": "python", 376 | "name": "python2" 377 | }, 378 | "language_info": { 379 | "codemirror_mode": { 380 | "name": "ipython", 381 | "version": 2 382 | }, 383 | "file_extension": ".py", 384 | "mimetype": "text/x-python", 385 | "name": "python", 386 | "nbconvert_exporter": "python", 387 | "pygments_lexer": "ipython2", 388 | "version": "2.7.9" 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 1 393 | } 394 | --------------------------------------------------------------------------------