├── README.md
└── mtl_sentiment_analysis.ipynb

/README.md:
--------------------------------------------------------------------------------
# MultiTask-Sentiment-Analysis

Draft implementation for our SIGIR 2017 paper: "Multitask Learning for Fine-Grained Twitter Sentiment Analysis". If you find this code useful in your research, please consider citing:

    @inproceedings{Balikas:2017:MLF:3077136.3080702,
     author = {Balikas, Georgios and Moura, Simon and Amini, Massih-Reza},
     title = {Multitask Learning for Fine-Grained Twitter Sentiment Analysis},
     booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
     series = {SIGIR '17},
     year = {2017},
     isbn = {978-1-4503-5022-8},
     location = {Shinjuku, Tokyo, Japan},
     pages = {1005--1008},
     numpages = {4},
     url = {http://doi.acm.org/10.1145/3077136.3080702},
     doi = {10.1145/3077136.3080702},
     acmid = {3080702},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {bilstm, deep learning, multitask learning, sentiment analysis, text classification, text mining, twitter analysis},
    }

--------------------------------------------------------------------------------
/mtl_sentiment_analysis.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using Theano backend.\n"
     ]
    }
   ],
   "source": [
    "from functions import *\n",
    "from twitterTokenizer import Tokenizer\n",
    "import numpy as np, random\n",
    "import subprocess\n",
    "np.random.seed(1337)  # for reproducibility\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "from keras.preprocessing import sequence\n",
    "from keras.utils import np_utils\n",
    "from keras.models import Sequential, Model\n",
    "from keras.layers import Dense, Dropout, Activation, Embedding, Bidirectional, LSTM, Input, merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train, y_train = load_SemEval_from_file('./data/subtaskCE.train_dev.tsv')\n",
    "X_dev, y_dev = load_SemEval_from_file('./data/subtaskCE.devtest.tsv')\n",
    "X_test, y_test = load_SemEval_SubTaskCE_Test('./data/SemEval2016-task4-test.subtask-BCDE.txt', './data/SemEval2016_task4_subtaskC_test_gold.txt')\n",
    "X_train_ternary, y_train_ternary = load_SemEval_subtaskA('./data/subtaskA.downloaded.tsv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((7292, 1368), (1778, 1368), (20632, 1368), (5500, 1368))"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_additional = load_sparse_csr('./additional_features/X_train_additional.npz')\n",
    "X_dev_additional = load_sparse_csr('./additional_features/X_dev_additional.npz')\n",
    "X_test_additional = load_sparse_csr('./additional_features/X_test_additional.npz')\n",
    "X_ternary_additional = load_sparse_csr('./additional_features/X_ternary_additional.npz')\n",
68 | "X_train_additional.shape, X_dev_additional.shape, X_test_additional.shape, X_ternary_additional.shape" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "('Train shape', (7292, 14356), 'Dev shape', (1778, 14356), 'Test shape', (20632, 14356), '14356 vocabulary terms found')\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "from sklearn.feature_extraction.text import CountVectorizer\n", 88 | "MAX_FEATURES, MAX_LEN, BATCH_SIZE = 11000, 30, 64\n", 89 | "\n", 90 | "tokenizer = Tokenizer(preserve_case=False)\n", 91 | "\n", 92 | "vec = CountVectorizer( ngram_range=(1,1), analyzer='word', tokenizer=tokenizer.tokenize, stop_words=None)\n", 93 | "vec.fit(X_train+X_train_ternary)\n", 94 | "\n", 95 | "x_train = vec.transform(X_train)\n", 96 | "x_train_ternary = vec.transform(X_train_ternary)\n", 97 | "x_dev = vec.transform(X_dev)\n", 98 | "x_test = vec.transform(X_test)\n", 99 | "\n", 100 | "print(\"Train shape\", x_train.shape, \"Dev shape\", x_dev.shape, \"Test shape\", x_test.shape, \"%d vocabulary terms found\"%len(vec.vocabulary_))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "x_train_nn = np.split(x_train.indices, x_train.indptr[1:-1])\n", 112 | "x_train_ternary_nn = np.split(x_train_ternary.indices, x_train_ternary.indptr[1:-1])\n", 113 | "x_dev_nn = np.split(x_dev.indices, x_dev.indptr[1:-1])\n", 114 | "x_test_nn = np.split(x_test.indices, x_test.indptr[1:-1])" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "Pad sequences (samples x time)\n", 129 | "('X_train shape:', (7292, 30))\n", 130 | "('X_ternary shape:', (5500, 30))\n", 131 | "('X_dev shape:', (1778, 30))\n", 132 | "('X_test shape:', (20632, 30))\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "print('Pad sequences (samples x time)')\n", 138 | "x_train_nn = sequence.pad_sequences(x_train_nn, maxlen=MAX_LEN)\n", 139 | "x_train_ternary_nn = sequence.pad_sequences(x_train_ternary_nn, maxlen=MAX_LEN)\n", 140 | "x_dev_nn = sequence.pad_sequences(x_dev_nn, maxlen=MAX_LEN)\n", 141 | "x_test_nn = sequence.pad_sequences(x_test_nn, maxlen=MAX_LEN)\n", 142 | "print('X_train shape:', x_train_nn.shape)\n", 143 | "print('X_ternary shape:', x_train_ternary_nn.shape)\n", 144 | "print('X_dev shape:', x_dev_nn.shape)\n", 145 | "print('X_test shape:', x_test_nn.shape)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "Found 1193514 word vectors.\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "import os, sys\n", 165 | "EMBEDDING_DIM = 50\n", 166 | "\n", 167 | "embeddings_index = {}\n", 168 | "f = open(os.path.join(\"./data/\", 'glove.twitter.27B.50d.txt'))\n", 169 | "for line in f:\n", 170 | " values = line.split()\n", 171 | " word = values[0]\n", 172 | " coefs = np.asarray(values[1:], dtype='float32')\n", 173 | " embeddings_index[word] = coefs\n", 174 | "f.close()\n", 175 | "\n", 176 | "embedding_matrix = np.zeros((len(vec.vocabulary_) + 1, EMBEDDING_DIM))\n", 177 | 
"for key,val in vec.vocabulary_.iteritems():\n", 178 | " embedding_vector = embeddings_index.get(key)\n", 179 | " if embedding_vector is not None:\n", 180 | " # words not found in embedding index will be all-zeros.\n", 181 | " embedding_matrix[val] = embedding_vector\n", 182 | "print('Found %s word vectors.' % len(embeddings_index))" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 8, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 194 | "mlb = MultiLabelBinarizer(classes=[-2, -1, 0 , 1, 2])\n", 195 | "y_train_nn = mlb.fit_transform([[y] for y in y_train])\n", 196 | "y_test_nn = mlb.transform([[y] for y in y_test])\n", 197 | "\n", 198 | "mlb2 = MultiLabelBinarizer(classes=[-1, 0, 1])\n", 199 | "y_train_nn_ternary = mlb2.fit_transform([[y] for y in y_train_ternary])\n", 200 | "# y_test_nn = mlb.transform([[y] for y in y_train_task2])" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 9, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "from sklearn import utils \n", 212 | "class_weights = utils.compute_class_weight('balanced', [-2, -1, 0, 1, 2], y_train)\n", 213 | "class_weights= {class_id:class_weight for class_id, class_weight in zip(range(5), class_weights)}" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 12, 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "Build models...\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "print('Build models...')\n", 233 | "\n", 234 | "\n", 235 | "main_input = Input(shape=(MAX_LEN,), dtype='int32', name='main_input')\n", 236 | "\n", 237 | "x = Embedding(input_dim = len(vec.vocabulary_)+1, output_dim = EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_LEN, trainable=True, dropout=0.3)(main_input)\n", 238 | "x = BatchNormalization()(x)\n", 239 | "\n", 240 | "lstm_out = Bidirectional(LSTM(output_dim = 50, input_dim = EMBEDDING_DIM, dropout_W=0.3, dropout_U=0.3) )(x)\n", 241 | "\n", 242 | "\n", 243 | "auxiliary_input = Input(shape=(1368,), name='aux_input')\n", 244 | "t_auxiliary_input = Dense(256, activation='tanh')(auxiliary_input)\n", 245 | "t_auxiliary_input = Dropout(0.5)(t_auxiliary_input)\n", 246 | "\n", 247 | "x = merge([lstm_out, t_auxiliary_input], mode='concat')\n", 248 | "\n", 249 | "\n", 250 | "x = Dense(30, activation='tanh', )(x)\n", 251 | "x = Dropout(0.5)(x)\n", 252 | "\n", 253 | "task1_output = Dense(5, activation='softmax', name='main_output')(x)\n", 254 | "task2_output = Dense(3, activation='softmax', name='aux_output')(x)\n", 255 | "\n", 256 | "\n", 257 | "model_task1 = Model(input=[main_input, auxiliary_input], output=[task1_output])\n", 258 | "model_task2 = Model(input=[main_input, auxiliary_input], output=[task2_output])\n", 259 | "\n", 260 | "model_task1.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])\n", 261 | "model_task2.compile(optimizer='RMSprop', loss='categorical_crossentropy', metrics=['accuracy'])\n", 262 | "#model_task1.summary()\n", 263 | "#model_task2.summary()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 13, 269 | "metadata": { 270 | "collapsed": false, 271 | "scrolled": true 272 | }, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | 
"Iteration: 1 \tDEV: 1.92295925391 \tTEST: 1.93709559081\n", 279 | "Iteration: 2 \tDEV: 1.18776405748 \tTEST: 1.05533295264\n", 280 | "Iteration: 3 \tDEV: 0.895966946297 \tTEST: 0.816812569005\n", 281 | "Iteration: 4 \tDEV: 1.02211077479 \tTEST: 0.892998712215\n", 282 | "Iteration: 5 \tDEV: 0.954160935923 \tTEST: 0.826061924628\n", 283 | "Iteration: 6 \tDEV: 0.924547925944 \tTEST: 0.786129369759\n", 284 | "Iteration: 7 \tDEV: 0.854076047407 \tTEST: 0.735604932049\n", 285 | "Iteration: 8 \tDEV: 0.799235732629 \tTEST: 0.657763888703\n", 286 | "Iteration: 9 \tDEV: 0.833436881142 \tTEST: 0.726908912671\n", 287 | "Iteration: 10 \tDEV: 0.784366450381 \tTEST: 0.682021796296\n", 288 | "Iteration: 11 \tDEV: 0.755167897014 \tTEST: 0.665676846315\n", 289 | "Iteration: 12 \tDEV: 0.81856971338 \tTEST: 0.71937841112\n", 290 | "Iteration: 13 \tDEV: 0.762962818998 \tTEST: 0.658635105364\n", 291 | "Iteration: 14 \tDEV: 0.773821181079 \tTEST: 0.708637987154\n", 292 | "Iteration: 15 \tDEV: 0.775601371349 \tTEST: 0.665637346606\n", 293 | "Iteration: 16 \tDEV: 0.759631361236 \tTEST: 0.703419706797\n", 294 | "Iteration: 17 \tDEV: 0.787738085984 \tTEST: 0.751457377915\n", 295 | "Iteration: 18 \tDEV: 0.804429359786 \tTEST: 0.747156926376\n", 296 | "Iteration: 19 \tDEV: 0.815791954936 \tTEST: 0.683381326876\n", 297 | "Iteration: 20 \tDEV: 0.850576735275 \tTEST: 0.756387118012\n", 298 | "Iteration: 21 \tDEV: 0.790620373789 \tTEST: 0.739320606845\n", 299 | "Iteration: 22 \tDEV: 0.764482852452 \tTEST: 0.683031976188\n", 300 | "Iteration: 23 \tDEV: 0.803140795212 \tTEST: 0.725999022617\n", 301 | "Iteration: 24 \tDEV: 0.808647040185 \tTEST: 0.744529571075\n", 302 | "Iteration: 25 \tDEV: 0.817937194796 \tTEST: 0.706654354228\n", 303 | "Iteration: 26 \tDEV: 0.807327637535 \tTEST: 0.768356525751\n", 304 | "Iteration: 27 \tDEV: 0.861871030075 \tTEST: 0.756031051972\n", 305 | "Iteration: 28 \tDEV: 0.854804707783 \tTEST: 0.821395129217\n", 306 | "Iteration: 29 \tDEV: 0.823053542862 \tTEST: 0.783426924025\n", 307 | "Iteration: 30 \tDEV: 0.844449656178 \tTEST: 0.747247759193\n", 308 | "Iteration: 31 \tDEV: 0.824878779691 \tTEST: 0.797660345422\n", 309 | "Iteration: 32 \tDEV: 0.825061301588 \tTEST: 0.767186215062\n", 310 | "Iteration: 33 \tDEV: 0.83082769645 \tTEST: 0.7776032457\n", 311 | "Iteration: 34 \tDEV: 0.80440427363 \tTEST: 0.742673350311\n", 312 | "Iteration: 35 \tDEV: 0.828627354066 \tTEST: 0.744338298501\n", 313 | "Iteration: 36 \tDEV: 0.819334882033 \tTEST: 0.729371547829\n", 314 | "Iteration: 37 \tDEV: 0.901746496735 \tTEST: 0.783955756206\n", 315 | "Iteration: 38 \tDEV: 0.903028631744 \tTEST: 0.752098017949\n", 316 | "Iteration: 39 \tDEV: 0.795337867354 \tTEST: 0.722056277475\n", 317 | "Iteration: 40 \tDEV: 0.844048296355 \tTEST: 0.768640202983\n", 318 | "Iteration: 41 \tDEV: 0.780133585328 \tTEST: 0.741161528006\n", 319 | "Iteration: 42 \tDEV: 0.803202129502 \tTEST: 0.766803837581\n", 320 | "Iteration: 43 \tDEV: 0.825221027768 \tTEST: 0.769970364866\n", 321 | "Iteration: 44 \tDEV: 0.84310632542 \tTEST: 0.787748529746\n", 322 | "Iteration: 45 \tDEV: 0.829183898124 \tTEST: 0.773112306279\n", 323 | "Iteration: 46 \tDEV: 0.795794030928 \tTEST: 0.74658846479\n", 324 | "Iteration: 47 \tDEV: 0.775570256578 \tTEST: 0.737646257549\n", 325 | "Iteration: 48 \tDEV: 0.817065613571 \tTEST: 0.746390413085\n", 326 | "Iteration: 49 \tDEV: 0.903140564291 \tTEST: 0.811813157035\n", 327 | "Iteration: 50 \tDEV: 0.838185184101 \tTEST: 0.750441266774\n", 328 | "Iteration: 51 \tDEV: 0.871796566588 \tTEST: 0.759380888795\n", 329 | 
"Iteration: 52 \tDEV: 0.811734337629 \tTEST: 0.742526685269\n", 330 | "Iteration: 53 \tDEV: 0.804882941058 \tTEST: 0.746097919402\n", 331 | "[0.75516789701430687, 0.66567684631518431]\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "BATCH_SIZE = 128\n", 337 | "results = []\n", 338 | "for batch in range(600*5):\n", 339 | " nb_rand = \n", 340 | " if random.random() < 1.0:\n", 341 | " sample = np.random.randint(0, len(x_train_nn), BATCH_SIZE)\n", 342 | " x_sampled, y_sampled, x_aux = x_train_nn[sample], y_train_nn[sample], X_train_additional[sample]\n", 343 | " model_task1.train_on_batch({'main_input': x_sampled, 'aux_input': x_aux.todense() }, [y_sampled], class_weight=class_weights, sample_weight=None)\n", 344 | " else:\n", 345 | " sample = np.random.randint(0, len(x_train_ternary_nn), BATCH_SIZE)\n", 346 | " x_sampled, y_sampled, x_aux = x_train_ternary_nn[sample], y_train_nn_ternary[sample], X_ternary_additional[sample]\n", 347 | " model_task2.train_on_batch({'main_input': x_sampled, 'aux_input': x_aux.todense()}, [y_sampled], class_weight=None, sample_weight=None)\n", 348 | " \n", 349 | " if batch%57==0:\n", 350 | " dev_preds = np.argmax(model_task1.predict({'main_input': x_dev_nn,'aux_input': X_dev_additional.todense() }, batch_size=BATCH_SIZE, verbose=0), axis=1)\n", 351 | " test_preds = np.argmax(model_task1.predict({'main_input': x_test_nn, 'aux_input': X_test_additional.todense()}, batch_size=BATCH_SIZE, verbose=0), axis=1)\n", 352 | " results.append([macroMAE(y_dev, dev_preds-2), macroMAE(y_test, test_preds-2)])\n", 353 | " print \"Iteration:\", int(batch/57)+1, \"\\tDEV:\", results[-1][0], \"\\tTEST:\", results[-1][1]\n", 354 | " \n", 355 | "#68 0.775022887016 0.778202313573 <- without extra features best\n", 356 | "#probably need to do some cross-val or increase the size of the validation set.. Increased dropout, helped. !Success!!!\n", 357 | "best_run = np.argmin(np.asarray(results)[:,0])\n", 358 | "print results[best_run]" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "collapsed": true 366 | }, 367 | "outputs": [], 368 | "source": [] 369 | } 370 | ], 371 | "metadata": { 372 | "anaconda-cloud": {}, 373 | "kernelspec": { 374 | "display_name": "Python 2", 375 | "language": "python", 376 | "name": "python2" 377 | }, 378 | "language_info": { 379 | "codemirror_mode": { 380 | "name": "ipython", 381 | "version": 2 382 | }, 383 | "file_extension": ".py", 384 | "mimetype": "text/x-python", 385 | "name": "python", 386 | "nbconvert_exporter": "python", 387 | "pygments_lexer": "ipython2", 388 | "version": "2.7.9" 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 1 393 | } 394 | --------------------------------------------------------------------------------