├── .gitignore
├── Four Word Model
    ├── Model-2.ipynb
    ├── Model-3.ipynb
    ├── Model-4.ipynb
    ├── Model-5.ipynb
    ├── Model-6.ipynb
    ├── Model-7.ipynb
    ├── Model-8.ipynb
    ├── Model-9.ipynb
    ├── Model.ipynb
    ├── Preprocess_Files
    │   ├── hard
    │   │   ├── sense
    │   │   └── sent
    │   ├── interest
    │   │   ├── sense
    │   │   └── sent
    │   ├── line
    │   │   ├── sense
    │   │   └── sent
    │   └── serve
    │   │   ├── sense
    │   │   └── sent
    ├── Senses.txt
    ├── Sentences.txt
    ├── final_preprocessing.ipynb
    ├── full_train.pickle
    ├── initial_processing.ipynb
    ├── robsr_model.ipynb
    ├── train.pickle
    ├── vocab_overlap_analysis.ipynb
    └── words_not_in_vocab.pickle
├── LICENSE
├── README.md
├── UGP_Report.pdf
├── UGP_presentation.pdf
├── models_diagram
    ├── all-word-1.png
    ├── all-word-2.png
    ├── all-word-3.png
    ├── all-word-4.png
    ├── all-word-5.png
    ├── all-word-6.png
    ├── all-word-7.png
    ├── all-word-8.png
    ├── model-1.png
    ├── model-2.png
    ├── model-3.png
    └── model-4.png
├── one_million
    ├── One-Million All-Word Data Sampling Coarse.ipynb
    ├── One-Million All-Word Data Sampling-Fine.ipynb
    ├── One-Million All-Word Data-hierarchical Sampling-Fine.ipynb
    ├── One-Million All-Word Data-seq.ipynb
    ├── Sense-test.ipynb
    ├── Sense.ipynb
    ├── all-word-model
    ├── all-word
    │   ├── Model-aw-1-multigpu-1.ipynb
    │   ├── Model-aw-1-multigpu-2.ipynb
    │   ├── Model-aw-1-multigpu-3.ipynb
    │   ├── Model-aw-3-1.ipynb
    │   ├── Model-aw-3.ipynb
    │   ├── Model-aw-4-1.ipynb
    │   ├── Model-aw-lex-1.2.ipynb
    │   ├── Model-aw-lex-1.3.ipynb
    │   ├── Model-aw-lex-1.4.ipynb
    │   ├── Model-aw-lex-1.ipynb
    │   ├── Model-aw-lex-2.2.ipynb
    │   ├── Model-aw-lex-hierarchical-1.ipynb
    │   ├── Model-aw-lex-hierarchical-2.ipynb
    │   ├── Model-aw-lex-hierarchical-3.ipynb
    │   ├── Model-aw-lex-hierarchical-4.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v1.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-1.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-2.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-3.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-4.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-5.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-6.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-7.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-8.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-9.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v3-1.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v4-1.ipynb
    │   ├── Model-aw-lex-local_attention-slow-1.ipynb
    │   ├── Model-aw-lex-local_attention-slow-2.ipynb
    │   ├── Model-aw-lex-seq-hierarchical-1.ipynb
    │   ├── Model-aw-lex-seq-hierarchical-2.ipynb
    │   ├── Model-aw-sense-1.ipynb
    │   └── Readme.md
    ├── force
    │   ├── Force-Model-1-multigpu-1.ipynb
    │   ├── Force-Model-1-multigpu-2.ipynb
    │   ├── Force-Model-1-multigpu-3.ipynb
    │   ├── Force-Model-1.ipynb
    │   ├── Force-Model-2-multigpu-1.ipynb
    │   ├── Force-Model-2.ipynb
    │   ├── Force-Model-3-multigpu-1.ipynb
    │   ├── Force-Model-3.ipynb
    │   ├── Force-Model-4-multigpu-1.ipynb
    │   ├── Force-Model-4.ipynb
    │   └── Force-Model-5.ipynb
    ├── make
    │   ├── Make-Model-1-multigpu-1.ipynb
    │   ├── Make-Model-1.ipynb
    │   ├── Make-Model-2-multigpu-1.ipynb
    │   ├── Make-Model-2.ipynb
    │   ├── Make-Model-3-1.ipynb
    │   ├── Make-Model-3-2.ipynb
    │   ├── Make-Model-3-3.ipynb
    │   ├── Make-Model-3-multigpu-1.ipynb
    │   └── Make-Model-3.ipynb
    ├── one_million_parsing.ipynb
    ├── one_word_data_maker-test.ipynb
    ├── one_word_data_maker.ipynb
    ├── open
    │   ├── Open-Model-1-multigpu-1.ipynb
    │   ├── Open-Model-2-multigpu-1.ipynb
    │   ├── Open-Model-3-multigpu-1.ipynb
    │   ├── Open-Model-3.ipynb
    │   ├── Open-Model-4-multigpu-1.ipynb
    │   └── Open-Model-4.ipynb
    ├── place
    │   ├── Place-Model-1-multigpu-1.ipynb
    │   ├── Place-Model-2-multigpu-1.ipynb
    │   ├── Place-Model-2.ipynb
    │   ├── Place-Model-3-multigpu-1.ipynb
    │   ├── Place-Model-3.ipynb
    │   ├── Place-Model-4-multigpu-1.ipynb
    │   ├── Place-Model-4.ipynb
    │   └── Place-Model-6.ipynb
    ├── point
    │   ├── Point-Model-1-multigpu-1.ipynb
    │   ├── Point-Model-2-multigpu-1.ipynb
    │   ├── Point-Model-2.ipynb
    │   ├── Point-Model-3-multigpu-1.ipynb
    │   ├── Point-Model-3.ipynb
    │   ├── Point-Model-4-multigpu-1.ipynb
    │   └── Point-Model-4.ipynb
    ├── raw_one_million_parsing-test.ipynb
    ├── raw_one_million_parsing.ipynb
    ├── serve
    │   ├── Serve-Model-1-multigpu-2.ipynb
    │   ├── Serve-Model-1.ipynb
    │   ├── Serve-Model-2.ipynb
    │   └── Serve-Model-3.ipynb
    └── support
    │   ├── Support-Model-1-multigpu-1.ipynb
    │   ├── Support-Model-2-multigpu-1.ipynb
    │   ├── Support-Model-3-multigpu-1.ipynb
    │   ├── Support-Model-3.ipynb
    │   ├── Support-Model-4-multigpu-1.ipynb
    │   ├── Support-Model-4.ipynb
    │   └── Support-Model-5.ipynb
└── papers
    ├── 1603.07012.pdf
    ├── 9f260612d5817d542cda2a7d9a6eb18d6471.pdf
    ├── D17-1008.pdf
    ├── K16-1006.pdf
    ├── P16-1085.pdf
    ├── W16-5307.pdf
    ├── a10-navigli.pdf
    ├── crf.pdf
    ├── report1.pdf
    └── report2.pdf

/.gitignore:
--------------------------------------------------------------------------------
1 | rushab/
2 | dataset/
3 | Glove/
4 | glove/
5 | Four Word Model/.ipynb_checkpoints/
6 | data/
7 | .ipynb_checkpoints/
8 | Four Word Model/output
9 | one_million/output
10 | *.pickle
11 | papers/
--------------------------------------------------------------------------------
/Four Word Model/Model-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "tf.logging.set_verbosity(tf.logging.WARN)\n",
13 | "import pickle\n",
14 | "import numpy as np\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import f1_score\n",
18 | "from sklearn.metrics import accuracy_score\n",
19 | "import os\n",
20 | "from tensorflow.python.client import device_lib"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {
27 | "collapsed": true
28 | },
29 | "outputs": [],
30 | "source": [
31 | "f = open('../Glove/word_embedding_glove', 'rb')\n",
32 | "word_embedding = pickle.load(f)\n",
33 | "f.close()\n",
34 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
35 | "\n",
36 | "f = open('../Glove/vocab_glove', 'rb')\n",
37 | "vocab = pickle.load(f)\n",
38 | "f.close()\n",
39 | "\n",
40 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
41 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
42 | "\n",
43 | "unknown_token = \"UNKNOWN_TOKEN\"\n",
44 | "\n",
45 | "f = open(\"train.pickle\", 'rb')\n",
46 | "full_data = pickle.load(f)\n",
47 | "f.close()"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "# Model Description\n",
59 | "sense_word = 'hard'\n",
60 | "model_name = 'model-2'\n",
61 | "model_dir = 'output/' + sense_word + '/' + model_name\n",
62 | "save_dir = os.path.join(model_dir, \"save/\")\n",
63 | "log_dir = os.path.join(model_dir, \"log\")\n",
64 | "\n",
65 | "if not os.path.exists(model_dir):\n",
66 | " os.mkdir(model_dir)\n",
67 | "if not os.path.exists(save_dir):\n",
68 | " os.mkdir(save_dir)\n",
69 | "if not os.path.exists(log_dir):\n",
70 | " os.mkdir(log_dir)"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 |
"execution_count": 4, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# Parameters\n", 82 | "mode = 'train'\n", 83 | "num_senses = 3\n", 84 | "batch_size = 64\n", 85 | "vocab_size = len(vocab)\n", 86 | "unk_vocab_size = 1\n", 87 | "word_emb_size = len(word_embedding[0])\n", 88 | "max_sent_size = 200\n", 89 | "hidden_size = 100\n", 90 | "keep_prob = 0.5\n", 91 | "l2_lambda = 0.001\n", 92 | "init_lr = 0.001\n", 93 | "decay_steps = 500\n", 94 | "decay_rate = 0.96\n", 95 | "clip_norm = 1\n", 96 | "clipping = True" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "# MODEL\n", 108 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n", 109 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n", 110 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n", 111 | "is_train = tf.placeholder('bool', [], name='is_train')\n", 112 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n", 113 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n", 114 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n", 115 | "\n", 116 | "with tf.name_scope(\"word_embedding\"):\n", 117 | " if mode == 'train':\n", 118 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n", 119 | " else:\n", 120 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n", 121 | " \n", 122 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n", 123 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n", 124 | "\n", 125 | "with tf.variable_scope(\"lstm1\"):\n", 126 | " cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 127 | " cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 128 | "\n", 129 | " d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n", 130 | " d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n", 131 | " \n", 132 | " (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n", 133 | " h1 = tf.concat([fw_h1, bw_h1], 2)\n", 134 | " \n", 135 | "with tf.variable_scope(\"lstm2\"):\n", 136 | " cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 137 | " cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 138 | "\n", 139 | " d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n", 140 | " d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n", 141 | " \n", 142 | " (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n", 143 | " h = tf.concat([fw_h2, bw_h2], 2)\n", 144 | "\n", 145 | "def attention(input_x, input_mask, W_att):\n", 146 | " h_masked = tf.boolean_mask(input_x, input_mask)\n", 147 | " h_tanh = tf.tanh(h_masked)\n", 148 | " u = tf.matmul(h_tanh, W_att)\n", 149 | " a = tf.nn.softmax(u)\n", 150 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n", 151 | " return c\n", 152 | "\n", 153 | "with tf.variable_scope(\"attention\"):\n", 154 | " 
W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n", 155 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n", 156 | " for i in range(1, batch_size):\n", 157 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n", 158 | " \n", 159 | "with tf.variable_scope(\"softmax_layer\"):\n", 160 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n", 161 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n", 162 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n", 163 | " logits = tf.matmul(drop_c, W) + b\n", 164 | " predictions = tf.argmax(logits, 1)\n", 165 | "\n", 166 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n", 167 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n", 168 | "\n", 169 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n", 170 | "\n", 171 | "tv_all = tf.trainable_variables()\n", 172 | "tv_regu =[]\n", 173 | "for t in tv_all:\n", 174 | " if t.name.find('b:')==-1:\n", 175 | " tv_regu.append(t)\n", 176 | " \n", 177 | "# l2 Loss\n", 178 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n", 179 | "\n", 180 | "total_loss = loss + l2_loss\n", 181 | "\n", 182 | "# Optimizer for loss\n", 183 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n", 184 | "\n", 185 | "# Gradients and Variables for Loss\n", 186 | "grads_vars = optimizer.compute_gradients(total_loss)\n", 187 | "\n", 188 | "# Clipping of Gradients\n", 189 | "clipped_grads = grads_vars\n", 190 | "if(clipping == True):\n", 191 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n", 192 | "\n", 193 | "# Training Optimizer for Total Loss\n", 194 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n", 195 | "\n", 196 | "# Summaries\n", 197 | "var_summaries = []\n", 198 | "for v in tv_all:\n", 199 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n", 200 | " var_summaries.append(var_summary)\n", 201 | "\n", 202 | "var_summaries_merged = tf.summary.merge(var_summaries)\n", 203 | "\n", 204 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n", 205 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n", 206 | "summary = tf.summary.merge_all()" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 6, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 218 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"3\"\n", 219 | "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n", 220 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n", 221 | "saver = tf.train.Saver() # For Saving the model\n", 222 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 7, 228 | "metadata": { 229 | "scrolled": true 230 | }, 231 | "outputs": [ 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 237 | " FutureWarning)\n" 
238 | ] 239 | } 240 | ], 241 | "source": [ 242 | "# Splitting\n", 243 | "data_x = full_data[sense_word][0]\n", 244 | "data_y = full_data[sense_word][2]\n", 245 | "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n", 246 | "\n", 247 | "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 8, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "def data_prepare(x):\n", 259 | " num_examples = len(x)\n", 260 | "\n", 261 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n", 262 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n", 263 | "\n", 264 | " for j in range(num_examples):\n", 265 | " for i in range(max_sent_size):\n", 266 | " if(i>=len(x[j])):\n", 267 | " break\n", 268 | " w = x[j][i]\n", 269 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n", 270 | " xx_mask[j][i] = True\n", 271 | " \n", 272 | " return xx, xx_mask\n", 273 | "\n", 274 | "def eval_score(yy, pred):\n", 275 | " num_batches = int(len(yy)/batch_size)\n", 276 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n", 277 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n", 278 | " return f1*100, accu*100\n", 279 | "\n", 280 | "def model(xx, yy, mask, train_cond=True):\n", 281 | " num_batches = int(len(xx)/batch_size)\n", 282 | " losses = 0\n", 283 | " preds = []\n", 284 | " for j in range(num_batches): \n", 285 | " \n", 286 | " s = j * batch_size\n", 287 | " e = (j+1) * batch_size\n", 288 | " \n", 289 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n", 290 | " \n", 291 | " \n", 292 | " if(train_cond==True):\n", 293 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n", 294 | " summary_writer.add_summary(_summary, step) \n", 295 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n", 296 | "\n", 297 | " else:\n", 298 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n", 299 | " preds.append(pred)\n", 300 | " \n", 301 | " losses +=_loss\n", 302 | "\n", 303 | " if(train_cond==False):\n", 304 | " y_pred = []\n", 305 | " for i in range(num_batches):\n", 306 | " for pred in preds[i]:\n", 307 | " y_pred.append(pred)\n", 308 | " return losses/num_batches, y_pred\n", 309 | " \n", 310 | " return losses/num_batches, step" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 9, 316 | "metadata": { 317 | "collapsed": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "x_id_train, mask_train = data_prepare(x_train)\n", 322 | "x_id_val, mask_val = data_prepare(x_val)\n", 323 | "x_id_test, mask_test = data_prepare(x_test)\n", 324 | "y_train = np.array(y_train)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 10, 330 | "metadata": { 331 | "scrolled": true 332 | }, 333 | "outputs": [ 334 | { 335 | "name": "stdout", 336 | "output_type": "stream", 337 | "text": [ 338 | "Epoch: 1 Step: 48 loss: 1.86646759758\n", 339 | "Epoch: 2 Step: 96 loss: 1.21714039147\n", 340 | "Epoch: 3 Step: 144 loss: 1.08560919886\n", 341 | "Epoch: 4 Step: 192 loss: 0.980009039243\n", 342 | "Epoch: 5 Step: 240 loss: 0.881924713651\n", 343 | "Saved Model Complete\n", 344 | "Train: F1 Score: 
60.3386654855 Accuracy: 83.5611979167 Loss: 0.80800242722\n", 345 | "Val: F1 Score: 54.6225701167 Accuracy: 82.8125 Loss: 0.770564937592\n", 346 | "Epoch: 6 Step: 288 loss: 0.806718610227\n", 347 | "Epoch: 7 Step: 336 loss: 0.742333145191\n", 348 | "Epoch: 8 Step: 384 loss: 0.699159173295\n", 349 | "Epoch: 9 Step: 432 loss: 0.681758804868\n", 350 | "Epoch: 10 Step: 480 loss: 0.631260214373\n", 351 | "Saved Model Complete\n", 352 | "Train: F1 Score: 62.415876497 Accuracy: 85.64453125 Loss: 0.609668933476\n", 353 | "Val: F1 Score: 63.0309748731 Accuracy: 87.1875 Loss: 0.603685164452\n", 354 | "Epoch: 11 Step: 528 loss: 0.621730036413\n", 355 | "Epoch: 12 Step: 576 loss: 0.593547300746\n", 356 | "Epoch: 13 Step: 624 loss: 0.567168306559\n", 357 | "Epoch: 14 Step: 672 loss: 0.572736630837\n", 358 | "Epoch: 15 Step: 720 loss: 0.52119900162\n", 359 | "Saved Model Complete\n", 360 | "Train: F1 Score: 73.2345182784 Accuracy: 87.59765625 Loss: 0.500816229731\n", 361 | "Val: F1 Score: 71.3846572025 Accuracy: 88.125 Loss: 0.507379829884\n", 362 | "Epoch: 16 Step: 768 loss: 0.518757795294\n", 363 | "Epoch: 17 Step: 816 loss: 0.508907252923\n", 364 | "Epoch: 18 Step: 864 loss: 0.480370514716\n", 365 | "Epoch: 19 Step: 912 loss: 0.481487047548\n", 366 | "Epoch: 20 Step: 960 loss: 0.483874622112\n", 367 | "Saved Model Complete\n", 368 | "Train: F1 Score: 72.4541504438 Accuracy: 88.57421875 Loss: 0.454483479882\n", 369 | "Val: F1 Score: 69.7799159478 Accuracy: 88.4375 Loss: 0.505139875412\n", 370 | "Epoch: 21 Step: 1008 loss: 0.445587230225\n", 371 | "Epoch: 22 Step: 1056 loss: 0.448845259845\n", 372 | "Epoch: 23 Step: 1104 loss: 0.418395101403\n", 373 | "Epoch: 24 Step: 1152 loss: 0.42787179475\n", 374 | "Epoch: 25 Step: 1200 loss: 0.41220224835\n", 375 | "Saved Model Complete\n", 376 | "Train: F1 Score: 79.7367544121 Accuracy: 90.5598958333 Loss: 0.387414715563\n", 377 | "Val: F1 Score: 80.5119717533 Accuracy: 91.25 Loss: 0.428414440155\n", 378 | "Epoch: 26 Step: 1248 loss: 0.398100319629\n", 379 | "Epoch: 27 Step: 1296 loss: 0.401642986884\n", 380 | "Epoch: 28 Step: 1344 loss: 0.380077781156\n", 381 | "Epoch: 29 Step: 1392 loss: 0.371819969267\n", 382 | "Epoch: 30 Step: 1440 loss: 0.375808695642\n", 383 | "Saved Model Complete\n", 384 | "Train: F1 Score: 82.6141307319 Accuracy: 91.1458333333 Loss: 0.374826697633\n", 385 | "Val: F1 Score: 72.6194736328 Accuracy: 87.5 Loss: 0.443939989805\n", 386 | "Epoch: 31 Step: 1488 loss: 0.368128724086\n", 387 | "Epoch: 32 Step: 1536 loss: 0.363611215415\n", 388 | "Epoch: 33 Step: 1584 loss: 0.370647774388\n", 389 | "Epoch: 34 Step: 1632 loss: 0.368405311989\n", 390 | "Epoch: 35 Step: 1680 loss: 0.349992937719\n", 391 | "Saved Model Complete\n", 392 | "Train: F1 Score: 81.482253082 Accuracy: 91.6666666667 Loss: 0.36779523051\n", 393 | "Val: F1 Score: 76.2094695081 Accuracy: 89.6875 Loss: 0.484789025784\n", 394 | "Epoch: 36 Step: 1728 loss: 0.347480880097\n", 395 | "Epoch: 37 Step: 1776 loss: 0.344036137685\n", 396 | "Epoch: 38 Step: 1824 loss: 0.329046547723\n", 397 | "Epoch: 39 Step: 1872 loss: 0.308786494968\n", 398 | "Epoch: 40 Step: 1920 loss: 0.335401780282\n", 399 | "Saved Model Complete\n", 400 | "Train: F1 Score: 88.5588616245 Accuracy: 94.53125 Loss: 0.291247650981\n", 401 | "Val: F1 Score: 84.105797863 Accuracy: 92.5 Loss: 0.359305435419\n", 402 | "Epoch: 41 Step: 1968 loss: 0.332291507783\n", 403 | "Epoch: 42 Step: 2016 loss: 0.314355407842\n", 404 | "Epoch: 43 Step: 2064 loss: 0.319293403377\n", 405 | "Epoch: 44 Step: 2112 loss: 0.297154735463\n", 406 | 
"Epoch: 45 Step: 2160 loss: 0.305809813552\n", 407 | "Saved Model Complete\n", 408 | "Train: F1 Score: 88.5833478857 Accuracy: 94.5963541667 Loss: 0.283582553578\n", 409 | "Val: F1 Score: 78.8451959418 Accuracy: 90.625 Loss: 0.493378305435\n", 410 | "Epoch: 46 Step: 2208 loss: 0.28896213385\n", 411 | "Epoch: 47 Step: 2256 loss: 0.299109598001\n", 412 | "Epoch: 48 Step: 2304 loss: 0.285256354449\n", 413 | "Epoch: 49 Step: 2352 loss: 0.293783533076\n", 414 | "Epoch: 50 Step: 2400 loss: 0.288317035573\n", 415 | "Saved Model Complete\n", 416 | "Train: F1 Score: 89.4559800481 Accuracy: 94.8567708333 Loss: 0.271899814407\n", 417 | "Val: F1 Score: 78.6662686459 Accuracy: 89.375 Loss: 0.450355643034\n", 418 | "Test: F1 Score: 78.1099629833 Accuracy: 89.7836538462 Loss: 0.494133715446\n" 419 | ] 420 | } 421 | ], 422 | "source": [ 423 | "num_epochs = 50\n", 424 | "\n", 425 | "for i in range(num_epochs):\n", 426 | " \n", 427 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n", 428 | " x_id_train = x_id_train[random]\n", 429 | " y_train = y_train[random]\n", 430 | " mask_train = mask_train[random]\n", 431 | " \n", 432 | " losses, step = model(x_id_train, y_train, mask_train)\n", 433 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n", 434 | " \n", 435 | " if((i+1)%5==0):\n", 436 | " saver.save(sess, save_path=save_dir) \n", 437 | " print(\"Saved Model Complete\")\n", 438 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n", 439 | " f1_, accu_ = eval_score(y_train, train_pred)\n", 440 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n", 441 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n", 442 | " f1_, accu_ = eval_score(y_val, val_pred)\n", 443 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n", 444 | " \n", 445 | "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n", 446 | "f1_, accu_ = eval_score(y_test, test_pred)\n", 447 | "print(\"Test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "collapsed": true 455 | }, 456 | "outputs": [], 457 | "source": [] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": { 463 | "collapsed": true 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "saver.restore(sess, save_dir)" 468 | ] 469 | } 470 | ], 471 | "metadata": { 472 | "kernelspec": { 473 | "display_name": "cs771", 474 | "language": "python", 475 | "name": "cs771" 476 | }, 477 | "language_info": { 478 | "codemirror_mode": { 479 | "name": "ipython", 480 | "version": 3 481 | }, 482 | "file_extension": ".py", 483 | "mimetype": "text/x-python", 484 | "name": "python", 485 | "nbconvert_exporter": "python", 486 | "pygments_lexer": "ipython3", 487 | "version": "3.5.2" 488 | } 489 | }, 490 | "nbformat": 4, 491 | "nbformat_minor": 2 492 | } 493 | -------------------------------------------------------------------------------- /Four Word Model/Model-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "tf.logging.set_verbosity(tf.logging.WARN)\n", 13 | "import pickle\n", 14 | "import numpy as np\n", 15 | "import 
os\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.metrics import f1_score\n", 18 | "from sklearn.metrics import accuracy_score\n", 19 | "import os\n", 20 | "from tensorflow.python.client import device_lib\n", 21 | "from collections import Counter" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "f = open('../Glove/word_embedding_glove', 'rb')\n", 33 | "word_embedding = pickle.load(f)\n", 34 | "f.close()\n", 35 | "word_embedding = word_embedding[: len(word_embedding)-1]\n", 36 | "\n", 37 | "f = open('../Glove/vocab_glove', 'rb')\n", 38 | "vocab = pickle.load(f)\n", 39 | "f.close()\n", 40 | "\n", 41 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n", 42 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n", 43 | "\n", 44 | "unknown_token = \"UNKNOWN_TOKEN\"\n", 45 | "\n", 46 | "f = open(\"train.pickle\", 'rb')\n", 47 | "full_data = pickle.load(f)\n", 48 | "f.close()" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# Model Description\n", 60 | "sense_word = 'hard'\n", 61 | "model_name = 'model-3'\n", 62 | "model_dir = 'output/' + sense_word + '/' + model_name\n", 63 | "save_dir = os.path.join(model_dir, \"save/\")\n", 64 | "log_dir = os.path.join(model_dir, \"log\")\n", 65 | "\n", 66 | "if not os.path.exists(model_dir):\n", 67 | " os.mkdir(model_dir)\n", 68 | "if not os.path.exists(save_dir):\n", 69 | " os.mkdir(save_dir)\n", 70 | "if not os.path.exists(log_dir):\n", 71 | " os.mkdir(log_dir)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "Counter({'HARD1': 3455, 'HARD2': 502, 'HARD3': 376})\n", 84 | "[ 1.21578586 5.30486965 5.47934437]\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "sense_counts = Counter(full_data[sense_word][1])\n", 90 | "print(sense_counts)\n", 91 | "total_count = len(full_data[sense_word][1])\n", 92 | "sort_sense_counts = sense_counts.most_common()\n", 93 | "vocab_sense = [k for k,v in sort_sense_counts]\n", 94 | "freq_sense = [v for k,v in sort_sense_counts]\n", 95 | "weights = np.multiply(6, [1 - count/total_count for count in freq_sense])\n", 96 | "weights = weights.astype(np.float32)\n", 97 | "print(weights)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": { 104 | "collapsed": true 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "# Parameters\n", 109 | "mode = 'train'\n", 110 | "num_senses = 3\n", 111 | "batch_size = 64\n", 112 | "vocab_size = len(vocab)\n", 113 | "unk_vocab_size = 1\n", 114 | "word_emb_size = len(word_embedding[0])\n", 115 | "max_sent_size = 200\n", 116 | "hidden_size = 100\n", 117 | "keep_prob = 0.5\n", 118 | "l2_lambda = 0.002\n", 119 | "init_lr = 0.005\n", 120 | "decay_steps = 500\n", 121 | "decay_rate = 0.96\n", 122 | "clip_norm = 1\n", 123 | "clipping = True" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 6, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "# MODEL\n", 135 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n", 136 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n", 137 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], 
name='x_mask') \n", 138 | "is_train = tf.placeholder('bool', [], name='is_train')\n", 139 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n", 140 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n", 141 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n", 142 | "\n", 143 | "with tf.name_scope(\"word_embedding\"):\n", 144 | " if mode == 'train':\n", 145 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n", 146 | " else:\n", 147 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n", 148 | " \n", 149 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n", 150 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n", 151 | "\n", 152 | "with tf.variable_scope(\"lstm1\"):\n", 153 | " cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 154 | " cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 155 | "\n", 156 | " d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n", 157 | " d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n", 158 | " \n", 159 | " (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n", 160 | " h1 = tf.concat([fw_h1, bw_h1], 2)\n", 161 | " \n", 162 | "with tf.variable_scope(\"lstm2\"):\n", 163 | " cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 164 | " cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 165 | "\n", 166 | " d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n", 167 | " d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n", 168 | " \n", 169 | " (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n", 170 | " h = tf.concat([fw_h2, bw_h2], 2)\n", 171 | "\n", 172 | "def attention(input_x, input_mask, W_att):\n", 173 | " h_masked = tf.boolean_mask(input_x, input_mask)\n", 174 | " h_tanh = tf.tanh(h_masked)\n", 175 | " u = tf.matmul(h_tanh, W_att)\n", 176 | " a = tf.nn.softmax(u)\n", 177 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n", 178 | " return c\n", 179 | "\n", 180 | "with tf.variable_scope(\"attention\"):\n", 181 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n", 182 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n", 183 | " for i in range(1, batch_size):\n", 184 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n", 185 | " \n", 186 | "with tf.variable_scope(\"softmax_layer\"):\n", 187 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n", 188 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n", 189 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n", 190 | " logits = tf.matmul(drop_c, W) + b\n", 191 | " predictions = tf.argmax(logits, 1)\n", 192 | "\n", 193 | "class_weight = tf.constant(weights)\n", 194 | "weighted_logits = logits * class_weight\n", 195 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=weighted_logits, labels=y))\n", 196 | "global_step = 
tf.Variable(0, trainable=False, name=\"global_step\")\n", 197 | "\n", 198 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n", 199 | "\n", 200 | "tv_all = tf.trainable_variables()\n", 201 | "tv_regu =[]\n", 202 | "for t in tv_all:\n", 203 | " if t.name.find('b:')==-1:\n", 204 | " tv_regu.append(t)\n", 205 | " \n", 206 | "# l2 Loss\n", 207 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n", 208 | "\n", 209 | "total_loss = loss + l2_loss\n", 210 | "\n", 211 | "# Optimizer for loss\n", 212 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n", 213 | "\n", 214 | "# Gradients and Variables for Loss\n", 215 | "grads_vars = optimizer.compute_gradients(total_loss)\n", 216 | "\n", 217 | "# Clipping of Gradients\n", 218 | "clipped_grads = grads_vars\n", 219 | "if(clipping == True):\n", 220 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n", 221 | "\n", 222 | "# Training Optimizer for Total Loss\n", 223 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n", 224 | "\n", 225 | "# Summaries\n", 226 | "var_summaries = []\n", 227 | "for v in tv_all:\n", 228 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n", 229 | " var_summaries.append(var_summary)\n", 230 | "\n", 231 | "var_summaries_merged = tf.summary.merge(var_summaries)\n", 232 | "\n", 233 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n", 234 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n", 235 | "summary = tf.summary.merge_all()" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 7, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 247 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n", 248 | "config = tf.ConfigProto()\n", 249 | "config.gpu_options.allow_growth = True\n", 250 | "sess = tf.Session(config=config)\n", 251 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n", 252 | "saver = tf.train.Saver() # For Saving the model\n", 253 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 8, 259 | "metadata": { 260 | "scrolled": true 261 | }, 262 | "outputs": [ 263 | { 264 | "name": "stderr", 265 | "output_type": "stream", 266 | "text": [ 267 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 268 | " FutureWarning)\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "# Splitting\n", 274 | "data_x = full_data[sense_word][0]\n", 275 | "data_y = full_data[sense_word][2]\n", 276 | "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n", 277 | "\n", 278 | "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 9, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "def data_prepare(x):\n", 290 | " num_examples = len(x)\n", 291 | "\n", 292 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n", 293 | " xx_mask = 
np.zeros([num_examples, max_sent_size], dtype=bool)\n", 294 | "\n", 295 | " for j in range(num_examples):\n", 296 | " for i in range(max_sent_size):\n", 297 | " if(i>=len(x[j])):\n", 298 | " break\n", 299 | " w = x[j][i]\n", 300 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n", 301 | " xx_mask[j][i] = True\n", 302 | " \n", 303 | " return xx, xx_mask\n", 304 | "\n", 305 | "def eval_score(yy, pred):\n", 306 | " num_batches = int(len(yy)/batch_size)\n", 307 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n", 308 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n", 309 | " return f1*100, accu*100\n", 310 | "\n", 311 | "def model(xx, yy, mask, train_cond=True):\n", 312 | " num_batches = int(len(xx)/batch_size)\n", 313 | " losses = 0\n", 314 | " preds = []\n", 315 | " for j in range(num_batches): \n", 316 | " \n", 317 | " s = j * batch_size\n", 318 | " e = (j+1) * batch_size\n", 319 | " \n", 320 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n", 321 | " \n", 322 | " \n", 323 | " if(train_cond==True):\n", 324 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n", 325 | " summary_writer.add_summary(_summary, step) \n", 326 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n", 327 | "\n", 328 | " else:\n", 329 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n", 330 | " preds.append(pred)\n", 331 | " \n", 332 | " losses +=_loss\n", 333 | "\n", 334 | " if(train_cond==False):\n", 335 | " y_pred = []\n", 336 | " for i in range(num_batches):\n", 337 | " for pred in preds[i]:\n", 338 | " y_pred.append(pred)\n", 339 | " return losses/num_batches, y_pred\n", 340 | " \n", 341 | " return losses/num_batches, step" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 10, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "x_id_train, mask_train = data_prepare(x_train)\n", 353 | "x_id_val, mask_val = data_prepare(x_val)\n", 354 | "x_id_test, mask_test = data_prepare(x_test)\n", 355 | "y_train = np.array(y_train)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 11, 361 | "metadata": { 362 | "scrolled": true 363 | }, 364 | "outputs": [ 365 | { 366 | "name": "stdout", 367 | "output_type": "stream", 368 | "text": [ 369 | "Epoch: 1 Step: 48 loss: 9.53506787121\n", 370 | "Epoch: 2 Step: 96 loss: 1.81961081177\n", 371 | "Epoch: 3 Step: 144 loss: 1.19337606803\n", 372 | "Epoch: 4 Step: 192 loss: 0.967174999416\n", 373 | "Epoch: 5 Step: 240 loss: 0.859784771999\n", 374 | "Saved Model Complete\n", 375 | "Train: F1 Score: 46.2728990557 Accuracy: 73.2096354167 Loss: 0.935422244171\n", 376 | "Val: F1 Score: 44.6230136155 Accuracy: 70.9375 Loss: 0.885838544369\n", 377 | "Epoch: 6 Step: 288 loss: 0.815433536967\n", 378 | "Epoch: 7 Step: 336 loss: 0.756411065037\n", 379 | "Epoch: 8 Step: 384 loss: 0.722958392153\n", 380 | "Epoch: 9 Step: 432 loss: 0.67455783921\n", 381 | "Epoch: 10 Step: 480 loss: 0.677137187993\n", 382 | "Saved Model Complete\n", 383 | "Train: F1 Score: 45.3513910841 Accuracy: 81.15234375 Loss: 0.622080009431\n", 384 | "Val: F1 Score: 43.2422709632 Accuracy: 81.875 Loss: 0.607948565483\n", 385 | "Epoch: 11 Step: 528 loss: 0.65565276891\n", 386 | "Epoch: 12 Step: 576 loss: 0.645226646215\n", 387 | "Epoch: 13 Step: 624 loss: 0.631849833454\n", 388 | "Epoch: 14 Step: 672 loss: 
0.653128698468\n", 389 | "Epoch: 15 Step: 720 loss: 0.610900692021\n", 390 | "Saved Model Complete\n", 391 | "Train: F1 Score: 60.5870838384 Accuracy: 83.3658854167 Loss: 0.585401636859\n", 392 | "Val: F1 Score: 61.1966168463 Accuracy: 85.3125 Loss: 0.595154416561\n", 393 | "Epoch: 16 Step: 768 loss: 0.640408499787\n", 394 | "Epoch: 17 Step: 816 loss: 0.573454591756\n", 395 | "Epoch: 18 Step: 864 loss: 0.573158189033\n", 396 | "Epoch: 19 Step: 912 loss: 0.580998883272\n", 397 | "Epoch: 20 Step: 960 loss: 0.599028664331\n", 398 | "Saved Model Complete\n", 399 | "Train: F1 Score: 66.2391100441 Accuracy: 85.6119791667 Loss: 0.579200811684\n", 400 | "Val: F1 Score: 63.3909012244 Accuracy: 84.375 Loss: 0.571177864075\n", 401 | "Epoch: 21 Step: 1008 loss: 0.613934485242\n", 402 | "Epoch: 22 Step: 1056 loss: 0.607284868757\n", 403 | "Epoch: 23 Step: 1104 loss: 0.597342180709\n", 404 | "Epoch: 24 Step: 1152 loss: 0.570371546472\n", 405 | "Epoch: 25 Step: 1200 loss: 0.580265671636\n", 406 | "Saved Model Complete\n", 407 | "Train: F1 Score: 67.9210837096 Accuracy: 86.7513020833 Loss: 0.537070132792\n", 408 | "Val: F1 Score: 73.5174165398 Accuracy: 89.0625 Loss: 0.566295391321\n", 409 | "Epoch: 26 Step: 1248 loss: 0.568779307107\n", 410 | "Epoch: 27 Step: 1296 loss: 0.55141502743\n", 411 | "Epoch: 28 Step: 1344 loss: 0.559002238015\n", 412 | "Epoch: 29 Step: 1392 loss: 0.569756407291\n", 413 | "Epoch: 30 Step: 1440 loss: 0.573152939479\n", 414 | "Saved Model Complete\n", 415 | "Train: F1 Score: 69.0664553653 Accuracy: 87.3046875 Loss: 0.59051666595\n", 416 | "Val: F1 Score: 68.3056653491 Accuracy: 88.125 Loss: 0.647302913666\n", 417 | "Epoch: 31 Step: 1488 loss: 0.601928584278\n", 418 | "Epoch: 32 Step: 1536 loss: 0.581918654342\n", 419 | "Epoch: 33 Step: 1584 loss: 0.539948465923\n", 420 | "Epoch: 34 Step: 1632 loss: 0.562553635488\n", 421 | "Epoch: 35 Step: 1680 loss: 0.547960610439\n", 422 | "Saved Model Complete\n", 423 | "Train: F1 Score: 71.4368257896 Accuracy: 88.4765625 Loss: 0.517511847119\n", 424 | "Val: F1 Score: 63.9771663859 Accuracy: 86.875 Loss: 0.614117074013\n", 425 | "Epoch: 36 Step: 1728 loss: 0.566355666146\n", 426 | "Epoch: 37 Step: 1776 loss: 0.555698808903\n", 427 | "Epoch: 38 Step: 1824 loss: 0.56517353033\n", 428 | "Epoch: 39 Step: 1872 loss: 0.581259304037\n", 429 | "Epoch: 40 Step: 1920 loss: 0.585148503383\n", 430 | "Saved Model Complete\n", 431 | "Train: F1 Score: 72.4950138601 Accuracy: 88.7044270833 Loss: 0.578148378059\n", 432 | "Val: F1 Score: 68.0165923988 Accuracy: 87.5 Loss: 0.708620613813\n", 433 | "Epoch: 41 Step: 1968 loss: 0.567735542854\n", 434 | "Epoch: 42 Step: 2016 loss: 0.539583496749\n", 435 | "Epoch: 43 Step: 2064 loss: 0.544194473575\n", 436 | "Epoch: 44 Step: 2112 loss: 0.556465638181\n", 437 | "Epoch: 45 Step: 2160 loss: 0.559930261845\n", 438 | "Saved Model Complete\n", 439 | "Train: F1 Score: 76.9940617261 Accuracy: 89.16015625 Loss: 0.536304668213\n", 440 | "Val: F1 Score: 74.9496075234 Accuracy: 88.4375 Loss: 0.573511379957\n", 441 | "Epoch: 46 Step: 2208 loss: 0.556281161805\n", 442 | "Epoch: 47 Step: 2256 loss: 0.549503739923\n", 443 | "Epoch: 48 Step: 2304 loss: 0.561590575303\n", 444 | "Epoch: 49 Step: 2352 loss: 0.538634177297\n", 445 | "Epoch: 50 Step: 2400 loss: 0.548110162839\n", 446 | "Saved Model Complete\n", 447 | "Train: F1 Score: 69.2087726432 Accuracy: 88.2486979167 Loss: 0.513670069476\n", 448 | "Val: F1 Score: 75.8463136033 Accuracy: 89.6875 Loss: 0.542824417353\n", 449 | "Test: F1 Score: 61.845299018 Accuracy: 85.0961538462 Loss: 
0.683341053816\n" 450 | ] 451 | } 452 | ], 453 | "source": [ 454 | "num_epochs = 50\n", 455 | "\n", 456 | "for i in range(num_epochs):\n", 457 | " \n", 458 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n", 459 | " x_id_train = x_id_train[random]\n", 460 | " y_train = y_train[random]\n", 461 | " mask_train = mask_train[random]\n", 462 | " \n", 463 | " losses, step = model(x_id_train, y_train, mask_train)\n", 464 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n", 465 | " \n", 466 | " if((i+1)%5==0):\n", 467 | " saver.save(sess, save_path=save_dir) \n", 468 | " print(\"Saved Model Complete\")\n", 469 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n", 470 | " f1_, accu_ = eval_score(y_train, train_pred)\n", 471 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n", 472 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n", 473 | " f1_, accu_ = eval_score(y_val, val_pred)\n", 474 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n", 475 | " \n", 476 | "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n", 477 | "f1_, accu_ = eval_score(y_test, test_pred)\n", 478 | "print(\"Test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": { 485 | "collapsed": true 486 | }, 487 | "outputs": [], 488 | "source": [] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": { 494 | "collapsed": true 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "saver.restore(sess, save_dir)" 499 | ] 500 | } 501 | ], 502 | "metadata": { 503 | "kernelspec": { 504 | "display_name": "cs771", 505 | "language": "python", 506 | "name": "cs771" 507 | }, 508 | "language_info": { 509 | "codemirror_mode": { 510 | "name": "ipython", 511 | "version": 3 512 | }, 513 | "file_extension": ".py", 514 | "mimetype": "text/x-python", 515 | "name": "python", 516 | "nbconvert_exporter": "python", 517 | "pygments_lexer": "ipython3", 518 | "version": "3.5.2" 519 | } 520 | }, 521 | "nbformat": 4, 522 | "nbformat_minor": 2 523 | } 524 | -------------------------------------------------------------------------------- /Four Word Model/Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "tf.logging.set_verbosity(tf.logging.WARN)\n", 13 | "import pickle\n", 14 | "import numpy as np\n", 15 | "import os\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.metrics import f1_score\n", 18 | "from sklearn.metrics import accuracy_score\n", 19 | "import os\n", 20 | "from tensorflow.python.client import device_lib" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "f = open('../Glove/word_embedding_glove', 'rb')\n", 32 | "word_embedding = pickle.load(f)\n", 33 | "f.close()\n", 34 | "word_embedding = word_embedding[: len(word_embedding)-1]\n", 35 | "\n", 36 | "f = open('../Glove/vocab_glove', 'rb')\n", 37 | "vocab = pickle.load(f)\n", 38 | "f.close()\n", 39 | "\n", 40 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n", 
41 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n", 42 | "\n", 43 | "unknown_token = \"UNKNOWN_TOKEN\"\n", 44 | "\n", 45 | "f = open(\"train.pickle\", 'rb')\n", 46 | "full_data = pickle.load(f)\n", 47 | "f.close()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "# Model Description\n", 59 | "sense_word = 'hard'\n", 60 | "model_name = 'basic'\n", 61 | "model_dir = 'output/' + sense_word + '/' + model_name\n", 62 | "save_dir = os.path.join(model_dir, \"save/\")\n", 63 | "log_dir = os.path.join(model_dir, \"log\")\n", 64 | "\n", 65 | "if not os.path.exists(model_dir):\n", 66 | " os.mkdir(model_dir)\n", 67 | "if not os.path.exists(save_dir):\n", 68 | " os.mkdir(save_dir)\n", 69 | "if not os.path.exists(log_dir):\n", 70 | " os.mkdir(log_dir)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# Parameters\n", 82 | "mode = 'train'\n", 83 | "num_senses = 3\n", 84 | "batch_size = 64\n", 85 | "vocab_size = len(vocab)\n", 86 | "unk_vocab_size = 1\n", 87 | "word_emb_size = len(word_embedding[0])\n", 88 | "max_sent_size = 200\n", 89 | "hidden_size = 100\n", 90 | "keep_prob = 0.5\n", 91 | "l2_lambda = 0.001\n", 92 | "init_lr = 0.001\n", 93 | "decay_steps = 5000\n", 94 | "decay_rate = 0.96\n", 95 | "clip_norm = 1\n", 96 | "clipping = True" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "# MODEL\n", 108 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n", 109 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n", 110 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n", 111 | "is_train = tf.placeholder('bool', [], name='is_train')\n", 112 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n", 113 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n", 114 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n", 115 | "\n", 116 | "with tf.name_scope(\"word_embedding\"):\n", 117 | " if mode == 'train':\n", 118 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n", 119 | " else:\n", 120 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n", 121 | " \n", 122 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n", 123 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n", 124 | "\n", 125 | "with tf.variable_scope(\"lstm\"):\n", 126 | " cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 127 | " cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 128 | "\n", 129 | " d_cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=input_keep_prob)\n", 130 | " d_cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=input_keep_prob)\n", 131 | " \n", 132 | " (fw_h, bw_h), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, Wx, sequence_length=x_len, dtype='float', scope='lstm')\n", 133 | " h = tf.concat([fw_h, bw_h], 2)\n", 134 | "\n", 135 | "def attention(input_x, input_mask, W_att):\n", 136 | " h_masked = tf.boolean_mask(input_x, 
input_mask)\n", 137 | " h_tanh = tf.tanh(h_masked)\n", 138 | " u = tf.matmul(h_tanh, W_att)\n", 139 | " a = tf.nn.softmax(u)\n", 140 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n", 141 | " return c\n", 142 | "\n", 143 | "with tf.variable_scope(\"attention\"):\n", 144 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n", 145 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n", 146 | " for i in range(1, batch_size):\n", 147 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n", 148 | " \n", 149 | "with tf.variable_scope(\"softmax_layer\"):\n", 150 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n", 151 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n", 152 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n", 153 | " logits = tf.matmul(drop_c, W) + b\n", 154 | " predictions = tf.argmax(logits, 1)\n", 155 | "\n", 156 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n", 157 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n", 158 | "\n", 159 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n", 160 | "\n", 161 | "tv_all = tf.trainable_variables()\n", 162 | "tv_regu =[]\n", 163 | "for t in tv_all:\n", 164 | " if t.name.find('b:')==-1:\n", 165 | " tv_regu.append(t)\n", 166 | " \n", 167 | "# l2 Loss\n", 168 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n", 169 | "\n", 170 | "total_loss = loss + l2_loss\n", 171 | "\n", 172 | "# Optimizer for loss\n", 173 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n", 174 | "\n", 175 | "# Gradients and Variables for Loss\n", 176 | "grads_vars = optimizer.compute_gradients(total_loss)\n", 177 | "\n", 178 | "# Clipping of Gradients\n", 179 | "clipped_grads = grads_vars\n", 180 | "if(clipping == True):\n", 181 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n", 182 | "\n", 183 | "# Training Optimizer for Total Loss\n", 184 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n", 185 | "\n", 186 | "# Summaries\n", 187 | "var_summaries = []\n", 188 | "for v in tv_all:\n", 189 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n", 190 | " var_summaries.append(var_summary)\n", 191 | "\n", 192 | "var_summaries_merged = tf.summary.merge(var_summaries)\n", 193 | "\n", 194 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n", 195 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n", 196 | "summary = tf.summary.merge_all()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 6, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 208 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", 209 | "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n", 210 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n", 211 | "saver = tf.train.Saver() # For Saving the model\n", 212 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "# # 
k-fold Splitting\n", 224 | "# data_x = np.array(full_data[sense_word][0])\n", 225 | "# data_y = np.array(full_data[sense_word][2])\n", 226 | "# kf = KFold(n_splits=5,shuffle=True,random_state=0)\n", 227 | "# for train_index, test_index in kf.split(X):\n", 228 | "# print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n", 229 | "# #x_train, x_test = data_x[train_index], data_x[test_index]\n", 230 | "# #y_train, y_test = data_y[train_index], data_y[test_index]" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 6, 236 | "metadata": { 237 | "scrolled": true 238 | }, 239 | "outputs": [ 240 | { 241 | "name": "stderr", 242 | "output_type": "stream", 243 | "text": [ 244 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 245 | " FutureWarning)\n" 246 | ] 247 | } 248 | ], 249 | "source": [ 250 | "# Splitting\n", 251 | "data_x = full_data[sense_word][0]\n", 252 | "data_y = full_data[sense_word][2]\n", 253 | "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n", 254 | "\n", 255 | "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 7, 261 | "metadata": { 262 | "collapsed": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "def data_prepare(x):\n", 267 | " num_examples = len(x)\n", 268 | "\n", 269 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n", 270 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n", 271 | "\n", 272 | " for j in range(num_examples):\n", 273 | " for i in range(max_sent_size):\n", 274 | " if(i>=len(x[j])):\n", 275 | " break\n", 276 | " w = x[j][i]\n", 277 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n", 278 | " xx_mask[j][i] = True\n", 279 | " \n", 280 | " return xx, xx_mask\n", 281 | "\n", 282 | "def eval_score(yy, pred):\n", 283 | " num_batches = int(len(yy)/batch_size)\n", 284 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n", 285 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n", 286 | " return f1*100, accu*100\n", 287 | "\n", 288 | "def model(xx, yy, mask, train_cond=True):\n", 289 | " num_batches = int(len(xx)/batch_size)\n", 290 | " losses = 0\n", 291 | " preds = []\n", 292 | " for j in range(num_batches): \n", 293 | " \n", 294 | " s = j * batch_size\n", 295 | " e = (j+1) * batch_size\n", 296 | " \n", 297 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n", 298 | " \n", 299 | " \n", 300 | " if(train_cond==True):\n", 301 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n", 302 | " summary_writer.add_summary(_summary, step) \n", 303 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n", 304 | "\n", 305 | " else:\n", 306 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n", 307 | " preds.append(pred)\n", 308 | " \n", 309 | " losses +=_loss\n", 310 | "\n", 311 | " if(train_cond==False):\n", 312 | " y_pred = []\n", 313 | " for i in range(num_batches):\n", 314 | " for pred in preds[i]:\n", 315 | " y_pred.append(pred)\n", 316 | " return losses/num_batches, y_pred\n", 317 | " \n", 318 | " return 
losses/num_batches, step" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 8, 324 | "metadata": { 325 | "collapsed": true 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "x_id_train, mask_train = data_prepare(x_train)\n", 330 | "x_id_val, mask_val = data_prepare(x_val)\n", 331 | "x_id_test, mask_test = data_prepare(x_test)\n", 332 | "y_train = np.array(y_train)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": { 339 | "collapsed": true, 340 | "scrolled": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "num_epochs = 10\n", 345 | "\n", 346 | "for i in range(num_epochs):\n", 347 | " \n", 348 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n", 349 | " x_id_train = x_id_train[random]\n", 350 | " y_train = y_train[random]\n", 351 | " mask_train = mask_train[random]\n", 352 | " \n", 353 | " losses, step = model(x_id_train, y_train, mask_train)\n", 354 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n", 355 | " saver.save(sess, save_path=save_dir) \n", 356 | " print(\"Saved Model Complete\")\n", 357 | " \n", 358 | " if((i+1)%2==0):\n", 359 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n", 360 | " f1_, accu_ = eval_score(y_train, train_pred)\n", 361 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n", 362 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n", 363 | " f1_, accu_ = eval_score(y_val, val_pred)\n", 364 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n", 365 | " \n", 366 | "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n", 367 | "f1_, accu_ = eval_score(y_test, test_pred)\n", 368 | "print(\"Test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": true 376 | }, 377 | "outputs": [], 378 | "source": [] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 10, 383 | "metadata": { 384 | "collapsed": true 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "saver.restore(sess, save_dir)" 389 | ] 390 | } 391 | ], 392 | "metadata": { 393 | "kernelspec": { 394 | "display_name": "cs771", 395 | "language": "python", 396 | "name": "cs771" 397 | }, 398 | "language_info": { 399 | "codemirror_mode": { 400 | "name": "ipython", 401 | "version": 3 402 | }, 403 | "file_extension": ".py", 404 | "mimetype": "text/x-python", 405 | "name": "python", 406 | "nbconvert_exporter": "python", 407 | "pygments_lexer": "ipython3", 408 | "version": "3.5.2" 409 | } 410 | }, 411 | "nbformat": 4, 412 | "nbformat_minor": 2 413 | } 414 | -------------------------------------------------------------------------------- /Four Word Model/final_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "from sklearn.model_selection import train_test_split\n", 11 | "import pickle" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "def decontracted(phrase):\n", 21 | " # specific\n", 22 | " phrase = re.sub(r\"won't\", \"will not\", phrase)\n", 23 | "\n", 24 | " # general\n", 25 | " phrase = 
re.sub(r\"n\\'t\", \" not\", phrase)\n", 26 | " phrase = re.sub(r\"\\'re\", \" are\", phrase)\n", 27 | " phrase = re.sub(r\"\\'s\", \" is\", phrase)\n", 28 | " phrase = re.sub(r\"\\'d\", \" would\", phrase)\n", 29 | " phrase = re.sub(r\"\\'ll\", \" will\", phrase)\n", 30 | " phrase = re.sub(r\"\\'t\", \" not\", phrase)\n", 31 | " phrase = re.sub(r\"\\'ve\", \" have\", phrase)\n", 32 | " phrase = re.sub(r\"\\'m\", \" am\", phrase)\n", 33 | " phrase = re.sub(r\"\\'d've\", \" would have\", phrase)\n", 34 | " phrase = re.sub(r\"\\'d'y\", \" do you\", phrase)\n", 35 | " return phrase\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "train = {}\n", 45 | "\n", 46 | "with open('./Preprocess_Files/hard/sent') as f:\n", 47 | " sents = f.readlines()\n", 48 | "content = [x.strip() for x in sents]\n", 49 | " \n", 50 | "with open('./Preprocess_Files/hard/sense') as f:\n", 51 | " senses = f.readlines()\n", 52 | "\n", 53 | "sents = []\n", 54 | "for sent in content:\n", 55 | " text = decontracted(sent.replace(\" ' \",\"'\"))\n", 56 | " result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n", 57 | " result = result.replace(' ',' ').split()\n", 58 | " result = [string.lower() for string in result]\n", 59 | " sents.append(result)\n", 60 | "\n", 61 | "type_class = []\n", 62 | "type_name = []\n", 63 | "for sense in senses:\n", 64 | " sense = sense.strip('\\n')\n", 65 | " type_name.append(sense)\n", 66 | " \n", 67 | " sense = sense.replace('HARD1','0').replace('HARD2','1').replace('HARD3','2')\n", 68 | " type_class.append(int(sense))\n", 69 | "\n", 70 | "train['hard'] = []\n", 71 | "train['hard'].extend([sents, type_name, type_class])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "with open('./Preprocess_Files/interest/sent') as f:\n", 81 | " sents = f.readlines()\n", 82 | "content = [x.strip() for x in sents]\n", 83 | " \n", 84 | "with open('./Preprocess_Files/interest/sense') as f:\n", 85 | " senses = f.readlines()\n", 86 | "\n", 87 | "sents = []\n", 88 | "for sent in content:\n", 89 | " text = decontracted(sent.replace(\" ' \",\"'\"))\n", 90 | " result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n", 91 | " result = result.replace(' ',' ').split()\n", 92 | " result = [string.lower() for string in result]\n", 93 | " sents.append(result)\n", 94 | "\n", 95 | "type_class = []\n", 96 | "type_name = []\n", 97 | "for sense in senses:\n", 98 | " sense = sense.strip('\\n')\n", 99 | " type_name.append(sense)\n", 100 | " \n", 101 | " sense = sense.replace('interest1','0').replace('interest2','1').replace('interest3','2').replace('interest4','3').replace('interest5','4').replace('interest6','5')\n", 102 | " type_class.append(int(sense))\n", 103 | "\n", 104 | "train['interest'] = []\n", 105 | "train['interest'].extend([sents, type_name, type_class])" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "with open('./Preprocess_Files/line/sent') as f:\n", 115 | " sents = f.readlines()\n", 116 | "content = [x.strip() for x in sents]\n", 117 | " \n", 118 | "with open('./Preprocess_Files/line/sense') as f:\n", 119 | " senses = f.readlines()\n", 120 | "\n", 121 | "sents = []\n", 122 | "for sent in content:\n", 123 | " text = decontracted(sent.replace(\" ' \",\"'\"))\n", 124 | " result = \"\".join(x for x in text if x.isalpha() 
or x.isspace())\n", 125 | " result = result.replace(' ',' ').split()\n", 126 | " result = [string.lower() for string in result]\n", 127 | " sents.append(result)\n", 128 | "\n", 129 | "type_class = []\n", 130 | "type_name = []\n", 131 | "for sense in senses:\n", 132 | " sense = sense.strip('\\n')\n", 133 | " type_name.append(sense)\n", 134 | " \n", 135 | " sense = sense.replace('text','0').replace('phone','1').replace('product','2').replace('formation','3').replace('division','4').replace('cord','5')\n", 136 | " type_class.append(int(sense))\n", 137 | "\n", 138 | "train['line'] = []\n", 139 | "train['line'].extend([sents, type_name, type_class])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 6, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "with open('./Preprocess_Files/serve/sent') as f:\n", 149 | " sents = f.readlines()\n", 150 | "content = [x.strip() for x in sents]\n", 151 | " \n", 152 | "with open('./Preprocess_Files/serve/sense') as f:\n", 153 | " senses = f.readlines()\n", 154 | "\n", 155 | "sents = []\n", 156 | "for sent in content:\n", 157 | " text = decontracted(sent.replace(\" ' \",\"'\"))\n", 158 | " result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n", 159 | " result = result.replace(' ',' ').split()\n", 160 | " result = [string.lower() for string in result]\n", 161 | " sents.append(result)\n", 162 | "\n", 163 | "type_class = []\n", 164 | "type_name = []\n", 165 | "for sense in senses:\n", 166 | " sense = sense.strip('\\n')\n", 167 | " type_name.append(sense)\n", 168 | " \n", 169 | " sense = sense.replace('SERVE2','0').replace('SERVE6','1').replace('SERVE10','2').replace('SERVE12','3')\n", 170 | " type_class.append(int(sense))\n", 171 | "\n", 172 | "train['serve'] = []\n", 173 | "train['serve'].extend([sents, type_name, type_class])" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 8, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "def train_test(target):\n", 183 | " x = train['target'][0]\n", 184 | " y = train['target'][2]\n", 185 | " x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, stratify=y)\n", 186 | " return x_train, x_test, y_train, y_test" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 9, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "with open('full_train.pickle', 'wb') as f:\n", 196 | " pickle.dump(train, f)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 10, 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "name": "stdout", 206 | "output_type": "stream", 207 | "text": [ 208 | "['he', 'may', 'lose', 'all', 'popular', 'support', 'but', 'someone', 'has', 'to', 'kill', 'him', 'to', 'defeat', 'him', 'and', 'that', 'is', 'hard', 'to', 'do']\n", 209 | "HARD1\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "print(train['hard'][0][0])\n", 215 | "print(train['hard'][1][0]) #class of hard" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 11, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "108\n", 228 | "127\n", 229 | "165\n", 230 | "161\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "def max_length(target):\n", 236 | " max_len = 0\n", 237 | " for sentence in train[target][0]:\n", 238 | " temp_len = len(sentence)\n", 239 | " max_len = max(max_len, temp_len)\n", 240 | " print(max_len)\n", 241 | "\n", 242 | "max_length('hard') 
\n", 243 | "max_length('interest')\n", 244 | "max_length('line')\n", 245 | "max_length('serve')" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [] 254 | } 255 | ], 256 | "metadata": { 257 | "kernelspec": { 258 | "display_name": "Python 3", 259 | "language": "python", 260 | "name": "python3" 261 | }, 262 | "language_info": { 263 | "codemirror_mode": { 264 | "name": "ipython", 265 | "version": 3 266 | }, 267 | "file_extension": ".py", 268 | "mimetype": "text/x-python", 269 | "name": "python", 270 | "nbconvert_exporter": "python", 271 | "pygments_lexer": "ipython3", 272 | "version": "3.6.2" 273 | } 274 | }, 275 | "nbformat": 4, 276 | "nbformat_minor": 2 277 | } 278 | -------------------------------------------------------------------------------- /Four Word Model/full_train.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/full_train.pickle -------------------------------------------------------------------------------- /Four Word Model/robsr_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "tf.logging.set_verbosity(tf.logging.WARN)\n", 13 | "import pickle\n", 14 | "import numpy as np\n", 15 | "import os\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.metrics import f1_score\n", 18 | "from sklearn.metrics import accuracy_score\n", 19 | "import os\n", 20 | "from tensorflow.python.client import device_lib" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "f = open('../Glove/word_embedding_glove', 'rb')\n", 32 | "word_embedding = pickle.load(f)\n", 33 | "f.close()\n", 34 | "word_embedding = word_embedding[: len(word_embedding)-1]\n", 35 | "\n", 36 | "f = open('../Glove/vocab_glove', 'rb')\n", 37 | "vocab = pickle.load(f)\n", 38 | "f.close()\n", 39 | "\n", 40 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n", 41 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n", 42 | "\n", 43 | "unknown_token = \"UNKNOWN_TOKEN\"\n", 44 | "\n", 45 | "f = open(\"train.pickle\", 'rb')\n", 46 | "full_data = pickle.load(f)\n", 47 | "f.close()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "# Model Description\n", 59 | "sense_word = 'hard'\n", 60 | "model_name = 'basic'\n", 61 | "model_dir = 'output/' + sense_word + '/' + model_name\n", 62 | "save_dir = os.path.join(model_dir, \"save/\")\n", 63 | "log_dir = os.path.join(model_dir, \"log\")\n", 64 | "\n", 65 | "if not os.path.exists(save_dir):\n", 66 | " os.mkdir(save_dir)\n", 67 | "if not os.path.exists(log_dir):\n", 68 | " os.mkdir(log_dir)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "# Parameters\n", 80 | "mode = 'train'\n", 81 | "num_senses = 3\n", 82 | "batch_size = 64\n", 83 | "vocab_size = len(vocab)\n", 84 | "unk_vocab_size = 
1\n", 85 | "word_emb_size = len(word_embedding[0])\n", 86 | "max_sent_size = 200\n", 87 | "hidden_size = 100\n", 88 | "keep_prob = 0.5\n", 89 | "l2_lambda = 0.001\n", 90 | "init_lr = 0.001\n", 91 | "decay_steps = 5000\n", 92 | "decay_rate = 0.96\n", 93 | "clip_norm = 1\n", 94 | "clipping = True" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# MODEL\n", 106 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n", 107 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n", 108 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n", 109 | "is_train = tf.placeholder('bool', [], name='is_train')\n", 110 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n", 111 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n", 112 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n", 113 | "\n", 114 | "with tf.name_scope(\"word_embedding\"):\n", 115 | " if mode == 'train':\n", 116 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n", 117 | " else:\n", 118 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n", 119 | " \n", 120 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n", 121 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n", 122 | "\n", 123 | "with tf.variable_scope(\"lstm\"):\n", 124 | " cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 125 | " cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 126 | "\n", 127 | " d_cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=input_keep_prob)\n", 128 | " d_cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=input_keep_prob)\n", 129 | " \n", 130 | " (fw_h, bw_h), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, Wx, sequence_length=x_len, dtype='float', scope='lstm')\n", 131 | " h = tf.concat([fw_h, bw_h], 2)\n", 132 | "\n", 133 | "def attention(input_x, input_mask, W_att):\n", 134 | " h_masked = tf.boolean_mask(input_x, input_mask)\n", 135 | " h_tanh = tf.tanh(h_masked)\n", 136 | " u = tf.matmul(h_tanh, W_att)\n", 137 | " a = tf.nn.softmax(u)\n", 138 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n", 139 | " return c\n", 140 | "\n", 141 | "with tf.variable_scope(\"attention\"):\n", 142 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=1.0, seed=0), name=\"W_att\")\n", 143 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n", 144 | " for i in range(1, batch_size):\n", 145 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n", 146 | " \n", 147 | "with tf.variable_scope(\"softmax_layer\"):\n", 148 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=1.0, seed=0), name=\"W\")\n", 149 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n", 150 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n", 151 | " logits = tf.matmul(drop_c, W) + b\n", 152 | " predictions = tf.argmax(logits, 1)\n", 153 | "\n", 154 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n", 155 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n", 156 | "\n", 
157 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n", 158 | "\n", 159 | "tv_all = tf.trainable_variables()\n", 160 | "tv_regu =[]\n", 161 | "for t in tv_all:\n", 162 | " if t.name.find('b:')==-1:\n", 163 | " tv_regu.append(t)\n", 164 | " \n", 165 | "# l2 Loss\n", 166 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n", 167 | "\n", 168 | "total_loss = loss + l2_loss\n", 169 | "\n", 170 | "# Optimizer for loss\n", 171 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n", 172 | "\n", 173 | "# Gradients and Variables for Loss\n", 174 | "grads_vars = optimizer.compute_gradients(total_loss)\n", 175 | "\n", 176 | "# Clipping of Gradients\n", 177 | "clipped_grads = grads_vars\n", 178 | "if(clipping == True):\n", 179 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n", 180 | "\n", 181 | "# Training Optimizer for Total Loss\n", 182 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n", 183 | "\n", 184 | "# Summaries\n", 185 | "var_summaries = []\n", 186 | "for v in tv_all:\n", 187 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n", 188 | " var_summaries.append(var_summary)\n", 189 | "\n", 190 | "var_summaries_merged = tf.summary.merge(var_summaries)\n", 191 | "\n", 192 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n", 193 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n", 194 | "summary = tf.summary.merge_all()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 6, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 204 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", 205 | "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n", 206 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n", 207 | "saver = tf.train.Saver() # For Saving the model\n", 208 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 6, 214 | "metadata": { 215 | "scrolled": true 216 | }, 217 | "outputs": [ 218 | { 219 | "name": "stderr", 220 | "output_type": "stream", 221 | "text": [ 222 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 223 | " FutureWarning)\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Splitting\n", 229 | "data_x = full_data[sense_word][0]\n", 230 | "data_y = full_data[sense_word][2]\n", 231 | "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n", 232 | "\n", 233 | "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 7, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "def data_prepare(x):\n", 245 | " num_examples = len(x)\n", 246 | "\n", 247 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n", 248 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n", 249 | "\n", 250 | " for j in range(num_examples):\n", 251 | " for i in range(max_sent_size):\n", 252 | " 
if(i>=len(x[j])):\n", 253 | " break\n", 254 | " w = x[j][i]\n", 255 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n", 256 | " xx_mask[j][i] = True\n", 257 | " \n", 258 | " return xx, xx_mask\n", 259 | "\n", 260 | "def eval_score(yy, pred):\n", 261 | " num_batches = int(len(yy)/batch_size)\n", 262 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n", 263 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n", 264 | " return f1*100, accu*100\n", 265 | "\n", 266 | "def model(xx, yy, mask, train_cond=True):\n", 267 | " num_batches = int(len(xx)/batch_size)\n", 268 | " losses = 0\n", 269 | " preds = []\n", 270 | " for j in range(num_batches): \n", 271 | " \n", 272 | " s = j * batch_size\n", 273 | " e = (j+1) * batch_size\n", 274 | " \n", 275 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n", 276 | " \n", 277 | " \n", 278 | " if(train_cond==True):\n", 279 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n", 280 | " summary_writer.add_summary(_summary, step) \n", 281 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n", 282 | "\n", 283 | " else:\n", 284 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n", 285 | " preds.append(pred)\n", 286 | " \n", 287 | " losses +=_loss\n", 288 | "\n", 289 | " if(train_cond==False):\n", 290 | " y_pred = []\n", 291 | " for i in range(num_batches):\n", 292 | " for pred in preds[i]:\n", 293 | " y_pred.append(pred)\n", 294 | " return losses/num_batches, y_pred\n", 295 | " \n", 296 | " return losses/num_batches, step" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 8, 302 | "metadata": { 303 | "collapsed": true 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "x_id_train, mask_train = data_prepare(x_train)\n", 308 | "x_id_val, mask_val = data_prepare(x_val)\n", 309 | "x_id_test, mask_test = data_prepare(x_test)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "scrolled": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "num_epochs = 10\n", 321 | "\n", 322 | "for i in range(num_epochs):\n", 323 | " \n", 324 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n", 325 | " x_id_train = x_id_train[random]\n", 326 | " y_train = y_train[random]\n", 327 | " mask_train = mask_train[random]\n", 328 | " \n", 329 | " losses, step = model(x_id_train, y_train, mask_train)\n", 330 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n", 331 | " saver.save(sess, save_path=save_dir) \n", 332 | " print(\"Saved Model Complete\")\n", 333 | " \n", 334 | " if((i+1)%2==0):\n", 335 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n", 336 | " f1_, accu_ = eval_score(y_train, train_pred)\n", 337 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n", 338 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n", 339 | " f1_, accu_ = eval_score(y_val, val_pred)\n", 340 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n", 341 | " \n", 342 | "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n", 343 | "f1_, accu_ = eval_score(y_test, test_pred)\n", 344 | "print(\"Test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | 
"execution_count": null, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 10, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "saver.restore(sess, save_dir)" 363 | ] 364 | } 365 | ], 366 | "metadata": { 367 | "kernelspec": { 368 | "display_name": "cs771", 369 | "language": "python", 370 | "name": "cs771" 371 | }, 372 | "language_info": { 373 | "codemirror_mode": { 374 | "name": "ipython", 375 | "version": 3 376 | }, 377 | "file_extension": ".py", 378 | "mimetype": "text/x-python", 379 | "name": "python", 380 | "nbconvert_exporter": "python", 381 | "pygments_lexer": "ipython3", 382 | "version": "3.5.2" 383 | } 384 | }, 385 | "nbformat": 4, 386 | "nbformat_minor": 2 387 | } 388 | -------------------------------------------------------------------------------- /Four Word Model/train.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/train.pickle -------------------------------------------------------------------------------- /Four Word Model/words_not_in_vocab.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/words_not_in_vocab.pickle -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Shanu Kumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Word Sense Disambiguation 2 | 3 | Word sense disambiguation (WSD) is the ability to identify the meaning of words in context. We address this problem using series of end-to-end neural architectures using bidirectional Long Short Term Memory (LSTM). We propose two variants for WSD: an end-to-end word specific neural model and all-words neural model. In the word specific models we have to train models for every disambiguation target word. 
We addressed this issue with the all-words model, which relies on sequence learning. We also used POS tags to improve performance. We tried different variants of attention mechanisms for the all-words model. Performance was further boosted by convolutional neural networks (CNN), which capture local features around each word, much as humans rely on the surrounding context when predicting a sense. We improved performance further with hierarchical models that use POS tags as the hierarchy, in two variants: soft masking and hard masking. 4 | 5 | ### Methods 6 | 7 | * [Word Specific Model trained on Four Word Dataset](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/Four%20Word%20Model) 8 | * [Word Specific Model trained on One Million Dataset](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/one_million) 9 | * [All-words Model](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/one_million/all-word) 10 | * [Hierarchical Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb) 11 | 12 | ### Best Models 13 | * [All-words Model+CNN](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-1.4.ipynb) 14 | * [All-words Hierarchical Model+Soft Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb) 15 | * [All-words Hierarchical Model+Hard Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-4.ipynb) 16 | 17 | 18 | ### Details 19 | For detailed information about models and results: 20 | * [Report](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/UGP_Report.pdf) 21 | * [Presentation](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/UGP_presentation.pdf) 22 | 23 | ### All-words Models 24 | 25 | #### [All-words Hierarchical Model+Soft Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb) 26 |
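The linked notebook builds soft masking into its TensorFlow graph; the NumPy sketch below only illustrates the idea. A coarse classifier (assumed here to predict a lexicographer-style category) outputs a probability per category, and each fine sense is re-weighted by the probability of the category it belongs to, so unlikely categories are down-weighted rather than removed outright. All sizes and the `sense_to_category` mapping are invented for the example.

```python
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

# Hypothetical sizes: 5 coarse categories, 12 fine senses.
num_categories, num_senses = 5, 12
rng = np.random.RandomState(0)

# Which coarse category each fine sense belongs to (assumed mapping).
sense_to_category = rng.randint(0, num_categories, size=num_senses)

# Outputs of the two heads for one target word.
category_logits = rng.randn(num_categories)   # coarse head
sense_logits = rng.randn(num_senses)          # fine head

# Soft masking: scale each sense's probability by its category's probability.
category_probs = softmax(category_logits)
soft_mask = category_probs[sense_to_category]   # one weight per sense
masked_probs = softmax(sense_logits) * soft_mask
masked_probs /= masked_probs.sum()              # renormalise

predicted_sense = int(np.argmax(masked_probs))
print(predicted_sense)
```

Because the mask is multiplicative rather than binary, a confident fine-sense score can still win even when the coarse classifier slightly prefers another category.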

27 | 28 |

29 | 30 | #### [All-words Hierarchical Model+Hard Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-4.ipynb) 31 |
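Hard masking replaces the soft re-weighting with a binary constraint: senses whose coarse category disagrees with the predicted category are excluded before normalisation. Again a NumPy illustration with invented sizes and mapping, not the notebook's actual graph:

```python
import numpy as np

num_categories, num_senses = 5, 12
rng = np.random.RandomState(1)
sense_to_category = rng.randint(0, num_categories, size=num_senses)

sense_logits = rng.randn(num_senses)
# Pretend the coarse classifier chose this category (ensures a non-empty mask).
predicted_category = int(sense_to_category[0])

# Hard masking: forbid every sense outside the predicted category.
allowed = (sense_to_category == predicted_category)
masked_logits = np.where(allowed, sense_logits, -np.inf)

probs = np.exp(masked_logits - masked_logits[allowed].max())
probs[~allowed] = 0.0
probs /= probs.sum()

predicted_sense = int(np.argmax(probs))
print(predicted_sense, allowed)
```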

32 | 33 |

34 | 35 | #### [Basic Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-1-multigpu-1.ipynb) 36 |
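The basic all-words model treats WSD as sequence labelling: a bidirectional LSTM reads the whole sentence and a shared softmax layer scores a label for every token. The TensorFlow 1.x sketch below shows that overall shape only; the vocabulary, label-set, and layer sizes are placeholders rather than the notebook's values, and the notebook additionally initialises its embeddings from GloVe.

```python
import tensorflow as tf

# Assumed sizes for the sketch.
vocab_size, emb_size, hidden_size, num_labels, max_len = 5000, 100, 100, 46, 200

x = tf.placeholder(tf.int32, [None, max_len])         # word ids
x_len = tf.placeholder(tf.int32, [None])              # true sentence lengths
y = tf.placeholder(tf.int32, [None, max_len])         # per-token label ids
y_mask = tf.placeholder(tf.float32, [None, max_len])  # 1.0 where a sense is annotated

emb = tf.get_variable("emb", [vocab_size, emb_size])
inputs = tf.nn.embedding_lookup(emb, x)

# Bidirectional LSTM over the sentence.
cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size)
cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size)
(h_fw, h_bw), _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, inputs, sequence_length=x_len, dtype=tf.float32)
h = tf.concat([h_fw, h_bw], axis=2)                   # [batch, max_len, 2*hidden]

# Shared per-token classifier; only annotated tokens contribute to the loss.
logits = tf.layers.dense(h, num_labels)
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
loss = tf.reduce_sum(losses * y_mask) / tf.maximum(tf.reduce_sum(y_mask), 1.0)
predictions = tf.argmax(logits, axis=2)
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
```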

37 | 38 |

39 | 40 | #### [Basic Model+Local Attention](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-4.ipynb) 41 |
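Local attention restricts the attention computation to a window of tokens around the word being disambiguated instead of the whole sentence. A NumPy sketch of one such scoring step; the window size, dimensions, and scoring vector are illustrative assumptions rather than the notebook's hyper-parameters.

```python
import numpy as np

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

rng = np.random.RandomState(0)
sent_len, dim, window = 20, 8, 3       # assumed sizes
H = rng.randn(sent_len, dim)           # BiLSTM outputs, one row per token
w_att = rng.randn(dim)                 # attention scoring vector
target = 10                            # position of the word to disambiguate

# Attend only over a (2*window + 1)-token neighbourhood of the target word.
lo, hi = max(0, target - window), min(sent_len, target + window + 1)
scores = H[lo:hi] @ w_att
weights = softmax(scores)
context = weights @ H[lo:hi]           # local context vector fed to the classifier
print(context.shape, weights.round(2))
```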

42 | 43 |

44 | 45 | #### [Basic Model+Local Attention+Hidden States](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-6.ipynb) 46 |

47 | 48 |

49 | 50 | #### [Basic Model+Local Attention+Hidden States+CRF](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v3-1.ipynb) 51 |
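Adding a CRF on top of the per-token scores lets neighbouring label decisions interact instead of being made independently. A minimal TensorFlow 1.x sketch using `tf.contrib.crf` (available in the TF 1.x versions these notebooks run on); the unary-score placeholder stands in for whatever the rest of the model produces, and the sizes are assumptions.

```python
import tensorflow as tf

num_labels, max_len = 46, 200  # assumed label-set size and padded length

unary_scores = tf.placeholder(tf.float32, [None, max_len, num_labels])
labels = tf.placeholder(tf.int32, [None, max_len])
seq_len = tf.placeholder(tf.int32, [None])

# CRF log-likelihood also creates and returns the learned transition matrix.
log_likelihood, transitions = tf.contrib.crf.crf_log_likelihood(
    unary_scores, labels, seq_len)
loss = tf.reduce_mean(-log_likelihood)

# Viterbi decoding at prediction time.
decoded_tags, _ = tf.contrib.crf.crf_decode(unary_scores, transitions, seq_len)
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
```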

52 | 53 |

54 | 55 | #### [Basic Model+Gated Attention](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-7.ipynb) 56 |
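One common way to realise gated attention, sketched below in NumPy, is to let a learned sigmoid gate decide, dimension by dimension, how much of the attention context versus the token's own hidden state to keep. This is a generic illustration of the mechanism with random stand-in matrices; it is not the exact formulation used in the notebook.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.RandomState(0)
dim = 8                                  # assumed hidden size
h = rng.randn(dim)                       # hidden state of the target word
c = rng.randn(dim)                       # attention context vector

# Gate computed from both inputs, then a per-dimension convex mix.
W_g = rng.randn(dim, 2 * dim)
g = sigmoid(W_g @ np.concatenate([h, c]))
combined = g * h + (1.0 - g) * c         # representation fed to the classifier
print(combined.shape)
```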

57 | 58 |

59 | 60 | #### [Basic Model+CNN](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-1.4.ipynb) 61 |
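The CNN variant convolves over the word representations so that each position sees a small neighbourhood of surrounding words before sense classification, which captures the local collocation cues that typically decide a sense. A TensorFlow 1.x sketch with assumed filter widths and sizes; the notebook's own hyper-parameters may differ.

```python
import tensorflow as tf

emb_size, max_len, num_filters, num_labels = 100, 200, 64, 46  # assumed sizes

inputs = tf.placeholder(tf.float32, [None, max_len, emb_size])  # word embeddings

# Convolutions of different widths capture different local windows.
conv3 = tf.layers.conv1d(inputs, num_filters, kernel_size=3, padding="same",
                         activation=tf.nn.relu)
conv5 = tf.layers.conv1d(inputs, num_filters, kernel_size=5, padding="same",
                         activation=tf.nn.relu)
features = tf.concat([conv3, conv5], axis=2)     # [batch, max_len, 2*num_filters]

logits = tf.layers.dense(features, num_labels)   # per-token sense scores
predictions = tf.argmax(logits, axis=2)
```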

62 | 63 |

64 | 65 | ### Word Specific Models 66 | 67 | #### [Basic Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-1-multigpu-1.ipynb) 68 | Files named Model-1-multigpu-1.ipynb are the basic models. 69 |

70 | 71 |

72 | 73 | #### [Basic Model+POS Tags](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-2-multigpu-1.ipynb) 74 | Files named Model-2-multigpu-1.ipynb are the basic models with POS tags. 75 |
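The +POS Tags variants feed each token's part-of-speech tag in alongside its word embedding, for example by concatenating a small learned tag embedding, as sketched below in TensorFlow 1.x. The vocabulary and embedding sizes here are invented for the illustration.

```python
import tensorflow as tf

vocab_size, num_pos_tags = 5000, 12          # assumed sizes
word_emb_size, pos_emb_size, max_len = 100, 16, 200

word_ids = tf.placeholder(tf.int32, [None, max_len])
pos_ids = tf.placeholder(tf.int32, [None, max_len])

word_emb = tf.get_variable("word_emb", [vocab_size, word_emb_size])
pos_emb = tf.get_variable("pos_emb", [num_pos_tags, pos_emb_size])

# Concatenate word and POS-tag embeddings token by token.
words = tf.nn.embedding_lookup(word_emb, word_ids)
tags = tf.nn.embedding_lookup(pos_emb, pos_ids)
inputs = tf.concat([words, tags], axis=2)    # [batch, max_len, word+pos size]
```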

76 | 77 |

78 | 79 | #### [Basic Model+POS Tags+CRF](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-3-multigpu-1.ipynb) 80 | Files named Model-3-multigpu-1.ipynb are the basic models with POS tags and a CRF layer. 81 |

82 | 83 |

84 | 85 | #### [Word specific hierarchical model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-4-multigpu-1.ipynb) 86 | Files named Model-4-multigpu-1.ipynb are the word-specific hierarchical models. 87 |

88 | 89 |

90 | 91 | -------------------------------------------------------------------------------- /UGP_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/UGP_Report.pdf -------------------------------------------------------------------------------- /UGP_presentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/UGP_presentation.pdf -------------------------------------------------------------------------------- /models_diagram/all-word-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-1.png -------------------------------------------------------------------------------- /models_diagram/all-word-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-2.png -------------------------------------------------------------------------------- /models_diagram/all-word-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-3.png -------------------------------------------------------------------------------- /models_diagram/all-word-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-4.png -------------------------------------------------------------------------------- /models_diagram/all-word-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-5.png -------------------------------------------------------------------------------- /models_diagram/all-word-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-6.png -------------------------------------------------------------------------------- /models_diagram/all-word-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-7.png -------------------------------------------------------------------------------- /models_diagram/all-word-8.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-8.png -------------------------------------------------------------------------------- /models_diagram/model-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-1.png -------------------------------------------------------------------------------- /models_diagram/model-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-2.png -------------------------------------------------------------------------------- /models_diagram/model-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-3.png -------------------------------------------------------------------------------- /models_diagram/model-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-4.png -------------------------------------------------------------------------------- /one_million/One-Million All-Word Data Sampling Coarse.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "46\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import pickle\n", 20 | "import numpy as np\n", 21 | "import os\n", 22 | "from sklearn.model_selection import train_test_split\n", 23 | "from collections import Counter\n", 24 | "from imblearn.over_sampling import RandomOverSampler\n", 25 | "\n", 26 | "f = open(\"../../dataset/sense/dict_sense-keys\", 'rb')\n", 27 | "dict_sense_keys = pickle.load(f)\n", 28 | "f.close()\n", 29 | "\n", 30 | "f = open(\"../../dataset/sense/dict_word-sense\", 'rb')\n", 31 | "dict_word_sense = pickle.load(f)\n", 32 | "f.close()\n", 33 | "\n", 34 | "f = open('../Glove/word_embedding_glove', 'rb')\n", 35 | "word_embedding = pickle.load(f)\n", 36 | "f.close()\n", 37 | "word_embedding = word_embedding[: len(word_embedding)-1]\n", 38 | "\n", 39 | "f = open('../Glove/vocab_glove', 'rb')\n", 40 | "vocab = pickle.load(f)\n", 41 | "f.close()\n", 42 | "\n", 43 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n", 44 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n", 45 | "\n", 46 | "unknown_token = \"UNKNOWN_TOKEN\"\n", 47 | "\n", 48 | "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n", 49 | " data=pickle.load(f)\n", 50 | "\n", 51 | "with open('/data/aviraj/dataset/fulldata_vocab_sense','rb') as f:\n", 52 | " vocab_lex=pickle.load(f)\n", 53 | "\n", 54 | "lex2id = dict((s, i) for i,s in enumerate(vocab_lex))\n", 55 | "id2lex = dict((i, s) for i,s in enumerate(vocab_lex))\n", 56 | "\n", 57 | "print(len(vocab_lex))\n", 58 | 
"max_sent_size = 200" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "12\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "_pos = []\n", 76 | "for i in range(len(data)):\n", 77 | " for pp in data[i][4]:\n", 78 | " _pos.append(pp)\n", 79 | " \n", 80 | "pos_count = Counter(_pos)\n", 81 | "pos_count = pos_count.most_common()\n", 82 | "vocab_pos = [pp for pp, c in pos_count]\n", 83 | "pos2id = dict((s, i) for i,s in enumerate(vocab_pos))\n", 84 | "print(len(vocab_pos))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "data_y1 = []\n", 96 | "data_y2 = []\n", 97 | "data_y3 = []\n", 98 | "for i in range(len(data)):\n", 99 | " if (len(data[i][1])<=200):\n", 100 | " for j in range(len(data[i][2])):\n", 101 | " if data[i][2][j] is not None:\n", 102 | " data_y1.append(dict_sense_keys[data[i][2][j]][3])\n", 103 | " data_y2.append(dict_sense_keys[data[i][2][j]][4])\n", 104 | " data_y3.append(dict_sense_keys[data[i][2][j]][5])\n", 105 | "\n", 106 | "sense_count1 = Counter(data_y1)\n", 107 | "sense_count1 = sense_count1.most_common()\n", 108 | "sense_count2 = Counter(data_y2)\n", 109 | "sense_count4 = sense_count2.most_common(272)\n", 110 | "sense_count2 = sense_count2.most_common(312)\n", 111 | "sense_count3 = Counter(data_y3)\n", 112 | "sense_count5 = sense_count3.most_common(505)\n", 113 | "sense_count3 = sense_count3.most_common(1051)\n", 114 | "\n", 115 | "dict_sense_count1 = dict(sense_count1)\n", 116 | "dict_sense_count2 = dict(sense_count2)\n", 117 | "dict_sense_count3 = dict(sense_count3)\n", 118 | "dict_sense_count4 = dict(sense_count4)\n", 119 | "dict_sense_count5 = dict(sense_count5)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": { 126 | "scrolled": true 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "46 312 1051 272 505\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "print(len(sense_count1), len(sense_count2), len(sense_count3), len(sense_count4), len(sense_count5))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 5, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "data_x = []\n", 150 | "data_pos = []\n", 151 | "data_label1 = []\n", 152 | "data_label2 = []\n", 153 | "data_label3 = []\n", 154 | "data_label4 = []\n", 155 | "data_label5 = []\n", 156 | "\n", 157 | "for i in range(len(data)):\n", 158 | " if not all(np.array(data[i][2])==None) and (len(data[i][1])<=200):\n", 159 | " data_label1.append([ss if ss is not None and dict_sense_keys[ss][3] in dict_sense_count1 else None for ss in data[i][2]])\n", 160 | " data_label2.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count2 else None for ss in data[i][2]])\n", 161 | " data_label3.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count3 else None for ss in data[i][2]])\n", 162 | " data_label4.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count4 else None for ss in data[i][2]])\n", 163 | " data_label5.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count5 else None for ss in data[i][2]])\n", 164 | " data_x.append(data[i][1])\n", 165 | " data_pos.append(data[i][4])" 166 | ] 167 | }, 168 | { 169 | 
"cell_type": "code", 170 | "execution_count": 6, 171 | "metadata": { 172 | "collapsed": true 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "def data_prepare(sense_id, x, pos, y, sense_count, lex_cond=False, pos_cond=False):\n", 177 | " num_examples = len(x)\n", 178 | " \n", 179 | " vocab_sense = [s for s, c in sense_count]\n", 180 | " sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n", 181 | " \n", 182 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n", 183 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n", 184 | " ss_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n", 185 | " yy = np.zeros([num_examples,max_sent_size], dtype=int)\n", 186 | " y_lex = np.zeros([num_examples, max_sent_size], dtype=int)\n", 187 | " y_pos = np.zeros([num_examples, max_sent_size], dtype=int)\n", 188 | " \n", 189 | " for j in range(num_examples):\n", 190 | " for i in range(max_sent_size):\n", 191 | " if(i>=len(x[j])):\n", 192 | " break\n", 193 | " w = x[j][i]\n", 194 | " s = y[j][i]\n", 195 | " p = pos[j][i]\n", 196 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n", 197 | " xx_mask[j][i] = True\n", 198 | " ss_mask[j][i] = True if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else False\n", 199 | " yy[j][i] = sense2id[dict_sense_keys[s][sense_id]] if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else 0\n", 200 | " if(lex_cond):\n", 201 | " y_lex[j][i] = lex2id[dict_sense_keys[s][3]] if s is not None and dict_sense_keys[s][3] in vocab_lex else len(vocab_lex)\n", 202 | " if(pos_cond):\n", 203 | " y_pos[j][i] = pos2id[p] if p in vocab_pos else len(vocab_pos)\n", 204 | " \n", 205 | " return xx, xx_mask, ss_mask, yy, y_lex, y_pos" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 7, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "data_x = np.array(data_x)\n", 217 | "data_pos = np.array(data_pos)\n", 218 | "\n", 219 | "def train_val_data(name, sense_id, index, split_label, data_label, sense_count, sampling_list, lex_cond=False, pos_cond=False, sampling=False):\n", 220 | " \n", 221 | " index_train, index_val, label_train_id, label_val_id = train_test_split(index, split_label, train_size=0.8, shuffle=True, stratify=split_label, random_state=0)\n", 222 | " \n", 223 | " if(sampling):\n", 224 | " dict_sample = dict(sampling_list)\n", 225 | " sm = RandomOverSampler(ratio=dict_sample)\n", 226 | " index_train1 = np.array(index_train).reshape(-1, 1)\n", 227 | " sampled_index, _ = sm.fit_sample(index_train1, label_train_id)\n", 228 | " count = Counter(_)\n", 229 | " count = count.most_common()\n", 230 | " sampled_index_train = np.array(sampled_index).reshape(1, -1)\n", 231 | " index_train = sampled_index_train[0]\n", 232 | " \n", 233 | " data_label = np.array(data_label)\n", 234 | " x_train = data_x[index_train]\n", 235 | " y_train = data_label[index_train]\n", 236 | " x_val = data_x[index_val]\n", 237 | " y_val = data_label[index_val]\n", 238 | " pos_train = []\n", 239 | " pos_val = []\n", 240 | " \n", 241 | " if(pos_cond):\n", 242 | " pos_train = data_pos[index_train]\n", 243 | " pos_val = data_pos[index_val]\n", 244 | "\n", 245 | " x_id_train, mask_train, sense_mask_train, y_id_train, lex_train, pos_id_train = data_prepare(sense_id, x_train, pos_train, y_train, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n", 246 | " x_id_val, mask_val, sense_mask_val, y_id_val, lex_val, pos_id_val = data_prepare(sense_id, x_val, pos_val, y_val, 
sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n", 247 | "\n", 248 | " train_data = {'x':x_id_train,'x_mask':mask_train, 'sense_mask':sense_mask_train, 'y':y_id_train, 'lex':lex_train, 'pos':pos_id_train}\n", 249 | " val_data = {'x':x_id_val,'x_mask':mask_val, 'sense_mask':sense_mask_val, 'y':y_id_val, 'lex':lex_val, 'pos':pos_id_val}\n", 250 | " \n", 251 | " with open('/data/aviraj/dataset/train_val_data_coarse/all_word_'+ name,'wb') as f:\n", 252 | " pickle.dump([train_data,val_data], f)\n", 253 | " \n", 254 | " print(len(x_id_train)+len(x_id_val))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 8, 260 | "metadata": { 261 | "scrolled": true 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "850093\n", 269 | "850062\n", 270 | "850052\n", 271 | "849793\n", 272 | "848996\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "split_label1 = []\n", 278 | "split_label2 = []\n", 279 | "split_label3 = []\n", 280 | "split_label4 = []\n", 281 | "split_label5 = []\n", 282 | "\n", 283 | "index1 = []\n", 284 | "index2 = []\n", 285 | "index3 = []\n", 286 | "index4 = []\n", 287 | "index5 = []\n", 288 | "\n", 289 | "for jj, lab in enumerate(data_label1):\n", 290 | " min_idx = np.argmin([dict_sense_count1[dict_sense_keys[lab[i]][3]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n", 291 | " if(lab[min_idx] is not None):\n", 292 | " index1.append(jj)\n", 293 | " split_label1.append(dict_sense_keys[lab[min_idx]][3])\n", 294 | "\n", 295 | "for jj, lab in enumerate(data_label2):\n", 296 | " min_idx = np.argmin([dict_sense_count2[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n", 297 | " if(lab[min_idx] is not None):\n", 298 | " index2.append(jj)\n", 299 | " split_label2.append(dict_sense_keys[lab[min_idx]][4])\n", 300 | "\n", 301 | "for jj, lab in enumerate(data_label3):\n", 302 | " min_idx = np.argmin([dict_sense_count3[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n", 303 | " if(lab[min_idx] is not None):\n", 304 | " index3.append(jj)\n", 305 | " split_label3.append(dict_sense_keys[lab[min_idx]][5])\n", 306 | " \n", 307 | "for jj, lab in enumerate(data_label4):\n", 308 | " min_idx = np.argmin([dict_sense_count4[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n", 309 | " if(lab[min_idx] is not None):\n", 310 | " index4.append(jj)\n", 311 | " split_label4.append(dict_sense_keys[lab[min_idx]][4])\n", 312 | "\n", 313 | "for jj, lab in enumerate(data_label5):\n", 314 | " min_idx = np.argmin([dict_sense_count5[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n", 315 | " if(lab[min_idx] is not None):\n", 316 | " index5.append(jj)\n", 317 | " split_label5.append(dict_sense_keys[lab[min_idx]][5])\n", 318 | " \n", 319 | "print(len(split_label1))\n", 320 | "print(len(split_label2))\n", 321 | "print(len(split_label3))\n", 322 | "print(len(split_label4))\n", 323 | "print(len(split_label5))" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 9, 329 | "metadata": { 330 | "scrolled": true 331 | }, 332 | "outputs": [ 333 | { 334 | "name": "stderr", 335 | "output_type": "stream", 336 | "text": [ 337 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 338 | " 
FutureWarning)\n" 339 | ] 340 | }, 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "850093\n", 346 | "850062\n", 347 | "850052\n", 348 | "849793\n", 349 | "848996\n", 350 | "848996\n" 351 | ] 352 | } 353 | ], 354 | "source": [ 355 | "train_val_data('lex1', 3, index1, split_label1, data_label1, sense_count1, [], lex_cond=False, pos_cond=True)\n", 356 | "train_val_data('lex2', 3, index2, split_label2, data_label2, sense_count1, [], lex_cond=False, pos_cond=True)\n", 357 | "train_val_data('lex3', 3, index3, split_label3, data_label3, sense_count1, [], lex_cond=False, pos_cond=True)\n", 358 | "train_val_data('sense1', 4, index4, split_label4, data_label4, sense_count4, [], lex_cond=True, pos_cond=True)\n", 359 | "train_val_data('sense2', 4, index5, split_label5, data_label5, sense_count4, [], lex_cond=True, pos_cond=True)\n", 360 | "train_val_data('full_sense', 5, index5, split_label5, data_label5, sense_count5, [], lex_cond=True, pos_cond=True)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 10, 366 | "metadata": { 367 | "collapsed": true 368 | }, 369 | "outputs": [], 370 | "source": [ 371 | "sampled_sense_count1 = [('1:19', 10000),\n", 372 | " ('1:17', 10000),\n", 373 | " ('2:34', 10000),\n", 374 | " ('2:33', 10000),\n", 375 | " ('1:27', 10000),\n", 376 | " ('2:37', 8000),\n", 377 | " ('1:24', 8000),\n", 378 | " ('1:08', 8000),\n", 379 | " ('1:12', 7000),\n", 380 | " ('1:22', 5000),\n", 381 | " ('2:29', 5000),\n", 382 | " ('1:05', 3000),\n", 383 | " ('1:16', 3000),\n", 384 | " ('1:25', 3000),\n", 385 | " ('1:20', 3000),\n", 386 | " ('1:13', 2000),\n", 387 | " ('2:43', 1100),\n", 388 | " ('3:44', 1000)]" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 11, 394 | "metadata": { 395 | "collapsed": true 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "sampled_sense_count2= []\n", 400 | "for s, c in sense_count2[260:]:\n", 401 | " sampled_sense_count2.append((s, 500))\n", 402 | "for s, c in sense_count2[180:260]:\n", 403 | " sampled_sense_count2.append((s, 2000))\n", 404 | "for s, c in sense_count2[140:180]:\n", 405 | " sampled_sense_count2.append((s, 5000))\n", 406 | "for s, c in sense_count2[75:140]:\n", 407 | " sampled_sense_count2.append((s, 8000))\n", 408 | "for s, c in sense_count2[25:75]:\n", 409 | " sampled_sense_count2.append((s, 12000))" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 12, 415 | "metadata": { 416 | "collapsed": true 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "sampled_sense_count3= []\n", 421 | "for s, c in sense_count3[400:]:\n", 422 | " sampled_sense_count3.append((s, 500))\n", 423 | "for s, c in sense_count3[200:400]:\n", 424 | " sampled_sense_count3.append((s, 2000))\n", 425 | "for s, c in sense_count3[100:200]:\n", 426 | " sampled_sense_count3.append((s, 5000))\n", 427 | "for s, c in sense_count3[70:100]:\n", 428 | " sampled_sense_count3.append((s, 8000))\n", 429 | "for s, c in sense_count3[25:70]:\n", 430 | " sampled_sense_count3.append((s, 12000))" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 13, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "sampled_sense_count4= []\n", 442 | "for s, c in sense_count4[260:]:\n", 443 | " sampled_sense_count4.append((s, 500))\n", 444 | "for s, c in sense_count4[180:260]:\n", 445 | " sampled_sense_count4.append((s, 2000))\n", 446 | "for s, c in sense_count4[140:180]:\n", 447 | " sampled_sense_count4.append((s, 
5000))\n", 448 | "for s, c in sense_count4[75:140]:\n", 449 | " sampled_sense_count4.append((s, 8000))\n", 450 | "for s, c in sense_count4[25:75]:\n", 451 | " sampled_sense_count4.append((s, 12000))" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 14, 457 | "metadata": { 458 | "collapsed": true 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "sampled_sense_count5= []\n", 463 | "for s, c in sense_count5[400:]:\n", 464 | " sampled_sense_count5.append((s, 500))\n", 465 | "for s, c in sense_count5[200:400]:\n", 466 | " sampled_sense_count5.append((s, 2000))\n", 467 | "for s, c in sense_count5[100:200]:\n", 468 | " sampled_sense_count5.append((s, 5000))\n", 469 | "for s, c in sense_count5[70:100]:\n", 470 | " sampled_sense_count5.append((s, 8000))\n", 471 | "for s, c in sense_count5[25:70]:\n", 472 | " sampled_sense_count5.append((s, 12000))" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 15, 478 | "metadata": { 479 | "scrolled": false 480 | }, 481 | "outputs": [ 482 | { 483 | "name": "stderr", 484 | "output_type": "stream", 485 | "text": [ 486 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 487 | " FutureWarning)\n" 488 | ] 489 | }, 490 | { 491 | "name": "stdout", 492 | "output_type": "stream", 493 | "text": [ 494 | "911174\n", 495 | "2061567\n", 496 | "2512876\n", 497 | "2041581\n", 498 | "2239996\n", 499 | "2239996\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "train_val_data('lex1_sampled', 3, index1, split_label1, data_label1, sense_count1, sampled_sense_count1, lex_cond=False, pos_cond=True, sampling=True)\n", 505 | "train_val_data('lex2_sampled', 3, index2, split_label2, data_label2, sense_count1, sampled_sense_count2, lex_cond=False, pos_cond=True, sampling=True)\n", 506 | "train_val_data('lex3_sampled', 3, index3, split_label3, data_label3, sense_count1, sampled_sense_count3, lex_cond=False, pos_cond=True, sampling=True)\n", 507 | "train_val_data('sense1_sampled', 4, index4, split_label4, data_label4, sense_count4, sampled_sense_count4, lex_cond=True, pos_cond=True, sampling=True)\n", 508 | "train_val_data('sense2_sampled', 4, index5, split_label5, data_label5, sense_count4, sampled_sense_count5, lex_cond=True, pos_cond=True, sampling=True)\n", 509 | "train_val_data('full_sense_sampled', 5, index5, split_label5, data_label5, sense_count5, sampled_sense_count5, lex_cond=True, pos_cond=True, sampling=True)" 510 | ] 511 | } 512 | ], 513 | "metadata": { 514 | "kernelspec": { 515 | "display_name": "cs771", 516 | "language": "python", 517 | "name": "cs771" 518 | }, 519 | "language_info": { 520 | "codemirror_mode": { 521 | "name": "ipython", 522 | "version": 3 523 | }, 524 | "file_extension": ".py", 525 | "mimetype": "text/x-python", 526 | "name": "python", 527 | "nbconvert_exporter": "python", 528 | "pygments_lexer": "ipython3", 529 | "version": "3.5.2" 530 | } 531 | }, 532 | "nbformat": 4, 533 | "nbformat_minor": 2 534 | } 535 | -------------------------------------------------------------------------------- /one_million/One-Million All-Word Data Sampling-Fine.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [], 12 | "source": [ 13 | "import 
pickle\n", 14 | "import numpy as np\n", 15 | "import os\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from collections import Counter\n", 18 | "from imblearn.over_sampling import RandomOverSampler" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false, 26 | "deletable": true, 27 | "editable": true, 28 | "scrolled": true 29 | }, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "46\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "f = open(\"../../dataset/sense/dict_sense-keys\", 'rb')\n", 41 | "dict_sense_keys = pickle.load(f)\n", 42 | "f.close()\n", 43 | "\n", 44 | "f = open(\"../../dataset/sense/dict_word-sense\", 'rb')\n", 45 | "dict_word_sense = pickle.load(f)\n", 46 | "f.close()\n", 47 | "\n", 48 | "f = open('../Glove/word_embedding_glove', 'rb')\n", 49 | "word_embedding = pickle.load(f)\n", 50 | "f.close()\n", 51 | "word_embedding = word_embedding[: len(word_embedding)-1]\n", 52 | "\n", 53 | "f = open('../Glove/vocab_glove', 'rb')\n", 54 | "vocab = pickle.load(f)\n", 55 | "f.close()\n", 56 | "\n", 57 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n", 58 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n", 59 | "\n", 60 | "unknown_token = \"UNKNOWN_TOKEN\"\n", 61 | "\n", 62 | "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n", 63 | " data=pickle.load(f)\n", 64 | "\n", 65 | "with open('/data/aviraj/dataset/fulldata_vocab_sense','rb') as f:\n", 66 | " vocab_lex=pickle.load(f)\n", 67 | "\n", 68 | "lex2id = dict((s, i) for i,s in enumerate(vocab_lex))\n", 69 | "id2lex = dict((i, s) for i,s in enumerate(vocab_lex))\n", 70 | "\n", 71 | "print(len(vocab_lex))\n", 72 | "max_sent_size = 200" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": { 79 | "collapsed": false, 80 | "deletable": true, 81 | "editable": true 82 | }, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "12\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "_pos = []\n", 94 | "for i in range(len(data)):\n", 95 | " for pp in data[i][4]:\n", 96 | " _pos.append(pp)\n", 97 | " \n", 98 | "pos_count = Counter(_pos)\n", 99 | "pos_count = pos_count.most_common()\n", 100 | "vocab_pos = [pp for pp, c in pos_count]\n", 101 | "pos2id = dict((s, i) for i,s in enumerate(vocab_pos))\n", 102 | "print(len(vocab_pos))" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "metadata": { 109 | "collapsed": false, 110 | "deletable": true, 111 | "editable": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "data_y1 = []\n", 116 | "data_y2 = []\n", 117 | "data_y3 = []\n", 118 | "for i in range(len(data)):\n", 119 | " if (len(data[i][1])<=200):\n", 120 | " for j in range(len(data[i][2])):\n", 121 | " if data[i][2][j] is not None:\n", 122 | " data_y1.append(dict_sense_keys[data[i][2][j]][3])\n", 123 | " data_y2.append(dict_sense_keys[data[i][2][j]][4])\n", 124 | " data_y3.append(dict_sense_keys[data[i][2][j]][5])\n", 125 | "\n", 126 | "sense_count1 = Counter(data_y1)\n", 127 | "sense_count1 = sense_count1.most_common()[:-2]\n", 128 | "\n", 129 | "sense_count2 = Counter(data_y2)\n", 130 | "sense_count2 = sense_count2.most_common(180)\n", 131 | "\n", 132 | "sense_count3 = Counter(data_y3)\n", 133 | "sense_count3 = sense_count3.most_common(300)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 5, 139 | "metadata": { 140 | "collapsed": 
true, 141 | "deletable": true, 142 | "editable": true 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "dict_sense_count1 = dict(sense_count1)\n", 147 | "dict_sense_count2 = dict(sense_count2)\n", 148 | "dict_sense_count3 = dict(sense_count3)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 6, 154 | "metadata": { 155 | "collapsed": false, 156 | "deletable": true, 157 | "editable": true, 158 | "scrolled": true 159 | }, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "44 180 300\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "print(len(sense_count1), len(sense_count2), len(sense_count3))" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 7, 176 | "metadata": { 177 | "collapsed": true, 178 | "deletable": true, 179 | "editable": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "data_x = []\n", 184 | "data_pos = []\n", 185 | "data_label1 = []\n", 186 | "data_label2 = []\n", 187 | "data_label3 = []\n", 188 | "\n", 189 | "for i in range(len(data)):\n", 190 | " if not all(np.array(data[i][2])==None) and (len(data[i][1])<=200):\n", 191 | " data_label1.append([ss if ss is not None and dict_sense_keys[ss][3] in dict_sense_count1 else None for ss in data[i][2]])\n", 192 | " data_label2.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count2 else None for ss in data[i][2]])\n", 193 | " data_label3.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count3 else None for ss in data[i][2]])\n", 194 | " data_x.append(data[i][1])\n", 195 | " data_pos.append(data[i][4])" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 8, 201 | "metadata": { 202 | "collapsed": true, 203 | "deletable": true, 204 | "editable": true 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "def data_prepare(sense_id, x, pos, y, sense_count, lex_cond=False, pos_cond=False):\n", 209 | " num_examples = len(x)\n", 210 | " \n", 211 | " vocab_sense = [s for s, c in sense_count]\n", 212 | " sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n", 213 | " \n", 214 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n", 215 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n", 216 | " ss_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n", 217 | " yy = np.zeros([num_examples,max_sent_size], dtype=int)\n", 218 | " y_lex = np.zeros([num_examples, max_sent_size], dtype=int)\n", 219 | " y_pos = np.zeros([num_examples, max_sent_size], dtype=int)\n", 220 | " \n", 221 | " for j in range(num_examples):\n", 222 | " for i in range(max_sent_size):\n", 223 | " if(i>=len(x[j])):\n", 224 | " break\n", 225 | " w = x[j][i]\n", 226 | " s = y[j][i]\n", 227 | " p = pos[j][i]\n", 228 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n", 229 | " xx_mask[j][i] = True\n", 230 | " ss_mask[j][i] = True if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else False\n", 231 | " yy[j][i] = sense2id[dict_sense_keys[s][sense_id]] if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else 0\n", 232 | " if(lex_cond):\n", 233 | " y_lex[j][i] = lex2id[dict_sense_keys[s][3]] if s is not None and dict_sense_keys[s][3] in vocab_lex else len(vocab_lex)\n", 234 | " if(pos_cond):\n", 235 | " y_pos[j][i] = pos2id[p] if p in vocab_pos else len(vocab_pos)\n", 236 | " \n", 237 | " return xx, xx_mask, ss_mask, yy, y_lex, y_pos" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 9, 
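The data_prepare function above turns each tokenized sentence into fixed-size id arrays plus boolean masks (xx_mask marks real tokens, ss_mask marks the sense-annotated tokens). A minimal standalone sketch of the same padding-and-masking idea, keeping only the word/mask outputs; the toy vocabulary and sentences here are illustrative assumptions, not data from the notebook:

import numpy as np

word2id = {"UNKNOWN_TOKEN": 0, "the": 1, "bank": 2, "river": 3}   # toy vocab; the notebook uses the GloVe vocab loaded above
max_sent_size = 6                                                 # toy length; the notebook uses 200

def pad_and_mask(sentences):
    xx = np.zeros([len(sentences), max_sent_size], dtype=int)        # word ids, zero-padded to max_sent_size
    xx_mask = np.zeros([len(sentences), max_sent_size], dtype=bool)  # True only where a real token sits
    for j, sent in enumerate(sentences):
        for i, w in enumerate(sent[:max_sent_size]):
            xx[j][i] = word2id.get(w, word2id["UNKNOWN_TOKEN"])
            xx_mask[j][i] = True
    return xx, xx_mask

xx, xx_mask = pad_and_mask([["the", "bank"], ["the", "river", "bank"]])
print(xx)       # [[1 2 0 0 0 0] [1 3 2 0 0 0]]
print(xx_mask)  # padding positions are False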
243 | "metadata": { 244 | "collapsed": true, 245 | "deletable": true, 246 | "editable": true 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "data_x = np.array(data_x)\n", 251 | "data_pos = np.array(data_pos)\n", 252 | "\n", 253 | "def train_val_data(name, sense_id, index, split_label, data_label, sense_count, sampling_list, lex_cond=False, pos_cond=False, sampling=False):\n", 254 | " \n", 255 | " index_train, index_val, label_train_id, label_val_id = train_test_split(index, split_label, train_size=0.8, shuffle=True, stratify=split_label, random_state=0)\n", 256 | " \n", 257 | " if(sampling):\n", 258 | " dict_sample = dict(sampling_list)\n", 259 | " sm = RandomOverSampler(ratio=dict_sample)\n", 260 | " index_train1 = np.array(index_train).reshape(-1, 1)\n", 261 | " sampled_index, _ = sm.fit_sample(index_train1, label_train_id)\n", 262 | " count = Counter(_)\n", 263 | " count = count.most_common()\n", 264 | " sampled_index_train = np.array(sampled_index).reshape(1, -1)\n", 265 | " index_train = sampled_index_train[0]\n", 266 | " \n", 267 | " data_label = np.array(data_label)\n", 268 | " x_train = data_x[index_train]\n", 269 | " y_train = data_label[index_train]\n", 270 | " x_val = data_x[index_val]\n", 271 | " y_val = data_label[index_val]\n", 272 | " pos_train = []\n", 273 | " pos_val = []\n", 274 | " \n", 275 | " if(pos_cond):\n", 276 | " pos_train = data_pos[index_train]\n", 277 | " pos_val = data_pos[index_val]\n", 278 | "\n", 279 | " x_id_train, mask_train, sense_mask_train, y_id_train, lex_train, pos_id_train = data_prepare(sense_id, x_train, pos_train, y_train, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n", 280 | " x_id_val, mask_val, sense_mask_val, y_id_val, lex_val, pos_id_val = data_prepare(sense_id, x_val, pos_val, y_val, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n", 281 | "\n", 282 | " train_data = {'x':x_id_train,'x_mask':mask_train, 'sense_mask':sense_mask_train, 'y':y_id_train, 'lex':lex_train, 'pos':pos_id_train}\n", 283 | " val_data = {'x':x_id_val,'x_mask':mask_val, 'sense_mask':sense_mask_val, 'y':y_id_val, 'lex':lex_val, 'pos':pos_id_val}\n", 284 | " \n", 285 | " with open('/data/aviraj/dataset/train_val_data_fine/all_word_'+ name,'wb') as f:\n", 286 | " pickle.dump([train_data,val_data], f)\n", 287 | " \n", 288 | " print(len(x_id_train)+len(x_id_val))" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 10, 294 | "metadata": { 295 | "collapsed": false, 296 | "deletable": true, 297 | "editable": true, 298 | "scrolled": true 299 | }, 300 | "outputs": [ 301 | { 302 | "name": "stdout", 303 | "output_type": "stream", 304 | "text": [ 305 | "850083\n", 306 | "838757\n", 307 | "828921\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "split_label1 = []\n", 313 | "split_label2 = []\n", 314 | "split_label3 = []\n", 315 | "\n", 316 | "index1 = []\n", 317 | "index2 = []\n", 318 | "index3 = []\n", 319 | "\n", 320 | "for jj, lab in enumerate(data_label1):\n", 321 | " min_idx = np.argmin([dict_sense_count1[dict_sense_keys[lab[i]][3]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n", 322 | " if(lab[min_idx] is not None):\n", 323 | " index1.append(jj)\n", 324 | " split_label1.append(dict_sense_keys[lab[min_idx]][3])\n", 325 | "\n", 326 | "for jj, lab in enumerate(data_label2):\n", 327 | " min_idx = np.argmin([dict_sense_count2[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n", 328 | " if(lab[min_idx] is not None):\n", 329 | " index2.append(jj)\n", 330 | " 
split_label2.append(dict_sense_keys[lab[min_idx]][4])\n", 331 | "\n", 332 | "for jj, lab in enumerate(data_label3):\n", 333 | " min_idx = np.argmin([dict_sense_count3[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n", 334 | " if(lab[min_idx] is not None):\n", 335 | " index3.append(jj)\n", 336 | " split_label3.append(dict_sense_keys[lab[min_idx]][5])\n", 337 | " \n", 338 | "print(len(split_label1))\n", 339 | "print(len(split_label2))\n", 340 | "print(len(split_label3))" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 11, 346 | "metadata": { 347 | "collapsed": false, 348 | "deletable": true, 349 | "editable": true, 350 | "scrolled": true 351 | }, 352 | "outputs": [ 353 | { 354 | "name": "stderr", 355 | "output_type": "stream", 356 | "text": [ 357 | "/users/btech/aviraj/envs/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 358 | " FutureWarning)\n" 359 | ] 360 | }, 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "850083\n", 366 | "838757\n", 367 | "828921\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "train_val_data('lex', 3, index1, split_label1, data_label1, sense_count1, [], lex_cond=False, pos_cond=True)\n", 373 | "train_val_data('sense', 4, index2, split_label2, data_label2, sense_count2, [], lex_cond=True, pos_cond=True)\n", 374 | "train_val_data('full_sense', 5, index3, split_label3, data_label3, sense_count3, [], lex_cond=True, pos_cond=True)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 12, 380 | "metadata": { 381 | "collapsed": true, 382 | "deletable": true, 383 | "editable": true 384 | }, 385 | "outputs": [], 386 | "source": [ 387 | "sampled_sense_count1 = [('1:19', 10000),\n", 388 | " ('1:17', 10000),\n", 389 | " ('2:34', 10000),\n", 390 | " ('2:33', 10000),\n", 391 | " ('1:27', 10000),\n", 392 | " ('2:37', 8000),\n", 393 | " ('1:24', 8000),\n", 394 | " ('1:08', 8000),\n", 395 | " ('1:12', 7000),\n", 396 | " ('1:22', 5000),\n", 397 | " ('2:29', 5000),\n", 398 | " ('1:05', 3000),\n", 399 | " ('1:16', 3000),\n", 400 | " ('1:25', 3000),\n", 401 | " ('1:20', 3000),\n", 402 | " ('1:13', 2000)]" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 13, 408 | "metadata": { 409 | "collapsed": true, 410 | "deletable": true, 411 | "editable": true 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "sampled_sense_count2= []\n", 416 | "for s, c in sense_count2[120:]:\n", 417 | " sampled_sense_count2.append((s, 5000))\n", 418 | "for s, c in sense_count2[75:120]:\n", 419 | " sampled_sense_count2.append((s, 8000))\n", 420 | "for s, c in sense_count2[25:75]:\n", 421 | " sampled_sense_count2.append((s, 12000))" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 14, 427 | "metadata": { 428 | "collapsed": true, 429 | "deletable": true, 430 | "editable": true 431 | }, 432 | "outputs": [], 433 | "source": [ 434 | "sampled_sense_count3= []\n", 435 | "for s, c in sense_count3[130:]:\n", 436 | " sampled_sense_count3.append((s, 5000))\n", 437 | "for s, c in sense_count3[70:130]:\n", 438 | " sampled_sense_count3.append((s, 8000))\n", 439 | "for s, c in sense_count3[25:70]:\n", 440 | " sampled_sense_count3.append((s, 12000))" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 15, 446 | "metadata": { 447 | "collapsed": false, 448 | "deletable": true, 449 | 
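The sampled_sense_count lists above assign a target example count to each sense label, with smaller targets for the rarer labels in the tail. train_val_data turns such a list into a dict and hands it to imbalanced-learn's RandomOverSampler, which duplicates training indices until every class reaches its target. A toy sketch of that call; the labels and counts below are made up purely for illustration:

import numpy as np
from imblearn.over_sampling import RandomOverSampler

index_train = np.array([0, 1, 2, 3, 4]).reshape(-1, 1)   # indices into the sentence list, as a column
label_train = ['1:19', '1:19', '1:19', '2:34', '2:34']    # one stratification label per sentence

# ratio maps each class to the number of samples wanted after oversampling
sm = RandomOverSampler(ratio={'1:19': 3, '2:34': 4})
sampled_index, sampled_label = sm.fit_sample(index_train, label_train)
print(sampled_index.ravel())   # indices of class '2:34' are repeated until it has 4 examples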
"editable": true 450 | }, 451 | "outputs": [ 452 | { 453 | "name": "stderr", 454 | "output_type": "stream", 455 | "text": [ 456 | "/users/btech/aviraj/envs/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 457 | " FutureWarning)\n" 458 | ] 459 | }, 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "909119\n", 465 | "1814988\n", 466 | "2375783\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "train_val_data('lex_sampled', 3, index1, split_label1, data_label1, sense_count1, sampled_sense_count1, lex_cond=False, pos_cond=True, sampling=True)\n", 472 | "train_val_data('sense_sampled', 4, index2, split_label2, data_label2, sense_count2, sampled_sense_count2, lex_cond=True, pos_cond=True, sampling=True)\n", 473 | "train_val_data('full_sense_sampled', 5, index3, split_label3, data_label3, sense_count3, sampled_sense_count3, lex_cond=True, pos_cond=True, sampling=True)" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": { 480 | "collapsed": true 481 | }, 482 | "outputs": [], 483 | "source": [] 484 | } 485 | ], 486 | "metadata": { 487 | "kernelspec": { 488 | "display_name": "envs", 489 | "language": "python", 490 | "name": "cs771" 491 | }, 492 | "language_info": { 493 | "codemirror_mode": { 494 | "name": "ipython", 495 | "version": 3 496 | }, 497 | "file_extension": ".py", 498 | "mimetype": "text/x-python", 499 | "name": "python", 500 | "nbconvert_exporter": "python", 501 | "pygments_lexer": "ipython3", 502 | "version": "3.5.2" 503 | } 504 | }, 505 | "nbformat": 4, 506 | "nbformat_minor": 2 507 | } 508 | -------------------------------------------------------------------------------- /one_million/Sense-test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pickle\n", 12 | "from collections import Counter" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "with open('../../dataset/ALL.gold.key.txt') as f:\n", 24 | " sense_key = f.readlines()\n", 25 | "sense_key = [x.strip() for x in sense_key] " 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "with open(\"../../dataset/sense/ALL-keys\",\"wb\") as f:\n", 37 | " pickle.dump(sense_key, f)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "4132\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "for i,s in enumerate(sense_key):\n", 64 | " if(s[:11] == 'semeval2007'):\n", 65 | " print(i)\n", 66 | " break" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "'art%1:09:00::'" 78 | ] 79 | }, 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": 
[ 86 | "sense_key[0][25:]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "dict_sense_keys = dict((sense_key[i][:24], []) if i<4132 else (sense_key[i][:26], []) for i in range(len(sense_key)))\n", 98 | "\n", 99 | "for i in range(4132):\n", 100 | " index = sense_key[i].find(\"%\")\n", 101 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][25:])\n", 102 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1])\n", 103 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+3:index+5])\n", 104 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:index+5])\n", 105 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:index+8])\n", 106 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:])\n", 107 | "\n", 108 | "for i in range(4132, len(sense_key)):\n", 109 | " index = sense_key[i].find(\"%\")\n", 110 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][27:])\n", 111 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1])\n", 112 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+3:index+5])\n", 113 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:index+5])\n", 114 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:index+8])\n", 115 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:])" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 10, 121 | "metadata": { 122 | "collapsed": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "with open(\"../../dataset/sense/dict_sense-keys_test\",\"wb\") as f:\n", 127 | " pickle.dump(dict_sense_keys, f)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 3, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "total_words = []\n", 139 | "for i in range(226036):\n", 140 | " index = sense_key[i].find(\"%\")\n", 141 | " total_words.append(sense_key[i][15:index])\n", 142 | "\n", 143 | "for i in range(226036, len(sense_key)):\n", 144 | " index = sense_key[i].find(\"%\")\n", 145 | " total_words.append(sense_key[i][24:index])\n", 146 | "\n", 147 | "total_words = Counter(total_words)\n", 148 | "word_count = total_words.most_common()\n", 149 | "vocab_words = [k for k,v in word_count]" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": { 156 | "scrolled": true 157 | }, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/plain": [ 162 | "20400" 163 | ] 164 | }, 165 | "execution_count": 7, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "with open(\"../../dataset/sense/vocab_sense-words\",\"wb\") as f:\n", 172 | " pickle.dump(vocab_words, f)\n", 173 | " \n", 174 | "len(vocab_words)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 4, 180 | "metadata": { 181 | "scrolled": true 182 | }, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "[('change', 3074),\n", 188 | " ('lead', 2987),\n", 189 | " ('design', 2938),\n", 190 | " ('open', 2922),\n", 191 | " ('study', 2920),\n", 192 | " ('set', 2909),\n", 193 | " ('call', 2906),\n", 194 | " ('point', 2855),\n", 195 | " ('bring', 2836),\n", 196 | " ('extend', 2832)]" 197 | ] 198 | }, 199 | "execution_count": 4, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 
| "word_count[20:30]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": { 212 | "collapsed": true 213 | }, 214 | "outputs": [], 215 | "source": [] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 9, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "34322" 226 | ] 227 | }, 228 | "execution_count": 9, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "total_word_senses = []\n", 235 | "\n", 236 | "for i in range(226036):\n", 237 | " total_word_senses.append(sense_key[i][15:])\n", 238 | "\n", 239 | "for i in range(226036, len(sense_key)):\n", 240 | " total_word_senses.append(sense_key[i][24:])\n", 241 | "\n", 242 | "total_word_senses = Counter(total_word_senses)\n", 243 | "word_senses_count = total_word_senses.most_common()\n", 244 | "vocab_word_senses = [k for k,v in word_senses_count]\n", 245 | "\n", 246 | "with open(\"../../dataset/sense/vocab_word-senses\",\"wb\") as f:\n", 247 | " pickle.dump(vocab_word_senses, f)\n", 248 | " \n", 249 | "len(vocab_word_senses)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 10, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "[('be%2:42:03::', 10582),\n", 261 | " ('person%1:03:00::', 7195),\n", 262 | " ('line%1:04:01::', 4968),\n", 263 | " ('see%2:31:00::', 4554),\n", 264 | " ('be%2:42:06::', 3423),\n", 265 | " ('keep%2:41:03::', 2283),\n", 266 | " ('little%3:00:03::', 2042),\n", 267 | " ('group%1:03:00::', 1826),\n", 268 | " ('say%2:32:00::', 1819),\n", 269 | " ('not%4:02:00::', 1703)]" 270 | ] 271 | }, 272 | "execution_count": 10, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "word_senses_count[:10]" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 11, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "dict_word_sense_keys = dict((w, []) for w in vocab_words)\n", 299 | "\n", 300 | "for v in vocab_word_senses:\n", 301 | " dict_word_sense_keys[v[:v.find(\"%\")]].append(v)\n", 302 | "\n", 303 | "with open(\"../../dataset/sense/dict_word-sense\",\"wb\") as f:\n", 304 | " pickle.dump(dict_word_sense_keys, f)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 12, 310 | "metadata": { 311 | "scrolled": true 312 | }, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/plain": [ 317 | "['open%2:35:00::',\n", 318 | " 'open%5:00:00:public:00',\n", 319 | " 'open%2:41:01::',\n", 320 | " 'open%2:30:00::',\n", 321 | " 'open%5:00:00:unrestricted:00',\n", 322 | " 'open%2:30:01::',\n", 323 | " 'open%2:35:06::',\n", 324 | " 'open%2:41:00::',\n", 325 | " 'open%3:00:01::',\n", 326 | " 'open%3:00:02::',\n", 327 | " 'open%5:00:00:unprotected:00',\n", 328 | " 'open%2:35:08::',\n", 329 | " 'open%2:33:00::',\n", 330 | " 'open%5:00:00:available:00',\n", 331 | " 'open%2:42:00::',\n", 332 | " 'open%5:00:00:coarse:00',\n", 333 | " 'open%5:00:00:unenclosed:00',\n", 334 | " 'open%5:00:00:vulnerable:00',\n", 335 | " 'open%3:00:04::',\n", 336 | " 'open%1:15:02::',\n", 337 | " 'open%5:00:00:unconstricted:00',\n", 338 | " 'open%3:00:08::',\n", 339 | " 'open%5:00:00:unsealed:01',\n", 340 | " 'open%5:00:00:unsettled:02',\n", 341 | " 
'open%1:15:01::']" 342 | ] 343 | }, 344 | "execution_count": 12, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "dict_word_sense_keys['open']" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "metadata": { 357 | "collapsed": true 358 | }, 359 | "outputs": [], 360 | "source": [] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 14, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "2468" 371 | ] 372 | }, 373 | "execution_count": 14, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "total_sense = []\n", 380 | "senses = []\n", 381 | "for i in range(len(sense_key)):\n", 382 | " index = sense_key[i].find(\"%\")\n", 383 | " total_sense.append(sense_key[i][index+1:])\n", 384 | "\n", 385 | "total_sense = Counter(total_sense)\n", 386 | "sense_count = total_sense.most_common()\n", 387 | "\n", 388 | "vocab_sense = [k for k,v in sense_count]\n", 389 | "\n", 390 | "with open(\"../../dataset/sense/vocab_sense\",\"wb\") as f:\n", 391 | " pickle.dump(vocab_sense, f)\n", 392 | " \n", 393 | "len(vocab_sense)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": { 400 | "collapsed": true 401 | }, 402 | "outputs": [], 403 | "source": [] 404 | } 405 | ], 406 | "metadata": { 407 | "kernelspec": { 408 | "display_name": "envs", 409 | "language": "python", 410 | "name": "cs771" 411 | }, 412 | "language_info": { 413 | "codemirror_mode": { 414 | "name": "ipython", 415 | "version": 3 416 | }, 417 | "file_extension": ".py", 418 | "mimetype": "text/x-python", 419 | "name": "python", 420 | "nbconvert_exporter": "python", 421 | "pygments_lexer": "ipython3", 422 | "version": "3.5.2" 423 | } 424 | }, 425 | "nbformat": 4, 426 | "nbformat_minor": 2 427 | } 428 | -------------------------------------------------------------------------------- /one_million/all-word-model: -------------------------------------------------------------------------------- 1 | all-word-model: 2 | 3 | 1: basic Val: F1 Score:65.5462 Accuracy:73.1659 Model-aw-1-multigpu-1 4 | 5 | 2: cnn with pos Val: F1 Score:72.33 Accuracy:77.93 POS: F1 Score:94.84 Accuracy:97.54 Model-aw-lex-1.4 6 | 7 | 3: local attention 44.361822318916865, 53.75801083454307, 82.19997565386215, 90.42074423342494 8 | Model-aw-lex-local_attention-fast-v2-4 9 | 10 | 4: local attention with hidden states Val: F1 Score:52.19 Accuracy:58.68 POS: F1 Score:85.66 Accuracy:92.72 Model-aw-lex-local_attention-fast-v2-6 11 | 12 | 5: gated local attention Val: F1 Score:44.17 Accuracy:53.07 POS: F1 Score:84.01 Accuracy:91.94 13 | Model-aw-lex-local_attention-fast-v2-7 14 | 15 | 6: local attention with crf Val: F1 Score:50.65 Accuracy:57.15 POS: F1 Score:87.84 Accuracy:93.70 16 | Model-aw-lex-local_attention-fast-v3-1 and Model-aw-lex-local_attention-fast-v4-1 17 | 18 | 7: soft hierarchical Model-aw-lex-hierarchical-2.ipynb 19 | Val: F1 Score:74.04 Accuracy:79.38 POS: F1 Score:96.34 Accuracy:98.21 Loss:0.8093 , Time: 1240.6 20 | 21 | 8: hard hierarchical Model-aw-lex-hierarchical-2.ipynb 22 | Val: F1 Score:70.35 Accuracy:77.30 POS: F1 Score:95.56 Accuracy:97.89 Loss:0.9279 , Time: 1195.1 23 | -------------------------------------------------------------------------------- /one_million/all-word/Model-aw-4-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 
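The scores in the all-word-model list above are macro-averaged F1 and plain accuracy over the sense-annotated tokens, plus the same pair for the auxiliary POS task where reported. The notebooks compute both with scikit-learn along these lines (a minimal sketch; the label ids below are dummies):

from sklearn.metrics import f1_score, accuracy_score

y_true = [0, 2, 1, 1, 0]   # dummy sense ids
y_pred = [0, 2, 0, 1, 0]
print(f1_score(y_true, y_pred, average='macro') * 100)  # macro F1, as in eval_score
print(accuracy_score(y_true, y_pred) * 100)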
| "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "tf.logging.set_verbosity(tf.logging.WARN)\n", 13 | "import pickle\n", 14 | "import numpy as np\n", 15 | "import os\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.metrics import f1_score\n", 18 | "from sklearn.metrics import accuracy_score\n", 19 | "import os\n", 20 | "from tensorflow.python.client import device_lib\n", 21 | "from collections import Counter\n", 22 | "import time" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "f = open('../../Glove/word_embedding_glove', 'rb')\n", 34 | "word_embedding = pickle.load(f)\n", 35 | "f.close()\n", 36 | "\n", 37 | "word_embedding = word_embedding[: len(word_embedding)-1]\n", 38 | "\n", 39 | "f = open('../../Glove/vocab_glove', 'rb')\n", 40 | "vocab = pickle.load(f)\n", 41 | "f.close()\n", 42 | "\n", 43 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n", 44 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n", 45 | "\n", 46 | "unknown_token = \"UNKNOWN_TOKEN\"\n", 47 | "\n", 48 | "# Model Description\n", 49 | "model_name = 'model-aw-4-1'\n", 50 | "model_dir = '../output/all-word/' + model_name\n", 51 | "save_dir = os.path.join(model_dir, \"save/\")\n", 52 | "log_dir = os.path.join(model_dir, \"log\")\n", 53 | "\n", 54 | "if not os.path.exists(model_dir):\n", 55 | " os.mkdir(model_dir)\n", 56 | "if not os.path.exists(save_dir):\n", 57 | " os.mkdir(save_dir)\n", 58 | "if not os.path.exists(log_dir):\n", 59 | " os.mkdir(log_dir)\n", 60 | "\n", 61 | "with open('/data/aviraj/dataset/train_val_data/all_word_sense2_sampled','rb') as f:\n", 62 | " train_data, val_data = pickle.load(f) \n", 63 | " \n", 64 | "\n", 65 | "# Parameters\n", 66 | "mode = 'train'\n", 67 | "num_senses = 272\n", 68 | "num_lex = 47\n", 69 | "num_pos = 12\n", 70 | "batch_size = 32\n", 71 | "vocab_size = len(vocab)\n", 72 | "unk_vocab_size = 1\n", 73 | "word_emb_size = len(word_embedding[0])\n", 74 | "max_sent_size = 200\n", 75 | "hidden_size = 512\n", 76 | "keep_prob = 0.4\n", 77 | "l2_lambda = 0.001\n", 78 | "init_lr = 0.01\n", 79 | "decay_steps = 5000\n", 80 | "decay_rate = 0.999\n", 81 | "clip_norm = 1\n", 82 | "clipping = True\n", 83 | "moving_avg_deacy = 0.999\n", 84 | "num_gpus = 6" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "def average_gradients(tower_grads):\n", 96 | " average_grads = []\n", 97 | " for grad_and_vars in zip(*tower_grads):\n", 98 | " # Note that each grad_and_vars looks like the following:\n", 99 | " # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))\n", 100 | " grads = []\n", 101 | " for g, _ in grad_and_vars:\n", 102 | " # Add 0 dimension to the gradients to represent the tower.\n", 103 | " expanded_g = tf.expand_dims(g, 0)\n", 104 | "\n", 105 | " # Append on a 'tower' dimension which we will average over below.\n", 106 | " grads.append(expanded_g)\n", 107 | "\n", 108 | " # Average over the 'tower' dimension.\n", 109 | " grad = tf.concat(grads, 0)\n", 110 | " grad = tf.reduce_mean(grad, 0)\n", 111 | "\n", 112 | " # Keep in mind that the Variables are redundant because they are shared\n", 113 | " # across towers. So .. 
we will just return the first tower's pointer to\n", 114 | " # the Variable.\n", 115 | " v = grad_and_vars[0][1]\n", 116 | " grad_and_var = (grad, v)\n", 117 | " average_grads.append(grad_and_var)\n", 118 | " return average_grads" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 4, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "# MODEL\n", 130 | "device_num = 0\n", 131 | "tower_grads = []\n", 132 | "losses = []\n", 133 | "predictions = []\n", 134 | "\n", 135 | "x = tf.placeholder('int32', [num_gpus, batch_size, max_sent_size], name=\"x\")\n", 136 | "y = tf.placeholder('int32', [num_gpus, batch_size, max_sent_size], name=\"y\")\n", 137 | "x_mask = tf.placeholder('bool', [num_gpus, batch_size, max_sent_size], name='x_mask') \n", 138 | "sense_mask = tf.placeholder('bool', [num_gpus, batch_size, max_sent_size], name='sense_mask')\n", 139 | "is_train = tf.placeholder('bool', [], name='is_train')\n", 140 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n", 141 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n", 142 | "\n", 143 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n", 144 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n", 145 | "summaries = []\n", 146 | "\n", 147 | "with tf.variable_scope(\"word_embedding\"):\n", 148 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n", 149 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n", 150 | "\n", 151 | "with tf.variable_scope(tf.get_variable_scope()):\n", 152 | " for gpu_idx in range(num_gpus):\n", 153 | " if gpu_idx>2:\n", 154 | " device_num = 1\n", 155 | " with tf.name_scope(\"model_{}\".format(gpu_idx)) as scope, tf.device('/gpu:%d' % device_num):\n", 156 | "\n", 157 | " if gpu_idx > 0:\n", 158 | " tf.get_variable_scope().reuse_variables()\n", 159 | "\n", 160 | " with tf.name_scope(\"word\"):\n", 161 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x[gpu_idx]) \n", 162 | "\n", 163 | " x_len = tf.reduce_sum(tf.cast(x_mask[gpu_idx], 'int32'), 1)\n", 164 | "\n", 165 | " with tf.variable_scope(\"lstm1\"):\n", 166 | " cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 167 | " cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 168 | "\n", 169 | " d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n", 170 | " d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n", 171 | "\n", 172 | " (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n", 173 | " h1 = tf.concat([fw_h1, bw_h1], 2)\n", 174 | "\n", 175 | " with tf.variable_scope(\"lstm2\"):\n", 176 | " cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 177 | " cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 178 | "\n", 179 | " d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n", 180 | " d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n", 181 | "\n", 182 | " (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', 
scope='lstm2')\n", 183 | " h = tf.concat([fw_h2, bw_h2], 2)\n", 184 | "\n", 185 | " def attention(input_x, input_mask, W_att):\n", 186 | " h_masked = tf.boolean_mask(input_x, input_mask)\n", 187 | " h_tanh = tf.tanh(h_masked)\n", 188 | " u = tf.matmul(h_tanh, W_att)\n", 189 | " a = tf.nn.softmax(u)\n", 190 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n", 191 | " return c\n", 192 | "\n", 193 | " with tf.variable_scope(\"attention\"):\n", 194 | " W_att = tf.get_variable(\"W_att\", shape=[2*hidden_size, 1], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*10))\n", 195 | " c = tf.expand_dims(attention(h[0], x_mask[gpu_idx][0], W_att), 0)\n", 196 | " for i in range(1, batch_size):\n", 197 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[gpu_idx][i], W_att), 0)], 0)\n", 198 | " \n", 199 | " cc = tf.expand_dims(c, 1)\n", 200 | " c_final = tf.tile(cc, [1, max_sent_size, 1])\n", 201 | " h_final = tf.concat([c_final, h],2)\n", 202 | " flat_h_final = tf.reshape(h_final, [-1, 4*hidden_size])\n", 203 | " \n", 204 | " with tf.variable_scope(\"hidden_layer\"):\n", 205 | " W = tf.get_variable(\"W\", shape=[4*hidden_size, 2*hidden_size], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*20))\n", 206 | " b = tf.get_variable(\"b\", shape=[2*hidden_size], initializer=tf.zeros_initializer())\n", 207 | " drop_flat_h_final = tf.nn.dropout(flat_h_final, input_keep_prob)\n", 208 | " flat_hl = tf.matmul(drop_flat_h_final, W) + b\n", 209 | " \n", 210 | " with tf.variable_scope(\"softmax_layer\"):\n", 211 | " W = tf.get_variable(\"W\", shape=[2*hidden_size, num_senses], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*20))\n", 212 | " b = tf.get_variable(\"b\", shape=[num_senses], initializer=tf.zeros_initializer())\n", 213 | " drop_flat_hl = tf.nn.dropout(flat_hl, input_keep_prob)\n", 214 | " flat_logits_sense = tf.matmul(drop_flat_hl, W) + b\n", 215 | " logits = tf.reshape(flat_logits_sense, [batch_size, max_sent_size, num_senses])\n", 216 | " predictions.append(tf.arg_max(logits, 2))\n", 217 | "\n", 218 | " float_sense_mask = tf.cast(sense_mask[gpu_idx], 'float')\n", 219 | "\n", 220 | " loss = tf.contrib.seq2seq.sequence_loss(logits, y[gpu_idx], float_sense_mask, name=\"loss\")\n", 221 | "\n", 222 | " l2_loss = l2_lambda * tf.losses.get_regularization_loss()\n", 223 | "\n", 224 | " total_loss = loss + l2_loss\n", 225 | "\n", 226 | " summaries.append(tf.summary.scalar(\"loss_{}\".format(gpu_idx), loss))\n", 227 | " summaries.append(tf.summary.scalar(\"total_loss_{}\".format(gpu_idx), total_loss))\n", 228 | "\n", 229 | "\n", 230 | " optimizer = tf.train.AdamOptimizer(learning_rate)\n", 231 | " grads_vars = optimizer.compute_gradients(total_loss)\n", 232 | "\n", 233 | " clipped_grads = grads_vars\n", 234 | " if(clipping == True):\n", 235 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n", 236 | "\n", 237 | " tower_grads.append(clipped_grads)\n", 238 | " losses.append(total_loss)\n", 239 | "\n", 240 | "tower_grads = average_gradients(tower_grads)\n", 241 | "losses = tf.add_n(losses)/len(losses)\n", 242 | "apply_grad_op = optimizer.apply_gradients(tower_grads, global_step=global_step)\n", 243 | "summaries.append(tf.summary.scalar('total_loss', losses))\n", 244 | "summaries.append(tf.summary.scalar('learning_rate', learning_rate))\n", 245 | "\n", 246 | "for var in tf.trainable_variables():\n", 247 | " summaries.append(tf.summary.histogram(var.op.name, var))\n", 248 | "\n", 
249 | "variable_averages = tf.train.ExponentialMovingAverage(moving_avg_deacy, global_step)\n", 250 | "variables_averages_op = variable_averages.apply(tf.trainable_variables())\n", 251 | "\n", 252 | "train_op = tf.group(apply_grad_op, variables_averages_op)\n", 253 | "saver = tf.train.Saver(tf.global_variables())\n", 254 | "summary = tf.summary.merge(summaries)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 5, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 266 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0,1\"\n", 267 | "# print (device_lib.list_local_devices())\n", 268 | "config = tf.ConfigProto()\n", 269 | "config.gpu_options.allow_growth = True\n", 270 | "config.allow_soft_placement = True\n", 271 | "sess = tf.Session(config=config)\n", 272 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n", 273 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 6, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "def model(xx, yy, mask, smask, train_cond=True):\n", 285 | " num_batches = int(len(xx)/(batch_size*num_gpus))\n", 286 | " _losses = 0\n", 287 | " temp_loss = 0\n", 288 | " preds_sense = []\n", 289 | " true_sense = []\n", 290 | " \n", 291 | " for j in range(num_batches): \n", 292 | " \n", 293 | " s = j * batch_size * num_gpus\n", 294 | " e = (j+1) * batch_size * num_gpus\n", 295 | " xx_re = xx[s:e].reshape([num_gpus, batch_size, -1])\n", 296 | " yy_re = yy[s:e].reshape([num_gpus, batch_size, -1])\n", 297 | " mask_re = mask[s:e].reshape([num_gpus, batch_size, -1])\n", 298 | " smask_re = smask[s:e].reshape([num_gpus, batch_size, -1])\n", 299 | " \n", 300 | " feed_dict = {x:xx_re, y:yy_re, x_mask:mask_re, sense_mask:smask_re, is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n", 301 | " \n", 302 | " if(train_cond==True):\n", 303 | " _, _loss, step, _summary = sess.run([train_op, losses, global_step, summary], feed_dict)\n", 304 | " summary_writer.add_summary(_summary, step)\n", 305 | " \n", 306 | " temp_loss += _loss\n", 307 | " if((j+1)%1000==0):\n", 308 | " print(\"Steps: {}\".format(step), \"Loss:{0:.4f}\".format(temp_loss/1000), \", Current Loss: {0:.4f}\".format(_loss))\n", 309 | " temp_loss = 0\n", 310 | " if((j+1)%5000==0):\n", 311 | " saver.save(sess, save_path=save_dir) \n", 312 | " \n", 313 | " else:\n", 314 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n", 315 | " for i in range(num_gpus):\n", 316 | " preds_sense.append(pred[i][smask_re[i]])\n", 317 | " true_sense.append(yy_re[i][smask_re[i]])\n", 318 | "\n", 319 | " _losses +=_loss\n", 320 | "\n", 321 | " if(train_cond==False): \n", 322 | " sense_preds = []\n", 323 | " sense_true = []\n", 324 | " \n", 325 | " for preds in preds_sense:\n", 326 | " for ps in preds: \n", 327 | " sense_preds.append(ps) \n", 328 | " for trues in true_sense:\n", 329 | " for ts in trues:\n", 330 | " sense_true.append(ts)\n", 331 | " \n", 332 | " return _losses/num_batches, sense_preds, sense_true\n", 333 | "\n", 334 | " return _losses/num_batches, step\n", 335 | "\n", 336 | "def eval_score(yy, pred):\n", 337 | " f1 = f1_score(yy, pred, average='macro')\n", 338 | " accu = accuracy_score(yy, pred)\n", 339 | " return f1*100, accu*100" 340 | ] 341 | }, 342 | { 343 | 
"cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "x_id_train = train_data['x']\n", 351 | "mask_train = train_data['x_mask']\n", 352 | "sense_mask_train = train_data['sense_mask']\n", 353 | "y_train = train_data['y']\n", 354 | "\n", 355 | "x_id_val = val_data['x']\n", 356 | "mask_val = val_data['x_mask']\n", 357 | "sense_mask_val = val_data['sense_mask']\n", 358 | "y_val = val_data['y']" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": { 365 | "scrolled": true 366 | }, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "Steps: 1000 Loss:15.7534 , Current Loss: 4.6380\n", 373 | "Steps: 2000 Loss:4.6967 , Current Loss: 4.6226\n", 374 | "Steps: 3000 Loss:4.7022 , Current Loss: 4.5907\n", 375 | "Steps: 4000 Loss:4.7315 , Current Loss: 4.6306\n", 376 | "Steps: 5000 Loss:4.8571 , Current Loss: 4.8387\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "num_epochs = 5\n", 382 | "log_period = 1\n", 383 | "\n", 384 | "for i in range(num_epochs):\n", 385 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n", 386 | " x_id_train = x_id_train[random]\n", 387 | " y_train = y_train[random]\n", 388 | " mask_train = mask_train[random] \n", 389 | " sense_mask_train = sense_mask_train[random]\n", 390 | " \n", 391 | " start_time = time.time()\n", 392 | " train_loss, step = model(x_id_train, y_train, mask_train, sense_mask_train)\n", 393 | " time_taken = time.time() - start_time\n", 394 | " print(\"Epoch: {}\".format(i+1),\", Step: {}\".format(step), \", loss: {0:.4f}\".format(train_loss), \", Time: {0:.1f}\".format(time_taken))\n", 395 | " saver.save(sess, save_path=save_dir) \n", 396 | " print(\"Model Saved\")\n", 397 | " \n", 398 | " if((i+1)%log_period==0):\n", 399 | " start_time = time.time()\n", 400 | " val_loss, val_pred, val_true = model(x_id_val, y_val, mask_val, sense_mask_val, train_cond=False) \n", 401 | " f1_, accu_ = eval_score(val_true, val_pred)\n", 402 | " time_taken = time.time() - start_time\n", 403 | " print(\"Val: F1 Score:{0:.2f}\".format(f1_), \"Accuracy:{0:.2f}\".format(accu_), \"Loss:{0:.4f}\".format(val_loss), \", Time: {0:.1f}\".format(time_taken))" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 1, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "start_time = time.time()\n", 413 | "train_loss, train_pred, train_true = model(x_id_train, y_train, mask_train, sense_mask_train, train_cond=False) \n", 414 | "f1_, accu_ = etrain_score(train_true, train_pred)\n", 415 | "time_taken = time.time() - start_time\n", 416 | "print(\"train: F1 Score:{0:.2f}\".format(f1_), \"Accuracy:{0:.2f}\".format(accu_), \"Loss:{0:.4f}\".format(train_loss), \", Time: {0:.1f}\".format(time_taken))" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 8, 431 | "metadata": { 432 | "collapsed": true 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "saver.restore(sess, save_dir)" 437 | ] 438 | } 439 | ], 440 | "metadata": { 441 | "kernelspec": { 442 | "display_name": "cs771", 443 | "language": "python", 444 | "name": "cs771" 445 | }, 446 | "language_info": { 447 | "codemirror_mode": { 448 | "name": "ipython", 449 | "version": 3 450 | }, 451 | 
"file_extension": ".py", 452 | "mimetype": "text/x-python", 453 | "name": "python", 454 | "nbconvert_exporter": "python", 455 | "pygments_lexer": "ipython3", 456 | "version": "3.5.2" 457 | } 458 | }, 459 | "nbformat": 4, 460 | "nbformat_minor": 2 461 | } 462 | -------------------------------------------------------------------------------- /one_million/all-word/Readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Model-aw-lex-1 4 | Convolution over hidden states of lstms -------------------------------------------------------------------------------- /one_million/make/Make-Model-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "tf.logging.set_verbosity(tf.logging.WARN)\n", 13 | "import pickle\n", 14 | "import numpy as np\n", 15 | "import os\n", 16 | "from sklearn.model_selection import train_test_split\n", 17 | "from sklearn.metrics import f1_score\n", 18 | "from sklearn.metrics import accuracy_score\n", 19 | "import os\n", 20 | "from tensorflow.python.client import device_lib\n", 21 | "from collections import Counter" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "f = open('../../Glove/word_embedding_glove', 'rb')\n", 33 | "word_embedding = pickle.load(f)\n", 34 | "f.close()\n", 35 | "word_embedding = word_embedding[: len(word_embedding)-1]\n", 36 | "\n", 37 | "f = open('../../Glove/vocab_glove', 'rb')\n", 38 | "vocab = pickle.load(f)\n", 39 | "f.close()\n", 40 | "\n", 41 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n", 42 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n", 43 | "\n", 44 | "unknown_token = \"UNKNOWN_TOKEN\"" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "f = open(\"../../../dataset/sense/dict_sense-keys\", 'rb')\n", 56 | "dict_sense_keys = pickle.load(f)\n", 57 | "f.close()\n", 58 | "\n", 59 | "f = open(\"../../../dataset/sense/dict_word-sense\", 'rb')\n", 60 | "dict_word_sense = pickle.load(f)\n", 61 | "f.close()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "# Model Description\n", 73 | "sense_word = 'make'\n", 74 | "model_name = 'model-1'\n", 75 | "sense_word_dir = '../output/' + sense_word\n", 76 | "model_dir = sense_word_dir + '/' + model_name\n", 77 | "save_dir = os.path.join(model_dir, \"save/\")\n", 78 | "log_dir = os.path.join(model_dir, \"log\")\n", 79 | "\n", 80 | "if not os.path.exists(sense_word_dir):\n", 81 | " os.mkdir(sense_word_dir)\n", 82 | "if not os.path.exists(model_dir):\n", 83 | " os.mkdir(model_dir)\n", 84 | "if not os.path.exists(save_dir):\n", 85 | " os.mkdir(save_dir)\n", 86 | "if not os.path.exists(log_dir):\n", 87 | " os.mkdir(log_dir)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 8, 93 | "metadata": { 94 | "scrolled": false 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "[('36', 2006),\n", 101 | " ('30', 1025),\n", 102 | " ('42', 968),\n", 103 | " ('41', 962),\n", 104 | " ('31', 617),\n", 105 | " ('32', 543),\n", 106 | " ('38', 
445),\n", 107 | " ('40', 20),\n", 108 | " ('29', 6),\n", 109 | " ('09', 1)]" 110 | ] 111 | }, 112 | "execution_count": 8, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "f = open(\"../../../dataset/checkwords/\"+ sense_word + \"_data\", 'rb')\n", 119 | "data = pickle.load(f)\n", 120 | "f.close()\n", 121 | "\n", 122 | "data_y = []\n", 123 | "for i in range(len(data)):\n", 124 | " data_y.append(dict_sense_keys[data[i][0]][2])\n", 125 | "\n", 126 | "sense_count = Counter(data_y)\n", 127 | "sense_count = sense_count.most_common()\n", 128 | "vocab_sense = [k for k,v in sense_count[:7]]\n", 129 | "sense_count" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 9, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "['36', '30', '42', '41', '31', '32', '38']" 141 | ] 142 | }, 143 | "execution_count": 9, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "vocab_sense" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 10, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "6566 6593\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "data_x = []\n", 167 | "data_label = []\n", 168 | "for i in range(len(data)):\n", 169 | " if dict_sense_keys[data[i][0]][2] in vocab_sense:\n", 170 | " data_x.append(data[i][1])\n", 171 | " data_label.append(dict_sense_keys[data[i][0]][2])\n", 172 | "\n", 173 | "print(len(data_label), len(data_y))\n", 174 | "\n", 175 | "# vocab_sense = dict_word_sense[sense_word]\n", 176 | "\n", 177 | "sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n", 178 | "id2sense = dict((i, s) for i,s in enumerate(vocab))" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 11, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# Parameters\n", 190 | "mode = 'train'\n", 191 | "num_senses = len(vocab_sense)\n", 192 | "batch_size = 64\n", 193 | "vocab_size = len(vocab)\n", 194 | "unk_vocab_size = 1\n", 195 | "word_emb_size = len(word_embedding[0])\n", 196 | "max_sent_size = 300\n", 197 | "hidden_size = 100\n", 198 | "keep_prob = 0.5\n", 199 | "l2_lambda = 0.001\n", 200 | "init_lr = 0.01\n", 201 | "decay_steps = 500\n", 202 | "decay_rate = 0.96\n", 203 | "clip_norm = 1\n", 204 | "clipping = True" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": { 211 | "collapsed": true, 212 | "scrolled": true 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "# MODEL\n", 217 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n", 218 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n", 219 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n", 220 | "is_train = tf.placeholder('bool', [], name='is_train')\n", 221 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n", 222 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n", 223 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n", 224 | "\n", 225 | "with tf.name_scope(\"word_embedding\"):\n", 226 | " if mode == 'train':\n", 227 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n", 228 | " 
else:\n", 229 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n", 230 | " \n", 231 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n", 232 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n", 233 | "\n", 234 | "with tf.variable_scope(\"lstm1\"):\n", 235 | " cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 236 | " cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 237 | "\n", 238 | " d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n", 239 | " d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n", 240 | " \n", 241 | " (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n", 242 | " h1 = tf.concat([fw_h1, bw_h1], 2)\n", 243 | " \n", 244 | "with tf.variable_scope(\"lstm2\"):\n", 245 | " cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 246 | " cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n", 247 | "\n", 248 | " d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n", 249 | " d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n", 250 | " \n", 251 | " (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n", 252 | " h = tf.concat([fw_h2, bw_h2], 2)\n", 253 | "\n", 254 | "def attention(input_x, input_mask, W_att):\n", 255 | " h_masked = tf.boolean_mask(input_x, input_mask)\n", 256 | " h_tanh = tf.tanh(h_masked)\n", 257 | " u = tf.matmul(h_tanh, W_att)\n", 258 | " a = tf.nn.softmax(u)\n", 259 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n", 260 | " return c\n", 261 | "\n", 262 | "with tf.variable_scope(\"attention\"):\n", 263 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n", 264 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n", 265 | " for i in range(1, batch_size):\n", 266 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n", 267 | " \n", 268 | "with tf.variable_scope(\"softmax_layer\"):\n", 269 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n", 270 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n", 271 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n", 272 | " logits = tf.matmul(drop_c, W) + b\n", 273 | " predictions = tf.argmax(logits, 1)\n", 274 | " \n", 275 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n", 276 | "\n", 277 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n", 278 | "\n", 279 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n", 280 | "\n", 281 | "tv_all = tf.trainable_variables()\n", 282 | "tv_regu =[]\n", 283 | "for t in tv_all:\n", 284 | " if t.name.find('b:')==-1:\n", 285 | " tv_regu.append(t)\n", 286 | " \n", 287 | "# l2 Loss\n", 288 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n", 289 | "\n", 290 | "total_loss = loss + l2_loss\n", 291 | "\n", 292 | "# Optimizer for loss\n", 293 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n", 294 | "\n", 295 | "# Gradients and Variables for Loss\n", 296 | "grads_vars = 
optimizer.compute_gradients(total_loss)\n", 297 | "\n", 298 | "# Clipping of Gradients\n", 299 | "clipped_grads = grads_vars\n", 300 | "if(clipping == True):\n", 301 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n", 302 | "\n", 303 | "# Training Optimizer for Total Loss\n", 304 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n", 305 | "\n", 306 | "# Summaries\n", 307 | "var_summaries = []\n", 308 | "for v in tv_all:\n", 309 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n", 310 | " var_summaries.append(var_summary)\n", 311 | "\n", 312 | "var_summaries_merged = tf.summary.merge(var_summaries)\n", 313 | "\n", 314 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n", 315 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n", 316 | "summary = tf.summary.merge_all()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 13, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n", 328 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n", 329 | "config = tf.ConfigProto()\n", 330 | "config.gpu_options.allow_growth = True\n", 331 | "sess = tf.Session(config=config)\n", 332 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n", 333 | "saver = tf.train.Saver() # For Saving the model\n", 334 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 14, 340 | "metadata": { 341 | "collapsed": true 342 | }, 343 | "outputs": [], 344 | "source": [ 345 | "index = []\n", 346 | "for i in range(len(data_x)):\n", 347 | " index.append(i)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 15, 353 | "metadata": { 354 | "scrolled": true 355 | }, 356 | "outputs": [ 357 | { 358 | "name": "stderr", 359 | "output_type": "stream", 360 | "text": [ 361 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n", 362 | " FutureWarning)\n" 363 | ] 364 | } 365 | ], 366 | "source": [ 367 | "index_train, index_val, label_train, label_val = train_test_split(index, data_label, train_size=0.8, shuffle=True, stratify=data_label, random_state=0)\n", 368 | "\n", 369 | "data_x = np.array(data_x)\n", 370 | "\n", 371 | "x_train = data_x[index_train]\n", 372 | "x_val = data_x[index_val]" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 16, 378 | "metadata": { 379 | "collapsed": true 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "def data_prepare(x, y):\n", 384 | " num_examples = len(x)\n", 385 | "\n", 386 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n", 387 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n", 388 | " yy = np.zeros([num_examples], dtype=int)\n", 389 | "\n", 390 | " for j in range(num_examples):\n", 391 | " for i in range(max_sent_size):\n", 392 | " if(i>=len(x[j])):\n", 393 | " break\n", 394 | " w = x[j][i]\n", 395 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n", 396 | " xx_mask[j][i] = True\n", 397 | " yy[j] = sense2id[y[j]]\n", 398 | " return xx, xx_mask, yy\n", 399 | "\n", 400 | "def eval_score(yy, pred):\n", 401 | " num_batches = int(len(yy)/batch_size)\n", 402 | " f1 = 
f1_score(yy[:batch_size*num_batches], pred, average='macro')\n", 403 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n", 404 | " return f1*100, accu*100\n", 405 | "\n", 406 | "def model(xx, yy, mask, train_cond=True):\n", 407 | " num_batches = int(len(xx)/batch_size)\n", 408 | " losses = 0\n", 409 | " preds = []\n", 410 | " for j in range(num_batches): \n", 411 | " \n", 412 | " s = j * batch_size\n", 413 | " e = (j+1) * batch_size\n", 414 | " \n", 415 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n", 416 | " \n", 417 | " \n", 418 | " if(train_cond==True):\n", 419 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n", 420 | " summary_writer.add_summary(_summary, step) \n", 421 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n", 422 | "\n", 423 | " else:\n", 424 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n", 425 | " preds.append(pred)\n", 426 | " \n", 427 | " losses +=_loss\n", 428 | "\n", 429 | " if(train_cond==False):\n", 430 | " y_pred = []\n", 431 | " for i in range(num_batches):\n", 432 | " for pred in preds[i]:\n", 433 | " y_pred.append(pred)\n", 434 | " return losses/num_batches, y_pred\n", 435 | " \n", 436 | " return losses/num_batches, step" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": true 444 | }, 445 | "outputs": [], 446 | "source": [ 447 | "x_id_train, mask_train, y_train = data_prepare(x_train, label_train)\n", 448 | "x_id_val, mask_val, y_val = data_prepare(x_val, label_val)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "scrolled": true 456 | }, 457 | "outputs": [ 458 | { 459 | "name": "stdout", 460 | "output_type": "stream", 461 | "text": [ 462 | "Epoch: 1 Step: 82 loss: 7.29599668631\n", 463 | "Epoch: 2 Step: 164 loss: 2.07766101418\n", 464 | "Epoch: 3 Step: 246 loss: 1.99490781528\n", 465 | "Epoch: 4 Step: 328 loss: 1.97611695673\n", 466 | "Epoch: 5 Step: 410 loss: 1.97086549387\n", 467 | "Model Saved\n" 468 | ] 469 | }, 470 | { 471 | "name": "stderr", 472 | "output_type": "stream", 473 | "text": [ 474 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n", 475 | " 'precision', 'predicted', average, warn_for)\n" 476 | ] 477 | }, 478 | { 479 | "name": "stdout", 480 | "output_type": "stream", 481 | "text": [ 482 | "Train: F1 Score: 6.69154280711 Accuracy: 30.5830792683 Loss: 1.95732803025\n", 483 | "Val: F1 Score: 6.72469704728 Accuracy: 30.78125 Loss: 1.95580910444\n", 484 | "Epoch: 6 Step: 492 loss: 1.98001657899\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "num_epochs = 60\n", 490 | "log_period = 5\n", 491 | "\n", 492 | "for i in range(num_epochs):\n", 493 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n", 494 | " x_id_train = x_id_train[random]\n", 495 | " y_train = y_train[random]\n", 496 | " mask_train = mask_train[random] \n", 497 | " \n", 498 | " losses, step = model(x_id_train, y_train, mask_train)\n", 499 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n", 500 | " \n", 501 | " if((i+1)%log_period==0):\n", 502 | " saver.save(sess, save_path=save_dir) \n", 503 | " print(\"Model Saved\")\n", 504 | " train_loss, train_pred = model(x_id_train, 
y_train, mask_train, train_cond=False)\n", 505 | " f1_, accu_ = eval_score(y_train, train_pred)\n", 506 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n", 507 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n", 508 | " f1_, accu_ = eval_score(y_val, val_pred)\n", 509 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n", 510 | " \n", 511 | "# test_loss, test_pred, test_pred_pos, test_true_pos = model(x_id_test, y_test, mask_test, pos_id_test, train_cond=False) \n", 512 | "# f1_, accu_, f1_pos_, accu_pos_ = etest_score(y_test, test_pred, test_pred_pos, test_true_pos)\n", 513 | "# print(\"test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"POS F1 Score: \", f1_pos_, \"POS Accuracy: \", accu_pos_, \"Loss: \", test_loss)" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": { 520 | "collapsed": true 521 | }, 522 | "outputs": [], 523 | "source": [] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": { 529 | "collapsed": true 530 | }, 531 | "outputs": [], 532 | "source": [ 533 | "saver.restore(sess, save_dir)" 534 | ] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | "display_name": "cs771", 540 | "language": "python", 541 | "name": "cs771" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": { 545 | "name": "ipython", 546 | "version": 3 547 | }, 548 | "file_extension": ".py", 549 | "mimetype": "text/x-python", 550 | "name": "python", 551 | "nbconvert_exporter": "python", 552 | "pygments_lexer": "ipython3", 553 | "version": "3.5.2" 554 | } 555 | }, 556 | "nbformat": 4, 557 | "nbformat_minor": 2 558 | } 559 | -------------------------------------------------------------------------------- /one_million/one_million_parsing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import xml.etree.ElementTree as ET\n", 12 | "import numpy as np\n", 13 | "tree = ET.parse('semcor+omsti.data.xml')\n", 14 | "root = tree.getroot()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "cor1 = root[0]\n", 26 | "cor2 = root[1]\n", 27 | "#sent = cor2.findall('text')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def isalphabet(word):\n", 37 | " list_ = list(word)\n", 38 | " if len(list_) > 1:\n", 39 | " return True\n", 40 | " else:\n", 41 | " if word.isalpha():\n", 42 | " return True\n", 43 | " return False" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 50, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "37176\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "train1 = []\n", 61 | "#soup=soup.find_all('corpus')[1]\n", 62 | "count=0\n", 63 | "for sentences in cor1.findall('text'):\n", 64 | " for sentence in sentences:\n", 65 | " temp_sent = []\n", 66 | " temp_sent.append(sentence.get('id'))\n", 67 | "\n", 68 | " temp_words = []\n", 69 | " i_cnt=1\n", 70 | " ind=[]\n", 71 | " for word in sentence:\n", 72 | " string = word.text.lower() \n", 73 | " if (isalphabet(string)):\n", 74 | " temp_words.append(string)\n", 75 | " 
ind.append(i_cnt)\n", 76 | " i_cnt+=1\n", 77 | "\n", 78 | " temp_sent.append(temp_words)\n", 79 | " list_ = sentence.iter()\n", 80 | " id_list = []\n", 81 | " lemma_list = []\n", 82 | " pos_list = []\n", 83 | " for i in list_:\n", 84 | " id_list.append(i.get('id'))\n", 85 | " lemma_list.append(i.get('lemma'))\n", 86 | " pos_list.append(i.get('pos'))\n", 87 | "\n", 88 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n", 89 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n", 90 | " train1.append(temp_sent)\n", 91 | " count+=1\n", 92 | " \n", 93 | "print(count)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 51, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "train1=train1[:len(train1)-2]" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 19, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "813798\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "train2 = []\n", 120 | "#soup=soup.find_all('corpus')[1]\n", 121 | "count=0\n", 122 | "for sentences in cor2.findall('text'):\n", 123 | " for sentence in sentences:\n", 124 | " temp_sent = []\n", 125 | " temp_sent.append(sentence.get('id'))\n", 126 | "\n", 127 | " temp_words = []\n", 128 | " i_cnt=1\n", 129 | " ind=[]\n", 130 | " for word in sentence:\n", 131 | " string = word.text.lower() \n", 132 | " if (isalphabet(string)):\n", 133 | " temp_words.append(string)\n", 134 | " ind.append(i_cnt)\n", 135 | " i_cnt+=1\n", 136 | "\n", 137 | " temp_sent.append(temp_words)\n", 138 | " list_ = sentence.iter()\n", 139 | " id_list = []\n", 140 | " lemma_list = []\n", 141 | " pos_list = []\n", 142 | " for i in list_:\n", 143 | " id_list.append(i.get('id'))\n", 144 | " lemma_list.append(i.get('lemma'))\n", 145 | " pos_list.append(i.get('pos'))\n", 146 | "\n", 147 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n", 148 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n", 149 | " train2.append(temp_sent)\n", 150 | " count+=1\n", 151 | " \n", 152 | "print(count)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 54, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "850972" 164 | ] 165 | }, 166 | "execution_count": 54, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "len(train1)+len(train2)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 55, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "train=train1+train2" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 56, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "850972" 193 | ] 194 | }, 195 | "execution_count": 56, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "len(train)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 57, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "import pickle\n", 213 | "with open('preprocess_train','wb') as f:\n", 214 | " pickle.dump(train,f)" 215 | ] 216 | } 217 | ], 218 | "metadata": { 219 | "kernelspec": { 220 | "display_name": "cs771", 221 | "language": "python", 222 | "name": "cs771" 223 | }, 224 | 
"language_info": { 225 | "codemirror_mode": { 226 | "name": "ipython", 227 | "version": 3 228 | }, 229 | "file_extension": ".py", 230 | "mimetype": "text/x-python", 231 | "name": "python", 232 | "nbconvert_exporter": "python", 233 | "pygments_lexer": "ipython3", 234 | "version": "3.5.2" 235 | } 236 | }, 237 | "nbformat": 4, 238 | "nbformat_minor": 2 239 | } 240 | -------------------------------------------------------------------------------- /one_million/one_word_data_maker-test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pickle\n", 12 | "from nltk.corpus import wordnet as wn" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "with open('/data/aviraj/dataset/raw_preprocess_test','rb') as f:\n", 24 | " global_data=pickle.load(f)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "with open('/data/aviraj/dataset/ALL.gold.key.txt','r') as f:\n", 36 | " data_key=f.readlines()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 9, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "['the',\n", 48 | " 'art',\n", 49 | " 'of',\n", 50 | " 'change_ringing',\n", 51 | " 'be',\n", 52 | " 'peculiar',\n", 53 | " 'to',\n", 54 | " 'the',\n", 55 | " 'english',\n", 56 | " ',',\n", 57 | " 'and',\n", 58 | " ',',\n", 59 | " 'like',\n", 60 | " 'most',\n", 61 | " 'english',\n", 62 | " 'peculiarity',\n", 63 | " ',',\n", 64 | " 'unintelligible',\n", 65 | " 'to',\n", 66 | " 'the',\n", 67 | " 'rest',\n", 68 | " 'of',\n", 69 | " 'the',\n", 70 | " 'world',\n", 71 | " '.']" 72 | ] 73 | }, 74 | "execution_count": 9, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "global_data[0][3]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "def make_word_data(checkword):\n", 92 | " \n", 93 | " dataset_line=[]\n", 94 | " for i,list_ in enumerate(global_data): \n", 95 | " ind=[idx for idx,it in enumerate(list_[3]) if it==checkword]\n", 96 | " for ii in ind:\n", 97 | " if list_[2][ii] is not None:\n", 98 | " dataset_line.append([list_[2][ii],list_[1],list_[4]])\n", 99 | " \n", 100 | " print(len(dataset_line))\n", 101 | " with open('/data/aviraj/dataset/checkwords/'+checkword + '_data_test', 'wb') as f:\n", 102 | " pickle.dump(dataset_line, f)\n", 103 | " with open('/data/aviraj/dataset/checkwords/'+checkword + '_data_test', 'rb') as f:\n", 104 | " data_ = pickle.load(f)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "test_words = ['force', 'make', 'open', 'place', 'point', 'serve', 'support']" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "1\n", 128 | "31\n", 129 | "4\n", 130 | "5\n", 131 | "11\n", 132 | "2\n", 133 | "12\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "for word in 
test_words:\n", 139 | " make_word_data(word)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "with open('../Glove/vocab_glove', 'rb') as f:\n", 151 | " vocab = pickle.load(f)\n" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": true 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "train_words = []\n", 163 | "for sent in global_data:\n", 164 | " train_words.extend(sent[1])" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "len(train_words), len(set(train_words)), len(vocab)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": true 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "import collections\n", 185 | "unknown_words = []\n", 186 | "for word in set(train_words):\n", 187 | " if word not in vocab:\n", 188 | " unknown_words.append(word)\n", 189 | " \n", 190 | "un_counter = collections.Counter(unknown_words)\n", 191 | "un_counter = dict(un_counter)\n", 192 | "\n", 193 | "sorted_un_counter = sorted(un_counter.items(), key=lambda x:x[1], reverse=True)\n", 194 | "sorted_un_counter" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "with open('million_unknown_words.pickle', 'wb') as f:\n", 206 | " pickle.dump(unknown_words, f)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "len(sorted(global_data, key=lambda x:len(x[1]), reverse=True)[0][1])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "envs", 231 | "language": "python", 232 | "name": "cs771" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.5.2" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 2 249 | } 250 | -------------------------------------------------------------------------------- /one_million/one_word_data_maker.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pickle\n", 12 | "from nltk.corpus import wordnet as wn" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n", 24 | " global_data=pickle.load(f)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "with open('/data/aviraj/dataset/semcor+omsti.gold.key.txt','r') as f:\n", 36 | " 
data_key=f.readlines()" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "def make_word_data(checkword):\n", 48 | " \n", 49 | " dataset_line=[]\n", 50 | " for i,list_ in enumerate(global_data): \n", 51 | " ind=[idx for idx,it in enumerate(list_[3]) if it==checkword]\n", 52 | " for ii in ind:\n", 53 | " if list_[2][ii] is not None:\n", 54 | " dataset_line.append([list_[2][ii],list_[1],list_[4]])\n", 55 | " \n", 56 | " print(len(dataset_line))\n", 57 | " with open('/data/aviraj/dataset/checkwords/'+checkword + '_data', 'wb') as f:\n", 58 | " pickle.dump(dataset_line, f)\n", 59 | " with open('/data/aviraj/dataset/checkwords/'+checkword + '_data', 'rb') as f:\n", 60 | " data_ = pickle.load(f)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "test_words = ['force', 'make', 'open', 'place', 'point', 'serve', 'support']" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "3723\n", 93 | "6593\n", 94 | "2922\n", 95 | "3569\n", 96 | "2855\n", 97 | "3462\n", 98 | "3489\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "for word in test_words:\n", 104 | " make_word_data(word)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "collapsed": true 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "with open('../Glove/vocab_glove', 'rb') as f:\n", 116 | " vocab = pickle.load(f)\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "train_words = []\n", 128 | "for sent in global_data:\n", 129 | " train_words.extend(sent[1])" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "len(train_words), len(set(train_words)), len(vocab)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "import collections\n", 150 | "unknown_words = []\n", 151 | "for word in set(train_words):\n", 152 | " if word not in vocab:\n", 153 | " unknown_words.append(word)\n", 154 | " \n", 155 | "un_counter = collections.Counter(unknown_words)\n", 156 | "un_counter = dict(un_counter)\n", 157 | "\n", 158 | "sorted_un_counter = sorted(un_counter.items(), key=lambda x:x[1], reverse=True)\n", 159 | "sorted_un_counter" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "with open('million_unknown_words.pickle', 'wb') as f:\n", 171 | " pickle.dump(unknown_words, f)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "len(sorted(global_data, key=lambda x:len(x[1]), reverse=True)[0][1])" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | 
"metadata": { 187 | "collapsed": true 188 | }, 189 | "outputs": [], 190 | "source": [] 191 | } 192 | ], 193 | "metadata": { 194 | "kernelspec": { 195 | "display_name": "envs", 196 | "language": "python", 197 | "name": "cs771" 198 | }, 199 | "language_info": { 200 | "codemirror_mode": { 201 | "name": "ipython", 202 | "version": 3 203 | }, 204 | "file_extension": ".py", 205 | "mimetype": "text/x-python", 206 | "name": "python", 207 | "nbconvert_exporter": "python", 208 | "pygments_lexer": "ipython3", 209 | "version": "3.5.2" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 2 214 | } 215 | -------------------------------------------------------------------------------- /one_million/raw_one_million_parsing-test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import xml.etree.ElementTree as ET\n", 10 | "import numpy as np\n", 11 | "tree = ET.parse('../../dataset/ALL.data.xml')\n", 12 | "root = tree.getroot()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "root" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "def isalphabet(word):\n", 44 | " return True" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "1173\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "train1 = []\n", 62 | "count=0\n", 63 | "for sentences in root.findall('text'):\n", 64 | " for sentence in sentences:\n", 65 | " temp_sent = []\n", 66 | " temp_sent.append(sentence.get('id'))\n", 67 | "\n", 68 | " temp_words = []\n", 69 | " i_cnt=1\n", 70 | " ind=[]\n", 71 | " for word in sentence:\n", 72 | " string = word.text.lower() \n", 73 | " if (isalphabet(string)):\n", 74 | " temp_words.append(string)\n", 75 | " ind.append(i_cnt)\n", 76 | " i_cnt+=1\n", 77 | "\n", 78 | " temp_sent.append(temp_words)\n", 79 | " list_ = sentence.iter()\n", 80 | " id_list = []\n", 81 | " lemma_list = []\n", 82 | " pos_list = []\n", 83 | " for i in list_:\n", 84 | " id_list.append(i.get('id'))\n", 85 | " lemma_list.append(i.get('lemma'))\n", 86 | " pos_list.append(i.get('pos'))\n", 87 | "\n", 88 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n", 89 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n", 90 | " train1.append(temp_sent)\n", 91 | " count+=1\n", 92 | " \n", 93 | "print(count)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "1173" 105 | ] 106 | }, 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "len(train1)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 10, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "import pickle\n", 125 | "with open('/data/aviraj/dataset/raw_preprocess_test','wb') as 
f:\n", 126 | " pickle.dump(train1,f)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 12, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "['senseval2.d000.s000',\n", 138 | " ['the',\n", 139 | " 'art',\n", 140 | " 'of',\n", 141 | " 'change-ringing',\n", 142 | " 'is',\n", 143 | " 'peculiar',\n", 144 | " 'to',\n", 145 | " 'the',\n", 146 | " 'english',\n", 147 | " ',',\n", 148 | " 'and',\n", 149 | " ',',\n", 150 | " 'like',\n", 151 | " 'most',\n", 152 | " 'english',\n", 153 | " 'peculiarities',\n", 154 | " ',',\n", 155 | " 'unintelligible',\n", 156 | " 'to',\n", 157 | " 'the',\n", 158 | " 'rest',\n", 159 | " 'of',\n", 160 | " 'the',\n", 161 | " 'world',\n", 162 | " '.'],\n", 163 | " [None,\n", 164 | " 'senseval2.d000.s000.t000',\n", 165 | " None,\n", 166 | " 'senseval2.d000.s000.t001',\n", 167 | " None,\n", 168 | " 'senseval2.d000.s000.t002',\n", 169 | " None,\n", 170 | " None,\n", 171 | " 'senseval2.d000.s000.t003',\n", 172 | " None,\n", 173 | " None,\n", 174 | " None,\n", 175 | " None,\n", 176 | " 'senseval2.d000.s000.t004',\n", 177 | " 'senseval2.d000.s000.t005',\n", 178 | " 'senseval2.d000.s000.t006',\n", 179 | " None,\n", 180 | " 'senseval2.d000.s000.t007',\n", 181 | " None,\n", 182 | " None,\n", 183 | " 'senseval2.d000.s000.t008',\n", 184 | " None,\n", 185 | " None,\n", 186 | " 'senseval2.d000.s000.t009',\n", 187 | " None],\n", 188 | " ['the',\n", 189 | " 'art',\n", 190 | " 'of',\n", 191 | " 'change_ringing',\n", 192 | " 'be',\n", 193 | " 'peculiar',\n", 194 | " 'to',\n", 195 | " 'the',\n", 196 | " 'english',\n", 197 | " ',',\n", 198 | " 'and',\n", 199 | " ',',\n", 200 | " 'like',\n", 201 | " 'most',\n", 202 | " 'english',\n", 203 | " 'peculiarity',\n", 204 | " ',',\n", 205 | " 'unintelligible',\n", 206 | " 'to',\n", 207 | " 'the',\n", 208 | " 'rest',\n", 209 | " 'of',\n", 210 | " 'the',\n", 211 | " 'world',\n", 212 | " '.'],\n", 213 | " ['DET',\n", 214 | " 'NOUN',\n", 215 | " 'ADP',\n", 216 | " 'NOUN',\n", 217 | " 'VERB',\n", 218 | " 'ADJ',\n", 219 | " 'PRT',\n", 220 | " 'DET',\n", 221 | " 'NOUN',\n", 222 | " '.',\n", 223 | " 'CONJ',\n", 224 | " '.',\n", 225 | " 'ADP',\n", 226 | " 'ADJ',\n", 227 | " 'ADJ',\n", 228 | " 'NOUN',\n", 229 | " '.',\n", 230 | " 'ADJ',\n", 231 | " 'PRT',\n", 232 | " 'DET',\n", 233 | " 'NOUN',\n", 234 | " 'ADP',\n", 235 | " 'DET',\n", 236 | " 'NOUN',\n", 237 | " '.']]" 238 | ] 239 | }, 240 | "execution_count": 12, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "train1[0]" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "envs", 260 | "language": "python", 261 | "name": "cs771" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.5.2" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 2 278 | } 279 | -------------------------------------------------------------------------------- /one_million/raw_one_million_parsing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | 
"outputs": [], 8 | "source": [ 9 | "import xml.etree.ElementTree as ET\n", 10 | "import numpy as np\n", 11 | "tree = ET.parse('../../dataset/semcor+omsti.data.xml')\n", 12 | "root = tree.getroot()" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "" 24 | ] 25 | }, 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "root" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "cor1 = root[0]\n", 44 | "cor2 = root[1]" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 6, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "" 56 | ] 57 | }, 58 | "execution_count": 6, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "cor2" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "def isalphabet(word):\n", 76 | " return True" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "37176\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "train1 = []\n", 94 | "#soup=soup.find_all('corpus')[1]\n", 95 | "count=0\n", 96 | "for sentences in cor1.findall('text'):\n", 97 | " for sentence in sentences:\n", 98 | " temp_sent = []\n", 99 | " temp_sent.append(sentence.get('id'))\n", 100 | "\n", 101 | " temp_words = []\n", 102 | " i_cnt=1\n", 103 | " ind=[]\n", 104 | " for word in sentence:\n", 105 | " string = word.text.lower() \n", 106 | " if (isalphabet(string)):\n", 107 | " temp_words.append(string)\n", 108 | " ind.append(i_cnt)\n", 109 | " i_cnt+=1\n", 110 | "\n", 111 | " temp_sent.append(temp_words)\n", 112 | " list_ = sentence.iter()\n", 113 | " id_list = []\n", 114 | " lemma_list = []\n", 115 | " pos_list = []\n", 116 | " for i in list_:\n", 117 | " id_list.append(i.get('id'))\n", 118 | " lemma_list.append(i.get('lemma'))\n", 119 | " pos_list.append(i.get('pos'))\n", 120 | "\n", 121 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n", 122 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n", 123 | " train1.append(temp_sent)\n", 124 | " count+=1\n", 125 | " \n", 126 | "print(count)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "train1=train1[:len(train1)-2]" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "813798\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "train2 = []\n", 155 | "#soup=soup.find_all('corpus')[1]\n", 156 | "count=0\n", 157 | "for sentences in cor2.findall('text'):\n", 158 | " for sentence in sentences:\n", 159 | " temp_sent = []\n", 160 | " temp_sent.append(sentence.get('id'))\n", 161 | "\n", 162 | " temp_words = []\n", 163 | " i_cnt=1\n", 164 | " ind=[]\n", 165 | " for word in sentence:\n", 166 | " string = word.text.lower() \n", 167 | " if (isalphabet(string)):\n", 168 | " 
temp_words.append(string)\n", 169 | " ind.append(i_cnt)\n", 170 | " i_cnt+=1\n", 171 | "\n", 172 | " temp_sent.append(temp_words)\n", 173 | " list_ = sentence.iter()\n", 174 | " id_list = []\n", 175 | " lemma_list = []\n", 176 | " pos_list = []\n", 177 | " for i in list_:\n", 178 | " id_list.append(i.get('id'))\n", 179 | " lemma_list.append(i.get('lemma'))\n", 180 | " pos_list.append(i.get('pos'))\n", 181 | "\n", 182 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n", 183 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n", 184 | " train2.append(temp_sent)\n", 185 | " count+=1\n", 186 | " \n", 187 | "print(count)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "850972" 199 | ] 200 | }, 201 | "execution_count": 7, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "len(train1)+len(train2)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 17, 213 | "metadata": { 214 | "collapsed": true 215 | }, 216 | "outputs": [], 217 | "source": [ 218 | "train=train1+train2" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 18, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "850972" 230 | ] 231 | }, 232 | "execution_count": 18, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "len(train)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 19, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "import pickle\n", 250 | "with open('raw_preprocess_train','wb') as f:\n", 251 | " pickle.dump(train,f)" 252 | ] 253 | } 254 | ], 255 | "metadata": { 256 | "kernelspec": { 257 | "display_name": "envs", 258 | "language": "python", 259 | "name": "cs771" 260 | }, 261 | "language_info": { 262 | "codemirror_mode": { 263 | "name": "ipython", 264 | "version": 3 265 | }, 266 | "file_extension": ".py", 267 | "mimetype": "text/x-python", 268 | "name": "python", 269 | "nbconvert_exporter": "python", 270 | "pygments_lexer": "ipython3", 271 | "version": "3.5.2" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 2 276 | } 277 | -------------------------------------------------------------------------------- /papers/1603.07012.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/1603.07012.pdf -------------------------------------------------------------------------------- /papers/9f260612d5817d542cda2a7d9a6eb18d6471.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/9f260612d5817d542cda2a7d9a6eb18d6471.pdf -------------------------------------------------------------------------------- /papers/D17-1008.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/D17-1008.pdf 
-------------------------------------------------------------------------------- /papers/K16-1006.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/K16-1006.pdf -------------------------------------------------------------------------------- /papers/P16-1085.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/P16-1085.pdf -------------------------------------------------------------------------------- /papers/W16-5307.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/W16-5307.pdf -------------------------------------------------------------------------------- /papers/a10-navigli.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/a10-navigli.pdf -------------------------------------------------------------------------------- /papers/crf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/crf.pdf -------------------------------------------------------------------------------- /papers/report1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/report1.pdf -------------------------------------------------------------------------------- /papers/report2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/report2.pdf --------------------------------------------------------------------------------
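For anyone who wants to rerun the per-word extraction outside Jupyter, a minimal standalone sketch follows. It assumes the pickled list written by raw_one_million_parsing.ipynb, in which every entry has the layout [sentence_id, words, instance_ids, lemmas, pos_tags]; the input and output paths are placeholders and should be adjusted to the local dataset location.

import pickle

# Sketch only: mirrors make_word_data from one_word_data_maker.ipynb.
# Assumed entry layout per sentence: [sentence_id, words, instance_ids, lemmas, pos_tags].
def make_word_data(global_data, checkword):
    dataset = []
    for sent in global_data:
        _, words, inst_ids, lemmas, pos_tags = sent
        for idx, lemma in enumerate(lemmas):
            # Keep only sense-annotated occurrences of the target lemma.
            if lemma == checkword and inst_ids[idx] is not None:
                dataset.append([inst_ids[idx], words, pos_tags])
    return dataset

if __name__ == "__main__":
    with open("raw_preprocess_train", "rb") as f:      # placeholder path
        global_data = pickle.load(f)
    for word in ["force", "make", "open", "place", "point", "serve", "support"]:
        word_data = make_word_data(global_data, word)
        print(word, len(word_data))
        with open(word + "_data", "wb") as f:          # placeholder path
            pickle.dump(word_data, f)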