├── .gitignore
├── Four Word Model
    ├── Model-2.ipynb
    ├── Model-3.ipynb
    ├── Model-4.ipynb
    ├── Model-5.ipynb
    ├── Model-6.ipynb
    ├── Model-7.ipynb
    ├── Model-8.ipynb
    ├── Model-9.ipynb
    ├── Model.ipynb
    ├── Preprocess_Files
    │   ├── hard
    │   │   ├── sense
    │   │   └── sent
    │   ├── interest
    │   │   ├── sense
    │   │   └── sent
    │   ├── line
    │   │   ├── sense
    │   │   └── sent
    │   └── serve
    │   │   ├── sense
    │   │   └── sent
    ├── Senses.txt
    ├── Sentences.txt
    ├── final_preprocessing.ipynb
    ├── full_train.pickle
    ├── initial_processing.ipynb
    ├── robsr_model.ipynb
    ├── train.pickle
    ├── vocab_overlap_analysis.ipynb
    └── words_not_in_vocab.pickle
├── LICENSE
├── README.md
├── UGP_Report.pdf
├── UGP_presentation.pdf
├── models_diagram
    ├── all-word-1.png
    ├── all-word-2.png
    ├── all-word-3.png
    ├── all-word-4.png
    ├── all-word-5.png
    ├── all-word-6.png
    ├── all-word-7.png
    ├── all-word-8.png
    ├── model-1.png
    ├── model-2.png
    ├── model-3.png
    └── model-4.png
├── one_million
    ├── One-Million All-Word Data Sampling Coarse.ipynb
    ├── One-Million All-Word Data Sampling-Fine.ipynb
    ├── One-Million All-Word Data-hierarchical Sampling-Fine.ipynb
    ├── One-Million All-Word Data-seq.ipynb
    ├── Sense-test.ipynb
    ├── Sense.ipynb
    ├── all-word-model
    ├── all-word
    │   ├── Model-aw-1-multigpu-1.ipynb
    │   ├── Model-aw-1-multigpu-2.ipynb
    │   ├── Model-aw-1-multigpu-3.ipynb
    │   ├── Model-aw-3-1.ipynb
    │   ├── Model-aw-3.ipynb
    │   ├── Model-aw-4-1.ipynb
    │   ├── Model-aw-lex-1.2.ipynb
    │   ├── Model-aw-lex-1.3.ipynb
    │   ├── Model-aw-lex-1.4.ipynb
    │   ├── Model-aw-lex-1.ipynb
    │   ├── Model-aw-lex-2.2.ipynb
    │   ├── Model-aw-lex-hierarchical-1.ipynb
    │   ├── Model-aw-lex-hierarchical-2.ipynb
    │   ├── Model-aw-lex-hierarchical-3.ipynb
    │   ├── Model-aw-lex-hierarchical-4.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v1.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-1.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-2.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-3.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-4.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-5.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-6.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-7.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-8.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v2-9.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v3-1.ipynb
    │   ├── Model-aw-lex-local_attention-fast-v4-1.ipynb
    │   ├── Model-aw-lex-local_attention-slow-1.ipynb
    │   ├── Model-aw-lex-local_attention-slow-2.ipynb
    │   ├── Model-aw-lex-seq-hierarchical-1.ipynb
    │   ├── Model-aw-lex-seq-hierarchical-2.ipynb
    │   ├── Model-aw-sense-1.ipynb
    │   └── Readme.md
    ├── force
    │   ├── Force-Model-1-multigpu-1.ipynb
    │   ├── Force-Model-1-multigpu-2.ipynb
    │   ├── Force-Model-1-multigpu-3.ipynb
    │   ├── Force-Model-1.ipynb
    │   ├── Force-Model-2-multigpu-1.ipynb
    │   ├── Force-Model-2.ipynb
    │   ├── Force-Model-3-multigpu-1.ipynb
    │   ├── Force-Model-3.ipynb
    │   ├── Force-Model-4-multigpu-1.ipynb
    │   ├── Force-Model-4.ipynb
    │   └── Force-Model-5.ipynb
    ├── make
    │   ├── Make-Model-1-multigpu-1.ipynb
    │   ├── Make-Model-1.ipynb
    │   ├── Make-Model-2-multigpu-1.ipynb
    │   ├── Make-Model-2.ipynb
    │   ├── Make-Model-3-1.ipynb
    │   ├── Make-Model-3-2.ipynb
    │   ├── Make-Model-3-3.ipynb
    │   ├── Make-Model-3-multigpu-1.ipynb
    │   └── Make-Model-3.ipynb
    ├── one_million_parsing.ipynb
    ├── one_word_data_maker-test.ipynb
    ├── one_word_data_maker.ipynb
    ├── open
    │   ├── Open-Model-1-multigpu-1.ipynb
    │   ├── Open-Model-2-multigpu-1.ipynb
    │   ├── Open-Model-3-multigpu-1.ipynb
    │   ├── Open-Model-3.ipynb
    │   ├── Open-Model-4-multigpu-1.ipynb
    │   └── Open-Model-4.ipynb
    ├── place
    │   ├── Place-Model-1-multigpu-1.ipynb
    │   ├── Place-Model-2-multigpu-1.ipynb
    │   ├── Place-Model-2.ipynb
    │   ├── Place-Model-3-multigpu-1.ipynb
    │   ├── Place-Model-3.ipynb
    │   ├── Place-Model-4-multigpu-1.ipynb
    │   ├── Place-Model-4.ipynb
    │   └── Place-Model-6.ipynb
    ├── point
    │   ├── Point-Model-1-multigpu-1.ipynb
    │   ├── Point-Model-2-multigpu-1.ipynb
    │   ├── Point-Model-2.ipynb
    │   ├── Point-Model-3-multigpu-1.ipynb
    │   ├── Point-Model-3.ipynb
    │   ├── Point-Model-4-multigpu-1.ipynb
    │   └── Point-Model-4.ipynb
    ├── raw_one_million_parsing-test.ipynb
    ├── raw_one_million_parsing.ipynb
    ├── serve
    │   ├── Serve-Model-1-multigpu-2.ipynb
    │   ├── Serve-Model-1.ipynb
    │   ├── Serve-Model-2.ipynb
    │   └── Serve-Model-3.ipynb
    └── support
    │   ├── Support-Model-1-multigpu-1.ipynb
    │   ├── Support-Model-2-multigpu-1.ipynb
    │   ├── Support-Model-3-multigpu-1.ipynb
    │   ├── Support-Model-3.ipynb
    │   ├── Support-Model-4-multigpu-1.ipynb
    │   ├── Support-Model-4.ipynb
    │   └── Support-Model-5.ipynb
└── papers
    ├── 1603.07012.pdf
    ├── 9f260612d5817d542cda2a7d9a6eb18d6471.pdf
    ├── D17-1008.pdf
    ├── K16-1006.pdf
    ├── P16-1085.pdf
    ├── W16-5307.pdf
    ├── a10-navigli.pdf
    ├── crf.pdf
    ├── report1.pdf
    └── report2.pdf


/.gitignore:
--------------------------------------------------------------------------------
 1 | rushab/
 2 | dataset/
 3 | Glove/
 4 | glove/
 5 | Four Word Model/.ipynb_checkpoints/
 6 | data/
 7 | .ipynb_checkpoints/
 8 | Four Word Model/output
 9 | one_million/output
10 | *.pickle
11 | papers/


--------------------------------------------------------------------------------
/Four Word Model/Model-2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import tensorflow as tf\n",
 12 |     "tf.logging.set_verbosity(tf.logging.WARN)\n",
 13 |     "import pickle\n",
 14 |     "import numpy as np\n",
 15 |     "import os\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from sklearn.metrics import f1_score\n",
 18 |     "from sklearn.metrics import accuracy_score\n",
 19 |     "import os\n",
 20 |     "from tensorflow.python.client import device_lib"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "f = open('../Glove/word_embedding_glove', 'rb')\n",
 32 |     "word_embedding = pickle.load(f)\n",
 33 |     "f.close()\n",
 34 |     "word_embedding = word_embedding[: len(word_embedding)-1]\n",
 35 |     "\n",
 36 |     "f = open('../Glove/vocab_glove', 'rb')\n",
 37 |     "vocab = pickle.load(f)\n",
 38 |     "f.close()\n",
 39 |     "\n",
 40 |     "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
 41 |     "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
 42 |     "\n",
 43 |     "unknown_token = \"UNKNOWN_TOKEN\"\n",
 44 |     "\n",
 45 |     "f = open(\"train.pickle\", 'rb')\n",
 46 |     "full_data = pickle.load(f)\n",
 47 |     "f.close()"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 3,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "# Model Description\n",
 59 |     "sense_word = 'hard'\n",
 60 |     "model_name = 'model-2'\n",
 61 |     "model_dir = 'output/' + sense_word + '/' + model_name\n",
 62 |     "save_dir = os.path.join(model_dir, \"save/\")\n",
 63 |     "log_dir = os.path.join(model_dir, \"log\")\n",
 64 |     "\n",
 65 |     "if not os.path.exists(model_dir):\n",
 66 |     "    os.mkdir(model_dir)\n",
 67 |     "if not os.path.exists(save_dir):\n",
 68 |     "    os.mkdir(save_dir)\n",
 69 |     "if not os.path.exists(log_dir):\n",
 70 |     "    os.mkdir(log_dir)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 4,
 76 |    "metadata": {
 77 |     "collapsed": true
 78 |    },
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "# Parameters\n",
 82 |     "mode = 'train'\n",
 83 |     "num_senses = 3\n",
 84 |     "batch_size = 64\n",
 85 |     "vocab_size = len(vocab)\n",
 86 |     "unk_vocab_size = 1\n",
 87 |     "word_emb_size = len(word_embedding[0])\n",
 88 |     "max_sent_size = 200\n",
 89 |     "hidden_size = 100\n",
 90 |     "keep_prob = 0.5\n",
 91 |     "l2_lambda = 0.001\n",
 92 |     "init_lr = 0.001\n",
 93 |     "decay_steps = 500\n",
 94 |     "decay_rate = 0.96\n",
 95 |     "clip_norm = 1\n",
 96 |     "clipping = True"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 5,
102 |    "metadata": {
103 |     "collapsed": true
104 |    },
105 |    "outputs": [],
106 |    "source": [
107 |     "# MODEL\n",
108 |     "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
109 |     "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
110 |     "x_mask  = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
111 |     "is_train = tf.placeholder('bool', [], name='is_train')\n",
112 |     "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
113 |     "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
114 |     "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
115 |     "\n",
116 |     "with tf.name_scope(\"word_embedding\"):\n",
117 |     "    if mode == 'train':\n",
118 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
119 |     "    else:\n",
120 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
121 |     "        \n",
122 |     "    final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
123 |     "    Wx = tf.nn.embedding_lookup(final_word_emb_mat, x)  \n",
124 |     "\n",
125 |     "with tf.variable_scope(\"lstm1\"):\n",
126 |     "    cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
127 |     "    cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
128 |     "\n",
129 |     "    d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n",
130 |     "    d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n",
131 |     "    \n",
132 |     "    (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n",
133 |     "    h1 = tf.concat([fw_h1, bw_h1], 2)\n",
134 |     "    \n",
135 |     "with tf.variable_scope(\"lstm2\"):\n",
136 |     "    cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
137 |     "    cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
138 |     "\n",
139 |     "    d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n",
140 |     "    d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n",
141 |     "    \n",
142 |     "    (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n",
143 |     "    h = tf.concat([fw_h2, bw_h2], 2)\n",
144 |     "\n",
145 |     "def attention(input_x, input_mask, W_att):\n",
146 |     "    h_masked = tf.boolean_mask(input_x, input_mask)\n",
147 |     "    h_tanh = tf.tanh(h_masked)\n",
148 |     "    u = tf.matmul(h_tanh, W_att)\n",
149 |     "    a = tf.nn.softmax(u)\n",
150 |     "    c = tf.reduce_sum(tf.multiply(h_tanh, a), 0)  \n",
151 |     "    return c\n",
152 |     "\n",
153 |     "with tf.variable_scope(\"attention\"):\n",
154 |     "    W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n",
155 |     "    c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
156 |     "    for i in range(1, batch_size):\n",
157 |     "        c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
158 |     "        \n",
159 |     "with tf.variable_scope(\"softmax_layer\"):\n",
160 |     "    W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n",
161 |     "    b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
162 |     "    drop_c = tf.nn.dropout(c, input_keep_prob)\n",
163 |     "    logits = tf.matmul(drop_c, W) + b\n",
164 |     "    predictions = tf.argmax(logits, 1)\n",
165 |     "\n",
166 |     "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n",
167 |     "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
168 |     "\n",
169 |     "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
170 |     "\n",
171 |     "tv_all = tf.trainable_variables()\n",
172 |     "tv_regu =[]\n",
173 |     "for t in tv_all:\n",
174 |     "    if t.name.find('b:')==-1:\n",
175 |     "        tv_regu.append(t)\n",
176 |     "        \n",
177 |     "# l2 Loss\n",
178 |     "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
179 |     "\n",
180 |     "total_loss = loss + l2_loss\n",
181 |     "\n",
182 |     "# Optimizer for loss\n",
183 |     "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
184 |     "\n",
185 |     "# Gradients and Variables for Loss\n",
186 |     "grads_vars = optimizer.compute_gradients(total_loss)\n",
187 |     "\n",
188 |     "# Clipping of Gradients\n",
189 |     "clipped_grads = grads_vars\n",
190 |     "if(clipping == True):\n",
191 |     "    clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
192 |     "\n",
193 |     "# Training Optimizer for Total Loss\n",
194 |     "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
195 |     "\n",
196 |     "# Summaries\n",
197 |     "var_summaries = []\n",
198 |     "for v in tv_all:\n",
199 |     "    var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
200 |     "    var_summaries.append(var_summary)\n",
201 |     "\n",
202 |     "var_summaries_merged = tf.summary.merge(var_summaries)\n",
203 |     "\n",
204 |     "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
205 |     "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
206 |     "summary = tf.summary.merge_all()"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": 6,
212 |    "metadata": {
213 |     "collapsed": true
214 |    },
215 |    "outputs": [],
216 |    "source": [
217 |     "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   # see issue #152\n",
218 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"3\"\n",
219 |     "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
220 |     "sess.run(tf.global_variables_initializer())                          # For initializing all the variables\n",
221 |     "saver = tf.train.Saver()                                             # For Saving the model\n",
222 |     "summary_writer = tf.summary.FileWriter(log_dir, sess.graph)          # For writing Summaries"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 7,
228 |    "metadata": {
229 |     "scrolled": true
230 |    },
231 |    "outputs": [
232 |     {
233 |      "name": "stderr",
234 |      "output_type": "stream",
235 |      "text": [
236 |       "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
237 |       "  FutureWarning)\n"
238 |      ]
239 |     }
240 |    ],
241 |    "source": [
242 |     "# Splitting\n",
243 |     "data_x = full_data[sense_word][0]\n",
244 |     "data_y = full_data[sense_word][2]\n",
245 |     "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n",
246 |     "\n",
247 |     "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": 8,
253 |    "metadata": {
254 |     "collapsed": true
255 |    },
256 |    "outputs": [],
257 |    "source": [
258 |     "def data_prepare(x):\n",
259 |     "    num_examples = len(x)\n",
260 |     "\n",
261 |     "    xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
262 |     "    xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
263 |     "\n",
264 |     "    for j in range(num_examples):\n",
265 |     "        for i in range(max_sent_size):\n",
266 |     "            if(i>=len(x[j])):\n",
267 |     "                break\n",
268 |     "            w = x[j][i]\n",
269 |     "            xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
270 |     "            xx_mask[j][i] = True\n",
271 |     "            \n",
272 |     "    return xx, xx_mask\n",
273 |     "\n",
274 |     "def eval_score(yy, pred):\n",
275 |     "    num_batches = int(len(yy)/batch_size)\n",
276 |     "    f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
277 |     "    accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
278 |     "    return f1*100, accu*100\n",
279 |     "\n",
280 |     "def model(xx, yy, mask, train_cond=True):\n",
281 |     "    num_batches = int(len(xx)/batch_size)\n",
282 |     "    losses = 0\n",
283 |     "    preds = []\n",
284 |     "    for j in range(num_batches): \n",
285 |     "        \n",
286 |     "        s = j * batch_size\n",
287 |     "        e = (j+1) * batch_size\n",
288 |     "        \n",
289 |     "        feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
290 |     "        \n",
291 |     "        \n",
292 |     "        if(train_cond==True):\n",
293 |     "            _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
294 |     "            summary_writer.add_summary(_summary, step)  \n",
295 |     "#             print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
296 |     "\n",
297 |     "        else:\n",
298 |     "            _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
299 |     "            preds.append(pred)\n",
300 |     "            \n",
301 |     "        losses +=_loss\n",
302 |     "\n",
303 |     "    if(train_cond==False):\n",
304 |     "        y_pred = []\n",
305 |     "        for i in range(num_batches):\n",
306 |     "            for pred in preds[i]:\n",
307 |     "                y_pred.append(pred)\n",
308 |     "        return losses/num_batches, y_pred\n",
309 |     "    \n",
310 |     "    return losses/num_batches, step"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 9,
316 |    "metadata": {
317 |     "collapsed": true
318 |    },
319 |    "outputs": [],
320 |    "source": [
321 |     "x_id_train, mask_train = data_prepare(x_train)\n",
322 |     "x_id_val, mask_val = data_prepare(x_val)\n",
323 |     "x_id_test, mask_test = data_prepare(x_test)\n",
324 |     "y_train = np.array(y_train)"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": 10,
330 |    "metadata": {
331 |     "scrolled": true
332 |    },
333 |    "outputs": [
334 |     {
335 |      "name": "stdout",
336 |      "output_type": "stream",
337 |      "text": [
338 |       "Epoch: 1 Step: 48 loss: 1.86646759758\n",
339 |       "Epoch: 2 Step: 96 loss: 1.21714039147\n",
340 |       "Epoch: 3 Step: 144 loss: 1.08560919886\n",
341 |       "Epoch: 4 Step: 192 loss: 0.980009039243\n",
342 |       "Epoch: 5 Step: 240 loss: 0.881924713651\n",
343 |       "Saved Model Complete\n",
344 |       "Train: F1 Score:  60.3386654855 Accuracy:  83.5611979167 Loss:  0.80800242722\n",
345 |       "Val: F1 Score:  54.6225701167 Accuracy:  82.8125 Loss:  0.770564937592\n",
346 |       "Epoch: 6 Step: 288 loss: 0.806718610227\n",
347 |       "Epoch: 7 Step: 336 loss: 0.742333145191\n",
348 |       "Epoch: 8 Step: 384 loss: 0.699159173295\n",
349 |       "Epoch: 9 Step: 432 loss: 0.681758804868\n",
350 |       "Epoch: 10 Step: 480 loss: 0.631260214373\n",
351 |       "Saved Model Complete\n",
352 |       "Train: F1 Score:  62.415876497 Accuracy:  85.64453125 Loss:  0.609668933476\n",
353 |       "Val: F1 Score:  63.0309748731 Accuracy:  87.1875 Loss:  0.603685164452\n",
354 |       "Epoch: 11 Step: 528 loss: 0.621730036413\n",
355 |       "Epoch: 12 Step: 576 loss: 0.593547300746\n",
356 |       "Epoch: 13 Step: 624 loss: 0.567168306559\n",
357 |       "Epoch: 14 Step: 672 loss: 0.572736630837\n",
358 |       "Epoch: 15 Step: 720 loss: 0.52119900162\n",
359 |       "Saved Model Complete\n",
360 |       "Train: F1 Score:  73.2345182784 Accuracy:  87.59765625 Loss:  0.500816229731\n",
361 |       "Val: F1 Score:  71.3846572025 Accuracy:  88.125 Loss:  0.507379829884\n",
362 |       "Epoch: 16 Step: 768 loss: 0.518757795294\n",
363 |       "Epoch: 17 Step: 816 loss: 0.508907252923\n",
364 |       "Epoch: 18 Step: 864 loss: 0.480370514716\n",
365 |       "Epoch: 19 Step: 912 loss: 0.481487047548\n",
366 |       "Epoch: 20 Step: 960 loss: 0.483874622112\n",
367 |       "Saved Model Complete\n",
368 |       "Train: F1 Score:  72.4541504438 Accuracy:  88.57421875 Loss:  0.454483479882\n",
369 |       "Val: F1 Score:  69.7799159478 Accuracy:  88.4375 Loss:  0.505139875412\n",
370 |       "Epoch: 21 Step: 1008 loss: 0.445587230225\n",
371 |       "Epoch: 22 Step: 1056 loss: 0.448845259845\n",
372 |       "Epoch: 23 Step: 1104 loss: 0.418395101403\n",
373 |       "Epoch: 24 Step: 1152 loss: 0.42787179475\n",
374 |       "Epoch: 25 Step: 1200 loss: 0.41220224835\n",
375 |       "Saved Model Complete\n",
376 |       "Train: F1 Score:  79.7367544121 Accuracy:  90.5598958333 Loss:  0.387414715563\n",
377 |       "Val: F1 Score:  80.5119717533 Accuracy:  91.25 Loss:  0.428414440155\n",
378 |       "Epoch: 26 Step: 1248 loss: 0.398100319629\n",
379 |       "Epoch: 27 Step: 1296 loss: 0.401642986884\n",
380 |       "Epoch: 28 Step: 1344 loss: 0.380077781156\n",
381 |       "Epoch: 29 Step: 1392 loss: 0.371819969267\n",
382 |       "Epoch: 30 Step: 1440 loss: 0.375808695642\n",
383 |       "Saved Model Complete\n",
384 |       "Train: F1 Score:  82.6141307319 Accuracy:  91.1458333333 Loss:  0.374826697633\n",
385 |       "Val: F1 Score:  72.6194736328 Accuracy:  87.5 Loss:  0.443939989805\n",
386 |       "Epoch: 31 Step: 1488 loss: 0.368128724086\n",
387 |       "Epoch: 32 Step: 1536 loss: 0.363611215415\n",
388 |       "Epoch: 33 Step: 1584 loss: 0.370647774388\n",
389 |       "Epoch: 34 Step: 1632 loss: 0.368405311989\n",
390 |       "Epoch: 35 Step: 1680 loss: 0.349992937719\n",
391 |       "Saved Model Complete\n",
392 |       "Train: F1 Score:  81.482253082 Accuracy:  91.6666666667 Loss:  0.36779523051\n",
393 |       "Val: F1 Score:  76.2094695081 Accuracy:  89.6875 Loss:  0.484789025784\n",
394 |       "Epoch: 36 Step: 1728 loss: 0.347480880097\n",
395 |       "Epoch: 37 Step: 1776 loss: 0.344036137685\n",
396 |       "Epoch: 38 Step: 1824 loss: 0.329046547723\n",
397 |       "Epoch: 39 Step: 1872 loss: 0.308786494968\n",
398 |       "Epoch: 40 Step: 1920 loss: 0.335401780282\n",
399 |       "Saved Model Complete\n",
400 |       "Train: F1 Score:  88.5588616245 Accuracy:  94.53125 Loss:  0.291247650981\n",
401 |       "Val: F1 Score:  84.105797863 Accuracy:  92.5 Loss:  0.359305435419\n",
402 |       "Epoch: 41 Step: 1968 loss: 0.332291507783\n",
403 |       "Epoch: 42 Step: 2016 loss: 0.314355407842\n",
404 |       "Epoch: 43 Step: 2064 loss: 0.319293403377\n",
405 |       "Epoch: 44 Step: 2112 loss: 0.297154735463\n",
406 |       "Epoch: 45 Step: 2160 loss: 0.305809813552\n",
407 |       "Saved Model Complete\n",
408 |       "Train: F1 Score:  88.5833478857 Accuracy:  94.5963541667 Loss:  0.283582553578\n",
409 |       "Val: F1 Score:  78.8451959418 Accuracy:  90.625 Loss:  0.493378305435\n",
410 |       "Epoch: 46 Step: 2208 loss: 0.28896213385\n",
411 |       "Epoch: 47 Step: 2256 loss: 0.299109598001\n",
412 |       "Epoch: 48 Step: 2304 loss: 0.285256354449\n",
413 |       "Epoch: 49 Step: 2352 loss: 0.293783533076\n",
414 |       "Epoch: 50 Step: 2400 loss: 0.288317035573\n",
415 |       "Saved Model Complete\n",
416 |       "Train: F1 Score:  89.4559800481 Accuracy:  94.8567708333 Loss:  0.271899814407\n",
417 |       "Val: F1 Score:  78.6662686459 Accuracy:  89.375 Loss:  0.450355643034\n",
418 |       "Test: F1 Score:  78.1099629833 Accuracy:  89.7836538462 Loss:  0.494133715446\n"
419 |      ]
420 |     }
421 |    ],
422 |    "source": [
423 |     "num_epochs = 50\n",
424 |     "\n",
425 |     "for i in range(num_epochs):\n",
426 |     "    \n",
427 |     "    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
428 |     "    x_id_train = x_id_train[random]\n",
429 |     "    y_train = y_train[random]\n",
430 |     "    mask_train = mask_train[random]\n",
431 |     "            \n",
432 |     "    losses, step = model(x_id_train, y_train, mask_train)\n",
433 |     "    print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
434 |     "    \n",
435 |     "    if((i+1)%5==0):\n",
436 |     "        saver.save(sess, save_path=save_dir)                         \n",
437 |     "        print(\"Saved Model Complete\")\n",
438 |     "        train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
439 |     "        f1_, accu_ = eval_score(y_train, train_pred)\n",
440 |     "        print(\"Train: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n",
441 |     "        val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
442 |     "        f1_, accu_ = eval_score(y_val, val_pred)\n",
443 |     "        print(\"Val: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
444 |     "        \n",
445 |     "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n",
446 |     "f1_, accu_ = eval_score(y_test, test_pred)\n",
447 |     "print(\"Test: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)"
448 |    ]
449 |   },
450 |   {
451 |    "cell_type": "code",
452 |    "execution_count": null,
453 |    "metadata": {
454 |     "collapsed": true
455 |    },
456 |    "outputs": [],
457 |    "source": []
458 |   },
459 |   {
460 |    "cell_type": "code",
461 |    "execution_count": null,
462 |    "metadata": {
463 |     "collapsed": true
464 |    },
465 |    "outputs": [],
466 |    "source": [
467 |     "saver.restore(sess, save_dir)"
468 |    ]
469 |   }
470 |  ],
471 |  "metadata": {
472 |   "kernelspec": {
473 |    "display_name": "cs771",
474 |    "language": "python",
475 |    "name": "cs771"
476 |   },
477 |   "language_info": {
478 |    "codemirror_mode": {
479 |     "name": "ipython",
480 |     "version": 3
481 |    },
482 |    "file_extension": ".py",
483 |    "mimetype": "text/x-python",
484 |    "name": "python",
485 |    "nbconvert_exporter": "python",
486 |    "pygments_lexer": "ipython3",
487 |    "version": "3.5.2"
488 |   }
489 |  },
490 |  "nbformat": 4,
491 |  "nbformat_minor": 2
492 | }
493 | 


--------------------------------------------------------------------------------
/Four Word Model/Model-3.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import tensorflow as tf\n",
 12 |     "tf.logging.set_verbosity(tf.logging.WARN)\n",
 13 |     "import pickle\n",
 14 |     "import numpy as np\n",
 15 |     "import os\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from sklearn.metrics import f1_score\n",
 18 |     "from sklearn.metrics import accuracy_score\n",
 19 |     "import os\n",
 20 |     "from tensorflow.python.client import device_lib\n",
 21 |     "from collections import Counter"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 2,
 27 |    "metadata": {
 28 |     "collapsed": true
 29 |    },
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "f = open('../Glove/word_embedding_glove', 'rb')\n",
 33 |     "word_embedding = pickle.load(f)\n",
 34 |     "f.close()\n",
 35 |     "word_embedding = word_embedding[: len(word_embedding)-1]\n",
 36 |     "\n",
 37 |     "f = open('../Glove/vocab_glove', 'rb')\n",
 38 |     "vocab = pickle.load(f)\n",
 39 |     "f.close()\n",
 40 |     "\n",
 41 |     "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
 42 |     "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
 43 |     "\n",
 44 |     "unknown_token = \"UNKNOWN_TOKEN\"\n",
 45 |     "\n",
 46 |     "f = open(\"train.pickle\", 'rb')\n",
 47 |     "full_data = pickle.load(f)\n",
 48 |     "f.close()"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 3,
 54 |    "metadata": {
 55 |     "collapsed": true
 56 |    },
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "# Model Description\n",
 60 |     "sense_word = 'hard'\n",
 61 |     "model_name = 'model-3'\n",
 62 |     "model_dir = 'output/' + sense_word + '/' + model_name\n",
 63 |     "save_dir = os.path.join(model_dir, \"save/\")\n",
 64 |     "log_dir = os.path.join(model_dir, \"log\")\n",
 65 |     "\n",
 66 |     "if not os.path.exists(model_dir):\n",
 67 |     "    os.mkdir(model_dir)\n",
 68 |     "if not os.path.exists(save_dir):\n",
 69 |     "    os.mkdir(save_dir)\n",
 70 |     "if not os.path.exists(log_dir):\n",
 71 |     "    os.mkdir(log_dir)"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 4,
 77 |    "metadata": {},
 78 |    "outputs": [
 79 |     {
 80 |      "name": "stdout",
 81 |      "output_type": "stream",
 82 |      "text": [
 83 |       "Counter({'HARD1': 3455, 'HARD2': 502, 'HARD3': 376})\n",
 84 |       "[ 1.21578586  5.30486965  5.47934437]\n"
 85 |      ]
 86 |     }
 87 |    ],
 88 |    "source": [
 89 |     "sense_counts = Counter(full_data[sense_word][1])\n",
 90 |     "print(sense_counts)\n",
 91 |     "total_count = len(full_data[sense_word][1])\n",
 92 |     "sort_sense_counts = sense_counts.most_common()\n",
 93 |     "vocab_sense = [k for k,v in sort_sense_counts]\n",
 94 |     "freq_sense = [v for k,v in sort_sense_counts]\n",
 95 |     "weights = np.multiply(6, [1 - count/total_count for count in freq_sense])\n",
 96 |     "weights = weights.astype(np.float32)\n",
 97 |     "print(weights)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": 5,
103 |    "metadata": {
104 |     "collapsed": true
105 |    },
106 |    "outputs": [],
107 |    "source": [
108 |     "# Parameters\n",
109 |     "mode = 'train'\n",
110 |     "num_senses = 3\n",
111 |     "batch_size = 64\n",
112 |     "vocab_size = len(vocab)\n",
113 |     "unk_vocab_size = 1\n",
114 |     "word_emb_size = len(word_embedding[0])\n",
115 |     "max_sent_size = 200\n",
116 |     "hidden_size = 100\n",
117 |     "keep_prob = 0.5\n",
118 |     "l2_lambda = 0.002\n",
119 |     "init_lr = 0.005\n",
120 |     "decay_steps = 500\n",
121 |     "decay_rate = 0.96\n",
122 |     "clip_norm = 1\n",
123 |     "clipping = True"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 6,
129 |    "metadata": {
130 |     "collapsed": true
131 |    },
132 |    "outputs": [],
133 |    "source": [
134 |     "# MODEL\n",
135 |     "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
136 |     "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
137 |     "x_mask  = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
138 |     "is_train = tf.placeholder('bool', [], name='is_train')\n",
139 |     "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
140 |     "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
141 |     "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
142 |     "\n",
143 |     "with tf.name_scope(\"word_embedding\"):\n",
144 |     "    if mode == 'train':\n",
145 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
146 |     "    else:\n",
147 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
148 |     "        \n",
149 |     "    final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
150 |     "    Wx = tf.nn.embedding_lookup(final_word_emb_mat, x)  \n",
151 |     "\n",
152 |     "with tf.variable_scope(\"lstm1\"):\n",
153 |     "    cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
154 |     "    cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
155 |     "\n",
156 |     "    d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n",
157 |     "    d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n",
158 |     "    \n",
159 |     "    (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n",
160 |     "    h1 = tf.concat([fw_h1, bw_h1], 2)\n",
161 |     "    \n",
162 |     "with tf.variable_scope(\"lstm2\"):\n",
163 |     "    cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
164 |     "    cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
165 |     "\n",
166 |     "    d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n",
167 |     "    d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n",
168 |     "    \n",
169 |     "    (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n",
170 |     "    h = tf.concat([fw_h2, bw_h2], 2)\n",
171 |     "\n",
172 |     "def attention(input_x, input_mask, W_att):\n",
173 |     "    h_masked = tf.boolean_mask(input_x, input_mask)\n",
174 |     "    h_tanh = tf.tanh(h_masked)\n",
175 |     "    u = tf.matmul(h_tanh, W_att)\n",
176 |     "    a = tf.nn.softmax(u)\n",
177 |     "    c = tf.reduce_sum(tf.multiply(h_tanh, a), 0)  \n",
178 |     "    return c\n",
179 |     "\n",
180 |     "with tf.variable_scope(\"attention\"):\n",
181 |     "    W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n",
182 |     "    c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
183 |     "    for i in range(1, batch_size):\n",
184 |     "        c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
185 |     "        \n",
186 |     "with tf.variable_scope(\"softmax_layer\"):\n",
187 |     "    W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n",
188 |     "    b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
189 |     "    drop_c = tf.nn.dropout(c, input_keep_prob)\n",
190 |     "    logits = tf.matmul(drop_c, W) + b\n",
191 |     "    predictions = tf.argmax(logits, 1)\n",
192 |     "\n",
193 |     "class_weight = tf.constant(weights)\n",
194 |     "weighted_logits = logits * class_weight\n",
195 |     "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=weighted_logits, labels=y))\n",
196 |     "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
197 |     "\n",
198 |     "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
199 |     "\n",
200 |     "tv_all = tf.trainable_variables()\n",
201 |     "tv_regu =[]\n",
202 |     "for t in tv_all:\n",
203 |     "    if t.name.find('b:')==-1:\n",
204 |     "        tv_regu.append(t)\n",
205 |     "        \n",
206 |     "# l2 Loss\n",
207 |     "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
208 |     "\n",
209 |     "total_loss = loss + l2_loss\n",
210 |     "\n",
211 |     "# Optimizer for loss\n",
212 |     "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
213 |     "\n",
214 |     "# Gradients and Variables for Loss\n",
215 |     "grads_vars = optimizer.compute_gradients(total_loss)\n",
216 |     "\n",
217 |     "# Clipping of Gradients\n",
218 |     "clipped_grads = grads_vars\n",
219 |     "if(clipping == True):\n",
220 |     "    clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
221 |     "\n",
222 |     "# Training Optimizer for Total Loss\n",
223 |     "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
224 |     "\n",
225 |     "# Summaries\n",
226 |     "var_summaries = []\n",
227 |     "for v in tv_all:\n",
228 |     "    var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
229 |     "    var_summaries.append(var_summary)\n",
230 |     "\n",
231 |     "var_summaries_merged = tf.summary.merge(var_summaries)\n",
232 |     "\n",
233 |     "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
234 |     "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
235 |     "summary = tf.summary.merge_all()"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 7,
241 |    "metadata": {
242 |     "collapsed": true
243 |    },
244 |    "outputs": [],
245 |    "source": [
246 |     "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   # see issue #152\n",
247 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
248 |     "config = tf.ConfigProto()\n",
249 |     "config.gpu_options.allow_growth = True\n",
250 |     "sess = tf.Session(config=config)\n",
251 |     "sess.run(tf.global_variables_initializer())                          # For initializing all the variables\n",
252 |     "saver = tf.train.Saver()                                             # For Saving the model\n",
253 |     "summary_writer = tf.summary.FileWriter(log_dir, sess.graph)          # For writing Summaries"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 8,
259 |    "metadata": {
260 |     "scrolled": true
261 |    },
262 |    "outputs": [
263 |     {
264 |      "name": "stderr",
265 |      "output_type": "stream",
266 |      "text": [
267 |       "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
268 |       "  FutureWarning)\n"
269 |      ]
270 |     }
271 |    ],
272 |    "source": [
273 |     "# Splitting\n",
274 |     "data_x = full_data[sense_word][0]\n",
275 |     "data_y = full_data[sense_word][2]\n",
276 |     "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n",
277 |     "\n",
278 |     "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 9,
284 |    "metadata": {
285 |     "collapsed": true
286 |    },
287 |    "outputs": [],
288 |    "source": [
289 |     "def data_prepare(x):\n",
290 |     "    num_examples = len(x)\n",
291 |     "\n",
292 |     "    xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
293 |     "    xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
294 |     "\n",
295 |     "    for j in range(num_examples):\n",
296 |     "        for i in range(max_sent_size):\n",
297 |     "            if(i>=len(x[j])):\n",
298 |     "                break\n",
299 |     "            w = x[j][i]\n",
300 |     "            xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
301 |     "            xx_mask[j][i] = True\n",
302 |     "            \n",
303 |     "    return xx, xx_mask\n",
304 |     "\n",
305 |     "def eval_score(yy, pred):\n",
306 |     "    num_batches = int(len(yy)/batch_size)\n",
307 |     "    f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
308 |     "    accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
309 |     "    return f1*100, accu*100\n",
310 |     "\n",
311 |     "def model(xx, yy, mask, train_cond=True):\n",
312 |     "    num_batches = int(len(xx)/batch_size)\n",
313 |     "    losses = 0\n",
314 |     "    preds = []\n",
315 |     "    for j in range(num_batches): \n",
316 |     "        \n",
317 |     "        s = j * batch_size\n",
318 |     "        e = (j+1) * batch_size\n",
319 |     "        \n",
320 |     "        feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
321 |     "        \n",
322 |     "        \n",
323 |     "        if(train_cond==True):\n",
324 |     "            _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
325 |     "            summary_writer.add_summary(_summary, step)  \n",
326 |     "#             print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
327 |     "\n",
328 |     "        else:\n",
329 |     "            _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
330 |     "            preds.append(pred)\n",
331 |     "            \n",
332 |     "        losses +=_loss\n",
333 |     "\n",
334 |     "    if(train_cond==False):\n",
335 |     "        y_pred = []\n",
336 |     "        for i in range(num_batches):\n",
337 |     "            for pred in preds[i]:\n",
338 |     "                y_pred.append(pred)\n",
339 |     "        return losses/num_batches, y_pred\n",
340 |     "    \n",
341 |     "    return losses/num_batches, step"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 10,
347 |    "metadata": {
348 |     "collapsed": true
349 |    },
350 |    "outputs": [],
351 |    "source": [
352 |     "x_id_train, mask_train = data_prepare(x_train)\n",
353 |     "x_id_val, mask_val = data_prepare(x_val)\n",
354 |     "x_id_test, mask_test = data_prepare(x_test)\n",
355 |     "y_train = np.array(y_train)"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": 11,
361 |    "metadata": {
362 |     "scrolled": true
363 |    },
364 |    "outputs": [
365 |     {
366 |      "name": "stdout",
367 |      "output_type": "stream",
368 |      "text": [
369 |       "Epoch: 1 Step: 48 loss: 9.53506787121\n",
370 |       "Epoch: 2 Step: 96 loss: 1.81961081177\n",
371 |       "Epoch: 3 Step: 144 loss: 1.19337606803\n",
372 |       "Epoch: 4 Step: 192 loss: 0.967174999416\n",
373 |       "Epoch: 5 Step: 240 loss: 0.859784771999\n",
374 |       "Saved Model Complete\n",
375 |       "Train: F1 Score:  46.2728990557 Accuracy:  73.2096354167 Loss:  0.935422244171\n",
376 |       "Val: F1 Score:  44.6230136155 Accuracy:  70.9375 Loss:  0.885838544369\n",
377 |       "Epoch: 6 Step: 288 loss: 0.815433536967\n",
378 |       "Epoch: 7 Step: 336 loss: 0.756411065037\n",
379 |       "Epoch: 8 Step: 384 loss: 0.722958392153\n",
380 |       "Epoch: 9 Step: 432 loss: 0.67455783921\n",
381 |       "Epoch: 10 Step: 480 loss: 0.677137187993\n",
382 |       "Saved Model Complete\n",
383 |       "Train: F1 Score:  45.3513910841 Accuracy:  81.15234375 Loss:  0.622080009431\n",
384 |       "Val: F1 Score:  43.2422709632 Accuracy:  81.875 Loss:  0.607948565483\n",
385 |       "Epoch: 11 Step: 528 loss: 0.65565276891\n",
386 |       "Epoch: 12 Step: 576 loss: 0.645226646215\n",
387 |       "Epoch: 13 Step: 624 loss: 0.631849833454\n",
388 |       "Epoch: 14 Step: 672 loss: 0.653128698468\n",
389 |       "Epoch: 15 Step: 720 loss: 0.610900692021\n",
390 |       "Saved Model Complete\n",
391 |       "Train: F1 Score:  60.5870838384 Accuracy:  83.3658854167 Loss:  0.585401636859\n",
392 |       "Val: F1 Score:  61.1966168463 Accuracy:  85.3125 Loss:  0.595154416561\n",
393 |       "Epoch: 16 Step: 768 loss: 0.640408499787\n",
394 |       "Epoch: 17 Step: 816 loss: 0.573454591756\n",
395 |       "Epoch: 18 Step: 864 loss: 0.573158189033\n",
396 |       "Epoch: 19 Step: 912 loss: 0.580998883272\n",
397 |       "Epoch: 20 Step: 960 loss: 0.599028664331\n",
398 |       "Saved Model Complete\n",
399 |       "Train: F1 Score:  66.2391100441 Accuracy:  85.6119791667 Loss:  0.579200811684\n",
400 |       "Val: F1 Score:  63.3909012244 Accuracy:  84.375 Loss:  0.571177864075\n",
401 |       "Epoch: 21 Step: 1008 loss: 0.613934485242\n",
402 |       "Epoch: 22 Step: 1056 loss: 0.607284868757\n",
403 |       "Epoch: 23 Step: 1104 loss: 0.597342180709\n",
404 |       "Epoch: 24 Step: 1152 loss: 0.570371546472\n",
405 |       "Epoch: 25 Step: 1200 loss: 0.580265671636\n",
406 |       "Saved Model Complete\n",
407 |       "Train: F1 Score:  67.9210837096 Accuracy:  86.7513020833 Loss:  0.537070132792\n",
408 |       "Val: F1 Score:  73.5174165398 Accuracy:  89.0625 Loss:  0.566295391321\n",
409 |       "Epoch: 26 Step: 1248 loss: 0.568779307107\n",
410 |       "Epoch: 27 Step: 1296 loss: 0.55141502743\n",
411 |       "Epoch: 28 Step: 1344 loss: 0.559002238015\n",
412 |       "Epoch: 29 Step: 1392 loss: 0.569756407291\n",
413 |       "Epoch: 30 Step: 1440 loss: 0.573152939479\n",
414 |       "Saved Model Complete\n",
415 |       "Train: F1 Score:  69.0664553653 Accuracy:  87.3046875 Loss:  0.59051666595\n",
416 |       "Val: F1 Score:  68.3056653491 Accuracy:  88.125 Loss:  0.647302913666\n",
417 |       "Epoch: 31 Step: 1488 loss: 0.601928584278\n",
418 |       "Epoch: 32 Step: 1536 loss: 0.581918654342\n",
419 |       "Epoch: 33 Step: 1584 loss: 0.539948465923\n",
420 |       "Epoch: 34 Step: 1632 loss: 0.562553635488\n",
421 |       "Epoch: 35 Step: 1680 loss: 0.547960610439\n",
422 |       "Saved Model Complete\n",
423 |       "Train: F1 Score:  71.4368257896 Accuracy:  88.4765625 Loss:  0.517511847119\n",
424 |       "Val: F1 Score:  63.9771663859 Accuracy:  86.875 Loss:  0.614117074013\n",
425 |       "Epoch: 36 Step: 1728 loss: 0.566355666146\n",
426 |       "Epoch: 37 Step: 1776 loss: 0.555698808903\n",
427 |       "Epoch: 38 Step: 1824 loss: 0.56517353033\n",
428 |       "Epoch: 39 Step: 1872 loss: 0.581259304037\n",
429 |       "Epoch: 40 Step: 1920 loss: 0.585148503383\n",
430 |       "Saved Model Complete\n",
431 |       "Train: F1 Score:  72.4950138601 Accuracy:  88.7044270833 Loss:  0.578148378059\n",
432 |       "Val: F1 Score:  68.0165923988 Accuracy:  87.5 Loss:  0.708620613813\n",
433 |       "Epoch: 41 Step: 1968 loss: 0.567735542854\n",
434 |       "Epoch: 42 Step: 2016 loss: 0.539583496749\n",
435 |       "Epoch: 43 Step: 2064 loss: 0.544194473575\n",
436 |       "Epoch: 44 Step: 2112 loss: 0.556465638181\n",
437 |       "Epoch: 45 Step: 2160 loss: 0.559930261845\n",
438 |       "Saved Model Complete\n",
439 |       "Train: F1 Score:  76.9940617261 Accuracy:  89.16015625 Loss:  0.536304668213\n",
440 |       "Val: F1 Score:  74.9496075234 Accuracy:  88.4375 Loss:  0.573511379957\n",
441 |       "Epoch: 46 Step: 2208 loss: 0.556281161805\n",
442 |       "Epoch: 47 Step: 2256 loss: 0.549503739923\n",
443 |       "Epoch: 48 Step: 2304 loss: 0.561590575303\n",
444 |       "Epoch: 49 Step: 2352 loss: 0.538634177297\n",
445 |       "Epoch: 50 Step: 2400 loss: 0.548110162839\n",
446 |       "Saved Model Complete\n",
447 |       "Train: F1 Score:  69.2087726432 Accuracy:  88.2486979167 Loss:  0.513670069476\n",
448 |       "Val: F1 Score:  75.8463136033 Accuracy:  89.6875 Loss:  0.542824417353\n",
449 |       "Test: F1 Score:  61.845299018 Accuracy:  85.0961538462 Loss:  0.683341053816\n"
450 |      ]
451 |     }
452 |    ],
453 |    "source": [
454 |     "num_epochs = 50\n",
455 |     "\n",
456 |     "for i in range(num_epochs):\n",
457 |     "    \n",
458 |     "    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
459 |     "    x_id_train = x_id_train[random]\n",
460 |     "    y_train = y_train[random]\n",
461 |     "    mask_train = mask_train[random]\n",
462 |     "            \n",
463 |     "    losses, step = model(x_id_train, y_train, mask_train)\n",
464 |     "    print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
465 |     "    \n",
466 |     "    if((i+1)%5==0):\n",
467 |     "        saver.save(sess, save_path=save_dir)                         \n",
468 |     "        print(\"Saved Model Complete\")\n",
469 |     "        train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
470 |     "        f1_, accu_ = eval_score(y_train, train_pred)\n",
471 |     "        print(\"Train: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n",
472 |     "        val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
473 |     "        f1_, accu_ = eval_score(y_val, val_pred)\n",
474 |     "        print(\"Val: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
475 |     "        \n",
476 |     "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n",
477 |     "f1_, accu_ = eval_score(y_test, test_pred)\n",
478 |     "print(\"Test: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": null,
484 |    "metadata": {
485 |     "collapsed": true
486 |    },
487 |    "outputs": [],
488 |    "source": []
489 |   },
490 |   {
491 |    "cell_type": "code",
492 |    "execution_count": null,
493 |    "metadata": {
494 |     "collapsed": true
495 |    },
496 |    "outputs": [],
497 |    "source": [
498 |     "saver.restore(sess, save_dir)"
499 |    ]
500 |   }
501 |  ],
502 |  "metadata": {
503 |   "kernelspec": {
504 |    "display_name": "cs771",
505 |    "language": "python",
506 |    "name": "cs771"
507 |   },
508 |   "language_info": {
509 |    "codemirror_mode": {
510 |     "name": "ipython",
511 |     "version": 3
512 |    },
513 |    "file_extension": ".py",
514 |    "mimetype": "text/x-python",
515 |    "name": "python",
516 |    "nbconvert_exporter": "python",
517 |    "pygments_lexer": "ipython3",
518 |    "version": "3.5.2"
519 |   }
520 |  },
521 |  "nbformat": 4,
522 |  "nbformat_minor": 2
523 | }
524 | 


--------------------------------------------------------------------------------
/Four Word Model/Model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import tensorflow as tf\n",
 12 |     "tf.logging.set_verbosity(tf.logging.WARN)\n",
 13 |     "import pickle\n",
 14 |     "import numpy as np\n",
 15 |     "import os\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from sklearn.metrics import f1_score\n",
 18 |     "from sklearn.metrics import accuracy_score\n",
 19 |     "import os\n",
 20 |     "from tensorflow.python.client import device_lib"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "f = open('../Glove/word_embedding_glove', 'rb')\n",
 32 |     "word_embedding = pickle.load(f)\n",
 33 |     "f.close()\n",
 34 |     "word_embedding = word_embedding[: len(word_embedding)-1]\n",
 35 |     "\n",
 36 |     "f = open('../Glove/vocab_glove', 'rb')\n",
 37 |     "vocab = pickle.load(f)\n",
 38 |     "f.close()\n",
 39 |     "\n",
 40 |     "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
 41 |     "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
 42 |     "\n",
 43 |     "unknown_token = \"UNKNOWN_TOKEN\"\n",
 44 |     "\n",
 45 |     "f = open(\"train.pickle\", 'rb')\n",
 46 |     "full_data = pickle.load(f)\n",
 47 |     "f.close()"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 3,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "# Model Description\n",
 59 |     "sense_word = 'hard'\n",
 60 |     "model_name = 'basic'\n",
 61 |     "model_dir = 'output/' + sense_word + '/' + model_name\n",
 62 |     "save_dir = os.path.join(model_dir, \"save/\")\n",
 63 |     "log_dir = os.path.join(model_dir, \"log\")\n",
 64 |     "\n",
 65 |     "if not os.path.exists(model_dir):\n",
 66 |     "    os.mkdir(model_dir)\n",
 67 |     "if not os.path.exists(save_dir):\n",
 68 |     "    os.mkdir(save_dir)\n",
 69 |     "if not os.path.exists(log_dir):\n",
 70 |     "    os.mkdir(log_dir)"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": 4,
 76 |    "metadata": {
 77 |     "collapsed": true
 78 |    },
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "# Parameters\n",
 82 |     "mode = 'train'\n",
 83 |     "num_senses = 3\n",
 84 |     "batch_size = 64\n",
 85 |     "vocab_size = len(vocab)\n",
 86 |     "unk_vocab_size = 1\n",
 87 |     "word_emb_size = len(word_embedding[0])\n",
 88 |     "max_sent_size = 200\n",
 89 |     "hidden_size = 100\n",
 90 |     "keep_prob = 0.5\n",
 91 |     "l2_lambda = 0.001\n",
 92 |     "init_lr = 0.001\n",
 93 |     "decay_steps = 5000\n",
 94 |     "decay_rate = 0.96\n",
 95 |     "clip_norm = 1\n",
 96 |     "clipping = True"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 5,
102 |    "metadata": {
103 |     "collapsed": true
104 |    },
105 |    "outputs": [],
106 |    "source": [
107 |     "# MODEL\n",
108 |     "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
109 |     "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
110 |     "x_mask  = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
111 |     "is_train = tf.placeholder('bool', [], name='is_train')\n",
112 |     "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
113 |     "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
114 |     "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
115 |     "\n",
116 |     "with tf.name_scope(\"word_embedding\"):\n",
117 |     "    if mode == 'train':\n",
118 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
119 |     "    else:\n",
120 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
121 |     "        \n",
122 |     "    final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
123 |     "    Wx = tf.nn.embedding_lookup(final_word_emb_mat, x)  \n",
124 |     "\n",
125 |     "with tf.variable_scope(\"lstm\"):\n",
126 |     "    cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
127 |     "    cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
128 |     "\n",
129 |     "    d_cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=input_keep_prob)\n",
130 |     "    d_cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=input_keep_prob)\n",
131 |     "    \n",
132 |     "    (fw_h, bw_h), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, Wx, sequence_length=x_len, dtype='float', scope='lstm')\n",
133 |     "    h = tf.concat([fw_h, bw_h], 2)\n",
134 |     "\n",
135 |     "def attention(input_x, input_mask, W_att):\n",
136 |     "    h_masked = tf.boolean_mask(input_x, input_mask)\n",
137 |     "    h_tanh = tf.tanh(h_masked)\n",
138 |     "    u = tf.matmul(h_tanh, W_att)\n",
139 |     "    a = tf.nn.softmax(u)\n",
140 |     "    c = tf.reduce_sum(tf.multiply(h_tanh, a), 0)  \n",
141 |     "    return c\n",
142 |     "\n",
143 |     "with tf.variable_scope(\"attention\"):\n",
144 |     "    W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n",
145 |     "    c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
146 |     "    for i in range(1, batch_size):\n",
147 |     "        c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
148 |     "        \n",
149 |     "with tf.variable_scope(\"softmax_layer\"):\n",
150 |     "    W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n",
151 |     "    b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
152 |     "    drop_c = tf.nn.dropout(c, input_keep_prob)\n",
153 |     "    logits = tf.matmul(drop_c, W) + b\n",
154 |     "    predictions = tf.argmax(logits, 1)\n",
155 |     "\n",
156 |     "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n",
157 |     "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
158 |     "\n",
159 |     "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
160 |     "\n",
161 |     "tv_all = tf.trainable_variables()\n",
162 |     "tv_regu =[]\n",
163 |     "for t in tv_all:\n",
164 |     "    if t.name.find('b:')==-1:\n",
165 |     "        tv_regu.append(t)\n",
166 |     "        \n",
167 |     "# l2 Loss\n",
168 |     "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
169 |     "\n",
170 |     "total_loss = loss + l2_loss\n",
171 |     "\n",
172 |     "# Optimizer for loss\n",
173 |     "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
174 |     "\n",
175 |     "# Gradients and Variables for Loss\n",
176 |     "grads_vars = optimizer.compute_gradients(total_loss)\n",
177 |     "\n",
178 |     "# Clipping of Gradients\n",
179 |     "clipped_grads = grads_vars\n",
180 |     "if(clipping == True):\n",
181 |     "    clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
182 |     "\n",
183 |     "# Training Optimizer for Total Loss\n",
184 |     "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
185 |     "\n",
186 |     "# Summaries\n",
187 |     "var_summaries = []\n",
188 |     "for v in tv_all:\n",
189 |     "    var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
190 |     "    var_summaries.append(var_summary)\n",
191 |     "\n",
192 |     "var_summaries_merged = tf.summary.merge(var_summaries)\n",
193 |     "\n",
194 |     "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
195 |     "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
196 |     "summary = tf.summary.merge_all()"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 6,
202 |    "metadata": {
203 |     "collapsed": true
204 |    },
205 |    "outputs": [],
206 |    "source": [
207 |     "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   # see issue #152\n",
208 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
209 |     "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
210 |     "sess.run(tf.global_variables_initializer())                          # For initializing all the variables\n",
211 |     "saver = tf.train.Saver()                                             # For Saving the model\n",
212 |     "summary_writer = tf.summary.FileWriter(log_dir, sess.graph)          # For writing Summaries"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {
219 |     "collapsed": true
220 |    },
221 |    "outputs": [],
222 |    "source": [
223 |     "# # k-fold Splitting\n",
224 |     "# data_x = np.array(full_data[sense_word][0])\n",
225 |     "# data_y = np.array(full_data[sense_word][2])\n",
226 |     "# kf = KFold(n_splits=5,shuffle=True,random_state=0)\n",
227 |     "# for train_index, test_index in kf.split(X):\n",
228 |     "#     print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
229 |     "#     #x_train, x_test = data_x[train_index], data_x[test_index]\n",
230 |     "#     #y_train, y_test = data_y[train_index], data_y[test_index]"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 6,
236 |    "metadata": {
237 |     "scrolled": true
238 |    },
239 |    "outputs": [
240 |     {
241 |      "name": "stderr",
242 |      "output_type": "stream",
243 |      "text": [
244 |       "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
245 |       "  FutureWarning)\n"
246 |      ]
247 |     }
248 |    ],
249 |    "source": [
250 |     "# Splitting\n",
251 |     "data_x = full_data[sense_word][0]\n",
252 |     "data_y = full_data[sense_word][2]\n",
253 |     "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n",
254 |     "\n",
255 |     "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 7,
261 |    "metadata": {
262 |     "collapsed": true
263 |    },
264 |    "outputs": [],
265 |    "source": [
266 |     "def data_prepare(x):\n",
267 |     "    num_examples = len(x)\n",
268 |     "\n",
269 |     "    xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
270 |     "    xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
271 |     "\n",
272 |     "    for j in range(num_examples):\n",
273 |     "        for i in range(max_sent_size):\n",
274 |     "            if(i>=len(x[j])):\n",
275 |     "                break\n",
276 |     "            w = x[j][i]\n",
277 |     "            xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
278 |     "            xx_mask[j][i] = True\n",
279 |     "            \n",
280 |     "    return xx, xx_mask\n",
281 |     "\n",
282 |     "def eval_score(yy, pred):\n",
283 |     "    num_batches = int(len(yy)/batch_size)\n",
284 |     "    f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
285 |     "    accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
286 |     "    return f1*100, accu*100\n",
287 |     "\n",
288 |     "def model(xx, yy, mask, train_cond=True):\n",
289 |     "    num_batches = int(len(xx)/batch_size)\n",
290 |     "    losses = 0\n",
291 |     "    preds = []\n",
292 |     "    for j in range(num_batches): \n",
293 |     "        \n",
294 |     "        s = j * batch_size\n",
295 |     "        e = (j+1) * batch_size\n",
296 |     "        \n",
297 |     "        feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
298 |     "        \n",
299 |     "        \n",
300 |     "        if(train_cond==True):\n",
301 |     "            _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
302 |     "            summary_writer.add_summary(_summary, step)  \n",
303 |     "#             print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
304 |     "\n",
305 |     "        else:\n",
306 |     "            _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
307 |     "            preds.append(pred)\n",
308 |     "            \n",
309 |     "        losses +=_loss\n",
310 |     "\n",
311 |     "    if(train_cond==False):\n",
312 |     "        y_pred = []\n",
313 |     "        for i in range(num_batches):\n",
314 |     "            for pred in preds[i]:\n",
315 |     "                y_pred.append(pred)\n",
316 |     "        return losses/num_batches, y_pred\n",
317 |     "    \n",
318 |     "    return losses/num_batches, step"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 8,
324 |    "metadata": {
325 |     "collapsed": true
326 |    },
327 |    "outputs": [],
328 |    "source": [
329 |     "x_id_train, mask_train = data_prepare(x_train)\n",
330 |     "x_id_val, mask_val = data_prepare(x_val)\n",
331 |     "x_id_test, mask_test = data_prepare(x_test)\n",
332 |     "y_train = np.array(y_train)"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": null,
338 |    "metadata": {
339 |     "collapsed": true,
340 |     "scrolled": true
341 |    },
342 |    "outputs": [],
343 |    "source": [
344 |     "num_epochs = 10\n",
345 |     "\n",
346 |     "for i in range(num_epochs):\n",
347 |     "    \n",
348 |     "    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
349 |     "    x_id_train = x_id_train[random]\n",
350 |     "    y_train = y_train[random]\n",
351 |     "    mask_train = mask_train[random]\n",
352 |     "            \n",
353 |     "    losses, step = model(x_id_train, y_train, mask_train)\n",
354 |     "    print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
355 |     "    saver.save(sess, save_path=save_dir)                         \n",
356 |     "    print(\"Saved Model Complete\")\n",
357 |     "    \n",
358 |     "    if((i+1)%2==0):\n",
359 |     "        train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
360 |     "        f1_, accu_ = eval_score(y_train, train_pred)\n",
361 |     "        print(\"Train: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n",
362 |     "        val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
363 |     "        f1_, accu_ = eval_score(y_val, val_pred)\n",
364 |     "        print(\"Val: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
365 |     "        \n",
366 |     "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n",
367 |     "f1_, accu_ = eval_score(y_test, test_pred)\n",
368 |     "print(\"Test: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": null,
374 |    "metadata": {
375 |     "collapsed": true
376 |    },
377 |    "outputs": [],
378 |    "source": []
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": 10,
383 |    "metadata": {
384 |     "collapsed": true
385 |    },
386 |    "outputs": [],
387 |    "source": [
388 |     "saver.restore(sess, save_dir)"
389 |    ]
390 |   }
391 |  ],
392 |  "metadata": {
393 |   "kernelspec": {
394 |    "display_name": "cs771",
395 |    "language": "python",
396 |    "name": "cs771"
397 |   },
398 |   "language_info": {
399 |    "codemirror_mode": {
400 |     "name": "ipython",
401 |     "version": 3
402 |    },
403 |    "file_extension": ".py",
404 |    "mimetype": "text/x-python",
405 |    "name": "python",
406 |    "nbconvert_exporter": "python",
407 |    "pygments_lexer": "ipython3",
408 |    "version": "3.5.2"
409 |   }
410 |  },
411 |  "nbformat": 4,
412 |  "nbformat_minor": 2
413 | }
414 | 


--------------------------------------------------------------------------------
/Four Word Model/final_preprocessing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import re\n",
 10 |     "from sklearn.model_selection import train_test_split\n",
 11 |     "import pickle"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 2,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "def decontracted(phrase):\n",
 21 |     "    # specific\n",
 22 |     "    phrase = re.sub(r\"won't\", \"will not\", phrase)\n",
 23 |     "\n",
 24 |     "    # general\n",
 25 |     "    phrase = re.sub(r\"n\\'t\", \" not\", phrase)\n",
 26 |     "    phrase = re.sub(r\"\\'re\", \" are\", phrase)\n",
 27 |     "    phrase = re.sub(r\"\\'s\", \" is\", phrase)\n",
 28 |     "    phrase = re.sub(r\"\\'d\", \" would\", phrase)\n",
 29 |     "    phrase = re.sub(r\"\\'ll\", \" will\", phrase)\n",
 30 |     "    phrase = re.sub(r\"\\'t\", \" not\", phrase)\n",
 31 |     "    phrase = re.sub(r\"\\'ve\", \" have\", phrase)\n",
 32 |     "    phrase = re.sub(r\"\\'m\", \" am\", phrase)\n",
 33 |     "    phrase = re.sub(r\"\\'d've\", \" would have\", phrase)\n",
 34 |     "    phrase = re.sub(r\"\\'d'y\", \" do you\", phrase)\n",
 35 |     "    return phrase\n"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 3,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "train = {}\n",
 45 |     "\n",
 46 |     "with open('./Preprocess_Files/hard/sent') as f:\n",
 47 |     "    sents = f.readlines()\n",
 48 |     "content = [x.strip() for x in sents]\n",
 49 |     "    \n",
 50 |     "with open('./Preprocess_Files/hard/sense') as f:\n",
 51 |     "    senses = f.readlines()\n",
 52 |     "\n",
 53 |     "sents = []\n",
 54 |     "for sent in content:\n",
 55 |     "    text = decontracted(sent.replace(\" ' \",\"'\"))\n",
 56 |     "    result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n",
 57 |     "    result = result.replace('  ',' ').split()\n",
 58 |     "    result = [string.lower() for string in result]\n",
 59 |     "    sents.append(result)\n",
 60 |     "\n",
 61 |     "type_class = []\n",
 62 |     "type_name = []\n",
 63 |     "for sense in senses:\n",
 64 |     "    sense = sense.strip('\\n')\n",
 65 |     "    type_name.append(sense)\n",
 66 |     "    \n",
 67 |     "    sense = sense.replace('HARD1','0').replace('HARD2','1').replace('HARD3','2')\n",
 68 |     "    type_class.append(int(sense))\n",
 69 |     "\n",
 70 |     "train['hard'] = []\n",
 71 |     "train['hard'].extend([sents, type_name, type_class])"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 4,
 77 |    "metadata": {},
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "with open('./Preprocess_Files/interest/sent') as f:\n",
 81 |     "    sents = f.readlines()\n",
 82 |     "content = [x.strip() for x in sents]\n",
 83 |     "    \n",
 84 |     "with open('./Preprocess_Files/interest/sense') as f:\n",
 85 |     "    senses = f.readlines()\n",
 86 |     "\n",
 87 |     "sents = []\n",
 88 |     "for sent in content:\n",
 89 |     "    text = decontracted(sent.replace(\" ' \",\"'\"))\n",
 90 |     "    result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n",
 91 |     "    result = result.replace('  ',' ').split()\n",
 92 |     "    result = [string.lower() for string in result]\n",
 93 |     "    sents.append(result)\n",
 94 |     "\n",
 95 |     "type_class = []\n",
 96 |     "type_name = []\n",
 97 |     "for sense in senses:\n",
 98 |     "    sense = sense.strip('\\n')\n",
 99 |     "    type_name.append(sense)\n",
100 |     "    \n",
101 |     "    sense = sense.replace('interest1','0').replace('interest2','1').replace('interest3','2').replace('interest4','3').replace('interest5','4').replace('interest6','5')\n",
102 |     "    type_class.append(int(sense))\n",
103 |     "\n",
104 |     "train['interest'] = []\n",
105 |     "train['interest'].extend([sents, type_name, type_class])"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 5,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "with open('./Preprocess_Files/line/sent') as f:\n",
115 |     "    sents = f.readlines()\n",
116 |     "content = [x.strip() for x in sents]\n",
117 |     "    \n",
118 |     "with open('./Preprocess_Files/line/sense') as f:\n",
119 |     "    senses = f.readlines()\n",
120 |     "\n",
121 |     "sents = []\n",
122 |     "for sent in content:\n",
123 |     "    text = decontracted(sent.replace(\" ' \",\"'\"))\n",
124 |     "    result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n",
125 |     "    result = result.replace('  ',' ').split()\n",
126 |     "    result = [string.lower() for string in result]\n",
127 |     "    sents.append(result)\n",
128 |     "\n",
129 |     "type_class = []\n",
130 |     "type_name = []\n",
131 |     "for sense in senses:\n",
132 |     "    sense = sense.strip('\\n')\n",
133 |     "    type_name.append(sense)\n",
134 |     "    \n",
135 |     "    sense = sense.replace('text','0').replace('phone','1').replace('product','2').replace('formation','3').replace('division','4').replace('cord','5')\n",
136 |     "    type_class.append(int(sense))\n",
137 |     "\n",
138 |     "train['line'] = []\n",
139 |     "train['line'].extend([sents, type_name, type_class])"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 6,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "with open('./Preprocess_Files/serve/sent') as f:\n",
149 |     "    sents = f.readlines()\n",
150 |     "content = [x.strip() for x in sents]\n",
151 |     "    \n",
152 |     "with open('./Preprocess_Files/serve/sense') as f:\n",
153 |     "    senses = f.readlines()\n",
154 |     "\n",
155 |     "sents = []\n",
156 |     "for sent in content:\n",
157 |     "    text = decontracted(sent.replace(\" ' \",\"'\"))\n",
158 |     "    result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n",
159 |     "    result = result.replace('  ',' ').split()\n",
160 |     "    result = [string.lower() for string in result]\n",
161 |     "    sents.append(result)\n",
162 |     "\n",
163 |     "type_class = []\n",
164 |     "type_name = []\n",
165 |     "for sense in senses:\n",
166 |     "    sense = sense.strip('\\n')\n",
167 |     "    type_name.append(sense)\n",
168 |     "    \n",
169 |     "    sense = sense.replace('SERVE2','0').replace('SERVE6','1').replace('SERVE10','2').replace('SERVE12','3')\n",
170 |     "    type_class.append(int(sense))\n",
171 |     "\n",
172 |     "train['serve'] = []\n",
173 |     "train['serve'].extend([sents, type_name, type_class])"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 8,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "def train_test(target):\n",
183 |     "    x = train['target'][0]\n",
184 |     "    y = train['target'][2]\n",
185 |     "    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, stratify=y)\n",
186 |     "    return x_train, x_test, y_train, y_test"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 9,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "with open('full_train.pickle', 'wb') as f:\n",
196 |     "    pickle.dump(train, f)"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 10,
202 |    "metadata": {},
203 |    "outputs": [
204 |     {
205 |      "name": "stdout",
206 |      "output_type": "stream",
207 |      "text": [
208 |       "['he', 'may', 'lose', 'all', 'popular', 'support', 'but', 'someone', 'has', 'to', 'kill', 'him', 'to', 'defeat', 'him', 'and', 'that', 'is', 'hard', 'to', 'do']\n",
209 |       "HARD1\n"
210 |      ]
211 |     }
212 |    ],
213 |    "source": [
214 |     "print(train['hard'][0][0])\n",
215 |     "print(train['hard'][1][0]) #class of hard"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 11,
221 |    "metadata": {},
222 |    "outputs": [
223 |     {
224 |      "name": "stdout",
225 |      "output_type": "stream",
226 |      "text": [
227 |       "108\n",
228 |       "127\n",
229 |       "165\n",
230 |       "161\n"
231 |      ]
232 |     }
233 |    ],
234 |    "source": [
235 |     "def max_length(target):\n",
236 |     "    max_len = 0\n",
237 |     "    for sentence in train[target][0]:\n",
238 |     "        temp_len = len(sentence)\n",
239 |     "        max_len = max(max_len, temp_len)\n",
240 |     "    print(max_len)\n",
241 |     "\n",
242 |     "max_length('hard') \n",
243 |     "max_length('interest')\n",
244 |     "max_length('line')\n",
245 |     "max_length('serve')"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": []
254 |   }
255 |  ],
256 |  "metadata": {
257 |   "kernelspec": {
258 |    "display_name": "Python 3",
259 |    "language": "python",
260 |    "name": "python3"
261 |   },
262 |   "language_info": {
263 |    "codemirror_mode": {
264 |     "name": "ipython",
265 |     "version": 3
266 |    },
267 |    "file_extension": ".py",
268 |    "mimetype": "text/x-python",
269 |    "name": "python",
270 |    "nbconvert_exporter": "python",
271 |    "pygments_lexer": "ipython3",
272 |    "version": "3.6.2"
273 |   }
274 |  },
275 |  "nbformat": 4,
276 |  "nbformat_minor": 2
277 | }
278 | 


--------------------------------------------------------------------------------
/Four Word Model/full_train.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/full_train.pickle


--------------------------------------------------------------------------------
/Four Word Model/robsr_model.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import tensorflow as tf\n",
 12 |     "tf.logging.set_verbosity(tf.logging.WARN)\n",
 13 |     "import pickle\n",
 14 |     "import numpy as np\n",
 15 |     "import os\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from sklearn.metrics import f1_score\n",
 18 |     "from sklearn.metrics import accuracy_score\n",
 19 |     "import os\n",
 20 |     "from tensorflow.python.client import device_lib"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "f = open('../Glove/word_embedding_glove', 'rb')\n",
 32 |     "word_embedding = pickle.load(f)\n",
 33 |     "f.close()\n",
 34 |     "word_embedding = word_embedding[: len(word_embedding)-1]\n",
 35 |     "\n",
 36 |     "f = open('../Glove/vocab_glove', 'rb')\n",
 37 |     "vocab = pickle.load(f)\n",
 38 |     "f.close()\n",
 39 |     "\n",
 40 |     "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
 41 |     "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
 42 |     "\n",
 43 |     "unknown_token = \"UNKNOWN_TOKEN\"\n",
 44 |     "\n",
 45 |     "f = open(\"train.pickle\", 'rb')\n",
 46 |     "full_data = pickle.load(f)\n",
 47 |     "f.close()"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 3,
 53 |    "metadata": {
 54 |     "collapsed": true
 55 |    },
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "# Model Description\n",
 59 |     "sense_word = 'hard'\n",
 60 |     "model_name = 'basic'\n",
 61 |     "model_dir = 'output/' + sense_word + '/' + model_name\n",
 62 |     "save_dir = os.path.join(model_dir, \"save/\")\n",
 63 |     "log_dir = os.path.join(model_dir, \"log\")\n",
 64 |     "\n",
 65 |     "if not os.path.exists(save_dir):\n",
 66 |     "    os.mkdir(save_dir)\n",
 67 |     "if not os.path.exists(log_dir):\n",
 68 |     "    os.mkdir(log_dir)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 4,
 74 |    "metadata": {
 75 |     "collapsed": true
 76 |    },
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "# Parameters\n",
 80 |     "mode = 'train'\n",
 81 |     "num_senses = 3\n",
 82 |     "batch_size = 64\n",
 83 |     "vocab_size = len(vocab)\n",
 84 |     "unk_vocab_size = 1\n",
 85 |     "word_emb_size = len(word_embedding[0])\n",
 86 |     "max_sent_size = 200\n",
 87 |     "hidden_size = 100\n",
 88 |     "keep_prob = 0.5\n",
 89 |     "l2_lambda = 0.001\n",
 90 |     "init_lr = 0.001\n",
 91 |     "decay_steps = 5000\n",
 92 |     "decay_rate = 0.96\n",
 93 |     "clip_norm = 1\n",
 94 |     "clipping = True"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 5,
100 |    "metadata": {
101 |     "collapsed": true
102 |    },
103 |    "outputs": [],
104 |    "source": [
105 |     "# MODEL\n",
106 |     "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
107 |     "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
108 |     "x_mask  = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
109 |     "is_train = tf.placeholder('bool', [], name='is_train')\n",
110 |     "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
111 |     "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
112 |     "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
113 |     "\n",
114 |     "with tf.name_scope(\"word_embedding\"):\n",
115 |     "    if mode == 'train':\n",
116 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
117 |     "    else:\n",
118 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
119 |     "        \n",
120 |     "    final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
121 |     "    Wx = tf.nn.embedding_lookup(final_word_emb_mat, x)  \n",
122 |     "\n",
123 |     "with tf.variable_scope(\"lstm\"):\n",
124 |     "    cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
125 |     "    cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
126 |     "\n",
127 |     "    d_cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=input_keep_prob)\n",
128 |     "    d_cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=input_keep_prob)\n",
129 |     "    \n",
130 |     "    (fw_h, bw_h), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, Wx, sequence_length=x_len, dtype='float', scope='lstm')\n",
131 |     "    h = tf.concat([fw_h, bw_h], 2)\n",
132 |     "\n",
133 |     "def attention(input_x, input_mask, W_att):\n",
134 |     "    h_masked = tf.boolean_mask(input_x, input_mask)\n",
135 |     "    h_tanh = tf.tanh(h_masked)\n",
136 |     "    u = tf.matmul(h_tanh, W_att)\n",
137 |     "    a = tf.nn.softmax(u)\n",
138 |     "    c = tf.reduce_sum(tf.multiply(h_tanh, a), 0)  \n",
139 |     "    return c\n",
140 |     "\n",
141 |     "with tf.variable_scope(\"attention\"):\n",
142 |     "    W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=1.0, seed=0), name=\"W_att\")\n",
143 |     "    c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
144 |     "    for i in range(1, batch_size):\n",
145 |     "        c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
146 |     "        \n",
147 |     "with tf.variable_scope(\"softmax_layer\"):\n",
148 |     "    W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=1.0, seed=0), name=\"W\")\n",
149 |     "    b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
150 |     "    drop_c = tf.nn.dropout(c, input_keep_prob)\n",
151 |     "    logits = tf.matmul(drop_c, W) + b\n",
152 |     "    predictions = tf.argmax(logits, 1)\n",
153 |     "\n",
154 |     "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n",
155 |     "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
156 |     "\n",
157 |     "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
158 |     "\n",
159 |     "tv_all = tf.trainable_variables()\n",
160 |     "tv_regu =[]\n",
161 |     "for t in tv_all:\n",
162 |     "    if t.name.find('b:')==-1:\n",
163 |     "        tv_regu.append(t)\n",
164 |     "        \n",
165 |     "# l2 Loss\n",
166 |     "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
167 |     "\n",
168 |     "total_loss = loss + l2_loss\n",
169 |     "\n",
170 |     "# Optimizer for loss\n",
171 |     "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
172 |     "\n",
173 |     "# Gradients and Variables for Loss\n",
174 |     "grads_vars = optimizer.compute_gradients(total_loss)\n",
175 |     "\n",
176 |     "# Clipping of Gradients\n",
177 |     "clipped_grads = grads_vars\n",
178 |     "if(clipping == True):\n",
179 |     "    clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
180 |     "\n",
181 |     "# Training Optimizer for Total Loss\n",
182 |     "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
183 |     "\n",
184 |     "# Summaries\n",
185 |     "var_summaries = []\n",
186 |     "for v in tv_all:\n",
187 |     "    var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
188 |     "    var_summaries.append(var_summary)\n",
189 |     "\n",
190 |     "var_summaries_merged = tf.summary.merge(var_summaries)\n",
191 |     "\n",
192 |     "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
193 |     "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
194 |     "summary = tf.summary.merge_all()"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 6,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   # see issue #152\n",
204 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
205 |     "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
206 |     "sess.run(tf.global_variables_initializer())                          # For initializing all the variables\n",
207 |     "saver = tf.train.Saver()                                             # For Saving the model\n",
208 |     "summary_writer = tf.summary.FileWriter(log_dir, sess.graph)          # For writing Summaries"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": 6,
214 |    "metadata": {
215 |     "scrolled": true
216 |    },
217 |    "outputs": [
218 |     {
219 |      "name": "stderr",
220 |      "output_type": "stream",
221 |      "text": [
222 |       "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
223 |       "  FutureWarning)\n"
224 |      ]
225 |     }
226 |    ],
227 |    "source": [
228 |     "# Splitting\n",
229 |     "data_x = full_data[sense_word][0]\n",
230 |     "data_y = full_data[sense_word][2]\n",
231 |     "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n",
232 |     "\n",
233 |     "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 7,
239 |    "metadata": {
240 |     "collapsed": true
241 |    },
242 |    "outputs": [],
243 |    "source": [
244 |     "def data_prepare(x):\n",
245 |     "    num_examples = len(x)\n",
246 |     "\n",
247 |     "    xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
248 |     "    xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
249 |     "\n",
250 |     "    for j in range(num_examples):\n",
251 |     "        for i in range(max_sent_size):\n",
252 |     "            if(i>=len(x[j])):\n",
253 |     "                break\n",
254 |     "            w = x[j][i]\n",
255 |     "            xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
256 |     "            xx_mask[j][i] = True\n",
257 |     "            \n",
258 |     "    return xx, xx_mask\n",
259 |     "\n",
260 |     "def eval_score(yy, pred):\n",
261 |     "    num_batches = int(len(yy)/batch_size)\n",
262 |     "    f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
263 |     "    accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
264 |     "    return f1*100, accu*100\n",
265 |     "\n",
266 |     "def model(xx, yy, mask, train_cond=True):\n",
267 |     "    num_batches = int(len(xx)/batch_size)\n",
268 |     "    losses = 0\n",
269 |     "    preds = []\n",
270 |     "    for j in range(num_batches): \n",
271 |     "        \n",
272 |     "        s = j * batch_size\n",
273 |     "        e = (j+1) * batch_size\n",
274 |     "        \n",
275 |     "        feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
276 |     "        \n",
277 |     "        \n",
278 |     "        if(train_cond==True):\n",
279 |     "            _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
280 |     "            summary_writer.add_summary(_summary, step)  \n",
281 |     "#             print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
282 |     "\n",
283 |     "        else:\n",
284 |     "            _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
285 |     "            preds.append(pred)\n",
286 |     "            \n",
287 |     "        losses +=_loss\n",
288 |     "\n",
289 |     "    if(train_cond==False):\n",
290 |     "        y_pred = []\n",
291 |     "        for i in range(num_batches):\n",
292 |     "            for pred in preds[i]:\n",
293 |     "                y_pred.append(pred)\n",
294 |     "        return losses/num_batches, y_pred\n",
295 |     "    \n",
296 |     "    return losses/num_batches, step"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": 8,
302 |    "metadata": {
303 |     "collapsed": true
304 |    },
305 |    "outputs": [],
306 |    "source": [
307 |     "x_id_train, mask_train = data_prepare(x_train)\n",
308 |     "x_id_val, mask_val = data_prepare(x_val)\n",
309 |     "x_id_test, mask_test = data_prepare(x_test)"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "metadata": {
316 |     "scrolled": true
317 |    },
318 |    "outputs": [],
319 |    "source": [
320 |     "num_epochs = 10\n",
321 |     "\n",
322 |     "for i in range(num_epochs):\n",
323 |     "    \n",
324 |     "    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
325 |     "    x_id_train = x_id_train[random]\n",
326 |     "    y_train = y_train[random]\n",
327 |     "    mask_train = mask_train[random]\n",
328 |     "            \n",
329 |     "    losses, step = model(x_id_train, y_train, mask_train)\n",
330 |     "    print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
331 |     "    saver.save(sess, save_path=save_dir)                         \n",
332 |     "    print(\"Saved Model Complete\")\n",
333 |     "    \n",
334 |     "    if((i+1)%2==0):\n",
335 |     "        train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
336 |     "        f1_, accu_ = eval_score(y_train, train_pred)\n",
337 |     "        print(\"Train: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
338 |     "        val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
339 |     "        f1_, accu_ = eval_score(y_val, val_pred)\n",
340 |     "        print(\"Val: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
341 |     "        \n",
342 |     "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n",
343 |     "f1_, accu_ = eval_score(y_test, test_pred)\n",
344 |     "print(\"Test: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "code",
349 |    "execution_count": null,
350 |    "metadata": {
351 |     "collapsed": true
352 |    },
353 |    "outputs": [],
354 |    "source": []
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 10,
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "saver.restore(sess, save_dir)"
363 |    ]
364 |   }
365 |  ],
366 |  "metadata": {
367 |   "kernelspec": {
368 |    "display_name": "cs771",
369 |    "language": "python",
370 |    "name": "cs771"
371 |   },
372 |   "language_info": {
373 |    "codemirror_mode": {
374 |     "name": "ipython",
375 |     "version": 3
376 |    },
377 |    "file_extension": ".py",
378 |    "mimetype": "text/x-python",
379 |    "name": "python",
380 |    "nbconvert_exporter": "python",
381 |    "pygments_lexer": "ipython3",
382 |    "version": "3.5.2"
383 |   }
384 |  },
385 |  "nbformat": 4,
386 |  "nbformat_minor": 2
387 | }
388 | 


--------------------------------------------------------------------------------
/Four Word Model/train.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/train.pickle


--------------------------------------------------------------------------------
/Four Word Model/words_not_in_vocab.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/words_not_in_vocab.pickle


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Shanu Kumar
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Word Sense Disambiguation 
 2 | 
 3 | Word sense disambiguation (WSD) is the ability to identify the meaning of words in context. We address this problem using series of end-to-end neural architectures using bidirectional Long Short Term Memory (LSTM). We propose two variants for WSD: an end-to-end word specific neural model and all-words neural model. In the word specific models we have to train models for every disambiguation target word. We addressed this issue using the all-words model which rely on sequence learning. We also used POS tags to improve the performance. We tried different variants of attention mechanisms for the all-words model. Performance was boosted by using convolutional neural networks (CNN) which captures local features around the words that is normally what humans do for predicting the senses. We further improved the performance using hierarchical models. We used POS tags as hierarchy and used two variants as soft masking and hard masking.
 4 | 
 5 | ### Methods
 6 | 
 7 | * [Word Specific Model trained on Four Word Dataset](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/Four%20Word%20Model)
 8 | * [Word Specific Model trained on One Million Dataset](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/one_million)
 9 | * [All-words Model](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/one_million/all-word)
10 | * [Hierarchical Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb)
11 | 
12 | ### Best Models
13 | * [All-words Model+CNN](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-1.4.ipynb)
14 | * [All-words Hierarchical Model+Soft Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb)
15 | * [All-words Hierarchical Model+Hard Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-4.ipynb)
16 | 
17 | 
18 | ### Details
19 | For detailed information about models and results:
20 | * [Report](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/UGP_Report.pdf)
21 | * [Presentation](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/UGP_presentation.pdf)
22 | 
23 | ### All words Models
24 | 
25 | #### [All-words Hierarchical Model+Soft Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb)
26 | <p align="center">
27 | <img src="models_diagram/all-word-7.png" height="500" width="700">
28 | </p>
29 | 
30 | #### [All-words Hierarchical Model+Hard Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-4.ipynb)
31 | <p align="center">
32 | <img src="models_diagram/all-word-8.png" height="500" width="700">
33 | </p>
34 | 
35 | #### [Basic Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-1-multigpu-1.ipynb)
36 | <p align="center">
37 | <img src="models_diagram/all-word-1.png" height="400" width="450">
38 | </p>
39 | 
40 | #### [Basic Model+Local Attention](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-4.ipynb)
41 | <p align="center">
42 | <img src="models_diagram/all-word-3.png" height="450" width="600">
43 | </p>
44 | 
45 | #### [Basic Model+Local Attentionn+Hidden States](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-6.ipynb)
46 | <p align="center">
47 | <img src="models_diagram/all-word-4.png" height="450" width="600">
48 | </p>
49 | 
50 | #### [Basic Model+Local Attentionn+Hidden States+CRF](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v3-1.ipynb)
51 | <p align="center">
52 | <img src="models_diagram/all-word-6.png" height="450" width="600">
53 | </p>
54 | 
55 | #### [Basic Model+Gated Attention](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-7.ipynb)
56 | <p align="center">
57 | <img src="models_diagram/all-word-5.png" height="450" width="600">
58 | </p>
59 | 
60 | #### [Basic Model+CNN](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-1.4.ipynb)
61 | <p align="center">
62 | <img src="models_diagram/all-word-2.png" height="450" width="600">
63 | </p>
64 | 
65 | ### Word Specific Models
66 | 
67 | #### [Basic Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-1-multigpu-1.ipynb)
68 | Files with name as Model-1-multigpu-1.ipynb are the basic models
69 | <p align="center">
70 | <img src="models_diagram/model-1.png" height="350" width="400">
71 | </p>
72 | 
73 | #### [Basic Model+POS Tags](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-2-multigpu-1.ipynb)
74 | Files with name as Model-1-multigpu-2.ipynb are the basic models
75 | <p align="center">
76 | <img src="models_diagram/model-2.png" height="400" width="600">
77 | </p>
78 | 
79 | #### [Basic Model+POS Tags+CRF](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-3-multigpu-1.ipynb)
80 | Files with name as Model-1-multigpu-3.ipynb are the basic models
81 | <p align="center">
82 | <img src="models_diagram/model-3.png" height="400" width="600">
83 | </p>
84 | 
85 | #### [Word specific hierarchical model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-4-multigpu-1.ipynb)
86 | Files with name as Model-1-multigpu-4.ipynb are the basic models
87 | <p align="center">
88 | <img src="models_diagram/model-4.png" height="450" width="600">
89 | </p>
90 | 
91 | 


--------------------------------------------------------------------------------
/UGP_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/UGP_Report.pdf


--------------------------------------------------------------------------------
/UGP_presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/UGP_presentation.pdf


--------------------------------------------------------------------------------
/models_diagram/all-word-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-1.png


--------------------------------------------------------------------------------
/models_diagram/all-word-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-2.png


--------------------------------------------------------------------------------
/models_diagram/all-word-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-3.png


--------------------------------------------------------------------------------
/models_diagram/all-word-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-4.png


--------------------------------------------------------------------------------
/models_diagram/all-word-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-5.png


--------------------------------------------------------------------------------
/models_diagram/all-word-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-6.png


--------------------------------------------------------------------------------
/models_diagram/all-word-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-7.png


--------------------------------------------------------------------------------
/models_diagram/all-word-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-8.png


--------------------------------------------------------------------------------
/models_diagram/model-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-1.png


--------------------------------------------------------------------------------
/models_diagram/model-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-2.png


--------------------------------------------------------------------------------
/models_diagram/model-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-3.png


--------------------------------------------------------------------------------
/models_diagram/model-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-4.png


--------------------------------------------------------------------------------
/one_million/One-Million All-Word Data Sampling Coarse.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "scrolled": true
  8 |    },
  9 |    "outputs": [
 10 |     {
 11 |      "name": "stdout",
 12 |      "output_type": "stream",
 13 |      "text": [
 14 |       "46\n"
 15 |      ]
 16 |     }
 17 |    ],
 18 |    "source": [
 19 |     "import pickle\n",
 20 |     "import numpy as np\n",
 21 |     "import os\n",
 22 |     "from sklearn.model_selection import train_test_split\n",
 23 |     "from collections import Counter\n",
 24 |     "from imblearn.over_sampling import RandomOverSampler\n",
 25 |     "\n",
 26 |     "f = open(\"../../dataset/sense/dict_sense-keys\", 'rb')\n",
 27 |     "dict_sense_keys = pickle.load(f)\n",
 28 |     "f.close()\n",
 29 |     "\n",
 30 |     "f = open(\"../../dataset/sense/dict_word-sense\", 'rb')\n",
 31 |     "dict_word_sense = pickle.load(f)\n",
 32 |     "f.close()\n",
 33 |     "\n",
 34 |     "f = open('../Glove/word_embedding_glove', 'rb')\n",
 35 |     "word_embedding = pickle.load(f)\n",
 36 |     "f.close()\n",
 37 |     "word_embedding = word_embedding[: len(word_embedding)-1]\n",
 38 |     "\n",
 39 |     "f = open('../Glove/vocab_glove', 'rb')\n",
 40 |     "vocab = pickle.load(f)\n",
 41 |     "f.close()\n",
 42 |     "\n",
 43 |     "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
 44 |     "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
 45 |     "\n",
 46 |     "unknown_token = \"UNKNOWN_TOKEN\"\n",
 47 |     "\n",
 48 |     "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n",
 49 |     "    data=pickle.load(f)\n",
 50 |     "\n",
 51 |     "with open('/data/aviraj/dataset/fulldata_vocab_sense','rb') as f:\n",
 52 |     "    vocab_lex=pickle.load(f)\n",
 53 |     "\n",
 54 |     "lex2id = dict((s, i) for i,s in enumerate(vocab_lex))\n",
 55 |     "id2lex = dict((i, s) for i,s in enumerate(vocab_lex))\n",
 56 |     "\n",
 57 |     "print(len(vocab_lex))\n",
 58 |     "max_sent_size = 200"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 2,
 64 |    "metadata": {},
 65 |    "outputs": [
 66 |     {
 67 |      "name": "stdout",
 68 |      "output_type": "stream",
 69 |      "text": [
 70 |       "12\n"
 71 |      ]
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "_pos = []\n",
 76 |     "for i in range(len(data)):\n",
 77 |     "    for pp in data[i][4]:\n",
 78 |     "        _pos.append(pp)\n",
 79 |     "        \n",
 80 |     "pos_count = Counter(_pos)\n",
 81 |     "pos_count = pos_count.most_common()\n",
 82 |     "vocab_pos = [pp for pp, c in pos_count]\n",
 83 |     "pos2id = dict((s, i) for i,s in enumerate(vocab_pos))\n",
 84 |     "print(len(vocab_pos))"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 3,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "data_y1 = []\n",
 96 |     "data_y2 = []\n",
 97 |     "data_y3 = []\n",
 98 |     "for i in range(len(data)):\n",
 99 |     "    if (len(data[i][1])<=200):\n",
100 |     "        for j in range(len(data[i][2])):\n",
101 |     "            if data[i][2][j] is not None:\n",
102 |     "                data_y1.append(dict_sense_keys[data[i][2][j]][3])\n",
103 |     "                data_y2.append(dict_sense_keys[data[i][2][j]][4])\n",
104 |     "                data_y3.append(dict_sense_keys[data[i][2][j]][5])\n",
105 |     "\n",
106 |     "sense_count1 = Counter(data_y1)\n",
107 |     "sense_count1 = sense_count1.most_common()\n",
108 |     "sense_count2 = Counter(data_y2)\n",
109 |     "sense_count4 = sense_count2.most_common(272)\n",
110 |     "sense_count2 = sense_count2.most_common(312)\n",
111 |     "sense_count3 = Counter(data_y3)\n",
112 |     "sense_count5 = sense_count3.most_common(505)\n",
113 |     "sense_count3 = sense_count3.most_common(1051)\n",
114 |     "\n",
115 |     "dict_sense_count1 = dict(sense_count1)\n",
116 |     "dict_sense_count2 = dict(sense_count2)\n",
117 |     "dict_sense_count3 = dict(sense_count3)\n",
118 |     "dict_sense_count4 = dict(sense_count4)\n",
119 |     "dict_sense_count5 = dict(sense_count5)"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 4,
125 |    "metadata": {
126 |     "scrolled": true
127 |    },
128 |    "outputs": [
129 |     {
130 |      "name": "stdout",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "46 312 1051 272 505\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "print(len(sense_count1), len(sense_count2), len(sense_count3), len(sense_count4), len(sense_count5))"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 5,
144 |    "metadata": {
145 |     "collapsed": true
146 |    },
147 |    "outputs": [],
148 |    "source": [
149 |     "data_x = []\n",
150 |     "data_pos = []\n",
151 |     "data_label1 = []\n",
152 |     "data_label2 = []\n",
153 |     "data_label3 = []\n",
154 |     "data_label4 = []\n",
155 |     "data_label5 = []\n",
156 |     "\n",
157 |     "for i in range(len(data)):\n",
158 |     "    if not all(np.array(data[i][2])==None) and (len(data[i][1])<=200):\n",
159 |     "        data_label1.append([ss if ss is not None and dict_sense_keys[ss][3] in dict_sense_count1 else None for ss in data[i][2]])\n",
160 |     "        data_label2.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count2 else None for ss in data[i][2]])\n",
161 |     "        data_label3.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count3 else None for ss in data[i][2]])\n",
162 |     "        data_label4.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count4 else None for ss in data[i][2]])\n",
163 |     "        data_label5.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count5 else None for ss in data[i][2]])\n",
164 |     "        data_x.append(data[i][1])\n",
165 |     "        data_pos.append(data[i][4])"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 6,
171 |    "metadata": {
172 |     "collapsed": true
173 |    },
174 |    "outputs": [],
175 |    "source": [
176 |     "def data_prepare(sense_id, x, pos, y, sense_count, lex_cond=False, pos_cond=False):\n",
177 |     "    num_examples = len(x)\n",
178 |     "    \n",
179 |     "    vocab_sense = [s for s, c in sense_count]\n",
180 |     "    sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n",
181 |     "    \n",
182 |     "    xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
183 |     "    xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
184 |     "    ss_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
185 |     "    yy = np.zeros([num_examples,max_sent_size], dtype=int)\n",
186 |     "    y_lex = np.zeros([num_examples, max_sent_size], dtype=int)\n",
187 |     "    y_pos = np.zeros([num_examples, max_sent_size], dtype=int)\n",
188 |     "        \n",
189 |     "    for j in range(num_examples):\n",
190 |     "        for i in range(max_sent_size):\n",
191 |     "            if(i>=len(x[j])):\n",
192 |     "                break\n",
193 |     "            w = x[j][i]\n",
194 |     "            s = y[j][i]\n",
195 |     "            p = pos[j][i]\n",
196 |     "            xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
197 |     "            xx_mask[j][i] = True\n",
198 |     "            ss_mask[j][i] = True if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else False\n",
199 |     "            yy[j][i] = sense2id[dict_sense_keys[s][sense_id]] if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else 0\n",
200 |     "            if(lex_cond):\n",
201 |     "                y_lex[j][i] = lex2id[dict_sense_keys[s][3]] if s is not None and dict_sense_keys[s][3] in vocab_lex else len(vocab_lex)\n",
202 |     "            if(pos_cond):\n",
203 |     "                y_pos[j][i] = pos2id[p] if p in vocab_pos else len(vocab_pos)\n",
204 |     "        \n",
205 |     "    return xx, xx_mask, ss_mask, yy, y_lex, y_pos"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 7,
211 |    "metadata": {
212 |     "collapsed": true
213 |    },
214 |    "outputs": [],
215 |    "source": [
216 |     "data_x = np.array(data_x)\n",
217 |     "data_pos = np.array(data_pos)\n",
218 |     "\n",
219 |     "def train_val_data(name, sense_id, index, split_label, data_label, sense_count, sampling_list, lex_cond=False, pos_cond=False, sampling=False):\n",
220 |     "    \n",
221 |     "    index_train, index_val, label_train_id, label_val_id = train_test_split(index, split_label, train_size=0.8, shuffle=True, stratify=split_label, random_state=0)\n",
222 |     "  \n",
223 |     "    if(sampling):\n",
224 |     "        dict_sample = dict(sampling_list)\n",
225 |     "        sm = RandomOverSampler(ratio=dict_sample)\n",
226 |     "        index_train1 = np.array(index_train).reshape(-1, 1)\n",
227 |     "        sampled_index, _ = sm.fit_sample(index_train1, label_train_id)\n",
228 |     "        count = Counter(_)\n",
229 |     "        count = count.most_common()\n",
230 |     "        sampled_index_train = np.array(sampled_index).reshape(1, -1)\n",
231 |     "        index_train = sampled_index_train[0]\n",
232 |     "    \n",
233 |     "    data_label = np.array(data_label)\n",
234 |     "    x_train = data_x[index_train]\n",
235 |     "    y_train = data_label[index_train]\n",
236 |     "    x_val = data_x[index_val]\n",
237 |     "    y_val = data_label[index_val]\n",
238 |     "    pos_train = []\n",
239 |     "    pos_val = []\n",
240 |     "    \n",
241 |     "    if(pos_cond):\n",
242 |     "        pos_train = data_pos[index_train]\n",
243 |     "        pos_val = data_pos[index_val]\n",
244 |     "\n",
245 |     "    x_id_train, mask_train, sense_mask_train, y_id_train, lex_train, pos_id_train  = data_prepare(sense_id, x_train, pos_train, y_train, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n",
246 |     "    x_id_val, mask_val, sense_mask_val, y_id_val, lex_val, pos_id_val = data_prepare(sense_id, x_val, pos_val, y_val, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n",
247 |     "\n",
248 |     "    train_data = {'x':x_id_train,'x_mask':mask_train, 'sense_mask':sense_mask_train, 'y':y_id_train, 'lex':lex_train, 'pos':pos_id_train}\n",
249 |     "    val_data = {'x':x_id_val,'x_mask':mask_val, 'sense_mask':sense_mask_val, 'y':y_id_val, 'lex':lex_val, 'pos':pos_id_val}\n",
250 |     "    \n",
251 |     "    with open('/data/aviraj/dataset/train_val_data_coarse/all_word_'+ name,'wb') as f:\n",
252 |     "        pickle.dump([train_data,val_data], f)\n",
253 |     "    \n",
254 |     "    print(len(x_id_train)+len(x_id_val))"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 8,
260 |    "metadata": {
261 |     "scrolled": true
262 |    },
263 |    "outputs": [
264 |     {
265 |      "name": "stdout",
266 |      "output_type": "stream",
267 |      "text": [
268 |       "850093\n",
269 |       "850062\n",
270 |       "850052\n",
271 |       "849793\n",
272 |       "848996\n"
273 |      ]
274 |     }
275 |    ],
276 |    "source": [
277 |     "split_label1 = []\n",
278 |     "split_label2 = []\n",
279 |     "split_label3 = []\n",
280 |     "split_label4 = []\n",
281 |     "split_label5 = []\n",
282 |     "\n",
283 |     "index1 = []\n",
284 |     "index2 = []\n",
285 |     "index3 = []\n",
286 |     "index4 = []\n",
287 |     "index5 = []\n",
288 |     "\n",
289 |     "for jj, lab in enumerate(data_label1):\n",
290 |     "    min_idx = np.argmin([dict_sense_count1[dict_sense_keys[lab[i]][3]] if lab[i] is not None else np.inf for i in range(len(lab))  ]) \n",
291 |     "    if(lab[min_idx] is not None):\n",
292 |     "        index1.append(jj)\n",
293 |     "        split_label1.append(dict_sense_keys[lab[min_idx]][3])\n",
294 |     "\n",
295 |     "for jj, lab in enumerate(data_label2):\n",
296 |     "    min_idx = np.argmin([dict_sense_count2[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab))  ]) \n",
297 |     "    if(lab[min_idx] is not None):\n",
298 |     "        index2.append(jj)\n",
299 |     "        split_label2.append(dict_sense_keys[lab[min_idx]][4])\n",
300 |     "\n",
301 |     "for jj, lab in enumerate(data_label3):\n",
302 |     "    min_idx = np.argmin([dict_sense_count3[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab))  ]) \n",
303 |     "    if(lab[min_idx] is not None):\n",
304 |     "        index3.append(jj)\n",
305 |     "        split_label3.append(dict_sense_keys[lab[min_idx]][5])\n",
306 |     "            \n",
307 |     "for jj, lab in enumerate(data_label4):\n",
308 |     "    min_idx = np.argmin([dict_sense_count4[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab))  ]) \n",
309 |     "    if(lab[min_idx] is not None):\n",
310 |     "        index4.append(jj)\n",
311 |     "        split_label4.append(dict_sense_keys[lab[min_idx]][4])\n",
312 |     "\n",
313 |     "for jj, lab in enumerate(data_label5):\n",
314 |     "    min_idx = np.argmin([dict_sense_count5[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab))  ]) \n",
315 |     "    if(lab[min_idx] is not None):\n",
316 |     "        index5.append(jj)\n",
317 |     "        split_label5.append(dict_sense_keys[lab[min_idx]][5])\n",
318 |     "            \n",
319 |     "print(len(split_label1))\n",
320 |     "print(len(split_label2))\n",
321 |     "print(len(split_label3))\n",
322 |     "print(len(split_label4))\n",
323 |     "print(len(split_label5))"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 9,
329 |    "metadata": {
330 |     "scrolled": true
331 |    },
332 |    "outputs": [
333 |     {
334 |      "name": "stderr",
335 |      "output_type": "stream",
336 |      "text": [
337 |       "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
338 |       "  FutureWarning)\n"
339 |      ]
340 |     },
341 |     {
342 |      "name": "stdout",
343 |      "output_type": "stream",
344 |      "text": [
345 |       "850093\n",
346 |       "850062\n",
347 |       "850052\n",
348 |       "849793\n",
349 |       "848996\n",
350 |       "848996\n"
351 |      ]
352 |     }
353 |    ],
354 |    "source": [
355 |     "train_val_data('lex1', 3, index1, split_label1, data_label1, sense_count1, [], lex_cond=False, pos_cond=True)\n",
356 |     "train_val_data('lex2', 3, index2, split_label2, data_label2, sense_count1, [], lex_cond=False, pos_cond=True)\n",
357 |     "train_val_data('lex3', 3, index3, split_label3, data_label3, sense_count1, [], lex_cond=False, pos_cond=True)\n",
358 |     "train_val_data('sense1', 4, index4, split_label4, data_label4, sense_count4, [], lex_cond=True, pos_cond=True)\n",
359 |     "train_val_data('sense2', 4, index5, split_label5, data_label5, sense_count4, [], lex_cond=True, pos_cond=True)\n",
360 |     "train_val_data('full_sense', 5, index5, split_label5, data_label5, sense_count5, [], lex_cond=True, pos_cond=True)"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "code",
365 |    "execution_count": 10,
366 |    "metadata": {
367 |     "collapsed": true
368 |    },
369 |    "outputs": [],
370 |    "source": [
371 |     "sampled_sense_count1 = [('1:19', 10000),\n",
372 |     " ('1:17', 10000),\n",
373 |     " ('2:34', 10000),\n",
374 |     " ('2:33', 10000),\n",
375 |     " ('1:27', 10000),\n",
376 |     " ('2:37', 8000),\n",
377 |     " ('1:24', 8000),\n",
378 |     " ('1:08', 8000),\n",
379 |     " ('1:12', 7000),\n",
380 |     " ('1:22', 5000),\n",
381 |     " ('2:29', 5000),\n",
382 |     " ('1:05', 3000),\n",
383 |     " ('1:16', 3000),\n",
384 |     " ('1:25', 3000),\n",
385 |     " ('1:20', 3000),\n",
386 |     " ('1:13', 2000),\n",
387 |     " ('2:43', 1100),\n",
388 |     " ('3:44', 1000)]"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 11,
394 |    "metadata": {
395 |     "collapsed": true
396 |    },
397 |    "outputs": [],
398 |    "source": [
399 |     "sampled_sense_count2= []\n",
400 |     "for s, c in sense_count2[260:]:\n",
401 |     "    sampled_sense_count2.append((s, 500))\n",
402 |     "for s, c in sense_count2[180:260]:\n",
403 |     "    sampled_sense_count2.append((s, 2000))\n",
404 |     "for s, c in sense_count2[140:180]:\n",
405 |     "    sampled_sense_count2.append((s, 5000))\n",
406 |     "for s, c in sense_count2[75:140]:\n",
407 |     "    sampled_sense_count2.append((s, 8000))\n",
408 |     "for s, c in sense_count2[25:75]:\n",
409 |     "    sampled_sense_count2.append((s, 12000))"
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": 12,
415 |    "metadata": {
416 |     "collapsed": true
417 |    },
418 |    "outputs": [],
419 |    "source": [
420 |     "sampled_sense_count3= []\n",
421 |     "for s, c in sense_count3[400:]:\n",
422 |     "    sampled_sense_count3.append((s, 500))\n",
423 |     "for s, c in sense_count3[200:400]:\n",
424 |     "    sampled_sense_count3.append((s, 2000))\n",
425 |     "for s, c in sense_count3[100:200]:\n",
426 |     "    sampled_sense_count3.append((s, 5000))\n",
427 |     "for s, c in sense_count3[70:100]:\n",
428 |     "    sampled_sense_count3.append((s, 8000))\n",
429 |     "for s, c in sense_count3[25:70]:\n",
430 |     "    sampled_sense_count3.append((s, 12000))"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "code",
435 |    "execution_count": 13,
436 |    "metadata": {
437 |     "collapsed": true
438 |    },
439 |    "outputs": [],
440 |    "source": [
441 |     "sampled_sense_count4= []\n",
442 |     "for s, c in sense_count4[260:]:\n",
443 |     "    sampled_sense_count4.append((s, 500))\n",
444 |     "for s, c in sense_count4[180:260]:\n",
445 |     "    sampled_sense_count4.append((s, 2000))\n",
446 |     "for s, c in sense_count4[140:180]:\n",
447 |     "    sampled_sense_count4.append((s, 5000))\n",
448 |     "for s, c in sense_count4[75:140]:\n",
449 |     "    sampled_sense_count4.append((s, 8000))\n",
450 |     "for s, c in sense_count4[25:75]:\n",
451 |     "    sampled_sense_count4.append((s, 12000))"
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "code",
456 |    "execution_count": 14,
457 |    "metadata": {
458 |     "collapsed": true
459 |    },
460 |    "outputs": [],
461 |    "source": [
462 |     "sampled_sense_count5= []\n",
463 |     "for s, c in sense_count5[400:]:\n",
464 |     "    sampled_sense_count5.append((s, 500))\n",
465 |     "for s, c in sense_count5[200:400]:\n",
466 |     "    sampled_sense_count5.append((s, 2000))\n",
467 |     "for s, c in sense_count5[100:200]:\n",
468 |     "    sampled_sense_count5.append((s, 5000))\n",
469 |     "for s, c in sense_count5[70:100]:\n",
470 |     "    sampled_sense_count5.append((s, 8000))\n",
471 |     "for s, c in sense_count5[25:70]:\n",
472 |     "    sampled_sense_count5.append((s, 12000))"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": 15,
478 |    "metadata": {
479 |     "scrolled": false
480 |    },
481 |    "outputs": [
482 |     {
483 |      "name": "stderr",
484 |      "output_type": "stream",
485 |      "text": [
486 |       "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
487 |       "  FutureWarning)\n"
488 |      ]
489 |     },
490 |     {
491 |      "name": "stdout",
492 |      "output_type": "stream",
493 |      "text": [
494 |       "911174\n",
495 |       "2061567\n",
496 |       "2512876\n",
497 |       "2041581\n",
498 |       "2239996\n",
499 |       "2239996\n"
500 |      ]
501 |     }
502 |    ],
503 |    "source": [
504 |     "train_val_data('lex1_sampled', 3, index1, split_label1, data_label1, sense_count1, sampled_sense_count1, lex_cond=False, pos_cond=True, sampling=True)\n",
505 |     "train_val_data('lex2_sampled', 3, index2, split_label2, data_label2, sense_count1, sampled_sense_count2, lex_cond=False, pos_cond=True, sampling=True)\n",
506 |     "train_val_data('lex3_sampled', 3, index3, split_label3, data_label3, sense_count1, sampled_sense_count3, lex_cond=False, pos_cond=True, sampling=True)\n",
507 |     "train_val_data('sense1_sampled', 4, index4, split_label4, data_label4, sense_count4, sampled_sense_count4, lex_cond=True, pos_cond=True, sampling=True)\n",
508 |     "train_val_data('sense2_sampled', 4, index5, split_label5, data_label5, sense_count4, sampled_sense_count5, lex_cond=True, pos_cond=True, sampling=True)\n",
509 |     "train_val_data('full_sense_sampled', 5, index5, split_label5, data_label5, sense_count5, sampled_sense_count5, lex_cond=True, pos_cond=True, sampling=True)"
510 |    ]
511 |   }
512 |  ],
513 |  "metadata": {
514 |   "kernelspec": {
515 |    "display_name": "cs771",
516 |    "language": "python",
517 |    "name": "cs771"
518 |   },
519 |   "language_info": {
520 |    "codemirror_mode": {
521 |     "name": "ipython",
522 |     "version": 3
523 |    },
524 |    "file_extension": ".py",
525 |    "mimetype": "text/x-python",
526 |    "name": "python",
527 |    "nbconvert_exporter": "python",
528 |    "pygments_lexer": "ipython3",
529 |    "version": "3.5.2"
530 |   }
531 |  },
532 |  "nbformat": 4,
533 |  "nbformat_minor": 2
534 | }
535 | 


--------------------------------------------------------------------------------
/one_million/One-Million All-Word Data Sampling-Fine.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true,
  8 |     "deletable": true,
  9 |     "editable": true
 10 |    },
 11 |    "outputs": [],
 12 |    "source": [
 13 |     "import pickle\n",
 14 |     "import numpy as np\n",
 15 |     "import os\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from collections import Counter\n",
 18 |     "from imblearn.over_sampling import RandomOverSampler"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {
 25 |     "collapsed": false,
 26 |     "deletable": true,
 27 |     "editable": true,
 28 |     "scrolled": true
 29 |    },
 30 |    "outputs": [
 31 |     {
 32 |      "name": "stdout",
 33 |      "output_type": "stream",
 34 |      "text": [
 35 |       "46\n"
 36 |      ]
 37 |     }
 38 |    ],
 39 |    "source": [
 40 |     "f = open(\"../../dataset/sense/dict_sense-keys\", 'rb')\n",
 41 |     "dict_sense_keys = pickle.load(f)\n",
 42 |     "f.close()\n",
 43 |     "\n",
 44 |     "f = open(\"../../dataset/sense/dict_word-sense\", 'rb')\n",
 45 |     "dict_word_sense = pickle.load(f)\n",
 46 |     "f.close()\n",
 47 |     "\n",
 48 |     "f = open('../Glove/word_embedding_glove', 'rb')\n",
 49 |     "word_embedding = pickle.load(f)\n",
 50 |     "f.close()\n",
 51 |     "word_embedding = word_embedding[: len(word_embedding)-1]\n",
 52 |     "\n",
 53 |     "f = open('../Glove/vocab_glove', 'rb')\n",
 54 |     "vocab = pickle.load(f)\n",
 55 |     "f.close()\n",
 56 |     "\n",
 57 |     "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
 58 |     "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
 59 |     "\n",
 60 |     "unknown_token = \"UNKNOWN_TOKEN\"\n",
 61 |     "\n",
 62 |     "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n",
 63 |     "    data=pickle.load(f)\n",
 64 |     "\n",
 65 |     "with open('/data/aviraj/dataset/fulldata_vocab_sense','rb') as f:\n",
 66 |     "    vocab_lex=pickle.load(f)\n",
 67 |     "\n",
 68 |     "lex2id = dict((s, i) for i,s in enumerate(vocab_lex))\n",
 69 |     "id2lex = dict((i, s) for i,s in enumerate(vocab_lex))\n",
 70 |     "\n",
 71 |     "print(len(vocab_lex))\n",
 72 |     "max_sent_size = 200"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 3,
 78 |    "metadata": {
 79 |     "collapsed": false,
 80 |     "deletable": true,
 81 |     "editable": true
 82 |    },
 83 |    "outputs": [
 84 |     {
 85 |      "name": "stdout",
 86 |      "output_type": "stream",
 87 |      "text": [
 88 |       "12\n"
 89 |      ]
 90 |     }
 91 |    ],
 92 |    "source": [
 93 |     "_pos = []\n",
 94 |     "for i in range(len(data)):\n",
 95 |     "    for pp in data[i][4]:\n",
 96 |     "        _pos.append(pp)\n",
 97 |     "        \n",
 98 |     "pos_count = Counter(_pos)\n",
 99 |     "pos_count = pos_count.most_common()\n",
100 |     "vocab_pos = [pp for pp, c in pos_count]\n",
101 |     "pos2id = dict((s, i) for i,s in enumerate(vocab_pos))\n",
102 |     "print(len(vocab_pos))"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 4,
108 |    "metadata": {
109 |     "collapsed": false,
110 |     "deletable": true,
111 |     "editable": true
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "data_y1 = []\n",
116 |     "data_y2 = []\n",
117 |     "data_y3 = []\n",
118 |     "for i in range(len(data)):\n",
119 |     "    if (len(data[i][1])<=200):\n",
120 |     "        for j in range(len(data[i][2])):\n",
121 |     "            if data[i][2][j] is not None:\n",
122 |     "                data_y1.append(dict_sense_keys[data[i][2][j]][3])\n",
123 |     "                data_y2.append(dict_sense_keys[data[i][2][j]][4])\n",
124 |     "                data_y3.append(dict_sense_keys[data[i][2][j]][5])\n",
125 |     "\n",
126 |     "sense_count1 = Counter(data_y1)\n",
127 |     "sense_count1 = sense_count1.most_common()[:-2]\n",
128 |     "\n",
129 |     "sense_count2 = Counter(data_y2)\n",
130 |     "sense_count2 = sense_count2.most_common(180)\n",
131 |     "\n",
132 |     "sense_count3 = Counter(data_y3)\n",
133 |     "sense_count3 = sense_count3.most_common(300)"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 5,
139 |    "metadata": {
140 |     "collapsed": true,
141 |     "deletable": true,
142 |     "editable": true
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "dict_sense_count1 = dict(sense_count1)\n",
147 |     "dict_sense_count2 = dict(sense_count2)\n",
148 |     "dict_sense_count3 = dict(sense_count3)"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 6,
154 |    "metadata": {
155 |     "collapsed": false,
156 |     "deletable": true,
157 |     "editable": true,
158 |     "scrolled": true
159 |    },
160 |    "outputs": [
161 |     {
162 |      "name": "stdout",
163 |      "output_type": "stream",
164 |      "text": [
165 |       "44 180 300\n"
166 |      ]
167 |     }
168 |    ],
169 |    "source": [
170 |     "print(len(sense_count1), len(sense_count2), len(sense_count3))"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 7,
176 |    "metadata": {
177 |     "collapsed": true,
178 |     "deletable": true,
179 |     "editable": true
180 |    },
181 |    "outputs": [],
182 |    "source": [
183 |     "data_x = []\n",
184 |     "data_pos = []\n",
185 |     "data_label1 = []\n",
186 |     "data_label2 = []\n",
187 |     "data_label3 = []\n",
188 |     "\n",
189 |     "for i in range(len(data)):\n",
190 |     "    if not all(np.array(data[i][2])==None) and (len(data[i][1])<=200):\n",
191 |     "        data_label1.append([ss if ss is not None and dict_sense_keys[ss][3] in dict_sense_count1 else None for ss in data[i][2]])\n",
192 |     "        data_label2.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count2 else None for ss in data[i][2]])\n",
193 |     "        data_label3.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count3 else None for ss in data[i][2]])\n",
194 |     "        data_x.append(data[i][1])\n",
195 |     "        data_pos.append(data[i][4])"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": 8,
201 |    "metadata": {
202 |     "collapsed": true,
203 |     "deletable": true,
204 |     "editable": true
205 |    },
206 |    "outputs": [],
207 |    "source": [
208 |     "def data_prepare(sense_id, x, pos, y, sense_count, lex_cond=False, pos_cond=False):\n",
209 |     "    num_examples = len(x)\n",
210 |     "    \n",
211 |     "    vocab_sense = [s for s, c in sense_count]\n",
212 |     "    sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n",
213 |     "    \n",
214 |     "    xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
215 |     "    xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
216 |     "    ss_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
217 |     "    yy = np.zeros([num_examples,max_sent_size], dtype=int)\n",
218 |     "    y_lex = np.zeros([num_examples, max_sent_size], dtype=int)\n",
219 |     "    y_pos = np.zeros([num_examples, max_sent_size], dtype=int)\n",
220 |     "        \n",
221 |     "    for j in range(num_examples):\n",
222 |     "        for i in range(max_sent_size):\n",
223 |     "            if(i>=len(x[j])):\n",
224 |     "                break\n",
225 |     "            w = x[j][i]\n",
226 |     "            s = y[j][i]\n",
227 |     "            p = pos[j][i]\n",
228 |     "            xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
229 |     "            xx_mask[j][i] = True\n",
230 |     "            ss_mask[j][i] = True if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else False\n",
231 |     "            yy[j][i] = sense2id[dict_sense_keys[s][sense_id]] if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else 0\n",
232 |     "            if(lex_cond):\n",
233 |     "                y_lex[j][i] = lex2id[dict_sense_keys[s][3]] if s is not None and dict_sense_keys[s][3] in vocab_lex else len(vocab_lex)\n",
234 |     "            if(pos_cond):\n",
235 |     "                y_pos[j][i] = pos2id[p] if p in vocab_pos else len(vocab_pos)\n",
236 |     "        \n",
237 |     "    return xx, xx_mask, ss_mask, yy, y_lex, y_pos"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 9,
243 |    "metadata": {
244 |     "collapsed": true,
245 |     "deletable": true,
246 |     "editable": true
247 |    },
248 |    "outputs": [],
249 |    "source": [
250 |     "data_x = np.array(data_x)\n",
251 |     "data_pos = np.array(data_pos)\n",
252 |     "\n",
253 |     "def train_val_data(name, sense_id, index, split_label, data_label, sense_count, sampling_list, lex_cond=False, pos_cond=False, sampling=False):\n",
254 |     "    \n",
255 |     "    index_train, index_val, label_train_id, label_val_id = train_test_split(index, split_label, train_size=0.8, shuffle=True, stratify=split_label, random_state=0)\n",
256 |     "  \n",
257 |     "    if(sampling):\n",
258 |     "        dict_sample = dict(sampling_list)\n",
259 |     "        sm = RandomOverSampler(ratio=dict_sample)\n",
260 |     "        index_train1 = np.array(index_train).reshape(-1, 1)\n",
261 |     "        sampled_index, _ = sm.fit_sample(index_train1, label_train_id)\n",
262 |     "        count = Counter(_)\n",
263 |     "        count = count.most_common()\n",
264 |     "        sampled_index_train = np.array(sampled_index).reshape(1, -1)\n",
265 |     "        index_train = sampled_index_train[0]\n",
266 |     "    \n",
267 |     "    data_label = np.array(data_label)\n",
268 |     "    x_train = data_x[index_train]\n",
269 |     "    y_train = data_label[index_train]\n",
270 |     "    x_val = data_x[index_val]\n",
271 |     "    y_val = data_label[index_val]\n",
272 |     "    pos_train = []\n",
273 |     "    pos_val = []\n",
274 |     "    \n",
275 |     "    if(pos_cond):\n",
276 |     "        pos_train = data_pos[index_train]\n",
277 |     "        pos_val = data_pos[index_val]\n",
278 |     "\n",
279 |     "    x_id_train, mask_train, sense_mask_train, y_id_train, lex_train, pos_id_train  = data_prepare(sense_id, x_train, pos_train, y_train, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n",
280 |     "    x_id_val, mask_val, sense_mask_val, y_id_val, lex_val, pos_id_val = data_prepare(sense_id, x_val, pos_val, y_val, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n",
281 |     "\n",
282 |     "    train_data = {'x':x_id_train,'x_mask':mask_train, 'sense_mask':sense_mask_train, 'y':y_id_train, 'lex':lex_train, 'pos':pos_id_train}\n",
283 |     "    val_data = {'x':x_id_val,'x_mask':mask_val, 'sense_mask':sense_mask_val, 'y':y_id_val, 'lex':lex_val, 'pos':pos_id_val}\n",
284 |     "    \n",
285 |     "    with open('/data/aviraj/dataset/train_val_data_fine/all_word_'+ name,'wb') as f:\n",
286 |     "        pickle.dump([train_data,val_data], f)\n",
287 |     "    \n",
288 |     "    print(len(x_id_train)+len(x_id_val))"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 10,
294 |    "metadata": {
295 |     "collapsed": false,
296 |     "deletable": true,
297 |     "editable": true,
298 |     "scrolled": true
299 |    },
300 |    "outputs": [
301 |     {
302 |      "name": "stdout",
303 |      "output_type": "stream",
304 |      "text": [
305 |       "850083\n",
306 |       "838757\n",
307 |       "828921\n"
308 |      ]
309 |     }
310 |    ],
311 |    "source": [
312 |     "split_label1 = []\n",
313 |     "split_label2 = []\n",
314 |     "split_label3 = []\n",
315 |     "\n",
316 |     "index1 = []\n",
317 |     "index2 = []\n",
318 |     "index3 = []\n",
319 |     "\n",
320 |     "for jj, lab in enumerate(data_label1):\n",
321 |     "    min_idx = np.argmin([dict_sense_count1[dict_sense_keys[lab[i]][3]] if lab[i] is not None else np.inf for i in range(len(lab))  ]) \n",
322 |     "    if(lab[min_idx] is not None):\n",
323 |     "        index1.append(jj)\n",
324 |     "        split_label1.append(dict_sense_keys[lab[min_idx]][3])\n",
325 |     "\n",
326 |     "for jj, lab in enumerate(data_label2):\n",
327 |     "    min_idx = np.argmin([dict_sense_count2[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab))  ]) \n",
328 |     "    if(lab[min_idx] is not None):\n",
329 |     "        index2.append(jj)\n",
330 |     "        split_label2.append(dict_sense_keys[lab[min_idx]][4])\n",
331 |     "\n",
332 |     "for jj, lab in enumerate(data_label3):\n",
333 |     "    min_idx = np.argmin([dict_sense_count3[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab))  ]) \n",
334 |     "    if(lab[min_idx] is not None):\n",
335 |     "        index3.append(jj)\n",
336 |     "        split_label3.append(dict_sense_keys[lab[min_idx]][5])\n",
337 |     "    \n",
338 |     "print(len(split_label1))\n",
339 |     "print(len(split_label2))\n",
340 |     "print(len(split_label3))"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": 11,
346 |    "metadata": {
347 |     "collapsed": false,
348 |     "deletable": true,
349 |     "editable": true,
350 |     "scrolled": true
351 |    },
352 |    "outputs": [
353 |     {
354 |      "name": "stderr",
355 |      "output_type": "stream",
356 |      "text": [
357 |       "/users/btech/aviraj/envs/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
358 |       "  FutureWarning)\n"
359 |      ]
360 |     },
361 |     {
362 |      "name": "stdout",
363 |      "output_type": "stream",
364 |      "text": [
365 |       "850083\n",
366 |       "838757\n",
367 |       "828921\n"
368 |      ]
369 |     }
370 |    ],
371 |    "source": [
372 |     "train_val_data('lex', 3, index1, split_label1, data_label1, sense_count1, [], lex_cond=False, pos_cond=True)\n",
373 |     "train_val_data('sense', 4, index2, split_label2, data_label2, sense_count2, [], lex_cond=True, pos_cond=True)\n",
374 |     "train_val_data('full_sense', 5, index3, split_label3, data_label3, sense_count3, [], lex_cond=True, pos_cond=True)"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": 12,
380 |    "metadata": {
381 |     "collapsed": true,
382 |     "deletable": true,
383 |     "editable": true
384 |    },
385 |    "outputs": [],
386 |    "source": [
387 |     "sampled_sense_count1 = [('1:19', 10000),\n",
388 |     " ('1:17', 10000),\n",
389 |     " ('2:34', 10000),\n",
390 |     " ('2:33', 10000),\n",
391 |     " ('1:27', 10000),\n",
392 |     " ('2:37', 8000),\n",
393 |     " ('1:24', 8000),\n",
394 |     " ('1:08', 8000),\n",
395 |     " ('1:12', 7000),\n",
396 |     " ('1:22', 5000),\n",
397 |     " ('2:29', 5000),\n",
398 |     " ('1:05', 3000),\n",
399 |     " ('1:16', 3000),\n",
400 |     " ('1:25', 3000),\n",
401 |     " ('1:20', 3000),\n",
402 |     " ('1:13', 2000)]"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 13,
408 |    "metadata": {
409 |     "collapsed": true,
410 |     "deletable": true,
411 |     "editable": true
412 |    },
413 |    "outputs": [],
414 |    "source": [
415 |     "sampled_sense_count2= []\n",
416 |     "for s, c in sense_count2[120:]:\n",
417 |     "    sampled_sense_count2.append((s, 5000))\n",
418 |     "for s, c in sense_count2[75:120]:\n",
419 |     "    sampled_sense_count2.append((s, 8000))\n",
420 |     "for s, c in sense_count2[25:75]:\n",
421 |     "    sampled_sense_count2.append((s, 12000))"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 14,
427 |    "metadata": {
428 |     "collapsed": true,
429 |     "deletable": true,
430 |     "editable": true
431 |    },
432 |    "outputs": [],
433 |    "source": [
434 |     "sampled_sense_count3= []\n",
435 |     "for s, c in sense_count3[130:]:\n",
436 |     "    sampled_sense_count3.append((s, 5000))\n",
437 |     "for s, c in sense_count3[70:130]:\n",
438 |     "    sampled_sense_count3.append((s, 8000))\n",
439 |     "for s, c in sense_count3[25:70]:\n",
440 |     "    sampled_sense_count3.append((s, 12000))"
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "code",
445 |    "execution_count": 15,
446 |    "metadata": {
447 |     "collapsed": false,
448 |     "deletable": true,
449 |     "editable": true
450 |    },
451 |    "outputs": [
452 |     {
453 |      "name": "stderr",
454 |      "output_type": "stream",
455 |      "text": [
456 |       "/users/btech/aviraj/envs/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
457 |       "  FutureWarning)\n"
458 |      ]
459 |     },
460 |     {
461 |      "name": "stdout",
462 |      "output_type": "stream",
463 |      "text": [
464 |       "909119\n",
465 |       "1814988\n",
466 |       "2375783\n"
467 |      ]
468 |     }
469 |    ],
470 |    "source": [
471 |     "train_val_data('lex_sampled', 3, index1, split_label1, data_label1, sense_count1, sampled_sense_count1, lex_cond=False, pos_cond=True, sampling=True)\n",
472 |     "train_val_data('sense_sampled', 4, index2, split_label2, data_label2, sense_count2, sampled_sense_count2, lex_cond=True, pos_cond=True, sampling=True)\n",
473 |     "train_val_data('full_sense_sampled', 5, index3, split_label3, data_label3, sense_count3, sampled_sense_count3, lex_cond=True, pos_cond=True, sampling=True)"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": null,
479 |    "metadata": {
480 |     "collapsed": true
481 |    },
482 |    "outputs": [],
483 |    "source": []
484 |   }
485 |  ],
486 |  "metadata": {
487 |   "kernelspec": {
488 |    "display_name": "envs",
489 |    "language": "python",
490 |    "name": "cs771"
491 |   },
492 |   "language_info": {
493 |    "codemirror_mode": {
494 |     "name": "ipython",
495 |     "version": 3
496 |    },
497 |    "file_extension": ".py",
498 |    "mimetype": "text/x-python",
499 |    "name": "python",
500 |    "nbconvert_exporter": "python",
501 |    "pygments_lexer": "ipython3",
502 |    "version": "3.5.2"
503 |   }
504 |  },
505 |  "nbformat": 4,
506 |  "nbformat_minor": 2
507 | }
508 | 


--------------------------------------------------------------------------------
/one_million/Sense-test.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pickle\n",
 12 |     "from collections import Counter"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "with open('../../dataset/ALL.gold.key.txt') as f:\n",
 24 |     "    sense_key = f.readlines()\n",
 25 |     "sense_key = [x.strip() for x in sense_key] "
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 3,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "with open(\"../../dataset/sense/ALL-keys\",\"wb\") as f:\n",
 37 |     "    pickle.dump(sense_key, f)"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {
 44 |     "collapsed": true
 45 |    },
 46 |    "outputs": [],
 47 |    "source": []
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 4,
 52 |    "metadata": {},
 53 |    "outputs": [
 54 |     {
 55 |      "name": "stdout",
 56 |      "output_type": "stream",
 57 |      "text": [
 58 |       "4132\n"
 59 |      ]
 60 |     }
 61 |    ],
 62 |    "source": [
 63 |     "for i,s in enumerate(sense_key):\n",
 64 |     "    if(s[:11] == 'semeval2007'):\n",
 65 |     "        print(i)\n",
 66 |     "        break"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 5,
 72 |    "metadata": {},
 73 |    "outputs": [
 74 |     {
 75 |      "data": {
 76 |       "text/plain": [
 77 |        "'art%1:09:00::'"
 78 |       ]
 79 |      },
 80 |      "execution_count": 5,
 81 |      "metadata": {},
 82 |      "output_type": "execute_result"
 83 |     }
 84 |    ],
 85 |    "source": [
 86 |     "sense_key[0][25:]"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 7,
 92 |    "metadata": {
 93 |     "collapsed": true
 94 |    },
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "dict_sense_keys = dict((sense_key[i][:24], []) if i<4132 else (sense_key[i][:26], []) for i in range(len(sense_key)))\n",
 98 |     "\n",
 99 |     "for i in range(4132):\n",
100 |     "    index = sense_key[i].find(\"%\")\n",
101 |     "    dict_sense_keys[sense_key[i][:24]].append(sense_key[i][25:])\n",
102 |     "    dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1])\n",
103 |     "    dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+3:index+5])\n",
104 |     "    dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:index+5])\n",
105 |     "    dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:index+8])\n",
106 |     "    dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:])\n",
107 |     "\n",
108 |     "for i in range(4132, len(sense_key)):\n",
109 |     "    index = sense_key[i].find(\"%\")\n",
110 |     "    dict_sense_keys[sense_key[i][:26]].append(sense_key[i][27:])\n",
111 |     "    dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1])\n",
112 |     "    dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+3:index+5])\n",
113 |     "    dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:index+5])\n",
114 |     "    dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:index+8])\n",
115 |     "    dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:])"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 10,
121 |    "metadata": {
122 |     "collapsed": true
123 |    },
124 |    "outputs": [],
125 |    "source": [
126 |     "with open(\"../../dataset/sense/dict_sense-keys_test\",\"wb\") as f:\n",
127 |     "    pickle.dump(dict_sense_keys, f)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": 3,
133 |    "metadata": {
134 |     "collapsed": true
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "total_words = []\n",
139 |     "for i in range(226036):\n",
140 |     "    index = sense_key[i].find(\"%\")\n",
141 |     "    total_words.append(sense_key[i][15:index])\n",
142 |     "\n",
143 |     "for i in range(226036, len(sense_key)):\n",
144 |     "    index = sense_key[i].find(\"%\")\n",
145 |     "    total_words.append(sense_key[i][24:index])\n",
146 |     "\n",
147 |     "total_words = Counter(total_words)\n",
148 |     "word_count = total_words.most_common()\n",
149 |     "vocab_words = [k for k,v in word_count]"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 7,
155 |    "metadata": {
156 |     "scrolled": true
157 |    },
158 |    "outputs": [
159 |     {
160 |      "data": {
161 |       "text/plain": [
162 |        "20400"
163 |       ]
164 |      },
165 |      "execution_count": 7,
166 |      "metadata": {},
167 |      "output_type": "execute_result"
168 |     }
169 |    ],
170 |    "source": [
171 |     "with open(\"../../dataset/sense/vocab_sense-words\",\"wb\") as f:\n",
172 |     "    pickle.dump(vocab_words, f)\n",
173 |     "    \n",
174 |     "len(vocab_words)"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 4,
180 |    "metadata": {
181 |     "scrolled": true
182 |    },
183 |    "outputs": [
184 |     {
185 |      "data": {
186 |       "text/plain": [
187 |        "[('change', 3074),\n",
188 |        " ('lead', 2987),\n",
189 |        " ('design', 2938),\n",
190 |        " ('open', 2922),\n",
191 |        " ('study', 2920),\n",
192 |        " ('set', 2909),\n",
193 |        " ('call', 2906),\n",
194 |        " ('point', 2855),\n",
195 |        " ('bring', 2836),\n",
196 |        " ('extend', 2832)]"
197 |       ]
198 |      },
199 |      "execution_count": 4,
200 |      "metadata": {},
201 |      "output_type": "execute_result"
202 |     }
203 |    ],
204 |    "source": [
205 |     "word_count[20:30]"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {
212 |     "collapsed": true
213 |    },
214 |    "outputs": [],
215 |    "source": []
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 9,
220 |    "metadata": {},
221 |    "outputs": [
222 |     {
223 |      "data": {
224 |       "text/plain": [
225 |        "34322"
226 |       ]
227 |      },
228 |      "execution_count": 9,
229 |      "metadata": {},
230 |      "output_type": "execute_result"
231 |     }
232 |    ],
233 |    "source": [
234 |     "total_word_senses = []\n",
235 |     "\n",
236 |     "for i in range(226036):\n",
237 |     "    total_word_senses.append(sense_key[i][15:])\n",
238 |     "\n",
239 |     "for i in range(226036, len(sense_key)):\n",
240 |     "    total_word_senses.append(sense_key[i][24:])\n",
241 |     "\n",
242 |     "total_word_senses = Counter(total_word_senses)\n",
243 |     "word_senses_count = total_word_senses.most_common()\n",
244 |     "vocab_word_senses = [k for k,v in word_senses_count]\n",
245 |     "\n",
246 |     "with open(\"../../dataset/sense/vocab_word-senses\",\"wb\") as f:\n",
247 |     "    pickle.dump(vocab_word_senses, f)\n",
248 |     "    \n",
249 |     "len(vocab_word_senses)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": 10,
255 |    "metadata": {},
256 |    "outputs": [
257 |     {
258 |      "data": {
259 |       "text/plain": [
260 |        "[('be%2:42:03::', 10582),\n",
261 |        " ('person%1:03:00::', 7195),\n",
262 |        " ('line%1:04:01::', 4968),\n",
263 |        " ('see%2:31:00::', 4554),\n",
264 |        " ('be%2:42:06::', 3423),\n",
265 |        " ('keep%2:41:03::', 2283),\n",
266 |        " ('little%3:00:03::', 2042),\n",
267 |        " ('group%1:03:00::', 1826),\n",
268 |        " ('say%2:32:00::', 1819),\n",
269 |        " ('not%4:02:00::', 1703)]"
270 |       ]
271 |      },
272 |      "execution_count": 10,
273 |      "metadata": {},
274 |      "output_type": "execute_result"
275 |     }
276 |    ],
277 |    "source": [
278 |     "word_senses_count[:10]"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": null,
284 |    "metadata": {
285 |     "collapsed": true
286 |    },
287 |    "outputs": [],
288 |    "source": []
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": 11,
293 |    "metadata": {
294 |     "collapsed": true
295 |    },
296 |    "outputs": [],
297 |    "source": [
298 |     "dict_word_sense_keys = dict((w, []) for w in vocab_words)\n",
299 |     "\n",
300 |     "for v in vocab_word_senses:\n",
301 |     "    dict_word_sense_keys[v[:v.find(\"%\")]].append(v)\n",
302 |     "\n",
303 |     "with open(\"../../dataset/sense/dict_word-sense\",\"wb\") as f:\n",
304 |     "    pickle.dump(dict_word_sense_keys, f)"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "code",
309 |    "execution_count": 12,
310 |    "metadata": {
311 |     "scrolled": true
312 |    },
313 |    "outputs": [
314 |     {
315 |      "data": {
316 |       "text/plain": [
317 |        "['open%2:35:00::',\n",
318 |        " 'open%5:00:00:public:00',\n",
319 |        " 'open%2:41:01::',\n",
320 |        " 'open%2:30:00::',\n",
321 |        " 'open%5:00:00:unrestricted:00',\n",
322 |        " 'open%2:30:01::',\n",
323 |        " 'open%2:35:06::',\n",
324 |        " 'open%2:41:00::',\n",
325 |        " 'open%3:00:01::',\n",
326 |        " 'open%3:00:02::',\n",
327 |        " 'open%5:00:00:unprotected:00',\n",
328 |        " 'open%2:35:08::',\n",
329 |        " 'open%2:33:00::',\n",
330 |        " 'open%5:00:00:available:00',\n",
331 |        " 'open%2:42:00::',\n",
332 |        " 'open%5:00:00:coarse:00',\n",
333 |        " 'open%5:00:00:unenclosed:00',\n",
334 |        " 'open%5:00:00:vulnerable:00',\n",
335 |        " 'open%3:00:04::',\n",
336 |        " 'open%1:15:02::',\n",
337 |        " 'open%5:00:00:unconstricted:00',\n",
338 |        " 'open%3:00:08::',\n",
339 |        " 'open%5:00:00:unsealed:01',\n",
340 |        " 'open%5:00:00:unsettled:02',\n",
341 |        " 'open%1:15:01::']"
342 |       ]
343 |      },
344 |      "execution_count": 12,
345 |      "metadata": {},
346 |      "output_type": "execute_result"
347 |     }
348 |    ],
349 |    "source": [
350 |     "dict_word_sense_keys['open']"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": null,
356 |    "metadata": {
357 |     "collapsed": true
358 |    },
359 |    "outputs": [],
360 |    "source": []
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": 14,
365 |    "metadata": {},
366 |    "outputs": [
367 |     {
368 |      "data": {
369 |       "text/plain": [
370 |        "2468"
371 |       ]
372 |      },
373 |      "execution_count": 14,
374 |      "metadata": {},
375 |      "output_type": "execute_result"
376 |     }
377 |    ],
378 |    "source": [
379 |     "total_sense = []\n",
380 |     "senses = []\n",
381 |     "for i in range(len(sense_key)):\n",
382 |     "    index = sense_key[i].find(\"%\")\n",
383 |     "    total_sense.append(sense_key[i][index+1:])\n",
384 |     "\n",
385 |     "total_sense = Counter(total_sense)\n",
386 |     "sense_count = total_sense.most_common()\n",
387 |     "\n",
388 |     "vocab_sense = [k for k,v in sense_count]\n",
389 |     "\n",
390 |     "with open(\"../../dataset/sense/vocab_sense\",\"wb\") as f:\n",
391 |     "    pickle.dump(vocab_sense, f)\n",
392 |     "    \n",
393 |     "len(vocab_sense)"
394 |    ]
395 |   },
396 |   {
397 |    "cell_type": "code",
398 |    "execution_count": null,
399 |    "metadata": {
400 |     "collapsed": true
401 |    },
402 |    "outputs": [],
403 |    "source": []
404 |   }
405 |  ],
406 |  "metadata": {
407 |   "kernelspec": {
408 |    "display_name": "envs",
409 |    "language": "python",
410 |    "name": "cs771"
411 |   },
412 |   "language_info": {
413 |    "codemirror_mode": {
414 |     "name": "ipython",
415 |     "version": 3
416 |    },
417 |    "file_extension": ".py",
418 |    "mimetype": "text/x-python",
419 |    "name": "python",
420 |    "nbconvert_exporter": "python",
421 |    "pygments_lexer": "ipython3",
422 |    "version": "3.5.2"
423 |   }
424 |  },
425 |  "nbformat": 4,
426 |  "nbformat_minor": 2
427 | }
428 | 


--------------------------------------------------------------------------------
/one_million/all-word-model:
--------------------------------------------------------------------------------
 1 | all-word-model:
 2 | 
 3 | 1: basic   Val: F1 Score:65.5462 Accuracy:73.1659  Model-aw-1-multigpu-1
 4 | 
 5 | 2: cnn with pos  Val: F1 Score:72.33 Accuracy:77.93  POS: F1 Score:94.84 Accuracy:97.54  Model-aw-lex-1.4
 6 | 
 7 | 3: local attention  44.361822318916865, 53.75801083454307, 82.19997565386215, 90.42074423342494
 8 | Model-aw-lex-local_attention-fast-v2-4
 9 | 
10 | 4: local attention with hidden states  Val: F1 Score:52.19 Accuracy:58.68  POS: F1 Score:85.66 Accuracy:92.72  Model-aw-lex-local_attention-fast-v2-6
11 | 
12 | 5: gated local attention Val: F1 Score:44.17 Accuracy:53.07  POS: F1 Score:84.01 Accuracy:91.94
13 | 	Model-aw-lex-local_attention-fast-v2-7
14 | 
15 | 6: local attention with crf  Val: F1 Score:50.65 Accuracy:57.15  POS: F1 Score:87.84 Accuracy:93.70
16 | Model-aw-lex-local_attention-fast-v3-1  and Model-aw-lex-local_attention-fast-v4-1
17 | 
18 | 7: soft hierarchical Model-aw-lex-hierarchical-2.ipynb
19 | Val: F1 Score:74.04 Accuracy:79.38  POS: F1 Score:96.34 Accuracy:98.21 Loss:0.8093 , Time: 1240.6
20 | 
21 | 8: hard hierarchical Model-aw-lex-hierarchical-2.ipynb
22 | Val: F1 Score:70.35 Accuracy:77.30  POS: F1 Score:95.56 Accuracy:97.89 Loss:0.9279 , Time: 1195.1
23 | 


--------------------------------------------------------------------------------
/one_million/all-word/Model-aw-4-1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import tensorflow as tf\n",
 12 |     "tf.logging.set_verbosity(tf.logging.WARN)\n",
 13 |     "import pickle\n",
 14 |     "import numpy as np\n",
 15 |     "import os\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from sklearn.metrics import f1_score\n",
 18 |     "from sklearn.metrics import accuracy_score\n",
 19 |     "import os\n",
 20 |     "from tensorflow.python.client import device_lib\n",
 21 |     "from collections import Counter\n",
 22 |     "import time"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 2,
 28 |    "metadata": {
 29 |     "collapsed": true
 30 |    },
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "f = open('../../Glove/word_embedding_glove', 'rb')\n",
 34 |     "word_embedding = pickle.load(f)\n",
 35 |     "f.close()\n",
 36 |     "\n",
 37 |     "word_embedding = word_embedding[: len(word_embedding)-1]\n",
 38 |     "\n",
 39 |     "f = open('../../Glove/vocab_glove', 'rb')\n",
 40 |     "vocab = pickle.load(f)\n",
 41 |     "f.close()\n",
 42 |     "\n",
 43 |     "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
 44 |     "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
 45 |     "\n",
 46 |     "unknown_token = \"UNKNOWN_TOKEN\"\n",
 47 |     "\n",
 48 |     "# Model Description\n",
 49 |     "model_name = 'model-aw-4-1'\n",
 50 |     "model_dir = '../output/all-word/' + model_name\n",
 51 |     "save_dir = os.path.join(model_dir, \"save/\")\n",
 52 |     "log_dir = os.path.join(model_dir, \"log\")\n",
 53 |     "\n",
 54 |     "if not os.path.exists(model_dir):\n",
 55 |     "    os.mkdir(model_dir)\n",
 56 |     "if not os.path.exists(save_dir):\n",
 57 |     "    os.mkdir(save_dir)\n",
 58 |     "if not os.path.exists(log_dir):\n",
 59 |     "    os.mkdir(log_dir)\n",
 60 |     "\n",
 61 |     "with open('/data/aviraj/dataset/train_val_data/all_word_sense2_sampled','rb') as f:\n",
 62 |     "    train_data, val_data = pickle.load(f)    \n",
 63 |     "    \n",
 64 |     "\n",
 65 |     "# Parameters\n",
 66 |     "mode = 'train'\n",
 67 |     "num_senses = 272\n",
 68 |     "num_lex = 47\n",
 69 |     "num_pos = 12\n",
 70 |     "batch_size = 32\n",
 71 |     "vocab_size = len(vocab)\n",
 72 |     "unk_vocab_size = 1\n",
 73 |     "word_emb_size = len(word_embedding[0])\n",
 74 |     "max_sent_size = 200\n",
 75 |     "hidden_size = 512\n",
 76 |     "keep_prob = 0.4\n",
 77 |     "l2_lambda = 0.001\n",
 78 |     "init_lr = 0.01\n",
 79 |     "decay_steps = 5000\n",
 80 |     "decay_rate = 0.999\n",
 81 |     "clip_norm = 1\n",
 82 |     "clipping = True\n",
 83 |     "moving_avg_deacy = 0.999\n",
 84 |     "num_gpus = 6"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 3,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "def average_gradients(tower_grads):\n",
 96 |     "    average_grads = []\n",
 97 |     "    for grad_and_vars in zip(*tower_grads):\n",
 98 |     "        # Note that each grad_and_vars looks like the following:\n",
 99 |     "        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))\n",
100 |     "        grads = []\n",
101 |     "        for g, _ in grad_and_vars:\n",
102 |     "            # Add 0 dimension to the gradients to represent the tower.\n",
103 |     "            expanded_g = tf.expand_dims(g, 0)\n",
104 |     "\n",
105 |     "            # Append on a 'tower' dimension which we will average over below.\n",
106 |     "            grads.append(expanded_g)\n",
107 |     "\n",
108 |     "        # Average over the 'tower' dimension.\n",
109 |     "        grad = tf.concat(grads, 0)\n",
110 |     "        grad = tf.reduce_mean(grad, 0)\n",
111 |     "\n",
112 |     "        # Keep in mind that the Variables are redundant because they are shared\n",
113 |     "        # across towers. So .. we will just return the first tower's pointer to\n",
114 |     "        # the Variable.\n",
115 |     "        v = grad_and_vars[0][1]\n",
116 |     "        grad_and_var = (grad, v)\n",
117 |     "        average_grads.append(grad_and_var)\n",
118 |     "    return average_grads"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 4,
124 |    "metadata": {
125 |     "collapsed": true
126 |    },
127 |    "outputs": [],
128 |    "source": [
129 |     "# MODEL\n",
130 |     "device_num = 0\n",
131 |     "tower_grads = []\n",
132 |     "losses = []\n",
133 |     "predictions = []\n",
134 |     "\n",
135 |     "x = tf.placeholder('int32', [num_gpus, batch_size, max_sent_size], name=\"x\")\n",
136 |     "y = tf.placeholder('int32', [num_gpus, batch_size, max_sent_size], name=\"y\")\n",
137 |     "x_mask  = tf.placeholder('bool', [num_gpus, batch_size, max_sent_size], name='x_mask') \n",
138 |     "sense_mask  = tf.placeholder('bool', [num_gpus, batch_size, max_sent_size], name='sense_mask')\n",
139 |     "is_train = tf.placeholder('bool', [], name='is_train')\n",
140 |     "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
141 |     "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
142 |     "\n",
143 |     "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
144 |     "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
145 |     "summaries = []\n",
146 |     "\n",
147 |     "with tf.variable_scope(\"word_embedding\"):\n",
148 |     "    unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
149 |     "    final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
150 |     "\n",
151 |     "with tf.variable_scope(tf.get_variable_scope()):\n",
152 |     "    for gpu_idx in range(num_gpus):\n",
153 |     "        if gpu_idx>2:\n",
154 |     "            device_num = 1\n",
155 |     "        with tf.name_scope(\"model_{}\".format(gpu_idx)) as scope, tf.device('/gpu:%d' % device_num):\n",
156 |     "\n",
157 |     "            if gpu_idx > 0:\n",
158 |     "                    tf.get_variable_scope().reuse_variables()\n",
159 |     "\n",
160 |     "            with tf.name_scope(\"word\"):\n",
161 |     "                Wx = tf.nn.embedding_lookup(final_word_emb_mat, x[gpu_idx])  \n",
162 |     "\n",
163 |     "            x_len = tf.reduce_sum(tf.cast(x_mask[gpu_idx], 'int32'), 1)\n",
164 |     "\n",
165 |     "            with tf.variable_scope(\"lstm1\"):\n",
166 |     "                cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
167 |     "                cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
168 |     "\n",
169 |     "                d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n",
170 |     "                d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n",
171 |     "\n",
172 |     "                (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n",
173 |     "                h1 = tf.concat([fw_h1, bw_h1], 2)\n",
174 |     "\n",
175 |     "            with tf.variable_scope(\"lstm2\"):\n",
176 |     "                cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
177 |     "                cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
178 |     "\n",
179 |     "                d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n",
180 |     "                d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n",
181 |     "\n",
182 |     "                (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n",
183 |     "                h = tf.concat([fw_h2, bw_h2], 2)\n",
184 |     "\n",
185 |     "            def attention(input_x, input_mask, W_att):\n",
186 |     "                h_masked = tf.boolean_mask(input_x, input_mask)\n",
187 |     "                h_tanh = tf.tanh(h_masked)\n",
188 |     "                u = tf.matmul(h_tanh, W_att)\n",
189 |     "                a = tf.nn.softmax(u)\n",
190 |     "                c = tf.reduce_sum(tf.multiply(h_tanh, a), 0)  \n",
191 |     "                return c\n",
192 |     "\n",
193 |     "            with tf.variable_scope(\"attention\"):\n",
194 |     "                W_att = tf.get_variable(\"W_att\", shape=[2*hidden_size, 1], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*10))\n",
195 |     "                c = tf.expand_dims(attention(h[0], x_mask[gpu_idx][0], W_att), 0)\n",
196 |     "                for i in range(1, batch_size):\n",
197 |     "                    c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[gpu_idx][i], W_att), 0)], 0)\n",
198 |     "                \n",
199 |     "                cc = tf.expand_dims(c, 1)\n",
200 |     "                c_final = tf.tile(cc, [1, max_sent_size, 1])\n",
201 |     "                h_final = tf.concat([c_final, h],2)\n",
202 |     "                flat_h_final = tf.reshape(h_final, [-1, 4*hidden_size])\n",
203 |     "           \n",
204 |     "            with tf.variable_scope(\"hidden_layer\"):\n",
205 |     "                W = tf.get_variable(\"W\", shape=[4*hidden_size, 2*hidden_size], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*20))\n",
206 |     "                b = tf.get_variable(\"b\", shape=[2*hidden_size], initializer=tf.zeros_initializer())\n",
207 |     "                drop_flat_h_final = tf.nn.dropout(flat_h_final, input_keep_prob)\n",
208 |     "                flat_hl = tf.matmul(drop_flat_h_final, W) + b\n",
209 |     "                \n",
210 |     "            with tf.variable_scope(\"softmax_layer\"):\n",
211 |     "                W = tf.get_variable(\"W\", shape=[2*hidden_size, num_senses], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*20))\n",
212 |     "                b = tf.get_variable(\"b\", shape=[num_senses], initializer=tf.zeros_initializer())\n",
213 |     "                drop_flat_hl = tf.nn.dropout(flat_hl, input_keep_prob)\n",
214 |     "                flat_logits_sense = tf.matmul(drop_flat_hl, W) + b\n",
215 |     "                logits = tf.reshape(flat_logits_sense, [batch_size, max_sent_size, num_senses])\n",
216 |     "                predictions.append(tf.arg_max(logits, 2))\n",
217 |     "\n",
218 |     "            float_sense_mask = tf.cast(sense_mask[gpu_idx], 'float')\n",
219 |     "\n",
220 |     "            loss = tf.contrib.seq2seq.sequence_loss(logits, y[gpu_idx], float_sense_mask, name=\"loss\")\n",
221 |     "\n",
222 |     "            l2_loss = l2_lambda * tf.losses.get_regularization_loss()\n",
223 |     "\n",
224 |     "            total_loss = loss + l2_loss\n",
225 |     "\n",
226 |     "            summaries.append(tf.summary.scalar(\"loss_{}\".format(gpu_idx), loss))\n",
227 |     "            summaries.append(tf.summary.scalar(\"total_loss_{}\".format(gpu_idx), total_loss))\n",
228 |     "\n",
229 |     "\n",
230 |     "            optimizer = tf.train.AdamOptimizer(learning_rate)\n",
231 |     "            grads_vars = optimizer.compute_gradients(total_loss)\n",
232 |     "\n",
233 |     "            clipped_grads = grads_vars\n",
234 |     "            if(clipping == True):\n",
235 |     "                clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
236 |     "\n",
237 |     "            tower_grads.append(clipped_grads)\n",
238 |     "            losses.append(total_loss)\n",
239 |     "\n",
240 |     "tower_grads = average_gradients(tower_grads)\n",
241 |     "losses = tf.add_n(losses)/len(losses)\n",
242 |     "apply_grad_op = optimizer.apply_gradients(tower_grads, global_step=global_step)\n",
243 |     "summaries.append(tf.summary.scalar('total_loss', losses))\n",
244 |     "summaries.append(tf.summary.scalar('learning_rate', learning_rate))\n",
245 |     "\n",
246 |     "for var in tf.trainable_variables():\n",
247 |     "    summaries.append(tf.summary.histogram(var.op.name, var))\n",
248 |     "\n",
249 |     "variable_averages = tf.train.ExponentialMovingAverage(moving_avg_deacy, global_step)\n",
250 |     "variables_averages_op = variable_averages.apply(tf.trainable_variables())\n",
251 |     "\n",
252 |     "train_op = tf.group(apply_grad_op, variables_averages_op)\n",
253 |     "saver = tf.train.Saver(tf.global_variables())\n",
254 |     "summary = tf.summary.merge(summaries)"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 5,
260 |    "metadata": {
261 |     "collapsed": true
262 |    },
263 |    "outputs": [],
264 |    "source": [
265 |     "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   # see issue #152\n",
266 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0,1\"\n",
267 |     "# print (device_lib.list_local_devices())\n",
268 |     "config = tf.ConfigProto()\n",
269 |     "config.gpu_options.allow_growth = True\n",
270 |     "config.allow_soft_placement = True\n",
271 |     "sess = tf.Session(config=config)\n",
272 |     "sess.run(tf.global_variables_initializer())                          # For initializing all the variables\n",
273 |     "summary_writer = tf.summary.FileWriter(log_dir, sess.graph)          # For writing Summaries"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": 6,
279 |    "metadata": {
280 |     "collapsed": true
281 |    },
282 |    "outputs": [],
283 |    "source": [
284 |     "def model(xx, yy, mask, smask, train_cond=True):\n",
285 |     "    num_batches = int(len(xx)/(batch_size*num_gpus))\n",
286 |     "    _losses = 0\n",
287 |     "    temp_loss = 0\n",
288 |     "    preds_sense = []\n",
289 |     "    true_sense = []\n",
290 |     "    \n",
291 |     "    for j in range(num_batches): \n",
292 |     "        \n",
293 |     "        s = j * batch_size * num_gpus\n",
294 |     "        e = (j+1) * batch_size * num_gpus\n",
295 |     "        xx_re = xx[s:e].reshape([num_gpus, batch_size, -1])\n",
296 |     "        yy_re = yy[s:e].reshape([num_gpus, batch_size, -1])\n",
297 |     "        mask_re = mask[s:e].reshape([num_gpus, batch_size, -1])\n",
298 |     "        smask_re = smask[s:e].reshape([num_gpus, batch_size, -1])\n",
299 |     " \n",
300 |     "        feed_dict = {x:xx_re, y:yy_re, x_mask:mask_re, sense_mask:smask_re, is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
301 |     "        \n",
302 |     "        if(train_cond==True):\n",
303 |     "            _, _loss, step, _summary = sess.run([train_op, losses, global_step, summary], feed_dict)\n",
304 |     "            summary_writer.add_summary(_summary, step)\n",
305 |     "            \n",
306 |     "            temp_loss += _loss\n",
307 |     "            if((j+1)%1000==0):\n",
308 |     "                print(\"Steps: {}\".format(step), \"Loss:{0:.4f}\".format(temp_loss/1000), \", Current Loss: {0:.4f}\".format(_loss))\n",
309 |     "                temp_loss = 0\n",
310 |     "            if((j+1)%5000==0):\n",
311 |     "                saver.save(sess, save_path=save_dir)                         \n",
312 |     "                \n",
313 |     "        else:\n",
314 |     "            _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
315 |     "            for i in range(num_gpus):\n",
316 |     "                preds_sense.append(pred[i][smask_re[i]])\n",
317 |     "                true_sense.append(yy_re[i][smask_re[i]])\n",
318 |     "\n",
319 |     "        _losses +=_loss\n",
320 |     "\n",
321 |     "    if(train_cond==False): \n",
322 |     "        sense_preds = []\n",
323 |     "        sense_true = []\n",
324 |     "        \n",
325 |     "        for preds in preds_sense:\n",
326 |     "            for ps in preds:      \n",
327 |     "                sense_preds.append(ps)  \n",
328 |     "        for trues in true_sense:\n",
329 |     "            for ts in trues:\n",
330 |     "                sense_true.append(ts)\n",
331 |     "                \n",
332 |     "        return _losses/num_batches, sense_preds, sense_true\n",
333 |     "\n",
334 |     "    return _losses/num_batches, step\n",
335 |     "\n",
336 |     "def eval_score(yy, pred):\n",
337 |     "    f1 = f1_score(yy, pred, average='macro')\n",
338 |     "    accu = accuracy_score(yy, pred)\n",
339 |     "    return f1*100, accu*100"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": null,
345 |    "metadata": {
346 |     "collapsed": true
347 |    },
348 |    "outputs": [],
349 |    "source": [
350 |     "x_id_train = train_data['x']\n",
351 |     "mask_train = train_data['x_mask']\n",
352 |     "sense_mask_train = train_data['sense_mask']\n",
353 |     "y_train = train_data['y']\n",
354 |     "\n",
355 |     "x_id_val = val_data['x']\n",
356 |     "mask_val = val_data['x_mask']\n",
357 |     "sense_mask_val = val_data['sense_mask']\n",
358 |     "y_val = val_data['y']"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": null,
364 |    "metadata": {
365 |     "scrolled": true
366 |    },
367 |    "outputs": [
368 |     {
369 |      "name": "stdout",
370 |      "output_type": "stream",
371 |      "text": [
372 |       "Steps: 1000 Loss:15.7534 , Current Loss: 4.6380\n",
373 |       "Steps: 2000 Loss:4.6967 , Current Loss: 4.6226\n",
374 |       "Steps: 3000 Loss:4.7022 , Current Loss: 4.5907\n",
375 |       "Steps: 4000 Loss:4.7315 , Current Loss: 4.6306\n",
376 |       "Steps: 5000 Loss:4.8571 , Current Loss: 4.8387\n"
377 |      ]
378 |     }
379 |    ],
380 |    "source": [
381 |     "num_epochs = 5\n",
382 |     "log_period = 1\n",
383 |     "\n",
384 |     "for i in range(num_epochs):\n",
385 |     "    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
386 |     "    x_id_train = x_id_train[random]\n",
387 |     "    y_train = y_train[random]\n",
388 |     "    mask_train = mask_train[random]    \n",
389 |     "    sense_mask_train = sense_mask_train[random]\n",
390 |     "    \n",
391 |     "    start_time = time.time()\n",
392 |     "    train_loss, step = model(x_id_train, y_train, mask_train, sense_mask_train)\n",
393 |     "    time_taken = time.time() - start_time\n",
394 |     "    print(\"Epoch: {}\".format(i+1),\", Step: {}\".format(step), \", loss: {0:.4f}\".format(train_loss), \", Time: {0:.1f}\".format(time_taken))\n",
395 |     "    saver.save(sess, save_path=save_dir)                         \n",
396 |     "    print(\"Model Saved\")\n",
397 |     "    \n",
398 |     "    if((i+1)%log_period==0):\n",
399 |     "        start_time = time.time()\n",
400 |     "        val_loss, val_pred, val_true = model(x_id_val, y_val, mask_val, sense_mask_val, train_cond=False)        \n",
401 |     "        f1_, accu_ = eval_score(val_true, val_pred)\n",
402 |     "        time_taken = time.time() - start_time\n",
403 |     "        print(\"Val: F1 Score:{0:.2f}\".format(f1_), \"Accuracy:{0:.2f}\".format(accu_), \"Loss:{0:.4f}\".format(val_loss), \", Time: {0:.1f}\".format(time_taken))"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "code",
408 |    "execution_count": 1,
409 |    "metadata": {},
410 |    "outputs": [],
411 |    "source": [
412 |     "start_time = time.time()\n",
413 |     "train_loss, train_pred, train_true = model(x_id_train, y_train, mask_train, sense_mask_train, train_cond=False)        \n",
414 |     "f1_, accu_ = etrain_score(train_true, train_pred)\n",
415 |     "time_taken = time.time() - start_time\n",
416 |     "print(\"train: F1 Score:{0:.2f}\".format(f1_), \"Accuracy:{0:.2f}\".format(accu_), \"Loss:{0:.4f}\".format(train_loss), \", Time: {0:.1f}\".format(time_taken))"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": null,
422 |    "metadata": {
423 |     "collapsed": true
424 |    },
425 |    "outputs": [],
426 |    "source": []
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 8,
431 |    "metadata": {
432 |     "collapsed": true
433 |    },
434 |    "outputs": [],
435 |    "source": [
436 |     "saver.restore(sess, save_dir)"
437 |    ]
438 |   }
439 |  ],
440 |  "metadata": {
441 |   "kernelspec": {
442 |    "display_name": "cs771",
443 |    "language": "python",
444 |    "name": "cs771"
445 |   },
446 |   "language_info": {
447 |    "codemirror_mode": {
448 |     "name": "ipython",
449 |     "version": 3
450 |    },
451 |    "file_extension": ".py",
452 |    "mimetype": "text/x-python",
453 |    "name": "python",
454 |    "nbconvert_exporter": "python",
455 |    "pygments_lexer": "ipython3",
456 |    "version": "3.5.2"
457 |   }
458 |  },
459 |  "nbformat": 4,
460 |  "nbformat_minor": 2
461 | }
462 | 


--------------------------------------------------------------------------------
/one_million/all-word/Readme.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | ## Model-aw-lex-1 
4 | Convolution over hidden states of lstms


--------------------------------------------------------------------------------
/one_million/make/Make-Model-1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import tensorflow as tf\n",
 12 |     "tf.logging.set_verbosity(tf.logging.WARN)\n",
 13 |     "import pickle\n",
 14 |     "import numpy as np\n",
 15 |     "import os\n",
 16 |     "from sklearn.model_selection import train_test_split\n",
 17 |     "from sklearn.metrics import f1_score\n",
 18 |     "from sklearn.metrics import accuracy_score\n",
 19 |     "import os\n",
 20 |     "from tensorflow.python.client import device_lib\n",
 21 |     "from collections import Counter"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 2,
 27 |    "metadata": {
 28 |     "collapsed": true
 29 |    },
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "f = open('../../Glove/word_embedding_glove', 'rb')\n",
 33 |     "word_embedding = pickle.load(f)\n",
 34 |     "f.close()\n",
 35 |     "word_embedding = word_embedding[: len(word_embedding)-1]\n",
 36 |     "\n",
 37 |     "f = open('../../Glove/vocab_glove', 'rb')\n",
 38 |     "vocab = pickle.load(f)\n",
 39 |     "f.close()\n",
 40 |     "\n",
 41 |     "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
 42 |     "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
 43 |     "\n",
 44 |     "unknown_token = \"UNKNOWN_TOKEN\""
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 3,
 50 |    "metadata": {
 51 |     "collapsed": true
 52 |    },
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "f = open(\"../../../dataset/sense/dict_sense-keys\", 'rb')\n",
 56 |     "dict_sense_keys = pickle.load(f)\n",
 57 |     "f.close()\n",
 58 |     "\n",
 59 |     "f = open(\"../../../dataset/sense/dict_word-sense\", 'rb')\n",
 60 |     "dict_word_sense = pickle.load(f)\n",
 61 |     "f.close()"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 4,
 67 |    "metadata": {
 68 |     "collapsed": true
 69 |    },
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "# Model Description\n",
 73 |     "sense_word = 'make'\n",
 74 |     "model_name = 'model-1'\n",
 75 |     "sense_word_dir = '../output/' + sense_word\n",
 76 |     "model_dir = sense_word_dir + '/' + model_name\n",
 77 |     "save_dir = os.path.join(model_dir, \"save/\")\n",
 78 |     "log_dir = os.path.join(model_dir, \"log\")\n",
 79 |     "\n",
 80 |     "if not os.path.exists(sense_word_dir):\n",
 81 |     "    os.mkdir(sense_word_dir)\n",
 82 |     "if not os.path.exists(model_dir):\n",
 83 |     "    os.mkdir(model_dir)\n",
 84 |     "if not os.path.exists(save_dir):\n",
 85 |     "    os.mkdir(save_dir)\n",
 86 |     "if not os.path.exists(log_dir):\n",
 87 |     "    os.mkdir(log_dir)"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 8,
 93 |    "metadata": {
 94 |     "scrolled": false
 95 |    },
 96 |    "outputs": [
 97 |     {
 98 |      "data": {
 99 |       "text/plain": [
100 |        "[('36', 2006),\n",
101 |        " ('30', 1025),\n",
102 |        " ('42', 968),\n",
103 |        " ('41', 962),\n",
104 |        " ('31', 617),\n",
105 |        " ('32', 543),\n",
106 |        " ('38', 445),\n",
107 |        " ('40', 20),\n",
108 |        " ('29', 6),\n",
109 |        " ('09', 1)]"
110 |       ]
111 |      },
112 |      "execution_count": 8,
113 |      "metadata": {},
114 |      "output_type": "execute_result"
115 |     }
116 |    ],
117 |    "source": [
118 |     "f = open(\"../../../dataset/checkwords/\"+ sense_word + \"_data\", 'rb')\n",
119 |     "data = pickle.load(f)\n",
120 |     "f.close()\n",
121 |     "\n",
122 |     "data_y = []\n",
123 |     "for i in range(len(data)):\n",
124 |     "    data_y.append(dict_sense_keys[data[i][0]][2])\n",
125 |     "\n",
126 |     "sense_count = Counter(data_y)\n",
127 |     "sense_count = sense_count.most_common()\n",
128 |     "vocab_sense = [k for k,v in sense_count[:7]]\n",
129 |     "sense_count"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 9,
135 |    "metadata": {},
136 |    "outputs": [
137 |     {
138 |      "data": {
139 |       "text/plain": [
140 |        "['36', '30', '42', '41', '31', '32', '38']"
141 |       ]
142 |      },
143 |      "execution_count": 9,
144 |      "metadata": {},
145 |      "output_type": "execute_result"
146 |     }
147 |    ],
148 |    "source": [
149 |     "vocab_sense"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 10,
155 |    "metadata": {},
156 |    "outputs": [
157 |     {
158 |      "name": "stdout",
159 |      "output_type": "stream",
160 |      "text": [
161 |       "6566 6593\n"
162 |      ]
163 |     }
164 |    ],
165 |    "source": [
166 |     "data_x = []\n",
167 |     "data_label = []\n",
168 |     "for i in range(len(data)):\n",
169 |     "    if dict_sense_keys[data[i][0]][2] in vocab_sense:\n",
170 |     "        data_x.append(data[i][1])\n",
171 |     "        data_label.append(dict_sense_keys[data[i][0]][2])\n",
172 |     "\n",
173 |     "print(len(data_label), len(data_y))\n",
174 |     "\n",
175 |     "# vocab_sense = dict_word_sense[sense_word]\n",
176 |     "\n",
177 |     "sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n",
178 |     "id2sense = dict((i, s) for i,s in enumerate(vocab))"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": 11,
184 |    "metadata": {
185 |     "collapsed": true
186 |    },
187 |    "outputs": [],
188 |    "source": [
189 |     "# Parameters\n",
190 |     "mode = 'train'\n",
191 |     "num_senses = len(vocab_sense)\n",
192 |     "batch_size = 64\n",
193 |     "vocab_size = len(vocab)\n",
194 |     "unk_vocab_size = 1\n",
195 |     "word_emb_size = len(word_embedding[0])\n",
196 |     "max_sent_size = 300\n",
197 |     "hidden_size = 100\n",
198 |     "keep_prob = 0.5\n",
199 |     "l2_lambda = 0.001\n",
200 |     "init_lr = 0.01\n",
201 |     "decay_steps = 500\n",
202 |     "decay_rate = 0.96\n",
203 |     "clip_norm = 1\n",
204 |     "clipping = True"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 12,
210 |    "metadata": {
211 |     "collapsed": true,
212 |     "scrolled": true
213 |    },
214 |    "outputs": [],
215 |    "source": [
216 |     "# MODEL\n",
217 |     "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
218 |     "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
219 |     "x_mask  = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
220 |     "is_train = tf.placeholder('bool', [], name='is_train')\n",
221 |     "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
222 |     "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
223 |     "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
224 |     "\n",
225 |     "with tf.name_scope(\"word_embedding\"):\n",
226 |     "    if mode == 'train':\n",
227 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
228 |     "    else:\n",
229 |     "        unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
230 |     "        \n",
231 |     "    final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
232 |     "    Wx = tf.nn.embedding_lookup(final_word_emb_mat, x)  \n",
233 |     "\n",
234 |     "with tf.variable_scope(\"lstm1\"):\n",
235 |     "    cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
236 |     "    cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
237 |     "\n",
238 |     "    d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n",
239 |     "    d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n",
240 |     "    \n",
241 |     "    (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n",
242 |     "    h1 = tf.concat([fw_h1, bw_h1], 2)\n",
243 |     "    \n",
244 |     "with tf.variable_scope(\"lstm2\"):\n",
245 |     "    cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
246 |     "    cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
247 |     "\n",
248 |     "    d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n",
249 |     "    d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n",
250 |     "    \n",
251 |     "    (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n",
252 |     "    h = tf.concat([fw_h2, bw_h2], 2)\n",
253 |     "\n",
254 |     "def attention(input_x, input_mask, W_att):\n",
255 |     "    h_masked = tf.boolean_mask(input_x, input_mask)\n",
256 |     "    h_tanh = tf.tanh(h_masked)\n",
257 |     "    u = tf.matmul(h_tanh, W_att)\n",
258 |     "    a = tf.nn.softmax(u)\n",
259 |     "    c = tf.reduce_sum(tf.multiply(h_tanh, a), 0)  \n",
260 |     "    return c\n",
261 |     "\n",
262 |     "with tf.variable_scope(\"attention\"):\n",
263 |     "    W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n",
264 |     "    c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
265 |     "    for i in range(1, batch_size):\n",
266 |     "        c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
267 |     "        \n",
268 |     "with tf.variable_scope(\"softmax_layer\"):\n",
269 |     "    W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n",
270 |     "    b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
271 |     "    drop_c = tf.nn.dropout(c, input_keep_prob)\n",
272 |     "    logits = tf.matmul(drop_c, W) + b\n",
273 |     "    predictions = tf.argmax(logits, 1)\n",
274 |     "    \n",
275 |     "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n",
276 |     "\n",
277 |     "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
278 |     "\n",
279 |     "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
280 |     "\n",
281 |     "tv_all = tf.trainable_variables()\n",
282 |     "tv_regu =[]\n",
283 |     "for t in tv_all:\n",
284 |     "    if t.name.find('b:')==-1:\n",
285 |     "        tv_regu.append(t)\n",
286 |     "        \n",
287 |     "# l2 Loss\n",
288 |     "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
289 |     "\n",
290 |     "total_loss = loss + l2_loss\n",
291 |     "\n",
292 |     "# Optimizer for loss\n",
293 |     "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
294 |     "\n",
295 |     "# Gradients and Variables for Loss\n",
296 |     "grads_vars = optimizer.compute_gradients(total_loss)\n",
297 |     "\n",
298 |     "# Clipping of Gradients\n",
299 |     "clipped_grads = grads_vars\n",
300 |     "if(clipping == True):\n",
301 |     "    clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
302 |     "\n",
303 |     "# Training Optimizer for Total Loss\n",
304 |     "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
305 |     "\n",
306 |     "# Summaries\n",
307 |     "var_summaries = []\n",
308 |     "for v in tv_all:\n",
309 |     "    var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
310 |     "    var_summaries.append(var_summary)\n",
311 |     "\n",
312 |     "var_summaries_merged = tf.summary.merge(var_summaries)\n",
313 |     "\n",
314 |     "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
315 |     "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
316 |     "summary = tf.summary.merge_all()"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 13,
322 |    "metadata": {
323 |     "collapsed": true
324 |    },
325 |    "outputs": [],
326 |    "source": [
327 |     "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"   # see issue #152\n",
328 |     "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
329 |     "config = tf.ConfigProto()\n",
330 |     "config.gpu_options.allow_growth = True\n",
331 |     "sess = tf.Session(config=config)\n",
332 |     "sess.run(tf.global_variables_initializer())                          # For initializing all the variables\n",
333 |     "saver = tf.train.Saver()                                             # For Saving the model\n",
334 |     "summary_writer = tf.summary.FileWriter(log_dir, sess.graph)          # For writing Summaries"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "code",
339 |    "execution_count": 14,
340 |    "metadata": {
341 |     "collapsed": true
342 |    },
343 |    "outputs": [],
344 |    "source": [
345 |     "index = []\n",
346 |     "for i in range(len(data_x)):\n",
347 |     "    index.append(i)"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": 15,
353 |    "metadata": {
354 |     "scrolled": true
355 |    },
356 |    "outputs": [
357 |     {
358 |      "name": "stderr",
359 |      "output_type": "stream",
360 |      "text": [
361 |       "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
362 |       "  FutureWarning)\n"
363 |      ]
364 |     }
365 |    ],
366 |    "source": [
367 |     "index_train, index_val, label_train, label_val = train_test_split(index, data_label, train_size=0.8, shuffle=True, stratify=data_label, random_state=0)\n",
368 |     "\n",
369 |     "data_x = np.array(data_x)\n",
370 |     "\n",
371 |     "x_train = data_x[index_train]\n",
372 |     "x_val = data_x[index_val]"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": 16,
378 |    "metadata": {
379 |     "collapsed": true
380 |    },
381 |    "outputs": [],
382 |    "source": [
383 |     "def data_prepare(x, y):\n",
384 |     "    num_examples = len(x)\n",
385 |     "\n",
386 |     "    xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
387 |     "    xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
388 |     "    yy = np.zeros([num_examples], dtype=int)\n",
389 |     "\n",
390 |     "    for j in range(num_examples):\n",
391 |     "        for i in range(max_sent_size):\n",
392 |     "            if(i>=len(x[j])):\n",
393 |     "                break\n",
394 |     "            w = x[j][i]\n",
395 |     "            xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
396 |     "            xx_mask[j][i] = True\n",
397 |     "        yy[j] = sense2id[y[j]]\n",
398 |     "    return xx, xx_mask, yy\n",
399 |     "\n",
400 |     "def eval_score(yy, pred):\n",
401 |     "    num_batches = int(len(yy)/batch_size)\n",
402 |     "    f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
403 |     "    accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
404 |     "    return f1*100, accu*100\n",
405 |     "\n",
406 |     "def model(xx, yy, mask, train_cond=True):\n",
407 |     "    num_batches = int(len(xx)/batch_size)\n",
408 |     "    losses = 0\n",
409 |     "    preds = []\n",
410 |     "    for j in range(num_batches): \n",
411 |     "        \n",
412 |     "        s = j * batch_size\n",
413 |     "        e = (j+1) * batch_size\n",
414 |     "        \n",
415 |     "        feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
416 |     "        \n",
417 |     "        \n",
418 |     "        if(train_cond==True):\n",
419 |     "            _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
420 |     "            summary_writer.add_summary(_summary, step)  \n",
421 |     "#             print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
422 |     "\n",
423 |     "        else:\n",
424 |     "            _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
425 |     "            preds.append(pred)\n",
426 |     "            \n",
427 |     "        losses +=_loss\n",
428 |     "\n",
429 |     "    if(train_cond==False):\n",
430 |     "        y_pred = []\n",
431 |     "        for i in range(num_batches):\n",
432 |     "            for pred in preds[i]:\n",
433 |     "                y_pred.append(pred)\n",
434 |     "        return losses/num_batches, y_pred\n",
435 |     "    \n",
436 |     "    return losses/num_batches, step"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": null,
442 |    "metadata": {
443 |     "collapsed": true
444 |    },
445 |    "outputs": [],
446 |    "source": [
447 |     "x_id_train, mask_train, y_train = data_prepare(x_train, label_train)\n",
448 |     "x_id_val, mask_val, y_val = data_prepare(x_val, label_val)"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "code",
453 |    "execution_count": null,
454 |    "metadata": {
455 |     "scrolled": true
456 |    },
457 |    "outputs": [
458 |     {
459 |      "name": "stdout",
460 |      "output_type": "stream",
461 |      "text": [
462 |       "Epoch: 1 Step: 82 loss: 7.29599668631\n",
463 |       "Epoch: 2 Step: 164 loss: 2.07766101418\n",
464 |       "Epoch: 3 Step: 246 loss: 1.99490781528\n",
465 |       "Epoch: 4 Step: 328 loss: 1.97611695673\n",
466 |       "Epoch: 5 Step: 410 loss: 1.97086549387\n",
467 |       "Model Saved\n"
468 |      ]
469 |     },
470 |     {
471 |      "name": "stderr",
472 |      "output_type": "stream",
473 |      "text": [
474 |       "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
475 |       "  'precision', 'predicted', average, warn_for)\n"
476 |      ]
477 |     },
478 |     {
479 |      "name": "stdout",
480 |      "output_type": "stream",
481 |      "text": [
482 |       "Train: F1 Score:  6.69154280711 Accuracy:  30.5830792683 Loss:  1.95732803025\n",
483 |       "Val: F1 Score:  6.72469704728 Accuracy:  30.78125 Loss:  1.95580910444\n",
484 |       "Epoch: 6 Step: 492 loss: 1.98001657899\n"
485 |      ]
486 |     }
487 |    ],
488 |    "source": [
489 |     "num_epochs = 60\n",
490 |     "log_period = 5\n",
491 |     "\n",
492 |     "for i in range(num_epochs):\n",
493 |     "    random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
494 |     "    x_id_train = x_id_train[random]\n",
495 |     "    y_train = y_train[random]\n",
496 |     "    mask_train = mask_train[random]    \n",
497 |     "    \n",
498 |     "    losses, step = model(x_id_train, y_train, mask_train)\n",
499 |     "    print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
500 |     "    \n",
501 |     "    if((i+1)%log_period==0):\n",
502 |     "        saver.save(sess, save_path=save_dir)                         \n",
503 |     "        print(\"Model Saved\")\n",
504 |     "        train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
505 |     "        f1_, accu_ = eval_score(y_train, train_pred)\n",
506 |     "        print(\"Train: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n",
507 |     "        val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
508 |     "        f1_, accu_ = eval_score(y_val, val_pred)\n",
509 |     "        print(\"Val: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
510 |     "        \n",
511 |     "# test_loss, test_pred, test_pred_pos, test_true_pos = model(x_id_test, y_test, mask_test, pos_id_test, train_cond=False)        \n",
512 |     "# f1_, accu_, f1_pos_, accu_pos_ = etest_score(y_test, test_pred, test_pred_pos, test_true_pos)\n",
513 |     "# print(\"test: F1 Score: \",  f1_, \"Accuracy: \", accu_, \"POS F1 Score: \",  f1_pos_, \"POS Accuracy: \", accu_pos_, \"Loss: \", test_loss)"
514 |    ]
515 |   },
516 |   {
517 |    "cell_type": "code",
518 |    "execution_count": null,
519 |    "metadata": {
520 |     "collapsed": true
521 |    },
522 |    "outputs": [],
523 |    "source": []
524 |   },
525 |   {
526 |    "cell_type": "code",
527 |    "execution_count": null,
528 |    "metadata": {
529 |     "collapsed": true
530 |    },
531 |    "outputs": [],
532 |    "source": [
533 |     "saver.restore(sess, save_dir)"
534 |    ]
535 |   }
536 |  ],
537 |  "metadata": {
538 |   "kernelspec": {
539 |    "display_name": "cs771",
540 |    "language": "python",
541 |    "name": "cs771"
542 |   },
543 |   "language_info": {
544 |    "codemirror_mode": {
545 |     "name": "ipython",
546 |     "version": 3
547 |    },
548 |    "file_extension": ".py",
549 |    "mimetype": "text/x-python",
550 |    "name": "python",
551 |    "nbconvert_exporter": "python",
552 |    "pygments_lexer": "ipython3",
553 |    "version": "3.5.2"
554 |   }
555 |  },
556 |  "nbformat": 4,
557 |  "nbformat_minor": 2
558 | }
559 | 


--------------------------------------------------------------------------------
/one_million/one_million_parsing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import xml.etree.ElementTree as ET\n",
 12 |     "import numpy as np\n",
 13 |     "tree = ET.parse('semcor+omsti.data.xml')\n",
 14 |     "root = tree.getroot()"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 2,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "cor1 = root[0]\n",
 26 |     "cor2 = root[1]\n",
 27 |     "#sent = cor2.findall('text')"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 3,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "def isalphabet(word):\n",
 37 |     "    list_ = list(word)\n",
 38 |     "    if len(list_) > 1:\n",
 39 |     "        return True\n",
 40 |     "    else:\n",
 41 |     "        if word.isalpha():\n",
 42 |     "            return True\n",
 43 |     "        return False"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 50,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "name": "stdout",
 53 |      "output_type": "stream",
 54 |      "text": [
 55 |       "37176\n"
 56 |      ]
 57 |     }
 58 |    ],
 59 |    "source": [
 60 |     "train1 = []\n",
 61 |     "#soup=soup.find_all('corpus')[1]\n",
 62 |     "count=0\n",
 63 |     "for sentences in cor1.findall('text'):\n",
 64 |     "    for sentence in sentences:\n",
 65 |     "        temp_sent = []\n",
 66 |     "        temp_sent.append(sentence.get('id'))\n",
 67 |     "\n",
 68 |     "        temp_words = []\n",
 69 |     "        i_cnt=1\n",
 70 |     "        ind=[]\n",
 71 |     "        for word in sentence:\n",
 72 |     "            string = word.text.lower()        \n",
 73 |     "            if (isalphabet(string)):\n",
 74 |     "                temp_words.append(string)\n",
 75 |     "                ind.append(i_cnt)\n",
 76 |     "            i_cnt+=1\n",
 77 |     "\n",
 78 |     "        temp_sent.append(temp_words)\n",
 79 |     "        list_ = sentence.iter()\n",
 80 |     "        id_list = []\n",
 81 |     "        lemma_list = []\n",
 82 |     "        pos_list = []\n",
 83 |     "        for i in list_:\n",
 84 |     "            id_list.append(i.get('id'))\n",
 85 |     "            lemma_list.append(i.get('lemma'))\n",
 86 |     "            pos_list.append(i.get('pos'))\n",
 87 |     "\n",
 88 |     "        id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
 89 |     "        temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
 90 |     "        train1.append(temp_sent)\n",
 91 |     "        count+=1\n",
 92 |     "        \n",
 93 |     "print(count)"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 51,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "train1=train1[:len(train1)-2]"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 19,
108 |    "metadata": {},
109 |    "outputs": [
110 |     {
111 |      "name": "stdout",
112 |      "output_type": "stream",
113 |      "text": [
114 |       "813798\n"
115 |      ]
116 |     }
117 |    ],
118 |    "source": [
119 |     "train2 = []\n",
120 |     "#soup=soup.find_all('corpus')[1]\n",
121 |     "count=0\n",
122 |     "for sentences in cor2.findall('text'):\n",
123 |     "    for sentence in sentences:\n",
124 |     "        temp_sent = []\n",
125 |     "        temp_sent.append(sentence.get('id'))\n",
126 |     "\n",
127 |     "        temp_words = []\n",
128 |     "        i_cnt=1\n",
129 |     "        ind=[]\n",
130 |     "        for word in sentence:\n",
131 |     "            string = word.text.lower()        \n",
132 |     "            if (isalphabet(string)):\n",
133 |     "                temp_words.append(string)\n",
134 |     "                ind.append(i_cnt)\n",
135 |     "            i_cnt+=1\n",
136 |     "\n",
137 |     "        temp_sent.append(temp_words)\n",
138 |     "        list_ = sentence.iter()\n",
139 |     "        id_list = []\n",
140 |     "        lemma_list = []\n",
141 |     "        pos_list = []\n",
142 |     "        for i in list_:\n",
143 |     "            id_list.append(i.get('id'))\n",
144 |     "            lemma_list.append(i.get('lemma'))\n",
145 |     "            pos_list.append(i.get('pos'))\n",
146 |     "\n",
147 |     "        id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
148 |     "        temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
149 |     "        train2.append(temp_sent)\n",
150 |     "        count+=1\n",
151 |     "        \n",
152 |     "print(count)"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": 54,
158 |    "metadata": {},
159 |    "outputs": [
160 |     {
161 |      "data": {
162 |       "text/plain": [
163 |        "850972"
164 |       ]
165 |      },
166 |      "execution_count": 54,
167 |      "metadata": {},
168 |      "output_type": "execute_result"
169 |     }
170 |    ],
171 |    "source": [
172 |     "len(train1)+len(train2)"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 55,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "train=train1+train2"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 56,
187 |    "metadata": {},
188 |    "outputs": [
189 |     {
190 |      "data": {
191 |       "text/plain": [
192 |        "850972"
193 |       ]
194 |      },
195 |      "execution_count": 56,
196 |      "metadata": {},
197 |      "output_type": "execute_result"
198 |     }
199 |    ],
200 |    "source": [
201 |     "len(train)"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": 57,
207 |    "metadata": {
208 |     "collapsed": true
209 |    },
210 |    "outputs": [],
211 |    "source": [
212 |     "import pickle\n",
213 |     "with open('preprocess_train','wb') as f:\n",
214 |     "    pickle.dump(train,f)"
215 |    ]
216 |   }
217 |  ],
218 |  "metadata": {
219 |   "kernelspec": {
220 |    "display_name": "cs771",
221 |    "language": "python",
222 |    "name": "cs771"
223 |   },
224 |   "language_info": {
225 |    "codemirror_mode": {
226 |     "name": "ipython",
227 |     "version": 3
228 |    },
229 |    "file_extension": ".py",
230 |    "mimetype": "text/x-python",
231 |    "name": "python",
232 |    "nbconvert_exporter": "python",
233 |    "pygments_lexer": "ipython3",
234 |    "version": "3.5.2"
235 |   }
236 |  },
237 |  "nbformat": 4,
238 |  "nbformat_minor": 2
239 | }
240 | 


--------------------------------------------------------------------------------
/one_million/one_word_data_maker-test.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pickle\n",
 12 |     "from nltk.corpus import wordnet as wn"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "with open('/data/aviraj/dataset/raw_preprocess_test','rb') as f:\n",
 24 |     "    global_data=pickle.load(f)"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 3,
 30 |    "metadata": {
 31 |     "collapsed": true
 32 |    },
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "with open('/data/aviraj/dataset/ALL.gold.key.txt','r') as f:\n",
 36 |     "    data_key=f.readlines()"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 9,
 42 |    "metadata": {},
 43 |    "outputs": [
 44 |     {
 45 |      "data": {
 46 |       "text/plain": [
 47 |        "['the',\n",
 48 |        " 'art',\n",
 49 |        " 'of',\n",
 50 |        " 'change_ringing',\n",
 51 |        " 'be',\n",
 52 |        " 'peculiar',\n",
 53 |        " 'to',\n",
 54 |        " 'the',\n",
 55 |        " 'english',\n",
 56 |        " ',',\n",
 57 |        " 'and',\n",
 58 |        " ',',\n",
 59 |        " 'like',\n",
 60 |        " 'most',\n",
 61 |        " 'english',\n",
 62 |        " 'peculiarity',\n",
 63 |        " ',',\n",
 64 |        " 'unintelligible',\n",
 65 |        " 'to',\n",
 66 |        " 'the',\n",
 67 |        " 'rest',\n",
 68 |        " 'of',\n",
 69 |        " 'the',\n",
 70 |        " 'world',\n",
 71 |        " '.']"
 72 |       ]
 73 |      },
 74 |      "execution_count": 9,
 75 |      "metadata": {},
 76 |      "output_type": "execute_result"
 77 |     }
 78 |    ],
 79 |    "source": [
 80 |     "global_data[0][3]"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 4,
 86 |    "metadata": {
 87 |     "collapsed": true
 88 |    },
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "def make_word_data(checkword):\n",
 92 |     "    \n",
 93 |     "    dataset_line=[]\n",
 94 |     "    for i,list_ in enumerate(global_data): \n",
 95 |     "        ind=[idx for idx,it in enumerate(list_[3]) if it==checkword]\n",
 96 |     "        for ii in ind:\n",
 97 |     "            if list_[2][ii] is not None:\n",
 98 |     "                dataset_line.append([list_[2][ii],list_[1],list_[4]])\n",
 99 |     "    \n",
100 |     "    print(len(dataset_line))\n",
101 |     "    with open('/data/aviraj/dataset/checkwords/'+checkword + '_data_test', 'wb') as f:\n",
102 |     "        pickle.dump(dataset_line, f)\n",
103 |     "    with open('/data/aviraj/dataset/checkwords/'+checkword + '_data_test', 'rb') as f:\n",
104 |     "        data_ = pickle.load(f)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 5,
110 |    "metadata": {
111 |     "collapsed": true
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "test_words = ['force', 'make', 'open', 'place', 'point', 'serve', 'support']"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 6,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "name": "stdout",
125 |      "output_type": "stream",
126 |      "text": [
127 |       "1\n",
128 |       "31\n",
129 |       "4\n",
130 |       "5\n",
131 |       "11\n",
132 |       "2\n",
133 |       "12\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "for word in test_words:\n",
139 |     "    make_word_data(word)"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {
146 |     "collapsed": true
147 |    },
148 |    "outputs": [],
149 |    "source": [
150 |     "with open('../Glove/vocab_glove', 'rb') as f:\n",
151 |     "    vocab = pickle.load(f)\n"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {
158 |     "collapsed": true
159 |    },
160 |    "outputs": [],
161 |    "source": [
162 |     "train_words = []\n",
163 |     "for sent in global_data:\n",
164 |     "    train_words.extend(sent[1])"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": null,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": [
173 |     "len(train_words), len(set(train_words)), len(vocab)"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {
180 |     "collapsed": true
181 |    },
182 |    "outputs": [],
183 |    "source": [
184 |     "import collections\n",
185 |     "unknown_words = []\n",
186 |     "for word in set(train_words):\n",
187 |     "    if word not in vocab:\n",
188 |     "        unknown_words.append(word)\n",
189 |     "        \n",
190 |     "un_counter = collections.Counter(unknown_words)\n",
191 |     "un_counter = dict(un_counter)\n",
192 |     "\n",
193 |     "sorted_un_counter = sorted(un_counter.items(), key=lambda x:x[1], reverse=True)\n",
194 |     "sorted_un_counter"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {
201 |     "collapsed": true
202 |    },
203 |    "outputs": [],
204 |    "source": [
205 |     "with open('million_unknown_words.pickle', 'wb') as f:\n",
206 |     "    pickle.dump(unknown_words, f)"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": [
215 |     "len(sorted(global_data, key=lambda x:len(x[1]), reverse=True)[0][1])"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": null,
221 |    "metadata": {
222 |     "collapsed": true
223 |    },
224 |    "outputs": [],
225 |    "source": []
226 |   }
227 |  ],
228 |  "metadata": {
229 |   "kernelspec": {
230 |    "display_name": "envs",
231 |    "language": "python",
232 |    "name": "cs771"
233 |   },
234 |   "language_info": {
235 |    "codemirror_mode": {
236 |     "name": "ipython",
237 |     "version": 3
238 |    },
239 |    "file_extension": ".py",
240 |    "mimetype": "text/x-python",
241 |    "name": "python",
242 |    "nbconvert_exporter": "python",
243 |    "pygments_lexer": "ipython3",
244 |    "version": "3.5.2"
245 |   }
246 |  },
247 |  "nbformat": 4,
248 |  "nbformat_minor": 2
249 | }
250 | 


--------------------------------------------------------------------------------
/one_million/one_word_data_maker.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pickle\n",
 12 |     "from nltk.corpus import wordnet as wn"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n",
 24 |     "    global_data=pickle.load(f)"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 3,
 30 |    "metadata": {
 31 |     "collapsed": true
 32 |    },
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "with open('/data/aviraj/dataset/semcor+omsti.gold.key.txt','r') as f:\n",
 36 |     "    data_key=f.readlines()"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 4,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "def make_word_data(checkword):\n",
 48 |     "    \n",
 49 |     "    dataset_line=[]\n",
 50 |     "    for i,list_ in enumerate(global_data): \n",
 51 |     "        ind=[idx for idx,it in enumerate(list_[3]) if it==checkword]\n",
 52 |     "        for ii in ind:\n",
 53 |     "            if list_[2][ii] is not None:\n",
 54 |     "                dataset_line.append([list_[2][ii],list_[1],list_[4]])\n",
 55 |     "    \n",
 56 |     "    print(len(dataset_line))\n",
 57 |     "    with open('/data/aviraj/dataset/checkwords/'+checkword + '_data', 'wb') as f:\n",
 58 |     "        pickle.dump(dataset_line, f)\n",
 59 |     "    with open('/data/aviraj/dataset/checkwords/'+checkword + '_data', 'rb') as f:\n",
 60 |     "        data_ = pickle.load(f)"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 5,
 66 |    "metadata": {
 67 |     "collapsed": true
 68 |    },
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "test_words = ['force', 'make', 'open', 'place', 'point', 'serve', 'support']"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {
 78 |     "collapsed": true
 79 |    },
 80 |    "outputs": [],
 81 |    "source": []
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 6,
 86 |    "metadata": {},
 87 |    "outputs": [
 88 |     {
 89 |      "name": "stdout",
 90 |      "output_type": "stream",
 91 |      "text": [
 92 |       "3723\n",
 93 |       "6593\n",
 94 |       "2922\n",
 95 |       "3569\n",
 96 |       "2855\n",
 97 |       "3462\n",
 98 |       "3489\n"
 99 |      ]
100 |     }
101 |    ],
102 |    "source": [
103 |     "for word in test_words:\n",
104 |     "    make_word_data(word)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {
111 |     "collapsed": true
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "with open('../Glove/vocab_glove', 'rb') as f:\n",
116 |     "    vocab = pickle.load(f)\n"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {
123 |     "collapsed": true
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "train_words = []\n",
128 |     "for sent in global_data:\n",
129 |     "    train_words.extend(sent[1])"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "len(train_words), len(set(train_words)), len(vocab)"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {
145 |     "collapsed": true
146 |    },
147 |    "outputs": [],
148 |    "source": [
149 |     "import collections\n",
150 |     "unknown_words = []\n",
151 |     "for word in set(train_words):\n",
152 |     "    if word not in vocab:\n",
153 |     "        unknown_words.append(word)\n",
154 |     "        \n",
155 |     "un_counter = collections.Counter(unknown_words)\n",
156 |     "un_counter = dict(un_counter)\n",
157 |     "\n",
158 |     "sorted_un_counter = sorted(un_counter.items(), key=lambda x:x[1], reverse=True)\n",
159 |     "sorted_un_counter"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {
166 |     "collapsed": true
167 |    },
168 |    "outputs": [],
169 |    "source": [
170 |     "with open('million_unknown_words.pickle', 'wb') as f:\n",
171 |     "    pickle.dump(unknown_words, f)"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": null,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "len(sorted(global_data, key=lambda x:len(x[1]), reverse=True)[0][1])"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {
187 |     "collapsed": true
188 |    },
189 |    "outputs": [],
190 |    "source": []
191 |   }
192 |  ],
193 |  "metadata": {
194 |   "kernelspec": {
195 |    "display_name": "envs",
196 |    "language": "python",
197 |    "name": "cs771"
198 |   },
199 |   "language_info": {
200 |    "codemirror_mode": {
201 |     "name": "ipython",
202 |     "version": 3
203 |    },
204 |    "file_extension": ".py",
205 |    "mimetype": "text/x-python",
206 |    "name": "python",
207 |    "nbconvert_exporter": "python",
208 |    "pygments_lexer": "ipython3",
209 |    "version": "3.5.2"
210 |   }
211 |  },
212 |  "nbformat": 4,
213 |  "nbformat_minor": 2
214 | }
215 | 


--------------------------------------------------------------------------------
/one_million/raw_one_million_parsing-test.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import xml.etree.ElementTree as ET\n",
 10 |     "import numpy as np\n",
 11 |     "tree = ET.parse('../../dataset/ALL.data.xml')\n",
 12 |     "root = tree.getroot()"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {},
 19 |    "outputs": [
 20 |     {
 21 |      "data": {
 22 |       "text/plain": [
 23 |        "<Element 'corpus' at 0x7f5c0c0ee9a8>"
 24 |       ]
 25 |      },
 26 |      "execution_count": 2,
 27 |      "metadata": {},
 28 |      "output_type": "execute_result"
 29 |     }
 30 |    ],
 31 |    "source": [
 32 |     "root"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 4,
 38 |    "metadata": {
 39 |     "collapsed": true
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "def isalphabet(word):\n",
 44 |     "    return True"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 5,
 50 |    "metadata": {},
 51 |    "outputs": [
 52 |     {
 53 |      "name": "stdout",
 54 |      "output_type": "stream",
 55 |      "text": [
 56 |       "1173\n"
 57 |      ]
 58 |     }
 59 |    ],
 60 |    "source": [
 61 |     "train1 = []\n",
 62 |     "count=0\n",
 63 |     "for sentences in root.findall('text'):\n",
 64 |     "    for sentence in sentences:\n",
 65 |     "        temp_sent = []\n",
 66 |     "        temp_sent.append(sentence.get('id'))\n",
 67 |     "\n",
 68 |     "        temp_words = []\n",
 69 |     "        i_cnt=1\n",
 70 |     "        ind=[]\n",
 71 |     "        for word in sentence:\n",
 72 |     "            string = word.text.lower()        \n",
 73 |     "            if (isalphabet(string)):\n",
 74 |     "                temp_words.append(string)\n",
 75 |     "                ind.append(i_cnt)\n",
 76 |     "            i_cnt+=1\n",
 77 |     "\n",
 78 |     "        temp_sent.append(temp_words)\n",
 79 |     "        list_ = sentence.iter()\n",
 80 |     "        id_list = []\n",
 81 |     "        lemma_list = []\n",
 82 |     "        pos_list = []\n",
 83 |     "        for i in list_:\n",
 84 |     "            id_list.append(i.get('id'))\n",
 85 |     "            lemma_list.append(i.get('lemma'))\n",
 86 |     "            pos_list.append(i.get('pos'))\n",
 87 |     "\n",
 88 |     "        id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
 89 |     "        temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
 90 |     "        train1.append(temp_sent)\n",
 91 |     "        count+=1\n",
 92 |     "        \n",
 93 |     "print(count)"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 6,
 99 |    "metadata": {},
100 |    "outputs": [
101 |     {
102 |      "data": {
103 |       "text/plain": [
104 |        "1173"
105 |       ]
106 |      },
107 |      "execution_count": 6,
108 |      "metadata": {},
109 |      "output_type": "execute_result"
110 |     }
111 |    ],
112 |    "source": [
113 |     "len(train1)"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 10,
119 |    "metadata": {
120 |     "collapsed": true
121 |    },
122 |    "outputs": [],
123 |    "source": [
124 |     "import pickle\n",
125 |     "with open('/data/aviraj/dataset/raw_preprocess_test','wb') as f:\n",
126 |     "    pickle.dump(train1,f)"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 12,
132 |    "metadata": {},
133 |    "outputs": [
134 |     {
135 |      "data": {
136 |       "text/plain": [
137 |        "['senseval2.d000.s000',\n",
138 |        " ['the',\n",
139 |        "  'art',\n",
140 |        "  'of',\n",
141 |        "  'change-ringing',\n",
142 |        "  'is',\n",
143 |        "  'peculiar',\n",
144 |        "  'to',\n",
145 |        "  'the',\n",
146 |        "  'english',\n",
147 |        "  ',',\n",
148 |        "  'and',\n",
149 |        "  ',',\n",
150 |        "  'like',\n",
151 |        "  'most',\n",
152 |        "  'english',\n",
153 |        "  'peculiarities',\n",
154 |        "  ',',\n",
155 |        "  'unintelligible',\n",
156 |        "  'to',\n",
157 |        "  'the',\n",
158 |        "  'rest',\n",
159 |        "  'of',\n",
160 |        "  'the',\n",
161 |        "  'world',\n",
162 |        "  '.'],\n",
163 |        " [None,\n",
164 |        "  'senseval2.d000.s000.t000',\n",
165 |        "  None,\n",
166 |        "  'senseval2.d000.s000.t001',\n",
167 |        "  None,\n",
168 |        "  'senseval2.d000.s000.t002',\n",
169 |        "  None,\n",
170 |        "  None,\n",
171 |        "  'senseval2.d000.s000.t003',\n",
172 |        "  None,\n",
173 |        "  None,\n",
174 |        "  None,\n",
175 |        "  None,\n",
176 |        "  'senseval2.d000.s000.t004',\n",
177 |        "  'senseval2.d000.s000.t005',\n",
178 |        "  'senseval2.d000.s000.t006',\n",
179 |        "  None,\n",
180 |        "  'senseval2.d000.s000.t007',\n",
181 |        "  None,\n",
182 |        "  None,\n",
183 |        "  'senseval2.d000.s000.t008',\n",
184 |        "  None,\n",
185 |        "  None,\n",
186 |        "  'senseval2.d000.s000.t009',\n",
187 |        "  None],\n",
188 |        " ['the',\n",
189 |        "  'art',\n",
190 |        "  'of',\n",
191 |        "  'change_ringing',\n",
192 |        "  'be',\n",
193 |        "  'peculiar',\n",
194 |        "  'to',\n",
195 |        "  'the',\n",
196 |        "  'english',\n",
197 |        "  ',',\n",
198 |        "  'and',\n",
199 |        "  ',',\n",
200 |        "  'like',\n",
201 |        "  'most',\n",
202 |        "  'english',\n",
203 |        "  'peculiarity',\n",
204 |        "  ',',\n",
205 |        "  'unintelligible',\n",
206 |        "  'to',\n",
207 |        "  'the',\n",
208 |        "  'rest',\n",
209 |        "  'of',\n",
210 |        "  'the',\n",
211 |        "  'world',\n",
212 |        "  '.'],\n",
213 |        " ['DET',\n",
214 |        "  'NOUN',\n",
215 |        "  'ADP',\n",
216 |        "  'NOUN',\n",
217 |        "  'VERB',\n",
218 |        "  'ADJ',\n",
219 |        "  'PRT',\n",
220 |        "  'DET',\n",
221 |        "  'NOUN',\n",
222 |        "  '.',\n",
223 |        "  'CONJ',\n",
224 |        "  '.',\n",
225 |        "  'ADP',\n",
226 |        "  'ADJ',\n",
227 |        "  'ADJ',\n",
228 |        "  'NOUN',\n",
229 |        "  '.',\n",
230 |        "  'ADJ',\n",
231 |        "  'PRT',\n",
232 |        "  'DET',\n",
233 |        "  'NOUN',\n",
234 |        "  'ADP',\n",
235 |        "  'DET',\n",
236 |        "  'NOUN',\n",
237 |        "  '.']]"
238 |       ]
239 |      },
240 |      "execution_count": 12,
241 |      "metadata": {},
242 |      "output_type": "execute_result"
243 |     }
244 |    ],
245 |    "source": [
246 |     "train1[0]"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": []
255 |   }
256 |  ],
257 |  "metadata": {
258 |   "kernelspec": {
259 |    "display_name": "envs",
260 |    "language": "python",
261 |    "name": "cs771"
262 |   },
263 |   "language_info": {
264 |    "codemirror_mode": {
265 |     "name": "ipython",
266 |     "version": 3
267 |    },
268 |    "file_extension": ".py",
269 |    "mimetype": "text/x-python",
270 |    "name": "python",
271 |    "nbconvert_exporter": "python",
272 |    "pygments_lexer": "ipython3",
273 |    "version": "3.5.2"
274 |   }
275 |  },
276 |  "nbformat": 4,
277 |  "nbformat_minor": 2
278 | }
279 | 


--------------------------------------------------------------------------------
/one_million/raw_one_million_parsing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import xml.etree.ElementTree as ET\n",
 10 |     "import numpy as np\n",
 11 |     "tree = ET.parse('../../dataset/semcor+omsti.data.xml')\n",
 12 |     "root = tree.getroot()"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 3,
 18 |    "metadata": {},
 19 |    "outputs": [
 20 |     {
 21 |      "data": {
 22 |       "text/plain": [
 23 |        "<Element 't' at 0x7f21585404a8>"
 24 |       ]
 25 |      },
 26 |      "execution_count": 3,
 27 |      "metadata": {},
 28 |      "output_type": "execute_result"
 29 |     }
 30 |    ],
 31 |    "source": [
 32 |     "root"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 4,
 38 |    "metadata": {
 39 |     "collapsed": true
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "cor1 = root[0]\n",
 44 |     "cor2 = root[1]"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 6,
 50 |    "metadata": {},
 51 |    "outputs": [
 52 |     {
 53 |      "data": {
 54 |       "text/plain": [
 55 |        "<Element 'corpus' at 0x7f21451f70e8>"
 56 |       ]
 57 |      },
 58 |      "execution_count": 6,
 59 |      "metadata": {},
 60 |      "output_type": "execute_result"
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "cor2"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 3,
 70 |    "metadata": {
 71 |     "collapsed": true
 72 |    },
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "def isalphabet(word):\n",
 76 |     "    return True"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 4,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "name": "stdout",
 86 |      "output_type": "stream",
 87 |      "text": [
 88 |       "37176\n"
 89 |      ]
 90 |     }
 91 |    ],
 92 |    "source": [
 93 |     "train1 = []\n",
 94 |     "#soup=soup.find_all('corpus')[1]\n",
 95 |     "count=0\n",
 96 |     "for sentences in cor1.findall('text'):\n",
 97 |     "    for sentence in sentences:\n",
 98 |     "        temp_sent = []\n",
 99 |     "        temp_sent.append(sentence.get('id'))\n",
100 |     "\n",
101 |     "        temp_words = []\n",
102 |     "        i_cnt=1\n",
103 |     "        ind=[]\n",
104 |     "        for word in sentence:\n",
105 |     "            string = word.text.lower()        \n",
106 |     "            if (isalphabet(string)):\n",
107 |     "                temp_words.append(string)\n",
108 |     "                ind.append(i_cnt)\n",
109 |     "            i_cnt+=1\n",
110 |     "\n",
111 |     "        temp_sent.append(temp_words)\n",
112 |     "        list_ = sentence.iter()\n",
113 |     "        id_list = []\n",
114 |     "        lemma_list = []\n",
115 |     "        pos_list = []\n",
116 |     "        for i in list_:\n",
117 |     "            id_list.append(i.get('id'))\n",
118 |     "            lemma_list.append(i.get('lemma'))\n",
119 |     "            pos_list.append(i.get('pos'))\n",
120 |     "\n",
121 |     "        id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
122 |     "        temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
123 |     "        train1.append(temp_sent)\n",
124 |     "        count+=1\n",
125 |     "        \n",
126 |     "print(count)"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 5,
132 |    "metadata": {
133 |     "collapsed": true
134 |    },
135 |    "outputs": [],
136 |    "source": [
137 |     "train1=train1[:len(train1)-2]"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 6,
143 |    "metadata": {},
144 |    "outputs": [
145 |     {
146 |      "name": "stdout",
147 |      "output_type": "stream",
148 |      "text": [
149 |       "813798\n"
150 |      ]
151 |     }
152 |    ],
153 |    "source": [
154 |     "train2 = []\n",
155 |     "#soup=soup.find_all('corpus')[1]\n",
156 |     "count=0\n",
157 |     "for sentences in cor2.findall('text'):\n",
158 |     "    for sentence in sentences:\n",
159 |     "        temp_sent = []\n",
160 |     "        temp_sent.append(sentence.get('id'))\n",
161 |     "\n",
162 |     "        temp_words = []\n",
163 |     "        i_cnt=1\n",
164 |     "        ind=[]\n",
165 |     "        for word in sentence:\n",
166 |     "            string = word.text.lower()        \n",
167 |     "            if (isalphabet(string)):\n",
168 |     "                temp_words.append(string)\n",
169 |     "                ind.append(i_cnt)\n",
170 |     "            i_cnt+=1\n",
171 |     "\n",
172 |     "        temp_sent.append(temp_words)\n",
173 |     "        list_ = sentence.iter()\n",
174 |     "        id_list = []\n",
175 |     "        lemma_list = []\n",
176 |     "        pos_list = []\n",
177 |     "        for i in list_:\n",
178 |     "            id_list.append(i.get('id'))\n",
179 |     "            lemma_list.append(i.get('lemma'))\n",
180 |     "            pos_list.append(i.get('pos'))\n",
181 |     "\n",
182 |     "        id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
183 |     "        temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
184 |     "        train2.append(temp_sent)\n",
185 |     "        count+=1\n",
186 |     "        \n",
187 |     "print(count)"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 7,
193 |    "metadata": {},
194 |    "outputs": [
195 |     {
196 |      "data": {
197 |       "text/plain": [
198 |        "850972"
199 |       ]
200 |      },
201 |      "execution_count": 7,
202 |      "metadata": {},
203 |      "output_type": "execute_result"
204 |     }
205 |    ],
206 |    "source": [
207 |     "len(train1)+len(train2)"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 17,
213 |    "metadata": {
214 |     "collapsed": true
215 |    },
216 |    "outputs": [],
217 |    "source": [
218 |     "train=train1+train2"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": 18,
224 |    "metadata": {},
225 |    "outputs": [
226 |     {
227 |      "data": {
228 |       "text/plain": [
229 |        "850972"
230 |       ]
231 |      },
232 |      "execution_count": 18,
233 |      "metadata": {},
234 |      "output_type": "execute_result"
235 |     }
236 |    ],
237 |    "source": [
238 |     "len(train)"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 19,
244 |    "metadata": {
245 |     "collapsed": true
246 |    },
247 |    "outputs": [],
248 |    "source": [
249 |     "import pickle\n",
250 |     "with open('raw_preprocess_train','wb') as f:\n",
251 |     "    pickle.dump(train,f)"
252 |    ]
253 |   }
254 |  ],
255 |  "metadata": {
256 |   "kernelspec": {
257 |    "display_name": "envs",
258 |    "language": "python",
259 |    "name": "cs771"
260 |   },
261 |   "language_info": {
262 |    "codemirror_mode": {
263 |     "name": "ipython",
264 |     "version": 3
265 |    },
266 |    "file_extension": ".py",
267 |    "mimetype": "text/x-python",
268 |    "name": "python",
269 |    "nbconvert_exporter": "python",
270 |    "pygments_lexer": "ipython3",
271 |    "version": "3.5.2"
272 |   }
273 |  },
274 |  "nbformat": 4,
275 |  "nbformat_minor": 2
276 | }
277 | 


--------------------------------------------------------------------------------
/papers/1603.07012.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/1603.07012.pdf


--------------------------------------------------------------------------------
/papers/9f260612d5817d542cda2a7d9a6eb18d6471.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/9f260612d5817d542cda2a7d9a6eb18d6471.pdf


--------------------------------------------------------------------------------
/papers/D17-1008.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/D17-1008.pdf


--------------------------------------------------------------------------------
/papers/K16-1006.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/K16-1006.pdf


--------------------------------------------------------------------------------
/papers/P16-1085.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/P16-1085.pdf


--------------------------------------------------------------------------------
/papers/W16-5307.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/W16-5307.pdf


--------------------------------------------------------------------------------
/papers/a10-navigli.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/a10-navigli.pdf


--------------------------------------------------------------------------------
/papers/crf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/crf.pdf


--------------------------------------------------------------------------------
/papers/report1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/report1.pdf


--------------------------------------------------------------------------------
/papers/report2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/report2.pdf


--------------------------------------------------------------------------------