├── .gitignore
├── Four Word Model
│ ├── Model-2.ipynb
│ ├── Model-3.ipynb
│ ├── Model-4.ipynb
│ ├── Model-5.ipynb
│ ├── Model-6.ipynb
│ ├── Model-7.ipynb
│ ├── Model-8.ipynb
│ ├── Model-9.ipynb
│ ├── Model.ipynb
│ ├── Preprocess_Files
│ │ ├── hard
│ │ │ ├── sense
│ │ │ └── sent
│ │ ├── interest
│ │ │ ├── sense
│ │ │ └── sent
│ │ ├── line
│ │ │ ├── sense
│ │ │ └── sent
│ │ └── serve
│ │   ├── sense
│ │   └── sent
│ ├── Senses.txt
│ ├── Sentences.txt
│ ├── final_preprocessing.ipynb
│ ├── full_train.pickle
│ ├── initial_processing.ipynb
│ ├── robsr_model.ipynb
│ ├── train.pickle
│ ├── vocab_overlap_analysis.ipynb
│ └── words_not_in_vocab.pickle
├── LICENSE
├── README.md
├── UGP_Report.pdf
├── UGP_presentation.pdf
├── models_diagram
│ ├── all-word-1.png
│ ├── all-word-2.png
│ ├── all-word-3.png
│ ├── all-word-4.png
│ ├── all-word-5.png
│ ├── all-word-6.png
│ ├── all-word-7.png
│ ├── all-word-8.png
│ ├── model-1.png
│ ├── model-2.png
│ ├── model-3.png
│ └── model-4.png
├── one_million
│ ├── One-Million All-Word Data Sampling Coarse.ipynb
│ ├── One-Million All-Word Data Sampling-Fine.ipynb
│ ├── One-Million All-Word Data-hierarchical Sampling-Fine.ipynb
│ ├── One-Million All-Word Data-seq.ipynb
│ ├── Sense-test.ipynb
│ ├── Sense.ipynb
│ ├── all-word-model
│ ├── all-word
│ │ ├── Model-aw-1-multigpu-1.ipynb
│ │ ├── Model-aw-1-multigpu-2.ipynb
│ │ ├── Model-aw-1-multigpu-3.ipynb
│ │ ├── Model-aw-3-1.ipynb
│ │ ├── Model-aw-3.ipynb
│ │ ├── Model-aw-4-1.ipynb
│ │ ├── Model-aw-lex-1.2.ipynb
│ │ ├── Model-aw-lex-1.3.ipynb
│ │ ├── Model-aw-lex-1.4.ipynb
│ │ ├── Model-aw-lex-1.ipynb
│ │ ├── Model-aw-lex-2.2.ipynb
│ │ ├── Model-aw-lex-hierarchical-1.ipynb
│ │ ├── Model-aw-lex-hierarchical-2.ipynb
│ │ ├── Model-aw-lex-hierarchical-3.ipynb
│ │ ├── Model-aw-lex-hierarchical-4.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v1.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-1.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-2.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-3.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-4.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-5.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-6.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-7.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-8.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v2-9.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v3-1.ipynb
│ │ ├── Model-aw-lex-local_attention-fast-v4-1.ipynb
│ │ ├── Model-aw-lex-local_attention-slow-1.ipynb
│ │ ├── Model-aw-lex-local_attention-slow-2.ipynb
│ │ ├── Model-aw-lex-seq-hierarchical-1.ipynb
│ │ ├── Model-aw-lex-seq-hierarchical-2.ipynb
│ │ ├── Model-aw-sense-1.ipynb
│ │ └── Readme.md
│ ├── force
│ │ ├── Force-Model-1-multigpu-1.ipynb
│ │ ├── Force-Model-1-multigpu-2.ipynb
│ │ ├── Force-Model-1-multigpu-3.ipynb
│ │ ├── Force-Model-1.ipynb
│ │ ├── Force-Model-2-multigpu-1.ipynb
│ │ ├── Force-Model-2.ipynb
│ │ ├── Force-Model-3-multigpu-1.ipynb
│ │ ├── Force-Model-3.ipynb
│ │ ├── Force-Model-4-multigpu-1.ipynb
│ │ ├── Force-Model-4.ipynb
│ │ └── Force-Model-5.ipynb
│ ├── make
│ │ ├── Make-Model-1-multigpu-1.ipynb
│ │ ├── Make-Model-1.ipynb
│ │ ├── Make-Model-2-multigpu-1.ipynb
│ │ ├── Make-Model-2.ipynb
│ │ ├── Make-Model-3-1.ipynb
│ │ ├── Make-Model-3-2.ipynb
│ │ ├── Make-Model-3-3.ipynb
│ │ ├── Make-Model-3-multigpu-1.ipynb
│ │ └── Make-Model-3.ipynb
│ ├── one_million_parsing.ipynb
│ ├── one_word_data_maker-test.ipynb
│ ├── one_word_data_maker.ipynb
│ ├── open
│ │ ├── Open-Model-1-multigpu-1.ipynb
│ │ ├── Open-Model-2-multigpu-1.ipynb
│ │ ├── Open-Model-3-multigpu-1.ipynb
│ │ ├── Open-Model-3.ipynb
│ │ ├── Open-Model-4-multigpu-1.ipynb
│ │ └── Open-Model-4.ipynb
│ ├── place
│ │ ├── Place-Model-1-multigpu-1.ipynb
│ │ ├── Place-Model-2-multigpu-1.ipynb
│ │ ├── Place-Model-2.ipynb
│ │ ├── Place-Model-3-multigpu-1.ipynb
│ │ ├── Place-Model-3.ipynb
│ │ ├── Place-Model-4-multigpu-1.ipynb
│ │ ├── Place-Model-4.ipynb
│ │ └── Place-Model-6.ipynb
│ ├── point
│ │ ├── Point-Model-1-multigpu-1.ipynb
│ │ ├── Point-Model-2-multigpu-1.ipynb
│ │ ├── Point-Model-2.ipynb
│ │ ├── Point-Model-3-multigpu-1.ipynb
│ │ ├── Point-Model-3.ipynb
│ │ ├── Point-Model-4-multigpu-1.ipynb
│ │ └── Point-Model-4.ipynb
│ ├── raw_one_million_parsing-test.ipynb
│ ├── raw_one_million_parsing.ipynb
│ ├── serve
│ │ ├── Serve-Model-1-multigpu-2.ipynb
│ │ ├── Serve-Model-1.ipynb
│ │ ├── Serve-Model-2.ipynb
│ │ └── Serve-Model-3.ipynb
│ └── support
│   ├── Support-Model-1-multigpu-1.ipynb
│   ├── Support-Model-2-multigpu-1.ipynb
│   ├── Support-Model-3-multigpu-1.ipynb
│   ├── Support-Model-3.ipynb
│   ├── Support-Model-4-multigpu-1.ipynb
│   ├── Support-Model-4.ipynb
│   └── Support-Model-5.ipynb
└── papers
  ├── 1603.07012.pdf
  ├── 9f260612d5817d542cda2a7d9a6eb18d6471.pdf
  ├── D17-1008.pdf
  ├── K16-1006.pdf
  ├── P16-1085.pdf
  ├── W16-5307.pdf
  ├── a10-navigli.pdf
  ├── crf.pdf
  ├── report1.pdf
  └── report2.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | rushab/
2 | dataset/
3 | Glove/
4 | glove/
5 | Four Word Model/.ipynb_checkpoints/
6 | data/
7 | .ipynb_checkpoints/
8 | Four Word Model/output
9 | one_million/output
10 | *.pickle
11 | papers/
--------------------------------------------------------------------------------
/Four Word Model/Model-2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "tf.logging.set_verbosity(tf.logging.WARN)\n",
13 | "import pickle\n",
14 | "import numpy as np\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import f1_score\n",
18 | "from sklearn.metrics import accuracy_score\n",
19 | "import os\n",
20 | "from tensorflow.python.client import device_lib"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {
27 | "collapsed": true
28 | },
29 | "outputs": [],
30 | "source": [
31 | "f = open('../Glove/word_embedding_glove', 'rb')\n",
32 | "word_embedding = pickle.load(f)\n",
33 | "f.close()\n",
34 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
35 | "\n",
36 | "f = open('../Glove/vocab_glove', 'rb')\n",
37 | "vocab = pickle.load(f)\n",
38 | "f.close()\n",
39 | "\n",
40 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
41 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
42 | "\n",
43 | "unknown_token = \"UNKNOWN_TOKEN\"\n",
44 | "\n",
45 | "f = open(\"train.pickle\", 'rb')\n",
46 | "full_data = pickle.load(f)\n",
47 | "f.close()"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "# Model Description\n",
59 | "sense_word = 'hard'\n",
60 | "model_name = 'model-2'\n",
61 | "model_dir = 'output/' + sense_word + '/' + model_name\n",
62 | "save_dir = os.path.join(model_dir, \"save/\")\n",
63 | "log_dir = os.path.join(model_dir, \"log\")\n",
64 | "\n",
65 | "if not os.path.exists(model_dir):\n",
66 | " os.mkdir(model_dir)\n",
67 | "if not os.path.exists(save_dir):\n",
68 | " os.mkdir(save_dir)\n",
69 | "if not os.path.exists(log_dir):\n",
70 | " os.mkdir(log_dir)"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "metadata": {
77 | "collapsed": true
78 | },
79 | "outputs": [],
80 | "source": [
81 | "# Parameters\n",
82 | "mode = 'train'\n",
83 | "num_senses = 3\n",
84 | "batch_size = 64\n",
85 | "vocab_size = len(vocab)\n",
86 | "unk_vocab_size = 1\n",
87 | "word_emb_size = len(word_embedding[0])\n",
88 | "max_sent_size = 200\n",
89 | "hidden_size = 100\n",
90 | "keep_prob = 0.5\n",
91 | "l2_lambda = 0.001\n",
92 | "init_lr = 0.001\n",
93 | "decay_steps = 500\n",
94 | "decay_rate = 0.96\n",
95 | "clip_norm = 1\n",
96 | "clipping = True"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 5,
102 | "metadata": {
103 | "collapsed": true
104 | },
105 | "outputs": [],
106 | "source": [
107 | "# MODEL\n",
108 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
109 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
110 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
111 | "is_train = tf.placeholder('bool', [], name='is_train')\n",
112 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
113 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
114 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
115 | "\n",
116 | "with tf.name_scope(\"word_embedding\"):\n",
117 | " if mode == 'train':\n",
118 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
119 | " else:\n",
120 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
121 | " \n",
122 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
123 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n",
124 | "\n",
125 | "with tf.variable_scope(\"lstm1\"):\n",
126 | " cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
127 | " cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
128 | "\n",
129 | " d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n",
130 | " d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n",
131 | " \n",
132 | " (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n",
133 | " h1 = tf.concat([fw_h1, bw_h1], 2)\n",
134 | " \n",
135 | "with tf.variable_scope(\"lstm2\"):\n",
136 | " cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
137 | " cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
138 | "\n",
139 | " d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n",
140 | " d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n",
141 | " \n",
142 | " (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n",
143 | " h = tf.concat([fw_h2, bw_h2], 2)\n",
144 | "\n",
145 | "def attention(input_x, input_mask, W_att):\n",
146 | " h_masked = tf.boolean_mask(input_x, input_mask)\n",
147 | " h_tanh = tf.tanh(h_masked)\n",
148 | " u = tf.matmul(h_tanh, W_att)\n",
149 | " a = tf.nn.softmax(u)\n",
150 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n",
151 | " return c\n",
152 | "\n",
153 | "with tf.variable_scope(\"attention\"):\n",
154 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n",
155 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
156 | " for i in range(1, batch_size):\n",
157 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
158 | " \n",
159 | "with tf.variable_scope(\"softmax_layer\"):\n",
160 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n",
161 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
162 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n",
163 | " logits = tf.matmul(drop_c, W) + b\n",
164 | " predictions = tf.argmax(logits, 1)\n",
165 | "\n",
166 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n",
167 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
168 | "\n",
169 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
170 | "\n",
171 | "tv_all = tf.trainable_variables()\n",
172 | "tv_regu =[]\n",
173 | "for t in tv_all:\n",
174 | " if t.name.find('b:')==-1:\n",
175 | " tv_regu.append(t)\n",
176 | " \n",
177 | "# l2 Loss\n",
178 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
179 | "\n",
180 | "total_loss = loss + l2_loss\n",
181 | "\n",
182 | "# Optimizer for loss\n",
183 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
184 | "\n",
185 | "# Gradients and Variables for Loss\n",
186 | "grads_vars = optimizer.compute_gradients(total_loss)\n",
187 | "\n",
188 | "# Clipping of Gradients\n",
189 | "clipped_grads = grads_vars\n",
190 | "if(clipping == True):\n",
191 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
192 | "\n",
193 | "# Training Optimizer for Total Loss\n",
194 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
195 | "\n",
196 | "# Summaries\n",
197 | "var_summaries = []\n",
198 | "for v in tv_all:\n",
199 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
200 | " var_summaries.append(var_summary)\n",
201 | "\n",
202 | "var_summaries_merged = tf.summary.merge(var_summaries)\n",
203 | "\n",
204 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
205 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
206 | "summary = tf.summary.merge_all()"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 6,
212 | "metadata": {
213 | "collapsed": true
214 | },
215 | "outputs": [],
216 | "source": [
217 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n",
218 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"3\"\n",
219 | "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
220 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n",
221 | "saver = tf.train.Saver() # For Saving the model\n",
222 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 7,
228 | "metadata": {
229 | "scrolled": true
230 | },
231 | "outputs": [
232 | {
233 | "name": "stderr",
234 | "output_type": "stream",
235 | "text": [
236 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
237 | " FutureWarning)\n"
238 | ]
239 | }
240 | ],
241 | "source": [
242 | "# Splitting\n",
243 | "data_x = full_data[sense_word][0]\n",
244 | "data_y = full_data[sense_word][2]\n",
245 | "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n",
246 | "\n",
247 | "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 8,
253 | "metadata": {
254 | "collapsed": true
255 | },
256 | "outputs": [],
257 | "source": [
258 | "def data_prepare(x):\n",
259 | " num_examples = len(x)\n",
260 | "\n",
261 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
262 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
263 | "\n",
264 | " for j in range(num_examples):\n",
265 | " for i in range(max_sent_size):\n",
266 | " if(i>=len(x[j])):\n",
267 | " break\n",
268 | " w = x[j][i]\n",
269 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
270 | " xx_mask[j][i] = True\n",
271 | " \n",
272 | " return xx, xx_mask\n",
273 | "\n",
274 | "def eval_score(yy, pred):\n",
275 | " num_batches = int(len(yy)/batch_size)\n",
276 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
277 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
278 | " return f1*100, accu*100\n",
279 | "\n",
280 | "def model(xx, yy, mask, train_cond=True):\n",
281 | " num_batches = int(len(xx)/batch_size)\n",
282 | " losses = 0\n",
283 | " preds = []\n",
284 | " for j in range(num_batches): \n",
285 | " \n",
286 | " s = j * batch_size\n",
287 | " e = (j+1) * batch_size\n",
288 | " \n",
289 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
290 | " \n",
291 | " \n",
292 | " if(train_cond==True):\n",
293 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
294 | " summary_writer.add_summary(_summary, step) \n",
295 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
296 | "\n",
297 | " else:\n",
298 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
299 | " preds.append(pred)\n",
300 | " \n",
301 | " losses +=_loss\n",
302 | "\n",
303 | " if(train_cond==False):\n",
304 | " y_pred = []\n",
305 | " for i in range(num_batches):\n",
306 | " for pred in preds[i]:\n",
307 | " y_pred.append(pred)\n",
308 | " return losses/num_batches, y_pred\n",
309 | " \n",
310 | " return losses/num_batches, step"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 9,
316 | "metadata": {
317 | "collapsed": true
318 | },
319 | "outputs": [],
320 | "source": [
321 | "x_id_train, mask_train = data_prepare(x_train)\n",
322 | "x_id_val, mask_val = data_prepare(x_val)\n",
323 | "x_id_test, mask_test = data_prepare(x_test)\n",
324 | "y_train = np.array(y_train)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 10,
330 | "metadata": {
331 | "scrolled": true
332 | },
333 | "outputs": [
334 | {
335 | "name": "stdout",
336 | "output_type": "stream",
337 | "text": [
338 | "Epoch: 1 Step: 48 loss: 1.86646759758\n",
339 | "Epoch: 2 Step: 96 loss: 1.21714039147\n",
340 | "Epoch: 3 Step: 144 loss: 1.08560919886\n",
341 | "Epoch: 4 Step: 192 loss: 0.980009039243\n",
342 | "Epoch: 5 Step: 240 loss: 0.881924713651\n",
343 | "Saved Model Complete\n",
344 | "Train: F1 Score: 60.3386654855 Accuracy: 83.5611979167 Loss: 0.80800242722\n",
345 | "Val: F1 Score: 54.6225701167 Accuracy: 82.8125 Loss: 0.770564937592\n",
346 | "Epoch: 6 Step: 288 loss: 0.806718610227\n",
347 | "Epoch: 7 Step: 336 loss: 0.742333145191\n",
348 | "Epoch: 8 Step: 384 loss: 0.699159173295\n",
349 | "Epoch: 9 Step: 432 loss: 0.681758804868\n",
350 | "Epoch: 10 Step: 480 loss: 0.631260214373\n",
351 | "Saved Model Complete\n",
352 | "Train: F1 Score: 62.415876497 Accuracy: 85.64453125 Loss: 0.609668933476\n",
353 | "Val: F1 Score: 63.0309748731 Accuracy: 87.1875 Loss: 0.603685164452\n",
354 | "Epoch: 11 Step: 528 loss: 0.621730036413\n",
355 | "Epoch: 12 Step: 576 loss: 0.593547300746\n",
356 | "Epoch: 13 Step: 624 loss: 0.567168306559\n",
357 | "Epoch: 14 Step: 672 loss: 0.572736630837\n",
358 | "Epoch: 15 Step: 720 loss: 0.52119900162\n",
359 | "Saved Model Complete\n",
360 | "Train: F1 Score: 73.2345182784 Accuracy: 87.59765625 Loss: 0.500816229731\n",
361 | "Val: F1 Score: 71.3846572025 Accuracy: 88.125 Loss: 0.507379829884\n",
362 | "Epoch: 16 Step: 768 loss: 0.518757795294\n",
363 | "Epoch: 17 Step: 816 loss: 0.508907252923\n",
364 | "Epoch: 18 Step: 864 loss: 0.480370514716\n",
365 | "Epoch: 19 Step: 912 loss: 0.481487047548\n",
366 | "Epoch: 20 Step: 960 loss: 0.483874622112\n",
367 | "Saved Model Complete\n",
368 | "Train: F1 Score: 72.4541504438 Accuracy: 88.57421875 Loss: 0.454483479882\n",
369 | "Val: F1 Score: 69.7799159478 Accuracy: 88.4375 Loss: 0.505139875412\n",
370 | "Epoch: 21 Step: 1008 loss: 0.445587230225\n",
371 | "Epoch: 22 Step: 1056 loss: 0.448845259845\n",
372 | "Epoch: 23 Step: 1104 loss: 0.418395101403\n",
373 | "Epoch: 24 Step: 1152 loss: 0.42787179475\n",
374 | "Epoch: 25 Step: 1200 loss: 0.41220224835\n",
375 | "Saved Model Complete\n",
376 | "Train: F1 Score: 79.7367544121 Accuracy: 90.5598958333 Loss: 0.387414715563\n",
377 | "Val: F1 Score: 80.5119717533 Accuracy: 91.25 Loss: 0.428414440155\n",
378 | "Epoch: 26 Step: 1248 loss: 0.398100319629\n",
379 | "Epoch: 27 Step: 1296 loss: 0.401642986884\n",
380 | "Epoch: 28 Step: 1344 loss: 0.380077781156\n",
381 | "Epoch: 29 Step: 1392 loss: 0.371819969267\n",
382 | "Epoch: 30 Step: 1440 loss: 0.375808695642\n",
383 | "Saved Model Complete\n",
384 | "Train: F1 Score: 82.6141307319 Accuracy: 91.1458333333 Loss: 0.374826697633\n",
385 | "Val: F1 Score: 72.6194736328 Accuracy: 87.5 Loss: 0.443939989805\n",
386 | "Epoch: 31 Step: 1488 loss: 0.368128724086\n",
387 | "Epoch: 32 Step: 1536 loss: 0.363611215415\n",
388 | "Epoch: 33 Step: 1584 loss: 0.370647774388\n",
389 | "Epoch: 34 Step: 1632 loss: 0.368405311989\n",
390 | "Epoch: 35 Step: 1680 loss: 0.349992937719\n",
391 | "Saved Model Complete\n",
392 | "Train: F1 Score: 81.482253082 Accuracy: 91.6666666667 Loss: 0.36779523051\n",
393 | "Val: F1 Score: 76.2094695081 Accuracy: 89.6875 Loss: 0.484789025784\n",
394 | "Epoch: 36 Step: 1728 loss: 0.347480880097\n",
395 | "Epoch: 37 Step: 1776 loss: 0.344036137685\n",
396 | "Epoch: 38 Step: 1824 loss: 0.329046547723\n",
397 | "Epoch: 39 Step: 1872 loss: 0.308786494968\n",
398 | "Epoch: 40 Step: 1920 loss: 0.335401780282\n",
399 | "Saved Model Complete\n",
400 | "Train: F1 Score: 88.5588616245 Accuracy: 94.53125 Loss: 0.291247650981\n",
401 | "Val: F1 Score: 84.105797863 Accuracy: 92.5 Loss: 0.359305435419\n",
402 | "Epoch: 41 Step: 1968 loss: 0.332291507783\n",
403 | "Epoch: 42 Step: 2016 loss: 0.314355407842\n",
404 | "Epoch: 43 Step: 2064 loss: 0.319293403377\n",
405 | "Epoch: 44 Step: 2112 loss: 0.297154735463\n",
406 | "Epoch: 45 Step: 2160 loss: 0.305809813552\n",
407 | "Saved Model Complete\n",
408 | "Train: F1 Score: 88.5833478857 Accuracy: 94.5963541667 Loss: 0.283582553578\n",
409 | "Val: F1 Score: 78.8451959418 Accuracy: 90.625 Loss: 0.493378305435\n",
410 | "Epoch: 46 Step: 2208 loss: 0.28896213385\n",
411 | "Epoch: 47 Step: 2256 loss: 0.299109598001\n",
412 | "Epoch: 48 Step: 2304 loss: 0.285256354449\n",
413 | "Epoch: 49 Step: 2352 loss: 0.293783533076\n",
414 | "Epoch: 50 Step: 2400 loss: 0.288317035573\n",
415 | "Saved Model Complete\n",
416 | "Train: F1 Score: 89.4559800481 Accuracy: 94.8567708333 Loss: 0.271899814407\n",
417 | "Val: F1 Score: 78.6662686459 Accuracy: 89.375 Loss: 0.450355643034\n",
418 | "Test: F1 Score: 78.1099629833 Accuracy: 89.7836538462 Loss: 0.494133715446\n"
419 | ]
420 | }
421 | ],
422 | "source": [
423 | "num_epochs = 50\n",
424 | "\n",
425 | "for i in range(num_epochs):\n",
426 | " \n",
427 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
428 | " x_id_train = x_id_train[random]\n",
429 | " y_train = y_train[random]\n",
430 | " mask_train = mask_train[random]\n",
431 | " \n",
432 | " losses, step = model(x_id_train, y_train, mask_train)\n",
433 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
434 | " \n",
435 | " if((i+1)%5==0):\n",
436 | " saver.save(sess, save_path=save_dir) \n",
437 | " print(\"Saved Model Complete\")\n",
438 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
439 | " f1_, accu_ = eval_score(y_train, train_pred)\n",
440 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n",
441 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
442 | " f1_, accu_ = eval_score(y_val, val_pred)\n",
443 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
444 | " \n",
445 | "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n",
446 | "f1_, accu_ = eval_score(y_test, test_pred)\n",
447 | "print(\"Test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {
454 | "collapsed": true
455 | },
456 | "outputs": [],
457 | "source": []
458 | },
459 | {
460 | "cell_type": "code",
461 | "execution_count": null,
462 | "metadata": {
463 | "collapsed": true
464 | },
465 | "outputs": [],
466 | "source": [
467 | "saver.restore(sess, save_dir)"
468 | ]
469 | }
470 | ],
471 | "metadata": {
472 | "kernelspec": {
473 | "display_name": "cs771",
474 | "language": "python",
475 | "name": "cs771"
476 | },
477 | "language_info": {
478 | "codemirror_mode": {
479 | "name": "ipython",
480 | "version": 3
481 | },
482 | "file_extension": ".py",
483 | "mimetype": "text/x-python",
484 | "name": "python",
485 | "nbconvert_exporter": "python",
486 | "pygments_lexer": "ipython3",
487 | "version": "3.5.2"
488 | }
489 | },
490 | "nbformat": 4,
491 | "nbformat_minor": 2
492 | }
493 |
--------------------------------------------------------------------------------
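Note on Model-2.ipynb above: its attention block masks out the padded time steps of the stacked BiLSTM output, scores each remaining position against the learned vector W_att, and collapses the sentence into a single context vector that feeds the softmax classifier; the notebook applies this per example in a Python loop and concatenates the per-example context vectors back into a batch. The following standalone NumPy sketch (illustrative only, not code from the repository) spells out that computation for one sentence, with the softmax written explicitly over sentence positions as presumably intended:

import numpy as np

def attention_numpy(h, mask, W_att):
    # h:     [max_sent_size, 2*hidden_size] BiLSTM outputs for one sentence
    # mask:  [max_sent_size] boolean, True at real (non-padded) tokens
    # W_att: [2*hidden_size, 1] learned scoring vector
    h_masked = h[mask]                            # drop padded positions
    h_tanh = np.tanh(h_masked)
    scores = h_tanh @ W_att                       # one score per kept position
    scores = scores - scores.max()                # for numerical stability
    a = np.exp(scores) / np.exp(scores).sum()     # softmax over positions
    return (h_tanh * a).sum(axis=0)               # context vector, [2*hidden_size]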
/Four Word Model/Model-3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "tf.logging.set_verbosity(tf.logging.WARN)\n",
13 | "import pickle\n",
14 | "import numpy as np\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import f1_score\n",
18 | "from sklearn.metrics import accuracy_score\n",
19 | "import os\n",
20 | "from tensorflow.python.client import device_lib\n",
21 | "from collections import Counter"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {
28 | "collapsed": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "f = open('../Glove/word_embedding_glove', 'rb')\n",
33 | "word_embedding = pickle.load(f)\n",
34 | "f.close()\n",
35 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
36 | "\n",
37 | "f = open('../Glove/vocab_glove', 'rb')\n",
38 | "vocab = pickle.load(f)\n",
39 | "f.close()\n",
40 | "\n",
41 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
42 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
43 | "\n",
44 | "unknown_token = \"UNKNOWN_TOKEN\"\n",
45 | "\n",
46 | "f = open(\"train.pickle\", 'rb')\n",
47 | "full_data = pickle.load(f)\n",
48 | "f.close()"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {
55 | "collapsed": true
56 | },
57 | "outputs": [],
58 | "source": [
59 | "# Model Description\n",
60 | "sense_word = 'hard'\n",
61 | "model_name = 'model-3'\n",
62 | "model_dir = 'output/' + sense_word + '/' + model_name\n",
63 | "save_dir = os.path.join(model_dir, \"save/\")\n",
64 | "log_dir = os.path.join(model_dir, \"log\")\n",
65 | "\n",
66 | "if not os.path.exists(model_dir):\n",
67 | " os.mkdir(model_dir)\n",
68 | "if not os.path.exists(save_dir):\n",
69 | " os.mkdir(save_dir)\n",
70 | "if not os.path.exists(log_dir):\n",
71 | " os.mkdir(log_dir)"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "name": "stdout",
81 | "output_type": "stream",
82 | "text": [
83 | "Counter({'HARD1': 3455, 'HARD2': 502, 'HARD3': 376})\n",
84 | "[ 1.21578586 5.30486965 5.47934437]\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "sense_counts = Counter(full_data[sense_word][1])\n",
90 | "print(sense_counts)\n",
91 | "total_count = len(full_data[sense_word][1])\n",
92 | "sort_sense_counts = sense_counts.most_common()\n",
93 | "vocab_sense = [k for k,v in sort_sense_counts]\n",
94 | "freq_sense = [v for k,v in sort_sense_counts]\n",
95 | "weights = np.multiply(6, [1 - count/total_count for count in freq_sense])\n",
96 | "weights = weights.astype(np.float32)\n",
97 | "print(weights)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 5,
103 | "metadata": {
104 | "collapsed": true
105 | },
106 | "outputs": [],
107 | "source": [
108 | "# Parameters\n",
109 | "mode = 'train'\n",
110 | "num_senses = 3\n",
111 | "batch_size = 64\n",
112 | "vocab_size = len(vocab)\n",
113 | "unk_vocab_size = 1\n",
114 | "word_emb_size = len(word_embedding[0])\n",
115 | "max_sent_size = 200\n",
116 | "hidden_size = 100\n",
117 | "keep_prob = 0.5\n",
118 | "l2_lambda = 0.002\n",
119 | "init_lr = 0.005\n",
120 | "decay_steps = 500\n",
121 | "decay_rate = 0.96\n",
122 | "clip_norm = 1\n",
123 | "clipping = True"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 6,
129 | "metadata": {
130 | "collapsed": true
131 | },
132 | "outputs": [],
133 | "source": [
134 | "# MODEL\n",
135 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
136 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
137 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
138 | "is_train = tf.placeholder('bool', [], name='is_train')\n",
139 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
140 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
141 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
142 | "\n",
143 | "with tf.name_scope(\"word_embedding\"):\n",
144 | " if mode == 'train':\n",
145 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
146 | " else:\n",
147 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
148 | " \n",
149 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
150 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n",
151 | "\n",
152 | "with tf.variable_scope(\"lstm1\"):\n",
153 | " cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
154 | " cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
155 | "\n",
156 | " d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n",
157 | " d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n",
158 | " \n",
159 | " (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n",
160 | " h1 = tf.concat([fw_h1, bw_h1], 2)\n",
161 | " \n",
162 | "with tf.variable_scope(\"lstm2\"):\n",
163 | " cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
164 | " cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
165 | "\n",
166 | " d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n",
167 | " d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n",
168 | " \n",
169 | " (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n",
170 | " h = tf.concat([fw_h2, bw_h2], 2)\n",
171 | "\n",
172 | "def attention(input_x, input_mask, W_att):\n",
173 | " h_masked = tf.boolean_mask(input_x, input_mask)\n",
174 | " h_tanh = tf.tanh(h_masked)\n",
175 | " u = tf.matmul(h_tanh, W_att)\n",
176 | " a = tf.nn.softmax(u)\n",
177 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n",
178 | " return c\n",
179 | "\n",
180 | "with tf.variable_scope(\"attention\"):\n",
181 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n",
182 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
183 | " for i in range(1, batch_size):\n",
184 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
185 | " \n",
186 | "with tf.variable_scope(\"softmax_layer\"):\n",
187 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n",
188 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
189 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n",
190 | " logits = tf.matmul(drop_c, W) + b\n",
191 | " predictions = tf.argmax(logits, 1)\n",
192 | "\n",
193 | "class_weight = tf.constant(weights)\n",
194 | "weighted_logits = logits * class_weight\n",
195 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=weighted_logits, labels=y))\n",
196 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
197 | "\n",
198 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
199 | "\n",
200 | "tv_all = tf.trainable_variables()\n",
201 | "tv_regu =[]\n",
202 | "for t in tv_all:\n",
203 | " if t.name.find('b:')==-1:\n",
204 | " tv_regu.append(t)\n",
205 | " \n",
206 | "# l2 Loss\n",
207 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
208 | "\n",
209 | "total_loss = loss + l2_loss\n",
210 | "\n",
211 | "# Optimizer for loss\n",
212 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
213 | "\n",
214 | "# Gradients and Variables for Loss\n",
215 | "grads_vars = optimizer.compute_gradients(total_loss)\n",
216 | "\n",
217 | "# Clipping of Gradients\n",
218 | "clipped_grads = grads_vars\n",
219 | "if(clipping == True):\n",
220 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
221 | "\n",
222 | "# Training Optimizer for Total Loss\n",
223 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
224 | "\n",
225 | "# Summaries\n",
226 | "var_summaries = []\n",
227 | "for v in tv_all:\n",
228 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
229 | " var_summaries.append(var_summary)\n",
230 | "\n",
231 | "var_summaries_merged = tf.summary.merge(var_summaries)\n",
232 | "\n",
233 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
234 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
235 | "summary = tf.summary.merge_all()"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 7,
241 | "metadata": {
242 | "collapsed": true
243 | },
244 | "outputs": [],
245 | "source": [
246 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n",
247 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n",
248 | "config = tf.ConfigProto()\n",
249 | "config.gpu_options.allow_growth = True\n",
250 | "sess = tf.Session(config=config)\n",
251 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n",
252 | "saver = tf.train.Saver() # For Saving the model\n",
253 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 8,
259 | "metadata": {
260 | "scrolled": true
261 | },
262 | "outputs": [
263 | {
264 | "name": "stderr",
265 | "output_type": "stream",
266 | "text": [
267 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
268 | " FutureWarning)\n"
269 | ]
270 | }
271 | ],
272 | "source": [
273 | "# Splitting\n",
274 | "data_x = full_data[sense_word][0]\n",
275 | "data_y = full_data[sense_word][2]\n",
276 | "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n",
277 | "\n",
278 | "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 9,
284 | "metadata": {
285 | "collapsed": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "def data_prepare(x):\n",
290 | " num_examples = len(x)\n",
291 | "\n",
292 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
293 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
294 | "\n",
295 | " for j in range(num_examples):\n",
296 | " for i in range(max_sent_size):\n",
297 | " if(i>=len(x[j])):\n",
298 | " break\n",
299 | " w = x[j][i]\n",
300 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
301 | " xx_mask[j][i] = True\n",
302 | " \n",
303 | " return xx, xx_mask\n",
304 | "\n",
305 | "def eval_score(yy, pred):\n",
306 | " num_batches = int(len(yy)/batch_size)\n",
307 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
308 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
309 | " return f1*100, accu*100\n",
310 | "\n",
311 | "def model(xx, yy, mask, train_cond=True):\n",
312 | " num_batches = int(len(xx)/batch_size)\n",
313 | " losses = 0\n",
314 | " preds = []\n",
315 | " for j in range(num_batches): \n",
316 | " \n",
317 | " s = j * batch_size\n",
318 | " e = (j+1) * batch_size\n",
319 | " \n",
320 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
321 | " \n",
322 | " \n",
323 | " if(train_cond==True):\n",
324 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
325 | " summary_writer.add_summary(_summary, step) \n",
326 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
327 | "\n",
328 | " else:\n",
329 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
330 | " preds.append(pred)\n",
331 | " \n",
332 | " losses +=_loss\n",
333 | "\n",
334 | " if(train_cond==False):\n",
335 | " y_pred = []\n",
336 | " for i in range(num_batches):\n",
337 | " for pred in preds[i]:\n",
338 | " y_pred.append(pred)\n",
339 | " return losses/num_batches, y_pred\n",
340 | " \n",
341 | " return losses/num_batches, step"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 10,
347 | "metadata": {
348 | "collapsed": true
349 | },
350 | "outputs": [],
351 | "source": [
352 | "x_id_train, mask_train = data_prepare(x_train)\n",
353 | "x_id_val, mask_val = data_prepare(x_val)\n",
354 | "x_id_test, mask_test = data_prepare(x_test)\n",
355 | "y_train = np.array(y_train)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 11,
361 | "metadata": {
362 | "scrolled": true
363 | },
364 | "outputs": [
365 | {
366 | "name": "stdout",
367 | "output_type": "stream",
368 | "text": [
369 | "Epoch: 1 Step: 48 loss: 9.53506787121\n",
370 | "Epoch: 2 Step: 96 loss: 1.81961081177\n",
371 | "Epoch: 3 Step: 144 loss: 1.19337606803\n",
372 | "Epoch: 4 Step: 192 loss: 0.967174999416\n",
373 | "Epoch: 5 Step: 240 loss: 0.859784771999\n",
374 | "Saved Model Complete\n",
375 | "Train: F1 Score: 46.2728990557 Accuracy: 73.2096354167 Loss: 0.935422244171\n",
376 | "Val: F1 Score: 44.6230136155 Accuracy: 70.9375 Loss: 0.885838544369\n",
377 | "Epoch: 6 Step: 288 loss: 0.815433536967\n",
378 | "Epoch: 7 Step: 336 loss: 0.756411065037\n",
379 | "Epoch: 8 Step: 384 loss: 0.722958392153\n",
380 | "Epoch: 9 Step: 432 loss: 0.67455783921\n",
381 | "Epoch: 10 Step: 480 loss: 0.677137187993\n",
382 | "Saved Model Complete\n",
383 | "Train: F1 Score: 45.3513910841 Accuracy: 81.15234375 Loss: 0.622080009431\n",
384 | "Val: F1 Score: 43.2422709632 Accuracy: 81.875 Loss: 0.607948565483\n",
385 | "Epoch: 11 Step: 528 loss: 0.65565276891\n",
386 | "Epoch: 12 Step: 576 loss: 0.645226646215\n",
387 | "Epoch: 13 Step: 624 loss: 0.631849833454\n",
388 | "Epoch: 14 Step: 672 loss: 0.653128698468\n",
389 | "Epoch: 15 Step: 720 loss: 0.610900692021\n",
390 | "Saved Model Complete\n",
391 | "Train: F1 Score: 60.5870838384 Accuracy: 83.3658854167 Loss: 0.585401636859\n",
392 | "Val: F1 Score: 61.1966168463 Accuracy: 85.3125 Loss: 0.595154416561\n",
393 | "Epoch: 16 Step: 768 loss: 0.640408499787\n",
394 | "Epoch: 17 Step: 816 loss: 0.573454591756\n",
395 | "Epoch: 18 Step: 864 loss: 0.573158189033\n",
396 | "Epoch: 19 Step: 912 loss: 0.580998883272\n",
397 | "Epoch: 20 Step: 960 loss: 0.599028664331\n",
398 | "Saved Model Complete\n",
399 | "Train: F1 Score: 66.2391100441 Accuracy: 85.6119791667 Loss: 0.579200811684\n",
400 | "Val: F1 Score: 63.3909012244 Accuracy: 84.375 Loss: 0.571177864075\n",
401 | "Epoch: 21 Step: 1008 loss: 0.613934485242\n",
402 | "Epoch: 22 Step: 1056 loss: 0.607284868757\n",
403 | "Epoch: 23 Step: 1104 loss: 0.597342180709\n",
404 | "Epoch: 24 Step: 1152 loss: 0.570371546472\n",
405 | "Epoch: 25 Step: 1200 loss: 0.580265671636\n",
406 | "Saved Model Complete\n",
407 | "Train: F1 Score: 67.9210837096 Accuracy: 86.7513020833 Loss: 0.537070132792\n",
408 | "Val: F1 Score: 73.5174165398 Accuracy: 89.0625 Loss: 0.566295391321\n",
409 | "Epoch: 26 Step: 1248 loss: 0.568779307107\n",
410 | "Epoch: 27 Step: 1296 loss: 0.55141502743\n",
411 | "Epoch: 28 Step: 1344 loss: 0.559002238015\n",
412 | "Epoch: 29 Step: 1392 loss: 0.569756407291\n",
413 | "Epoch: 30 Step: 1440 loss: 0.573152939479\n",
414 | "Saved Model Complete\n",
415 | "Train: F1 Score: 69.0664553653 Accuracy: 87.3046875 Loss: 0.59051666595\n",
416 | "Val: F1 Score: 68.3056653491 Accuracy: 88.125 Loss: 0.647302913666\n",
417 | "Epoch: 31 Step: 1488 loss: 0.601928584278\n",
418 | "Epoch: 32 Step: 1536 loss: 0.581918654342\n",
419 | "Epoch: 33 Step: 1584 loss: 0.539948465923\n",
420 | "Epoch: 34 Step: 1632 loss: 0.562553635488\n",
421 | "Epoch: 35 Step: 1680 loss: 0.547960610439\n",
422 | "Saved Model Complete\n",
423 | "Train: F1 Score: 71.4368257896 Accuracy: 88.4765625 Loss: 0.517511847119\n",
424 | "Val: F1 Score: 63.9771663859 Accuracy: 86.875 Loss: 0.614117074013\n",
425 | "Epoch: 36 Step: 1728 loss: 0.566355666146\n",
426 | "Epoch: 37 Step: 1776 loss: 0.555698808903\n",
427 | "Epoch: 38 Step: 1824 loss: 0.56517353033\n",
428 | "Epoch: 39 Step: 1872 loss: 0.581259304037\n",
429 | "Epoch: 40 Step: 1920 loss: 0.585148503383\n",
430 | "Saved Model Complete\n",
431 | "Train: F1 Score: 72.4950138601 Accuracy: 88.7044270833 Loss: 0.578148378059\n",
432 | "Val: F1 Score: 68.0165923988 Accuracy: 87.5 Loss: 0.708620613813\n",
433 | "Epoch: 41 Step: 1968 loss: 0.567735542854\n",
434 | "Epoch: 42 Step: 2016 loss: 0.539583496749\n",
435 | "Epoch: 43 Step: 2064 loss: 0.544194473575\n",
436 | "Epoch: 44 Step: 2112 loss: 0.556465638181\n",
437 | "Epoch: 45 Step: 2160 loss: 0.559930261845\n",
438 | "Saved Model Complete\n",
439 | "Train: F1 Score: 76.9940617261 Accuracy: 89.16015625 Loss: 0.536304668213\n",
440 | "Val: F1 Score: 74.9496075234 Accuracy: 88.4375 Loss: 0.573511379957\n",
441 | "Epoch: 46 Step: 2208 loss: 0.556281161805\n",
442 | "Epoch: 47 Step: 2256 loss: 0.549503739923\n",
443 | "Epoch: 48 Step: 2304 loss: 0.561590575303\n",
444 | "Epoch: 49 Step: 2352 loss: 0.538634177297\n",
445 | "Epoch: 50 Step: 2400 loss: 0.548110162839\n",
446 | "Saved Model Complete\n",
447 | "Train: F1 Score: 69.2087726432 Accuracy: 88.2486979167 Loss: 0.513670069476\n",
448 | "Val: F1 Score: 75.8463136033 Accuracy: 89.6875 Loss: 0.542824417353\n",
449 | "Test: F1 Score: 61.845299018 Accuracy: 85.0961538462 Loss: 0.683341053816\n"
450 | ]
451 | }
452 | ],
453 | "source": [
454 | "num_epochs = 50\n",
455 | "\n",
456 | "for i in range(num_epochs):\n",
457 | " \n",
458 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
459 | " x_id_train = x_id_train[random]\n",
460 | " y_train = y_train[random]\n",
461 | " mask_train = mask_train[random]\n",
462 | " \n",
463 | " losses, step = model(x_id_train, y_train, mask_train)\n",
464 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
465 | " \n",
466 | " if((i+1)%5==0):\n",
467 | " saver.save(sess, save_path=save_dir) \n",
468 | " print(\"Saved Model Complete\")\n",
469 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
470 | " f1_, accu_ = eval_score(y_train, train_pred)\n",
471 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n",
472 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
473 | " f1_, accu_ = eval_score(y_val, val_pred)\n",
474 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
475 | " \n",
476 | "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n",
477 | "f1_, accu_ = eval_score(y_test, test_pred)\n",
478 | "print(\"Test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": null,
484 | "metadata": {
485 | "collapsed": true
486 | },
487 | "outputs": [],
488 | "source": []
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": null,
493 | "metadata": {
494 | "collapsed": true
495 | },
496 | "outputs": [],
497 | "source": [
498 | "saver.restore(sess, save_dir)"
499 | ]
500 | }
501 | ],
502 | "metadata": {
503 | "kernelspec": {
504 | "display_name": "cs771",
505 | "language": "python",
506 | "name": "cs771"
507 | },
508 | "language_info": {
509 | "codemirror_mode": {
510 | "name": "ipython",
511 | "version": 3
512 | },
513 | "file_extension": ".py",
514 | "mimetype": "text/x-python",
515 | "name": "python",
516 | "nbconvert_exporter": "python",
517 | "pygments_lexer": "ipython3",
518 | "version": "3.5.2"
519 | }
520 | },
521 | "nbformat": 4,
522 | "nbformat_minor": 2
523 | }
524 |
--------------------------------------------------------------------------------
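Note on Model-3.ipynb above: it differs from Model-2.ipynb mainly in its class-weighted loss. Each sense of 'hard' gets the weight 6 * (1 - relative frequency), so the two rare senses (HARD2, HARD3) are not drowned out by HARD1, and the notebook multiplies these weights into the logits (weighted_logits = logits * class_weight) before the cross-entropy. The printed weights can be reproduced with a few lines (a small illustrative snippet, not part of the repository; the counts are those shown by the Counter output in the notebook):

# Sense counts printed in Model-3.ipynb: HARD1: 3455, HARD2: 502, HARD3: 376
counts = [3455, 502, 376]
total = sum(counts)                              # 4333 labelled instances
weights = [6 * (1 - c / total) for c in counts]  # rarer sense -> larger weight
print([round(w, 4) for w in weights])            # [1.2158, 5.3049, 5.4793]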
/Four Word Model/Model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "tf.logging.set_verbosity(tf.logging.WARN)\n",
13 | "import pickle\n",
14 | "import numpy as np\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import f1_score\n",
18 | "from sklearn.metrics import accuracy_score\n",
19 | "import os\n",
20 | "from tensorflow.python.client import device_lib"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {
27 | "collapsed": true
28 | },
29 | "outputs": [],
30 | "source": [
31 | "f = open('../Glove/word_embedding_glove', 'rb')\n",
32 | "word_embedding = pickle.load(f)\n",
33 | "f.close()\n",
34 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
35 | "\n",
36 | "f = open('../Glove/vocab_glove', 'rb')\n",
37 | "vocab = pickle.load(f)\n",
38 | "f.close()\n",
39 | "\n",
40 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
41 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
42 | "\n",
43 | "unknown_token = \"UNKNOWN_TOKEN\"\n",
44 | "\n",
45 | "f = open(\"train.pickle\", 'rb')\n",
46 | "full_data = pickle.load(f)\n",
47 | "f.close()"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "# Model Description\n",
59 | "sense_word = 'hard'\n",
60 | "model_name = 'basic'\n",
61 | "model_dir = 'output/' + sense_word + '/' + model_name\n",
62 | "save_dir = os.path.join(model_dir, \"save/\")\n",
63 | "log_dir = os.path.join(model_dir, \"log\")\n",
64 | "\n",
65 | "if not os.path.exists(model_dir):\n",
66 | " os.mkdir(model_dir)\n",
67 | "if not os.path.exists(save_dir):\n",
68 | " os.mkdir(save_dir)\n",
69 | "if not os.path.exists(log_dir):\n",
70 | " os.mkdir(log_dir)"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "metadata": {
77 | "collapsed": true
78 | },
79 | "outputs": [],
80 | "source": [
81 | "# Parameters\n",
82 | "mode = 'train'\n",
83 | "num_senses = 3\n",
84 | "batch_size = 64\n",
85 | "vocab_size = len(vocab)\n",
86 | "unk_vocab_size = 1\n",
87 | "word_emb_size = len(word_embedding[0])\n",
88 | "max_sent_size = 200\n",
89 | "hidden_size = 100\n",
90 | "keep_prob = 0.5\n",
91 | "l2_lambda = 0.001\n",
92 | "init_lr = 0.001\n",
93 | "decay_steps = 5000\n",
94 | "decay_rate = 0.96\n",
95 | "clip_norm = 1\n",
96 | "clipping = True"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 5,
102 | "metadata": {
103 | "collapsed": true
104 | },
105 | "outputs": [],
106 | "source": [
107 | "# MODEL\n",
108 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
109 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
110 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
111 | "is_train = tf.placeholder('bool', [], name='is_train')\n",
112 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
113 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
114 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
115 | "\n",
116 | "with tf.name_scope(\"word_embedding\"):\n",
117 | " if mode == 'train':\n",
118 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
119 | " else:\n",
120 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
121 | " \n",
122 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
123 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n",
124 | "\n",
125 | "with tf.variable_scope(\"lstm\"):\n",
126 | " cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
127 | " cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
128 | "\n",
129 | " d_cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=input_keep_prob)\n",
130 | " d_cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=input_keep_prob)\n",
131 | " \n",
132 | " (fw_h, bw_h), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, Wx, sequence_length=x_len, dtype='float', scope='lstm')\n",
133 | " h = tf.concat([fw_h, bw_h], 2)\n",
134 | "\n",
135 | "def attention(input_x, input_mask, W_att):\n",
136 | " h_masked = tf.boolean_mask(input_x, input_mask)\n",
137 | " h_tanh = tf.tanh(h_masked)\n",
138 | " u = tf.matmul(h_tanh, W_att)\n",
139 | " a = tf.nn.softmax(u)\n",
140 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n",
141 | " return c\n",
142 | "\n",
143 | "with tf.variable_scope(\"attention\"):\n",
144 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n",
145 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
146 | " for i in range(1, batch_size):\n",
147 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
148 | " \n",
149 | "with tf.variable_scope(\"softmax_layer\"):\n",
150 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n",
151 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
152 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n",
153 | " logits = tf.matmul(drop_c, W) + b\n",
154 | " predictions = tf.argmax(logits, 1)\n",
155 | "\n",
156 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n",
157 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
158 | "\n",
159 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
160 | "\n",
161 | "tv_all = tf.trainable_variables()\n",
162 | "tv_regu =[]\n",
163 | "for t in tv_all:\n",
164 | " if t.name.find('b:')==-1:\n",
165 | " tv_regu.append(t)\n",
166 | " \n",
167 | "# l2 Loss\n",
168 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
169 | "\n",
170 | "total_loss = loss + l2_loss\n",
171 | "\n",
172 | "# Optimizer for loss\n",
173 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
174 | "\n",
175 | "# Gradients and Variables for Loss\n",
176 | "grads_vars = optimizer.compute_gradients(total_loss)\n",
177 | "\n",
178 | "# Clipping of Gradients\n",
179 | "clipped_grads = grads_vars\n",
180 | "if(clipping == True):\n",
181 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
182 | "\n",
183 | "# Training Optimizer for Total Loss\n",
184 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
185 | "\n",
186 | "# Summaries\n",
187 | "var_summaries = []\n",
188 | "for v in tv_all:\n",
189 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
190 | " var_summaries.append(var_summary)\n",
191 | "\n",
192 | "var_summaries_merged = tf.summary.merge(var_summaries)\n",
193 | "\n",
194 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
195 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
196 | "summary = tf.summary.merge_all()"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 6,
202 | "metadata": {
203 | "collapsed": true
204 | },
205 | "outputs": [],
206 | "source": [
207 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n",
208 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
209 | "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
210 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n",
211 | "saver = tf.train.Saver() # For Saving the model\n",
212 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {
219 | "collapsed": true
220 | },
221 | "outputs": [],
222 | "source": [
223 | "# # k-fold Splitting\n",
224 | "# data_x = np.array(full_data[sense_word][0])\n",
225 | "# data_y = np.array(full_data[sense_word][2])\n",
226 | "# kf = KFold(n_splits=5,shuffle=True,random_state=0)\n",
227 | "# for train_index, test_index in kf.split(X):\n",
228 | "# print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
229 | "# #x_train, x_test = data_x[train_index], data_x[test_index]\n",
230 | "# #y_train, y_test = data_y[train_index], data_y[test_index]"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 6,
236 | "metadata": {
237 | "scrolled": true
238 | },
239 | "outputs": [
240 | {
241 | "name": "stderr",
242 | "output_type": "stream",
243 | "text": [
244 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
245 | " FutureWarning)\n"
246 | ]
247 | }
248 | ],
249 | "source": [
250 | "# Splitting\n",
251 | "data_x = full_data[sense_word][0]\n",
252 | "data_y = full_data[sense_word][2]\n",
253 | "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n",
254 | "\n",
255 | "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 7,
261 | "metadata": {
262 | "collapsed": true
263 | },
264 | "outputs": [],
265 | "source": [
266 | "def data_prepare(x):\n",
267 | " num_examples = len(x)\n",
268 | "\n",
269 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
270 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
271 | "\n",
272 | " for j in range(num_examples):\n",
273 | " for i in range(max_sent_size):\n",
274 | " if(i>=len(x[j])):\n",
275 | " break\n",
276 | " w = x[j][i]\n",
277 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
278 | " xx_mask[j][i] = True\n",
279 | " \n",
280 | " return xx, xx_mask\n",
281 | "\n",
282 | "def eval_score(yy, pred):\n",
283 | " num_batches = int(len(yy)/batch_size)\n",
284 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
285 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
286 | " return f1*100, accu*100\n",
287 | "\n",
288 | "def model(xx, yy, mask, train_cond=True):\n",
289 | " num_batches = int(len(xx)/batch_size)\n",
290 | " losses = 0\n",
291 | " preds = []\n",
292 | " for j in range(num_batches): \n",
293 | " \n",
294 | " s = j * batch_size\n",
295 | " e = (j+1) * batch_size\n",
296 | " \n",
297 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
298 | " \n",
299 | " \n",
300 | " if(train_cond==True):\n",
301 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
302 | " summary_writer.add_summary(_summary, step) \n",
303 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
304 | "\n",
305 | " else:\n",
306 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
307 | " preds.append(pred)\n",
308 | " \n",
309 | " losses +=_loss\n",
310 | "\n",
311 | " if(train_cond==False):\n",
312 | " y_pred = []\n",
313 | " for i in range(num_batches):\n",
314 | " for pred in preds[i]:\n",
315 | " y_pred.append(pred)\n",
316 | " return losses/num_batches, y_pred\n",
317 | " \n",
318 | " return losses/num_batches, step"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 8,
324 | "metadata": {
325 | "collapsed": true
326 | },
327 | "outputs": [],
328 | "source": [
329 | "x_id_train, mask_train = data_prepare(x_train)\n",
330 | "x_id_val, mask_val = data_prepare(x_val)\n",
331 | "x_id_test, mask_test = data_prepare(x_test)\n",
332 | "y_train = np.array(y_train)"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": null,
338 | "metadata": {
339 | "collapsed": true,
340 | "scrolled": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "num_epochs = 10\n",
345 | "\n",
346 | "for i in range(num_epochs):\n",
347 | " \n",
348 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
349 | " x_id_train = x_id_train[random]\n",
350 | " y_train = y_train[random]\n",
351 | " mask_train = mask_train[random]\n",
352 | " \n",
353 | " losses, step = model(x_id_train, y_train, mask_train)\n",
354 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
355 | " saver.save(sess, save_path=save_dir) \n",
356 | " print(\"Saved Model Complete\")\n",
357 | " \n",
358 | " if((i+1)%2==0):\n",
359 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
360 | " f1_, accu_ = eval_score(y_train, train_pred)\n",
361 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n",
362 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
363 | " f1_, accu_ = eval_score(y_val, val_pred)\n",
364 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
365 | " \n",
366 | "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n",
367 | "f1_, accu_ = eval_score(y_test, test_pred)\n",
368 | "print(\"Test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": null,
374 | "metadata": {
375 | "collapsed": true
376 | },
377 | "outputs": [],
378 | "source": []
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 10,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": [
388 | "saver.restore(sess, save_dir)"
389 | ]
390 | }
391 | ],
392 | "metadata": {
393 | "kernelspec": {
394 | "display_name": "cs771",
395 | "language": "python",
396 | "name": "cs771"
397 | },
398 | "language_info": {
399 | "codemirror_mode": {
400 | "name": "ipython",
401 | "version": 3
402 | },
403 | "file_extension": ".py",
404 | "mimetype": "text/x-python",
405 | "name": "python",
406 | "nbconvert_exporter": "python",
407 | "pygments_lexer": "ipython3",
408 | "version": "3.5.2"
409 | }
410 | },
411 | "nbformat": 4,
412 | "nbformat_minor": 2
413 | }
414 |
--------------------------------------------------------------------------------
/Four Word Model/final_preprocessing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import re\n",
10 | "from sklearn.model_selection import train_test_split\n",
11 | "import pickle"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "def decontracted(phrase):\n",
21 | " # specific\n",
22 | " phrase = re.sub(r\"won't\", \"will not\", phrase)\n",
23 | "\n",
24 | " # general\n",
25 | " phrase = re.sub(r\"n\\'t\", \" not\", phrase)\n",
26 | " phrase = re.sub(r\"\\'re\", \" are\", phrase)\n",
27 | " phrase = re.sub(r\"\\'s\", \" is\", phrase)\n",
28 | " phrase = re.sub(r\"\\'d\", \" would\", phrase)\n",
29 | " phrase = re.sub(r\"\\'ll\", \" will\", phrase)\n",
30 | " phrase = re.sub(r\"\\'t\", \" not\", phrase)\n",
31 | " phrase = re.sub(r\"\\'ve\", \" have\", phrase)\n",
32 | " phrase = re.sub(r\"\\'m\", \" am\", phrase)\n",
33 | " phrase = re.sub(r\"\\'d've\", \" would have\", phrase)\n",
34 | " phrase = re.sub(r\"\\'d'y\", \" do you\", phrase)\n",
35 | " return phrase\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "train = {}\n",
45 | "\n",
46 | "with open('./Preprocess_Files/hard/sent') as f:\n",
47 | " sents = f.readlines()\n",
48 | "content = [x.strip() for x in sents]\n",
49 | " \n",
50 | "with open('./Preprocess_Files/hard/sense') as f:\n",
51 | " senses = f.readlines()\n",
52 | "\n",
53 | "sents = []\n",
54 | "for sent in content:\n",
55 | " text = decontracted(sent.replace(\" ' \",\"'\"))\n",
56 | " result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n",
57 | " result = result.replace(' ',' ').split()\n",
58 | " result = [string.lower() for string in result]\n",
59 | " sents.append(result)\n",
60 | "\n",
61 | "type_class = []\n",
62 | "type_name = []\n",
63 | "for sense in senses:\n",
64 | " sense = sense.strip('\\n')\n",
65 | " type_name.append(sense)\n",
66 | " \n",
67 | " sense = sense.replace('HARD1','0').replace('HARD2','1').replace('HARD3','2')\n",
68 | " type_class.append(int(sense))\n",
69 | "\n",
70 | "train['hard'] = []\n",
71 | "train['hard'].extend([sents, type_name, type_class])"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "with open('./Preprocess_Files/interest/sent') as f:\n",
81 | " sents = f.readlines()\n",
82 | "content = [x.strip() for x in sents]\n",
83 | " \n",
84 | "with open('./Preprocess_Files/interest/sense') as f:\n",
85 | " senses = f.readlines()\n",
86 | "\n",
87 | "sents = []\n",
88 | "for sent in content:\n",
89 | " text = decontracted(sent.replace(\" ' \",\"'\"))\n",
90 | " result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n",
91 | " result = result.replace(' ',' ').split()\n",
92 | " result = [string.lower() for string in result]\n",
93 | " sents.append(result)\n",
94 | "\n",
95 | "type_class = []\n",
96 | "type_name = []\n",
97 | "for sense in senses:\n",
98 | " sense = sense.strip('\\n')\n",
99 | " type_name.append(sense)\n",
100 | " \n",
101 | " sense = sense.replace('interest1','0').replace('interest2','1').replace('interest3','2').replace('interest4','3').replace('interest5','4').replace('interest6','5')\n",
102 | " type_class.append(int(sense))\n",
103 | "\n",
104 | "train['interest'] = []\n",
105 | "train['interest'].extend([sents, type_name, type_class])"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 5,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "with open('./Preprocess_Files/line/sent') as f:\n",
115 | " sents = f.readlines()\n",
116 | "content = [x.strip() for x in sents]\n",
117 | " \n",
118 | "with open('./Preprocess_Files/line/sense') as f:\n",
119 | " senses = f.readlines()\n",
120 | "\n",
121 | "sents = []\n",
122 | "for sent in content:\n",
123 | " text = decontracted(sent.replace(\" ' \",\"'\"))\n",
124 | " result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n",
125 | " result = result.replace(' ',' ').split()\n",
126 | " result = [string.lower() for string in result]\n",
127 | " sents.append(result)\n",
128 | "\n",
129 | "type_class = []\n",
130 | "type_name = []\n",
131 | "for sense in senses:\n",
132 | " sense = sense.strip('\\n')\n",
133 | " type_name.append(sense)\n",
134 | " \n",
135 | " sense = sense.replace('text','0').replace('phone','1').replace('product','2').replace('formation','3').replace('division','4').replace('cord','5')\n",
136 | " type_class.append(int(sense))\n",
137 | "\n",
138 | "train['line'] = []\n",
139 | "train['line'].extend([sents, type_name, type_class])"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 6,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "with open('./Preprocess_Files/serve/sent') as f:\n",
149 | " sents = f.readlines()\n",
150 | "content = [x.strip() for x in sents]\n",
151 | " \n",
152 | "with open('./Preprocess_Files/serve/sense') as f:\n",
153 | " senses = f.readlines()\n",
154 | "\n",
155 | "sents = []\n",
156 | "for sent in content:\n",
157 | " text = decontracted(sent.replace(\" ' \",\"'\"))\n",
158 | " result = \"\".join(x for x in text if x.isalpha() or x.isspace())\n",
159 | " result = result.replace(' ',' ').split()\n",
160 | " result = [string.lower() for string in result]\n",
161 | " sents.append(result)\n",
162 | "\n",
163 | "type_class = []\n",
164 | "type_name = []\n",
165 | "for sense in senses:\n",
166 | " sense = sense.strip('\\n')\n",
167 | " type_name.append(sense)\n",
168 | " \n",
169 | " sense = sense.replace('SERVE2','0').replace('SERVE6','1').replace('SERVE10','2').replace('SERVE12','3')\n",
170 | " type_class.append(int(sense))\n",
171 | "\n",
172 | "train['serve'] = []\n",
173 | "train['serve'].extend([sents, type_name, type_class])"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 8,
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "def train_test(target):\n",
183 | " x = train['target'][0]\n",
184 | " y = train['target'][2]\n",
185 | " x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, shuffle=True, stratify=y)\n",
186 | " return x_train, x_test, y_train, y_test"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 9,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "with open('full_train.pickle', 'wb') as f:\n",
196 | " pickle.dump(train, f)"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 10,
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "name": "stdout",
206 | "output_type": "stream",
207 | "text": [
208 | "['he', 'may', 'lose', 'all', 'popular', 'support', 'but', 'someone', 'has', 'to', 'kill', 'him', 'to', 'defeat', 'him', 'and', 'that', 'is', 'hard', 'to', 'do']\n",
209 | "HARD1\n"
210 | ]
211 | }
212 | ],
213 | "source": [
214 | "print(train['hard'][0][0])\n",
215 | "print(train['hard'][1][0]) #class of hard"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 11,
221 | "metadata": {},
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | "108\n",
228 | "127\n",
229 | "165\n",
230 | "161\n"
231 | ]
232 | }
233 | ],
234 | "source": [
235 | "def max_length(target):\n",
236 | " max_len = 0\n",
237 | " for sentence in train[target][0]:\n",
238 | " temp_len = len(sentence)\n",
239 | " max_len = max(max_len, temp_len)\n",
240 | " print(max_len)\n",
241 | "\n",
242 | "max_length('hard') \n",
243 | "max_length('interest')\n",
244 | "max_length('line')\n",
245 | "max_length('serve')"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": null,
251 | "metadata": {},
252 | "outputs": [],
253 | "source": []
254 | }
255 | ],
256 | "metadata": {
257 | "kernelspec": {
258 | "display_name": "Python 3",
259 | "language": "python",
260 | "name": "python3"
261 | },
262 | "language_info": {
263 | "codemirror_mode": {
264 | "name": "ipython",
265 | "version": 3
266 | },
267 | "file_extension": ".py",
268 | "mimetype": "text/x-python",
269 | "name": "python",
270 | "nbconvert_exporter": "python",
271 | "pygments_lexer": "ipython3",
272 | "version": "3.6.2"
273 | }
274 | },
275 | "nbformat": 4,
276 | "nbformat_minor": 2
277 | }
278 |
--------------------------------------------------------------------------------
/Four Word Model/full_train.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/full_train.pickle
--------------------------------------------------------------------------------
/Four Word Model/robsr_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "tf.logging.set_verbosity(tf.logging.WARN)\n",
13 | "import pickle\n",
14 | "import numpy as np\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import f1_score\n",
18 | "from sklearn.metrics import accuracy_score\n",
19 | "import os\n",
20 | "from tensorflow.python.client import device_lib"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {
27 | "collapsed": true
28 | },
29 | "outputs": [],
30 | "source": [
31 | "f = open('../Glove/word_embedding_glove', 'rb')\n",
32 | "word_embedding = pickle.load(f)\n",
33 | "f.close()\n",
34 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
35 | "\n",
36 | "f = open('../Glove/vocab_glove', 'rb')\n",
37 | "vocab = pickle.load(f)\n",
38 | "f.close()\n",
39 | "\n",
40 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
41 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
42 | "\n",
43 | "unknown_token = \"UNKNOWN_TOKEN\"\n",
44 | "\n",
45 | "f = open(\"train.pickle\", 'rb')\n",
46 | "full_data = pickle.load(f)\n",
47 | "f.close()"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "# Model Description\n",
59 | "sense_word = 'hard'\n",
60 | "model_name = 'basic'\n",
61 | "model_dir = 'output/' + sense_word + '/' + model_name\n",
62 | "save_dir = os.path.join(model_dir, \"save/\")\n",
63 | "log_dir = os.path.join(model_dir, \"log\")\n",
64 | "\n",
65 | "if not os.path.exists(save_dir):\n",
66 | " os.mkdir(save_dir)\n",
67 | "if not os.path.exists(log_dir):\n",
68 | " os.mkdir(log_dir)"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 4,
74 | "metadata": {
75 | "collapsed": true
76 | },
77 | "outputs": [],
78 | "source": [
79 | "# Parameters\n",
80 | "mode = 'train'\n",
81 | "num_senses = 3\n",
82 | "batch_size = 64\n",
83 | "vocab_size = len(vocab)\n",
84 | "unk_vocab_size = 1\n",
85 | "word_emb_size = len(word_embedding[0])\n",
86 | "max_sent_size = 200\n",
87 | "hidden_size = 100\n",
88 | "keep_prob = 0.5\n",
89 | "l2_lambda = 0.001\n",
90 | "init_lr = 0.001\n",
91 | "decay_steps = 5000\n",
92 | "decay_rate = 0.96\n",
93 | "clip_norm = 1\n",
94 | "clipping = True"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 5,
100 | "metadata": {
101 | "collapsed": true
102 | },
103 | "outputs": [],
104 | "source": [
105 | "# MODEL\n",
106 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
107 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
108 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
109 | "is_train = tf.placeholder('bool', [], name='is_train')\n",
110 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
111 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
112 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
113 | "\n",
114 | "with tf.name_scope(\"word_embedding\"):\n",
115 | " if mode == 'train':\n",
116 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
117 | " else:\n",
118 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
119 | " \n",
120 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
121 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n",
122 | "\n",
123 | "with tf.variable_scope(\"lstm\"):\n",
124 | " cell_fw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
125 | " cell_bw = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
126 | "\n",
127 | " d_cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, input_keep_prob=input_keep_prob)\n",
128 | " d_cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, input_keep_prob=input_keep_prob)\n",
129 | " \n",
130 | " (fw_h, bw_h), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw, d_cell_bw, Wx, sequence_length=x_len, dtype='float', scope='lstm')\n",
131 | " h = tf.concat([fw_h, bw_h], 2)\n",
132 | "\n",
133 | "def attention(input_x, input_mask, W_att):\n",
134 | " h_masked = tf.boolean_mask(input_x, input_mask)\n",
135 | " h_tanh = tf.tanh(h_masked)\n",
136 | " u = tf.matmul(h_tanh, W_att)\n",
137 | " a = tf.nn.softmax(u)\n",
138 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n",
139 | " return c\n",
140 | "\n",
141 | "with tf.variable_scope(\"attention\"):\n",
142 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=1.0, seed=0), name=\"W_att\")\n",
143 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
144 | " for i in range(1, batch_size):\n",
145 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
146 | " \n",
147 | "with tf.variable_scope(\"softmax_layer\"):\n",
148 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=1.0, seed=0), name=\"W\")\n",
149 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
150 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n",
151 | " logits = tf.matmul(drop_c, W) + b\n",
152 | " predictions = tf.argmax(logits, 1)\n",
153 | "\n",
154 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n",
155 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
156 | "\n",
157 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
158 | "\n",
159 | "tv_all = tf.trainable_variables()\n",
160 | "tv_regu =[]\n",
161 | "for t in tv_all:\n",
162 | " if t.name.find('b:')==-1:\n",
163 | " tv_regu.append(t)\n",
164 | " \n",
165 | "# l2 Loss\n",
166 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
167 | "\n",
168 | "total_loss = loss + l2_loss\n",
169 | "\n",
170 | "# Optimizer for loss\n",
171 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
172 | "\n",
173 | "# Gradients and Variables for Loss\n",
174 | "grads_vars = optimizer.compute_gradients(total_loss)\n",
175 | "\n",
176 | "# Clipping of Gradients\n",
177 | "clipped_grads = grads_vars\n",
178 | "if(clipping == True):\n",
179 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
180 | "\n",
181 | "# Training Optimizer for Total Loss\n",
182 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
183 | "\n",
184 | "# Summaries\n",
185 | "var_summaries = []\n",
186 | "for v in tv_all:\n",
187 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
188 | " var_summaries.append(var_summary)\n",
189 | "\n",
190 | "var_summaries_merged = tf.summary.merge(var_summaries)\n",
191 | "\n",
192 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
193 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
194 | "summary = tf.summary.merge_all()"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 6,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n",
204 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
205 | "sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
206 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n",
207 | "saver = tf.train.Saver() # For Saving the model\n",
208 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 6,
214 | "metadata": {
215 | "scrolled": true
216 | },
217 | "outputs": [
218 | {
219 | "name": "stderr",
220 | "output_type": "stream",
221 | "text": [
222 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
223 | " FutureWarning)\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "# Splitting\n",
229 | "data_x = full_data[sense_word][0]\n",
230 | "data_y = full_data[sense_word][2]\n",
231 | "x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, train_size=0.8, shuffle=True, stratify=data_y, random_state=0)\n",
232 | "\n",
233 | "x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.9, shuffle=True, stratify=y_train, random_state=0)"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 7,
239 | "metadata": {
240 | "collapsed": true
241 | },
242 | "outputs": [],
243 | "source": [
244 | "def data_prepare(x):\n",
245 | " num_examples = len(x)\n",
246 | "\n",
247 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
248 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
249 | "\n",
250 | " for j in range(num_examples):\n",
251 | " for i in range(max_sent_size):\n",
252 | " if(i>=len(x[j])):\n",
253 | " break\n",
254 | " w = x[j][i]\n",
255 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
256 | " xx_mask[j][i] = True\n",
257 | " \n",
258 | " return xx, xx_mask\n",
259 | "\n",
260 | "def eval_score(yy, pred):\n",
261 | " num_batches = int(len(yy)/batch_size)\n",
262 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
263 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
264 | " return f1*100, accu*100\n",
265 | "\n",
266 | "def model(xx, yy, mask, train_cond=True):\n",
267 | " num_batches = int(len(xx)/batch_size)\n",
268 | " losses = 0\n",
269 | " preds = []\n",
270 | " for j in range(num_batches): \n",
271 | " \n",
272 | " s = j * batch_size\n",
273 | " e = (j+1) * batch_size\n",
274 | " \n",
275 | " feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
276 | " \n",
277 | " \n",
278 | " if(train_cond==True):\n",
279 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
280 | " summary_writer.add_summary(_summary, step) \n",
281 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
282 | "\n",
283 | " else:\n",
284 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
285 | " preds.append(pred)\n",
286 | " \n",
287 | " losses +=_loss\n",
288 | "\n",
289 | " if(train_cond==False):\n",
290 | " y_pred = []\n",
291 | " for i in range(num_batches):\n",
292 | " for pred in preds[i]:\n",
293 | " y_pred.append(pred)\n",
294 | " return losses/num_batches, y_pred\n",
295 | " \n",
296 | " return losses/num_batches, step"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 8,
302 | "metadata": {
303 | "collapsed": true
304 | },
305 | "outputs": [],
306 | "source": [
307 | "x_id_train, mask_train = data_prepare(x_train)\n",
308 | "x_id_val, mask_val = data_prepare(x_val)\n",
309 | "x_id_test, mask_test = data_prepare(x_test)"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": null,
315 | "metadata": {
316 | "scrolled": true
317 | },
318 | "outputs": [],
319 | "source": [
320 | "num_epochs = 10\n",
321 | "\n",
322 | "for i in range(num_epochs):\n",
323 | " \n",
324 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
325 | " x_id_train = x_id_train[random]\n",
326 | " y_train = y_train[random]\n",
327 | " mask_train = mask_train[random]\n",
328 | " \n",
329 | " losses, step = model(x_id_train, y_train, mask_train)\n",
330 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
331 | " saver.save(sess, save_path=save_dir) \n",
332 | " print(\"Saved Model Complete\")\n",
333 | " \n",
334 | " if((i+1)%2==0):\n",
335 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
336 | " f1_, accu_ = eval_score(y_train, train_pred)\n",
337 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
338 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
339 | " f1_, accu_ = eval_score(y_val, val_pred)\n",
340 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
341 | " \n",
342 | "test_loss, test_pred = model(x_id_test, y_test, mask_test, train_cond=False)\n",
343 | "f1_, accu_ = eval_score(y_test, test_pred)\n",
344 | "print(\"Test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", test_loss)"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {
351 | "collapsed": true
352 | },
353 | "outputs": [],
354 | "source": []
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 10,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "saver.restore(sess, save_dir)"
363 | ]
364 | }
365 | ],
366 | "metadata": {
367 | "kernelspec": {
368 | "display_name": "cs771",
369 | "language": "python",
370 | "name": "cs771"
371 | },
372 | "language_info": {
373 | "codemirror_mode": {
374 | "name": "ipython",
375 | "version": 3
376 | },
377 | "file_extension": ".py",
378 | "mimetype": "text/x-python",
379 | "name": "python",
380 | "nbconvert_exporter": "python",
381 | "pygments_lexer": "ipython3",
382 | "version": "3.5.2"
383 | }
384 | },
385 | "nbformat": 4,
386 | "nbformat_minor": 2
387 | }
388 |
--------------------------------------------------------------------------------
/Four Word Model/train.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/train.pickle
--------------------------------------------------------------------------------
/Four Word Model/words_not_in_vocab.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/Four Word Model/words_not_in_vocab.pickle
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Shanu Kumar
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Word Sense Disambiguation
2 |
3 | Word sense disambiguation (WSD) is the task of identifying the meaning of a word in context. We address this problem with a series of end-to-end neural architectures based on bidirectional Long Short-Term Memory (LSTM) networks. We propose two variants for WSD: an end-to-end word-specific neural model and an all-words neural model. The word-specific approach requires training a separate model for every disambiguation target word; the all-words model avoids this by relying on sequence learning. We also used POS tags to improve performance and tried different attention mechanisms for the all-words model. Performance was further boosted with convolutional neural networks (CNN), which capture local features around the target word, much as humans rely on nearby context when predicting senses. Finally, we improved performance with hierarchical models that use POS tags as the hierarchy, in two variants: soft masking and hard masking. A minimal sketch of the word-specific model is given at the end of this README.
4 |
5 | ### Methods
6 |
7 | * [Word Specific Model trained on Four Word Dataset](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/Four%20Word%20Model)
8 | * [Word Specific Model trained on One Million Dataset](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/one_million)
9 | * [All-words Model](https://github.com/Sshanu/Word-Sense-Disambiguation/tree/master/one_million/all-word)
10 | * [Hierarchical Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb)
11 |
12 | ### Best Models
13 | * [All-words Model+CNN](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-1.4.ipynb)
14 | * [All-words Hierarchical Model+Soft Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb)
15 | * [All-words Hierarchical Model+Hard Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-4.ipynb)
16 |
17 |
18 | ### Details
19 | For detailed information about models and results:
20 | * [Report](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/UGP_Report.pdf)
21 | * [Presentation](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/UGP_presentation.pdf)
22 |
23 | ### All words Models
24 |
25 | #### [All-words Hierarchical Model+Soft Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-2.ipynb)
26 |
27 |
28 |
29 |
30 | #### [All-words Hierarchical Model+Hard Masking](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-hierarchical-4.ipynb)
31 |
32 |
33 |
34 |
35 | #### [Basic Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-1-multigpu-1.ipynb)
36 |
37 |
38 |
39 |
40 | #### [Basic Model+Local Attention](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-4.ipynb)
41 |
42 |
43 |
44 |
45 | #### [Basic Model+Local Attention+Hidden States](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-6.ipynb)
46 |
47 |
48 |
49 |
50 | #### [Basic Model+Local Attention+Hidden States+CRF](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v3-1.ipynb)
51 |
52 |
53 |
54 |
55 | #### [Basic Model+Gated Attention](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-local_attention-fast-v2-7.ipynb)
56 |
57 |
58 |
59 |
60 | #### [Basic Model+CNN](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/all-word/Model-aw-lex-1.4.ipynb)
61 |
62 |
63 |
64 |
65 | ### Word Specific Models
66 |
67 | #### [Basic Model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-1-multigpu-1.ipynb)
68 | Files with names like Model-1-multigpu-1.ipynb are the basic models.
69 |
70 |
71 |
72 |
73 | #### [Basic Model+POS Tags](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-2-multigpu-1.ipynb)
74 | Files with names like Model-2-multigpu-1.ipynb are the basic models with POS tags.
75 |
76 |
77 |
78 |
79 | #### [Basic Model+POS Tags+CRF](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-3-multigpu-1.ipynb)
80 | Files with names like Model-3-multigpu-1.ipynb are the basic models with POS tags and CRF.
81 |
82 |
83 |
84 |
85 | #### [Word specific hierarchical model](https://github.com/Sshanu/Word-Sense-Disambiguation/blob/master/one_million/force/Force-Model-4-multigpu-1.ipynb)
86 | Files with names like Model-4-multigpu-1.ipynb are the word-specific hierarchical models.
87 |
88 |
89 |
90 |
91 |
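92 | ### Model Sketch
93 | 
94 | The snippet below is a minimal, illustrative sketch of the word-specific BiLSTM + attention classifier described above, not the exact code used in the notebooks. It assumes TensorFlow 1.x and padded word-id inputs; the sizes and the names `x`, `x_mask`, `y`, and `num_senses` are placeholders chosen for the example (the notebooks load pre-trained GloVe vectors instead of a trainable embedding table).
95 | 
96 | ```python
97 | import tensorflow as tf  # TensorFlow 1.x, as used in the notebooks
98 | 
99 | # Illustrative sizes only.
100 | vocab_size, emb_size, hidden_size, num_senses, max_len = 40000, 100, 100, 3, 200
101 | 
102 | x = tf.placeholder(tf.int32, [None, max_len])      # padded word ids
103 | x_mask = tf.placeholder(tf.bool, [None, max_len])  # True for real tokens
104 | y = tf.placeholder(tf.int32, [None])               # one sense label per sentence
105 | seq_len = tf.reduce_sum(tf.cast(x_mask, tf.int32), 1)
106 | 
107 | emb = tf.get_variable("emb", [vocab_size, emb_size])
108 | inputs = tf.nn.embedding_lookup(emb, x)
109 | 
110 | # Bidirectional LSTM encodes the sentence containing the target word.
111 | cell_fw = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
112 | cell_bw = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
113 | (h_fw, h_bw), _ = tf.nn.bidirectional_dynamic_rnn(
114 |     cell_fw, cell_bw, inputs, sequence_length=seq_len, dtype=tf.float32)
115 | h = tf.concat([h_fw, h_bw], axis=2)                # [batch, max_len, 2*hidden]
116 | 
117 | # Additive attention pools the hidden states into a single context vector.
118 | scores = tf.layers.dense(tf.tanh(h), 1)            # [batch, max_len, 1]
119 | neg_inf = tf.fill(tf.shape(scores), -1e30)
120 | scores = tf.where(tf.expand_dims(x_mask, -1), scores, neg_inf)  # ignore padding
121 | alpha = tf.nn.softmax(scores, axis=1)              # attention weights over tokens
122 | context = tf.reduce_sum(alpha * h, axis=1)         # [batch, 2*hidden]
123 | 
124 | # Softmax layer over the senses of the target word.
125 | logits = tf.layers.dense(context, num_senses)
126 | loss = tf.reduce_mean(
127 |     tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
128 | train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
129 | ```
130 | 
131 | The hierarchical variants additionally predict a coarse class (POS tag or WordNet lexicographer class) for each word and use it to mask the fine-grained sense logits, either softly (re-weighting) or hard (zeroing out senses that do not match the predicted class).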
--------------------------------------------------------------------------------
/UGP_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/UGP_Report.pdf
--------------------------------------------------------------------------------
/UGP_presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/UGP_presentation.pdf
--------------------------------------------------------------------------------
/models_diagram/all-word-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-1.png
--------------------------------------------------------------------------------
/models_diagram/all-word-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-2.png
--------------------------------------------------------------------------------
/models_diagram/all-word-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-3.png
--------------------------------------------------------------------------------
/models_diagram/all-word-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-4.png
--------------------------------------------------------------------------------
/models_diagram/all-word-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-5.png
--------------------------------------------------------------------------------
/models_diagram/all-word-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-6.png
--------------------------------------------------------------------------------
/models_diagram/all-word-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-7.png
--------------------------------------------------------------------------------
/models_diagram/all-word-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/all-word-8.png
--------------------------------------------------------------------------------
/models_diagram/model-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-1.png
--------------------------------------------------------------------------------
/models_diagram/model-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-2.png
--------------------------------------------------------------------------------
/models_diagram/model-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-3.png
--------------------------------------------------------------------------------
/models_diagram/model-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/models_diagram/model-4.png
--------------------------------------------------------------------------------
/one_million/One-Million All-Word Data Sampling Coarse.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [
10 | {
11 | "name": "stdout",
12 | "output_type": "stream",
13 | "text": [
14 | "46\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import pickle\n",
20 | "import numpy as np\n",
21 | "import os\n",
22 | "from sklearn.model_selection import train_test_split\n",
23 | "from collections import Counter\n",
24 | "from imblearn.over_sampling import RandomOverSampler\n",
25 | "\n",
26 | "f = open(\"../../dataset/sense/dict_sense-keys\", 'rb')\n",
27 | "dict_sense_keys = pickle.load(f)\n",
28 | "f.close()\n",
29 | "\n",
30 | "f = open(\"../../dataset/sense/dict_word-sense\", 'rb')\n",
31 | "dict_word_sense = pickle.load(f)\n",
32 | "f.close()\n",
33 | "\n",
34 | "f = open('../Glove/word_embedding_glove', 'rb')\n",
35 | "word_embedding = pickle.load(f)\n",
36 | "f.close()\n",
37 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
38 | "\n",
39 | "f = open('../Glove/vocab_glove', 'rb')\n",
40 | "vocab = pickle.load(f)\n",
41 | "f.close()\n",
42 | "\n",
43 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
44 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
45 | "\n",
46 | "unknown_token = \"UNKNOWN_TOKEN\"\n",
47 | "\n",
48 | "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n",
49 | " data=pickle.load(f)\n",
50 | "\n",
51 | "with open('/data/aviraj/dataset/fulldata_vocab_sense','rb') as f:\n",
52 | " vocab_lex=pickle.load(f)\n",
53 | "\n",
54 | "lex2id = dict((s, i) for i,s in enumerate(vocab_lex))\n",
55 | "id2lex = dict((i, s) for i,s in enumerate(vocab_lex))\n",
56 | "\n",
57 | "print(len(vocab_lex))\n",
58 | "max_sent_size = 200"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 2,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "12\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "_pos = []\n",
76 | "for i in range(len(data)):\n",
77 | " for pp in data[i][4]:\n",
78 | " _pos.append(pp)\n",
79 | " \n",
80 | "pos_count = Counter(_pos)\n",
81 | "pos_count = pos_count.most_common()\n",
82 | "vocab_pos = [pp for pp, c in pos_count]\n",
83 | "pos2id = dict((s, i) for i,s in enumerate(vocab_pos))\n",
84 | "print(len(vocab_pos))"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 3,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "data_y1 = []\n",
96 | "data_y2 = []\n",
97 | "data_y3 = []\n",
98 | "for i in range(len(data)):\n",
99 | " if (len(data[i][1])<=200):\n",
100 | " for j in range(len(data[i][2])):\n",
101 | " if data[i][2][j] is not None:\n",
102 | " data_y1.append(dict_sense_keys[data[i][2][j]][3])\n",
103 | " data_y2.append(dict_sense_keys[data[i][2][j]][4])\n",
104 | " data_y3.append(dict_sense_keys[data[i][2][j]][5])\n",
105 | "\n",
106 | "sense_count1 = Counter(data_y1)\n",
107 | "sense_count1 = sense_count1.most_common()\n",
108 | "sense_count2 = Counter(data_y2)\n",
109 | "sense_count4 = sense_count2.most_common(272)\n",
110 | "sense_count2 = sense_count2.most_common(312)\n",
111 | "sense_count3 = Counter(data_y3)\n",
112 | "sense_count5 = sense_count3.most_common(505)\n",
113 | "sense_count3 = sense_count3.most_common(1051)\n",
114 | "\n",
115 | "dict_sense_count1 = dict(sense_count1)\n",
116 | "dict_sense_count2 = dict(sense_count2)\n",
117 | "dict_sense_count3 = dict(sense_count3)\n",
118 | "dict_sense_count4 = dict(sense_count4)\n",
119 | "dict_sense_count5 = dict(sense_count5)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 4,
125 | "metadata": {
126 | "scrolled": true
127 | },
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | "46 312 1051 272 505\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "print(len(sense_count1), len(sense_count2), len(sense_count3), len(sense_count4), len(sense_count5))"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 5,
144 | "metadata": {
145 | "collapsed": true
146 | },
147 | "outputs": [],
148 | "source": [
149 | "data_x = []\n",
150 | "data_pos = []\n",
151 | "data_label1 = []\n",
152 | "data_label2 = []\n",
153 | "data_label3 = []\n",
154 | "data_label4 = []\n",
155 | "data_label5 = []\n",
156 | "\n",
157 | "for i in range(len(data)):\n",
158 | " if not all(np.array(data[i][2])==None) and (len(data[i][1])<=200):\n",
159 | " data_label1.append([ss if ss is not None and dict_sense_keys[ss][3] in dict_sense_count1 else None for ss in data[i][2]])\n",
160 | " data_label2.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count2 else None for ss in data[i][2]])\n",
161 | " data_label3.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count3 else None for ss in data[i][2]])\n",
162 | " data_label4.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count4 else None for ss in data[i][2]])\n",
163 | " data_label5.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count5 else None for ss in data[i][2]])\n",
164 | " data_x.append(data[i][1])\n",
165 | " data_pos.append(data[i][4])"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 6,
171 | "metadata": {
172 | "collapsed": true
173 | },
174 | "outputs": [],
175 | "source": [
176 | "def data_prepare(sense_id, x, pos, y, sense_count, lex_cond=False, pos_cond=False):\n",
177 | " num_examples = len(x)\n",
178 | " \n",
179 | " vocab_sense = [s for s, c in sense_count]\n",
180 | " sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n",
181 | " \n",
182 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
183 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
184 | " ss_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
185 | " yy = np.zeros([num_examples,max_sent_size], dtype=int)\n",
186 | " y_lex = np.zeros([num_examples, max_sent_size], dtype=int)\n",
187 | " y_pos = np.zeros([num_examples, max_sent_size], dtype=int)\n",
188 | " \n",
189 | " for j in range(num_examples):\n",
190 | " for i in range(max_sent_size):\n",
191 | " if(i>=len(x[j])):\n",
192 | " break\n",
193 | " w = x[j][i]\n",
194 | " s = y[j][i]\n",
195 | " p = pos[j][i]\n",
196 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
197 | " xx_mask[j][i] = True\n",
198 | " ss_mask[j][i] = True if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else False\n",
199 | " yy[j][i] = sense2id[dict_sense_keys[s][sense_id]] if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else 0\n",
200 | " if(lex_cond):\n",
201 | " y_lex[j][i] = lex2id[dict_sense_keys[s][3]] if s is not None and dict_sense_keys[s][3] in vocab_lex else len(vocab_lex)\n",
202 | " if(pos_cond):\n",
203 | " y_pos[j][i] = pos2id[p] if p in vocab_pos else len(vocab_pos)\n",
204 | " \n",
205 | " return xx, xx_mask, ss_mask, yy, y_lex, y_pos"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 7,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "data_x = np.array(data_x)\n",
217 | "data_pos = np.array(data_pos)\n",
218 | "\n",
219 | "def train_val_data(name, sense_id, index, split_label, data_label, sense_count, sampling_list, lex_cond=False, pos_cond=False, sampling=False):\n",
220 | " \n",
221 | " index_train, index_val, label_train_id, label_val_id = train_test_split(index, split_label, train_size=0.8, shuffle=True, stratify=split_label, random_state=0)\n",
222 | " \n",
223 | " if(sampling):\n",
224 | " dict_sample = dict(sampling_list)\n",
225 | " sm = RandomOverSampler(ratio=dict_sample)\n",
226 | " index_train1 = np.array(index_train).reshape(-1, 1)\n",
227 | " sampled_index, _ = sm.fit_sample(index_train1, label_train_id)\n",
228 | " count = Counter(_)\n",
229 | " count = count.most_common()\n",
230 | " sampled_index_train = np.array(sampled_index).reshape(1, -1)\n",
231 | " index_train = sampled_index_train[0]\n",
232 | " \n",
233 | " data_label = np.array(data_label)\n",
234 | " x_train = data_x[index_train]\n",
235 | " y_train = data_label[index_train]\n",
236 | " x_val = data_x[index_val]\n",
237 | " y_val = data_label[index_val]\n",
238 | " pos_train = []\n",
239 | " pos_val = []\n",
240 | " \n",
241 | " if(pos_cond):\n",
242 | " pos_train = data_pos[index_train]\n",
243 | " pos_val = data_pos[index_val]\n",
244 | "\n",
245 | " x_id_train, mask_train, sense_mask_train, y_id_train, lex_train, pos_id_train = data_prepare(sense_id, x_train, pos_train, y_train, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n",
246 | " x_id_val, mask_val, sense_mask_val, y_id_val, lex_val, pos_id_val = data_prepare(sense_id, x_val, pos_val, y_val, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n",
247 | "\n",
248 | " train_data = {'x':x_id_train,'x_mask':mask_train, 'sense_mask':sense_mask_train, 'y':y_id_train, 'lex':lex_train, 'pos':pos_id_train}\n",
249 | " val_data = {'x':x_id_val,'x_mask':mask_val, 'sense_mask':sense_mask_val, 'y':y_id_val, 'lex':lex_val, 'pos':pos_id_val}\n",
250 | " \n",
251 | " with open('/data/aviraj/dataset/train_val_data_coarse/all_word_'+ name,'wb') as f:\n",
252 | " pickle.dump([train_data,val_data], f)\n",
253 | " \n",
254 | " print(len(x_id_train)+len(x_id_val))"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 8,
260 | "metadata": {
261 | "scrolled": true
262 | },
263 | "outputs": [
264 | {
265 | "name": "stdout",
266 | "output_type": "stream",
267 | "text": [
268 | "850093\n",
269 | "850062\n",
270 | "850052\n",
271 | "849793\n",
272 | "848996\n"
273 | ]
274 | }
275 | ],
276 | "source": [
277 | "split_label1 = []\n",
278 | "split_label2 = []\n",
279 | "split_label3 = []\n",
280 | "split_label4 = []\n",
281 | "split_label5 = []\n",
282 | "\n",
283 | "index1 = []\n",
284 | "index2 = []\n",
285 | "index3 = []\n",
286 | "index4 = []\n",
287 | "index5 = []\n",
288 | "\n",
289 | "for jj, lab in enumerate(data_label1):\n",
290 | " min_idx = np.argmin([dict_sense_count1[dict_sense_keys[lab[i]][3]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n",
291 | " if(lab[min_idx] is not None):\n",
292 | " index1.append(jj)\n",
293 | " split_label1.append(dict_sense_keys[lab[min_idx]][3])\n",
294 | "\n",
295 | "for jj, lab in enumerate(data_label2):\n",
296 | " min_idx = np.argmin([dict_sense_count2[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n",
297 | " if(lab[min_idx] is not None):\n",
298 | " index2.append(jj)\n",
299 | " split_label2.append(dict_sense_keys[lab[min_idx]][4])\n",
300 | "\n",
301 | "for jj, lab in enumerate(data_label3):\n",
302 | " min_idx = np.argmin([dict_sense_count3[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n",
303 | " if(lab[min_idx] is not None):\n",
304 | " index3.append(jj)\n",
305 | " split_label3.append(dict_sense_keys[lab[min_idx]][5])\n",
306 | " \n",
307 | "for jj, lab in enumerate(data_label4):\n",
308 | " min_idx = np.argmin([dict_sense_count4[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n",
309 | " if(lab[min_idx] is not None):\n",
310 | " index4.append(jj)\n",
311 | " split_label4.append(dict_sense_keys[lab[min_idx]][4])\n",
312 | "\n",
313 | "for jj, lab in enumerate(data_label5):\n",
314 | " min_idx = np.argmin([dict_sense_count5[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n",
315 | " if(lab[min_idx] is not None):\n",
316 | " index5.append(jj)\n",
317 | " split_label5.append(dict_sense_keys[lab[min_idx]][5])\n",
318 | " \n",
319 | "print(len(split_label1))\n",
320 | "print(len(split_label2))\n",
321 | "print(len(split_label3))\n",
322 | "print(len(split_label4))\n",
323 | "print(len(split_label5))"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 9,
329 | "metadata": {
330 | "scrolled": true
331 | },
332 | "outputs": [
333 | {
334 | "name": "stderr",
335 | "output_type": "stream",
336 | "text": [
337 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
338 | " FutureWarning)\n"
339 | ]
340 | },
341 | {
342 | "name": "stdout",
343 | "output_type": "stream",
344 | "text": [
345 | "850093\n",
346 | "850062\n",
347 | "850052\n",
348 | "849793\n",
349 | "848996\n",
350 | "848996\n"
351 | ]
352 | }
353 | ],
354 | "source": [
355 | "train_val_data('lex1', 3, index1, split_label1, data_label1, sense_count1, [], lex_cond=False, pos_cond=True)\n",
356 | "train_val_data('lex2', 3, index2, split_label2, data_label2, sense_count1, [], lex_cond=False, pos_cond=True)\n",
357 | "train_val_data('lex3', 3, index3, split_label3, data_label3, sense_count1, [], lex_cond=False, pos_cond=True)\n",
358 | "train_val_data('sense1', 4, index4, split_label4, data_label4, sense_count4, [], lex_cond=True, pos_cond=True)\n",
359 | "train_val_data('sense2', 4, index5, split_label5, data_label5, sense_count4, [], lex_cond=True, pos_cond=True)\n",
360 | "train_val_data('full_sense', 5, index5, split_label5, data_label5, sense_count5, [], lex_cond=True, pos_cond=True)"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 10,
366 | "metadata": {
367 | "collapsed": true
368 | },
369 | "outputs": [],
370 | "source": [
371 | "sampled_sense_count1 = [('1:19', 10000),\n",
372 | " ('1:17', 10000),\n",
373 | " ('2:34', 10000),\n",
374 | " ('2:33', 10000),\n",
375 | " ('1:27', 10000),\n",
376 | " ('2:37', 8000),\n",
377 | " ('1:24', 8000),\n",
378 | " ('1:08', 8000),\n",
379 | " ('1:12', 7000),\n",
380 | " ('1:22', 5000),\n",
381 | " ('2:29', 5000),\n",
382 | " ('1:05', 3000),\n",
383 | " ('1:16', 3000),\n",
384 | " ('1:25', 3000),\n",
385 | " ('1:20', 3000),\n",
386 | " ('1:13', 2000),\n",
387 | " ('2:43', 1100),\n",
388 | " ('3:44', 1000)]"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 11,
394 | "metadata": {
395 | "collapsed": true
396 | },
397 | "outputs": [],
398 | "source": [
399 | "sampled_sense_count2= []\n",
400 | "for s, c in sense_count2[260:]:\n",
401 | " sampled_sense_count2.append((s, 500))\n",
402 | "for s, c in sense_count2[180:260]:\n",
403 | " sampled_sense_count2.append((s, 2000))\n",
404 | "for s, c in sense_count2[140:180]:\n",
405 | " sampled_sense_count2.append((s, 5000))\n",
406 | "for s, c in sense_count2[75:140]:\n",
407 | " sampled_sense_count2.append((s, 8000))\n",
408 | "for s, c in sense_count2[25:75]:\n",
409 | " sampled_sense_count2.append((s, 12000))"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 12,
415 | "metadata": {
416 | "collapsed": true
417 | },
418 | "outputs": [],
419 | "source": [
420 | "sampled_sense_count3= []\n",
421 | "for s, c in sense_count3[400:]:\n",
422 | " sampled_sense_count3.append((s, 500))\n",
423 | "for s, c in sense_count3[200:400]:\n",
424 | " sampled_sense_count3.append((s, 2000))\n",
425 | "for s, c in sense_count3[100:200]:\n",
426 | " sampled_sense_count3.append((s, 5000))\n",
427 | "for s, c in sense_count3[70:100]:\n",
428 | " sampled_sense_count3.append((s, 8000))\n",
429 | "for s, c in sense_count3[25:70]:\n",
430 | " sampled_sense_count3.append((s, 12000))"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 13,
436 | "metadata": {
437 | "collapsed": true
438 | },
439 | "outputs": [],
440 | "source": [
441 | "sampled_sense_count4= []\n",
442 | "for s, c in sense_count4[260:]:\n",
443 | " sampled_sense_count4.append((s, 500))\n",
444 | "for s, c in sense_count4[180:260]:\n",
445 | " sampled_sense_count4.append((s, 2000))\n",
446 | "for s, c in sense_count4[140:180]:\n",
447 | " sampled_sense_count4.append((s, 5000))\n",
448 | "for s, c in sense_count4[75:140]:\n",
449 | " sampled_sense_count4.append((s, 8000))\n",
450 | "for s, c in sense_count4[25:75]:\n",
451 | " sampled_sense_count4.append((s, 12000))"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 14,
457 | "metadata": {
458 | "collapsed": true
459 | },
460 | "outputs": [],
461 | "source": [
462 | "sampled_sense_count5= []\n",
463 | "for s, c in sense_count5[400:]:\n",
464 | " sampled_sense_count5.append((s, 500))\n",
465 | "for s, c in sense_count5[200:400]:\n",
466 | " sampled_sense_count5.append((s, 2000))\n",
467 | "for s, c in sense_count5[100:200]:\n",
468 | " sampled_sense_count5.append((s, 5000))\n",
469 | "for s, c in sense_count5[70:100]:\n",
470 | " sampled_sense_count5.append((s, 8000))\n",
471 | "for s, c in sense_count5[25:70]:\n",
472 | " sampled_sense_count5.append((s, 12000))"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 15,
478 | "metadata": {
479 | "scrolled": false
480 | },
481 | "outputs": [
482 | {
483 | "name": "stderr",
484 | "output_type": "stream",
485 | "text": [
486 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
487 | " FutureWarning)\n"
488 | ]
489 | },
490 | {
491 | "name": "stdout",
492 | "output_type": "stream",
493 | "text": [
494 | "911174\n",
495 | "2061567\n",
496 | "2512876\n",
497 | "2041581\n",
498 | "2239996\n",
499 | "2239996\n"
500 | ]
501 | }
502 | ],
503 | "source": [
504 | "train_val_data('lex1_sampled', 3, index1, split_label1, data_label1, sense_count1, sampled_sense_count1, lex_cond=False, pos_cond=True, sampling=True)\n",
505 | "train_val_data('lex2_sampled', 3, index2, split_label2, data_label2, sense_count1, sampled_sense_count2, lex_cond=False, pos_cond=True, sampling=True)\n",
506 | "train_val_data('lex3_sampled', 3, index3, split_label3, data_label3, sense_count1, sampled_sense_count3, lex_cond=False, pos_cond=True, sampling=True)\n",
507 | "train_val_data('sense1_sampled', 4, index4, split_label4, data_label4, sense_count4, sampled_sense_count4, lex_cond=True, pos_cond=True, sampling=True)\n",
508 | "train_val_data('sense2_sampled', 4, index5, split_label5, data_label5, sense_count4, sampled_sense_count5, lex_cond=True, pos_cond=True, sampling=True)\n",
509 | "train_val_data('full_sense_sampled', 5, index5, split_label5, data_label5, sense_count5, sampled_sense_count5, lex_cond=True, pos_cond=True, sampling=True)"
510 | ]
511 | }
512 | ],
513 | "metadata": {
514 | "kernelspec": {
515 | "display_name": "cs771",
516 | "language": "python",
517 | "name": "cs771"
518 | },
519 | "language_info": {
520 | "codemirror_mode": {
521 | "name": "ipython",
522 | "version": 3
523 | },
524 | "file_extension": ".py",
525 | "mimetype": "text/x-python",
526 | "name": "python",
527 | "nbconvert_exporter": "python",
528 | "pygments_lexer": "ipython3",
529 | "version": "3.5.2"
530 | }
531 | },
532 | "nbformat": 4,
533 | "nbformat_minor": 2
534 | }
535 |
--------------------------------------------------------------------------------
/one_million/One-Million All-Word Data Sampling-Fine.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true,
8 | "deletable": true,
9 | "editable": true
10 | },
11 | "outputs": [],
12 | "source": [
13 | "import pickle\n",
14 | "import numpy as np\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from collections import Counter\n",
18 | "from imblearn.over_sampling import RandomOverSampler"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {
25 | "collapsed": false,
26 | "deletable": true,
27 | "editable": true,
28 | "scrolled": true
29 | },
30 | "outputs": [
31 | {
32 | "name": "stdout",
33 | "output_type": "stream",
34 | "text": [
35 | "46\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "f = open(\"../../dataset/sense/dict_sense-keys\", 'rb')\n",
41 | "dict_sense_keys = pickle.load(f)\n",
42 | "f.close()\n",
43 | "\n",
44 | "f = open(\"../../dataset/sense/dict_word-sense\", 'rb')\n",
45 | "dict_word_sense = pickle.load(f)\n",
46 | "f.close()\n",
47 | "\n",
48 | "f = open('../Glove/word_embedding_glove', 'rb')\n",
49 | "word_embedding = pickle.load(f)\n",
50 | "f.close()\n",
51 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
52 | "\n",
53 | "f = open('../Glove/vocab_glove', 'rb')\n",
54 | "vocab = pickle.load(f)\n",
55 | "f.close()\n",
56 | "\n",
57 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
58 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
59 | "\n",
60 | "unknown_token = \"UNKNOWN_TOKEN\"\n",
61 | "\n",
62 | "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n",
63 | " data=pickle.load(f)\n",
64 | "\n",
65 | "with open('/data/aviraj/dataset/fulldata_vocab_sense','rb') as f:\n",
66 | " vocab_lex=pickle.load(f)\n",
67 | "\n",
68 | "lex2id = dict((s, i) for i,s in enumerate(vocab_lex))\n",
69 | "id2lex = dict((i, s) for i,s in enumerate(vocab_lex))\n",
70 | "\n",
71 | "print(len(vocab_lex))\n",
72 | "max_sent_size = 200"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 3,
78 | "metadata": {
79 | "collapsed": false,
80 | "deletable": true,
81 | "editable": true
82 | },
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "12\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "_pos = []\n",
94 | "for i in range(len(data)):\n",
95 | " for pp in data[i][4]:\n",
96 | " _pos.append(pp)\n",
97 | " \n",
98 | "pos_count = Counter(_pos)\n",
99 | "pos_count = pos_count.most_common()\n",
100 | "vocab_pos = [pp for pp, c in pos_count]\n",
101 | "pos2id = dict((s, i) for i,s in enumerate(vocab_pos))\n",
102 | "print(len(vocab_pos))"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 4,
108 | "metadata": {
109 | "collapsed": false,
110 | "deletable": true,
111 | "editable": true
112 | },
113 | "outputs": [],
114 | "source": [
115 | "data_y1 = []\n",
116 | "data_y2 = []\n",
117 | "data_y3 = []\n",
118 | "for i in range(len(data)):\n",
119 | " if (len(data[i][1])<=200):\n",
120 | " for j in range(len(data[i][2])):\n",
121 | " if data[i][2][j] is not None:\n",
122 | " data_y1.append(dict_sense_keys[data[i][2][j]][3])\n",
123 | " data_y2.append(dict_sense_keys[data[i][2][j]][4])\n",
124 | " data_y3.append(dict_sense_keys[data[i][2][j]][5])\n",
125 | "\n",
126 | "sense_count1 = Counter(data_y1)\n",
127 | "sense_count1 = sense_count1.most_common()[:-2]\n",
128 | "\n",
129 | "sense_count2 = Counter(data_y2)\n",
130 | "sense_count2 = sense_count2.most_common(180)\n",
131 | "\n",
132 | "sense_count3 = Counter(data_y3)\n",
133 | "sense_count3 = sense_count3.most_common(300)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 5,
139 | "metadata": {
140 | "collapsed": true,
141 | "deletable": true,
142 | "editable": true
143 | },
144 | "outputs": [],
145 | "source": [
146 | "dict_sense_count1 = dict(sense_count1)\n",
147 | "dict_sense_count2 = dict(sense_count2)\n",
148 | "dict_sense_count3 = dict(sense_count3)"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 6,
154 | "metadata": {
155 | "collapsed": false,
156 | "deletable": true,
157 | "editable": true,
158 | "scrolled": true
159 | },
160 | "outputs": [
161 | {
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "44 180 300\n"
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "print(len(sense_count1), len(sense_count2), len(sense_count3))"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 7,
176 | "metadata": {
177 | "collapsed": true,
178 | "deletable": true,
179 | "editable": true
180 | },
181 | "outputs": [],
182 | "source": [
183 | "data_x = []\n",
184 | "data_pos = []\n",
185 | "data_label1 = []\n",
186 | "data_label2 = []\n",
187 | "data_label3 = []\n",
188 | "\n",
189 | "for i in range(len(data)):\n",
190 | " if not all(np.array(data[i][2])==None) and (len(data[i][1])<=200):\n",
191 | " data_label1.append([ss if ss is not None and dict_sense_keys[ss][3] in dict_sense_count1 else None for ss in data[i][2]])\n",
192 | " data_label2.append([ss if ss is not None and dict_sense_keys[ss][4] in dict_sense_count2 else None for ss in data[i][2]])\n",
193 | " data_label3.append([ss if ss is not None and dict_sense_keys[ss][5] in dict_sense_count3 else None for ss in data[i][2]])\n",
194 | " data_x.append(data[i][1])\n",
195 | " data_pos.append(data[i][4])"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 8,
201 | "metadata": {
202 | "collapsed": true,
203 | "deletable": true,
204 | "editable": true
205 | },
206 | "outputs": [],
207 | "source": [
208 | "def data_prepare(sense_id, x, pos, y, sense_count, lex_cond=False, pos_cond=False):\n",
209 | " num_examples = len(x)\n",
210 | " \n",
211 | " vocab_sense = [s for s, c in sense_count]\n",
212 | " sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n",
213 | " \n",
214 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
215 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
216 | " ss_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
217 | " yy = np.zeros([num_examples,max_sent_size], dtype=int)\n",
218 | " y_lex = np.zeros([num_examples, max_sent_size], dtype=int)\n",
219 | " y_pos = np.zeros([num_examples, max_sent_size], dtype=int)\n",
220 | " \n",
221 | " for j in range(num_examples):\n",
222 | " for i in range(max_sent_size):\n",
223 | " if(i>=len(x[j])):\n",
224 | " break\n",
225 | " w = x[j][i]\n",
226 | " s = y[j][i]\n",
227 | " p = pos[j][i]\n",
228 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
229 | " xx_mask[j][i] = True\n",
230 | " ss_mask[j][i] = True if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else False\n",
231 | " yy[j][i] = sense2id[dict_sense_keys[s][sense_id]] if s is not None and dict_sense_keys[s][sense_id] in vocab_sense else 0\n",
232 | " if(lex_cond):\n",
233 | " y_lex[j][i] = lex2id[dict_sense_keys[s][3]] if s is not None and dict_sense_keys[s][3] in vocab_lex else len(vocab_lex)\n",
234 | " if(pos_cond):\n",
235 | " y_pos[j][i] = pos2id[p] if p in vocab_pos else len(vocab_pos)\n",
236 | " \n",
237 | " return xx, xx_mask, ss_mask, yy, y_lex, y_pos"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 9,
243 | "metadata": {
244 | "collapsed": true,
245 | "deletable": true,
246 | "editable": true
247 | },
248 | "outputs": [],
249 | "source": [
250 | "data_x = np.array(data_x)\n",
251 | "data_pos = np.array(data_pos)\n",
252 | "\n",
253 | "def train_val_data(name, sense_id, index, split_label, data_label, sense_count, sampling_list, lex_cond=False, pos_cond=False, sampling=False):\n",
254 | " \n",
255 | " index_train, index_val, label_train_id, label_val_id = train_test_split(index, split_label, train_size=0.8, shuffle=True, stratify=split_label, random_state=0)\n",
256 | " \n",
257 | " if(sampling):\n",
258 | " dict_sample = dict(sampling_list)\n",
259 | " sm = RandomOverSampler(ratio=dict_sample)\n",
260 | " index_train1 = np.array(index_train).reshape(-1, 1)\n",
261 | " sampled_index, _ = sm.fit_sample(index_train1, label_train_id)\n",
262 | " count = Counter(_)\n",
263 | " count = count.most_common()\n",
264 | " sampled_index_train = np.array(sampled_index).reshape(1, -1)\n",
265 | " index_train = sampled_index_train[0]\n",
266 | " \n",
267 | " data_label = np.array(data_label)\n",
268 | " x_train = data_x[index_train]\n",
269 | " y_train = data_label[index_train]\n",
270 | " x_val = data_x[index_val]\n",
271 | " y_val = data_label[index_val]\n",
272 | " pos_train = []\n",
273 | " pos_val = []\n",
274 | " \n",
275 | " if(pos_cond):\n",
276 | " pos_train = data_pos[index_train]\n",
277 | " pos_val = data_pos[index_val]\n",
278 | "\n",
279 | " x_id_train, mask_train, sense_mask_train, y_id_train, lex_train, pos_id_train = data_prepare(sense_id, x_train, pos_train, y_train, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n",
280 | " x_id_val, mask_val, sense_mask_val, y_id_val, lex_val, pos_id_val = data_prepare(sense_id, x_val, pos_val, y_val, sense_count, lex_cond=lex_cond, pos_cond=pos_cond)\n",
281 | "\n",
282 | " train_data = {'x':x_id_train,'x_mask':mask_train, 'sense_mask':sense_mask_train, 'y':y_id_train, 'lex':lex_train, 'pos':pos_id_train}\n",
283 | " val_data = {'x':x_id_val,'x_mask':mask_val, 'sense_mask':sense_mask_val, 'y':y_id_val, 'lex':lex_val, 'pos':pos_id_val}\n",
284 | " \n",
285 | " with open('/data/aviraj/dataset/train_val_data_fine/all_word_'+ name,'wb') as f:\n",
286 | " pickle.dump([train_data,val_data], f)\n",
287 | " \n",
288 | " print(len(x_id_train)+len(x_id_val))"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 10,
294 | "metadata": {
295 | "collapsed": false,
296 | "deletable": true,
297 | "editable": true,
298 | "scrolled": true
299 | },
300 | "outputs": [
301 | {
302 | "name": "stdout",
303 | "output_type": "stream",
304 | "text": [
305 | "850083\n",
306 | "838757\n",
307 | "828921\n"
308 | ]
309 | }
310 | ],
311 | "source": [
312 | "split_label1 = []\n",
313 | "split_label2 = []\n",
314 | "split_label3 = []\n",
315 | "\n",
316 | "index1 = []\n",
317 | "index2 = []\n",
318 | "index3 = []\n",
319 | "\n",
320 | "for jj, lab in enumerate(data_label1):\n",
321 | " min_idx = np.argmin([dict_sense_count1[dict_sense_keys[lab[i]][3]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n",
322 | " if(lab[min_idx] is not None):\n",
323 | " index1.append(jj)\n",
324 | " split_label1.append(dict_sense_keys[lab[min_idx]][3])\n",
325 | "\n",
326 | "for jj, lab in enumerate(data_label2):\n",
327 | " min_idx = np.argmin([dict_sense_count2[dict_sense_keys[lab[i]][4]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n",
328 | " if(lab[min_idx] is not None):\n",
329 | " index2.append(jj)\n",
330 | " split_label2.append(dict_sense_keys[lab[min_idx]][4])\n",
331 | "\n",
332 | "for jj, lab in enumerate(data_label3):\n",
333 | " min_idx = np.argmin([dict_sense_count3[dict_sense_keys[lab[i]][5]] if lab[i] is not None else np.inf for i in range(len(lab)) ]) \n",
334 | " if(lab[min_idx] is not None):\n",
335 | " index3.append(jj)\n",
336 | " split_label3.append(dict_sense_keys[lab[min_idx]][5])\n",
337 | " \n",
338 | "print(len(split_label1))\n",
339 | "print(len(split_label2))\n",
340 | "print(len(split_label3))"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 11,
346 | "metadata": {
347 | "collapsed": false,
348 | "deletable": true,
349 | "editable": true,
350 | "scrolled": true
351 | },
352 | "outputs": [
353 | {
354 | "name": "stderr",
355 | "output_type": "stream",
356 | "text": [
357 | "/users/btech/aviraj/envs/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
358 | " FutureWarning)\n"
359 | ]
360 | },
361 | {
362 | "name": "stdout",
363 | "output_type": "stream",
364 | "text": [
365 | "850083\n",
366 | "838757\n",
367 | "828921\n"
368 | ]
369 | }
370 | ],
371 | "source": [
372 | "train_val_data('lex', 3, index1, split_label1, data_label1, sense_count1, [], lex_cond=False, pos_cond=True)\n",
373 | "train_val_data('sense', 4, index2, split_label2, data_label2, sense_count2, [], lex_cond=True, pos_cond=True)\n",
374 | "train_val_data('full_sense', 5, index3, split_label3, data_label3, sense_count3, [], lex_cond=True, pos_cond=True)"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 12,
380 | "metadata": {
381 | "collapsed": true,
382 | "deletable": true,
383 | "editable": true
384 | },
385 | "outputs": [],
386 | "source": [
387 | "sampled_sense_count1 = [('1:19', 10000),\n",
388 | " ('1:17', 10000),\n",
389 | " ('2:34', 10000),\n",
390 | " ('2:33', 10000),\n",
391 | " ('1:27', 10000),\n",
392 | " ('2:37', 8000),\n",
393 | " ('1:24', 8000),\n",
394 | " ('1:08', 8000),\n",
395 | " ('1:12', 7000),\n",
396 | " ('1:22', 5000),\n",
397 | " ('2:29', 5000),\n",
398 | " ('1:05', 3000),\n",
399 | " ('1:16', 3000),\n",
400 | " ('1:25', 3000),\n",
401 | " ('1:20', 3000),\n",
402 | " ('1:13', 2000)]"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 13,
408 | "metadata": {
409 | "collapsed": true,
410 | "deletable": true,
411 | "editable": true
412 | },
413 | "outputs": [],
414 | "source": [
415 | "sampled_sense_count2= []\n",
416 | "for s, c in sense_count2[120:]:\n",
417 | " sampled_sense_count2.append((s, 5000))\n",
418 | "for s, c in sense_count2[75:120]:\n",
419 | " sampled_sense_count2.append((s, 8000))\n",
420 | "for s, c in sense_count2[25:75]:\n",
421 | " sampled_sense_count2.append((s, 12000))"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 14,
427 | "metadata": {
428 | "collapsed": true,
429 | "deletable": true,
430 | "editable": true
431 | },
432 | "outputs": [],
433 | "source": [
434 | "sampled_sense_count3= []\n",
435 | "for s, c in sense_count3[130:]:\n",
436 | " sampled_sense_count3.append((s, 5000))\n",
437 | "for s, c in sense_count3[70:130]:\n",
438 | " sampled_sense_count3.append((s, 8000))\n",
439 | "for s, c in sense_count3[25:70]:\n",
440 | " sampled_sense_count3.append((s, 12000))"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 15,
446 | "metadata": {
447 | "collapsed": false,
448 | "deletable": true,
449 | "editable": true
450 | },
451 | "outputs": [
452 | {
453 | "name": "stderr",
454 | "output_type": "stream",
455 | "text": [
456 | "/users/btech/aviraj/envs/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
457 | " FutureWarning)\n"
458 | ]
459 | },
460 | {
461 | "name": "stdout",
462 | "output_type": "stream",
463 | "text": [
464 | "909119\n",
465 | "1814988\n",
466 | "2375783\n"
467 | ]
468 | }
469 | ],
470 | "source": [
471 | "train_val_data('lex_sampled', 3, index1, split_label1, data_label1, sense_count1, sampled_sense_count1, lex_cond=False, pos_cond=True, sampling=True)\n",
472 | "train_val_data('sense_sampled', 4, index2, split_label2, data_label2, sense_count2, sampled_sense_count2, lex_cond=True, pos_cond=True, sampling=True)\n",
473 | "train_val_data('full_sense_sampled', 5, index3, split_label3, data_label3, sense_count3, sampled_sense_count3, lex_cond=True, pos_cond=True, sampling=True)"
474 | ]
475 | },
476 | {
477 | "cell_type": "code",
478 | "execution_count": null,
479 | "metadata": {
480 | "collapsed": true
481 | },
482 | "outputs": [],
483 | "source": []
484 | }
485 | ],
486 | "metadata": {
487 | "kernelspec": {
488 | "display_name": "envs",
489 | "language": "python",
490 | "name": "cs771"
491 | },
492 | "language_info": {
493 | "codemirror_mode": {
494 | "name": "ipython",
495 | "version": 3
496 | },
497 | "file_extension": ".py",
498 | "mimetype": "text/x-python",
499 | "name": "python",
500 | "nbconvert_exporter": "python",
501 | "pygments_lexer": "ipython3",
502 | "version": "3.5.2"
503 | }
504 | },
505 | "nbformat": 4,
506 | "nbformat_minor": 2
507 | }
508 |
--------------------------------------------------------------------------------
/one_million/Sense-test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pickle\n",
12 | "from collections import Counter"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "with open('../../dataset/ALL.gold.key.txt') as f:\n",
24 | " sense_key = f.readlines()\n",
25 | "sense_key = [x.strip() for x in sense_key] "
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "with open(\"../../dataset/sense/ALL-keys\",\"wb\") as f:\n",
37 | " pickle.dump(sense_key, f)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {
44 | "collapsed": true
45 | },
46 | "outputs": [],
47 | "source": []
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 4,
52 | "metadata": {},
53 | "outputs": [
54 | {
55 | "name": "stdout",
56 | "output_type": "stream",
57 | "text": [
58 | "4132\n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "for i,s in enumerate(sense_key):\n",
64 | " if(s[:11] == 'semeval2007'):\n",
65 | " print(i)\n",
66 | " break"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 5,
72 | "metadata": {},
73 | "outputs": [
74 | {
75 | "data": {
76 | "text/plain": [
77 | "'art%1:09:00::'"
78 | ]
79 | },
80 | "execution_count": 5,
81 | "metadata": {},
82 | "output_type": "execute_result"
83 | }
84 | ],
85 | "source": [
86 | "sense_key[0][25:]"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 7,
92 | "metadata": {
93 | "collapsed": true
94 | },
95 | "outputs": [],
96 | "source": [
97 | "dict_sense_keys = dict((sense_key[i][:24], []) if i<4132 else (sense_key[i][:26], []) for i in range(len(sense_key)))\n",
98 | "\n",
99 | "for i in range(4132):\n",
100 | " index = sense_key[i].find(\"%\")\n",
101 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][25:])\n",
102 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1])\n",
103 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+3:index+5])\n",
104 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:index+5])\n",
105 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:index+8])\n",
106 | " dict_sense_keys[sense_key[i][:24]].append(sense_key[i][index+1:])\n",
107 | "\n",
108 | "for i in range(4132, len(sense_key)):\n",
109 | " index = sense_key[i].find(\"%\")\n",
110 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][27:])\n",
111 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1])\n",
112 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+3:index+5])\n",
113 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:index+5])\n",
114 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:index+8])\n",
115 | " dict_sense_keys[sense_key[i][:26]].append(sense_key[i][index+1:])"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 10,
121 | "metadata": {
122 | "collapsed": true
123 | },
124 | "outputs": [],
125 | "source": [
126 | "with open(\"../../dataset/sense/dict_sense-keys_test\",\"wb\") as f:\n",
127 | " pickle.dump(dict_sense_keys, f)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 3,
133 | "metadata": {
134 | "collapsed": true
135 | },
136 | "outputs": [],
137 | "source": [
138 | "total_words = []\n",
139 | "for i in range(226036):\n",
140 | " index = sense_key[i].find(\"%\")\n",
141 | " total_words.append(sense_key[i][15:index])\n",
142 | "\n",
143 | "for i in range(226036, len(sense_key)):\n",
144 | " index = sense_key[i].find(\"%\")\n",
145 | " total_words.append(sense_key[i][24:index])\n",
146 | "\n",
147 | "total_words = Counter(total_words)\n",
148 | "word_count = total_words.most_common()\n",
149 | "vocab_words = [k for k,v in word_count]"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 7,
155 | "metadata": {
156 | "scrolled": true
157 | },
158 | "outputs": [
159 | {
160 | "data": {
161 | "text/plain": [
162 | "20400"
163 | ]
164 | },
165 | "execution_count": 7,
166 | "metadata": {},
167 | "output_type": "execute_result"
168 | }
169 | ],
170 | "source": [
171 | "with open(\"../../dataset/sense/vocab_sense-words\",\"wb\") as f:\n",
172 | " pickle.dump(vocab_words, f)\n",
173 | " \n",
174 | "len(vocab_words)"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 4,
180 | "metadata": {
181 | "scrolled": true
182 | },
183 | "outputs": [
184 | {
185 | "data": {
186 | "text/plain": [
187 | "[('change', 3074),\n",
188 | " ('lead', 2987),\n",
189 | " ('design', 2938),\n",
190 | " ('open', 2922),\n",
191 | " ('study', 2920),\n",
192 | " ('set', 2909),\n",
193 | " ('call', 2906),\n",
194 | " ('point', 2855),\n",
195 | " ('bring', 2836),\n",
196 | " ('extend', 2832)]"
197 | ]
198 | },
199 | "execution_count": 4,
200 | "metadata": {},
201 | "output_type": "execute_result"
202 | }
203 | ],
204 | "source": [
205 | "word_count[20:30]"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": []
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 9,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "data": {
224 | "text/plain": [
225 | "34322"
226 | ]
227 | },
228 | "execution_count": 9,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "total_word_senses = []\n",
235 | "\n",
236 | "for i in range(226036):\n",
237 | " total_word_senses.append(sense_key[i][15:])\n",
238 | "\n",
239 | "for i in range(226036, len(sense_key)):\n",
240 | " total_word_senses.append(sense_key[i][24:])\n",
241 | "\n",
242 | "total_word_senses = Counter(total_word_senses)\n",
243 | "word_senses_count = total_word_senses.most_common()\n",
244 | "vocab_word_senses = [k for k,v in word_senses_count]\n",
245 | "\n",
246 | "with open(\"../../dataset/sense/vocab_word-senses\",\"wb\") as f:\n",
247 | " pickle.dump(vocab_word_senses, f)\n",
248 | " \n",
249 | "len(vocab_word_senses)"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 10,
255 | "metadata": {},
256 | "outputs": [
257 | {
258 | "data": {
259 | "text/plain": [
260 | "[('be%2:42:03::', 10582),\n",
261 | " ('person%1:03:00::', 7195),\n",
262 | " ('line%1:04:01::', 4968),\n",
263 | " ('see%2:31:00::', 4554),\n",
264 | " ('be%2:42:06::', 3423),\n",
265 | " ('keep%2:41:03::', 2283),\n",
266 | " ('little%3:00:03::', 2042),\n",
267 | " ('group%1:03:00::', 1826),\n",
268 | " ('say%2:32:00::', 1819),\n",
269 | " ('not%4:02:00::', 1703)]"
270 | ]
271 | },
272 | "execution_count": 10,
273 | "metadata": {},
274 | "output_type": "execute_result"
275 | }
276 | ],
277 | "source": [
278 | "word_senses_count[:10]"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "collapsed": true
286 | },
287 | "outputs": [],
288 | "source": []
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 11,
293 | "metadata": {
294 | "collapsed": true
295 | },
296 | "outputs": [],
297 | "source": [
298 | "dict_word_sense_keys = dict((w, []) for w in vocab_words)\n",
299 | "\n",
300 | "for v in vocab_word_senses:\n",
301 | " dict_word_sense_keys[v[:v.find(\"%\")]].append(v)\n",
302 | "\n",
303 | "with open(\"../../dataset/sense/dict_word-sense\",\"wb\") as f:\n",
304 | " pickle.dump(dict_word_sense_keys, f)"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 12,
310 | "metadata": {
311 | "scrolled": true
312 | },
313 | "outputs": [
314 | {
315 | "data": {
316 | "text/plain": [
317 | "['open%2:35:00::',\n",
318 | " 'open%5:00:00:public:00',\n",
319 | " 'open%2:41:01::',\n",
320 | " 'open%2:30:00::',\n",
321 | " 'open%5:00:00:unrestricted:00',\n",
322 | " 'open%2:30:01::',\n",
323 | " 'open%2:35:06::',\n",
324 | " 'open%2:41:00::',\n",
325 | " 'open%3:00:01::',\n",
326 | " 'open%3:00:02::',\n",
327 | " 'open%5:00:00:unprotected:00',\n",
328 | " 'open%2:35:08::',\n",
329 | " 'open%2:33:00::',\n",
330 | " 'open%5:00:00:available:00',\n",
331 | " 'open%2:42:00::',\n",
332 | " 'open%5:00:00:coarse:00',\n",
333 | " 'open%5:00:00:unenclosed:00',\n",
334 | " 'open%5:00:00:vulnerable:00',\n",
335 | " 'open%3:00:04::',\n",
336 | " 'open%1:15:02::',\n",
337 | " 'open%5:00:00:unconstricted:00',\n",
338 | " 'open%3:00:08::',\n",
339 | " 'open%5:00:00:unsealed:01',\n",
340 | " 'open%5:00:00:unsettled:02',\n",
341 | " 'open%1:15:01::']"
342 | ]
343 | },
344 | "execution_count": 12,
345 | "metadata": {},
346 | "output_type": "execute_result"
347 | }
348 | ],
349 | "source": [
350 | "dict_word_sense_keys['open']"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": null,
356 | "metadata": {
357 | "collapsed": true
358 | },
359 | "outputs": [],
360 | "source": []
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 14,
365 | "metadata": {},
366 | "outputs": [
367 | {
368 | "data": {
369 | "text/plain": [
370 | "2468"
371 | ]
372 | },
373 | "execution_count": 14,
374 | "metadata": {},
375 | "output_type": "execute_result"
376 | }
377 | ],
378 | "source": [
379 | "total_sense = []\n",
380 | "senses = []\n",
381 | "for i in range(len(sense_key)):\n",
382 | " index = sense_key[i].find(\"%\")\n",
383 | " total_sense.append(sense_key[i][index+1:])\n",
384 | "\n",
385 | "total_sense = Counter(total_sense)\n",
386 | "sense_count = total_sense.most_common()\n",
387 | "\n",
388 | "vocab_sense = [k for k,v in sense_count]\n",
389 | "\n",
390 | "with open(\"../../dataset/sense/vocab_sense\",\"wb\") as f:\n",
391 | " pickle.dump(vocab_sense, f)\n",
392 | " \n",
393 | "len(vocab_sense)"
394 | ]
395 | },
396 | {
397 | "cell_type": "code",
398 | "execution_count": null,
399 | "metadata": {
400 | "collapsed": true
401 | },
402 | "outputs": [],
403 | "source": []
404 | }
405 | ],
406 | "metadata": {
407 | "kernelspec": {
408 | "display_name": "envs",
409 | "language": "python",
410 | "name": "cs771"
411 | },
412 | "language_info": {
413 | "codemirror_mode": {
414 | "name": "ipython",
415 | "version": 3
416 | },
417 | "file_extension": ".py",
418 | "mimetype": "text/x-python",
419 | "name": "python",
420 | "nbconvert_exporter": "python",
421 | "pygments_lexer": "ipython3",
422 | "version": "3.5.2"
423 | }
424 | },
425 | "nbformat": 4,
426 | "nbformat_minor": 2
427 | }
428 |
--------------------------------------------------------------------------------
/one_million/all-word-model:
--------------------------------------------------------------------------------
1 | all-word-model:
2 |
3 | 1: basic Val: F1 Score:65.5462 Accuracy:73.1659 Model-aw-1-multigpu-1
4 |
5 | 2: cnn with pos Val: F1 Score:72.33 Accuracy:77.93 POS: F1 Score:94.84 Accuracy:97.54 Model-aw-lex-1.4
6 |
7 | 3: local attention Val: F1 Score:44.36 Accuracy:53.76 POS: F1 Score:82.20 Accuracy:90.42 (see the sketch at the end of this file)
8 | Model-aw-lex-local_attention-fast-v2-4
9 |
10 | 4: local attention with hidden states Val: F1 Score:52.19 Accuracy:58.68 POS: F1 Score:85.66 Accuracy:92.72 Model-aw-lex-local_attention-fast-v2-6
11 |
12 | 5: gated local attention Val: F1 Score:44.17 Accuracy:53.07 POS: F1 Score:84.01 Accuracy:91.94
13 | Model-aw-lex-local_attention-fast-v2-7
14 |
15 | 6: local attention with crf Val: F1 Score:50.65 Accuracy:57.15 POS: F1 Score:87.84 Accuracy:93.70
16 | Model-aw-lex-local_attention-fast-v3-1 and Model-aw-lex-local_attention-fast-v4-1
17 |
18 | 7: soft hierarchical Model-aw-lex-hierarchical-2.ipynb
19 | Val: F1 Score:74.04 Accuracy:79.38 POS: F1 Score:96.34 Accuracy:98.21 Loss:0.8093 , Time: 1240.6
20 |
21 | 8: hard hierarchical Model-aw-lex-hierarchical-2.ipynb
22 | Val: F1 Score:70.35 Accuracy:77.30 POS: F1 Score:95.56 Accuracy:97.89 Loss:0.9279 , Time: 1195.1
23 |
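24 | Rough sketch of the windowed ("local") attention idea behind entries 3-6, in plain numpy;
25 | the notebooks' actual scoring function, window size and gating may differ, so this is only
26 | illustrative:
27 | 
28 |     import numpy as np
29 | 
30 |     def local_attention(h, center, window=5):
31 |         # h: [sent_len, dim] hidden states; attend only to tokens within +/- window of center
32 |         lo, hi = max(0, center - window), min(len(h), center + window + 1)
33 |         scores = h[lo:hi] @ h[center]          # dot-product scores against the center token
34 |         weights = np.exp(scores - scores.max())
35 |         weights /= weights.sum()               # softmax over the local window
36 |         return weights @ h[lo:hi]              # context vector for the center token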
--------------------------------------------------------------------------------
/one_million/all-word/Model-aw-4-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "tf.logging.set_verbosity(tf.logging.WARN)\n",
13 | "import pickle\n",
14 | "import numpy as np\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import f1_score\n",
18 | "from sklearn.metrics import accuracy_score\n",
19 | "import os\n",
20 | "from tensorflow.python.client import device_lib\n",
21 | "from collections import Counter\n",
22 | "import time"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "metadata": {
29 | "collapsed": true
30 | },
31 | "outputs": [],
32 | "source": [
33 | "f = open('../../Glove/word_embedding_glove', 'rb')\n",
34 | "word_embedding = pickle.load(f)\n",
35 | "f.close()\n",
36 | "\n",
37 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
38 | "\n",
39 | "f = open('../../Glove/vocab_glove', 'rb')\n",
40 | "vocab = pickle.load(f)\n",
41 | "f.close()\n",
42 | "\n",
43 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
44 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
45 | "\n",
46 | "unknown_token = \"UNKNOWN_TOKEN\"\n",
47 | "\n",
48 | "# Model Description\n",
49 | "model_name = 'model-aw-4-1'\n",
50 | "model_dir = '../output/all-word/' + model_name\n",
51 | "save_dir = os.path.join(model_dir, \"save/\")\n",
52 | "log_dir = os.path.join(model_dir, \"log\")\n",
53 | "\n",
54 | "if not os.path.exists(model_dir):\n",
55 | " os.mkdir(model_dir)\n",
56 | "if not os.path.exists(save_dir):\n",
57 | " os.mkdir(save_dir)\n",
58 | "if not os.path.exists(log_dir):\n",
59 | " os.mkdir(log_dir)\n",
60 | "\n",
61 | "with open('/data/aviraj/dataset/train_val_data/all_word_sense2_sampled','rb') as f:\n",
62 | " train_data, val_data = pickle.load(f) \n",
63 | " \n",
64 | "\n",
65 | "# Parameters\n",
66 | "mode = 'train'\n",
67 | "num_senses = 272\n",
68 | "num_lex = 47\n",
69 | "num_pos = 12\n",
70 | "batch_size = 32\n",
71 | "vocab_size = len(vocab)\n",
72 | "unk_vocab_size = 1\n",
73 | "word_emb_size = len(word_embedding[0])\n",
74 | "max_sent_size = 200\n",
75 | "hidden_size = 512\n",
76 | "keep_prob = 0.4\n",
77 | "l2_lambda = 0.001\n",
78 | "init_lr = 0.01\n",
79 | "decay_steps = 5000\n",
80 | "decay_rate = 0.999\n",
81 | "clip_norm = 1\n",
82 | "clipping = True\n",
83 | "moving_avg_deacy = 0.999\n",
84 | "num_gpus = 6"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 3,
90 | "metadata": {
91 | "collapsed": true
92 | },
93 | "outputs": [],
94 | "source": [
95 | "def average_gradients(tower_grads):\n",
96 | " average_grads = []\n",
97 | " for grad_and_vars in zip(*tower_grads):\n",
98 | " # Note that each grad_and_vars looks like the following:\n",
99 | " # ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))\n",
100 | " grads = []\n",
101 | " for g, _ in grad_and_vars:\n",
102 | " # Add 0 dimension to the gradients to represent the tower.\n",
103 | " expanded_g = tf.expand_dims(g, 0)\n",
104 | "\n",
105 | " # Append on a 'tower' dimension which we will average over below.\n",
106 | " grads.append(expanded_g)\n",
107 | "\n",
108 | " # Average over the 'tower' dimension.\n",
109 | " grad = tf.concat(grads, 0)\n",
110 | " grad = tf.reduce_mean(grad, 0)\n",
111 | "\n",
112 | " # Keep in mind that the Variables are redundant because they are shared\n",
113 | " # across towers. So .. we will just return the first tower's pointer to\n",
114 | " # the Variable.\n",
115 | " v = grad_and_vars[0][1]\n",
116 | " grad_and_var = (grad, v)\n",
117 | " average_grads.append(grad_and_var)\n",
118 | " return average_grads"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "metadata": {
125 | "collapsed": true
126 | },
127 | "outputs": [],
128 | "source": [
129 | "# MODEL\n",
130 | "device_num = 0\n",
131 | "tower_grads = []\n",
132 | "losses = []\n",
133 | "predictions = []\n",
134 | "\n",
135 | "x = tf.placeholder('int32', [num_gpus, batch_size, max_sent_size], name=\"x\")\n",
136 | "y = tf.placeholder('int32', [num_gpus, batch_size, max_sent_size], name=\"y\")\n",
137 | "x_mask = tf.placeholder('bool', [num_gpus, batch_size, max_sent_size], name='x_mask') \n",
138 | "sense_mask = tf.placeholder('bool', [num_gpus, batch_size, max_sent_size], name='sense_mask')\n",
139 | "is_train = tf.placeholder('bool', [], name='is_train')\n",
140 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
141 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
142 | "\n",
143 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
144 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
145 | "summaries = []\n",
146 | "\n",
147 | "with tf.variable_scope(\"word_embedding\"):\n",
148 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
149 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
150 | "\n",
151 | "with tf.variable_scope(tf.get_variable_scope()):\n",
152 | " for gpu_idx in range(num_gpus):\n",
153 | " if gpu_idx>2:\n",
154 | " device_num = 1\n",
155 | " with tf.name_scope(\"model_{}\".format(gpu_idx)) as scope, tf.device('/gpu:%d' % device_num):\n",
156 | "\n",
157 | " if gpu_idx > 0:\n",
158 | " tf.get_variable_scope().reuse_variables()\n",
159 | "\n",
160 | " with tf.name_scope(\"word\"):\n",
161 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x[gpu_idx]) \n",
162 | "\n",
163 | " x_len = tf.reduce_sum(tf.cast(x_mask[gpu_idx], 'int32'), 1)\n",
164 | "\n",
165 | " with tf.variable_scope(\"lstm1\"):\n",
166 | " cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
167 | " cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
168 | "\n",
169 | " d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n",
170 | " d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n",
171 | "\n",
172 | " (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n",
173 | " h1 = tf.concat([fw_h1, bw_h1], 2)\n",
174 | "\n",
175 | " with tf.variable_scope(\"lstm2\"):\n",
176 | " cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
177 | " cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
178 | "\n",
179 | " d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n",
180 | " d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n",
181 | "\n",
182 | " (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n",
183 | " h = tf.concat([fw_h2, bw_h2], 2)\n",
184 | "\n",
185 | " def attention(input_x, input_mask, W_att):\n",
186 | " h_masked = tf.boolean_mask(input_x, input_mask)\n",
187 | " h_tanh = tf.tanh(h_masked)\n",
188 | " u = tf.matmul(h_tanh, W_att)\n",
189 | " a = tf.nn.softmax(u)\n",
190 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n",
191 | " return c\n",
192 | "\n",
193 | " with tf.variable_scope(\"attention\"):\n",
194 | " W_att = tf.get_variable(\"W_att\", shape=[2*hidden_size, 1], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*10))\n",
195 | " c = tf.expand_dims(attention(h[0], x_mask[gpu_idx][0], W_att), 0)\n",
196 | " for i in range(1, batch_size):\n",
197 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[gpu_idx][i], W_att), 0)], 0)\n",
198 | " \n",
199 | " cc = tf.expand_dims(c, 1)\n",
200 | " c_final = tf.tile(cc, [1, max_sent_size, 1])\n",
201 | " h_final = tf.concat([c_final, h],2)\n",
202 | " flat_h_final = tf.reshape(h_final, [-1, 4*hidden_size])\n",
203 | " \n",
204 | " with tf.variable_scope(\"hidden_layer\"):\n",
205 | " W = tf.get_variable(\"W\", shape=[4*hidden_size, 2*hidden_size], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*20))\n",
206 | " b = tf.get_variable(\"b\", shape=[2*hidden_size], initializer=tf.zeros_initializer())\n",
207 | " drop_flat_h_final = tf.nn.dropout(flat_h_final, input_keep_prob)\n",
208 | " flat_hl = tf.matmul(drop_flat_h_final, W) + b\n",
209 | " \n",
210 | " with tf.variable_scope(\"softmax_layer\"):\n",
211 | " W = tf.get_variable(\"W\", shape=[2*hidden_size, num_senses], initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1, seed=gpu_idx*20))\n",
212 | " b = tf.get_variable(\"b\", shape=[num_senses], initializer=tf.zeros_initializer())\n",
213 | " drop_flat_hl = tf.nn.dropout(flat_hl, input_keep_prob)\n",
214 | " flat_logits_sense = tf.matmul(drop_flat_hl, W) + b\n",
215 | " logits = tf.reshape(flat_logits_sense, [batch_size, max_sent_size, num_senses])\n",
216 | " predictions.append(tf.arg_max(logits, 2))\n",
217 | "\n",
218 | " float_sense_mask = tf.cast(sense_mask[gpu_idx], 'float')\n",
219 | "\n",
220 | " loss = tf.contrib.seq2seq.sequence_loss(logits, y[gpu_idx], float_sense_mask, name=\"loss\")\n",
221 | "\n",
222 | " l2_loss = l2_lambda * tf.losses.get_regularization_loss()\n",
223 | "\n",
224 | " total_loss = loss + l2_loss\n",
225 | "\n",
226 | " summaries.append(tf.summary.scalar(\"loss_{}\".format(gpu_idx), loss))\n",
227 | " summaries.append(tf.summary.scalar(\"total_loss_{}\".format(gpu_idx), total_loss))\n",
228 | "\n",
229 | "\n",
230 | " optimizer = tf.train.AdamOptimizer(learning_rate)\n",
231 | " grads_vars = optimizer.compute_gradients(total_loss)\n",
232 | "\n",
233 | " clipped_grads = grads_vars\n",
234 | " if(clipping == True):\n",
235 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
236 | "\n",
237 | " tower_grads.append(clipped_grads)\n",
238 | " losses.append(total_loss)\n",
239 | "\n",
240 | "tower_grads = average_gradients(tower_grads)\n",
241 | "losses = tf.add_n(losses)/len(losses)\n",
242 | "apply_grad_op = optimizer.apply_gradients(tower_grads, global_step=global_step)\n",
243 | "summaries.append(tf.summary.scalar('total_loss', losses))\n",
244 | "summaries.append(tf.summary.scalar('learning_rate', learning_rate))\n",
245 | "\n",
246 | "for var in tf.trainable_variables():\n",
247 | " summaries.append(tf.summary.histogram(var.op.name, var))\n",
248 | "\n",
249 | "variable_averages = tf.train.ExponentialMovingAverage(moving_avg_deacy, global_step)\n",
250 | "variables_averages_op = variable_averages.apply(tf.trainable_variables())\n",
251 | "\n",
252 | "train_op = tf.group(apply_grad_op, variables_averages_op)\n",
253 | "saver = tf.train.Saver(tf.global_variables())\n",
254 | "summary = tf.summary.merge(summaries)"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 5,
260 | "metadata": {
261 | "collapsed": true
262 | },
263 | "outputs": [],
264 | "source": [
265 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n",
266 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0,1\"\n",
267 | "# print (device_lib.list_local_devices())\n",
268 | "config = tf.ConfigProto()\n",
269 | "config.gpu_options.allow_growth = True\n",
270 | "config.allow_soft_placement = True\n",
271 | "sess = tf.Session(config=config)\n",
272 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n",
273 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 6,
279 | "metadata": {
280 | "collapsed": true
281 | },
282 | "outputs": [],
283 | "source": [
284 | "def model(xx, yy, mask, smask, train_cond=True):\n",
285 | " num_batches = int(len(xx)/(batch_size*num_gpus))\n",
286 | " _losses = 0\n",
287 | " temp_loss = 0\n",
288 | " preds_sense = []\n",
289 | " true_sense = []\n",
290 | " \n",
291 | " for j in range(num_batches): \n",
292 | " \n",
293 | " s = j * batch_size * num_gpus\n",
294 | " e = (j+1) * batch_size * num_gpus\n",
295 | " xx_re = xx[s:e].reshape([num_gpus, batch_size, -1])\n",
296 | " yy_re = yy[s:e].reshape([num_gpus, batch_size, -1])\n",
297 | " mask_re = mask[s:e].reshape([num_gpus, batch_size, -1])\n",
298 | " smask_re = smask[s:e].reshape([num_gpus, batch_size, -1])\n",
299 | " \n",
300 | " feed_dict = {x:xx_re, y:yy_re, x_mask:mask_re, sense_mask:smask_re, is_train:train_cond, input_keep_prob:keep_prob, word_emb_mat:word_embedding}\n",
301 | " \n",
302 | " if(train_cond==True):\n",
303 | " _, _loss, step, _summary = sess.run([train_op, losses, global_step, summary], feed_dict)\n",
304 | " summary_writer.add_summary(_summary, step)\n",
305 | " \n",
306 | " temp_loss += _loss\n",
307 | " if((j+1)%1000==0):\n",
308 | " print(\"Steps: {}\".format(step), \"Loss:{0:.4f}\".format(temp_loss/1000), \", Current Loss: {0:.4f}\".format(_loss))\n",
309 | " temp_loss = 0\n",
310 | " if((j+1)%5000==0):\n",
311 | " saver.save(sess, save_path=save_dir) \n",
312 | " \n",
313 | " else:\n",
314 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
315 | " for i in range(num_gpus):\n",
316 | " preds_sense.append(pred[i][smask_re[i]])\n",
317 | " true_sense.append(yy_re[i][smask_re[i]])\n",
318 | "\n",
319 | " _losses +=_loss\n",
320 | "\n",
321 | " if(train_cond==False): \n",
322 | " sense_preds = []\n",
323 | " sense_true = []\n",
324 | " \n",
325 | " for preds in preds_sense:\n",
326 | " for ps in preds: \n",
327 | " sense_preds.append(ps) \n",
328 | " for trues in true_sense:\n",
329 | " for ts in trues:\n",
330 | " sense_true.append(ts)\n",
331 | " \n",
332 | " return _losses/num_batches, sense_preds, sense_true\n",
333 | "\n",
334 | " return _losses/num_batches, step\n",
335 | "\n",
336 | "def eval_score(yy, pred):\n",
337 | " f1 = f1_score(yy, pred, average='macro')\n",
338 | " accu = accuracy_score(yy, pred)\n",
339 | " return f1*100, accu*100"
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": null,
345 | "metadata": {
346 | "collapsed": true
347 | },
348 | "outputs": [],
349 | "source": [
350 | "x_id_train = train_data['x']\n",
351 | "mask_train = train_data['x_mask']\n",
352 | "sense_mask_train = train_data['sense_mask']\n",
353 | "y_train = train_data['y']\n",
354 | "\n",
355 | "x_id_val = val_data['x']\n",
356 | "mask_val = val_data['x_mask']\n",
357 | "sense_mask_val = val_data['sense_mask']\n",
358 | "y_val = val_data['y']"
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": null,
364 | "metadata": {
365 | "scrolled": true
366 | },
367 | "outputs": [
368 | {
369 | "name": "stdout",
370 | "output_type": "stream",
371 | "text": [
372 | "Steps: 1000 Loss:15.7534 , Current Loss: 4.6380\n",
373 | "Steps: 2000 Loss:4.6967 , Current Loss: 4.6226\n",
374 | "Steps: 3000 Loss:4.7022 , Current Loss: 4.5907\n",
375 | "Steps: 4000 Loss:4.7315 , Current Loss: 4.6306\n",
376 | "Steps: 5000 Loss:4.8571 , Current Loss: 4.8387\n"
377 | ]
378 | }
379 | ],
380 | "source": [
381 | "num_epochs = 5\n",
382 | "log_period = 1\n",
383 | "\n",
384 | "for i in range(num_epochs):\n",
385 | " random = np.random.choice(len(y_train), size=(len(y_train)), replace=False)\n",
386 | " x_id_train = x_id_train[random]\n",
387 | " y_train = y_train[random]\n",
388 | " mask_train = mask_train[random] \n",
389 | " sense_mask_train = sense_mask_train[random]\n",
390 | " \n",
391 | " start_time = time.time()\n",
392 | " train_loss, step = model(x_id_train, y_train, mask_train, sense_mask_train)\n",
393 | " time_taken = time.time() - start_time\n",
394 | " print(\"Epoch: {}\".format(i+1),\", Step: {}\".format(step), \", loss: {0:.4f}\".format(train_loss), \", Time: {0:.1f}\".format(time_taken))\n",
395 | " saver.save(sess, save_path=save_dir) \n",
396 | " print(\"Model Saved\")\n",
397 | " \n",
398 | " if((i+1)%log_period==0):\n",
399 | " start_time = time.time()\n",
400 | " val_loss, val_pred, val_true = model(x_id_val, y_val, mask_val, sense_mask_val, train_cond=False) \n",
401 | " f1_, accu_ = eval_score(val_true, val_pred)\n",
402 | " time_taken = time.time() - start_time\n",
403 | " print(\"Val: F1 Score:{0:.2f}\".format(f1_), \"Accuracy:{0:.2f}\".format(accu_), \"Loss:{0:.4f}\".format(val_loss), \", Time: {0:.1f}\".format(time_taken))"
404 | ]
405 | },
406 | {
407 | "cell_type": "code",
408 | "execution_count": 1,
409 | "metadata": {},
410 | "outputs": [],
411 | "source": [
412 | "start_time = time.time()\n",
413 | "train_loss, train_pred, train_true = model(x_id_train, y_train, mask_train, sense_mask_train, train_cond=False) \n",
414 | "f1_, accu_ = etrain_score(train_true, train_pred)\n",
415 | "time_taken = time.time() - start_time\n",
416 | "print(\"train: F1 Score:{0:.2f}\".format(f1_), \"Accuracy:{0:.2f}\".format(accu_), \"Loss:{0:.4f}\".format(train_loss), \", Time: {0:.1f}\".format(time_taken))"
417 | ]
418 | },
419 | {
420 | "cell_type": "code",
421 | "execution_count": null,
422 | "metadata": {
423 | "collapsed": true
424 | },
425 | "outputs": [],
426 | "source": []
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 8,
431 | "metadata": {
432 | "collapsed": true
433 | },
434 | "outputs": [],
435 | "source": [
436 | "saver.restore(sess, save_dir)"
437 | ]
438 | }
439 | ],
440 | "metadata": {
441 | "kernelspec": {
442 | "display_name": "cs771",
443 | "language": "python",
444 | "name": "cs771"
445 | },
446 | "language_info": {
447 | "codemirror_mode": {
448 | "name": "ipython",
449 | "version": 3
450 | },
451 | "file_extension": ".py",
452 | "mimetype": "text/x-python",
453 | "name": "python",
454 | "nbconvert_exporter": "python",
455 | "pygments_lexer": "ipython3",
456 | "version": "3.5.2"
457 | }
458 | },
459 | "nbformat": 4,
460 | "nbformat_minor": 2
461 | }
462 |
--------------------------------------------------------------------------------
/one_million/all-word/Readme.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Model-aw-lex-1
4 | Convolution over hidden states of lstms
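5 | 
6 | A minimal sketch of the idea, assuming TF 1.x and BiLSTM outputs of shape
7 | `[batch, max_sent_size, 2*hidden_size]`; the filter count and kernel width below are
8 | illustrative, not the notebook's actual hyper-parameters:
9 | 
10 | ```python
11 | import tensorflow as tf
12 | 
13 | max_sent_size, lstm_dim = 200, 2 * 512   # sizes assumed from the all-word notebooks
14 | h = tf.placeholder('float', [None, max_sent_size, lstm_dim], name='bilstm_states')
15 | 
16 | # 1-D convolution over the sequence of hidden states, one feature vector per token.
17 | conv = tf.layers.conv1d(h, filters=256, kernel_size=3, padding='same',
18 |                         activation=tf.nn.relu, name='conv_over_states')
19 | # conv: [batch, max_sent_size, 256]; these per-token features feed the sense softmax.
20 | ```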
--------------------------------------------------------------------------------
/one_million/make/Make-Model-1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "tf.logging.set_verbosity(tf.logging.WARN)\n",
13 | "import pickle\n",
14 | "import numpy as np\n",
15 | "import os\n",
16 | "from sklearn.model_selection import train_test_split\n",
17 | "from sklearn.metrics import f1_score\n",
18 | "from sklearn.metrics import accuracy_score\n",
19 | "import os\n",
20 | "from tensorflow.python.client import device_lib\n",
21 | "from collections import Counter"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {
28 | "collapsed": true
29 | },
30 | "outputs": [],
31 | "source": [
32 | "f = open('../../Glove/word_embedding_glove', 'rb')\n",
33 | "word_embedding = pickle.load(f)\n",
34 | "f.close()\n",
35 | "word_embedding = word_embedding[: len(word_embedding)-1]\n",
36 | "\n",
37 | "f = open('../../Glove/vocab_glove', 'rb')\n",
38 | "vocab = pickle.load(f)\n",
39 | "f.close()\n",
40 | "\n",
41 | "word2id = dict((w, i) for i,w in enumerate(vocab))\n",
42 | "id2word = dict((i, w) for i,w in enumerate(vocab))\n",
43 | "\n",
44 | "unknown_token = \"UNKNOWN_TOKEN\""
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {
51 | "collapsed": true
52 | },
53 | "outputs": [],
54 | "source": [
55 | "f = open(\"../../../dataset/sense/dict_sense-keys\", 'rb')\n",
56 | "dict_sense_keys = pickle.load(f)\n",
57 | "f.close()\n",
58 | "\n",
59 | "f = open(\"../../../dataset/sense/dict_word-sense\", 'rb')\n",
60 | "dict_word_sense = pickle.load(f)\n",
61 | "f.close()"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "# Model Description\n",
73 | "sense_word = 'make'\n",
74 | "model_name = 'model-1'\n",
75 | "sense_word_dir = '../output/' + sense_word\n",
76 | "model_dir = sense_word_dir + '/' + model_name\n",
77 | "save_dir = os.path.join(model_dir, \"save/\")\n",
78 | "log_dir = os.path.join(model_dir, \"log\")\n",
79 | "\n",
80 | "if not os.path.exists(sense_word_dir):\n",
81 | " os.mkdir(sense_word_dir)\n",
82 | "if not os.path.exists(model_dir):\n",
83 | " os.mkdir(model_dir)\n",
84 | "if not os.path.exists(save_dir):\n",
85 | " os.mkdir(save_dir)\n",
86 | "if not os.path.exists(log_dir):\n",
87 | " os.mkdir(log_dir)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 8,
93 | "metadata": {
94 | "scrolled": false
95 | },
96 | "outputs": [
97 | {
98 | "data": {
99 | "text/plain": [
100 | "[('36', 2006),\n",
101 | " ('30', 1025),\n",
102 | " ('42', 968),\n",
103 | " ('41', 962),\n",
104 | " ('31', 617),\n",
105 | " ('32', 543),\n",
106 | " ('38', 445),\n",
107 | " ('40', 20),\n",
108 | " ('29', 6),\n",
109 | " ('09', 1)]"
110 | ]
111 | },
112 | "execution_count": 8,
113 | "metadata": {},
114 | "output_type": "execute_result"
115 | }
116 | ],
117 | "source": [
118 | "f = open(\"../../../dataset/checkwords/\"+ sense_word + \"_data\", 'rb')\n",
119 | "data = pickle.load(f)\n",
120 | "f.close()\n",
121 | "\n",
122 | "data_y = []\n",
123 | "for i in range(len(data)):\n",
124 | " data_y.append(dict_sense_keys[data[i][0]][2])\n",
125 | "\n",
126 | "sense_count = Counter(data_y)\n",
127 | "sense_count = sense_count.most_common()\n",
128 | "vocab_sense = [k for k,v in sense_count[:7]]\n",
129 | "sense_count"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 9,
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "data": {
139 | "text/plain": [
140 | "['36', '30', '42', '41', '31', '32', '38']"
141 | ]
142 | },
143 | "execution_count": 9,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "vocab_sense"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 10,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "6566 6593\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "data_x = []\n",
167 | "data_label = []\n",
168 | "for i in range(len(data)):\n",
169 | " if dict_sense_keys[data[i][0]][2] in vocab_sense:\n",
170 | " data_x.append(data[i][1])\n",
171 | " data_label.append(dict_sense_keys[data[i][0]][2])\n",
172 | "\n",
173 | "print(len(data_label), len(data_y))\n",
174 | "\n",
175 | "# vocab_sense = dict_word_sense[sense_word]\n",
176 | "\n",
177 | "sense2id = dict((s, i) for i,s in enumerate(vocab_sense))\n",
178 | "id2sense = dict((i, s) for i,s in enumerate(vocab))"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 11,
184 | "metadata": {
185 | "collapsed": true
186 | },
187 | "outputs": [],
188 | "source": [
189 | "# Parameters\n",
190 | "mode = 'train'\n",
191 | "num_senses = len(vocab_sense)\n",
192 | "batch_size = 64\n",
193 | "vocab_size = len(vocab)\n",
194 | "unk_vocab_size = 1\n",
195 | "word_emb_size = len(word_embedding[0])\n",
196 | "max_sent_size = 300\n",
197 | "hidden_size = 100\n",
198 | "keep_prob = 0.5\n",
199 | "l2_lambda = 0.001\n",
200 | "init_lr = 0.01\n",
201 | "decay_steps = 500\n",
202 | "decay_rate = 0.96\n",
203 | "clip_norm = 1\n",
204 | "clipping = True"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 12,
210 | "metadata": {
211 | "collapsed": true,
212 | "scrolled": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "# MODEL\n",
217 | "x = tf.placeholder('int32', [batch_size, max_sent_size], name=\"x\")\n",
218 | "y = tf.placeholder('int32', [batch_size], name=\"y\")\n",
219 | "x_mask = tf.placeholder('bool', [batch_size, max_sent_size], name='x_mask') \n",
220 | "is_train = tf.placeholder('bool', [], name='is_train')\n",
221 | "word_emb_mat = tf.placeholder('float', [None, word_emb_size], name='emb_mat')\n",
222 | "input_keep_prob = tf.cond(is_train,lambda:keep_prob, lambda:tf.constant(1.0))\n",
223 | "x_len = tf.reduce_sum(tf.cast(x_mask, 'int32'), 1)\n",
224 | "\n",
225 | "with tf.name_scope(\"word_embedding\"):\n",
226 | " if mode == 'train':\n",
227 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", dtype='float', shape=[unk_vocab_size, word_emb_size], initializer=tf.contrib.layers.xavier_initializer(uniform=True, seed=0, dtype=tf.float32))\n",
228 | " else:\n",
229 | " unk_word_emb_mat = tf.get_variable(\"word_emb_mat\", shape=[unk_vocab_size, word_emb_size], dtype='float')\n",
230 | " \n",
231 | " final_word_emb_mat = tf.concat([word_emb_mat, unk_word_emb_mat], 0)\n",
232 | " Wx = tf.nn.embedding_lookup(final_word_emb_mat, x) \n",
233 | "\n",
234 | "with tf.variable_scope(\"lstm1\"):\n",
235 | " cell_fw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
236 | " cell_bw1 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
237 | "\n",
238 | " d_cell_fw1 = tf.contrib.rnn.DropoutWrapper(cell_fw1, input_keep_prob=input_keep_prob)\n",
239 | " d_cell_bw1 = tf.contrib.rnn.DropoutWrapper(cell_bw1, input_keep_prob=input_keep_prob)\n",
240 | " \n",
241 | " (fw_h1, bw_h1), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw1, d_cell_bw1, Wx, sequence_length=x_len, dtype='float', scope='lstm1')\n",
242 | " h1 = tf.concat([fw_h1, bw_h1], 2)\n",
243 | " \n",
244 | "with tf.variable_scope(\"lstm2\"):\n",
245 | " cell_fw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
246 | " cell_bw2 = tf.contrib.rnn.BasicLSTMCell(hidden_size,state_is_tuple=True)\n",
247 | "\n",
248 | " d_cell_fw2 = tf.contrib.rnn.DropoutWrapper(cell_fw2, input_keep_prob=input_keep_prob)\n",
249 | " d_cell_bw2 = tf.contrib.rnn.DropoutWrapper(cell_bw2, input_keep_prob=input_keep_prob)\n",
250 | " \n",
251 | " (fw_h2, bw_h2), _ = tf.nn.bidirectional_dynamic_rnn(d_cell_fw2, d_cell_bw2, h1, sequence_length=x_len, dtype='float', scope='lstm2')\n",
252 | " h = tf.concat([fw_h2, bw_h2], 2)\n",
253 | "\n",
254 | "def attention(input_x, input_mask, W_att):\n",
255 | " h_masked = tf.boolean_mask(input_x, input_mask)\n",
256 | " h_tanh = tf.tanh(h_masked)\n",
257 | " u = tf.matmul(h_tanh, W_att)\n",
258 | " a = tf.nn.softmax(u)\n",
259 | " c = tf.reduce_sum(tf.multiply(h_tanh, a), 0) \n",
260 | " return c\n",
261 | "\n",
262 | "with tf.variable_scope(\"attention\"):\n",
263 | " W_att = tf.Variable(tf.truncated_normal([2*hidden_size, 1], mean=0.0, stddev=0.1, seed=0), name=\"W_att\")\n",
264 | " c = tf.expand_dims(attention(h[0], x_mask[0], W_att), 0)\n",
265 | " for i in range(1, batch_size):\n",
266 | " c = tf.concat([c, tf.expand_dims(attention(h[i], x_mask[i], W_att), 0)], 0)\n",
267 | " \n",
268 | "with tf.variable_scope(\"softmax_layer\"):\n",
269 | " W = tf.Variable(tf.truncated_normal([2*hidden_size, num_senses], mean=0.0, stddev=0.1, seed=0), name=\"W\")\n",
270 | " b = tf.Variable(tf.zeros([num_senses]), name=\"b\")\n",
271 | " drop_c = tf.nn.dropout(c, input_keep_prob)\n",
272 | " logits = tf.matmul(drop_c, W) + b\n",
273 | " predictions = tf.argmax(logits, 1)\n",
274 | " \n",
275 | "loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))\n",
276 | "\n",
277 | "global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n",
278 | "\n",
279 | "learning_rate = tf.train.exponential_decay(init_lr, global_step, decay_steps, decay_rate, staircase=True)\n",
280 | "\n",
281 | "tv_all = tf.trainable_variables()\n",
282 | "tv_regu =[]\n",
283 | "for t in tv_all:\n",
284 | " if t.name.find('b:')==-1:\n",
285 | " tv_regu.append(t)\n",
286 | " \n",
287 | "# l2 Loss\n",
288 | "l2_loss = l2_lambda * tf.reduce_sum([ tf.nn.l2_loss(v) for v in tv_regu ])\n",
289 | "\n",
290 | "total_loss = loss + l2_loss\n",
291 | "\n",
292 | "# Optimizer for loss\n",
293 | "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
294 | "\n",
295 | "# Gradients and Variables for Loss\n",
296 | "grads_vars = optimizer.compute_gradients(total_loss)\n",
297 | "\n",
298 | "# Clipping of Gradients\n",
299 | "clipped_grads = grads_vars\n",
300 | "if(clipping == True):\n",
301 | " clipped_grads = [(tf.clip_by_norm(grad, clip_norm), var) for grad, var in clipped_grads]\n",
302 | "\n",
303 | "# Training Optimizer for Total Loss\n",
304 | "train_op = optimizer.apply_gradients(clipped_grads, global_step=global_step)\n",
305 | "\n",
306 | "# Summaries\n",
307 | "var_summaries = []\n",
308 | "for v in tv_all:\n",
309 | " var_summary = tf.summary.histogram(\"{}/var\".format(v.name), v)\n",
310 | " var_summaries.append(var_summary)\n",
311 | "\n",
312 | "var_summaries_merged = tf.summary.merge(var_summaries)\n",
313 | "\n",
314 | "loss_summary = tf.summary.scalar(\"loss\", loss)\n",
315 | "total_loss_summary = tf.summary.scalar(\"total_loss\", total_loss)\n",
316 | "summary = tf.summary.merge_all()"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 13,
322 | "metadata": {
323 | "collapsed": true
324 | },
325 | "outputs": [],
326 | "source": [
327 | "os.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\" # see issue #152\n",
328 | "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"\n",
329 | "config = tf.ConfigProto()\n",
330 | "config.gpu_options.allow_growth = True\n",
331 | "sess = tf.Session(config=config)\n",
332 | "sess.run(tf.global_variables_initializer()) # For initializing all the variables\n",
333 | "saver = tf.train.Saver() # For Saving the model\n",
334 | "summary_writer = tf.summary.FileWriter(log_dir, sess.graph) # For writing Summaries"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 14,
340 | "metadata": {
341 | "collapsed": true
342 | },
343 | "outputs": [],
344 | "source": [
345 | "index = []\n",
346 | "for i in range(len(data_x)):\n",
347 | " index.append(i)"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 15,
353 | "metadata": {
354 | "scrolled": true
355 | },
356 | "outputs": [
357 | {
358 | "name": "stderr",
359 | "output_type": "stream",
360 | "text": [
361 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/model_selection/_split.py:2026: FutureWarning: From version 0.21, test_size will always complement train_size unless both are specified.\n",
362 | " FutureWarning)\n"
363 | ]
364 | }
365 | ],
366 | "source": [
367 | "index_train, index_val, label_train, label_val = train_test_split(index, data_label, train_size=0.8, shuffle=True, stratify=data_label, random_state=0)\n",
368 | "\n",
369 | "data_x = np.array(data_x)\n",
370 | "\n",
371 | "x_train = data_x[index_train]\n",
372 | "x_val = data_x[index_val]"
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": 16,
378 | "metadata": {
379 | "collapsed": true
380 | },
381 | "outputs": [],
382 | "source": [
383 | "def data_prepare(x, y):\n",
384 | " num_examples = len(x)\n",
385 | "\n",
386 | " xx = np.zeros([num_examples, max_sent_size], dtype=int)\n",
387 | " xx_mask = np.zeros([num_examples, max_sent_size], dtype=bool)\n",
388 | " yy = np.zeros([num_examples], dtype=int)\n",
389 | "\n",
390 | " for j in range(num_examples):\n",
391 | " for i in range(max_sent_size):\n",
392 | "            if i >= len(x[j]):\n",
393 | " break\n",
394 | " w = x[j][i]\n",
395 | " xx[j][i] = word2id[w] if w in word2id else word2id['UNKNOWN_TOKEN']\n",
396 | " xx_mask[j][i] = True\n",
397 | " yy[j] = sense2id[y[j]]\n",
398 | " return xx, xx_mask, yy\n",
399 | "\n",
400 | "def eval_score(yy, pred):\n",
401 | " num_batches = int(len(yy)/batch_size)\n",
402 | " f1 = f1_score(yy[:batch_size*num_batches], pred, average='macro')\n",
403 | " accu = accuracy_score(yy[:batch_size*num_batches], pred)\n",
404 | " return f1*100, accu*100\n",
405 | "\n",
406 | "def model(xx, yy, mask, train_cond=True):\n",
407 | " num_batches = int(len(xx)/batch_size)\n",
408 | " losses = 0\n",
409 | " preds = []\n",
410 | " for j in range(num_batches): \n",
411 | " \n",
412 | " s = j * batch_size\n",
413 | " e = (j+1) * batch_size\n",
414 | " \n",
415 | "        feed_dict = {x:xx[s:e], y:yy[s:e], x_mask:mask[s:e], is_train:train_cond, input_keep_prob:(keep_prob if train_cond else 1.0), word_emb_mat:word_embedding}  # keep dropout on only while training\n",
416 | " \n",
417 | " \n",
418 | "        if train_cond:\n",
419 | " _, _loss, step, _summary = sess.run([train_op, total_loss, global_step, summary], feed_dict)\n",
420 | " summary_writer.add_summary(_summary, step) \n",
421 | "# print(\"Steps:{}\".format(step), \", Loss: {}\".format(_loss))\n",
422 | "\n",
423 | " else:\n",
424 | " _loss, pred = sess.run([total_loss, predictions], feed_dict)\n",
425 | " preds.append(pred)\n",
426 | " \n",
427 | " losses +=_loss\n",
428 | "\n",
429 | "    if not train_cond:\n",
430 | " y_pred = []\n",
431 | " for i in range(num_batches):\n",
432 | " for pred in preds[i]:\n",
433 | " y_pred.append(pred)\n",
434 | " return losses/num_batches, y_pred\n",
435 | " \n",
436 | " return losses/num_batches, step"
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": null,
442 | "metadata": {
443 | "collapsed": true
444 | },
445 | "outputs": [],
446 | "source": [
447 | "x_id_train, mask_train, y_train = data_prepare(x_train, label_train)\n",
448 | "x_id_val, mask_val, y_val = data_prepare(x_val, label_val)"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": null,
454 | "metadata": {
455 | "scrolled": true
456 | },
457 | "outputs": [
458 | {
459 | "name": "stdout",
460 | "output_type": "stream",
461 | "text": [
462 | "Epoch: 1 Step: 82 loss: 7.29599668631\n",
463 | "Epoch: 2 Step: 164 loss: 2.07766101418\n",
464 | "Epoch: 3 Step: 246 loss: 1.99490781528\n",
465 | "Epoch: 4 Step: 328 loss: 1.97611695673\n",
466 | "Epoch: 5 Step: 410 loss: 1.97086549387\n",
467 | "Model Saved\n"
468 | ]
469 | },
470 | {
471 | "name": "stderr",
472 | "output_type": "stream",
473 | "text": [
474 | "/users/btech/aviraj/cs771/lib/python3.5/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 in labels with no predicted samples.\n",
475 | " 'precision', 'predicted', average, warn_for)\n"
476 | ]
477 | },
478 | {
479 | "name": "stdout",
480 | "output_type": "stream",
481 | "text": [
482 | "Train: F1 Score: 6.69154280711 Accuracy: 30.5830792683 Loss: 1.95732803025\n",
483 | "Val: F1 Score: 6.72469704728 Accuracy: 30.78125 Loss: 1.95580910444\n",
484 | "Epoch: 6 Step: 492 loss: 1.98001657899\n"
485 | ]
486 | }
487 | ],
488 | "source": [
489 | "num_epochs = 60\n",
490 | "log_period = 5\n",
491 | "\n",
492 | "for i in range(num_epochs):\n",
493 | "    perm = np.random.permutation(len(y_train))  # reshuffle the training set each epoch\n",
494 | "    x_id_train = x_id_train[perm]\n",
495 | "    y_train = y_train[perm]\n",
496 | "    mask_train = mask_train[perm]\n",
497 | " \n",
498 | " losses, step = model(x_id_train, y_train, mask_train)\n",
499 | " print(\"Epoch:\", i+1,\"Step:\", step, \"loss:\",losses)\n",
500 | " \n",
501 | " if((i+1)%log_period==0):\n",
502 | " saver.save(sess, save_path=save_dir) \n",
503 | " print(\"Model Saved\")\n",
504 | " train_loss, train_pred = model(x_id_train, y_train, mask_train, train_cond=False)\n",
505 | " f1_, accu_ = eval_score(y_train, train_pred)\n",
506 | " print(\"Train: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", train_loss)\n",
507 | " val_loss, val_pred = model(x_id_val, y_val, mask_val, train_cond=False)\n",
508 | " f1_, accu_ = eval_score(y_val, val_pred)\n",
509 | " print(\"Val: F1 Score: \", f1_, \"Accuracy: \", accu_, \"Loss: \", val_loss)\n",
510 | " \n",
511 | "# test_loss, test_pred, test_pred_pos, test_true_pos = model(x_id_test, y_test, mask_test, pos_id_test, train_cond=False) \n",
512 | "# f1_, accu_, f1_pos_, accu_pos_ = etest_score(y_test, test_pred, test_pred_pos, test_true_pos)\n",
513 | "# print(\"test: F1 Score: \", f1_, \"Accuracy: \", accu_, \"POS F1 Score: \", f1_pos_, \"POS Accuracy: \", accu_pos_, \"Loss: \", test_loss)"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": null,
519 | "metadata": {
520 | "collapsed": true
521 | },
522 | "outputs": [],
523 | "source": []
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": null,
528 | "metadata": {
529 | "collapsed": true
530 | },
531 | "outputs": [],
532 | "source": [
533 | "saver.restore(sess, save_dir)"
534 | ]
535 | }
536 | ],
537 | "metadata": {
538 | "kernelspec": {
539 | "display_name": "cs771",
540 | "language": "python",
541 | "name": "cs771"
542 | },
543 | "language_info": {
544 | "codemirror_mode": {
545 | "name": "ipython",
546 | "version": 3
547 | },
548 | "file_extension": ".py",
549 | "mimetype": "text/x-python",
550 | "name": "python",
551 | "nbconvert_exporter": "python",
552 | "pygments_lexer": "ipython3",
553 | "version": "3.5.2"
554 | }
555 | },
556 | "nbformat": 4,
557 | "nbformat_minor": 2
558 | }
559 |
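Note on the graph above: the attention block builds the per-example context vectors with a Python loop over batch_size. A minimal sketch, assuming the same h [batch, time, 2*hidden_size], x_mask [batch, time] and W_att [2*hidden_size, 1] tensors defined above, of how the same masked attention could be computed for the whole batch in one shot (an illustration only, not the notebook's own code; batched_attention is a hypothetical name):

    import tensorflow as tf

    def batched_attention(h, x_mask, W_att):
        # h: [B, T, 2H] BiLSTM outputs, x_mask: [B, T] bool, W_att: [2H, 1]
        h_tanh = tf.tanh(h)
        scores = tf.squeeze(tf.tensordot(h_tanh, W_att, axes=[[2], [0]]), -1)   # [B, T]
        scores = tf.where(x_mask, scores, -1e30 * tf.ones_like(scores))         # mask out padding
        a = tf.nn.softmax(scores)                                               # weights over time steps
        return tf.reduce_sum(h_tanh * tf.expand_dims(a, -1), axis=1)            # [B, 2H] context vectors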
--------------------------------------------------------------------------------
/one_million/one_million_parsing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import xml.etree.ElementTree as ET\n",
12 | "import numpy as np\n",
13 | "tree = ET.parse('semcor+omsti.data.xml')\n",
14 | "root = tree.getroot()"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "cor1 = root[0]\n",
26 | "cor2 = root[1]\n",
27 | "#sent = cor2.findall('text')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 3,
33 | "metadata": {},
34 | "outputs": [],
35 | "source": [
36 | "def isalphabet(word):\n",
37 | " list_ = list(word)\n",
38 | " if len(list_) > 1:\n",
39 | " return True\n",
40 | " else:\n",
41 | " if word.isalpha():\n",
42 | " return True\n",
43 | " return False"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 50,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "37176\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "train1 = []\n",
61 | "#soup=soup.find_all('corpus')[1]\n",
62 | "count=0\n",
63 | "for sentences in cor1.findall('text'):\n",
64 | " for sentence in sentences:\n",
65 | " temp_sent = []\n",
66 | " temp_sent.append(sentence.get('id'))\n",
67 | "\n",
68 | " temp_words = []\n",
69 | " i_cnt=1\n",
70 | " ind=[]\n",
71 | " for word in sentence:\n",
72 | " string = word.text.lower() \n",
73 | " if (isalphabet(string)):\n",
74 | " temp_words.append(string)\n",
75 | " ind.append(i_cnt)\n",
76 | " i_cnt+=1\n",
77 | "\n",
78 | " temp_sent.append(temp_words)\n",
79 | " list_ = sentence.iter()\n",
80 | " id_list = []\n",
81 | " lemma_list = []\n",
82 | " pos_list = []\n",
83 | " for i in list_:\n",
84 | " id_list.append(i.get('id'))\n",
85 | " lemma_list.append(i.get('lemma'))\n",
86 | " pos_list.append(i.get('pos'))\n",
87 | "\n",
88 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
89 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
90 | " train1.append(temp_sent)\n",
91 | " count+=1\n",
92 | " \n",
93 | "print(count)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 51,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": [
102 | "train1=train1[:len(train1)-2]"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 19,
108 | "metadata": {},
109 | "outputs": [
110 | {
111 | "name": "stdout",
112 | "output_type": "stream",
113 | "text": [
114 | "813798\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "train2 = []\n",
120 | "#soup=soup.find_all('corpus')[1]\n",
121 | "count=0\n",
122 | "for sentences in cor2.findall('text'):\n",
123 | " for sentence in sentences:\n",
124 | " temp_sent = []\n",
125 | " temp_sent.append(sentence.get('id'))\n",
126 | "\n",
127 | " temp_words = []\n",
128 | " i_cnt=1\n",
129 | " ind=[]\n",
130 | " for word in sentence:\n",
131 | " string = word.text.lower() \n",
132 | " if (isalphabet(string)):\n",
133 | " temp_words.append(string)\n",
134 | " ind.append(i_cnt)\n",
135 | " i_cnt+=1\n",
136 | "\n",
137 | " temp_sent.append(temp_words)\n",
138 | " list_ = sentence.iter()\n",
139 | " id_list = []\n",
140 | " lemma_list = []\n",
141 | " pos_list = []\n",
142 | " for i in list_:\n",
143 | " id_list.append(i.get('id'))\n",
144 | " lemma_list.append(i.get('lemma'))\n",
145 | " pos_list.append(i.get('pos'))\n",
146 | "\n",
147 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
148 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
149 | " train2.append(temp_sent)\n",
150 | " count+=1\n",
151 | " \n",
152 | "print(count)"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 54,
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "data": {
162 | "text/plain": [
163 | "850972"
164 | ]
165 | },
166 | "execution_count": 54,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "len(train1)+len(train2)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 55,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "train=train1+train2"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 56,
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/plain": [
192 | "850972"
193 | ]
194 | },
195 | "execution_count": 56,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "len(train)"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 57,
207 | "metadata": {
208 | "collapsed": true
209 | },
210 | "outputs": [],
211 | "source": [
212 | "import pickle\n",
213 | "with open('preprocess_train','wb') as f:\n",
214 | " pickle.dump(train,f)"
215 | ]
216 | }
217 | ],
218 | "metadata": {
219 | "kernelspec": {
220 | "display_name": "cs771",
221 | "language": "python",
222 | "name": "cs771"
223 | },
224 | "language_info": {
225 | "codemirror_mode": {
226 | "name": "ipython",
227 | "version": 3
228 | },
229 | "file_extension": ".py",
230 | "mimetype": "text/x-python",
231 | "name": "python",
232 | "nbconvert_exporter": "python",
233 | "pygments_lexer": "ipython3",
234 | "version": "3.5.2"
235 | }
236 | },
237 | "nbformat": 4,
238 | "nbformat_minor": 2
239 | }
240 |
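Each record that this notebook pickles into 'preprocess_train' has the layout [sentence_id, words, instance_ids, lemmas, pos_tags], with the last four lists aligned position by position (the same token positions survive the isalphabet filter). A short illustrative check, not part of the notebook itself:

    import pickle

    with open('preprocess_train', 'rb') as f:
        train = pickle.load(f)

    sent_id, words, instance_ids, lemmas, pos_tags = train[0]
    assert len(words) == len(instance_ids) == len(lemmas) == len(pos_tags)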
--------------------------------------------------------------------------------
/one_million/one_word_data_maker-test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pickle\n",
12 | "from nltk.corpus import wordnet as wn"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "with open('/data/aviraj/dataset/raw_preprocess_test','rb') as f:\n",
24 | " global_data=pickle.load(f)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {
31 | "collapsed": true
32 | },
33 | "outputs": [],
34 | "source": [
35 | "with open('/data/aviraj/dataset/ALL.gold.key.txt','r') as f:\n",
36 | " data_key=f.readlines()"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 9,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/plain": [
47 | "['the',\n",
48 | " 'art',\n",
49 | " 'of',\n",
50 | " 'change_ringing',\n",
51 | " 'be',\n",
52 | " 'peculiar',\n",
53 | " 'to',\n",
54 | " 'the',\n",
55 | " 'english',\n",
56 | " ',',\n",
57 | " 'and',\n",
58 | " ',',\n",
59 | " 'like',\n",
60 | " 'most',\n",
61 | " 'english',\n",
62 | " 'peculiarity',\n",
63 | " ',',\n",
64 | " 'unintelligible',\n",
65 | " 'to',\n",
66 | " 'the',\n",
67 | " 'rest',\n",
68 | " 'of',\n",
69 | " 'the',\n",
70 | " 'world',\n",
71 | " '.']"
72 | ]
73 | },
74 | "execution_count": 9,
75 | "metadata": {},
76 | "output_type": "execute_result"
77 | }
78 | ],
79 | "source": [
80 | "global_data[0][3]"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 4,
86 | "metadata": {
87 | "collapsed": true
88 | },
89 | "outputs": [],
90 | "source": [
91 | "def make_word_data(checkword):\n",
92 | " \n",
93 | " dataset_line=[]\n",
94 | " for i,list_ in enumerate(global_data): \n",
95 | " ind=[idx for idx,it in enumerate(list_[3]) if it==checkword]\n",
96 | " for ii in ind:\n",
97 | " if list_[2][ii] is not None:\n",
98 | " dataset_line.append([list_[2][ii],list_[1],list_[4]])\n",
99 | " \n",
100 | " print(len(dataset_line))\n",
101 | " with open('/data/aviraj/dataset/checkwords/'+checkword + '_data_test', 'wb') as f:\n",
102 | " pickle.dump(dataset_line, f)\n",
103 | " with open('/data/aviraj/dataset/checkwords/'+checkword + '_data_test', 'rb') as f:\n",
104 | "        data_ = pickle.load(f)  # immediately reload as a sanity check; the result is not used"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 5,
110 | "metadata": {
111 | "collapsed": true
112 | },
113 | "outputs": [],
114 | "source": [
115 | "test_words = ['force', 'make', 'open', 'place', 'point', 'serve', 'support']"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 6,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "1\n",
128 | "31\n",
129 | "4\n",
130 | "5\n",
131 | "11\n",
132 | "2\n",
133 | "12\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "for word in test_words:\n",
139 | " make_word_data(word)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {
146 | "collapsed": true
147 | },
148 | "outputs": [],
149 | "source": [
150 | "with open('../Glove/vocab_glove', 'rb') as f:\n",
151 | " vocab = pickle.load(f)\n"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "collapsed": true
159 | },
160 | "outputs": [],
161 | "source": [
162 | "train_words = []\n",
163 | "for sent in global_data:\n",
164 | " train_words.extend(sent[1])"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": null,
170 | "metadata": {},
171 | "outputs": [],
172 | "source": [
173 | "len(train_words), len(set(train_words)), len(vocab)"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": null,
179 | "metadata": {
180 | "collapsed": true
181 | },
182 | "outputs": [],
183 | "source": [
184 | "import collections\n",
185 | "unknown_words = []\n",
186 | "for word in set(train_words):\n",
187 | " if word not in vocab:\n",
188 | " unknown_words.append(word)\n",
189 | " \n",
190 | "un_counter = collections.Counter(unknown_words)\n",
191 | "un_counter = dict(un_counter)\n",
192 | "\n",
193 | "sorted_un_counter = sorted(un_counter.items(), key=lambda x:x[1], reverse=True)\n",
194 | "sorted_un_counter"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": null,
200 | "metadata": {
201 | "collapsed": true
202 | },
203 | "outputs": [],
204 | "source": [
205 | "with open('million_unknown_words.pickle', 'wb') as f:\n",
206 | " pickle.dump(unknown_words, f)"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": null,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "len(sorted(global_data, key=lambda x:len(x[1]), reverse=True)[0][1])"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": []
226 | }
227 | ],
228 | "metadata": {
229 | "kernelspec": {
230 | "display_name": "envs",
231 | "language": "python",
232 | "name": "cs771"
233 | },
234 | "language_info": {
235 | "codemirror_mode": {
236 | "name": "ipython",
237 | "version": 3
238 | },
239 | "file_extension": ".py",
240 | "mimetype": "text/x-python",
241 | "name": "python",
242 | "nbconvert_exporter": "python",
243 | "pygments_lexer": "ipython3",
244 | "version": "3.5.2"
245 | }
246 | },
247 | "nbformat": 4,
248 | "nbformat_minor": 2
249 | }
250 |
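The cells above read ALL.gold.key.txt into data_key but do not use it further in this notebook. Assuming the usual layout of these gold key files (one line per annotated instance: an instance id followed by one or more WordNet sense keys), the instance ids stored by make_word_data could be resolved to synsets roughly as follows; a hedged sketch, not the repository's code:

    from nltk.corpus import wordnet as wn

    key_map = {}
    for line in data_key:                 # data_key as read above
        parts = line.split()
        key_map[parts[0]] = parts[1]      # keep the first sense key per instance

    # e.g. for the first annotated instance of the first test sentence
    synset = wn.lemma_from_key(key_map['senseval2.d000.s000.t000']).synset()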
--------------------------------------------------------------------------------
/one_million/one_word_data_maker.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pickle\n",
12 | "from nltk.corpus import wordnet as wn"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "with open('/data/aviraj/dataset/raw_preprocess_train','rb') as f:\n",
24 | " global_data=pickle.load(f)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {
31 | "collapsed": true
32 | },
33 | "outputs": [],
34 | "source": [
35 | "with open('/data/aviraj/dataset/semcor+omsti.gold.key.txt','r') as f:\n",
36 | " data_key=f.readlines()"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {
43 | "collapsed": true
44 | },
45 | "outputs": [],
46 | "source": [
47 | "def make_word_data(checkword):\n",
48 | " \n",
49 | " dataset_line=[]\n",
50 | " for i,list_ in enumerate(global_data): \n",
51 | " ind=[idx for idx,it in enumerate(list_[3]) if it==checkword]\n",
52 | " for ii in ind:\n",
53 | " if list_[2][ii] is not None:\n",
54 | " dataset_line.append([list_[2][ii],list_[1],list_[4]])\n",
55 | " \n",
56 | " print(len(dataset_line))\n",
57 | " with open('/data/aviraj/dataset/checkwords/'+checkword + '_data', 'wb') as f:\n",
58 | " pickle.dump(dataset_line, f)\n",
59 | " with open('/data/aviraj/dataset/checkwords/'+checkword + '_data', 'rb') as f:\n",
60 | "        data_ = pickle.load(f)  # immediately reload as a sanity check; the result is not used"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 5,
66 | "metadata": {
67 | "collapsed": true
68 | },
69 | "outputs": [],
70 | "source": [
71 | "test_words = ['force', 'make', 'open', 'place', 'point', 'serve', 'support']"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": null,
77 | "metadata": {
78 | "collapsed": true
79 | },
80 | "outputs": [],
81 | "source": []
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 6,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "name": "stdout",
90 | "output_type": "stream",
91 | "text": [
92 | "3723\n",
93 | "6593\n",
94 | "2922\n",
95 | "3569\n",
96 | "2855\n",
97 | "3462\n",
98 | "3489\n"
99 | ]
100 | }
101 | ],
102 | "source": [
103 | "for word in test_words:\n",
104 | " make_word_data(word)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "collapsed": true
112 | },
113 | "outputs": [],
114 | "source": [
115 | "with open('../Glove/vocab_glove', 'rb') as f:\n",
116 | " vocab = pickle.load(f)\n"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": null,
122 | "metadata": {
123 | "collapsed": true
124 | },
125 | "outputs": [],
126 | "source": [
127 | "train_words = []\n",
128 | "for sent in global_data:\n",
129 | " train_words.extend(sent[1])"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": null,
135 | "metadata": {},
136 | "outputs": [],
137 | "source": [
138 | "len(train_words), len(set(train_words)), len(vocab)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": null,
144 | "metadata": {
145 | "collapsed": true
146 | },
147 | "outputs": [],
148 | "source": [
149 | "import collections\n",
150 | "unknown_words = []\n",
151 | "for word in set(train_words):\n",
152 | " if word not in vocab:\n",
153 | " unknown_words.append(word)\n",
154 | " \n",
155 | "un_counter = collections.Counter(unknown_words)\n",
156 | "un_counter = dict(un_counter)\n",
157 | "\n",
158 | "sorted_un_counter = sorted(un_counter.items(), key=lambda x:x[1], reverse=True)\n",
159 | "sorted_un_counter"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {
166 | "collapsed": true
167 | },
168 | "outputs": [],
169 | "source": [
170 | "with open('million_unknown_words.pickle', 'wb') as f:\n",
171 | " pickle.dump(unknown_words, f)"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": null,
177 | "metadata": {},
178 | "outputs": [],
179 | "source": [
180 | "len(sorted(global_data, key=lambda x:len(x[1]), reverse=True)[0][1])"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "metadata": {
187 | "collapsed": true
188 | },
189 | "outputs": [],
190 | "source": []
191 | }
192 | ],
193 | "metadata": {
194 | "kernelspec": {
195 | "display_name": "envs",
196 | "language": "python",
197 | "name": "cs771"
198 | },
199 | "language_info": {
200 | "codemirror_mode": {
201 | "name": "ipython",
202 | "version": 3
203 | },
204 | "file_extension": ".py",
205 | "mimetype": "text/x-python",
206 | "name": "python",
207 | "nbconvert_exporter": "python",
208 | "pygments_lexer": "ipython3",
209 | "version": "3.5.2"
210 | }
211 | },
212 | "nbformat": 4,
213 | "nbformat_minor": 2
214 | }
215 |
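This notebook repeats one_word_data_maker-test.ipynb above almost line for line; only the input pickle, the gold key file and the output suffix differ. A hedged sketch of a single helper covering both splits (the parameter list, base_dir and split are illustrative names, not the repository's actual interface):

    import pickle

    def make_word_data(checkword, global_data, base_dir='/data/aviraj/dataset', split='train'):
        suffix = '_data' if split == 'train' else '_data_test'
        dataset = []
        for entry in global_data:
            # entry layout: [sentence_id, words, instance_ids, lemmas, pos_tags]
            for idx, lemma in enumerate(entry[3]):
                if lemma == checkword and entry[2][idx] is not None:
                    dataset.append([entry[2][idx], entry[1], entry[4]])
        with open(base_dir + '/checkwords/' + checkword + suffix, 'wb') as f:
            pickle.dump(dataset, f)
        return dataset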
--------------------------------------------------------------------------------
/one_million/raw_one_million_parsing-test.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import xml.etree.ElementTree as ET\n",
10 | "import numpy as np\n",
11 | "tree = ET.parse('../../dataset/ALL.data.xml')\n",
12 | "root = tree.getroot()"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": 2,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "root"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "metadata": {
39 | "collapsed": true
40 | },
41 | "outputs": [],
42 | "source": [
43 | "def isalphabet(word):\n",
44 | "    return True  # no filtering in the raw parse: every token, including punctuation, is kept"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 5,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "1173\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "train1 = []\n",
62 | "count=0\n",
63 | "for sentences in root.findall('text'):\n",
64 | " for sentence in sentences:\n",
65 | " temp_sent = []\n",
66 | " temp_sent.append(sentence.get('id'))\n",
67 | "\n",
68 | " temp_words = []\n",
69 | " i_cnt=1\n",
70 | " ind=[]\n",
71 | " for word in sentence:\n",
72 | " string = word.text.lower() \n",
73 | " if (isalphabet(string)):\n",
74 | " temp_words.append(string)\n",
75 | " ind.append(i_cnt)\n",
76 | " i_cnt+=1\n",
77 | "\n",
78 | " temp_sent.append(temp_words)\n",
79 | " list_ = sentence.iter()\n",
80 | " id_list = []\n",
81 | " lemma_list = []\n",
82 | " pos_list = []\n",
83 | " for i in list_:\n",
84 | " id_list.append(i.get('id'))\n",
85 | " lemma_list.append(i.get('lemma'))\n",
86 | " pos_list.append(i.get('pos'))\n",
87 | "\n",
88 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
89 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
90 | " train1.append(temp_sent)\n",
91 | " count+=1\n",
92 | " \n",
93 | "print(count)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 6,
99 | "metadata": {},
100 | "outputs": [
101 | {
102 | "data": {
103 | "text/plain": [
104 | "1173"
105 | ]
106 | },
107 | "execution_count": 6,
108 | "metadata": {},
109 | "output_type": "execute_result"
110 | }
111 | ],
112 | "source": [
113 | "len(train1)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 10,
119 | "metadata": {
120 | "collapsed": true
121 | },
122 | "outputs": [],
123 | "source": [
124 | "import pickle\n",
125 | "with open('/data/aviraj/dataset/raw_preprocess_test','wb') as f:\n",
126 | " pickle.dump(train1,f)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 12,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/plain": [
137 | "['senseval2.d000.s000',\n",
138 | " ['the',\n",
139 | " 'art',\n",
140 | " 'of',\n",
141 | " 'change-ringing',\n",
142 | " 'is',\n",
143 | " 'peculiar',\n",
144 | " 'to',\n",
145 | " 'the',\n",
146 | " 'english',\n",
147 | " ',',\n",
148 | " 'and',\n",
149 | " ',',\n",
150 | " 'like',\n",
151 | " 'most',\n",
152 | " 'english',\n",
153 | " 'peculiarities',\n",
154 | " ',',\n",
155 | " 'unintelligible',\n",
156 | " 'to',\n",
157 | " 'the',\n",
158 | " 'rest',\n",
159 | " 'of',\n",
160 | " 'the',\n",
161 | " 'world',\n",
162 | " '.'],\n",
163 | " [None,\n",
164 | " 'senseval2.d000.s000.t000',\n",
165 | " None,\n",
166 | " 'senseval2.d000.s000.t001',\n",
167 | " None,\n",
168 | " 'senseval2.d000.s000.t002',\n",
169 | " None,\n",
170 | " None,\n",
171 | " 'senseval2.d000.s000.t003',\n",
172 | " None,\n",
173 | " None,\n",
174 | " None,\n",
175 | " None,\n",
176 | " 'senseval2.d000.s000.t004',\n",
177 | " 'senseval2.d000.s000.t005',\n",
178 | " 'senseval2.d000.s000.t006',\n",
179 | " None,\n",
180 | " 'senseval2.d000.s000.t007',\n",
181 | " None,\n",
182 | " None,\n",
183 | " 'senseval2.d000.s000.t008',\n",
184 | " None,\n",
185 | " None,\n",
186 | " 'senseval2.d000.s000.t009',\n",
187 | " None],\n",
188 | " ['the',\n",
189 | " 'art',\n",
190 | " 'of',\n",
191 | " 'change_ringing',\n",
192 | " 'be',\n",
193 | " 'peculiar',\n",
194 | " 'to',\n",
195 | " 'the',\n",
196 | " 'english',\n",
197 | " ',',\n",
198 | " 'and',\n",
199 | " ',',\n",
200 | " 'like',\n",
201 | " 'most',\n",
202 | " 'english',\n",
203 | " 'peculiarity',\n",
204 | " ',',\n",
205 | " 'unintelligible',\n",
206 | " 'to',\n",
207 | " 'the',\n",
208 | " 'rest',\n",
209 | " 'of',\n",
210 | " 'the',\n",
211 | " 'world',\n",
212 | " '.'],\n",
213 | " ['DET',\n",
214 | " 'NOUN',\n",
215 | " 'ADP',\n",
216 | " 'NOUN',\n",
217 | " 'VERB',\n",
218 | " 'ADJ',\n",
219 | " 'PRT',\n",
220 | " 'DET',\n",
221 | " 'NOUN',\n",
222 | " '.',\n",
223 | " 'CONJ',\n",
224 | " '.',\n",
225 | " 'ADP',\n",
226 | " 'ADJ',\n",
227 | " 'ADJ',\n",
228 | " 'NOUN',\n",
229 | " '.',\n",
230 | " 'ADJ',\n",
231 | " 'PRT',\n",
232 | " 'DET',\n",
233 | " 'NOUN',\n",
234 | " 'ADP',\n",
235 | " 'DET',\n",
236 | " 'NOUN',\n",
237 | " '.']]"
238 | ]
239 | },
240 | "execution_count": 12,
241 | "metadata": {},
242 | "output_type": "execute_result"
243 | }
244 | ],
245 | "source": [
246 | "train1[0]"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": []
255 | }
256 | ],
257 | "metadata": {
258 | "kernelspec": {
259 | "display_name": "envs",
260 | "language": "python",
261 | "name": "cs771"
262 | },
263 | "language_info": {
264 | "codemirror_mode": {
265 | "name": "ipython",
266 | "version": 3
267 | },
268 | "file_extension": ".py",
269 | "mimetype": "text/x-python",
270 | "name": "python",
271 | "nbconvert_exporter": "python",
272 | "pygments_lexer": "ipython3",
273 | "version": "3.5.2"
274 | }
275 | },
276 | "nbformat": 4,
277 | "nbformat_minor": 2
278 | }
279 |
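The record train1[0] printed above keeps every token and stores None in the instance-id list wherever a token has no sense annotation. A small illustrative snippet (not part of the notebook) for pulling out only the annotated positions of such a record:

    sent_id, words, instance_ids, lemmas, pos_tags = train1[0]
    annotated = [(iid, lemma, pos)
                 for iid, lemma, pos in zip(instance_ids, lemmas, pos_tags)
                 if iid is not None]
    # first element: ('senseval2.d000.s000.t000', 'art', 'NOUN')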
--------------------------------------------------------------------------------
/one_million/raw_one_million_parsing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import xml.etree.ElementTree as ET\n",
10 | "import numpy as np\n",
11 | "tree = ET.parse('../../dataset/semcor+omsti.data.xml')\n",
12 | "root = tree.getroot()"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 3,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/plain": [
23 | ""
24 | ]
25 | },
26 | "execution_count": 3,
27 | "metadata": {},
28 | "output_type": "execute_result"
29 | }
30 | ],
31 | "source": [
32 | "root"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "metadata": {
39 | "collapsed": true
40 | },
41 | "outputs": [],
42 | "source": [
43 | "cor1 = root[0]\n",
44 | "cor2 = root[1]"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 6,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/plain": [
55 | ""
56 | ]
57 | },
58 | "execution_count": 6,
59 | "metadata": {},
60 | "output_type": "execute_result"
61 | }
62 | ],
63 | "source": [
64 | "cor2"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "def isalphabet(word):\n",
76 | "    return True  # no filtering in the raw parse: every token, including punctuation, is kept"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 4,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "37176\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "train1 = []\n",
94 | "#soup=soup.find_all('corpus')[1]\n",
95 | "count=0\n",
96 | "for sentences in cor1.findall('text'):\n",
97 | " for sentence in sentences:\n",
98 | " temp_sent = []\n",
99 | " temp_sent.append(sentence.get('id'))\n",
100 | "\n",
101 | " temp_words = []\n",
102 | " i_cnt=1\n",
103 | " ind=[]\n",
104 | " for word in sentence:\n",
105 | " string = word.text.lower() \n",
106 | " if (isalphabet(string)):\n",
107 | " temp_words.append(string)\n",
108 | " ind.append(i_cnt)\n",
109 | " i_cnt+=1\n",
110 | "\n",
111 | " temp_sent.append(temp_words)\n",
112 | " list_ = sentence.iter()\n",
113 | " id_list = []\n",
114 | " lemma_list = []\n",
115 | " pos_list = []\n",
116 | " for i in list_:\n",
117 | " id_list.append(i.get('id'))\n",
118 | " lemma_list.append(i.get('lemma'))\n",
119 | " pos_list.append(i.get('pos'))\n",
120 | "\n",
121 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
122 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
123 | " train1.append(temp_sent)\n",
124 | " count+=1\n",
125 | " \n",
126 | "print(count)"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 5,
132 | "metadata": {
133 | "collapsed": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "train1=train1[:len(train1)-2]"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 6,
143 | "metadata": {},
144 | "outputs": [
145 | {
146 | "name": "stdout",
147 | "output_type": "stream",
148 | "text": [
149 | "813798\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "train2 = []\n",
155 | "#soup=soup.find_all('corpus')[1]\n",
156 | "count=0\n",
157 | "for sentences in cor2.findall('text'):\n",
158 | " for sentence in sentences:\n",
159 | " temp_sent = []\n",
160 | " temp_sent.append(sentence.get('id'))\n",
161 | "\n",
162 | " temp_words = []\n",
163 | " i_cnt=1\n",
164 | " ind=[]\n",
165 | " for word in sentence:\n",
166 | " string = word.text.lower() \n",
167 | " if (isalphabet(string)):\n",
168 | " temp_words.append(string)\n",
169 | " ind.append(i_cnt)\n",
170 | " i_cnt+=1\n",
171 | "\n",
172 | " temp_sent.append(temp_words)\n",
173 | " list_ = sentence.iter()\n",
174 | " id_list = []\n",
175 | " lemma_list = []\n",
176 | " pos_list = []\n",
177 | " for i in list_:\n",
178 | " id_list.append(i.get('id'))\n",
179 | " lemma_list.append(i.get('lemma'))\n",
180 | " pos_list.append(i.get('pos'))\n",
181 | "\n",
182 | " id_list, lemma_list , pos_list = np.array(id_list),np.array(lemma_list),np.array(pos_list)\n",
183 | " temp_sent.extend([list(id_list[ind]), list(lemma_list[ind]), list(pos_list[ind])])\n",
184 | " train2.append(temp_sent)\n",
185 | " count+=1\n",
186 | " \n",
187 | "print(count)"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 7,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "data": {
197 | "text/plain": [
198 | "850972"
199 | ]
200 | },
201 | "execution_count": 7,
202 | "metadata": {},
203 | "output_type": "execute_result"
204 | }
205 | ],
206 | "source": [
207 | "len(train1)+len(train2)"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 17,
213 | "metadata": {
214 | "collapsed": true
215 | },
216 | "outputs": [],
217 | "source": [
218 | "train=train1+train2"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 18,
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/plain": [
229 | "850972"
230 | ]
231 | },
232 | "execution_count": 18,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "len(train)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 19,
244 | "metadata": {
245 | "collapsed": true
246 | },
247 | "outputs": [],
248 | "source": [
249 | "import pickle\n",
250 | "with open('raw_preprocess_train','wb') as f:\n",
251 | " pickle.dump(train,f)"
252 | ]
253 | }
254 | ],
255 | "metadata": {
256 | "kernelspec": {
257 | "display_name": "envs",
258 | "language": "python",
259 | "name": "cs771"
260 | },
261 | "language_info": {
262 | "codemirror_mode": {
263 | "name": "ipython",
264 | "version": 3
265 | },
266 | "file_extension": ".py",
267 | "mimetype": "text/x-python",
268 | "name": "python",
269 | "nbconvert_exporter": "python",
270 | "pygments_lexer": "ipython3",
271 | "version": "3.5.2"
272 | }
273 | },
274 | "nbformat": 4,
275 | "nbformat_minor": 2
276 | }
277 |
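ET.parse() above loads the entire semcor+omsti XML (roughly 850k sentences) into memory before the two corpus loops run. If that ever becomes a constraint, a streaming pass with xml.etree.ElementTree.iterparse is one possible alternative; the sketch below is an illustration under that assumption, not what the repository does:

    import xml.etree.ElementTree as ET

    def iter_sentences(path='semcor+omsti.data.xml'):
        for event, elem in ET.iterparse(path, events=('end',)):
            if elem.tag == 'sentence':
                yield elem
                elem.clear()   # release the finished sentence subtree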
--------------------------------------------------------------------------------
/papers/1603.07012.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/1603.07012.pdf
--------------------------------------------------------------------------------
/papers/9f260612d5817d542cda2a7d9a6eb18d6471.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/9f260612d5817d542cda2a7d9a6eb18d6471.pdf
--------------------------------------------------------------------------------
/papers/D17-1008.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/D17-1008.pdf
--------------------------------------------------------------------------------
/papers/K16-1006.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/K16-1006.pdf
--------------------------------------------------------------------------------
/papers/P16-1085.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/P16-1085.pdf
--------------------------------------------------------------------------------
/papers/W16-5307.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/W16-5307.pdf
--------------------------------------------------------------------------------
/papers/a10-navigli.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/a10-navigli.pdf
--------------------------------------------------------------------------------
/papers/crf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/crf.pdf
--------------------------------------------------------------------------------
/papers/report1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/report1.pdf
--------------------------------------------------------------------------------
/papers/report2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sshanu/Hierarchical-Word-Sense-Disambiguation-using-WordNet-Senses/74905c60e48ab1884bf5c8f208e21015f7f7fd21/papers/report2.pdf
--------------------------------------------------------------------------------