├── BLEU.py ├── GCN.ipynb ├── NMT - default.ipynb ├── README.md ├── corpus ├── en-spa.txt ├── en.csv └── spa.csv ├── data.py ├── model ├── .ipynb_checkpoints │ ├── decoder-checkpoint.py │ ├── encoder-checkpoint.py │ └── graph-checkpoint.py ├── __pycache__ │ ├── decoder.cpython-36.pyc │ ├── encoder.cpython-36.pyc │ └── graph.cpython-36.pyc ├── decoder.py ├── encoder.py └── graph.py ├── utils.py └── validation.py /BLEU.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import codecs 3 | import os 4 | import math 5 | import operator 6 | import json 7 | from functools import reduce 8 | 9 | 10 | def fetch_data(cand, ref): 11 | """ Store the reference and candidate sentences as lists """ 12 | references = [] 13 | if '.txt' in ref: 14 | reference_file = codecs.open(ref, 'r') 15 | references.append(reference_file.readlines()) 16 | else: 17 | for root, dirs, files in os.walk(ref): 18 | for f in files: 19 | reference_file = codecs.open(os.path.join(root, f), 'r') 20 | references.append(reference_file.readlines()) 21 | candidate_file = codecs.open(cand, 'r') 22 | candidate = candidate_file.readlines() 23 | return candidate, references 24 | 25 | 26 | def count_ngram(candidate, references, n): 27 | clipped_count = 0 28 | count = 0 29 | r = 0 30 | c = 0 31 | for si in range(len(candidate)): 32 | # Calculate precision for each sentence 33 | ref_counts = [] 34 | ref_lengths = [] 35 | # Build dictionary of ngram counts 36 | for reference in references: 37 | ref_sentence = reference[si] 38 | ngram_d = {} 39 | words = ref_sentence.strip().split() 40 | ref_lengths.append(len(words)) 41 | limits = len(words) - n + 1 42 | # loop through the sentence, considering the ngram length 43 | for i in range(limits): 44 | ngram = ' '.join(words[i:i+n]).lower() 45 | if ngram in ngram_d: 46 | ngram_d[ngram] += 1 47 | else: 48 | ngram_d[ngram] = 1 49 | ref_counts.append(ngram_d) 50 | # candidate 51 | cand_sentence = candidate[si] 52 | cand_dict = {} 53 | words = cand_sentence.strip().split() 54 | limits = len(words) - n + 1 55 | for i in range(0, limits): 56 | ngram = ' '.join(words[i:i + n]).lower() 57 | if ngram in cand_dict: 58 | cand_dict[ngram] += 1 59 | else: 60 | cand_dict[ngram] = 1 61 | clipped_count += clip_count(cand_dict, ref_counts) 62 | count += limits 63 | r += best_length_match(ref_lengths, len(words)) 64 | c += len(words) 65 | if clipped_count == 0: 66 | pr = 0 67 | else: 68 | pr = float(clipped_count) / count 69 | bp = brevity_penalty(c, r) 70 | return pr, bp 71 | 72 | 73 | def clip_count(cand_d, ref_ds): 74 | """Count the clipped count for each ngram considering all references""" 75 | count = 0 76 | for m in cand_d.keys(): 77 | m_w = cand_d[m] 78 | m_max = 0 79 | for ref in ref_ds: 80 | if m in ref: 81 | m_max = max(m_max, ref[m]) 82 | m_w = min(m_w, m_max) 83 | count += m_w 84 | return count 85 | 86 | 87 | def best_length_match(ref_l, cand_l): 88 | """Find the reference length closest to the candidate length""" 89 | 90 | least_diff = abs(cand_l-ref_l[0]) 91 | best = ref_l[0] 92 | for ref in ref_l: 93 | if abs(cand_l-ref) < least_diff: 94 | least_diff = abs(cand_l-ref) 95 | best = ref 96 | return best 97 | 98 | 99 | def brevity_penalty(c, r): 100 | if c > r: 101 | bp = 1 102 | else: 103 | bp = math.exp(1-(float(r)/c)) 104 | return bp 105 | 106 | 107 | def geometric_mean(precisions): 108 | return (reduce(operator.mul, precisions)) ** (1.0 / len(precisions)) 109 | 110 | 111 | def BLEU(candidate, references, n_grams=4): 112 | precisions = [] 113 | for i
in range(n_grams): 114 | pr, bp = count_ngram(candidate, references, i+1) 115 | precisions.append(pr) 116 | bleu = geometric_mean(precisions) * bp 117 | return bleu, precisions, bp -------------------------------------------------------------------------------- /GCN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from data import generate_batches\n", 12 | "from data import prepare_data\n", 13 | "from data import data_to_index\n", 14 | "from data import DEP_LABELS\n", 15 | "\n", 16 | "from model.graph import Sintactic_GCN\n", 17 | "from model.encoder import Encoder\n", 18 | "from model.decoder import Decoder_luong\n", 19 | "\n", 20 | "from BLEU import BLEU\n", 21 | "\n", 22 | "from utils import time_since\n", 23 | "\n", 24 | "import torch\n", 25 | "import torch.nn as nn\n", 26 | "from torch.nn import functional\n", 27 | "from torch.autograd import Variable\n", 28 | "from torch import optim\n", 29 | "import torch.nn.functional as F\n", 30 | "\n", 31 | "from stanfordcorenlp import StanfordCoreNLP \n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import time\n", 35 | "\n", 36 | "from validation import Evaluator\n", 37 | "\n", 38 | "%load_ext autoreload\n", 39 | "%autoreload 2" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "USE_CUDA = True\n", 49 | "MAX_LENGTH = 100\n", 50 | "\n", 51 | "SPLIT_TRAIN = 0.7\n", 52 | "SPLIT_VALID = 0.15\n", 53 | "# The rest is for test" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "# Reading the data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Prepare vocabulary and pairs for the data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Reading lines...\n", 80 | "Read 118964 sentence pairs\n", 81 | "Filtered to 85785 pairs\n", 82 | "Creating vocab...\n", 83 | "Indexed 12436 words in input language, 22765 words in output\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "input_lang, output_lang, pairs = prepare_data('en', 'spa', max_length=MAX_LENGTH)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Splitting pairs into train, val and test" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "#np.shuffle(pairs)\n", 105 | "pairs_train = pairs[:int(len(pairs) * SPLIT_TRAIN)]\n", 106 | "pairs_valid = pairs[int(len(pairs) * SPLIT_TRAIN):int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID))]\n", 107 | "pairs_test = pairs[int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID)):]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "(60049, 12868, 12868)" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "len(pairs_train), len(pairs_valid), len(pairs_test)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get the adjacency matrix for the pairs" 135 | ] 136
| }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 6, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "nlp = StanfordCoreNLP(r'/home/krivas/stanford-corenlp-full-2018-02-27/')" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from tqdm import tqdm\n", 153 | "def get_adjacency_matrix(pairs):\n", 154 | " arr_dep = []\n", 155 | " for pair in tqdm(pairs):\n", 156 | " arr_dep.append(nlp.dependency_parse(pair[0]))\n", 157 | " return np.array(arr_dep)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 8, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "100%|██████████| 60049/60049 [07:20<00:00, 136.33it/s]\n", 170 | "100%|██████████| 12868/12868 [02:03<00:00, 104.42it/s]\n", 171 | "100%|██████████| 12868/12868 [02:29<00:00, 86.21it/s]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "arr_dep_train = get_adjacency_matrix(pairs_train)\n", 177 | "arr_dep_valid = get_adjacency_matrix(pairs_valid)\n", 178 | "arr_dep_test = get_adjacency_matrix(pairs_test)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Converting words to index in pairs" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "pairs_train = data_to_index(pairs_train, input_lang, output_lang)\n", 195 | "pairs_valid = data_to_index(pairs_valid, input_lang, output_lang)\n", 196 | "pairs_test = data_to_index(pairs_test, input_lang, output_lang)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "# Training" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "def pass_batch_luong(batch_size, input_batches, target_batches, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):\n", 213 | " \n", 214 | " hidden = encoder.init_hidden(batch_size)\n", 215 | "\n", 216 | " encoder_outputs, encoder_hidden = encoder(input_batches, hidden)\n", 217 | " decoder_input = Variable(torch.LongTensor([output_lang.vocab.stoi[\"<sos>\"]] * batch_size))\n", 218 | " \n", 219 | " if gcn1:\n", 220 | " encoder_outputs = gcn1(encoder_outputs,\n", 221 | " adj_arc_in, adj_arc_out,\n", 222 | " adj_lab_in, adj_lab_out,\n", 223 | " mask_in, mask_out, \n", 224 | " mask_loop)\n", 225 | " \n", 226 | " decoder_hidden = encoder_hidden\n", 227 | " decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) \n", 228 | " \n", 229 | " all_decoder_outputs = Variable(torch.zeros(target_batches.data.size()[0], batch_size, len(output_lang.vocab.itos)))\n", 230 | "\n", 231 | " if USE_CUDA:\n", 232 | " all_decoder_outputs = all_decoder_outputs.cuda()\n", 233 | " decoder_input = decoder_input.cuda()\n", 234 | " decoder_context = decoder_context.cuda()\n", 235 | " \n", 236 | " if train:\n", 237 | " use_teacher_forcing = np.random.random() < tf_ratio\n", 238 | " else:\n", 239 | " use_teacher_forcing = False\n", 240 | " \n", 241 | " if use_teacher_forcing: \n", 242 | " # Use targets as inputs\n", 243 | " for di in range(target_batches.shape[0]):\n", 244 | " decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(\n", 245 | " decoder_input.unsqueeze(0),
decoder_context, decoder_hidden, encoder_outputs)\n", 246 | " \n", 247 | " all_decoder_outputs[di] = decoder_output\n", 248 | " decoder_input = target_batches[di]\n", 249 | " else: \n", 250 | " # Use decoder output as inputs\n", 251 | " for di in range(target_batches.shape[0]): \n", 252 | " decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(\n", 253 | " decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) \n", 254 | " \n", 255 | " all_decoder_outputs[di] = decoder_output\n", 256 | " \n", 257 | " # Greedy approach, take the word with highest probability\n", 258 | " topv, topi = decoder_output.data.topk(1) \n", 259 | " decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())\n", 260 | " if USE_CUDA: decoder_input = decoder_input.cuda()\n", 261 | " \n", 262 | " del decoder_output\n", 263 | " del decoder_hidden\n", 264 | " \n", 265 | " return all_decoder_outputs, target_batches\n", 266 | "\n", 267 | "def train_luong(input_batches, target_batches, batch_size, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):\n", 268 | " \n", 269 | " # Zero gradients of all optimizers\n", 270 | " if train:\n", 271 | " encoder_optimizer.zero_grad()\n", 272 | " decoder_optimizer.zero_grad()\n", 273 | " if gcn1: gcn1_optimizer.zero_grad()\n", 274 | " loss = 0 # Added onto for each word\n", 275 | " all_decoder_outputs, target_batches = pass_batch_luong(batch_size, input_batches, target_batches, train, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 276 | " \n", 277 | " # Loss calculation and backpropagation\n", 278 | " loss = criterion(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))\n", 279 | " \n", 280 | " if train:\n", 281 | " loss.backward()\n", 282 | " torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)\n", 283 | " torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)\n", 284 | " encoder_optimizer.step()\n", 285 | " decoder_optimizer.step()\n", 286 | " \n", 287 | " if gcn1:\n", 288 | " torch.nn.utils.clip_grad_norm_(gcn1.parameters(), clip)\n", 289 | " gcn1_optimizer.step()\n", 290 | "\n", 291 | " del all_decoder_outputs\n", 292 | " del target_batches\n", 293 | " \n", 294 | " return loss.item()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "# Model" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 18, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "# Configure models\n", 313 | "hidden_size_rnn = 512\n", 314 | "hidden_size_graph = 512\n", 315 | "emb_size=300\n", 316 | "n_layers = 2\n", 317 | "dropout = 0.1\n", 318 | "batch_size = 50\n", 319 | "\n", 320 | "# Configure training/optimization\n", 321 | "clip = 10.0\n", 322 | "learning_rate_graph = 0.0002\n", 323 | "n_epochs = 20\n", 324 | "print_every = 10\n", 325 | "validate_loss_every = 50\n", 326 | "validate_acc_every = 2 * validate_loss_every\n", 327 | "tf_ratio = 0.5\n", 328 | "best_bleu = 0" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# Initialize models\n", 338 | "encoder = Encoder(len(input_lang.vocab.itos), hidden_size_rnn, emb_size, n_layers=n_layers, dropout=dropout, USE_CUDA=USE_CUDA)\n", 339 | "decoder = Decoder_luong('general', hidden_size_graph, len(output_lang.vocab.itos), 300, n_layers=2 * n_layers, dropout=dropout,
USE_CUDA=USE_CUDA)\n", 340 | "gcn1 = Sintactic_GCN(hidden_size_rnn, hidden_size_graph, num_labels=len(DEP_LABELS))\n", 341 | "\n", 342 | "# Initialize optimizers and criterion\n", 343 | "encoder_optimizer = optim.Adam(encoder.parameters())\n", 344 | "decoder_optimizer = optim.Adam(decoder.parameters())\n", 345 | "gcn1_optimizer = optim.Adam(gcn1.parameters())#, learning_rate_graph)\n", 346 | "\n", 347 | "criterion = nn.NLLLoss()\n", 348 | "\n", 349 | "# Move models to GPU\n", 350 | "if USE_CUDA:\n", 351 | " encoder = encoder.cuda()\n", 352 | " decoder = decoder.cuda()\n", 353 | " gcn1 = gcn1.cuda()\n", 354 | " \n", 355 | "# Keep track of time elapsed and running averages\n", 356 | "start = time.time()\n", 357 | "train_losses = []\n", 358 | "validation_losses = []\n", 359 | "validation_bleu = []\n", 360 | "\n", 361 | "print_loss_total = 0 # Reset every print_every\n", 362 | "plot_loss_total = 0 # Reset every plot_every" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "0m 4s (- 8m 56s) (10 0.83%) train_loss: 5.5433\n", 375 | "0m 6s (- 6m 40s) (20 1.67%) train_loss: 2.3962\n", 376 | "0m 8s (- 5m 50s) (30 2.50%) train_loss: 2.1459\n", 377 | "0m 11s (- 5m 21s) (40 3.33%) train_loss: 2.1469\n", 378 | "0m 13s (- 5m 7s) (50 4.16%) train_loss: 2.0643\n", 379 | "0m 15s (- 5m 1s) (60 5.00%) train_loss: 2.0653\n", 380 | "0m 18s (- 4m 57s) (70 5.83%) train_loss: 2.1074\n", 381 | "0m 20s (- 4m 52s) (80 6.66%) train_loss: 2.0922\n", 382 | "0m 23s (- 4m 48s) (90 7.49%) train_loss: 2.1566\n", 383 | "0m 25s (- 4m 45s) (100 8.33%) train_loss: 1.9920\n", 384 | "0m 28s (- 4m 42s) (110 9.16%) train_loss: 1.9252\n", 385 | "0m 30s (- 4m 39s) (120 9.99%) train_loss: 1.9822\n", 386 | "0m 33s (- 4m 35s) (130 10.82%) train_loss: 1.8853\n", 387 | "0m 35s (- 4m 32s) (140 11.66%) train_loss: 1.8869\n", 388 | "0m 39s (- 4m 37s) (150 12.49%) train_loss: 1.8213\n", 389 | "0m 43s (- 4m 41s) (160 13.32%) train_loss: 1.7827\n", 390 | "0m 46s (- 4m 44s) (170 14.15%) train_loss: 1.7572\n", 391 | "0m 50s (- 4m 47s) (180 14.99%) train_loss: 1.7951\n", 392 | "0m 54s (- 4m 49s) (190 15.82%) train_loss: 1.7455\n", 393 | "0m 58s (- 4m 50s) (200 16.65%) train_loss: 1.7772\n", 394 | "1m 1s (- 4m 51s) (210 17.49%) train_loss: 1.7657\n", 395 | "1m 5s (- 4m 52s) (220 18.32%) train_loss: 1.7416\n", 396 | "1m 9s (- 4m 51s) (230 19.15%) train_loss: 1.8612\n", 397 | "1m 12s (- 4m 50s) (240 19.98%) train_loss: 1.7539\n", 398 | "1m 15s (- 4m 48s) (250 20.82%) train_loss: 1.5999\n", 399 | "1m 19s (- 4m 47s) (260 21.65%) train_loss: 1.7584\n", 400 | "1m 23s (- 4m 46s) (270 22.48%) train_loss: 1.6611\n", 401 | "1m 26s (- 4m 45s) (280 23.31%) train_loss: 1.7680\n", 402 | "1m 30s (- 4m 44s) (290 24.15%) train_loss: 1.7306\n", 403 | "1m 34s (- 4m 43s) (300 24.98%) train_loss: 1.7626\n", 404 | "1m 38s (- 4m 41s) (310 25.81%) train_loss: 1.7287\n", 405 | "1m 41s (- 4m 40s) (320 26.64%) train_loss: 1.6571\n", 406 | "1m 45s (- 4m 38s) (330 27.48%) train_loss: 1.7704\n", 407 | "1m 48s (- 4m 35s) (340 28.31%) train_loss: 1.7828\n", 408 | "1m 52s (- 4m 33s) (350 29.14%) train_loss: 1.7588\n", 409 | "1m 56s (- 4m 31s) (360 29.98%) train_loss: 1.7041\n", 410 | "1m 59s (- 4m 28s) (370 30.81%) train_loss: 1.7498\n", 411 | "2m 3s (- 4m 26s) (380 31.64%) train_loss: 1.6711\n", 412 | "2m 6s (- 4m 23s) (390 32.47%) train_loss: 1.6425\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "for epoch in range(1, 
n_epochs): \n", 418 | " # Shuffle data\n", 419 | " id_aux = np.random.permutation(np.arange(len(pairs_train)))\n", 420 | " pairs_train = pairs_train[id_aux]\n", 421 | " arr_dep_train = arr_dep_train[id_aux]\n", 422 | " \n", 423 | " # Get the batches for this epoch\n", 424 | " input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, return_dep_tree=True, arr_dep=arr_dep_train, max_degree=6, USE_CUDA=USE_CUDA)\n", 425 | " print_loss_total = 0\n", 426 | " for batch_ix, (input_batch, target_var) in enumerate(zip(input_batches, target_batches)):\n", 427 | " \n", 428 | " encoder.train()\n", 429 | " decoder.train()\n", 430 | " gcn1.train()\n", 431 | " \n", 432 | " [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch\n", 433 | " # Run the train function\n", 434 | " loss = train_luong(input_var, target_var, input_var.size(1), \n", 435 | " True, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 436 | " \n", 437 | " torch.cuda.empty_cache()\n", 438 | "\n", 439 | " # Keep track of loss\n", 440 | " print_loss_total += loss\n", 441 | " plot_loss_total += loss\n", 442 | "\n", 443 | " if batch_ix == 0: continue\n", 444 | "\n", 445 | " if batch_ix % print_every == 0:\n", 446 | " print_loss_avg = print_loss_total / print_every\n", 447 | " print_loss_total = 0\n", 448 | " print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)\n", 449 | " train_losses.append(loss)\n", 450 | "\n", 451 | " print(f'{time_since(start, batch_ix / len(input_batches))} ({batch_ix} {batch_ix / len(input_batches) * 100:.2f}%) train_loss: {print_loss_avg:.4f}')\n", 452 | " \n", 453 | " input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_valid, return_dep_tree=True, arr_dep=arr_dep_valid, max_degree=6, USE_CUDA=USE_CUDA)\n", 454 | " print_loss_total = 0\n", 455 | " for input_batch, target_var in zip(input_batches, target_batches):\n", 456 | " \n", 457 | " encoder.eval()\n", 458 | " decoder.eval()\n", 459 | " gcn1.eval()\n", 460 | " \n", 461 | " [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch\n", 462 | " # Run the train function in eval mode (no parameter updates)\n", 463 | " loss = train_luong(input_var, target_var, input_var.size(1), \n", 464 | " False, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 465 | " \n", 466 | " print_loss_total += loss\n", 467 | " val_loss = print_loss_total / len(input_batches)\n", 468 | " validation_losses.append(val_loss)\n", 469 | " # Evaluating Bleu\n", 470 | " evaluator = Evaluator(encoder, decoder, gcn1, None, input_lang, output_lang, MAX_LENGTH, True)\n", 471 | " candidates, references = evaluator.get_candidates_and_references(pairs_test, arr_dep_test, k_beams=1)\n", 472 | " bleu = BLEU(candidates, [references])\n", 473 | " if bleu[0] > best_bleu:\n", 474 | " best_bleu = bleu[0]\n", 475 | " torch.save(encoder.state_dict(), 'encoder_graph.pkl')\n", 476 | " torch.save(decoder.state_dict(), 'decoder_graph.pkl')\n", 477 | " torch.save(gcn1.state_dict(), 'gcn_graph.pkl')\n", 478 | " validation_bleu.append(bleu)\n", 479 | " print(f'val_loss: {val_loss:.4f} - bleu: {bleu}', end=' ')\n", 480 | "\n", 481 | " # Prevent GPU memory overflow\n", 482 | " del evaluator" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [] 491 | } 492 |
], 493 | "metadata": { 494 | "kernelspec": { 495 | "display_name": "Python 3", 496 | "language": "python", 497 | "name": "python3" 498 | }, 499 | "language_info": { 500 | "codemirror_mode": { 501 | "name": "ipython", 502 | "version": 3 503 | }, 504 | "file_extension": ".py", 505 | "mimetype": "text/x-python", 506 | "name": "python", 507 | "nbconvert_exporter": "python", 508 | "pygments_lexer": "ipython3", 509 | "version": "3.6.4" 510 | }, 511 | "toc": { 512 | "nav_menu": {}, 513 | "number_sections": true, 514 | "sideBar": true, 515 | "skip_h1_title": false, 516 | "toc_cell": false, 517 | "toc_position": {}, 518 | "toc_section_display": "block", 519 | "toc_window_display": false 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 2 524 | } 525 | -------------------------------------------------------------------------------- /NMT - default.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from data import generate_batches\n", 12 | "from data import prepare_data\n", 13 | "from data import data_to_index\n", 14 | "from data import DEP_LABELS\n", 15 | "\n", 16 | "from model.graph import Sintactic_GCN\n", 17 | "from model.encoder import Encoder\n", 18 | "from model.decoder import Decoder_luong\n", 19 | "\n", 20 | "from BLEU import BLEU\n", 21 | "\n", 22 | "from utils import time_since\n", 23 | "\n", 24 | "import torch\n", 25 | "import torch.nn as nn\n", 26 | "from torch.nn import functional\n", 27 | "from torch.autograd import Variable\n", 28 | "from torch import optim\n", 29 | "import torch.nn.functional as F\n", 30 | "\n", 31 | "from stanfordcorenlp import StanfordCoreNLP \n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import time\n", 35 | "\n", 36 | "from validation import Evaluator\n", 37 | "\n", 38 | "%load_ext autoreload\n", 39 | "%autoreload 2" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "USE_CUDA = True\n", 49 | "MAX_LENGTH = 100\n", 50 | "\n", 51 | "SPLIT_TRAIN = 0.7\n", 52 | "SPLIT_VALID = 0.15\n", 53 | "# The rest is for test" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "# Reading the data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Prepare vocabulary and pairs for the data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Reading lines...\n", 80 | "Read 118964 sentence pairs\n", 81 | "Filtered to 85785 pairs\n", 82 | "Creating vocab...\n", 83 | "Indexed 12436 words in input language, 22765 words in output\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "input_lang, output_lang, pairs = prepare_data('en', 'spa', max_length=MAX_LENGTH)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Splitting pairs into train, val and test" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "#np.shuffle(pairs)\n", 105 | "pairs_train = pairs[:int(len(pairs) * SPLIT_TRAIN)]\n", 106 | "pairs_valid = pairs[int(len(pairs) * SPLIT_TRAIN):int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID))]\n", 107 | "pairs_test = 
pairs[int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID)):]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "(60049, 12868, 12868)" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "len(pairs_train), len(pairs_valid), len(pairs_test)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get the adjacency matrix for the pairs" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 6, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "nlp = StanfordCoreNLP(r'/home/krivas/stanford-corenlp-full-2018-02-27/')" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from tqdm import tqdm\n", 153 | "def get_adjacency_matrix(pairs):\n", 154 | " arr_dep = []\n", 155 | " for pair in tqdm(pairs):\n", 156 | " arr_dep.append(nlp.dependency_parse(pair[0]))\n", 157 | " return np.array(arr_dep)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 8, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "100%|██████████| 60049/60049 [07:22<00:00, 135.68it/s]\n", 170 | "100%|██████████| 12868/12868 [02:01<00:00, 106.01it/s]\n", 171 | "100%|██████████| 12868/12868 [02:26<00:00, 87.54it/s]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "arr_dep_train = get_adjacency_matrix(pairs_train)\n", 177 | "arr_dep_valid = get_adjacency_matrix(pairs_valid)\n", 178 | "arr_dep_test = get_adjacency_matrix(pairs_test)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Converting words to index in pairs" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "pairs_train = data_to_index(pairs_train, input_lang, output_lang)\n", 195 | "pairs_valid = data_to_index(pairs_valid, input_lang, output_lang)\n", 196 | "pairs_test = data_to_index(pairs_test, input_lang, output_lang)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "# Training" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "def pass_batch_luong(batch_size, input_batches, target_batches, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):\n", 213 | " \n", 214 | " hidden = encoder.init_hidden(batch_size)\n", 215 | "\n", 216 | " encoder_outputs, encoder_hidden = encoder(input_batches, hidden)\n", 217 | " decoder_input = Variable(torch.LongTensor([output_lang.vocab.stoi[\"<sos>\"]] * batch_size))\n", 218 | " \n", 219 | " decoder_hidden = encoder_hidden\n", 220 | " decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) \n", 221 | " \n", 222 | " all_decoder_outputs = Variable(torch.zeros(target_batches.data.size()[0], batch_size, len(output_lang.vocab.itos)))\n", 223 | "\n", 224 | " if USE_CUDA:\n", 225 | " all_decoder_outputs = all_decoder_outputs.cuda()\n", 226 | " decoder_input = decoder_input.cuda()\n", 227 | " decoder_context = decoder_context.cuda()\n", 228 | " \n", 229 | " if train:\n", 230 | "
use_teacher_forcing = np.random.random() < tf_ratio\n", 231 | " else:\n", 232 | " use_teacher_forcing = False\n", 233 | " \n", 234 | " if use_teacher_forcing: \n", 235 | " # Use targets as inputs\n", 236 | " for di in range(target_batches.shape[0]):\n", 237 | " decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(\n", 238 | " decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)\n", 239 | " \n", 240 | " all_decoder_outputs[di] = decoder_output\n", 241 | " decoder_input = target_batches[di]\n", 242 | " else: \n", 243 | " # Use decoder output as inputs\n", 244 | " for di in range(target_batches.shape[0]): \n", 245 | " decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(\n", 246 | " decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) \n", 247 | " \n", 248 | " all_decoder_outputs[di] = decoder_output\n", 249 | " \n", 250 | " # Greedy approach, take the word with highest probability\n", 251 | " topv, topi = decoder_output.data.topk(1) \n", 252 | " decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())\n", 253 | " if USE_CUDA: decoder_input = decoder_input.cuda()\n", 254 | " \n", 255 | " del decoder_output\n", 256 | " del decoder_hidden\n", 257 | " \n", 258 | " return all_decoder_outputs, target_batches\n", 259 | "\n", 260 | "def train_luong(input_batches, target_batches, batch_size, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):\n", 261 | " \n", 262 | " # Zero gradients of all optimizers\n", 263 | " if train:\n", 264 | " encoder_optimizer.zero_grad()\n", 265 | " decoder_optimizer.zero_grad()\n", 266 | " if gcn1: gcn1_optimizer.zero_grad()\n", 267 | " loss = 0 # Added onto for each word\n", 268 | " all_decoder_outputs, target_batches = pass_batch_luong(batch_size, input_batches, target_batches, train, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 269 | " \n", 270 | " # Loss calculation and backpropagation\n", 271 | " loss = criterion(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))\n", 272 | " \n", 273 | " if train:\n", 274 | " loss.backward()\n", 275 | " torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)\n", 276 | " torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)\n", 277 | " encoder_optimizer.step()\n", 278 | " decoder_optimizer.step()\n", 279 | " \n", 280 | " if gcn1:\n", 281 | " torch.nn.utils.clip_grad_norm_(gcn1.parameters(), clip)\n", 282 | " gcn1_optimizer.step()\n", 283 | "\n", 284 | " del all_decoder_outputs\n", 285 | " del target_batches\n", 286 | " \n", 287 | " return loss.item()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "# Model" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 18, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "# Configure models\n", 306 | "hidden_size_rnn = 512\n", 307 | "hidden_size_graph = 512\n", 308 | "emb_size=300\n", 309 | "n_layers = 2\n", 310 | "dropout = 0.1\n", 311 | "batch_size = 50\n", 312 | "\n", 313 | "# Configure training/optimization\n", 314 | "clip = 10.0\n", 315 | "learning_rate_graph = 0.0002\n", 316 | "n_epochs = 20\n", 317 | "print_every = 10\n", 318 | "validate_loss_every = 50\n", 319 | "validate_acc_every = 2 * validate_loss_every\n", 320 | "tf_ratio = 0.5\n", 321 | "best_bleu = 0" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count":
null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# Initialize models\n", 331 | "encoder = Encoder(len(input_lang.vocab.itos), hidden_size_rnn, emb_size, n_layers=n_layers, dropout=dropout, USE_CUDA=USE_CUDA)\n", 332 | "decoder = Decoder_luong('general', hidden_size_graph, len(output_lang.vocab.itos), 300, n_layers=2 * n_layers, dropout=dropout, USE_CUDA=USE_CUDA)\n", 333 | "gcn1 = Sintactic_GCN(hidden_size_rnn, hidden_size_graph, num_labels=len(DEP_LABELS))\n", 334 | "\n", 335 | "# Initialize optimizers and criterion\n", 336 | "encoder_optimizer = optim.Adam(encoder.parameters())\n", 337 | "decoder_optimizer = optim.Adam(decoder.parameters())\n", 338 | "gcn1_optimizer = optim.Adam(gcn1.parameters(), learning_rate_graph)\n", 339 | "\n", 340 | "criterion = nn.NLLLoss()\n", 341 | "\n", 342 | "# Move models to GPU\n", 343 | "if USE_CUDA:\n", 344 | " encoder = encoder.cuda()\n", 345 | " decoder = decoder.cuda()\n", 346 | " gcn1 = gcn1.cuda()\n", 347 | " \n", 348 | "# Keep track of time elapsed and running averages\n", 349 | "start = time.time()\n", 350 | "train_losses = []\n", 351 | "validation_losses = []\n", 352 | "validation_bleu = []\n", 353 | "\n", 354 | "print_loss_total = 0 # Reset every print_every\n", 355 | "plot_loss_total = 0 # Reset every plot_every" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [ 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "0m 7s (- 15m 12s) (10 0.83%) train_loss: 4.7464\n", 368 | "0m 11s (- 11m 9s) (20 1.67%) train_loss: 2.4366\n", 369 | "0m 15s (- 9m 45s) (30 2.50%) train_loss: 2.3272\n", 370 | "0m 18s (- 9m 4s) (40 3.33%) train_loss: 2.1440\n", 371 | "0m 22s (- 8m 37s) (50 4.16%) train_loss: 2.0919\n", 372 | "0m 26s (- 8m 18s) (60 5.00%) train_loss: 2.0173\n", 373 | "0m 29s (- 8m 3s) (70 5.83%) train_loss: 2.0141\n", 374 | "0m 33s (- 7m 51s) (80 6.66%) train_loss: 1.9101\n", 375 | "0m 37s (- 7m 42s) (90 7.49%) train_loss: 1.8445\n", 376 | "0m 41s (- 7m 35s) (100 8.33%) train_loss: 1.8303\n", 377 | "0m 45s (- 7m 27s) (110 9.16%) train_loss: 1.7503\n", 378 | "0m 48s (- 7m 20s) (120 9.99%) train_loss: 1.7693\n", 379 | "0m 52s (- 7m 14s) (130 10.82%) train_loss: 1.7140\n", 380 | "0m 56s (- 7m 7s) (140 11.66%) train_loss: 1.7159\n", 381 | "1m 0s (- 7m 1s) (150 12.49%) train_loss: 1.6555\n", 382 | "1m 3s (- 6m 56s) (160 13.32%) train_loss: 1.6903\n", 383 | "1m 7s (- 6m 50s) (170 14.15%) train_loss: 1.6072\n", 384 | "1m 11s (- 6m 44s) (180 14.99%) train_loss: 1.5495\n", 385 | "1m 15s (- 6m 39s) (190 15.82%) train_loss: 1.6719\n", 386 | "1m 18s (- 6m 34s) (200 16.65%) train_loss: 1.6109\n", 387 | "1m 22s (- 6m 28s) (210 17.49%) train_loss: 1.5898\n", 388 | "1m 26s (- 6m 24s) (220 18.32%) train_loss: 1.5067\n", 389 | "1m 29s (- 6m 19s) (230 19.15%) train_loss: 1.4599\n", 390 | "1m 33s (- 6m 15s) (240 19.98%) train_loss: 1.4415\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "for epoch in range(1, n_epochs): \n", 396 | " # Shuffle data\n", 397 | " id_aux = np.random.permutation(np.arange(len(pairs_train)))\n", 398 | " pairs_train = pairs_train[id_aux]\n", 399 | " arr_dep_train = arr_dep_train[id_aux]\n", 400 | " \n", 401 | " # Get the batches for this epoch\n", 402 | " input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, return_dep_tree=True, arr_dep=arr_dep_train, max_degree=6, USE_CUDA=USE_CUDA)\n", 403 | " print_loss_total = 0\n", 404 | " for batch_ix, (input_batch, target_var) in 
enumerate(zip(input_batches, target_batches)):\n", 405 | " \n", 406 | " encoder.train()\n", 407 | " decoder.train()\n", 408 | " gcn1.train()\n", 409 | " \n", 410 | " [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch\n", 411 | " # Run the train function\n", 412 | " loss = train_luong(input_var, target_var, input_var.size(1), \n", 413 | " True, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 414 | " \n", 415 | " torch.cuda.empty_cache()\n", 416 | "\n", 417 | " # Keep track of loss\n", 418 | " print_loss_total += loss\n", 419 | " plot_loss_total += loss\n", 420 | "\n", 421 | " if batch_ix == 0: continue\n", 422 | "\n", 423 | " if batch_ix % print_every == 0:\n", 424 | " print_loss_avg = print_loss_total / print_every\n", 425 | " print_loss_total = 0\n", 426 | " print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)\n", 427 | " train_losses.append(loss)\n", 428 | "\n", 429 | " print(f'{time_since(start, batch_ix / len(input_batches))} ({batch_ix} {batch_ix / len(input_batches) * 100:.2f}%) train_loss: {print_loss_avg:.4f}')\n", 430 | " \n", 431 | " input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_valid, return_dep_tree=True, arr_dep=arr_dep_valid, max_degree=6, USE_CUDA=USE_CUDA)\n", 432 | " print_loss_total = 0\n", 433 | " for input_batch, target_var in zip(input_batches, target_batches):\n", 434 | " \n", 435 | " encoder.eval()\n", 436 | " decoder.eval()\n", 437 | " gcn1.eval()\n", 438 | " \n", 439 | " [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch\n", 440 | " # Run the train function in eval mode (no parameter updates)\n", 441 | " loss = train_luong(input_var, target_var, input_var.size(1), \n", 442 | " False, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 443 | " \n", 444 | " print_loss_total += loss\n", 445 | " val_loss = print_loss_total / len(input_batches)\n", 446 | " validation_losses.append(val_loss)\n", 447 | " # Evaluating Bleu\n", 448 | " evaluator = Evaluator(encoder, decoder, gcn1, None, input_lang, output_lang, MAX_LENGTH, True)\n", 449 | " candidates, references = evaluator.get_candidates_and_references(pairs_test, arr_dep_test, k_beams=1)\n", 450 | " bleu = BLEU(candidates, [references])\n", 451 | " if bleu[0] > best_bleu:\n", 452 | " best_bleu = bleu[0]\n", 453 | " torch.save(encoder.state_dict(), 'encoder_graph.pkl')\n", 454 | " torch.save(decoder.state_dict(), 'decoder_graph.pkl')\n", 455 | " torch.save(gcn1.state_dict(), 'gcn_graph.pkl')\n", 456 | " validation_bleu.append(bleu)\n", 457 | " print(f'val_loss: {val_loss:.4f} - bleu: {bleu}', end=' ')\n", 458 | "\n", 459 | " # Prevent GPU memory overflow\n", 460 | " del evaluator" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [] 469 | } 470 | ], 471 | "metadata": { 472 | "kernelspec": { 473 | "display_name": "Python 3", 474 | "language": "python", 475 | "name": "python3" 476 | }, 477 | "language_info": { 478 | "codemirror_mode": { 479 | "name": "ipython", 480 | "version": 3 481 | }, 482 | "file_extension": ".py", 483 | "mimetype": "text/x-python", 484 | "name": "python", 485 | "nbconvert_exporter": "python", 486 | "pygments_lexer": "ipython3", 487 | "version": "3.6.4" 488 | }, 489 | "toc": { 490 | "nav_menu": {}, 491 | "number_sections": true, 492 | "sideBar": true, 493 |
"skip_h1_title": false, 494 | "toc_cell": false, 495 | "toc_position": {}, 496 | "toc_section_display": "block", 497 | "toc_window_display": false 498 | } 499 | }, 500 | "nbformat": 4, 501 | "nbformat_minor": 2 502 | } 503 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Graph-convolutional 2 | 3 | This is an application in a small corpus of the graph convolutional network proposed in this paper https://arxiv.org/abs/1703.04826 4 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | import string 3 | import re 4 | import random 5 | import time 6 | import math 7 | import os 8 | import sys 9 | import pandas as pd 10 | import numpy as np 11 | 12 | import nltk 13 | 14 | import torch 15 | import torch.nn as nn 16 | from torch.nn import functional 17 | from torch.autograd import Variable 18 | from torch import optim 19 | import torch.nn.functional as F 20 | 21 | from stanfordcorenlp import StanfordCoreNLP 22 | from nltk.parse.stanford import StanfordParser 23 | from nltk.tag import StanfordNERTagger 24 | from nltk.tokenize import word_tokenize 25 | from nltk.corpus import wordnet 26 | 27 | import enchant 28 | 29 | import torchtext 30 | from torchtext import data 31 | from torchtext import datasets 32 | 33 | # label of dependencies https://nlp.stanford.edu/pubs/USD_LREC14_paper_camera_ready.pdf 34 | 35 | DEP_LABELS = ['ROOT', 'ACL','ACVLCL', 'ADVMOD', 'AMOD', 'APPOS', 'AUX', 'CASE', 'CC', 'CCOMP', 36 | 'CLF', 'COMPOUND', 'CONJ', 'COP', 'CSUBJ', 'DEP', 'DET', 37 | 'DISCOURSE', 'DISLOCATED', 'EXPL', 'FIXED', 'FLAT', 'GOESWITH', 38 | 'IOBJ', 'LIST', 'MARK', 'NMOD', 'NSUBJ', 'NUMMOD', 39 | 'OBJ', 'OBL', 'ORPHAN', 'PARATAXIS', 'PUNXT', 'REPARANDUM', 'VOCATIVE', 40 | 'XCOMP'] 41 | 42 | _DEP_LABELS_DICT = {label:ix for ix, label in enumerate(DEP_LABELS)} 43 | 44 | def find_type(type_dep): 45 | if type_dep=='NSUBJ' or type_dep=='OBJ' or type_dep=='IOBJ' or type_dep=='CSUBJ' or type_dep=='CCOMP' or type_dep == 'XCOMP': 46 | return 0 47 | elif type_dep=='OBL' or type_dep=='VOCATIVE' or type_dep=='DISLOCATED' or type_dep=='ADVCL' or type_dep=='ADVMOD' or type_dep=='DISCOURSE' or type_dep=='AUX' or type_dep=='COP' or type_dep=='MARK': 48 | return 1 49 | elif type_dep=='NMOD' or type_dep=='APPOS' or type_dep=='NUMMOD' or type_dep=='ACL' or type_dep=='AMOD' or type_dep=='DET' or type_dep=='CLF' or type_dep=='CASE': 50 | return 2 51 | else: 52 | return 3 53 | 54 | def get_adj(deps, batch_size, seq_len, max_degree): 55 | 56 | adj_arc_in = np.zeros((batch_size * seq_len, 2), dtype='int32') 57 | adj_lab_in = np.zeros((batch_size * seq_len, 1), dtype='int32') 58 | adj_arc_out = np.zeros((batch_size * seq_len * max_degree, 2), dtype='int32') 59 | adj_lab_out = np.zeros((batch_size * seq_len * max_degree, 1), dtype='int32') 60 | 61 | 62 | mask_in = np.zeros((batch_size * seq_len), dtype='float32') 63 | mask_out = np.zeros((batch_size * seq_len * max_degree), dtype='float32') 64 | 65 | mask_loop = np.ones((batch_size * seq_len, 1), dtype='float32') 66 | 67 | tmp_in = {} 68 | tmp_out = {} 69 | 70 | for d, de in enumerate(deps): 71 | for a, arc in enumerate(de): 72 | if arc[0] != 'ROOT' and arc[0].upper() in DEP_LABELS: 73 | arc_1 = int(arc[2])-1 74 | arc_2 = int(arc[1])-1 75 | 76 | if a in tmp_in: 77 | tmp_in[a] += 1 78 | else: 79 | tmp_in[a] = 
0 80 | 81 | if arc_2 in tmp_out: 82 | tmp_out[arc_2] += 1 83 | else: 84 | tmp_out[arc_2] = 0 85 | 86 | idx_in = (d * seq_len) + a + tmp_in[a] 87 | idx_out = (d * seq_len * max_degree) + arc_2 * max_degree + tmp_out[arc_2] 88 | 89 | adj_arc_in[idx_in] = np.array([d, arc_2]) # incoming arcs 90 | adj_lab_in[idx_in] = np.array([find_type(arc[0].upper())]) # incoming arcs 91 | 92 | mask_in[idx_in] = 1. 93 | 94 | if tmp_out[arc_2] < max_degree: 95 | adj_arc_out[idx_out] = np.array([d, arc_1]) # outgoing arcs 96 | adj_lab_out[idx_out] = np.array([find_type(arc[0].upper())]) # outgoing arcs 97 | mask_out[idx_out] = 1. 98 | 99 | tmp_in = {} 100 | tmp_out = {} 101 | 102 | adj_arc_in = Variable(torch.LongTensor(np.transpose(adj_arc_in))) 103 | adj_arc_out = Variable(torch.LongTensor(np.transpose(adj_arc_out))) 104 | 105 | adj_lab_in = Variable(torch.LongTensor(np.transpose(adj_lab_in))) 106 | adj_lab_out = Variable(torch.LongTensor(np.transpose(adj_lab_out))) 107 | 108 | mask_in = Variable(torch.FloatTensor(mask_in.reshape((batch_size * seq_len, 1)))) 109 | mask_out = Variable(torch.FloatTensor(mask_out.reshape((batch_size * seq_len, max_degree)))) 110 | mask_loop = Variable(torch.FloatTensor(mask_loop)) 111 | 112 | return adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop 113 | 114 | def pad_seq(lang, seq, max_length): 115 | seq += [lang.vocab.stoi['<pad>'] for i in range(max_length - len(seq))] 116 | return seq 117 | 118 | def generate_batches(input_lang, output_lang, batch_size, pairs, return_dep_tree=False, arr_dep=None, max_degree=None, USE_CUDA=False): 119 | input_batches = [] 120 | target_batches = [] 121 | 122 | for pos in range(0, len(pairs), batch_size): 123 | # Skip this offset; it previously caused an out-of-range error (hard-coded workaround) 124 | if pos == 10431: 125 | continue 126 | cant = min(batch_size, len(pairs) - pos) 127 | 128 | input_seqs = pairs[pos:cant+pos, 0]#.tolist() 129 | target_seqs = pairs[pos:cant+pos, 1]#.tolist() 130 | if return_dep_tree: 131 | arr_aux = arr_dep[pos:cant+pos]#.tolist() 132 | 133 | order = sorted(range(len(input_seqs)), key=lambda i: len(input_seqs[i]), reverse=True) 134 | input_seqs = [input_seqs[i] for i in order]; target_seqs = [target_seqs[i] for i in order] 135 | if return_dep_tree: arr_aux = [arr_aux[i] for i in order] # keep dependency parses aligned with the sorted inputs 136 | input_lengths = [len(s) for s in input_seqs] 137 | input_padded = [pad_seq(input_lang, s, max(input_lengths)) for s in input_seqs] 138 | target_lengths = [len(s) for s in target_seqs] 139 | target_padded = [pad_seq(output_lang, s, max(target_lengths)) for s in target_seqs] 140 | 141 | input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1) 142 | target_var = Variable(torch.LongTensor(target_padded)).transpose(0, 1) 143 | 144 | if USE_CUDA: 145 | input_var = input_var.cuda() 146 | target_var = target_var.cuda() 147 | 148 | if return_dep_tree: 149 | # max_degree is set manually 150 | adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop = get_adj(arr_aux, cant, max(input_lengths), max_degree) 151 | 152 | if USE_CUDA: 153 | adj_arc_in = adj_arc_in.cuda() 154 | adj_arc_out = adj_arc_out.cuda() 155 | adj_lab_in = adj_lab_in.cuda() 156 | adj_lab_out = adj_lab_out.cuda() 157 | 158 | mask_in = mask_in.cuda() 159 | mask_out = mask_out.cuda() 160 | mask_loop = mask_loop.cuda() 161 | else: 162 | adj_arc_in = None 163 | adj_arc_out = None 164 | adj_lab_in = None 165 | adj_lab_out = None 166 | 167 | mask_in = None 168 | mask_out = None 169 | mask_loop = None 170 | 171 | input_batches.append([input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop]) 172 | target_batches.append(target_var) 173 | 174 |
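# Note: each element of input_batches is a list [input_var, adj_arc_in, adj_arc_out,
# adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop]; when return_dep_tree is False
# the seven graph tensors are None, so callers unpack positionally and skip the GCN.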
return input_batches, target_batches 175 | 176 | def indexes_from_sentence(lang, sentence): 177 | return [lang.vocab.stoi[word] for word in sentence.split(' ')] + [lang.vocab.stoi['<eos>']] 178 | 179 | def data_to_index(pairs, input_vec, output_vec): 180 | new_pairs = [] 181 | 182 | for pair in pairs: 183 | new_pairs.append([indexes_from_sentence(input_vec, pair[0]), indexes_from_sentence(output_vec, pair[1])]) 184 | 185 | return np.array(new_pairs) 186 | 187 | def construct_vector(pair, name_lang, construct_vector=True, vector_name='fasttext.en.300d'): 188 | lang = pd.DataFrame(pair, columns=[name_lang]) 189 | 190 | lang.to_csv('corpus/' + name_lang + '.csv', index=False) 191 | 192 | lang = data.Field(sequential=True, lower=True, init_token='<sos>', eos_token='<eos>') 193 | 194 | mt_lang = data.TabularDataset( 195 | path='corpus/' + name_lang + '.csv', format='csv', 196 | fields=[(name_lang, lang)]) 197 | 198 | lang.build_vocab(mt_lang) 199 | 200 | if construct_vector: 201 | lang.vocab.load_vectors(vector_name) 202 | 203 | return lang 204 | 205 | def unicode_to_ascii(s): 206 | return ''.join( 207 | c for c in unicodedata.normalize('NFD', s) 208 | if unicodedata.category(c) != 'Mn' 209 | ) 210 | 211 | def normalize_string(pair): 212 | pair = unicode_to_ascii(pair.lower().strip()) 213 | pair = re.sub(r'([.,;!?])', r' \1', pair) # separate punctuation from words 214 | 215 | 216 | return ' '.join(pair.split()) 217 | 218 | def normalize_pairs(pairs): 219 | for pair in pairs: 220 | pair[0] = normalize_string(pair[0]) 221 | pair[1] = normalize_string(pair[1]) 222 | 223 | def filter_pairs_lang(pairs, min_length, max_length): 224 | filtered_pairs = [] 225 | for pair in pairs: 226 | # Skip pairs containing quote characters; this keeps processing simple 227 | if len(pair[0].split()) >= min_length and len(pair[0].split()) <= max_length \ 228 | and len(pair[1].split()) >= min_length and len(pair[1].split()) <= max_length \ 229 | and "'" not in pair[0] and '"' not in pair[0]: 230 | filtered_pairs.append(pair) 231 | return filtered_pairs 232 | 233 | def read_langs(lang1, lang2, reverse=False): 234 | print("Reading lines...") 235 | 236 | # Read the file and split into lines 237 | filename = f'corpus/{lang1}-{lang2}.txt' 238 | lines = open(filename).read().strip().split('\n') 239 | 240 | # Split every line into pairs and normalize 241 | pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines] 242 | 243 | # Reverse pairs, make Lang instances 244 | if reverse: 245 | pairs = [list(reversed(p)) for p in pairs] 246 | 247 | return pairs 248 | 249 | def prepare_data(lang1_name, lang2_name, reverse=False, min_length=3, max_length=50): 250 | pairs = read_langs(lang1_name, lang2_name, reverse=reverse) 251 | print("Read %d sentence pairs" % len(pairs)) 252 | 253 | pairs = filter_pairs_lang(pairs, min_length, max_length) 254 | print("Filtered to %d pairs" % len(pairs)) 255 | 256 | print("Creating vocab...") 257 | pairs = np.array(pairs) 258 | vector_1 = construct_vector(pairs[:, 0], lang1_name) 259 | vector_2 = construct_vector(pairs[:, 1], lang2_name) 260 | 261 | print('Indexed %d words in input language, %d words in output' % (len(vector_1.vocab.itos), len(vector_2.vocab.itos))) 262 | return vector_1, vector_2, pairs -------------------------------------------------------------------------------- /model/.ipynb_checkpoints/decoder-checkpoint.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 |
import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | ######################### ATTENTION ########################### 10 | 11 | class Global_attn(nn.Module): 12 | def __init__(self, method, hidden_size, USE_CUDA=False): 13 | super(Global_attn, self).__init__() 14 | 15 | self.method = method 16 | self.hidden_size = hidden_size 17 | self.USE_CUDA = USE_CUDA 18 | 19 | if self.method == 'general': 20 | self.attn = nn.Linear(self.hidden_size, hidden_size) 21 | 22 | elif self.method == 'concat': 23 | self.attn = nn.Linear(self.hidden_size * 2, hidden_size) 24 | self.other = nn.Parameter(torch.FloatTensor(1, hidden_size)) 25 | 26 | def forward(self, hidden, encoder_outputs): 27 | ''' 28 | hidden: (BS, hidden_size) 29 | encoder_outputs: (seq_len, BS, encoder_hidden_size) 30 | ''' 31 | # encoder_outputs: (seq_len, batch_size, encoder_hidden_size) 32 | seq_len = len(encoder_outputs) 33 | batch_size = encoder_outputs.shape[1] 34 | 35 | # Calculate attention energies for each encoder output 36 | # attn_energies: (seq_len, batch_size) 37 | # hidden: (batch_size, hidden_size) 38 | attn_energies = Variable(torch.zeros(seq_len, batch_size)) 39 | if self.USE_CUDA: attn_energies = attn_energies.cuda() 40 | for i in range(seq_len): 41 | attn_energies[i] = self.score(hidden, encoder_outputs[i]) 42 | 43 | # Normalize energies [0-1] and resize to (batch_size, x=1, seq_len) 44 | return F.softmax(attn_energies, 0).transpose(0, 1).unsqueeze(1) 45 | 46 | def score(self, hidden, encoder_output): 47 | # hidden: (batch_size, hidden_size) 48 | # encoder_output: (batch_size, encoder_hidden_size) 49 | 50 | # hidden sizes must match, batch_size = 1 only 51 | if self.method == 'dot': 52 | # batch element-wise dot product 53 | energy = torch.bmm(hidden.unsqueeze(1), 54 | encoder_output.unsqueeze(2)).squeeze().squeeze() 55 | # energy = hidden.dot(encoder_output) 56 | return energy 57 | 58 | elif self.method == 'general': 59 | energy = self.attn(encoder_output) 60 | # batch element-wise dot product 61 | energy = torch.bmm(hidden.unsqueeze(1), 62 | energy.unsqueeze(2)).squeeze().squeeze() 63 | # energy = hidden.dot(energy) 64 | return energy 65 | 66 | # TODO: test / modify method to support batch size > 1 67 | elif self.method == 'concat': 68 | energy = self.attn(torch.cat((hidden, encoder_output), 1)) 69 | energy = self.other.dot(energy) 70 | return energy 71 | 72 | ######################### DECODER LUONG ########################### 73 | 74 | class Decoder_luong(nn.Module): 75 | def __init__(self, attn_method, hidden_size, output_size, emb_size, n_layers=1, dropout=0.1, lang=None, USE_CUDA=False): 76 | 77 | super(Decoder_luong, self).__init__() 78 | 79 | self.attn_method = attn_method 80 | self.hidden_size = hidden_size 81 | self.output_size = output_size 82 | self.n_layers = n_layers 83 | self.dropout_p = dropout 84 | self.USE_CUDA = USE_CUDA 85 | self.lang = lang 86 | 87 | # (size of dictionary of embeddings, size of embedding vector) 88 | self.embedding = nn.Embedding(output_size, emb_size) 89 | # (input features: hidden_size + emb_size, hidden state features, number of layers) 90 | self.gru = nn.GRU(emb_size + hidden_size, hidden_size, n_layers, dropout=dropout) 91 | self.attn = Global_attn(attn_method, hidden_size, USE_CUDA) 92 | self.out = nn.Linear(hidden_size * 2, output_size) 93 | 94 | self.init_weights() 95 | 96 | def forward(self, word_input, last_context, last_hidden, encoder_outputs): 97 | ''' 98 | word_input: (seq_len, BS) 99 | last_context: (BS, encoder_hidden_size) 100 | last_hidden:
(n_layers, BS, hidden_size) 101 | last_cell: (n_layers, BS, hidden_size) 102 | encoder_outputs: (seq_len, BS, encoder_hidden) 103 | < output: (BS, output_size) 104 | < attn_weights: (BS, 1, seq_len) 105 | ''' 106 | # This is run one step at a time 107 | 108 | # Get the embedding of the current input word (last output word) 109 | # word_input: (seq_len=1, batch_size), values in [0..output_size) 110 | word_embedded = self.embedding(word_input) #.view(1, 1, -1) 111 | # word_embedded: (seq_len=1, batch_size, embedding_size) 112 | 113 | # Combine embedded input word and last context, run through RNN 114 | # last_context: (batch_size, encoder_hidden_size) 115 | rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2) 116 | # rnn_input: (seq_len=1, batch_size, embedding_size + encoder_hidden_size) 117 | # last_hidden: (num_layers, batch_size, hidden_size) 118 | rnn_output, hidden = self.gru(rnn_input, last_hidden) 119 | # rnn_output: (seq_len=1, batch_size, hidden_size) 120 | # hidden: same 121 | 122 | # Calculate attention and apply to encoder outputs 123 | # encoder_outputs: (seq_len, batch_size, encoder_hidden_size) 124 | attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs) 125 | 126 | # Check softmax: 127 | # print('attn_weights sum: ', torch.sum(attn_weights.squeeze(), 1)) 128 | 129 | # attn_weights: (batch_size, x=1, seq_len) 130 | context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) 131 | # context: (batch_size, x=1, encoder_hidden_size) 132 | 133 | # Final output layer using hidden state and context vector 134 | rnn_output = rnn_output.squeeze(0) 135 | # rnn_output: (batch_size, hidden_size) 136 | context = context.squeeze(1) 137 | # context: (batch_size, encoder_hidden_size) 138 | output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)), 1) 139 | # output: (batch_size, output_size) 140 | # Check softmax (not log_softmax): 141 | # print('output sum: ', torch.sum(output.squeeze(), 1)) 142 | 143 | # Also return attention weights for visualization 144 | return output, context, hidden, attn_weights 145 | 146 | def init_weights(self): 147 | if self.lang: 148 | self.embedding.weight.data.copy_(self.lang.vocab.vectors) 149 | self.embedding.weight.requires_grad = False 150 | 151 | for name, param in self.gru.named_parameters(): 152 | if 'bias' in name: 153 | nn.init.constant_(param, 0.0) 154 | elif 'weight' in name: 155 | nn.init.xavier_normal_(param) 156 | self.out.bias.data.fill_(0) 157 | self.out.weight.data.uniform_(-0.1, 0.1) -------------------------------------------------------------------------------- /model/.ipynb_checkpoints/encoder-checkpoint.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | class Encoder(nn.Module): 10 | def __init__(self, input_size, hidden_size, emb_size, n_layers=2, dropout=0.1, lang=None, USE_CUDA=False): 11 | super(Encoder, self).__init__() 12 | 13 | self.input_size = input_size 14 | self.hidden_size = hidden_size 15 | self.n_layers = n_layers 16 | self.dropout = dropout 17 | self.USE_CUDA = USE_CUDA 18 | self.lang = lang 19 | 20 | self.embedding = nn.Embedding(input_size, emb_size) 21 | self.gru = nn.GRU(emb_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True) 22 | self.init_weights() 23 | 24 | def forward(self, input_seqs, hidden = None): 25 | embedded = self.embedding(input_seqs) 
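# embedded: (seq_len, batch_size, emb_size); input_seqs is expected to arrive as
# (seq_len, batch_size), matching the transpose applied in generate_batches.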
26 | 27 | self.gru.flatten_parameters() 28 | outputs, hidden = self.gru(embedded, hidden) 29 | 30 | outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs 31 | return outputs, hidden 32 | 33 | def init_hidden(self, batch_size): 34 | hidden = Variable(torch.zeros(2 * self.n_layers, batch_size, self.hidden_size)) 35 | if self.USE_CUDA: hidden = hidden.cuda() 36 | return hidden 37 | 38 | def init_weights(self): 39 | if self.lang: 40 | self.embedding.weight.data.copy_(self.lang.vocab.vectors) 41 | self.embedding.weight.requires_grad = False 42 | 43 | for name, param in self.gru.named_parameters(): 44 | if 'bias' in name: 45 | nn.init.constant_(param, 0.0) 46 | elif 'weight' in name: 47 | nn.init.xavier_normal_(param) -------------------------------------------------------------------------------- /model/.ipynb_checkpoints/graph-checkpoint.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | class Sintactic_GCN(nn.Module): 10 | def __init__(self, num_inputs, num_units, 11 | num_labels, 12 | dropout = 0.2, 13 | in_arcs = True, 14 | out_arcs = True, 15 | batch_first = False, 16 | USE_CUDA=False): 17 | super(Sintactic_GCN, self).__init__() 18 | 19 | self.in_arcs = in_arcs 20 | self.out_arcs = out_arcs 21 | 22 | self.retain = 1. - dropout 23 | self.num_inputs = num_inputs 24 | self.num_units = num_units 25 | self.num_labels = num_labels 26 | self.batch_first = batch_first 27 | 28 | self.relu = nn.LeakyReLU() 29 | self.sigmoid = nn.Sigmoid() 30 | self.dropout_rate = dropout 31 | 32 | if in_arcs: 33 | self.V_in = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 34 | nn.init.xavier_normal_(self.V_in) 35 | 36 | self.b_in = Parameter(torch.FloatTensor(num_labels, self.num_units)) 37 | nn.init.constant_(self.b_in, 0) 38 | 39 | self.V_in_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 40 | nn.init.uniform_(self.V_in_gate) 41 | 42 | self.b_in_gate = Parameter(torch.FloatTensor(num_labels, 1)) 43 | nn.init.constant_(self.b_in_gate, 1) 44 | 45 | if out_arcs: 46 | self.V_out = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 47 | nn.init.xavier_normal_(self.V_out) 48 | 49 | self.b_out = Parameter(torch.FloatTensor(num_labels, self.num_units)) 50 | nn.init.constant_(self.b_out, 0) 51 | 52 | self.V_out_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 53 | nn.init.uniform_(self.V_out_gate) 54 | 55 | self.b_out_gate = Parameter(torch.FloatTensor(num_labels, 1)) 56 | nn.init.constant_(self.b_out_gate, 1) 57 | 58 | self.W_self_loop = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 59 | nn.init.xavier_normal_(self.W_self_loop) 60 | 61 | self.W_self_loop_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 62 | nn.init.uniform_(self.W_self_loop_gate) 63 | 64 | def forward(self, encoder_outputs, 65 | arc_tensor_in, arc_tensor_out, 66 | label_tensor_in, label_tensor_out, 67 | mask_in, mask_out, # batch* t, degree 68 | mask_loop): 69 | 70 | if(not self.batch_first): 71 | encoder_outputs = encoder_outputs.permute(1, 0, 2).contiguous() 72 | 73 | batch_size, seq_len, _ = encoder_outputs.shape 74 | input_ = encoder_outputs.view((batch_size * seq_len , self.num_inputs)) # [b* t, h] 75 | 76 | max_degree = 1 77 | if self.in_arcs: 78 | input_in = torch.mm(input_, self.V_in) # [b* t, h] * [h,h] = [b*t, h]
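# The index_select below gathers, for each arc slot, the transformed vector of the
# arc's source token: arc_tensor_in[0] holds batch indices and arc_tensor_in[1] holds
# token positions, so arc_tensor_in[0] * seq_len + arc_tensor_in[1] is a row index
# into the flattened [b*t, h] matrix.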
--------------------------------------------------------------------------------
/model/decoder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | import torch
4 | import torch.nn as nn
5 | from torch.autograd import Variable
6 | import torch.nn.functional as F
7 | from torch.nn import Parameter
8 | 
9 | ######################### ATTENTION ###########################
10 | 
11 | class Global_attn(nn.Module):
12 |     def __init__(self, method, hidden_size, USE_CUDA=False):
13 |         super(Global_attn, self).__init__()
14 | 
15 |         self.method = method
16 |         self.hidden_size = hidden_size
17 |         self.USE_CUDA = USE_CUDA
18 | 
19 |         if self.method == 'general':
20 |             self.attn = nn.Linear(self.hidden_size, hidden_size)
21 | 
22 |         elif self.method == 'concat':
23 |             self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
24 |             self.other = nn.Parameter(torch.FloatTensor(1, hidden_size))
25 | 
26 |     def forward(self, hidden, encoder_outputs):
27 |         '''
28 |         hidden: (BS, hidden_size)
29 |         encoder_outputs: (seq_len, BS, encoder_hidden_size)
30 |         '''
31 |         # encoder_outputs: (seq_len, batch_size, encoder_hidden_size)
32 |         seq_len = len(encoder_outputs)
33 |         batch_size = encoder_outputs.shape[1]
34 | 
35 |         # Calculate attention energies for each encoder output
36 |         # attn_energies: (seq_len, batch_size)
37 |         # hidden: (batch_size, hidden_size)
38 |         attn_energies = Variable(torch.zeros(seq_len, batch_size))
39 |         if self.USE_CUDA: attn_energies = attn_energies.cuda()
40 |         for i in range(seq_len):
41 |             attn_energies[i] = self.score(hidden, encoder_outputs[i])
42 | 
43 |         # Normalize energies to [0, 1] and resize to (batch_size, x=1, seq_len)
44 |         return F.softmax(attn_energies, 0).transpose(0, 1).unsqueeze(1)
45 | 
46 |     def score(self, hidden, encoder_output):
47 |         # hidden: (batch_size, hidden_size)
48 |         # encoder_output: (batch_size, encoder_hidden_size)
49 | 
50 |         # decoder hidden size and encoder output size must match; 'concat' assumes batch_size = 1
51 |         if self.method == 'dot':
52 |             # batch element-wise dot product
53 |             energy = torch.bmm(hidden.unsqueeze(1),
54 |                                encoder_output.unsqueeze(2)).squeeze().squeeze()
55 |             # energy = hidden.dot(encoder_output)
56 |             return energy
57 | 
58 |         elif self.method == 'general':
59 |             energy = self.attn(encoder_output)
60 |             # batch element-wise dot product against the projected encoder output
61 |             energy = torch.bmm(hidden.unsqueeze(1),
62 |                                energy.unsqueeze(2)).squeeze().squeeze()
63 |             # energy = hidden.dot(energy)
64 |             return energy
65 | 
66 |         # TODO: test / modify method to support batch size > 1
67 |         elif self.method == 'concat':
68 |             energy = self.attn(torch.cat((hidden, encoder_output), 1))
69 |             energy = self.other.dot(energy)
70 |             return energy
71 | 
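
Global_attn scores the current decoder state against every encoder position and softmax-normalizes over the time axis, so each batch row gets a proper distribution over source positions. A quick shape check with assumed toy sizes, using the parameter-free 'dot' method:

import torch

attn = Global_attn('dot', hidden_size=8)
hidden = torch.randn(4, 8)                # (batch_size, hidden_size)
encoder_outputs = torch.randn(10, 4, 8)   # (seq_len, batch_size, hidden_size)
weights = attn(hidden, encoder_outputs)   # (batch_size, 1, seq_len)
print(weights.shape, weights.sum(dim=2))  # each row sums to 1
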
72 | ######################### DECODER LUONG ###########################
73 | 
74 | class Decoder_luong(nn.Module):
75 |     def __init__(self, attn_method, hidden_size, output_size, emb_size, n_layers=1, dropout=0.1, lang=None, USE_CUDA=False):
76 | 
77 |         super(Decoder_luong, self).__init__()
78 | 
79 |         self.attn_method = attn_method
80 |         self.hidden_size = hidden_size
81 |         self.output_size = output_size
82 |         self.n_layers = n_layers
83 |         self.dropout_p = dropout
84 |         self.USE_CUDA = USE_CUDA
85 |         self.lang = lang
86 | 
87 |         # (size of dictionary of embeddings, size of embedding vector)
88 |         self.embedding = nn.Embedding(output_size, emb_size)
89 |         # (input features: hidden_size + emb_size, hidden state features, number of layers)
90 |         self.gru = nn.GRU(emb_size + hidden_size, hidden_size, n_layers, dropout=dropout)
91 |         self.attn = Global_attn(attn_method, hidden_size, USE_CUDA)
92 |         self.out = nn.Linear(hidden_size * 2, output_size)
93 | 
94 |         self.init_weights()
95 | 
96 |     def forward(self, word_input, last_context, last_hidden, encoder_outputs):
97 |         '''
98 |         word_input: (seq_len, BS)
99 |         last_context: (BS, encoder_hidden_size)
100 |         last_hidden: (n_layers, BS, hidden_size)
101 |         encoder_outputs: (seq_len, BS, encoder_hidden)
102 |         < output: (BS, output_size)
103 |         < context: (BS, encoder_hidden_size)
104 |         < attn_weights: (BS, 1, seq_len)
105 |         '''
106 |         # This is run one step at a time
107 | 
108 |         # Get the embedding of the current input word (last output word)
109 |         # word_input: (seq_len=1, batch_size), values in [0..output_size)
110 |         word_embedded = self.embedding(word_input) #.view(1, 1, -1)
111 |         # word_embedded: (seq_len=1, batch_size, embedding_size)
112 | 
113 |         # Combine embedded input word and last context, run through RNN
114 |         # last_context: (batch_size, encoder_hidden_size)
115 |         rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
116 |         # rnn_input: (seq_len=1, batch_size, embedding_size + encoder_hidden_size)
117 |         # last_hidden: (num_layers, batch_size, hidden_size)
118 |         rnn_output, hidden = self.gru(rnn_input, last_hidden)
119 |         # rnn_output: (seq_len=1, batch_size, hidden_size)
120 |         # hidden: same
121 | 
122 |         # Calculate attention and apply to encoder outputs
123 |         # encoder_outputs: (seq_len, batch_size, encoder_hidden_size)
124 |         attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)
125 | 
126 |         # Check softmax:
127 |         # print('attn_weights sum: ', torch.sum(attn_weights.squeeze(), 1))
128 | 
129 |         # attn_weights: (batch_size, x=1, seq_len)
130 |         context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
131 |         # context: (batch_size, x=1, encoder_hidden_size)
132 | 
133 |         # Final output layer using hidden state and context vector
134 |         rnn_output = rnn_output.squeeze(0)
135 |         # rnn_output: (batch_size, hidden_size)
136 |         context = context.squeeze(1)
137 |         # context: (batch_size, encoder_hidden_size)
138 |         output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)), 1)
139 |         # output: (batch_size, output_size)
140 |         # Check softmax (not log_softmax):
141 |         # print('output sum: ', torch.sum(output.squeeze(), 1))
142 | 
143 |         # Also return attention weights for visualization
144 |         return output, context, hidden, attn_weights
145 | 
146 |     def init_weights(self):
147 |         if self.lang:
148 |             self.embedding.weight.data.copy_(self.lang.vocab.vectors)
149 |             self.embedding.weight.requires_grad = False
150 | 
151 |         for name, param in self.gru.named_parameters():
152 |             if 'bias' in name:
153 |                 nn.init.constant_(param, 0.0)
154 |             elif 'weight' in name:
155 |                 nn.init.xavier_normal_(param)
156 |         self.out.bias.data.fill_(0)
157 |         self.out.weight.data.uniform_(-0.1, 0.1)
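
Decoder_luong runs one target step at a time: embed the previous word, concatenate the previous context vector, advance the GRU, attend over the encoder states, and project [hidden; context] to vocabulary log-probabilities. A single-step sketch with assumed toy sizes (repo root on the import path):

import torch
from model.decoder import Decoder_luong

decoder = Decoder_luong('dot', hidden_size=8, output_size=500, emb_size=16)
word_input = torch.LongTensor([[2]])     # (seq_len=1, batch_size=1): previous token id
last_context = torch.zeros(1, 8)         # (batch_size, hidden_size)
last_hidden = torch.zeros(1, 1, 8)       # (n_layers, batch_size, hidden_size)
encoder_outputs = torch.randn(10, 1, 8)  # (seq_len, batch_size, hidden_size)

output, context, hidden, attn_weights = decoder(
    word_input, last_context, last_hidden, encoder_outputs)
# output: (1, 500) log-probabilities; feed output.topk(1)[1] back in as the next word_input
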
--------------------------------------------------------------------------------
/model/encoder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | import torch
4 | import torch.nn as nn
5 | from torch.autograd import Variable
6 | import torch.nn.functional as F
7 | from torch.nn import Parameter
8 | 
9 | class Encoder(nn.Module):
10 |     def __init__(self, input_size, hidden_size, emb_size, n_layers=2, dropout=0.1, lang=None, USE_CUDA=False):
11 |         super(Encoder, self).__init__()
12 | 
13 |         self.input_size = input_size
14 |         self.hidden_size = hidden_size
15 |         self.n_layers = n_layers
16 |         self.dropout = dropout
17 |         self.USE_CUDA = USE_CUDA
18 |         self.lang = lang
19 | 
20 |         self.embedding = nn.Embedding(input_size, emb_size)
21 |         self.gru = nn.GRU(emb_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True)
22 |         self.init_weights()
23 | 
24 |     def forward(self, input_seqs, hidden = None):
25 |         embedded = self.embedding(input_seqs)
26 | 
27 |         self.gru.flatten_parameters()
28 |         outputs, hidden = self.gru(embedded, hidden)
29 | 
30 |         outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]  # Sum bidirectional outputs
31 |         return outputs, hidden
32 | 
33 |     def init_hidden(self, batch_size):
34 |         hidden = Variable(torch.zeros(2 * self.n_layers, batch_size, self.hidden_size))
35 |         if self.USE_CUDA: hidden = hidden.cuda()
36 |         return hidden
37 | 
38 |     def init_weights(self):
39 |         if self.lang:
40 |             self.embedding.weight.data.copy_(self.lang.vocab.vectors)
41 |             self.embedding.weight.requires_grad = False
42 | 
43 |         for name, param in self.gru.named_parameters():
44 |             if 'bias' in name:
45 |                 nn.init.constant_(param, 0.0)
46 |             elif 'weight' in name:
47 |                 nn.init.xavier_normal_(param)
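
Because the GRU is bidirectional, its raw output is 2 * hidden_size wide; the forward pass folds the two directions back together by summing the halves, so downstream attention and the GCN see plain hidden_size features. A quick shape check with assumed toy sizes:

import torch
from model.encoder import Encoder

enc = Encoder(input_size=1000, hidden_size=8, emb_size=16)
seqs = torch.randint(0, 1000, (10, 4))  # (seq_len, batch_size) of token ids
hidden0 = enc.init_hidden(4)            # (2 * n_layers, batch_size, hidden_size)
outputs, hidden = enc(seqs, hidden0)
print(outputs.shape)                    # (10, 4, 8), not (10, 4, 16): the directions were summed
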
--------------------------------------------------------------------------------
/model/graph.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | import torch
4 | import torch.nn as nn
5 | from torch.autograd import Variable
6 | import torch.nn.functional as F
7 | from torch.nn import Parameter
8 | 
9 | class Sintactic_GCN(nn.Module):
10 |     def __init__(self, num_inputs, num_units,
11 |                  num_labels,
12 |                  dropout = 0.2,
13 |                  in_arcs = True,
14 |                  out_arcs = True,
15 |                  batch_first = False,
16 |                  USE_CUDA=False):
17 |         super(Sintactic_GCN, self).__init__()
18 | 
19 |         self.in_arcs = in_arcs
20 |         self.out_arcs = out_arcs
21 | 
22 |         self.retain = 1. - dropout
23 |         self.num_inputs = num_inputs
24 |         self.num_units = num_units
25 |         self.num_labels = num_labels
26 |         self.batch_first = batch_first
27 | 
28 |         self.relu = nn.LeakyReLU()
29 |         self.sigmoid = nn.Sigmoid()
30 |         self.dropout_rate = dropout
31 | 
32 |         if in_arcs:
33 |             self.V_in = Parameter(torch.FloatTensor(self.num_inputs, self.num_units))
34 |             nn.init.xavier_normal_(self.V_in)
35 | 
36 |             self.b_in = Parameter(torch.FloatTensor(num_labels, self.num_units))
37 |             nn.init.constant_(self.b_in, 0)
38 | 
39 |             self.V_in_gate = Parameter(torch.FloatTensor(self.num_inputs, 1))
40 |             nn.init.uniform_(self.V_in_gate)
41 | 
42 |             self.b_in_gate = Parameter(torch.FloatTensor(num_labels, 1))
43 |             nn.init.constant_(self.b_in_gate, 1)
44 | 
45 |         if out_arcs:
46 |             self.V_out = Parameter(torch.FloatTensor(self.num_inputs, self.num_units))
47 |             nn.init.xavier_normal_(self.V_out)
48 | 
49 |             self.b_out = Parameter(torch.FloatTensor(num_labels, self.num_units))
50 |             nn.init.constant_(self.b_out, 0)
51 | 
52 |             self.V_out_gate = Parameter(torch.FloatTensor(self.num_inputs, 1))
53 |             nn.init.uniform_(self.V_out_gate)
54 | 
55 |             self.b_out_gate = Parameter(torch.FloatTensor(num_labels, 1))
56 |             nn.init.constant_(self.b_out_gate, 1)
57 | 
58 |         self.W_self_loop = Parameter(torch.FloatTensor(self.num_inputs, self.num_units))
59 |         nn.init.xavier_normal_(self.W_self_loop)
60 | 
61 |         self.W_self_loop_gate = Parameter(torch.FloatTensor(self.num_inputs, 1))
62 |         nn.init.uniform_(self.W_self_loop_gate)
63 | 
64 |     def forward(self, encoder_outputs,
65 |                 arc_tensor_in, arc_tensor_out,
66 |                 label_tensor_in, label_tensor_out,
67 |                 mask_in, mask_out,  # batch* t, degree
68 |                 mask_loop):
69 | 
70 |         if not self.batch_first:
71 |             encoder_outputs = encoder_outputs.permute(1, 0, 2).contiguous()
72 | 
73 |         batch_size, seq_len, _ = encoder_outputs.shape
74 |         input_ = encoder_outputs.view((batch_size * seq_len, self.num_inputs))  # [b* t, h]
75 | 
76 |         max_degree = 1
77 |         if self.in_arcs:
78 |             input_in = torch.mm(input_, self.V_in)  # [b* t, h] * [h, h] = [b* t, h]
79 |             first_in = input_in.index_select(0, arc_tensor_in[0] * seq_len + arc_tensor_in[1])
80 | 
81 |             second_in = self.b_in.index_select(0, label_tensor_in.squeeze(0))  # [b* t* 1, h]
82 |             in_ = (first_in + second_in).view((batch_size, seq_len, 1, self.num_units))
83 | 
84 |             # compute gate weights
85 |             input_in_gate = torch.mm(input_, self.V_in_gate)  # [b* t, h] * [h, 1] = [b* t, 1]
86 |             first_in_gate = input_in_gate.index_select(0, arc_tensor_in[0] * seq_len + arc_tensor_in[1])
87 | 
88 |             second_in_gate = self.b_in_gate.index_select(0, label_tensor_in.squeeze(0))
89 |             in_gate = (first_in_gate + second_in_gate).view((batch_size, seq_len, 1))
90 | 
91 |             max_degree += 1
92 | 
93 |         if self.out_arcs:
94 |             input_out = torch.mm(input_, self.V_out)  # [b* t, h] * [h, h] = [b* t, h]
95 |             first_out = input_out.index_select(0, arc_tensor_out[0] * seq_len + arc_tensor_out[1])
96 | 
97 |             second_out = self.b_out.index_select(0, label_tensor_out.squeeze(0))
98 | 
99 |             degr = int(first_out.shape[0] / batch_size / seq_len)
100 |             max_degree += degr
101 | 
102 |             out_ = (first_out + second_out).view((batch_size, seq_len, degr, self.num_units))
103 | 
104 |             # compute gate weights
105 |             input_out_gate = torch.mm(input_, self.V_out_gate)  # [b* t, h] * [h, 1] = [b* t, 1]
106 |             first_out_gate = input_out_gate.index_select(0, arc_tensor_out[0] * seq_len + arc_tensor_out[1])
107 | 
108 |             second_out_gate = self.b_out_gate.index_select(0, label_tensor_out.squeeze(0))
109 | 
110 |             out_gate = (first_out_gate + second_out_gate).view((batch_size, seq_len, degr))
111 | 
112 |         same_input = torch.mm(encoder_outputs.view(-1, encoder_outputs.size(2)), self.W_self_loop).\
113 |             view(encoder_outputs.size(0), encoder_outputs.size(1), -1)
114 |         same_input = same_input.view(encoder_outputs.size(0), encoder_outputs.size(1), 1, self.W_self_loop.size(1))
115 | 
116 |         same_input_gate = torch.mm(encoder_outputs.view(-1, encoder_outputs.size(2)), self.W_self_loop_gate)\
117 |             .view(encoder_outputs.size(0), encoder_outputs.size(1), -1)
118 | 
119 |         if self.in_arcs and self.out_arcs:
120 |             potentials = torch.cat((in_, out_, same_input), dim=2)  # [b, t, mxdeg, h]
121 |             potentials_gate = torch.cat((in_gate, out_gate, same_input_gate), dim=2)  # [b, t, mxdeg]
122 |             mask_soft = torch.cat((mask_in, mask_out, mask_loop), dim=1)  # [b* t, mxdeg]
123 |         elif self.out_arcs:
124 |             potentials = torch.cat((out_, same_input), dim=2)  # [b, t, mxdeg, h]
125 |             potentials_gate = torch.cat((out_gate, same_input_gate), dim=2)  # [b, t, mxdeg]
126 |             mask_soft = torch.cat((mask_out, mask_loop), dim=1)  # [b* t, mxdeg]
127 |         elif self.in_arcs:
128 |             potentials = torch.cat((in_, same_input), dim=2)  # [b, t, mxdeg, h]
129 |             potentials_gate = torch.cat((in_gate, same_input_gate), dim=2)  # [b, t, mxdeg]
130 |             mask_soft = torch.cat((mask_in, mask_loop), dim=1)  # [b* t, mxdeg]
131 | 
132 |         potentials_ = potentials.permute(3, 0, 1, 2).contiguous()  # [h, b, t, mxdeg]
133 |         potentials_resh = potentials_.view((self.num_units,
134 |                                             batch_size * seq_len,
135 |                                             max_degree))  # [h, b * t, mxdeg]
136 | 
137 |         potentials_r = potentials_gate.view((batch_size * seq_len,
138 |                                              max_degree))  # [b * t, mxdeg]
139 |         # calculate the gate
140 |         probs_det_ = self.sigmoid(potentials_r) * mask_soft  # [b * t, mxdeg]
141 |         potentials_masked = potentials_resh * mask_soft * probs_det_  # [h, b * t, mxdeg]
142 | 
143 | 
144 |         # if self.retain == 1 or deterministic:
145 |         #     pass
146 |         # else:
147 |         #     drop_mask = self._srng.binomial(potentials_resh.shape[1:], p=self.retain, dtype=input.dtype)
148 |         #     potentials_masked /= self.retain
149 |         #     potentials_masked *= drop_mask
150 | 
151 |         potentials_masked_ = potentials_masked.sum(dim=2)  # [h, b * t]
152 |         potentials_masked_ = self.relu(potentials_masked_)
153 | 
154 |         result_ = potentials_masked_.permute(1, 0).contiguous()  # [b * t, h]
155 |         result_ = F.dropout(result_, p=self.dropout_rate, training=self.training)
156 | 
157 |         if not self.batch_first:
158 |             result_ = result_.view((seq_len, batch_size, self.num_units))  # [t, b, h]
159 |         else:
160 |             result_ = result_.view((batch_size, seq_len, self.num_units))
161 | 
162 | 
163 |         return result_
164 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import math
4 | import matplotlib.pyplot as plt
5 | import matplotlib.ticker as ticker
6 | 
7 | def as_minutes(s):
8 |     m = math.floor(s / 60)
9 |     s -= m * 60
10 |     return '%dm %ds' % (m, s)
11 | 
12 | def time_since(since, percent):
13 |     now = time.time()
14 |     s = now - since
15 |     es = s / percent
16 |     rs = es - s
17 |     return '%s (- %s)' % (as_minutes(s), as_minutes(rs))
18 | 
19 | def show_plot(points):
20 |     fig, ax = plt.subplots()
21 |     loc = ticker.MultipleLocator(base=1)  # put ticks at regular intervals
22 |     ax.yaxis.set_major_locator(loc)
23 |     plt.plot(points)
24 | 
25 | def plot_losses(train_loss, val_loss, scale):
26 |     plt.figure(figsize=(10,5))
27 |     plt.plot(train_loss)
28 |     plt.plot([(x + 1) * scale - 1 for x in range(len(val_loss))], val_loss)
29 |     plt.legend(['train loss', 'validation loss'])
30 | 
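
time_since extrapolates total runtime from the fraction of work done, and plot_losses lines the validation curve up with the epochs at which it was measured. For example, with validation every third epoch (scale=3), the two validation points below land at x = 2 and x = 5 (loss values are made up):

import time
from utils import time_since, plot_losses

start = time.time()
# ... run a quarter of the training work ...
print(time_since(start, percent=0.25))  # elapsed so far (- projected remaining)

train_loss = [3.2, 2.5, 2.1, 1.9, 1.8, 1.7]
val_loss = [2.6, 1.9]                   # measured after epochs 3 and 6
plot_losses(train_loss, val_loss, scale=3)
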
--------------------------------------------------------------------------------
/validation.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.nn import functional
4 | from torch.autograd import Variable
5 | from torch import optim
6 | import torch.nn.functional as F
7 | import numpy as np
8 | import re
9 | from tqdm import tqdm
10 | from data import generate_batches
11 | 
12 | class Beam():
13 |     def __init__(self, decoder_input, decoder_context, decoder_hidden, decoded_words=None,
14 |                  decoder_attentions=None, sequence_log_probs=None, decoded_index=None):
15 |         # None defaults instead of []: a mutable default list would be shared by every Beam
16 |         self.decoded_words = decoded_words if decoded_words is not None else []
17 |         self.decoded_index = decoded_index if decoded_index is not None else []
18 |         self.decoder_attentions = decoder_attentions if decoder_attentions is not None else []
19 |         self.sequence_log_probs = sequence_log_probs if sequence_log_probs is not None else []
20 |         self.decoder_input = decoder_input
21 |         self.decoder_context = decoder_context
22 |         self.decoder_hidden = decoder_hidden
23 | 
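
Each Beam carries one partial hypothesis: its decoder state plus the word, attention, and score history accumulated so far. When the search in Evaluator expands a hypothesis it hands the child copies of the history lists, so siblings never share state. A toy illustration (the tensors are placeholders for real decoder state):

import torch
from validation import Beam

root = Beam(torch.LongTensor([[2]]), torch.zeros(1, 8), torch.zeros(1, 1, 8))
child = Beam(None, root.decoder_context, root.decoder_hidden,
             root.decoded_words[:], root.decoder_attentions[:], root.sequence_log_probs[:])
child.decoded_words.append('hola')
print(root.decoded_words)  # [] -- the parent's history is untouched
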
24 | class Evaluator():
25 |     def __init__(self, encoder, decoder, gcn1, gcn2, input_lang, output_lang, max_length, USE_CUDA):
26 |         self.encoder = encoder
27 |         self.decoder = decoder
28 |         self.input_lang = input_lang
29 |         self.output_lang = output_lang
30 |         self.max_length = max_length
31 |         self.USE_CUDA = USE_CUDA
32 |         self.gcn1 = gcn1
33 |         self.gcn2 = gcn2
34 | 
35 |     def evaluate(self, input_batch, k_beams, testing_luong=True):
36 | 
37 |         [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch
38 |         input_length = input_var.shape[0]
39 | 
40 |         encoder_hidden = self.encoder.init_hidden(1)
41 |         encoder_outputs, encoder_hidden = self.encoder(input_var, encoder_hidden)
42 | 
43 |         if self.gcn1:
44 |             encoder_outputs = self.gcn1(encoder_outputs,
45 |                                         adj_arc_in, adj_arc_out,
46 |                                         adj_lab_in, adj_lab_out,
47 |                                         mask_in, mask_out,
48 |                                         mask_loop)
49 |         if self.gcn2:
50 |             encoder_outputs = self.gcn2(encoder_outputs,
51 |                                         adj_arc_in, adj_arc_out,
52 |                                         adj_lab_in, adj_lab_out,
53 |                                         mask_in, mask_out,
54 |                                         mask_loop)
55 | 
56 |         # the '<sos>'/'<eos>'/'<pad>' names assume the usual torchtext special-token setup
57 |         if testing_luong:
58 |             decoder_input = Variable(torch.LongTensor([[self.output_lang.vocab.stoi['<sos>']]]))
59 |         else:
60 |             decoder_input = Variable(torch.LongTensor([self.output_lang.vocab.stoi['<sos>']]))
61 | 
62 |         decoder_context = Variable(torch.zeros(1, self.decoder.hidden_size))
63 |         decoder_hidden = encoder_hidden
64 | 
65 |         if self.USE_CUDA:
66 |             decoder_input = decoder_input.cuda()
67 |             decoder_context = decoder_context.cuda()
68 | 
69 |         decoded_words = []
70 |         decoder_attentions = torch.zeros(self.max_length, self.max_length)
71 | 
72 |         beams = [Beam(decoder_input, decoder_context, decoder_hidden)]
73 |         top_beams = []
74 | 
75 |         # Use decoder outputs as the next inputs
76 |         for di in range(input_length):
77 |             new_beams = []
78 |             for beam in beams:
79 |                 decoder_output, decoder_context, decoder_hidden, decoder_attention = self.decoder(
80 |                     beam.decoder_input, beam.decoder_context, beam.decoder_hidden, encoder_outputs)
81 | 
82 |                 # Beam search: keep the k continuations with the highest probability
83 |                 topv, topi = decoder_output.data.topk(k_beams)
84 | 
85 |                 for ni, vi in zip(topi[0], topv[0]):
86 |                     new_beam = Beam(None, decoder_context, decoder_hidden,
87 |                                     beam.decoded_words[:], beam.decoder_attentions[:], beam.sequence_log_probs[:])
88 |                     new_beam.decoder_attentions.append(decoder_attention.squeeze().cpu().data)
89 |                     new_beam.sequence_log_probs.append(vi)
90 | 
91 |                     if ni == self.output_lang.vocab.stoi['<eos>'] or ni == self.output_lang.vocab.stoi['<pad>']:
92 |                         new_beam.decoded_words.append('<EOS>')
93 |                         top_beams.append(new_beam)
94 | 
95 |                     else:
96 |                         new_beam.decoded_words.append(self.output_lang.vocab.itos[ni])
97 | 
98 |                         if testing_luong:
99 |                             decoder_input = Variable(torch.LongTensor([[ni]]))
100 |                         else:
101 |                             decoder_input = Variable(torch.LongTensor([ni]))
102 |                         if self.USE_CUDA: decoder_input = decoder_input.cuda()
103 | 
104 |                         new_beam.decoder_input = decoder_input
105 |                         new_beams.append(new_beam)
106 | 
107 |             new_beams = {beam: np.mean(beam.sequence_log_probs) for beam in new_beams}
108 |             beams = sorted(new_beams, key=new_beams.get, reverse=True)[:k_beams]
109 | 
110 |             if len(beams) == 0:
111 |                 break
112 | 
113 |         if len(top_beams) != 0:
114 |             top_beams = {beam: np.mean(beam.sequence_log_probs) for beam in top_beams}
115 |         else:
116 |             top_beams = {beam: np.mean(beam.sequence_log_probs) for beam in new_beams}
117 | 
118 |         top_beams = sorted(top_beams, key=top_beams.get, reverse=True)[:k_beams]
119 | 
120 |         decoded_words = top_beams[0].decoded_words
121 | 
122 |         return decoded_words, top_beams
123 | 
124 |     def evaluate_sentence(self, input_batch, k_beams=3):
125 |         output_words, beams = self.evaluate(input_batch, k_beams)
126 |         output_sentence = ' '.join(output_words)
127 | 
128 |         # the raw source string is not available here, so only the translation is printed
129 |         print('<', output_sentence)
130 |         print('')
131 | 
132 |     def ref_to_string(self, reference):
133 |         aux = ''
134 |         for i in range(len(reference)):
135 |             aux += self.output_lang.vocab.itos[reference[i]] + ' '
136 |         return aux.strip()
137 | 
138 |     def get_candidates_and_references(self, pairs, arr_dep, k_beams=3):
139 |         input_batches, _ = generate_batches(self.input_lang, self.output_lang, 1, pairs, return_dep_tree=True, arr_dep=arr_dep, max_degree=10, USE_CUDA=self.USE_CUDA)
140 | 
141 |         candidates = [self.evaluate(input_batch, k_beams)[0] for input_batch in tqdm(input_batches)]
142 |         candidates = [' '.join(candidate[:-1]) for candidate in candidates]
143 |         references = pairs[:,1]
144 |         references = [self.ref_to_string(reference) for reference in references]
145 |         return candidates, references
--------------------------------------------------------------------------------
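
Putting the pieces together: the Evaluator beam-decodes each validation pair (running the encoder, the optional GCN layers, then the Luong decoder) and returns plain strings that the BLEU script at the top of the repo can score. A sketch assuming trained modules and the pairs/arr_dep arrays produced by the data-preparation step; the names mirror the Evaluator's own parameters but are not defined here:

from validation import Evaluator
from BLEU import BLEU

# encoder, decoder, gcn1, gcn2, input_lang, output_lang, pairs, arr_dep
# are assumed to exist from training / data preparation
evaluator = Evaluator(encoder, decoder, gcn1, gcn2,
                      input_lang, output_lang, max_length=50, USE_CUDA=False)
candidates, references = evaluator.get_candidates_and_references(pairs, arr_dep, k_beams=3)
print('BLEU:', BLEU(candidates, [references]))  # references wrapped as a single reference set
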