├── BLEU.py ├── GCN.ipynb ├── NMT - default.ipynb ├── README.md ├── corpus ├── en-spa.txt ├── en.csv └── spa.csv ├── data.py ├── model ├── .ipynb_checkpoints │ ├── decoder-checkpoint.py │ ├── encoder-checkpoint.py │ └── graph-checkpoint.py ├── __pycache__ │ ├── decoder.cpython-36.pyc │ ├── encoder.cpython-36.pyc │ └── graph.cpython-36.pyc ├── decoder.py ├── encoder.py └── graph.py ├── utils.py └── validation.py /BLEU.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import codecs 3 | import os 4 | import math 5 | import operator 6 | import json 7 | from functools import reduce 8 | 9 | 10 | def fetch_data(cand, ref): 11 | """ Store each reference and candidate sentences as a list """ 12 | references = [] 13 | if '.txt' in ref: 14 | reference_file = codecs.open(ref, 'r') 15 | references.append(reference_file.readlines()) 16 | else: 17 | for root, dirs, files in os.walk(ref): 18 | for f in files: 19 | reference_file = codecs.open(os.path.join(root, f), 'r') 20 | references.append(reference_file.readlines()) 21 | candidate_file = codecs.open(cand, 'r') 22 | candidate = candidate_file.readlines() 23 | return candidate, references 24 | 25 | 26 | def count_ngram(candidate, references, n): 27 | clipped_count = 0 28 | count = 0 29 | r = 0 30 | c = 0 31 | for si in range(len(candidate)): 32 | # Calculate precision for each sentence 33 | ref_counts = [] 34 | ref_lengths = [] 35 | # Build dictionary of ngram counts 36 | for reference in references: 37 | ref_sentence = reference[si] 38 | ngram_d = {} 39 | words = ref_sentence.strip().split() 40 | ref_lengths.append(len(words)) 41 | limits = len(words) - n + 1 42 | # loop through the sentance consider the ngram length 43 | for i in range(limits): 44 | ngram = ' '.join(words[i:i+n]).lower() 45 | if ngram in ngram_d.keys(): 46 | ngram_d[ngram] += 1 47 | else: 48 | ngram_d[ngram] = 1 49 | ref_counts.append(ngram_d) 50 | # candidate 51 | cand_sentence = candidate[si] 52 | cand_dict = {} 53 | words = cand_sentence.strip().split() 54 | limits = len(words) - n + 1 55 | for i in range(0, limits): 56 | ngram = ' '.join(words[i:i + n]).lower() 57 | if ngram in cand_dict: 58 | cand_dict[ngram] += 1 59 | else: 60 | cand_dict[ngram] = 1 61 | clipped_count += clip_count(cand_dict, ref_counts) 62 | count += limits 63 | r += best_length_match(ref_lengths, len(words)) 64 | c += len(words) 65 | if clipped_count == 0: 66 | pr = 0 67 | else: 68 | pr = float(clipped_count) / count 69 | bp = brevity_penalty(c, r) 70 | return pr, bp 71 | 72 | 73 | def clip_count(cand_d, ref_ds): 74 | """Count the clip count for each ngram considering all references""" 75 | count = 0 76 | for m in cand_d.keys(): 77 | m_w = cand_d[m] 78 | m_max = 0 79 | for ref in ref_ds: 80 | if m in ref: 81 | m_max = max(m_max, ref[m]) 82 | m_w = min(m_w, m_max) 83 | count += m_w 84 | return count 85 | 86 | 87 | def best_length_match(ref_l, cand_l): 88 | """Find the closest length of reference to that of candidate""" 89 | 90 | least_diff = abs(cand_l-ref_l[0]) 91 | best = ref_l[0] 92 | for ref in ref_l: 93 | if abs(cand_l-ref) < least_diff: 94 | least_diff = abs(cand_l-ref) 95 | best = ref 96 | return best 97 | 98 | 99 | def brevity_penalty(c, r): 100 | if c > r: 101 | bp = 1 102 | else: 103 | bp = math.exp(1-(float(r)/c)) 104 | return bp 105 | 106 | 107 | def geometric_mean(precisions): 108 | return (reduce(operator.mul, precisions)) ** (1.0 / len(precisions)) 109 | 110 | 111 | def BLEU(candidate, references, n_grams=4): 112 | precisions = [] 113 | for i in range(n_grams): 114 | pr, bp = count_ngram(candidate, references, i+1) 115 | precisions.append(pr) 116 | bleu = geometric_mean(precisions) * bp 117 | return bleu, precisions, bp -------------------------------------------------------------------------------- /GCN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from data import generate_batches\n", 12 | "from data import prepare_data\n", 13 | "from data import data_to_index\n", 14 | "from data import DEP_LABELS\n", 15 | "\n", 16 | "from model.graph import Sintactic_GCN\n", 17 | "from model.encoder import Encoder\n", 18 | "from model.decoder import Decoder_luong\n", 19 | "\n", 20 | "from BLEU import BLEU\n", 21 | "\n", 22 | "from utils import time_since\n", 23 | "\n", 24 | "import torch\n", 25 | "import torch.nn as nn\n", 26 | "from torch.nn import functional\n", 27 | "from torch.autograd import Variable\n", 28 | "from torch import optim\n", 29 | "import torch.nn.functional as F\n", 30 | "\n", 31 | "from stanfordcorenlp import StanfordCoreNLP \n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import time\n", 35 | "\n", 36 | "from validation import Evaluator\n", 37 | "\n", 38 | "%load_ext autoreload\n", 39 | "%autoreload 2" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "USE_CUDA = True\n", 49 | "MAX_LENGTH = 100\n", 50 | "\n", 51 | "SPLIT_TRAIN = 0.7\n", 52 | "SPLIT_VALID = 0.15\n", 53 | "# The rest is for test" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "# Reading the data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Prepare vocabulary and pairs for the data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Reading lines...\n", 80 | "Read 118964 sentence pairs\n", 81 | "Filtered to 85785 pairs\n", 82 | "Creating vocab...\n", 83 | "Indexed 12436 words in input language, 22765 words in output\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "input_lang, output_lang, pairs = prepare_data('en', 'spa', max_length=MAX_LENGTH)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Splitting pairs into test, val and test" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "#np.shuffle(pairs)\n", 105 | "pairs_train = pairs[:int(len(pairs) * SPLIT_TRAIN)]\n", 106 | "pairs_valid = pairs[int(len(pairs) * SPLIT_TRAIN):int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID))]\n", 107 | "pairs_test = pairs[int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID)):]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "(60049, 12868, 12868)" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "len(pairs_train), len(pairs_valid), len(pairs_test)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get the adjacency matrix for the pairs" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 6, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "nlp = StanfordCoreNLP(r'/home/krivas/stanford-corenlp-full-2018-02-27/')" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from tqdm import tqdm\n", 153 | "def get_adjacency_matrix(pairs):\n", 154 | " arr_dep = []\n", 155 | " for pair in tqdm(pairs):\n", 156 | " arr_dep.append(nlp.dependency_parse(pair[0]))\n", 157 | " return np.array(arr_dep)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 8, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "100%|██████████| 60049/60049 [07:20<00:00, 136.33it/s]\n", 170 | "100%|██████████| 12868/12868 [02:03<00:00, 104.42it/s]\n", 171 | "100%|██████████| 12868/12868 [02:29<00:00, 86.21it/s]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "arr_dep_train = get_adjacency_matrix(pairs_train)\n", 177 | "arr_dep_valid = get_adjacency_matrix(pairs_valid)\n", 178 | "arr_dep_test = get_adjacency_matrix(pairs_test)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Converting words to index in pairs" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "pairs_train = data_to_index(pairs_train, input_lang, output_lang)\n", 195 | "pairs_valid = data_to_index(pairs_valid, input_lang, output_lang)\n", 196 | "pairs_test = data_to_index(pairs_test, input_lang, output_lang)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "# Training" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "def pass_batch_luong(batch_size, input_batches, target_batches, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):\n", 213 | " \n", 214 | " hidden = encoder.init_hidden(batch_size)\n", 215 | "\n", 216 | " encoder_outputs, encoder_hidden = encoder(input_batches, hidden)\n", 217 | " decoder_input = Variable(torch.LongTensor([input_lang.vocab.stoi[\"\"]] * batch_size))\n", 218 | " \n", 219 | " if gcn1:\n", 220 | " encoder_outputs = gcn1(encoder_outputs,\n", 221 | " adj_arc_in, adj_arc_out,\n", 222 | " adj_lab_in, adj_lab_out,\n", 223 | " mask_in, mask_out, \n", 224 | " mask_loop)\n", 225 | " \n", 226 | " decoder_hidden = encoder_hidden\n", 227 | " decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) \n", 228 | " \n", 229 | " all_decoder_outputs = Variable(torch.zeros(target_batches.data.size()[0], batch_size, len(output_lang.vocab.itos)))\n", 230 | "\n", 231 | " if USE_CUDA:\n", 232 | " all_decoder_outputs = all_decoder_outputs.cuda()\n", 233 | " decoder_input = decoder_input.cuda()\n", 234 | " decoder_context = decoder_context.cuda()\n", 235 | " \n", 236 | " if train:\n", 237 | " use_teacher_forcing = np.random.random() < tf_ratio\n", 238 | " else:\n", 239 | " use_teacher_forcing = False\n", 240 | " \n", 241 | " if use_teacher_forcing: \n", 242 | " # Use targets as inputs\n", 243 | " for di in range(target_batches.shape[0]):\n", 244 | " decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(\n", 245 | " decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)\n", 246 | " \n", 247 | " all_decoder_outputs[di] = decoder_output\n", 248 | " decoder_input = target_batches[di]\n", 249 | " else: \n", 250 | " # Use decoder output as inputs\n", 251 | " for di in range(target_batches.shape[0]): \n", 252 | " decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(\n", 253 | " decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) \n", 254 | " \n", 255 | " all_decoder_outputs[di] = decoder_output\n", 256 | " \n", 257 | " # Greedy approach, take the word with highest probability\n", 258 | " topv, topi = decoder_output.data.topk(1) \n", 259 | " decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())\n", 260 | " if USE_CUDA: decoder_input = decoder_input.cuda()\n", 261 | " \n", 262 | " del decoder_output\n", 263 | " del decoder_hidden\n", 264 | " \n", 265 | " return all_decoder_outputs, target_batches\n", 266 | "\n", 267 | "def train_luong(input_batches, target_batches, batch_size, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):\n", 268 | " \n", 269 | " # Zero gradients of both optimizers\n", 270 | " if train:\n", 271 | " encoder_optimizer.zero_grad()\n", 272 | " decoder_optimizer.zero_grad()\n", 273 | "\n", 274 | " loss = 0 # Added onto for each word\n", 275 | " all_decoder_outputs, target_batches = pass_batch_luong(batch_size, input_batches, target_batches, train, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 276 | " \n", 277 | " # Loss calculation and backpropagation\n", 278 | " loss = criterion(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))\n", 279 | " \n", 280 | " if train:\n", 281 | " loss.backward()\n", 282 | " torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)\n", 283 | " torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)\n", 284 | " encoder_optimizer.step()\n", 285 | " decoder_optimizer.step()\n", 286 | " \n", 287 | " if gcn1:\n", 288 | " torch.nn.utils.clip_grad_norm_(gcn1.parameters(), clip)\n", 289 | " gcn1_optimizer.step()\n", 290 | "\n", 291 | " del all_decoder_outputs\n", 292 | " del target_batches\n", 293 | " \n", 294 | " return loss.item()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "# Model" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 18, 307 | "metadata": { 308 | "collapsed": true 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "# Configure models\n", 313 | "hidden_size_rnn = 512\n", 314 | "hidden_size_graph = 512\n", 315 | "emb_size=300\n", 316 | "n_layers = 2\n", 317 | "dropout = 0.1\n", 318 | "batch_size = 50\n", 319 | "\n", 320 | "# Configure training/optimization\n", 321 | "clip = 10.0\n", 322 | "learning_rate_graph = 0.0002\n", 323 | "n_epochs = 20\n", 324 | "print_every = 10\n", 325 | "validate_loss_every = 50\n", 326 | "validate_acc_every = 2 * validate_loss_every\n", 327 | "tf_ratio = 0.5\n", 328 | "best_bleu = 0" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# Initialize models\n", 338 | "encoder = Encoder(len(input_lang.vocab.itos), hidden_size_rnn, emb_size, n_layers=n_layers, dropout=dropout, USE_CUDA=USE_CUDA)\n", 339 | "decoder = Decoder_luong('general', hidden_size_graph, len(output_lang.vocab.itos), 300, n_layers=2 * n_layers, dropout=dropout, USE_CUDA=USE_CUDA)\n", 340 | "gcn1 = Sintactic_GCN(hidden_size_rnn, hidden_size_graph, num_labels=len(DEP_LABELS))\n", 341 | "\n", 342 | "# Initialize optimizers and criterion\n", 343 | "encoder_optimizer = optim.Adam(encoder.parameters())\n", 344 | "decoder_optimizer = optim.Adam(decoder.parameters())\n", 345 | "gcn1_optimizer = optim.Adam(gcn1.parameters())#, learning_rate_graph)\n", 346 | "\n", 347 | "criterion = nn.NLLLoss()\n", 348 | "\n", 349 | "# Move models to GPU\n", 350 | "if USE_CUDA:\n", 351 | " encoder = encoder.cuda()\n", 352 | " decoder = decoder.cuda()\n", 353 | " gcn1 = gcn1.cuda()\n", 354 | " \n", 355 | "# Keep track of time elapsed and running averages\n", 356 | "start = time.time()\n", 357 | "train_losses = []\n", 358 | "validation_losses = []\n", 359 | "validation_bleu = []\n", 360 | "\n", 361 | "print_loss_total = 0 # Reset every print_every\n", 362 | "plot_loss_total = 0 # Reset every plot_every" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "0m 4s (- 8m 56s) (10 0.83%) train_loss: 5.5433\n", 375 | "0m 6s (- 6m 40s) (20 1.67%) train_loss: 2.3962\n", 376 | "0m 8s (- 5m 50s) (30 2.50%) train_loss: 2.1459\n", 377 | "0m 11s (- 5m 21s) (40 3.33%) train_loss: 2.1469\n", 378 | "0m 13s (- 5m 7s) (50 4.16%) train_loss: 2.0643\n", 379 | "0m 15s (- 5m 1s) (60 5.00%) train_loss: 2.0653\n", 380 | "0m 18s (- 4m 57s) (70 5.83%) train_loss: 2.1074\n", 381 | "0m 20s (- 4m 52s) (80 6.66%) train_loss: 2.0922\n", 382 | "0m 23s (- 4m 48s) (90 7.49%) train_loss: 2.1566\n", 383 | "0m 25s (- 4m 45s) (100 8.33%) train_loss: 1.9920\n", 384 | "0m 28s (- 4m 42s) (110 9.16%) train_loss: 1.9252\n", 385 | "0m 30s (- 4m 39s) (120 9.99%) train_loss: 1.9822\n", 386 | "0m 33s (- 4m 35s) (130 10.82%) train_loss: 1.8853\n", 387 | "0m 35s (- 4m 32s) (140 11.66%) train_loss: 1.8869\n", 388 | "0m 39s (- 4m 37s) (150 12.49%) train_loss: 1.8213\n", 389 | "0m 43s (- 4m 41s) (160 13.32%) train_loss: 1.7827\n", 390 | "0m 46s (- 4m 44s) (170 14.15%) train_loss: 1.7572\n", 391 | "0m 50s (- 4m 47s) (180 14.99%) train_loss: 1.7951\n", 392 | "0m 54s (- 4m 49s) (190 15.82%) train_loss: 1.7455\n", 393 | "0m 58s (- 4m 50s) (200 16.65%) train_loss: 1.7772\n", 394 | "1m 1s (- 4m 51s) (210 17.49%) train_loss: 1.7657\n", 395 | "1m 5s (- 4m 52s) (220 18.32%) train_loss: 1.7416\n", 396 | "1m 9s (- 4m 51s) (230 19.15%) train_loss: 1.8612\n", 397 | "1m 12s (- 4m 50s) (240 19.98%) train_loss: 1.7539\n", 398 | "1m 15s (- 4m 48s) (250 20.82%) train_loss: 1.5999\n", 399 | "1m 19s (- 4m 47s) (260 21.65%) train_loss: 1.7584\n", 400 | "1m 23s (- 4m 46s) (270 22.48%) train_loss: 1.6611\n", 401 | "1m 26s (- 4m 45s) (280 23.31%) train_loss: 1.7680\n", 402 | "1m 30s (- 4m 44s) (290 24.15%) train_loss: 1.7306\n", 403 | "1m 34s (- 4m 43s) (300 24.98%) train_loss: 1.7626\n", 404 | "1m 38s (- 4m 41s) (310 25.81%) train_loss: 1.7287\n", 405 | "1m 41s (- 4m 40s) (320 26.64%) train_loss: 1.6571\n", 406 | "1m 45s (- 4m 38s) (330 27.48%) train_loss: 1.7704\n", 407 | "1m 48s (- 4m 35s) (340 28.31%) train_loss: 1.7828\n", 408 | "1m 52s (- 4m 33s) (350 29.14%) train_loss: 1.7588\n", 409 | "1m 56s (- 4m 31s) (360 29.98%) train_loss: 1.7041\n", 410 | "1m 59s (- 4m 28s) (370 30.81%) train_loss: 1.7498\n", 411 | "2m 3s (- 4m 26s) (380 31.64%) train_loss: 1.6711\n", 412 | "2m 6s (- 4m 23s) (390 32.47%) train_loss: 1.6425\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "for epoch in range(1, n_epochs): \n", 418 | " # Shuffle data\n", 419 | " id_aux = np.random.permutation(np.arange(len(pairs_train)))\n", 420 | " pairs_train = pairs_train[id_aux]\n", 421 | " arr_dep_train = arr_dep_train[id_aux]\n", 422 | " \n", 423 | " # Get the batches for this epoch\n", 424 | " input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, return_dep_tree=True, arr_dep=arr_dep_train, max_degree=6, USE_CUDA=USE_CUDA)\n", 425 | " print_loss_total = 0\n", 426 | " for batch_ix, (input_batch, target_var) in enumerate(zip(input_batches, target_batches)):\n", 427 | " \n", 428 | " encoder.train()\n", 429 | " decoder.train()\n", 430 | " gcn1.train()\n", 431 | " \n", 432 | " [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch\n", 433 | " # Run the train function\n", 434 | " loss = train_luong(input_var, target_var, input_var.size(1), \n", 435 | " True, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 436 | " \n", 437 | " torch.cuda.empty_cache()\n", 438 | "\n", 439 | " # Keep track of loss\n", 440 | " print_loss_total += loss\n", 441 | " plot_loss_total += loss\n", 442 | "\n", 443 | " if batch_ix == 0: continue\n", 444 | "\n", 445 | " if batch_ix % print_every == 0:\n", 446 | " print_loss_avg = print_loss_total / print_every\n", 447 | " print_loss_total = 0\n", 448 | " print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)\n", 449 | " train_losses.append(loss)\n", 450 | "\n", 451 | " print(f'{time_since(start, batch_ix / len(input_batches))} ({batch_ix} {batch_ix / len(input_batches) * 100:.2f}%) train_loss: {print_loss_avg:.4f}')\n", 452 | " \n", 453 | " input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_valid, return_dep_tree=True, arr_dep=arr_dep_train, max_degree=6, USE_CUDA=USE_CUDA)\n", 454 | " print_loss_total = 0\n", 455 | " for input_batch, target_var in zip(input_batches, target_batches):\n", 456 | " \n", 457 | " encoder.eval()\n", 458 | " decoder.eval()\n", 459 | " gcn1.eval()\n", 460 | " \n", 461 | " [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch\n", 462 | " # Run the train function\n", 463 | " loss = train_luong(input_var, target_var, input_var.size(1), \n", 464 | " False, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 465 | " \n", 466 | " print_loss_total += loss\n", 467 | " val_loss = print_loss_total / len(input_batches)\n", 468 | " validation_losses.append(val_loss)\n", 469 | " # Evaluating Bleu\n", 470 | " evaluator = Evaluator(encoder, decoder, gcn1, None, input_lang, output_lang, MAX_LENGTH, True)\n", 471 | " candidates, references = evaluator.get_candidates_and_references(pairs_test, arr_dep_test, k_beams=1)\n", 472 | " bleu = BLEU(candidates, [references])\n", 473 | " if bleu[0] > best_bleu:\n", 474 | " best_bleu = bleu[0]\n", 475 | " torch.save(encoder.state_dict(), 'encoder_graph.pkl')\n", 476 | " torch.save(decoder.state_dict(), 'decoder_graph.pkl')\n", 477 | " torch.save(gcn1.state_dict(), 'gcn_graph.pkl')\n", 478 | " validation_bleu.append(bleu)\n", 479 | " print(f'val_loss: {val_loss:.4f} - bleu: {bleu}', end=' ')\n", 480 | "\n", 481 | " # Prevent overflow gpu memory\n", 482 | " del evaluator" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [] 491 | } 492 | ], 493 | "metadata": { 494 | "kernelspec": { 495 | "display_name": "Python 3", 496 | "language": "python", 497 | "name": "python3" 498 | }, 499 | "language_info": { 500 | "codemirror_mode": { 501 | "name": "ipython", 502 | "version": 3 503 | }, 504 | "file_extension": ".py", 505 | "mimetype": "text/x-python", 506 | "name": "python", 507 | "nbconvert_exporter": "python", 508 | "pygments_lexer": "ipython3", 509 | "version": "3.6.4" 510 | }, 511 | "toc": { 512 | "nav_menu": {}, 513 | "number_sections": true, 514 | "sideBar": true, 515 | "skip_h1_title": false, 516 | "toc_cell": false, 517 | "toc_position": {}, 518 | "toc_section_display": "block", 519 | "toc_window_display": false 520 | } 521 | }, 522 | "nbformat": 4, 523 | "nbformat_minor": 2 524 | } 525 | -------------------------------------------------------------------------------- /NMT - default.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from data import generate_batches\n", 12 | "from data import prepare_data\n", 13 | "from data import data_to_index\n", 14 | "from data import DEP_LABELS\n", 15 | "\n", 16 | "from model.graph import Sintactic_GCN\n", 17 | "from model.encoder import Encoder\n", 18 | "from model.decoder import Decoder_luong\n", 19 | "\n", 20 | "from BLEU import BLEU\n", 21 | "\n", 22 | "from utils import time_since\n", 23 | "\n", 24 | "import torch\n", 25 | "import torch.nn as nn\n", 26 | "from torch.nn import functional\n", 27 | "from torch.autograd import Variable\n", 28 | "from torch import optim\n", 29 | "import torch.nn.functional as F\n", 30 | "\n", 31 | "from stanfordcorenlp import StanfordCoreNLP \n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import time\n", 35 | "\n", 36 | "from validation import Evaluator\n", 37 | "\n", 38 | "%load_ext autoreload\n", 39 | "%autoreload 2" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "USE_CUDA = True\n", 49 | "MAX_LENGTH = 100\n", 50 | "\n", 51 | "SPLIT_TRAIN = 0.7\n", 52 | "SPLIT_VALID = 0.15\n", 53 | "# The rest is for test" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "# Reading the data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "Prepare vocabulary and pairs for the data" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Reading lines...\n", 80 | "Read 118964 sentence pairs\n", 81 | "Filtered to 85785 pairs\n", 82 | "Creating vocab...\n", 83 | "Indexed 12436 words in input language, 22765 words in output\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "input_lang, output_lang, pairs = prepare_data('en', 'spa', max_length=MAX_LENGTH)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "Splitting pairs into test, val and test" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "#np.shuffle(pairs)\n", 105 | "pairs_train = pairs[:int(len(pairs) * SPLIT_TRAIN)]\n", 106 | "pairs_valid = pairs[int(len(pairs) * SPLIT_TRAIN):int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID))]\n", 107 | "pairs_test = pairs[int(len(pairs) * (SPLIT_TRAIN + SPLIT_VALID)):]" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "(60049, 12868, 12868)" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "len(pairs_train), len(pairs_valid), len(pairs_test)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Get the adjacency matrix for the pairs" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 6, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "nlp = StanfordCoreNLP(r'/home/krivas/stanford-corenlp-full-2018-02-27/')" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 7, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from tqdm import tqdm\n", 153 | "def get_adjacency_matrix(pairs):\n", 154 | " arr_dep = []\n", 155 | " for pair in tqdm(pairs):\n", 156 | " arr_dep.append(nlp.dependency_parse(pair[0]))\n", 157 | " return np.array(arr_dep)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 8, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "name": "stderr", 167 | "output_type": "stream", 168 | "text": [ 169 | "100%|██████████| 60049/60049 [07:22<00:00, 135.68it/s]\n", 170 | "100%|██████████| 12868/12868 [02:01<00:00, 106.01it/s]\n", 171 | "100%|██████████| 12868/12868 [02:26<00:00, 87.54it/s]\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "arr_dep_train = get_adjacency_matrix(pairs_train)\n", 177 | "arr_dep_valid = get_adjacency_matrix(pairs_valid)\n", 178 | "arr_dep_test = get_adjacency_matrix(pairs_test)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "Converting words to index in pairs" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "pairs_train = data_to_index(pairs_train, input_lang, output_lang)\n", 195 | "pairs_valid = data_to_index(pairs_valid, input_lang, output_lang)\n", 196 | "pairs_test = data_to_index(pairs_test, input_lang, output_lang)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "# Training" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "def pass_batch_luong(batch_size, input_batches, target_batches, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):\n", 213 | " \n", 214 | " hidden = encoder.init_hidden(batch_size)\n", 215 | "\n", 216 | " encoder_outputs, encoder_hidden = encoder(input_batches, hidden)\n", 217 | " decoder_input = Variable(torch.LongTensor([input_lang.vocab.stoi[\"\"]] * batch_size))\n", 218 | " \n", 219 | " decoder_hidden = encoder_hidden\n", 220 | " decoder_context = Variable(torch.zeros(batch_size, decoder.hidden_size)) \n", 221 | " \n", 222 | " all_decoder_outputs = Variable(torch.zeros(target_batches.data.size()[0], batch_size, len(output_lang.vocab.itos)))\n", 223 | "\n", 224 | " if USE_CUDA:\n", 225 | " all_decoder_outputs = all_decoder_outputs.cuda()\n", 226 | " decoder_input = decoder_input.cuda()\n", 227 | " decoder_context = decoder_context.cuda()\n", 228 | " \n", 229 | " if train:\n", 230 | " use_teacher_forcing = np.random.random() < tf_ratio\n", 231 | " else:\n", 232 | " use_teacher_forcing = False\n", 233 | " \n", 234 | " if use_teacher_forcing: \n", 235 | " # Use targets as inputs\n", 236 | " for di in range(target_batches.shape[0]):\n", 237 | " decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(\n", 238 | " decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs)\n", 239 | " \n", 240 | " all_decoder_outputs[di] = decoder_output\n", 241 | " decoder_input = target_batches[di]\n", 242 | " else: \n", 243 | " # Use decoder output as inputs\n", 244 | " for di in range(target_batches.shape[0]): \n", 245 | " decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(\n", 246 | " decoder_input.unsqueeze(0), decoder_context, decoder_hidden, encoder_outputs) \n", 247 | " \n", 248 | " all_decoder_outputs[di] = decoder_output\n", 249 | " \n", 250 | " # Greedy approach, take the word with highest probability\n", 251 | " topv, topi = decoder_output.data.topk(1) \n", 252 | " decoder_input = Variable(torch.LongTensor(topi.cpu()).squeeze())\n", 253 | " if USE_CUDA: decoder_input = decoder_input.cuda()\n", 254 | " \n", 255 | " del decoder_output\n", 256 | " del decoder_hidden\n", 257 | " \n", 258 | " return all_decoder_outputs, target_batches\n", 259 | "\n", 260 | "def train_luong(input_batches, target_batches, batch_size, train=True, adj_arc_in=None, adj_arc_out=None, adj_lab_in=None, adj_lab_out=None, mask_in=None, mask_out=None, mask_loop=None):\n", 261 | " \n", 262 | " # Zero gradients of both optimizers\n", 263 | " if train:\n", 264 | " encoder_optimizer.zero_grad()\n", 265 | " decoder_optimizer.zero_grad()\n", 266 | "\n", 267 | " loss = 0 # Added onto for each word\n", 268 | " all_decoder_outputs, target_batches = pass_batch_luong(batch_size, input_batches, target_batches, train, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 269 | " \n", 270 | " # Loss calculation and backpropagation\n", 271 | " loss = criterion(all_decoder_outputs.view(-1, decoder.output_size), target_batches.contiguous().view(-1))\n", 272 | " \n", 273 | " if train:\n", 274 | " loss.backward()\n", 275 | " torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)\n", 276 | " torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)\n", 277 | " encoder_optimizer.step()\n", 278 | " decoder_optimizer.step()\n", 279 | " \n", 280 | " if gcn1:\n", 281 | " torch.nn.utils.clip_grad_norm_(gcn1.parameters(), clip)\n", 282 | " gcn1_optimizer.step()\n", 283 | "\n", 284 | " del all_decoder_outputs\n", 285 | " del target_batches\n", 286 | " \n", 287 | " return loss.item()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "# Model" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 18, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "# Configure models\n", 306 | "hidden_size_rnn = 512\n", 307 | "hidden_size_graph = 512\n", 308 | "emb_size=300\n", 309 | "n_layers = 2\n", 310 | "dropout = 0.1\n", 311 | "batch_size = 50\n", 312 | "\n", 313 | "# Configure training/optimization\n", 314 | "clip = 10.0\n", 315 | "learning_rate_graph = 0.0002\n", 316 | "n_epochs = 20\n", 317 | "print_every = 10\n", 318 | "validate_loss_every = 50\n", 319 | "validate_acc_every = 2 * validate_loss_every\n", 320 | "tf_ratio = 0.5\n", 321 | "best_bleu = 0" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# Initialize models\n", 331 | "encoder = Encoder(len(input_lang.vocab.itos), hidden_size_rnn, emb_size, n_layers=n_layers, dropout=dropout, USE_CUDA=USE_CUDA)\n", 332 | "decoder = Decoder_luong('general', hidden_size_graph, len(output_lang.vocab.itos), 300, n_layers=2 * n_layers, dropout=dropout, USE_CUDA=USE_CUDA)\n", 333 | "gcn1 = Sintactic_GCN(hidden_size_rnn, hidden_size_graph, num_labels=len(DEP_LABELS))\n", 334 | "\n", 335 | "# Initialize optimizers and criterion\n", 336 | "encoder_optimizer = optim.Adam(encoder.parameters())\n", 337 | "decoder_optimizer = optim.Adam(decoder.parameters())\n", 338 | "gcn1_optimizer = optim.Adam(gcn1.parameters(), learning_rate_graph)\n", 339 | "\n", 340 | "criterion = nn.NLLLoss()\n", 341 | "\n", 342 | "# Move models to GPU\n", 343 | "if USE_CUDA:\n", 344 | " encoder = encoder.cuda()\n", 345 | " decoder = decoder.cuda()\n", 346 | " gcn1 = gcn1.cuda()\n", 347 | " \n", 348 | "# Keep track of time elapsed and running averages\n", 349 | "start = time.time()\n", 350 | "train_losses = []\n", 351 | "validation_losses = []\n", 352 | "validation_bleu = []\n", 353 | "\n", 354 | "print_loss_total = 0 # Reset every print_every\n", 355 | "plot_loss_total = 0 # Reset every plot_every" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [ 363 | { 364 | "name": "stdout", 365 | "output_type": "stream", 366 | "text": [ 367 | "0m 7s (- 15m 12s) (10 0.83%) train_loss: 4.7464\n", 368 | "0m 11s (- 11m 9s) (20 1.67%) train_loss: 2.4366\n", 369 | "0m 15s (- 9m 45s) (30 2.50%) train_loss: 2.3272\n", 370 | "0m 18s (- 9m 4s) (40 3.33%) train_loss: 2.1440\n", 371 | "0m 22s (- 8m 37s) (50 4.16%) train_loss: 2.0919\n", 372 | "0m 26s (- 8m 18s) (60 5.00%) train_loss: 2.0173\n", 373 | "0m 29s (- 8m 3s) (70 5.83%) train_loss: 2.0141\n", 374 | "0m 33s (- 7m 51s) (80 6.66%) train_loss: 1.9101\n", 375 | "0m 37s (- 7m 42s) (90 7.49%) train_loss: 1.8445\n", 376 | "0m 41s (- 7m 35s) (100 8.33%) train_loss: 1.8303\n", 377 | "0m 45s (- 7m 27s) (110 9.16%) train_loss: 1.7503\n", 378 | "0m 48s (- 7m 20s) (120 9.99%) train_loss: 1.7693\n", 379 | "0m 52s (- 7m 14s) (130 10.82%) train_loss: 1.7140\n", 380 | "0m 56s (- 7m 7s) (140 11.66%) train_loss: 1.7159\n", 381 | "1m 0s (- 7m 1s) (150 12.49%) train_loss: 1.6555\n", 382 | "1m 3s (- 6m 56s) (160 13.32%) train_loss: 1.6903\n", 383 | "1m 7s (- 6m 50s) (170 14.15%) train_loss: 1.6072\n", 384 | "1m 11s (- 6m 44s) (180 14.99%) train_loss: 1.5495\n", 385 | "1m 15s (- 6m 39s) (190 15.82%) train_loss: 1.6719\n", 386 | "1m 18s (- 6m 34s) (200 16.65%) train_loss: 1.6109\n", 387 | "1m 22s (- 6m 28s) (210 17.49%) train_loss: 1.5898\n", 388 | "1m 26s (- 6m 24s) (220 18.32%) train_loss: 1.5067\n", 389 | "1m 29s (- 6m 19s) (230 19.15%) train_loss: 1.4599\n", 390 | "1m 33s (- 6m 15s) (240 19.98%) train_loss: 1.4415\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "for epoch in range(1, n_epochs): \n", 396 | " # Shuffle data\n", 397 | " id_aux = np.random.permutation(np.arange(len(pairs_train)))\n", 398 | " pairs_train = pairs_train[id_aux]\n", 399 | " arr_dep_train = arr_dep_train[id_aux]\n", 400 | " \n", 401 | " # Get the batches for this epoch\n", 402 | " input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_train, return_dep_tree=True, arr_dep=arr_dep_train, max_degree=6, USE_CUDA=USE_CUDA)\n", 403 | " print_loss_total = 0\n", 404 | " for batch_ix, (input_batch, target_var) in enumerate(zip(input_batches, target_batches)):\n", 405 | " \n", 406 | " encoder.train()\n", 407 | " decoder.train()\n", 408 | " gcn1.train()\n", 409 | " \n", 410 | " [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch\n", 411 | " # Run the train function\n", 412 | " loss = train_luong(input_var, target_var, input_var.size(1), \n", 413 | " True, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 414 | " \n", 415 | " torch.cuda.empty_cache()\n", 416 | "\n", 417 | " # Keep track of loss\n", 418 | " print_loss_total += loss\n", 419 | " plot_loss_total += loss\n", 420 | "\n", 421 | " if batch_ix == 0: continue\n", 422 | "\n", 423 | " if batch_ix % print_every == 0:\n", 424 | " print_loss_avg = print_loss_total / print_every\n", 425 | " print_loss_total = 0\n", 426 | " print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)\n", 427 | " train_losses.append(loss)\n", 428 | "\n", 429 | " print(f'{time_since(start, batch_ix / len(input_batches))} ({batch_ix} {batch_ix / len(input_batches) * 100:.2f}%) train_loss: {print_loss_avg:.4f}')\n", 430 | " \n", 431 | " input_batches, target_batches = generate_batches(input_lang, output_lang, batch_size, pairs_valid, return_dep_tree=True, arr_dep=arr_dep_train, max_degree=6, USE_CUDA=USE_CUDA)\n", 432 | " print_loss_total = 0\n", 433 | " for input_batch, target_var in zip(input_batches, target_batches):\n", 434 | " \n", 435 | " encoder.eval()\n", 436 | " decoder.eval()\n", 437 | " gcn1.eval()\n", 438 | " \n", 439 | " [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch\n", 440 | " # Run the train function\n", 441 | " loss = train_luong(input_var, target_var, input_var.size(1), \n", 442 | " False, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop)\n", 443 | " \n", 444 | " print_loss_total += loss\n", 445 | " val_loss = print_loss_total / len(input_batches)\n", 446 | " validation_losses.append(val_loss)\n", 447 | " # Evaluating Bleu\n", 448 | " evaluator = Evaluator(encoder, decoder, gcn1, None, input_lang, output_lang, MAX_LENGTH, True)\n", 449 | " candidates, references = evaluator.get_candidates_and_references(pairs_test, arr_dep_test, k_beams=1)\n", 450 | " bleu = BLEU(candidates, [references])\n", 451 | " if bleu[0] > best_bleu:\n", 452 | " best_bleu = bleu[0]\n", 453 | " torch.save(encoder.state_dict(), 'encoder_graph.pkl')\n", 454 | " torch.save(decoder.state_dict(), 'decoder_graph.pkl')\n", 455 | " torch.save(gcn1.state_dict(), 'gcn_graph.pkl')\n", 456 | " validation_bleu.append(bleu)\n", 457 | " print(f'val_loss: {val_loss:.4f} - bleu: {bleu}', end=' ')\n", 458 | "\n", 459 | " # Prevent overflow gpu memory\n", 460 | " del evaluator" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [] 469 | } 470 | ], 471 | "metadata": { 472 | "kernelspec": { 473 | "display_name": "Python 3", 474 | "language": "python", 475 | "name": "python3" 476 | }, 477 | "language_info": { 478 | "codemirror_mode": { 479 | "name": "ipython", 480 | "version": 3 481 | }, 482 | "file_extension": ".py", 483 | "mimetype": "text/x-python", 484 | "name": "python", 485 | "nbconvert_exporter": "python", 486 | "pygments_lexer": "ipython3", 487 | "version": "3.6.4" 488 | }, 489 | "toc": { 490 | "nav_menu": {}, 491 | "number_sections": true, 492 | "sideBar": true, 493 | "skip_h1_title": false, 494 | "toc_cell": false, 495 | "toc_position": {}, 496 | "toc_section_display": "block", 497 | "toc_window_display": false 498 | } 499 | }, 500 | "nbformat": 4, 501 | "nbformat_minor": 2 502 | } 503 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Graph-convolutional 2 | 3 | This is an application in a small corpus of the graph convolutional network proposed in this paper https://arxiv.org/abs/1703.04826 4 | -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | import string 3 | import re 4 | import random 5 | import time 6 | import math 7 | import os 8 | import sys 9 | import pandas as pd 10 | import numpy as np 11 | 12 | import nltk 13 | 14 | import torch 15 | import torch.nn as nn 16 | from torch.nn import functional 17 | from torch.autograd import Variable 18 | from torch import optim 19 | import torch.nn.functional as F 20 | 21 | from stanfordcorenlp import StanfordCoreNLP 22 | from nltk.parse.stanford import StanfordParser 23 | from nltk.tag import StanfordNERTagger 24 | from nltk.tokenize import word_tokenize 25 | from nltk.corpus import wordnet 26 | 27 | import enchant 28 | 29 | import torchtext 30 | from torchtext import data 31 | from torchtext import datasets 32 | 33 | # label of dependencies https://nlp.stanford.edu/pubs/USD_LREC14_paper_camera_ready.pdf 34 | 35 | DEP_LABELS = ['ROOT', 'ACL','ACVLCL', 'ADVMOD', 'AMOD', 'APPOS', 'AUX', 'CASE', 'CC', 'CCOMP', 36 | 'CLF', 'COMPOUND', 'CONJ', 'COP', 'CSUBJ', 'DEP', 'DET', 37 | 'DISCOURSE', 'DISLOCATED', 'EXPL', 'FIXED', 'FLAT', 'GOESWITH', 38 | 'IOBJ', 'LIST', 'MARK', 'NMOD', 'NSUBJ', 'NUMMOD', 39 | 'OBJ', 'OBL', 'ORPHAN', 'PARATAXIS', 'PUNXT', 'REPARANDUM', 'VOCATIVE', 40 | 'XCOMP'] 41 | 42 | _DEP_LABELS_DICT = {label:ix for ix, label in enumerate(DEP_LABELS)} 43 | 44 | def find_type(type_dep): 45 | if type_dep=='NSUBJ' or type_dep=='OBJ' or type_dep=='IOBJ' or type_dep=='CSUBJ' or type_dep=='CCOMP' or type_dep == 'XCOMP': 46 | return 0 47 | elif type_dep=='OBL' or type_dep=='VOCATIVE' or type_dep=='DISLOCATED' or type_dep=='ADVCL' or type_dep=='ADVMOD' or type_dep=='DISCOURSE' or type_dep=='AUX' or type_dep=='COP' or type_dep=='MARK': 48 | return 1 49 | elif type_dep=='NMOD' or type_dep=='APPOS' or type_dep=='NUMMOD' or type_dep=='ACL' or type_dep=='AMOD' or type_dep=='DET' or type_dep=='CLF' or type_dep=='CASE': 50 | return 2 51 | else: 52 | return 3 53 | 54 | def get_adj(deps, batch_size, seq_len, max_degree): 55 | 56 | adj_arc_in = np.zeros((batch_size * seq_len, 2), dtype='int32') 57 | adj_lab_in = np.zeros((batch_size * seq_len, 1), dtype='int32') 58 | adj_arc_out = np.zeros((batch_size * seq_len * max_degree, 2), dtype='int32') 59 | adj_lab_out = np.zeros((batch_size * seq_len * max_degree, 1), dtype='int32') 60 | 61 | 62 | mask_in = np.zeros((batch_size * seq_len), dtype='float32') 63 | mask_out = np.zeros((batch_size * seq_len * max_degree), dtype='float32') 64 | 65 | mask_loop = np.ones((batch_size * seq_len, 1), dtype='float32') 66 | 67 | tmp_in = {} 68 | tmp_out = {} 69 | 70 | for d, de in enumerate(deps): 71 | for a, arc in enumerate(de): 72 | if arc[0] != 'ROOT' and arc[0].upper() in DEP_LABELS: 73 | arc_1 = int(arc[2])-1 74 | arc_2 = int(arc[1])-1 75 | 76 | if a in tmp_in: 77 | tmp_in[a] += 1 78 | else: 79 | tmp_in[a] = 0 80 | 81 | if arc_2 in tmp_out: 82 | tmp_out[arc_2] += 1 83 | else: 84 | tmp_out[arc_2] = 0 85 | 86 | idx_in = (d * seq_len) + a + tmp_in[a] 87 | idx_out = (d * seq_len * max_degree) + arc_2 * max_degree + tmp_out[arc_2] 88 | 89 | adj_arc_in[idx_in] = np.array([d, arc_2]) # incoming arcs 90 | adj_lab_in[idx_in] = np.array([find_type([arc[0].upper()])]) # incoming arcs 91 | 92 | mask_in[idx_in] = 1. 93 | 94 | if tmp_out[arc_2] < max_degree: 95 | adj_arc_out[idx_out] = np.array([d, arc_1]) # outgoing arcs 96 | adj_lab_out[idx_out] = np.array([find_type([arc[0].upper()])]) # outgoing arcs 97 | mask_out[idx_out] = 1. 98 | 99 | tmp_in = {} 100 | tmp_out = {} 101 | 102 | adj_arc_in = Variable(torch.LongTensor(np.transpose(adj_arc_in))) 103 | adj_arc_out = Variable(torch.LongTensor(np.transpose(adj_arc_out))) 104 | 105 | adj_lab_in = Variable(torch.LongTensor(np.transpose(adj_lab_in))) 106 | adj_lab_out = Variable(torch.LongTensor(np.transpose(adj_lab_out))) 107 | 108 | mask_in = Variable(torch.FloatTensor(mask_in.reshape((batch_size * seq_len, 1)))) 109 | mask_out = Variable(torch.FloatTensor(mask_out.reshape((batch_size * seq_len, max_degree)))) 110 | mask_loop = Variable(torch.FloatTensor(mask_loop)) 111 | 112 | return adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop 113 | 114 | def pad_seq(lang, seq, max_length): 115 | seq += [lang.vocab.stoi[''] for i in range(max_length - len(seq))] 116 | return seq 117 | 118 | def generate_batches(input_lang, output_lang, batch_size, pairs, return_dep_tree=False, arr_dep=None, max_degree=None, USE_CUDA=False): 119 | input_batches = [] 120 | target_batches = [] 121 | 122 | for pos in range(0, len(pairs), batch_size): 123 | # Avoiding out of array 124 | if pos == 10431: 125 | continue 126 | cant = min(batch_size, len(pairs) - pos) 127 | 128 | input_seqs = pairs[pos:cant+pos, 0]#.tolist() 129 | target_seqs = pairs[pos:cant+pos, 1]#.tolist() 130 | if return_dep_tree: 131 | arr_aux = arr_dep[pos:cant+pos]#.tolist() 132 | 133 | seq_pairs = sorted(zip(input_seqs, target_seqs), key=lambda p: len(p[0]), reverse=True) 134 | input_seqs, target_seqs = zip(*seq_pairs) 135 | 136 | input_lengths = [len(s) for s in input_seqs] 137 | input_padded = [pad_seq(input_lang, s, max(input_lengths)) for s in input_seqs] 138 | target_lengths = [len(s) for s in target_seqs] 139 | target_padded = [pad_seq(output_lang, s, max(target_lengths)) for s in target_seqs] 140 | 141 | input_var = Variable(torch.LongTensor(input_padded)).transpose(0, 1) 142 | target_var = Variable(torch.LongTensor(target_padded)).transpose(0, 1) 143 | 144 | if USE_CUDA: 145 | input_var = input_var.cuda() 146 | target_var = target_var.cuda() 147 | 148 | if return_dep_tree: 149 | # max len is setting mannually 150 | adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop = get_adj(arr_aux, cant, max(input_lengths), max_degree) 151 | 152 | if USE_CUDA: 153 | adj_arc_in = adj_arc_in.cuda() 154 | adj_arc_out = adj_arc_out.cuda() 155 | adj_lab_in = adj_lab_in.cuda() 156 | adj_lab_out = adj_lab_out.cuda() 157 | 158 | mask_in = mask_in.cuda() 159 | mask_out = mask_out.cuda() 160 | mask_loop = mask_loop.cuda() 161 | else: 162 | adj_arc_in = None 163 | adj_arc_out = None 164 | adj_lab_in = None 165 | adj_lab_out = None 166 | 167 | mask_in = None 168 | mask_out = None 169 | mask_loop = None 170 | 171 | input_batches.append([input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop]) 172 | target_batches.append(target_var) 173 | 174 | return input_batches, target_batches 175 | 176 | def indexes_from_sentence(lang, sentence): 177 | return [lang.vocab.stoi[word] for word in sentence.split(' ')] + [lang.vocab.stoi['']] 178 | 179 | def data_to_index(pairs, input_vec, output_vec): 180 | new_pairs = [] 181 | 182 | for pair in pairs: 183 | new_pairs.append([indexes_from_sentence(input_vec, pair[0]), indexes_from_sentence(output_vec, pair[1])]) 184 | 185 | return np.array(new_pairs) 186 | 187 | def construct_vector(pair, name_lang, construct_vector=True, vector_name='fasttext.en.300d'): 188 | lang = pd.DataFrame(pair, columns=[name_lang]) 189 | 190 | lang.to_csv('corpus/' + name_lang + '.csv', index=False) 191 | 192 | lang = data.Field(sequential=True, lower=True, init_token='', eos_token='') 193 | 194 | mt_lang = data.TabularDataset( 195 | path='corpus/' + name_lang + '.csv', format='csv', 196 | fields=[(name_lang, lang)]) 197 | 198 | lang.build_vocab(mt_lang) 199 | 200 | if construct_vector: 201 | lang.vocab.load_vectors(vector_name) 202 | 203 | return lang 204 | 205 | def unicode_to_ascii(s): 206 | return ''.join( 207 | c for c in unicodedata.normalize('NFD', s) 208 | if unicodedata.category(c) != 'Mn' 209 | ) 210 | 211 | def normalize_string(pair): 212 | pair = unicode_to_ascii(pair.lower().strip()) 213 | pair = re.sub(r'([.,;!?])', r' \1', pair) # separate .!? from words 214 | 215 | 216 | return ' '.join(pair.split()) 217 | 218 | def normalize_pairs(pairs): 219 | for pair in pairs: 220 | pair[0] = normalize_string(pair[0]) 221 | pair[1] = normalize_string(pair[1]) 222 | 223 | def filter_pairs_lang(pairs, min_length, max_length): 224 | filtered_pairs = [] 225 | for pair in pairs: 226 | # Removing '' and "" in pairs, this is for easy processing 227 | if len(pair[0].split()) >= min_length and len(pair[0].split()) <= max_length \ 228 | and len(pair[1].split()) >= min_length and len(pair[1].split()) <= max_length \ 229 | and "'" not in pair[0] and '"' not in pair[0]: 230 | filtered_pairs.append(pair) 231 | return filtered_pairs 232 | 233 | def read_langs(lang1, lang2, reverse=False): 234 | print("Reading lines...") 235 | 236 | # Read the file and split into lines 237 | filename = f'corpus/{lang1}-{lang2}.txt' 238 | lines = open(filename).read().strip().split('\n') 239 | 240 | # Split every line into pairs and normalize 241 | pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines] 242 | 243 | # Reverse pairs, make Lang instances 244 | if reverse: 245 | pairs = [list(reversed(p)) for p in pairs] 246 | 247 | return pairs 248 | 249 | def prepare_data(lang1_name, lang2_name, reverse=False, min_length=3, max_length=50): 250 | pairs = read_langs(lang1_name, lang2_name, reverse=reverse) 251 | print("Read %d sentence pairs" % len(pairs)) 252 | 253 | pairs = filter_pairs_lang(pairs, min_length, max_length) 254 | print("Filtered to %d pairs" % len(pairs)) 255 | 256 | print("Creating vocab...") 257 | pairs = np.array(pairs) 258 | vector_1 = construct_vector(pairs[:, 0], lang1_name) 259 | vector_2 = construct_vector(pairs[:, 1], lang2_name) 260 | 261 | print('Indexed %d words in input language, %d words in output' % (len(vector_1.vocab.itos), len(vector_2.vocab.itos))) 262 | return vector_1, vector_2, pairs -------------------------------------------------------------------------------- /model/.ipynb_checkpoints/decoder-checkpoint.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | ######################### ATTENTION ########################### 10 | 11 | class Global_attn(nn.Module): 12 | def __init__(self, method, hidden_size, USE_CUDA=False): 13 | super(Global_attn, self).__init__() 14 | 15 | self.method = method 16 | self.hidden_size = hidden_size 17 | self.USE_CUDA = USE_CUDA 18 | 19 | if self.method == 'general': 20 | self.attn = nn.Linear(self.hidden_size, hidden_size) 21 | 22 | elif self.method == 'concat': 23 | self.attn = nn.Linear(self.hidden_size * 2, hidden_size) 24 | self.other = nn.Parameter(torch.FloatTensor(1, hidden_size)) 25 | 26 | def forward(self, hidden, encoder_outputs): 27 | ''' 28 | hidden: (BS, hidden_size) 29 | encoder_outputs(seq_len, BS, encoder_hidden_size) 30 | ''' 31 | # encoder_outputs: (seq_len, batch_size, encoder_hidden_size) 32 | seq_len = len(encoder_outputs) 33 | batch_size = encoder_outputs.shape[1] 34 | 35 | # Calculate attention energies for each encoder output 36 | # attn_energies: (seq_len, batch_size) 37 | # hidden: (batch_size, hidden_size) 38 | attn_energies = Variable(torch.zeros(seq_len, batch_size)) 39 | if self.USE_CUDA: attn_energies = attn_energies.cuda() 40 | for i in range(seq_len): 41 | attn_energies[i] = self.score(hidden, encoder_outputs[i]) 42 | 43 | # Normalize energies [0-1] and resize to (batch_size, x=1, seq_len) 44 | return F.softmax(attn_energies, 0).transpose(0, 1).unsqueeze(1) 45 | 46 | def score(self, hidden, encoder_output): 47 | # hidden: (batch_size, hidden_size) 48 | # encoder_output: (batch_size, encoder_hidden_size) 49 | 50 | # hidden sizes must match, batch_size = 1 only 51 | if self.method == 'dot': 52 | # batch element-wise dot product 53 | energy = torch.bmm(hidden.unsqueeze(1), 54 | encoder_output.unsqueeze(2)).squeeze().squeeze() 55 | # energy = hidden.dot(encoder_output) 56 | return energy 57 | 58 | elif self.method == 'general': 59 | energy = self.attn(encoder_output) 60 | # batch element-wise dot product 61 | energy = torch.bmm(hidden.unsqueeze(1), 62 | encoder_output.unsqueeze(2)).squeeze().squeeze() 63 | # energy = hidden.dot(energy) 64 | return energy 65 | 66 | # TODO: test / modify method to support batch size > 1 67 | elif self.method == 'concat': 68 | energy = self.attn(torch.cat((hidden, encoder_output), 1)) 69 | energy = self.other.dot(energy) 70 | return energy 71 | 72 | ######################### DECODER LUONG ########################### 73 | 74 | class Decoder_luong(nn.Module): 75 | def __init__(self, attn_method, hidden_size, output_size, emb_size, n_layers=1, dropout=0.1, lang=None, USE_CUDA=False): 76 | 77 | super(Decoder_luong, self).__init__() 78 | 79 | self.attn_method = attn_method 80 | self.hidden_size = hidden_size 81 | self.output_size = output_size 82 | self.n_layers = n_layers 83 | self.dropout_p = dropout 84 | self.USE_CUDA = USE_CUDA 85 | self.lang = lang 86 | 87 | # (size of dictionary of embeddings, size of embedding vector) 88 | self.embedding = nn.Embedding(output_size, emb_size) 89 | # (input features: hidden_size + emb_size, hidden state features, number of layers) 90 | self.gru = nn.GRU(emb_size + hidden_size, hidden_size, n_layers, dropout=dropout) 91 | self.attn = Global_attn(attn_method, hidden_size, USE_CUDA) 92 | self.out = nn.Linear(hidden_size * 2, output_size) 93 | 94 | self.init_weights() 95 | 96 | def forward(self, word_input, last_context, last_hidden, encoder_outputs): 97 | ''' 98 | word_input: (seq_len, BS) 99 | last_context: (BS, encoder_hidden_size) 100 | last_hidden: (n_layers, BS, hidden_size) 101 | last_cell: (n_layers, BS, hidden_size) 102 | encoder_outputs: (seq_len, BS, encoder_hidden) 103 | < output: (BS, output_size) 104 | < attn_weights: (BS, 1, seq_len) 105 | ''' 106 | # This is run one step at a time 107 | 108 | # Get the embedding of the current input word (last output word) 109 | # word_input: (seq_len=1, batch_size), values in [0..output_size) 110 | word_embedded = self.embedding(word_input) #.view(1, 1, -1) 111 | # word_embedded: (seq_len=1, batch_size, embedding_size) 112 | 113 | # Combine embedded input word and last context, run through RNN 114 | # last_context: (batch_size, encoder_hidden_size) 115 | rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2) 116 | # rnn_input: (seq_len=1, batch_size, embedding_size + encoder_hidden_size) 117 | # last_hidden: (num_layers, batch_size, hidden_size) 118 | rnn_output, hidden = self.gru(rnn_input, last_hidden) 119 | # rnn_output: (seq_len=1, batch_size, hidden_size) 120 | # hidden: same 121 | 122 | # Calculate attention and apply to encoder outputs 123 | # encoder_outputs: (seq_len, batch_size, encoder_hidden_size) 124 | attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs) 125 | 126 | # Check softmax: 127 | # print('attn_weights sum: ', torch.sum(attn_weights.squeeze(), 1)) 128 | 129 | # attn_weights: (batch_size, x=1, seq_len) 130 | context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) 131 | # context: (batch_size, x=1, encoder_hidden_size) 132 | 133 | # Final output layer using hidden state and context vector 134 | rnn_output = rnn_output.squeeze(0) 135 | # rnn_output: (batch_size, hidden_size) 136 | context = context.squeeze(1) 137 | # context: (batch_size, encoder_hidden_size) 138 | output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)), 1) 139 | # output: (batch_size, output_size) 140 | # Check softmax (not log_softmax): 141 | # print('output sum: ', torch.sum(output.squeeze(), 1)) 142 | 143 | # Also return attention weights for visualization 144 | return output, context, hidden, attn_weights 145 | 146 | def init_weights(self): 147 | if self.lang: 148 | self.embedding.weight.data.copy_(self.lang.vocab.vectors) 149 | self.embedding.weight.requires_grad = False 150 | 151 | for name, param in self.gru.named_parameters(): 152 | if 'bias' in name: 153 | nn.init.constant_(param, 0.0) 154 | elif 'weight' in name: 155 | nn.init.xavier_normal_(param) 156 | self.out.bias.data.fill_(0) 157 | self.out.weight.data.uniform_(-0.1, 0.1) -------------------------------------------------------------------------------- /model/.ipynb_checkpoints/encoder-checkpoint.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | class Encoder(nn.Module): 10 | def __init__(self, input_size, hidden_size, emb_size, n_layers=2, dropout=0.1, lang=None, USE_CUDA=False): 11 | super(Encoder, self).__init__() 12 | 13 | self.input_size = input_size 14 | self.hidden_size = hidden_size 15 | self.n_layers = n_layers 16 | self.dropout = dropout 17 | self.USE_CUDA = USE_CUDA 18 | self.lang = lang 19 | 20 | self.embedding = nn.Embedding(input_size, emb_size) 21 | self.gru = nn.GRU(emb_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True) 22 | self.init_weights() 23 | 24 | def forward(self, input_seqs, hidden = None): 25 | embedded = self.embedding(input_seqs) 26 | 27 | self.gru.flatten_parameters() 28 | outputs, hidden = self.gru(embedded, hidden) 29 | 30 | outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs 31 | return outputs, hidden 32 | 33 | def init_hidden(self, batch_size): 34 | hidden = Variable(torch.zeros(2 * self.n_layers, batch_size, self.hidden_size)) 35 | if self.USE_CUDA: hidden = hidden.cuda() 36 | return hidden 37 | 38 | def init_weights(self): 39 | if self.lang: 40 | self.embedding.weight.data.copy_(self.lang.vocab.vectors) 41 | self.embedding.weight.required_grad = False 42 | 43 | for name, param in self.gru.named_parameters(): 44 | if 'bias' in name: 45 | nn.init.constant_(param, 0.0) 46 | elif 'weight' in name: 47 | nn.init.xavier_normal_(param) -------------------------------------------------------------------------------- /model/.ipynb_checkpoints/graph-checkpoint.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | class Sintactic_GCN(nn.Module): 10 | def __init__(self, num_inputs, num_units, 11 | num_labels, 12 | dropout = 0.2, 13 | in_arcs = True, 14 | out_arcs = True, 15 | batch_first = False, 16 | USE_CUDA=False): 17 | super(Sintactic_GCN, self).__init__() 18 | 19 | self.in_arcs = in_arcs 20 | self.out_arcs = out_arcs 21 | 22 | self.retain = 1. - dropout 23 | self.num_inputs = num_inputs 24 | self.num_units = num_units 25 | self.num_labels = num_labels 26 | self.batch_first = batch_first 27 | 28 | self.relu = nn.LeakyReLU() 29 | self.sigmoid = nn.Sigmoid() 30 | self.dropout_rate = dropout 31 | 32 | if in_arcs: 33 | self.V_in = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 34 | nn.init.xavier_normal_(self.V_in) 35 | 36 | self.b_in = Parameter(torch.FloatTensor(num_labels, self.num_units)) 37 | nn.init.constant_(self.b_in, 0) 38 | 39 | self.V_in_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 40 | nn.init.uniform_(self.V_in_gate) 41 | 42 | self.b_in_gate = Parameter(torch.FloatTensor(num_labels, 1)) 43 | nn.init.constant_(self.b_in_gate, 1) 44 | 45 | if out_arcs: 46 | self.V_out = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 47 | nn.init.xavier_normal_(self.V_out) 48 | 49 | self.b_out = Parameter(torch.FloatTensor(num_labels, self.num_units)) 50 | nn.init.constant_(self.b_in, 0) 51 | 52 | self.V_out_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 53 | nn.init.uniform_(self.V_out_gate) 54 | 55 | self.b_out_gate = Parameter(torch.FloatTensor(num_labels, 1)) 56 | nn.init.constant_(self.b_out_gate, 1) 57 | 58 | self.W_self_loop = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 59 | nn.init.xavier_normal_(self.W_self_loop) 60 | 61 | self.W_self_loop_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 62 | nn.init.uniform_(self.W_self_loop_gate) 63 | 64 | def forward(self, encoder_outputs, 65 | arc_tensor_in, arc_tensor_out, 66 | label_tensor_in, label_tensor_out, 67 | mask_in, mask_out, # batch* t, degree 68 | mask_loop): 69 | 70 | if(not self.batch_first): 71 | encoder_outputs = encoder_outputs.permute(1, 0, 2).contiguous() 72 | 73 | batch_size, seq_len, _ = encoder_outputs.shape 74 | input_ = encoder_outputs.view((batch_size * seq_len , self.num_inputs)) # [b* t, h] 75 | 76 | max_degree = 1 77 | if self.in_arcs: 78 | input_in = torch.mm(input_, self.V_in) # [b* t, h] * [h,h] = [b*t, h] 79 | first_in = input_in.index_select(0, arc_tensor_in[0] * seq_len + arc_tensor_in[1]) 80 | 81 | second_in = self.b_in.index_select(0, label_tensor_in.squeeze(0)) # [b* t* 1, h] 82 | in_ = (first_in + second_in).view((batch_size, seq_len, 1, self.num_units)) 83 | 84 | # compute gate weights 85 | input_in_gate = torch.mm(input_, self.V_in_gate) # [b* t, h] * [h,h] = [b*t, h] 86 | first_in_gate = input_in_gate.index_select(0, arc_tensor_in[0] * seq_len + arc_tensor_in[1]) 87 | 88 | second_in_gate = self.b_in_gate.index_select(0, label_tensor_in.squeeze(0)) 89 | in_gate = (input_in_gate + second_in_gate).view((batch_size, seq_len, 1)) 90 | 91 | max_degree += 1 92 | 93 | if self.out_arcs: 94 | input_out = torch.mm(input_, self.V_out) # [b* t, h] * [h,h] = [b* t, h] 95 | first_out = input_out.index_select(0, arc_tensor_out[0] * seq_len + arc_tensor_out[1]) 96 | 97 | second_out = self.b_out.index_select(0, label_tensor_out.squeeze(0)) 98 | 99 | degr = int(first_out.shape[0] / batch_size / seq_len) 100 | max_degree += degr 101 | 102 | out_ = (first_out + second_out).view((batch_size, seq_len, degr, self.num_units)) 103 | 104 | # compute gate weights 105 | input_out_gate = torch.mm(input_, self.V_out_gate) # [b* t, h] * [h,h] = [b* t, h] 106 | first_out_gate = input_out_gate.index_select(0, arc_tensor_out[0] * seq_len + arc_tensor_out[1]) 107 | 108 | second_out_gate = self.b_out_gate.index_select(0, label_tensor_out.squeeze(0)) 109 | 110 | out_gate = (first_out_gate + second_out_gate).view((batch_size, seq_len, degr)) 111 | 112 | same_input = torch.mm(encoder_outputs.view(-1,encoder_outputs.size(2)), self.W_self_loop).\ 113 | view(encoder_outputs.size(0), encoder_outputs.size(1), -1) 114 | same_input = same_input.view(encoder_outputs.size(0), encoder_outputs.size(1), 1, self.W_self_loop.size(1)) 115 | 116 | same_input_gate = torch.mm(encoder_outputs.view(-1, encoder_outputs.size(2)), self.W_self_loop_gate)\ 117 | .view(encoder_outputs.size(0), encoder_outputs.size(1), -1) 118 | 119 | if self.in_arcs and self.out_arcs: 120 | potentials = torch.cat((in_, out_, same_input), dim=2) # [b, t, mxdeg, h] 121 | potentials_gate = torch.cat((in_gate, out_gate, same_input_gate), dim=2) # [b, t, mxdeg, h] 122 | mask_soft = torch.cat((mask_in, mask_out, mask_loop), dim=1) # [b* t, mxdeg] 123 | elif self.out_arcs: 124 | potentials = torch.cat((out_, same_input), dim=2) # [b, t, 2*mxdeg+1, h] 125 | potentials_gate = torch.cat((out_gate, same_input_gate), dim=2) # [b, t, mxdeg, h] 126 | mask_soft = torch.cat((mask_out, mask_loop), dim=1) # [b* t, mxdeg] 127 | elif self.in_arcs: 128 | potentials = torch.cat((in_, same_input), dim=2) # [b, t, 2*mxdeg+1, h] 129 | potentials_gate = torch.cat((in_gate, same_input_gate), dim=2) # [b, t, mxdeg, h] 130 | mask_soft = torch.cat((mask_in, mask_loop), dim=1) # [b* t, mxdeg] 131 | 132 | potentials_ = potentials.permute(3, 0, 1, 2).contiguous() # [h, b, t, mxdeg] 133 | potentials_resh = potentials_.view((self.num_units, 134 | batch_size * seq_len, 135 | max_degree)) # [h, b * t, mxdeg] 136 | 137 | potentials_r = potentials_gate.view((batch_size * seq_len, 138 | max_degree)) # [h, b * t, mxdeg] 139 | # calculate the gate 140 | probs_det_ = self.sigmoid(potentials_r) * mask_soft # [b * t, mxdeg] 141 | potentials_masked = potentials_resh * mask_soft * probs_det_ # [h, b * t, mxdeg] 142 | 143 | 144 | #if self.retain == 1 or deterministic: 145 | # pass 146 | #else: 147 | # drop_mask = self._srng.binomial(potentials_resh.shape[1:], p=self.retain, dtype=input.dtype) 148 | # potentials_masked /= self.retain 149 | # potentials_masked *= drop_mask 150 | 151 | potentials_masked_ = potentials_masked.sum(dim=2) # [h, b * t] 152 | potentials_masked_ = self.relu(potentials_masked_) 153 | 154 | result_ = potentials_masked_.permute(1, 0).contiguous() # [b * t, h] 155 | result_ = F.dropout(result_, p=self.dropout_rate, training=self.training) 156 | 157 | if not self.batch_first: 158 | result_ = result_.view((seq_len, batch_size, self.num_units)) # [ b, t, h] 159 | else: 160 | result_ = result_.view((batch_size, seq_len, self.num_units)) 161 | 162 | 163 | return result_ 164 | -------------------------------------------------------------------------------- /model/__pycache__/decoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdrivas/Graph-convolutional/1fd05c42d082989c3fc54bcd2d317c40d4ebe73d/model/__pycache__/decoder.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/encoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdrivas/Graph-convolutional/1fd05c42d082989c3fc54bcd2d317c40d4ebe73d/model/__pycache__/encoder.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/graph.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kdrivas/Graph-convolutional/1fd05c42d082989c3fc54bcd2d317c40d4ebe73d/model/__pycache__/graph.cpython-36.pyc -------------------------------------------------------------------------------- /model/decoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | ######################### ATTENTION ########################### 10 | 11 | class Global_attn(nn.Module): 12 | def __init__(self, method, hidden_size, USE_CUDA=False): 13 | super(Global_attn, self).__init__() 14 | 15 | self.method = method 16 | self.hidden_size = hidden_size 17 | self.USE_CUDA = USE_CUDA 18 | 19 | if self.method == 'general': 20 | self.attn = nn.Linear(self.hidden_size, hidden_size) 21 | 22 | elif self.method == 'concat': 23 | self.attn = nn.Linear(self.hidden_size * 2, hidden_size) 24 | self.other = nn.Parameter(torch.FloatTensor(1, hidden_size)) 25 | 26 | def forward(self, hidden, encoder_outputs): 27 | ''' 28 | hidden: (BS, hidden_size) 29 | encoder_outputs(seq_len, BS, encoder_hidden_size) 30 | ''' 31 | # encoder_outputs: (seq_len, batch_size, encoder_hidden_size) 32 | seq_len = len(encoder_outputs) 33 | batch_size = encoder_outputs.shape[1] 34 | 35 | # Calculate attention energies for each encoder output 36 | # attn_energies: (seq_len, batch_size) 37 | # hidden: (batch_size, hidden_size) 38 | attn_energies = Variable(torch.zeros(seq_len, batch_size)) 39 | if self.USE_CUDA: attn_energies = attn_energies.cuda() 40 | for i in range(seq_len): 41 | attn_energies[i] = self.score(hidden, encoder_outputs[i]) 42 | 43 | # Normalize energies [0-1] and resize to (batch_size, x=1, seq_len) 44 | return F.softmax(attn_energies, 0).transpose(0, 1).unsqueeze(1) 45 | 46 | def score(self, hidden, encoder_output): 47 | # hidden: (batch_size, hidden_size) 48 | # encoder_output: (batch_size, encoder_hidden_size) 49 | 50 | # hidden sizes must match, batch_size = 1 only 51 | if self.method == 'dot': 52 | # batch element-wise dot product 53 | energy = torch.bmm(hidden.unsqueeze(1), 54 | encoder_output.unsqueeze(2)).squeeze().squeeze() 55 | # energy = hidden.dot(encoder_output) 56 | return energy 57 | 58 | elif self.method == 'general': 59 | energy = self.attn(encoder_output) 60 | # batch element-wise dot product 61 | energy = torch.bmm(hidden.unsqueeze(1), 62 | encoder_output.unsqueeze(2)).squeeze().squeeze() 63 | # energy = hidden.dot(energy) 64 | return energy 65 | 66 | # TODO: test / modify method to support batch size > 1 67 | elif self.method == 'concat': 68 | energy = self.attn(torch.cat((hidden, encoder_output), 1)) 69 | energy = self.other.dot(energy) 70 | return energy 71 | 72 | ######################### DECODER LUONG ########################### 73 | 74 | class Decoder_luong(nn.Module): 75 | def __init__(self, attn_method, hidden_size, output_size, emb_size, n_layers=1, dropout=0.1, lang=None, USE_CUDA=False): 76 | 77 | super(Decoder_luong, self).__init__() 78 | 79 | self.attn_method = attn_method 80 | self.hidden_size = hidden_size 81 | self.output_size = output_size 82 | self.n_layers = n_layers 83 | self.dropout_p = dropout 84 | self.USE_CUDA = USE_CUDA 85 | self.lang = lang 86 | 87 | # (size of dictionary of embeddings, size of embedding vector) 88 | self.embedding = nn.Embedding(output_size, emb_size) 89 | # (input features: hidden_size + emb_size, hidden state features, number of layers) 90 | self.gru = nn.GRU(emb_size + hidden_size, hidden_size, n_layers, dropout=dropout) 91 | self.attn = Global_attn(attn_method, hidden_size, USE_CUDA) 92 | self.out = nn.Linear(hidden_size * 2, output_size) 93 | 94 | self.init_weights() 95 | 96 | def forward(self, word_input, last_context, last_hidden, encoder_outputs): 97 | ''' 98 | word_input: (seq_len, BS) 99 | last_context: (BS, encoder_hidden_size) 100 | last_hidden: (n_layers, BS, hidden_size) 101 | last_cell: (n_layers, BS, hidden_size) 102 | encoder_outputs: (seq_len, BS, encoder_hidden) 103 | < output: (BS, output_size) 104 | < attn_weights: (BS, 1, seq_len) 105 | ''' 106 | # This is run one step at a time 107 | 108 | # Get the embedding of the current input word (last output word) 109 | # word_input: (seq_len=1, batch_size), values in [0..output_size) 110 | word_embedded = self.embedding(word_input) #.view(1, 1, -1) 111 | # word_embedded: (seq_len=1, batch_size, embedding_size) 112 | 113 | # Combine embedded input word and last context, run through RNN 114 | # last_context: (batch_size, encoder_hidden_size) 115 | rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2) 116 | # rnn_input: (seq_len=1, batch_size, embedding_size + encoder_hidden_size) 117 | # last_hidden: (num_layers, batch_size, hidden_size) 118 | rnn_output, hidden = self.gru(rnn_input, last_hidden) 119 | # rnn_output: (seq_len=1, batch_size, hidden_size) 120 | # hidden: same 121 | 122 | # Calculate attention and apply to encoder outputs 123 | # encoder_outputs: (seq_len, batch_size, encoder_hidden_size) 124 | attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs) 125 | 126 | # Check softmax: 127 | # print('attn_weights sum: ', torch.sum(attn_weights.squeeze(), 1)) 128 | 129 | # attn_weights: (batch_size, x=1, seq_len) 130 | context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) 131 | # context: (batch_size, x=1, encoder_hidden_size) 132 | 133 | # Final output layer using hidden state and context vector 134 | rnn_output = rnn_output.squeeze(0) 135 | # rnn_output: (batch_size, hidden_size) 136 | context = context.squeeze(1) 137 | # context: (batch_size, encoder_hidden_size) 138 | output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)), 1) 139 | # output: (batch_size, output_size) 140 | # Check softmax (not log_softmax): 141 | # print('output sum: ', torch.sum(output.squeeze(), 1)) 142 | 143 | # Also return attention weights for visualization 144 | return output, context, hidden, attn_weights 145 | 146 | def init_weights(self): 147 | if self.lang: 148 | self.embedding.weight.data.copy_(self.lang.vocab.vectors) 149 | self.embedding.weight.requires_grad = False 150 | 151 | for name, param in self.gru.named_parameters(): 152 | if 'bias' in name: 153 | nn.init.constant_(param, 0.0) 154 | elif 'weight' in name: 155 | nn.init.xavier_normal_(param) 156 | self.out.bias.data.fill_(0) 157 | self.out.weight.data.uniform_(-0.1, 0.1) -------------------------------------------------------------------------------- /model/encoder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | class Encoder(nn.Module): 10 | def __init__(self, input_size, hidden_size, emb_size, n_layers=2, dropout=0.1, lang=None, USE_CUDA=False): 11 | super(Encoder, self).__init__() 12 | 13 | self.input_size = input_size 14 | self.hidden_size = hidden_size 15 | self.n_layers = n_layers 16 | self.dropout = dropout 17 | self.USE_CUDA = USE_CUDA 18 | self.lang = lang 19 | 20 | self.embedding = nn.Embedding(input_size, emb_size) 21 | self.gru = nn.GRU(emb_size, hidden_size, n_layers, dropout=self.dropout, bidirectional=True) 22 | self.init_weights() 23 | 24 | def forward(self, input_seqs, hidden = None): 25 | embedded = self.embedding(input_seqs) 26 | 27 | self.gru.flatten_parameters() 28 | outputs, hidden = self.gru(embedded, hidden) 29 | 30 | outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] # Sum bidirectional outputs 31 | return outputs, hidden 32 | 33 | def init_hidden(self, batch_size): 34 | hidden = Variable(torch.zeros(2 * self.n_layers, batch_size, self.hidden_size)) 35 | if self.USE_CUDA: hidden = hidden.cuda() 36 | return hidden 37 | 38 | def init_weights(self): 39 | if self.lang: 40 | self.embedding.weight.data.copy_(self.lang.vocab.vectors) 41 | self.embedding.weight.required_grad = False 42 | 43 | for name, param in self.gru.named_parameters(): 44 | if 'bias' in name: 45 | nn.init.constant_(param, 0.0) 46 | elif 'weight' in name: 47 | nn.init.xavier_normal_(param) -------------------------------------------------------------------------------- /model/graph.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from torch.nn import Parameter 8 | 9 | class Sintactic_GCN(nn.Module): 10 | def __init__(self, num_inputs, num_units, 11 | num_labels, 12 | dropout = 0.2, 13 | in_arcs = True, 14 | out_arcs = True, 15 | batch_first = False, 16 | USE_CUDA=False): 17 | super(Sintactic_GCN, self).__init__() 18 | 19 | self.in_arcs = in_arcs 20 | self.out_arcs = out_arcs 21 | 22 | self.retain = 1. - dropout 23 | self.num_inputs = num_inputs 24 | self.num_units = num_units 25 | self.num_labels = num_labels 26 | self.batch_first = batch_first 27 | 28 | self.relu = nn.LeakyReLU() 29 | self.sigmoid = nn.Sigmoid() 30 | self.dropout_rate = dropout 31 | 32 | if in_arcs: 33 | self.V_in = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 34 | nn.init.xavier_normal_(self.V_in) 35 | 36 | self.b_in = Parameter(torch.FloatTensor(num_labels, self.num_units)) 37 | nn.init.constant_(self.b_in, 0) 38 | 39 | self.V_in_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 40 | nn.init.uniform_(self.V_in_gate) 41 | 42 | self.b_in_gate = Parameter(torch.FloatTensor(num_labels, 1)) 43 | nn.init.constant_(self.b_in_gate, 1) 44 | 45 | if out_arcs: 46 | self.V_out = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 47 | nn.init.xavier_normal_(self.V_out) 48 | 49 | self.b_out = Parameter(torch.FloatTensor(num_labels, self.num_units)) 50 | nn.init.constant_(self.b_in, 0) 51 | 52 | self.V_out_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 53 | nn.init.uniform_(self.V_out_gate) 54 | 55 | self.b_out_gate = Parameter(torch.FloatTensor(num_labels, 1)) 56 | nn.init.constant_(self.b_out_gate, 1) 57 | 58 | self.W_self_loop = Parameter(torch.FloatTensor(self.num_inputs, self.num_units)) 59 | nn.init.xavier_normal_(self.W_self_loop) 60 | 61 | self.W_self_loop_gate = Parameter(torch.FloatTensor(self.num_inputs, 1)) 62 | nn.init.uniform_(self.W_self_loop_gate) 63 | 64 | def forward(self, encoder_outputs, 65 | arc_tensor_in, arc_tensor_out, 66 | label_tensor_in, label_tensor_out, 67 | mask_in, mask_out, # batch* t, degree 68 | mask_loop): 69 | 70 | if(not self.batch_first): 71 | encoder_outputs = encoder_outputs.permute(1, 0, 2).contiguous() 72 | 73 | batch_size, seq_len, _ = encoder_outputs.shape 74 | input_ = encoder_outputs.view((batch_size * seq_len , self.num_inputs)) # [b* t, h] 75 | 76 | max_degree = 1 77 | if self.in_arcs: 78 | input_in = torch.mm(input_, self.V_in) # [b* t, h] * [h,h] = [b*t, h] 79 | first_in = input_in.index_select(0, arc_tensor_in[0] * seq_len + arc_tensor_in[1]) 80 | 81 | second_in = self.b_in.index_select(0, label_tensor_in.squeeze(0)) # [b* t* 1, h] 82 | in_ = (first_in + second_in).view((batch_size, seq_len, 1, self.num_units)) 83 | 84 | # compute gate weights 85 | input_in_gate = torch.mm(input_, self.V_in_gate) # [b* t, h] * [h,h] = [b*t, h] 86 | first_in_gate = input_in_gate.index_select(0, arc_tensor_in[0] * seq_len + arc_tensor_in[1]) 87 | 88 | second_in_gate = self.b_in_gate.index_select(0, label_tensor_in.squeeze(0)) 89 | in_gate = (input_in_gate + second_in_gate).view((batch_size, seq_len, 1)) 90 | 91 | max_degree += 1 92 | 93 | if self.out_arcs: 94 | input_out = torch.mm(input_, self.V_out) # [b* t, h] * [h,h] = [b* t, h] 95 | first_out = input_out.index_select(0, arc_tensor_out[0] * seq_len + arc_tensor_out[1]) 96 | 97 | second_out = self.b_out.index_select(0, label_tensor_out.squeeze(0)) 98 | 99 | degr = int(first_out.shape[0] / batch_size / seq_len) 100 | max_degree += degr 101 | 102 | out_ = (first_out + second_out).view((batch_size, seq_len, degr, self.num_units)) 103 | 104 | # compute gate weights 105 | input_out_gate = torch.mm(input_, self.V_out_gate) # [b* t, h] * [h,h] = [b* t, h] 106 | first_out_gate = input_out_gate.index_select(0, arc_tensor_out[0] * seq_len + arc_tensor_out[1]) 107 | 108 | second_out_gate = self.b_out_gate.index_select(0, label_tensor_out.squeeze(0)) 109 | 110 | out_gate = (first_out_gate + second_out_gate).view((batch_size, seq_len, degr)) 111 | 112 | same_input = torch.mm(encoder_outputs.view(-1,encoder_outputs.size(2)), self.W_self_loop).\ 113 | view(encoder_outputs.size(0), encoder_outputs.size(1), -1) 114 | same_input = same_input.view(encoder_outputs.size(0), encoder_outputs.size(1), 1, self.W_self_loop.size(1)) 115 | 116 | same_input_gate = torch.mm(encoder_outputs.view(-1, encoder_outputs.size(2)), self.W_self_loop_gate)\ 117 | .view(encoder_outputs.size(0), encoder_outputs.size(1), -1) 118 | 119 | if self.in_arcs and self.out_arcs: 120 | potentials = torch.cat((in_, out_, same_input), dim=2) # [b, t, mxdeg, h] 121 | potentials_gate = torch.cat((in_gate, out_gate, same_input_gate), dim=2) # [b, t, mxdeg, h] 122 | mask_soft = torch.cat((mask_in, mask_out, mask_loop), dim=1) # [b* t, mxdeg] 123 | elif self.out_arcs: 124 | potentials = torch.cat((out_, same_input), dim=2) # [b, t, 2*mxdeg+1, h] 125 | potentials_gate = torch.cat((out_gate, same_input_gate), dim=2) # [b, t, mxdeg, h] 126 | mask_soft = torch.cat((mask_out, mask_loop), dim=1) # [b* t, mxdeg] 127 | elif self.in_arcs: 128 | potentials = torch.cat((in_, same_input), dim=2) # [b, t, 2*mxdeg+1, h] 129 | potentials_gate = torch.cat((in_gate, same_input_gate), dim=2) # [b, t, mxdeg, h] 130 | mask_soft = torch.cat((mask_in, mask_loop), dim=1) # [b* t, mxdeg] 131 | 132 | potentials_ = potentials.permute(3, 0, 1, 2).contiguous() # [h, b, t, mxdeg] 133 | potentials_resh = potentials_.view((self.num_units, 134 | batch_size * seq_len, 135 | max_degree)) # [h, b * t, mxdeg] 136 | 137 | potentials_r = potentials_gate.view((batch_size * seq_len, 138 | max_degree)) # [h, b * t, mxdeg] 139 | # calculate the gate 140 | probs_det_ = self.sigmoid(potentials_r) * mask_soft # [b * t, mxdeg] 141 | potentials_masked = potentials_resh * mask_soft * probs_det_ # [h, b * t, mxdeg] 142 | 143 | 144 | #if self.retain == 1 or deterministic: 145 | # pass 146 | #else: 147 | # drop_mask = self._srng.binomial(potentials_resh.shape[1:], p=self.retain, dtype=input.dtype) 148 | # potentials_masked /= self.retain 149 | # potentials_masked *= drop_mask 150 | 151 | potentials_masked_ = potentials_masked.sum(dim=2) # [h, b * t] 152 | potentials_masked_ = self.relu(potentials_masked_) 153 | 154 | result_ = potentials_masked_.permute(1, 0).contiguous() # [b * t, h] 155 | result_ = F.dropout(result_, p=self.dropout_rate, training=self.training) 156 | 157 | if not self.batch_first: 158 | result_ = result_.view((seq_len, batch_size, self.num_units)) # [ b, t, h] 159 | else: 160 | result_ = result_.view((batch_size, seq_len, self.num_units)) 161 | 162 | 163 | return result_ 164 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import math 4 | import matplotlib.pyplot as plt 5 | 6 | def as_minutes(s): 7 | m = math.floor(s / 60) 8 | s -= m * 60 9 | return '%dm %ds' % (m, s) 10 | 11 | def time_since(since, percent): 12 | now = time.time() 13 | s = now - since 14 | es = s / (percent) 15 | rs = es - s 16 | return '%s (- %s)' % (as_minutes(s), as_minutes(rs)) 17 | 18 | def show_plot(points): 19 | plt.figure() 20 | fig, ax = plt.subplots() 21 | loc = ticker.MultipleLocator(base=1) # put ticks at regular intervals 22 | ax.yaxis.set_major_locator(loc) 23 | plt.plot(points) 24 | 25 | def plot_losses(train_loss, val_loss, scale): 26 | plt.figure(figsize=(10,5)) 27 | plt.plot(train_loss) 28 | plt.plot([(x + 1) * scale - 1 for x in range(len(val_loss))], val_loss) 29 | plt.legend(['train loss', 'validation loss']) 30 | -------------------------------------------------------------------------------- /validation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import functional 4 | from torch.autograd import Variable 5 | from torch import optim 6 | import torch.nn.functional as F 7 | import numpy as np 8 | import re 9 | from tqdm import tqdm 10 | from data import generate_batches 11 | 12 | class Beam(): 13 | def __init__(self, decoder_input, decoder_context, decoder_hidden, 14 | decoded_words=[], decoder_attentions=[], sequence_log_probs=[], decoded_index=[]): 15 | self.decoded_words = decoded_words 16 | self.decoded_index = decoded_index 17 | self.decoder_attentions = decoder_attentions 18 | self.sequence_log_probs = sequence_log_probs 19 | self.decoder_input = decoder_input 20 | self.decoder_context = decoder_context 21 | self.decoder_hidden = decoder_hidden 22 | 23 | class Evaluator(): 24 | def __init__(self, encoder, decoder, gcn1, gcn2, input_lang, output_lang, max_length, USE_CUDA): 25 | self.encoder = encoder 26 | self.decoder = decoder 27 | self.input_lang = input_lang 28 | self.output_lang = output_lang 29 | self.max_length = max_length 30 | self.USE_CUDA = USE_CUDA 31 | self.gcn1 = gcn1 32 | self.gcn2 = gcn2 33 | 34 | def evaluate(self, input_batch, k_beams, testing_luong=True): 35 | 36 | [input_var, adj_arc_in, adj_arc_out, adj_lab_in, adj_lab_out, mask_in, mask_out, mask_loop] = input_batch 37 | input_length = input_var.shape[0] 38 | 39 | encoder_hidden = self.encoder.init_hidden(1) 40 | encoder_outputs, encoder_hidden = self.encoder(input_var, encoder_hidden) 41 | 42 | if self.gcn1: 43 | encoder_outputs = self.gcn1(encoder_outputs, 44 | adj_arc_in, adj_arc_out, 45 | adj_lab_in, adj_lab_out, 46 | mask_in, mask_out, 47 | mask_loop) 48 | if self.gcn2: 49 | encoder_outputs = self.gcn2(encoder_outputs, 50 | adj_arc_in, adj_arc_out, 51 | adj_lab_in, adj_lab_out, 52 | mask_in, mask_out, 53 | mask_loop) 54 | 55 | if testing_luong: 56 | decoder_input = Variable(torch.LongTensor([[self.output_lang.vocab.stoi['']]])) 57 | else: 58 | decoder_input = Variable(torch.LongTensor([self.output_lang.vocab.stoi['']])) 59 | 60 | decoder_context = Variable(torch.zeros(1, self.decoder.hidden_size)) 61 | decoder_hidden = encoder_hidden 62 | 63 | if self.USE_CUDA: 64 | decoder_input = decoder_input.cuda() 65 | decoder_context = decoder_context.cuda() 66 | 67 | decoded_words = [] 68 | decoder_attentions = torch.zeros(self.max_length, self.max_length) 69 | 70 | beams = [Beam(decoder_input, decoder_context, decoder_hidden)] 71 | top_beams = [] 72 | 73 | # Use decoder output as inputs 74 | for di in range(input_length): 75 | new_beams = [] 76 | for beam in beams: 77 | decoder_output, decoder_context, decoder_hidden, decoder_attention = self.decoder( 78 | beam.decoder_input, beam.decoder_context, beam.decoder_hidden, encoder_outputs) 79 | 80 | # Beam search, take the top k with highest probability 81 | topv, topi = decoder_output.data.topk(k_beams) 82 | 83 | for ni, vi in zip(topi[0], topv[0]): 84 | new_beam = Beam(None, decoder_context, decoder_hidden, 85 | beam.decoded_words[:], beam.decoder_attentions[:], beam.sequence_log_probs[:]) 86 | new_beam.decoder_attentions.append(decoder_attention.squeeze().cpu().data) 87 | new_beam.sequence_log_probs.append(vi) 88 | 89 | if ni == self.output_lang.vocab.stoi[''] or ni == self.output_lang.vocab.stoi['']: 90 | new_beam.decoded_words.append('') 91 | top_beams.append(new_beam) 92 | 93 | else: 94 | new_beam.decoded_words.append(self.output_lang.vocab.itos[ni]) 95 | 96 | if testing_luong: 97 | decoder_input = Variable(torch.LongTensor([[ni]])) 98 | else: 99 | decoder_input = Variable(torch.LongTensor([ni])) 100 | if self.USE_CUDA: decoder_input = decoder_input.cuda() 101 | 102 | new_beam.decoder_input = decoder_input 103 | new_beams.append(new_beam) 104 | 105 | new_beams = {beam: np.mean(beam.sequence_log_probs) for beam in new_beams} 106 | beams = sorted(new_beams, key=new_beams.get, reverse=True)[:k_beams] 107 | 108 | if len(beams) == 0: 109 | break 110 | 111 | if len(top_beams) != 0: 112 | top_beams = {beam: np.mean(beam.sequence_log_probs) for beam in top_beams} 113 | else: 114 | top_beams = {beam: np.mean(beam.sequence_log_probs) for beam in new_beams} 115 | 116 | top_beams = sorted(top_beams, key=top_beams.get, reverse=True)[:k_beams] 117 | 118 | decoded_words = top_beams[0].decoded_words 119 | 120 | return decoded_words, top_beams 121 | 122 | def evaluate_sentence(self, input_batch, k_beams=3): 123 | output_words, beams = self.evaluate(input_batch, k_beams) 124 | output_sentence = ' '.join(output_words) 125 | 126 | print('>', sentence) 127 | print('<', output_sentence) 128 | print('') 129 | 130 | def ref_to_string(self, reference): 131 | aux = '' 132 | for i in range(len(reference)): 133 | aux += self.output_lang.vocab.itos[reference[i]] + ' ' 134 | return aux.strip() 135 | 136 | def get_candidates_and_references(self, pairs, arr_dep, k_beams=3): 137 | input_batches, _ = generate_batches(self.input_lang, self.output_lang, 1, pairs, return_dep_tree=True, arr_dep=arr_dep, max_degree=10, USE_CUDA=self.USE_CUDA) 138 | 139 | candidates = [self.evaluate(input_batch, k_beams)[0] for input_batch in tqdm(input_batches)] 140 | candidates = [' '.join(candidate[:-1]) for candidate in candidates] 141 | references = pairs[:,1] 142 | references = [self.ref_to_string(reference) for reference in references] 143 | return candidates, references --------------------------------------------------------------------------------