├── .gitignore ├── README.md ├── checkpoints └── model.pt ├── data.py ├── embedding.py ├── encoder.py ├── eval.py ├── img ├── a.100.pdf ├── a.1000.pdf ├── a.1100.pdf ├── a.1200.pdf ├── a.200.pdf ├── a.300.pdf ├── a.400.pdf ├── a.500.pdf ├── a.600.pdf ├── a.700.pdf ├── a.800.pdf ├── a.900.pdf ├── a.pdf └── gold.pdf ├── lisa-modules.sh ├── log ├── acc.train.csv ├── acc.val.csv ├── loss.train.csv ├── no-pos.pdf └── with-pos.pdf ├── main.py ├── model.py ├── mst.py ├── nn.py ├── optimizer.py ├── predict.py ├── preprocess.py ├── train.py ├── transformer.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | 3 | reading 4 | vocab 5 | csv 6 | models 7 | 8 | *.ipynb 9 | *.txt 10 | *.conll 11 | *.pl 12 | 13 | .remote-sync.json 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Biaffine dependency parser 2 | A PyTorch implementation of the neural dependency parser described in [Deep Biaffine Attention for Neural Dependency Parsing](https://arxiv.org/abs/1611.01734). 3 | 4 | ## Data 5 | You can train on the Penn Treebank, converted to [Stanford Dependencies](https://nlp.stanford.edu/software/stanford-dependencies.shtml). We assume you have the PTB in standard train/dev/test splits in conll-format, stored somewhere in one directory, and that they are named `train.conll`, `dev.conll`, `test.conll`. 6 | 7 | ## Usage 8 | First, extract a vocabulary: 9 | ```bash 10 | mkdir vocab 11 | ./preprocess.py --data your/ptb/conll/dir --out vocab 12 | ``` 13 | 14 | Then, train a default model with the following arguments: 15 | ```bash 16 | mkdir log checkpoints 17 | ./main.py train --data your/ptb/conll/dir 18 | ``` 19 | Training can be exited at any moment with Control-C and the current model will be evaluated on the development-set. 20 | 21 | ### Arguments 22 | The following options are available: 23 | ``` 24 | usage: main.py {train,predict} [...] 
25 | 26 | Biaffine graph-based dependency parser 27 | 28 | positional arguments: 29 | {train,predict} 30 | 31 | optional arguments: 32 | -h, --help show this help message and exit 33 | 34 | Data: 35 | --data DATA location of the data corpus 36 | --vocab VOCAB location of the preprocessed vocabulary 37 | --disable-length-ordered 38 | do not order sentences by length so batches have more 39 | padding 40 | 41 | Embedding options: 42 | --use-glove use pretrained glove embeddings 43 | --use-chars use character level word embeddings 44 | --char-encoder {rnn,cnn,transformer} 45 | type of character encoder used for word embeddings 46 | --filter-factor FILTER_FACTOR 47 | controls output size of cnn character embedding 48 | --disable-words do not use words as input 49 | --disable-tags do not use tags as input 50 | --word-emb-dim WORD_EMB_DIM 51 | size of word embeddings 52 | --tag-emb-dim TAG_EMB_DIM 53 | size of tag embeddings 54 | --emb-dropout EMB_DROPOUT 55 | dropout used on embeddings 56 | 57 | Encoder options: 58 | --encoder {rnn,cnn,transformer,none} 59 | type of sentence encoder used 60 | 61 | RNN options: 62 | --rnn-type {RNN,GRU,LSTM} 63 | type of rnn 64 | --rnn-hidden RNN_HIDDEN 65 | number of hidden units in rnn 66 | --rnn-num-layers RNN_NUM_LAYERS 67 | number of layers 68 | --batch-first BATCH_FIRST 69 | number of layers 70 | --rnn-dropout RNN_DROPOUT 71 | dropout used in rnn 72 | 73 | CNN options: 74 | --cnn-num-layers CNN_NUM_LAYERS 75 | number convolutions 76 | --kernel-size KERNEL_SIZE 77 | size of convolution kernel 78 | --cnn-dropout CNN_DROPOUT 79 | dropout used in cnn 80 | 81 | Transformer options: 82 | --N N transformer options 83 | --d-model D_MODEL transformer options 84 | --d-ff D_FF transformer options 85 | --h H transformer options 86 | --trans-dropout TRANS_DROPOUT 87 | dropout used in transformer 88 | 89 | Biaffine classifier arguments: 90 | --mlp-arc-hidden MLP_ARC_HIDDEN 91 | number of hidden units in arc MLP 92 | --mlp-lab-hidden MLP_LAB_HIDDEN 93 | number of hidden units in label MLP 94 | --mlp-dropout MLP_DROPOUT 95 | dropout used in mlps 96 | 97 | Training arguments: 98 | --multi-gpu enable training on multiple GPUs 99 | --lr LR initial learning rate 100 | --epochs EPOCHS number of epochs of training 101 | --batch-size BATCH_SIZE 102 | batch size 103 | --seed SEED random seed 104 | --disable-cuda disable cuda 105 | --print-every PRINT_EVERY 106 | report interval 107 | --plot-every PLOT_EVERY 108 | plot interval 109 | --logdir LOGDIR directory to log losses 110 | --checkpoints CHECKPOINTS 111 | path to save the final model 112 | ``` 113 | 114 | ## Requirements 115 | ``` 116 | python>=3.6.0 117 | torch>=0.3.0 118 | numpy 119 | ``` 120 | 121 | ## TODO 122 | - [x] Add MST algorithm for decoding. 123 | - [x] Write predicted parses to conll file. 124 | - [x] A couple of full runs of the model for results. 125 | - [x] Enable multi-GPU training 126 | - [x] Work on character-level embedding of words (CNN or LSTM). 127 | - [x] Implement RNN options: RNN, GRU, (RAN?) 128 | - [x] Character level word embeddings: CNN 129 | - [x] Character level word embeddings: RNN 130 | - [x] Different encoder: [Transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html). 131 | - [x] Different encoder: CNN (again see [spaCy's parser](https://spacy.io/api/)). 132 | - [ ] Label loss converges very fast, which maybe hurts the arc accuracy? 133 | - [ ] Perform some ablation experiments. 134 | - [ ] Disable input POS-tags at prediction time but train with them using mutli-task learning. 
See [spaCy's parser](https://spacy.io/api/) and these papers that it is based on: [Stack-propagation: Improved Representation Learning for Syntax](https://arxiv.org/pdf/1603.06598.pdf) and [Deep multi-task learning with low level tasks supervised at lower layers](http://anthology.aclweb.org/P16-2038). 135 | - [ ] Load pretrained GloVe embeddings. 136 | -------------------------------------------------------------------------------- /checkpoints/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/checkpoints/model.pt -------------------------------------------------------------------------------- /data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import string 3 | from collections import defaultdict 4 | import numpy as np 5 | 6 | import torch 7 | from torch.autograd import Variable 8 | 9 | PAD_TOKEN = '' 10 | PAD_TAG = 'PAD' 11 | PAD_LABEL = '_pad_' 12 | PAD_INDEX = 0 13 | 14 | UNK_TOKEN = '' 15 | UNK_TAG = 'UNK' 16 | UNK_LABEL = '_unk_' 17 | UNK_INDEX = 1 18 | 19 | ROOT_TOKEN = '' 20 | ROOT_TAG = 'ROOT' 21 | ROOT_LABEL = '_root_' 22 | ROOT_INDEX = 2 23 | 24 | def wrap(batch): 25 | """Packages the batch as a Variable containing a LongTensor.""" 26 | return Variable(torch.LongTensor(batch)) 27 | 28 | def pad(batch, char=False): 29 | """Pad a batch of irregular length indices. 30 | 31 | Returns: 32 | Variable so it is ready as input for a PyTorch model. 33 | """ 34 | # If character input then we first need to pad the individual words 35 | # before we can pad the sentences. 36 | if char: 37 | max_word_len = max(map(len, [w for sent in batch for w in sent])) 38 | new_batch = [] 39 | for sent in batch: 40 | lens = list(map(len, sent)) 41 | new_sent = [] 42 | for k, word in zip(lens, sent): 43 | padded = word + (max_word_len - k)*[PAD_INDEX] 44 | new_sent.append(padded) 45 | new_batch.append(new_sent) 46 | batch = new_batch 47 | # Padding the sentences is then the same for both cases. 
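    # Illustration (added, not in the original source): with char=False and PAD_INDEX == 0,
    # pad([[3, 4], [5]]) returns a Variable wrapping LongTensor([[3, 4], [5, 0]]);
    # with char=True each word is first padded to the longest word, and sentences
    # are then padded with all-PAD "words".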
48 | pad_word = max_word_len*[PAD_INDEX] if char else PAD_INDEX 49 | lens = list(map(len, batch)) 50 | max_len = max(lens) 51 | padded_batch = [] 52 | for k, seq in zip(lens, batch): 53 | padded = seq + (max_len - k)*[pad_word] 54 | padded_batch.append(padded) 55 | return wrap(padded_batch) 56 | 57 | class Dictionary: 58 | """A dependency parse dictionary.""" 59 | def __init__(self, path, char=False): 60 | self.w2i = defaultdict(lambda: UNK_INDEX) 61 | self.t2i = defaultdict(lambda: UNK_INDEX) 62 | self.l2i = defaultdict(lambda: UNK_INDEX) 63 | 64 | self.i2w = defaultdict(lambda: UNK_TOKEN) 65 | self.i2t = defaultdict(lambda: UNK_TAG) 66 | self.i2l = defaultdict(lambda: UNK_LABEL) 67 | 68 | self.add_word(PAD_TOKEN) 69 | self.add_word(UNK_TOKEN) 70 | self.add_word(ROOT_TOKEN) 71 | 72 | self.add_tag(PAD_TAG) 73 | self.add_tag(UNK_TAG) 74 | self.add_tag(ROOT_TAG) 75 | 76 | self.add_label(PAD_LABEL) 77 | self.add_label(UNK_LABEL) 78 | self.add_label(ROOT_LABEL) 79 | 80 | self.char = char 81 | 82 | self.read(path) 83 | 84 | def add_word(self, word, processed_word=None, unk=False): 85 | if word not in self.w2i: 86 | if unk: 87 | self.i2w[UNK_INDEX] = UNK_TOKEN 88 | self.w2i[word] = UNK_INDEX 89 | else: 90 | i = len(self.i2w) 91 | self.i2w[i] = word 92 | self.w2i[word] = i 93 | 94 | def add_tag(self, tag): 95 | if tag not in self.t2i: 96 | i = len(self.i2t) 97 | self.i2t[i] = tag 98 | self.t2i[tag] = i 99 | 100 | def add_label(self, label): 101 | if label not in self.l2i: 102 | i = len(self.i2l) 103 | self.i2l[i] = label 104 | self.l2i[label] = i 105 | 106 | def read(self, path): 107 | with open(path + ".words.txt", 'r') as f: 108 | if self.char: 109 | chars = set(f.read()) 110 | printable = set(string.printable) 111 | chars = list(chars | printable) 112 | for char in chars: 113 | self.add_word(char, char, unk=False) 114 | else: 115 | for line in f: 116 | word, processed_word, _ = line.split() 117 | unk = bool(word != processed_word) 118 | self.add_word(word, processed_word, unk=unk) 119 | with open(path + ".tags.txt", 'r') as f: 120 | for line in f: 121 | tag, _ = line.split() 122 | self.add_tag(tag) 123 | with open(path + ".labels.txt", 'r') as f: 124 | for line in f: 125 | label, _ = line.split() 126 | self.add_label(label) 127 | 128 | class Data: 129 | """A dependency parse dataset.""" 130 | def __init__(self, path, dictionary, char=False): 131 | self.words = [] 132 | self.tags = [] 133 | self.heads = [] 134 | self.labels = [] 135 | self.lengths = [] 136 | 137 | self.char = char 138 | 139 | self.read(path, dictionary) 140 | 141 | def read(self, path, dictionary): 142 | with open(path, 'r') as f: 143 | ws, ts, hs, ls, n = self.newline() 144 | for line in f: 145 | fields = line.split() 146 | if fields: 147 | w, t, h, l = fields[1], fields[3], fields[6], fields[7] 148 | if self.char: 149 | ws.append([dictionary.w2i[char] for char in w]) 150 | else: 151 | ws.append(dictionary.w2i[w.lower()]) 152 | ts.append(dictionary.t2i[t]) 153 | hs.append(int(h)) 154 | ls.append(dictionary.l2i[l]) 155 | n += 1 156 | else: 157 | self.words.append(ws) 158 | self.tags.append(ts) 159 | self.heads.append(hs) 160 | self.labels.append(ls) 161 | self.lengths.append(n) 162 | ws, ts, hs, ls, n = self.newline() 163 | 164 | def newline(self): 165 | """Each sentence in our data-set must start with these indices. 166 | 167 | Note the convention: the root has itelf as head. 
168 | """ 169 | if self.char: 170 | return [[ROOT_INDEX]], [ROOT_INDEX], [0], [ROOT_INDEX], 1 171 | else: 172 | return [ROOT_INDEX], [ROOT_INDEX], [0], [ROOT_INDEX], 1 173 | 174 | def order(self): 175 | old_order = zip(range(len(self.lengths)), self.lengths) 176 | new_order, _ = zip(*sorted(old_order, key=lambda t: t[1])) 177 | self.words = [self.words[i] for i in new_order] 178 | self.tags = [self.tags[i] for i in new_order] 179 | self.heads = [self.heads[i] for i in new_order] 180 | self.labels = [self.labels[i] for i in new_order] 181 | self.lengths = [self.lengths[i] for i in new_order] 182 | 183 | def shuffle(self): 184 | n = len(self.words) 185 | new_order = list(range(0, n)) 186 | np.random.shuffle(new_order) 187 | self.words = [self.words[i] for i in new_order] 188 | self.tags = [self.tags[i] for i in new_order] 189 | self.heads = [self.heads[i] for i in new_order] 190 | self.labels = [self.labels[i] for i in new_order] 191 | self.lengths = [self.lengths[i] for i in new_order] 192 | 193 | def batches(self, batch_size, shuffle=True, length_ordered=False): 194 | """An iterator over batches.""" 195 | n = len(self.words) 196 | batch_order = list(range(0, n, batch_size)) 197 | if shuffle: 198 | self.shuffle() 199 | np.random.shuffle(batch_order) 200 | if length_ordered: 201 | self.order() 202 | for i in batch_order: 203 | words = pad(self.words[i:i+batch_size], char=self.char) 204 | tags = pad(self.tags[i:i+batch_size]) 205 | heads = pad(self.heads[i:i+batch_size]) 206 | labels = pad(self.labels[i:i+batch_size]) 207 | yield words, tags, heads, labels 208 | 209 | class Corpus: 210 | """A corpus of a dictionary and three datasets (train, development, and test).""" 211 | def __init__(self, vocab_path="vocab/train", data_path="~/data/ptb-stanford/", char=False): 212 | data_path = os.path.expanduser(data_path) 213 | self.dictionary = Dictionary(vocab_path, char=char) 214 | self.train = Data(os.path.join(data_path, "train.conll"), self.dictionary, char=char) 215 | self.dev = Data(os.path.join(data_path, "dev.conll"), self.dictionary, char=char) 216 | self.test = Data(os.path.join(data_path, "test.conll"), self.dictionary, char=char) 217 | 218 | if __name__ == "__main__": 219 | # Example usage: 220 | corpus = Corpus(data_path="~/data/ptb-stanford", char=True) 221 | batches = corpus.train.batches(16) 222 | for _ in range(10): 223 | words, tags, heads, labels = next(batches) 224 | 225 | # Test character model on data. 
226 | from nn import RecurrentCharEmbedding 227 | model = RecurrentCharEmbedding(len(corpus.dictionary.w2i), 100, 100, 100, dropout=0.33, bi=True) 228 | model(words) 229 | -------------------------------------------------------------------------------- /embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from numpy import prod 4 | 5 | from nn import ResidualConnection, HighwayNetwork 6 | 7 | 8 | class WordEmbedding(nn.Module): 9 | """Embed words.""" 10 | def __init__(self, embedding, dropout): 11 | super(WordEmbedding, self).__init__() 12 | self.embedding = embedding 13 | self.dropout = nn.Dropout(p=dropout) 14 | 15 | def forward(self, *args, **kwargs): 16 | words = kwargs['words'] 17 | x = self.embedding(words) 18 | return self.dropout(x) 19 | 20 | @property 21 | def num_parameters(self): 22 | """Returns the number of trainable parameters of the model.""" 23 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 24 | 25 | 26 | class TagEmbedding(nn.Module): 27 | """Embed tags.""" 28 | def __init__(self, embedding, dropout): 29 | super(TagEmbedding, self).__init__() 30 | self.embedding = embedding 31 | self.dropout = nn.Dropout(p=dropout) 32 | 33 | def forward(self, *args, **kwargs): 34 | tags = kwargs['tags'] 35 | x = self.embedding(tags) 36 | return self.dropout(x) 37 | 38 | @property 39 | def num_parameters(self): 40 | """Returns the number of trainable parameters of the model.""" 41 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 42 | 43 | 44 | class WordTagEmbedding(nn.Module): 45 | """Embeds words and tags and concatenates them.""" 46 | def __init__(self, word_embedding, tag_embedding, dropout): 47 | super(WordTagEmbedding, self).__init__() 48 | self.word_embedding = word_embedding 49 | self.tag_embedding = tag_embedding 50 | self.dropout = nn.Dropout(p=dropout) 51 | 52 | def forward(self, *args, **kwargs): 53 | words, tags = kwargs['words'], kwargs['tags'] 54 | words = self.word_embedding(words) 55 | tags = self.tag_embedding(tags) 56 | x = torch.cat((words, tags), dim=-1) 57 | return self.dropout(x) 58 | 59 | @property 60 | def num_parameters(self): 61 | """Returns the number of trainable parameters of the model.""" 62 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 63 | 64 | 65 | class ConvolutionalCharEmbedding(nn.Module): 66 | """Convolutional character embedding following https://arxiv.org/pdf/1508.06615.pdf.""" 67 | def __init__(self, nchars, padding_idx, emb_dim=15, filter_factor=25, activation='Tanh', dropout=0.): 68 | super(ConvolutionalCharEmbedding, self).__init__() 69 | self.padding_idx = padding_idx 70 | self.embedding = nn.Embedding(nchars, emb_dim, padding_idx=padding_idx) 71 | 72 | filter_size = lambda kernel_size: filter_factor * kernel_size 73 | self.output_size = sum(map(filter_size, range(1, 7))) 74 | self.conv1 = nn.Conv1d(emb_dim, filter_size(1), kernel_size=1) 75 | self.conv2 = nn.Conv1d(emb_dim, filter_size(2), kernel_size=2) 76 | self.conv3 = nn.Conv1d(emb_dim, filter_size(3), kernel_size=3) 77 | self.conv4 = nn.Conv1d(emb_dim, filter_size(4), kernel_size=4) 78 | self.conv5 = nn.Conv1d(emb_dim, filter_size(5), kernel_size=5) 79 | self.conv6 = nn.Conv1d(emb_dim, filter_size(6), kernel_size=6) 80 | 81 | self.act_fn = getattr(nn, activation)() 82 | 83 | self.pool = nn.AdaptiveMaxPool1d(1) # Max pooling over time. 
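        # Added note: AdaptiveMaxPool1d(1) collapses the time axis, mapping each
        # (batch * sent, n_filters, conv_len) feature map to (batch * sent, n_filters, 1);
        # forward() squeezes the last dimension, keeping one max-over-time value per
        # filter, as in the character-CNN paper cited in the class docstring.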
84 | 85 | self.highway = HighwayNetwork(self.output_size) 86 | 87 | def forward(self, x): 88 | """Expect input of shape (batch, sent_len, word_len).""" 89 | # Preprocessing of character batch. 90 | batch_size, sent_len, word_len = x.shape 91 | x = x.view(-1, word_len) # (batch * sent, word) 92 | mask = (x != self.padding_idx).float() 93 | x = self.embedding(x) # (batch * sent, word, emb) 94 | mask = mask.unsqueeze(-1).repeat(1, 1, x.size(-1)) 95 | x = mask * x 96 | x = x.transpose(1, 2) # (batch * sent, emb, word) 97 | 98 | # Ready for input 99 | f1 = self.pool(self.act_fn(self.conv1(x))).squeeze(-1) 100 | f2 = self.pool(self.act_fn(self.conv2(x))).squeeze(-1) 101 | f3 = self.pool(self.act_fn(self.conv3(x))).squeeze(-1) 102 | f4 = self.pool(self.act_fn(self.conv4(x))).squeeze(-1) 103 | f5 = self.pool(self.act_fn(self.conv5(x))).squeeze(-1) 104 | f6 = self.pool(self.act_fn(self.conv6(x))).squeeze(-1) 105 | 106 | f = torch.cat([f1, f2, f3, f4, f5, f6], dim=-1) 107 | 108 | f = self.highway(f) 109 | 110 | return f.contiguous().view(batch_size, sent_len, f.size(-1)) # (batch, sent, emb) 111 | 112 | @property 113 | def num_parameters(self): 114 | """Returns the number of trainable parameters of the model.""" 115 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 116 | 117 | 118 | class SimpleConvolutionalCharEmbedding(nn.Module): 119 | def __init__(self, nchars, output_size, padding_idx, num_conv=3, kernel_size=3, emb_dim=None, 120 | hidden_size=None, activation='ReLU', dropout=0.): 121 | super(SimpleConvolutionalCharEmbedding, self).__init__() 122 | emb_dim = output_size if emb_dim is None else emb_dim 123 | hidden_size = output_size if hidden_size is None else hidden_size 124 | # Make sure kernel_size is odd. 125 | assert kernel_size % 2 == 1 126 | # Padding to keep shape constant 127 | padding = kernel_size // 2 128 | act_fn = getattr(nn, activation) 129 | self.padding_idx = padding_idx 130 | self.embedding = nn.Embedding(nchars, emb_dim, padding_idx=padding_idx) 131 | input_size = emb_dim 132 | layers = nn.Sequential() 133 | for i in range(num_conv): 134 | conv = nn.Conv1d(input_size, output_size, kernel_size, padding=padding) 135 | layers.add_module('res_{}'.format(i), 136 | ResidualConnection(conv, dropout)) 137 | layers.add_module('{}_{}'.format(activation, i), 138 | act_fn()) 139 | input_size = output_size 140 | self.layers = layers 141 | 142 | def forward(self, x): 143 | """Expect input of shape (batch, seq, emb).""" 144 | batch_size, sent_len, word_len = x.shape 145 | x = x.view(-1, word_len) # (batch * sent, word) 146 | mask = (x != self.padding_idx).float() 147 | x = self.embedding(x) # (batch * sent, word, emb) 148 | mask = mask.unsqueeze(-1).repeat(1, 1, x.size(-1)) 149 | x = mask * x 150 | x = x.transpose(1, 2) # (batch * sent, emb, word) 151 | x = self.layers(x) # (batch * sent, emb, word) 152 | x = x.mean(-1) # (batch * sent, emb) 153 | x = x.contiguous().view(batch_size, sent_len, x.size(-1)) # (batch, sent, emb) 154 | return x 155 | 156 | @property 157 | def num_parameters(self): 158 | """Returns the number of trainable parameters of the model.""" 159 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 160 | 161 | 162 | class RecurrentCharEmbedding(nn.Module): 163 | """Simple RNN based encoder for character-level word embeddings. 
164 | 165 | Based on: 166 | https://github.com/bastings/parser/blob/extended_parser/parser/nn.py 167 | """ 168 | def __init__(self, nchars, output_size, padding_idx, 169 | hidden_size=None, emb_dim=None, dropout=0.33, bi=True): 170 | super(RecurrentCharEmbedding, self).__init__() 171 | # Default values for encoder. 172 | emb_dim = output_size if emb_dim is None else emb_dim 173 | hidden_size = output_size if hidden_size is None else hidden_size 174 | 175 | self.padding_idx = padding_idx 176 | self.embedding = nn.Embedding(nchars, emb_dim) 177 | self.dropout = nn.Dropout(p=dropout) 178 | 179 | self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hidden_size, num_layers=1, 180 | batch_first=True, dropout=dropout, bidirectional=bi) 181 | 182 | rnn_dim = hidden_size * 2 if bi else hidden_size 183 | self.linear = nn.Linear(rnn_dim, output_size, bias=False) 184 | 185 | self.relu = nn.ReLU() 186 | 187 | def forward(self, x): 188 | cuda = torch.cuda.is_available() 189 | 190 | batch_size, sent_len, word_len = x.shape 191 | # Reshape so that the characters are the only sequences. 192 | x = x.view(-1, word_len) # (batch_size * sent_len, word_len) 193 | 194 | # Sort x by word length. 195 | lengths = (x != self.padding_idx).long().sum(-1) 196 | sorted_lengths, sort_idx = lengths.sort(0, descending=True) 197 | sort_idx = sort_idx.cuda() if cuda else sort_idx 198 | x = x[sort_idx] 199 | 200 | # Remove the rows (i.e. words) from x that consist entirely of PAD_INDEX. 201 | non_padding_idx = (sorted_lengths != 0).long().sum().data[0] 202 | num_all_pad = x.size(0) - non_padding_idx 203 | x = x[:non_padding_idx] 204 | sorted_lengths = sorted_lengths[:non_padding_idx] 205 | 206 | # Embed chars and pack for rnn input. 207 | x = self.embedding(x) 208 | x = self.dropout(x) 209 | sorted_lengths = [i for i in sorted_lengths.data] 210 | x = nn.utils.rnn.pack_padded_sequence(x, sorted_lengths, batch_first=True) 211 | 212 | # RNN computation. 213 | out, _ = self.rnn(x) 214 | 215 | # Unpack and keep only final embedding. 216 | out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True) 217 | sorted_lengths = wrap(sorted_lengths) - 1 218 | sorted_lengths = sorted_lengths.unsqueeze(-1).unsqueeze(-1).repeat(1, 1, out.size(-1)) 219 | sorted_lengths = sorted_lengths.cuda() if cuda else sorted_lengths 220 | out = torch.gather(out, 1, sorted_lengths).squeeze(1) 221 | # Project rnn output states to proper embedding dimension. 222 | out = self.relu(out) 223 | out = self.linear(out) 224 | 225 | # Put back zero vectors for the pad words that we removed. 226 | if num_all_pad > 0: 227 | pad_embeddings = Variable(torch.zeros(num_all_pad, out.size(-1))) 228 | pad_embeddings = pad_embeddings.cuda() if cuda else pad_embeddings 229 | out = torch.cat([out, pad_embeddings]) 230 | 231 | # Put everything back into the original order. 
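        # Illustration (added): if sort_idx = [2, 0, 1], then undo_sort_idx = [1, 2, 0],
        # so out[undo_sort_idx] restores the pre-sort word order.
        # Added note: `wrap` (defined in data.py) and `Variable` are used in this
        # method but are not imported at the top of embedding.py; running this module
        # standalone would also need:
        #   from torch.autograd import Variable
        #   from data import wrap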
232 | pairs = list(zip(sort_idx.data, range(sort_idx.size(0)))) 233 | undo_sort_idx = [pair[1] for pair in sorted(pairs, key=lambda t: t[0])] 234 | undo_sort_idx = wrap(undo_sort_idx) 235 | undo_sort_idx = undo_sort_idx.cuda() if cuda else undo_sort_idx 236 | out = out[undo_sort_idx] 237 | out = out.view(batch_size, sent_len, out.size(-1)) 238 | 239 | return out 240 | 241 | @property 242 | def num_parameters(self): 243 | """Returns the number of trainable parameters of the model.""" 244 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 245 | 246 | 247 | if __name__ == '__main__': 248 | # Debugging 249 | import argparse 250 | from data import Corpus 251 | 252 | parser = argparse.ArgumentParser(description='Biaffine graph-based dependency parser') 253 | 254 | parser.add_argument('--data', type=str, default='~/data/stanford-ptb/', 255 | help='location of the data corpus') 256 | parser.add_argument('--vocab', type=str, default='vocab/train', 257 | help='location of the preprocessed vocabulary') 258 | parser.add_argument('--char', action='store_true', 259 | help='character embeddings') 260 | parser.add_argument('--disable_length_ordered', action='store_false', 261 | help='do not order sentences by length so batches have more padding') 262 | args = parser.parse_args() 263 | 264 | corpus = Corpus(data_path=args.data, vocab_path=args.vocab, char=args.char) 265 | batches = corpus.train.batches(4, length_ordered=args.disable_length_ordered) 266 | 267 | words, tags, heads, labels = next(batches) 268 | 269 | emb_dim = 100 270 | embedding = ConvolutionalCharEmbedding(len(corpus.dictionary.w2i), emb_dim) 271 | 272 | x = embedding(words) 273 | print(x) 274 | -------------------------------------------------------------------------------- /encoder.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import torch 4 | from torch import nn 5 | from torch.autograd import Variable 6 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 7 | from numpy import prod 8 | 9 | from nn import ResidualConnection 10 | 11 | 12 | class RecurrentEncoder(nn.Module): 13 | """A simple RNN based sentence encoder.""" 14 | def __init__(self, rnn_type, input_size, hidden_size, num_layers, 15 | batch_first, dropout, bidirectional, 16 | use_cuda=False, hidden_init='zeros', train_hidden_init=False): 17 | super(RecurrentEncoder, self).__init__() 18 | self.hidden_size = hidden_size 19 | self.num_layers = num_layers 20 | self.num_directions = 2 if bidirectional else 1 21 | self.batch_first = batch_first 22 | 23 | assert rnn_type in ('LSTM', 'GRU', 'RNN') 24 | self.rnn_type = rnn_type 25 | args = input_size, hidden_size, num_layers, batch_first, dropout, bidirectional 26 | self.rnn = getattr(nn, rnn_type)(input_size, hidden_size, num_layers, 27 | batch_first=batch_first, 28 | dropout=dropout, 29 | bidirectional=bidirectional) 30 | 31 | assert hidden_init in ('zeros', 'randn') # availlable initialization methods. 32 | self.hidden_init = getattr(torch, hidden_init) 33 | self.train_hidden_init = train_hidden_init # TODO: make initial hidden trainable. 
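        # (Added sketch for the TODO above, not implemented here: the initial state
        # could be registered as a parameter, e.g.
        #   self.h0 = nn.Parameter(torch.zeros(num_layers * self.num_directions, 1, hidden_size))
        # and expanded to the batch size inside get_hidden().)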
34 | self.cuda = use_cuda 35 | 36 | def get_hidden(self, batch): 37 | args = self.num_layers * self.num_directions, batch, self.hidden_size 38 | use_cuda = torch.cuda.is_available() 39 | if self.rnn_type == 'LSTM': 40 | h0 = Variable(self.hidden_init(*args)) # (num_layers * directions, batch, hidden_size) 41 | c0 = Variable(self.hidden_init(*args)) # (num_layers * directions, batch, hidden_size) 42 | if use_cuda: 43 | h0, c0 = h0.cuda(), c0.cuda() 44 | return h0, c0 45 | else: 46 | h0 = Variable(self.hidden_init(*args)) # (num_layers * directions, batch, hidden_size) 47 | if use_cuda: 48 | h0 = h0.cuda() 49 | return h0 50 | 51 | def forward(self, x, lengths): 52 | batch = x.size(0) if self.batch_first else x.size(1) 53 | # RNN computation. 54 | h0 = self.get_hidden(batch) 55 | out, _ = self.rnn(x, h0) 56 | return out 57 | 58 | @property 59 | def num_parameters(self): 60 | """Returns the number of trainable parameters of the model.""" 61 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 62 | 63 | 64 | class ConvolutionalEncoder(nn.Module): 65 | """Stacked convolutions with residual connection. 66 | 67 | Similar to the architectures used in https://arxiv.org/pdf/1705.03122.pdf and 68 | https://arxiv.org/pdf/1611.02344.pdf. 69 | """ 70 | def __init__(self, input_size, num_conv, kernel_size, activation='Tanh', dropout=0.): 71 | super(ConvolutionalEncoder, self).__init__() 72 | assert kernel_size % 2 == 1, 'only odd kernel sizes supported' 73 | padding = kernel_size // 2 # Padding to keep size constant 74 | act_fn = getattr(nn, activation) 75 | layers = nn.Sequential() 76 | c = copy.deepcopy 77 | conv = nn.Conv1d(input_size, input_size, kernel_size, padding=padding) 78 | for i in range(num_conv): 79 | layers.add_module('res_{}'.format(i), 80 | ResidualConnection(c(conv), dropout)) 81 | layers.add_module('{}_{}'.format(activation, i), 82 | act_fn()) 83 | self.layers = layers 84 | 85 | def forward(self, x, mask): 86 | """Expect input of shape (batch, seq, emb).""" 87 | # x = mask * x 88 | x = x.transpose(1, 2) # (batch, emb, seq) 89 | x = self.layers(x) 90 | return x.transpose(1, 2) # (batch, seq, emb) 91 | 92 | @property 93 | def num_parameters(self): 94 | """Returns the number of trainable parameters of the model.""" 95 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 96 | 97 | 98 | class NoEncoder(nn.Module): 99 | """This encoder does nothing.""" 100 | def __init__(self, *args, **kwargs): 101 | super(NoEncoder, self).__init__() 102 | 103 | def forward(self, x, *args, **kwargs): 104 | return x 105 | 106 | @property 107 | def num_parameters(self): 108 | return 0 109 | 110 | 111 | if __name__ == '__main__': 112 | m = ConvolutionalEncoder(10, 8, num_conv=3) 113 | x = Variable(torch.randn(5, 6, 10)) # (batch, seq, emb) 114 | print(m(x)) 115 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | 10 | from data import Dictionary, Corpus, PAD_INDEX 11 | from predict import predict, predict_batch 12 | 13 | 14 | class CONLL: 15 | """A CONLL dataset.""" 16 | def __init__(self, dictionary): 17 | self.dictionary = dictionary 18 | self.words = [] 19 | self.tags = [] 20 | self.heads = [] 21 | self.labels = [] 22 | self.lengths = [] 23 | 24 | def add(self, words, tags, 
heads, labels): 25 | self.words.append([self.dictionary.i2w[i] for i in words]) 26 | self.tags.append([self.dictionary.i2t[i] for i in tags]) 27 | self.heads.append(heads) 28 | self.labels.append([self.dictionary.i2l[i] for i in labels]) 29 | 30 | def write(self, path='predicted.conll'): 31 | """"Write the data out as a conll file (Stanford style).""" 32 | with open(path, 'w') as f: 33 | for line in zip(self.words, self.tags, self.heads, self.labels): 34 | words, tags, heads, labels = line 35 | lines = zip(words[1:], tags[1:], heads[1:], labels[1:]) 36 | for i, (w, t, h, l) in enumerate(lines, 1): 37 | print(i, w, '_', t, t, '_', h, l, '_', '_', sep='\t', file=f) 38 | print(file=f) 39 | 40 | 41 | class Decoder: 42 | def __init__(self, corpus, model): 43 | self.model = model 44 | self.corpus = corpus 45 | 46 | def batch_eval(self, batch_size=128): 47 | conll = CONLL(self.corpus.dictionary) 48 | batches = self.corpus.dev.batches(batch_size, shuffle=False) 49 | self.model.eval() 50 | for i, batch in enumerate(batches): 51 | print('Batch:', i, end='\r') 52 | words, tags, heads, labels = batch 53 | # Predict score matrices for the batch. 54 | S_arc, S_lab = self.model(words=words, tags=tags) 55 | for i in range(words.size(0)): 56 | # Find the sentence length. 57 | n = (words[i] != PAD_INDEX).int().sum().data.numpy()[0] 58 | # Predict for the selected parts that are the sentence. 59 | heads_pred, labels_pred = predict_batch( 60 | S_arc[i, :n, :n], 61 | S_lab[i, :, :n, :n], 62 | tags[i, :n] 63 | ) 64 | conll.add( 65 | words[i].data.numpy(), tags[i].data.numpy(), heads_pred, labels_pred) 66 | return conll 67 | 68 | def eval(self, corpus): 69 | self.model.eval() 70 | batches = self.corpus.dev.batches(1, shuffle=False) 71 | conll = CONLL(corpus.dictionary) 72 | for i, batch in enumerate(batches): 73 | print('Batch:', i, end='\r') 74 | words, tags, heads, labels = batch 75 | heads_pred, labels_pred = predict(model, words, tags) 76 | words = words[0].data.numpy() 77 | tags = tags[0].data.numpy() 78 | conll.add(words, tags, heads_pred, labels_pred) 79 | return conll 80 | 81 | 82 | def main(args): 83 | data_dir = os.path.expanduser(args.data_dir) 84 | gold_path = os.path.expanduser(args.gold_path) 85 | 86 | corpus = Corpus(args.vocab_path, data_dir) 87 | model = torch.load(args.model_path) 88 | 89 | parser = Decoder(corpus, model) 90 | conll = parser.batch_eval() 91 | 92 | # Write the conll as text. 93 | conll.write(args.predict_path) 94 | # Evaluate the predicted conll. 
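    # Added note: eval.pl is assumed to be the standard CoNLL-X shared-task
    # evaluation script; it is not included in this repository (*.pl is gitignored)
    # and must be available in the working directory for the call below.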
95 | os.system('perl eval.pl -g {0} -s {1} > {2}'.format(gold_path, args.predict_path, args.result_path)) 96 | 97 | 98 | if __name__ == '__main__': 99 | parser = argparse.ArgumentParser() 100 | 101 | parser.add_argument('--data-dir', default='~/data/ptb-stanford') 102 | parser.add_argument('--vocab-path', default='vocab/train') 103 | parser.add_argument('--model-path', default='models/trained/model.pt') 104 | parser.add_argument('--gold-path', default='~/data/ptb-stanford/dev.conll') 105 | parser.add_argument('--predict-path', default='predicted.conll') 106 | parser.add_argument('--result-path', default='result.txt') 107 | args = parser.parse_args() 108 | main(args) 109 | -------------------------------------------------------------------------------- /img/a.100.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.100.pdf -------------------------------------------------------------------------------- /img/a.1000.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.1000.pdf -------------------------------------------------------------------------------- /img/a.1100.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.1100.pdf -------------------------------------------------------------------------------- /img/a.1200.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.1200.pdf -------------------------------------------------------------------------------- /img/a.200.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.200.pdf -------------------------------------------------------------------------------- /img/a.300.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.300.pdf -------------------------------------------------------------------------------- /img/a.400.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.400.pdf -------------------------------------------------------------------------------- /img/a.500.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.500.pdf -------------------------------------------------------------------------------- /img/a.600.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.600.pdf -------------------------------------------------------------------------------- /img/a.700.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.700.pdf -------------------------------------------------------------------------------- /img/a.800.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.800.pdf -------------------------------------------------------------------------------- /img/a.900.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.900.pdf -------------------------------------------------------------------------------- /img/a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/a.pdf -------------------------------------------------------------------------------- /img/gold.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/img/gold.pdf -------------------------------------------------------------------------------- /lisa-modules.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | module load eb 4 | module load Python/3.6.3-foss-2017b 5 | module load CUDA/9.0.176 6 | module load cuDNN/7.0.5-CUDA-9.0.176 7 | module load PyTorch/0.3.0-foss-2017b-Python-3.6.3-CUDA-9.0.176 8 | -------------------------------------------------------------------------------- /log/acc.train.csv: -------------------------------------------------------------------------------- 1 | train_arc_acc,train_lab_acc 2 | tensor(0.0703),tensor(0.0254) 3 | tensor(0.0327),tensor(0.0781) 4 | tensor(0.0273),tensor(0.0977) 5 | tensor(0.0495),tensor(0.1120) 6 | tensor(0.0707),tensor(0.0992) 7 | tensor(0.0211),tensor(0.0910) 8 | tensor(0.0312),tensor(0.0986) 9 | tensor(0.0300),tensor(0.0853) 10 | tensor(0.0187),tensor(0.0962) 11 | tensor(0.0278),tensor(0.0972) 12 | tensor(0.0128),tensor(0.0852) 13 | tensor(0.0531),tensor(0.1000) 14 | tensor(0.0795),tensor(0.0938) 15 | tensor(0.0556),tensor(0.1146) 16 | tensor(0.0299),tensor(0.1141) 17 | tensor(0.0134),tensor(0.1696) 18 | tensor(0.0119),tensor(0.1354) 19 | tensor(0.0097),tensor(0.1007) 20 | tensor(0.0625),tensor(0.1319) 21 | tensor(0.0404),tensor(0.1121) 22 | tensor(0.0194),tensor(0.1216) 23 | tensor(0.0609),tensor(0.1118) 24 | tensor(0.0256),tensor(0.1130) 25 | tensor(0.0262),tensor(0.1200) 26 | tensor(0.0353),tensor(0.1155) 27 | tensor(0.0232),tensor(0.1109) 28 | tensor(0.0703),tensor(0.1458) 29 | tensor(0.0511),tensor(0.1179) 30 | tensor(0.0576),tensor(0.1168) 31 | tensor(0.0402),tensor(0.1049) 32 | tensor(0.0462),tensor(0.1175) 33 | tensor(0.0859),tensor(0.1458) 34 | tensor(0.0296),tensor(0.1250) 35 | tensor(0.0352),tensor(0.1198) 36 | tensor(0.0236),tensor(0.1301) 37 | tensor(0.0327),tensor(0.1222) 38 | tensor(0.0208),tensor(0.1319) 39 | -------------------------------------------------------------------------------- /log/acc.val.csv: -------------------------------------------------------------------------------- 1 | val_arc_acc,val_lab_acc 2 | 
-------------------------------------------------------------------------------- /log/loss.train.csv: -------------------------------------------------------------------------------- 1 | loss 2 | tensor(6.8414) 3 | tensor(7.0927) 4 | tensor(6.8396) 5 | tensor(7.0666) 6 | tensor(6.9516) 7 | tensor(7.4251) 8 | tensor(6.5466) 9 | tensor(7.1447) 10 | tensor(7.0329) 11 | tensor(5.9982) 12 | tensor(6.5118) 13 | tensor(5.8605) 14 | tensor(5.7545) 15 | tensor(6.1673) 16 | tensor(6.3938) 17 | tensor(5.3236) 18 | tensor(6.5271) 19 | tensor(7.0804) 20 | tensor(5.6261) 21 | tensor(6.2248) 22 | tensor(6.8624) 23 | tensor(6.3401) 24 | tensor(6.9874) 25 | tensor(6.6882) 26 | tensor(6.3879) 27 | tensor(6.7277) 28 | tensor(5.6935) 29 | tensor(6.2885) 30 | tensor(6.1476) 31 | tensor(6.5396) 32 | tensor(6.3351) 33 | tensor(5.5205) 34 | tensor(6.0055) 35 | tensor(6.2230) 36 | tensor(6.9263) 37 | tensor(5.9272) 38 | tensor(4.9149) 39 | -------------------------------------------------------------------------------- /log/no-pos.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/log/no-pos.pdf -------------------------------------------------------------------------------- /log/with-pos.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daandouwe/biaffine-dependency-parser/9338c6fde6de5393ac1bbdd6a8bb152c2a015a6c/log/with-pos.pdf -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | 4 | from train import train 5 | 6 | 7 | def main(): 8 | 9 | parser = argparse.ArgumentParser(description='Biaffine graph-based dependency parser') 10 | 11 | parser.add_argument('mode', type=str, choices=['train', 'predict']) 12 | 13 | # Data arguments 14 | data = parser.add_argument_group('Data') 15 | data.add_argument('--data', type=str, default='~/data/ptb-stanford/', 16 | help='location of the data corpus') 17 | data.add_argument('--vocab', type=str, default='vocab/train', 18 | help='location of the preprocessed vocabulary') 19 | data.add_argument('--disable-length-ordered', action='store_false', 20 | help='do not order sentences by length so batches have more padding') 21 | 22 | # Embedding arguments 23 | embed = parser.add_argument_group('Embedding options') 24 | embed.add_argument('--use-glove', action='store_true', 25 | help='use pretrained glove embeddings') 26 | embed.add_argument('--use-chars', action='store_true', 27 | help='use character level word embeddings') 28 | embed.add_argument('--char-encoder', type=str, choices=['rnn', 'cnn', 'transformer'], 29 | default='cnn', help='type of character encoder used for word embeddings') 30 | embed.add_argument('--filter-factor', type=int, default=25, 31 | help='controls output size of cnn character embedding') 32 | embed.add_argument('--disable-words', action='store_false', 33 | help='do not use words as input') 34 | embed.add_argument('--disable-tags', action='store_false', 35 | help='do not use tags as input') 36 | embed.add_argument('--word-emb-dim', type=int, default=300, 37 | help='size of word embeddings') 38 | embed.add_argument('--tag-emb-dim', type=int, default=50, 39 | help='size of tag embeddings') 40 | embed.add_argument('--emb-dropout', type=float, default=0.3, 41 | help='dropout used on 
embeddings') 42 | 43 | # Encoder arguments 44 | encode = parser.add_argument_group('Encoder options') 45 | encode.add_argument('--encoder', type=str, choices=['rnn', 'cnn', 'transformer', 'none'], 46 | default='rnn', help='type of sentence encoder used') 47 | 48 | # RNN encoder arguments 49 | rnn = parser.add_argument_group('RNN options') 50 | rnn.add_argument('--rnn-type', type=str, choices=['RNN', 'GRU', 'LSTM'], default='LSTM', 51 | help='type of rnn') 52 | rnn.add_argument('--rnn-hidden', type=int, default=400, 53 | help='number of hidden units in rnn') 54 | rnn.add_argument('--rnn-num-layers', type=int, default=3, 55 | help='number of layers') 56 | rnn.add_argument('--batch-first', type=bool, default=True, 57 | help='number of layers') 58 | rnn.add_argument('--rnn-dropout', type=float, default=0.3, 59 | help='dropout used in rnn') 60 | 61 | # CNN encoder arguments 62 | cnn = parser.add_argument_group('CNN options') 63 | cnn.add_argument('--cnn-num-layers', type=int, default=6, 64 | help='number convolutions') 65 | cnn.add_argument('--kernel-size', type=int, default=5, 66 | help='size of convolution kernel') 67 | cnn.add_argument('--cnn-dropout', type=float, default=0.3, 68 | help='dropout used in cnn') 69 | 70 | # Transformer encoder arguments 71 | trans = parser.add_argument_group('Transformer options') 72 | trans.add_argument('--N', type=int, default=6, 73 | help='transformer options') 74 | trans.add_argument('--d-model', type=int, default=512, 75 | help='transformer options') 76 | trans.add_argument('--d-ff', type=int, default=2048, 77 | help='transformer options') 78 | trans.add_argument('--h', type=int, default=8, 79 | help='transformer options') 80 | trans.add_argument('--trans-dropout', type=float, default=0.1, 81 | help='dropout used in transformer') 82 | 83 | # Biaffine transformations 84 | biaff = parser.add_argument_group('Biaffine classifier arguments') 85 | biaff.add_argument('--mlp-arc-hidden', type=int, default=500, 86 | help='number of hidden units in arc MLP') 87 | biaff.add_argument('--mlp-lab-hidden', type=int, default=100, 88 | help='number of hidden units in label MLP') 89 | biaff.add_argument('--mlp-dropout', type=float, default=0.3, 90 | help='dropout used in mlps') 91 | 92 | # Training. 
93 | training = parser.add_argument_group('Training arguments') 94 | training.add_argument('--multi-gpu', action='store_true', 95 | help='enable training on multiple GPUs') 96 | training.add_argument('--lr', type=float, default=2e-3, 97 | help='initial learning rate') 98 | training.add_argument('--epochs', type=int, default=10, 99 | help='number of epochs of training') 100 | training.add_argument('--batch-size', type=int, default=32, 101 | help='batch size') 102 | training.add_argument('--seed', type=int, default=42, 103 | help='random seed') 104 | training.add_argument('--disable-cuda', action='store_true', 105 | help='disable cuda') 106 | training.add_argument('--print-every', type=int, default=100, 107 | help='report interval') 108 | training.add_argument('--plot-every', type=int, default=100, 109 | help='plot interval') 110 | training.add_argument('--logdir', type=str, default='log', 111 | help='directory to log losses') 112 | training.add_argument('--checkpoints', type=str, default='checkpoints/model.pt', 113 | help='path to save the final model') 114 | args = parser.parse_args() 115 | 116 | if args.mode == 'train': 117 | train(args) 118 | if args.mode == 'predict': 119 | exit('Prediction not implemented yet...') 120 | 121 | 122 | if __name__ == '__main__': 123 | main() 124 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.autograd import Variable 4 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 5 | from numpy import prod 6 | 7 | from data import PAD_INDEX 8 | from embedding import WordEmbedding, WordTagEmbedding, RecurrentCharEmbedding, ConvolutionalCharEmbedding 9 | from nn import MLP, BiAffine 10 | from encoder import RecurrentEncoder, ConvolutionalEncoder, NoEncoder 11 | from transformer import TransformerEncoder 12 | 13 | 14 | class BiAffineParser(nn.Module): 15 | """Biaffine Dependency Parser.""" 16 | def __init__(self, embedding, encoder, encoder_type, 17 | mlp_input, mlp_arc_hidden, 18 | mlp_lab_hidden, mlp_dropout, 19 | num_labels, criterion): 20 | super(BiAffineParser, self).__init__() 21 | 22 | self.embedding = embedding 23 | self.encoder = encoder 24 | 25 | self.encoder_type = encoder_type 26 | 27 | # Arc MLPs 28 | self.arc_mlp_h = MLP(mlp_input, mlp_arc_hidden, 2, 'ReLU', mlp_dropout) 29 | self.arc_mlp_d = MLP(mlp_input, mlp_arc_hidden, 2, 'ReLU', mlp_dropout) 30 | # Label MLPs 31 | self.lab_mlp_h = MLP(mlp_input, mlp_lab_hidden, 2, 'ReLU', mlp_dropout) 32 | self.lab_mlp_d = MLP(mlp_input, mlp_lab_hidden, 2, 'ReLU', mlp_dropout) 33 | 34 | # BiAffine layers 35 | self.arc_biaffine = BiAffine(mlp_arc_hidden, 1) 36 | self.lab_biaffine = BiAffine(mlp_lab_hidden, num_labels) 37 | 38 | # Loss criterion 39 | self.criterion = criterion() 40 | 41 | def forward(self, *args, **kwargs): 42 | """Compute the score matrices for the arcs and labels.""" 43 | words = kwargs['words'] 44 | if self.encoder_type == 'rnn': 45 | aux = (words != PAD_INDEX).long().sum(-1) # sentence_lenghts 46 | elif self.encoder_type == 'transformer': 47 | aux = (words != PAD_INDEX).unsqueeze(-2) # mask 48 | else: 49 | aux = None 50 | 51 | x = self.embedding(*args, **kwargs) 52 | 53 | h = self.encoder(x, aux) 54 | 55 | arc_h = self.arc_mlp_h(h) 56 | arc_d = self.arc_mlp_d(h) 57 | lab_h = self.lab_mlp_h(h) 58 | lab_d = self.lab_mlp_d(h) 59 | 60 | S_arc = self.arc_biaffine(arc_h, arc_d) 61 | S_lab = 
self.lab_biaffine(lab_h, lab_d) 62 | return S_arc, S_lab 63 | 64 | def arc_loss(self, S_arc, heads): 65 | """Compute the loss for the arc predictions.""" 66 | S_arc = S_arc.transpose(-1, -2) # [batch, sent_len, sent_len] 67 | S_arc = S_arc.contiguous().view(-1, S_arc.size(-1)) # [batch*sent_len, sent_len] 68 | heads = heads.view(-1) # [batch*sent_len] 69 | return self.criterion(S_arc, heads) 70 | 71 | def lab_loss(self, S_lab, heads, labels): 72 | """Compute the loss for the label predictions on the gold arcs (heads).""" 73 | heads = heads.unsqueeze(1).unsqueeze(2) # [batch, 1, 1, sent_len] 74 | heads = heads.expand(-1, S_lab.size(1), -1, -1) # [batch, n_labels, 1, sent_len] 75 | S_lab = torch.gather(S_lab, 2, heads).squeeze(2) # [batch, n_labels, sent_len] 76 | S_lab = S_lab.transpose(-1, -2) # [batch, sent_len, n_labels] 77 | S_lab = S_lab.contiguous().view(-1, S_lab.size(-1)) # [batch*sent_len, n_labels] 78 | labels = labels.view(-1) # [batch*sent_len] 79 | return self.criterion(S_lab, labels) 80 | 81 | @property 82 | def num_parameters(self): 83 | """Returns the number of trainable parameters of the model.""" 84 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 85 | 86 | 87 | def make_model(args, word_vocab_size, tag_vocab_size, num_labels): 88 | """Initiliaze a the BiAffine parser according to the specs in args.""" 89 | # Embeddings 90 | if args.use_chars: 91 | if args.char_encoder == 'rnn': 92 | word_embedding = RecurrentCharEmbedding( 93 | word_vocab_size, args.word_emb_dim, padding_idx=PAD_INDEX) 94 | elif args.char_encoder == 'cnn': 95 | word_embedding = ConvolutionalCharEmbedding( 96 | word_vocab_size, padding_idx=PAD_INDEX, filter_factor=args.filter_factor) 97 | args.word_emb_dim = word_embedding.output_size # CNN encoder is not so flexible 98 | print('CNN character model produces word embeddings of dimension {}.'.format(args.word_emb_dim)) 99 | elif args.char_encoder == 'transformer': 100 | raise NotImplementedError('Transformer character econder not yet implemented.') 101 | else: 102 | word_embedding = nn.Embedding(word_vocab_size, args.word_emb_dim, padding_idx=PAD_INDEX) 103 | if args.use_glove: 104 | raise NotImplementedError('GloVe embeddings not yet implemented.') 105 | # Words, tags, or both 106 | if args.disable_tags: 107 | embedding = WordEmbedding(word_embedding, args.emb_dropout) 108 | embedding_dim = args.word_emb_dim 109 | elif args.disable_words: # Experimental reasons 110 | tag_embedding = nn.Embedding(tag_vocab_size, args.tag_emb_dim, padding_idx=PAD_INDEX) 111 | embedding = TagEmbedding(tag_embedding, args.emb_dropout) 112 | embedding_dim = args.tag_emb_dim 113 | else: 114 | tag_embedding = nn.Embedding(tag_vocab_size, args.tag_emb_dim, padding_idx=PAD_INDEX) 115 | embedding = WordTagEmbedding(word_embedding, tag_embedding, args.emb_dropout) 116 | embedding_dim = args.word_emb_dim + args.tag_emb_dim 117 | 118 | # Encoder 119 | if args.encoder == 'rnn': 120 | encoder = RecurrentEncoder( 121 | args.rnn_type, embedding_dim, args.rnn_hidden, args.rnn_num_layers, 122 | args.batch_first, args.rnn_dropout, bidirectional=True) 123 | encoder_dim = 2 * args.rnn_hidden 124 | elif args.encoder == 'cnn': 125 | encoder = ConvolutionalEncoder( 126 | embedding_dim, args.cnn_num_layers, args.kernel_size, dropout=args.cnn_dropout) 127 | encoder_dim = embedding_dim 128 | elif args.encoder == 'transformer': 129 | encoder = TransformerEncoder( 130 | embedding_dim, args.N, args.d_model, args.d_ff, args.h, dropout=args.trans_dropout) 131 | encoder_dim = 
args.d_model 132 | elif args.encoder == 'none': 133 | encoder = NoEncoder() 134 | encoder_dim = embedding_dim 135 | 136 | # Initialize the model. 137 | model = BiAffineParser( 138 | embedding, 139 | encoder, 140 | args.encoder, 141 | encoder_dim, 142 | args.mlp_arc_hidden, 143 | args.mlp_lab_hidden, 144 | args.mlp_dropout, 145 | num_labels, 146 | nn.CrossEntropyLoss 147 | ) 148 | 149 | # Initialize parameters with Glorot. 150 | for p in model.parameters(): 151 | if p.dim() > 1: 152 | nn.init.xavier_uniform(p) 153 | 154 | return model 155 | -------------------------------------------------------------------------------- /mst.py: -------------------------------------------------------------------------------- 1 | """ 2 | Source: https://github.com/chantera/biaffineparser/blob/master/utils.py. 3 | """ 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | 8 | def mst(scores, eps=1e-10): 9 | """ 10 | Chu-Liu-Edmonds' algorithm for finding minimum spanning arborescence in graphs. 11 | Calculates the arborescence with node 0 as root. 12 | :param scores: `scores[i][j]` is the weight of edge from node `j` to node `i`. 13 | :returns an array containing the head node (node with edge pointing to current node) for each node, 14 | with head[0] fixed as 0 15 | """ 16 | scores = scores.T 17 | length = scores.shape[0] 18 | scores = scores * (1 - np.eye(length)) # mask all the diagonal elements wih a zero 19 | heads = np.argmax(scores, axis=1) # THIS MEANS THAT scores[i][j] = score(j -> i)! 20 | heads[0] = 0 # the root has a self-loop to make it special 21 | tokens = np.arange(1, length) 22 | roots = np.where(heads[tokens] == 0)[0] + 1 23 | if len(roots) < 1: 24 | root_scores = scores[tokens, 0] 25 | head_scores = scores[tokens, heads[tokens]] 26 | new_root = tokens[np.argmax(root_scores / (head_scores + eps))] 27 | heads[new_root] = 0 28 | elif len(roots) > 1: 29 | root_scores = scores[roots, 0] 30 | scores[roots, 0] = 0 31 | new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1 32 | new_root = roots[np.argmin( 33 | scores[roots, new_heads] / (root_scores + eps))] 34 | heads[roots] = new_heads 35 | heads[new_root] = 0 36 | 37 | edges = defaultdict(set) # head -> dep 38 | vertices = set((0,)) 39 | for dep, head in enumerate(heads[tokens]): 40 | vertices.add(dep + 1) 41 | edges[head].add(dep + 1) 42 | for cycle in _find_cycle(vertices, edges): 43 | dependents = set() 44 | to_visit = set(cycle) 45 | while len(to_visit) > 0: 46 | node = to_visit.pop() 47 | if node not in dependents: 48 | dependents.add(node) 49 | to_visit.update(edges[node]) 50 | cycle = np.array(list(cycle)) 51 | old_heads = heads[cycle] 52 | old_scores = scores[cycle, old_heads] 53 | non_heads = np.array(list(dependents)) 54 | scores[np.repeat(cycle, len(non_heads)), 55 | np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0 56 | new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1 57 | new_scores = scores[cycle, new_heads] / (old_scores + eps) 58 | change = np.argmax(new_scores) 59 | changed_cycle = cycle[change] 60 | old_head = old_heads[change] 61 | new_head = new_heads[change] 62 | heads[changed_cycle] = new_head 63 | edges[new_head].add(changed_cycle) 64 | edges[old_head].remove(changed_cycle) 65 | 66 | return heads 67 | 68 | 69 | def _find_cycle(vertices, edges): 70 | """ 71 | https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA 72 | https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA 73 | """ 74 | _index = [0] 75 | 
_stack = [] 76 | _indices = {} 77 | _lowlinks = {} 78 | _onstack = defaultdict(lambda: False) 79 | _SCCs = [] 80 | 81 | def _strongconnect(v): 82 | _indices[v] = _index[0] 83 | _lowlinks[v] = _index[0] 84 | _index[0] += 1 85 | _stack.append(v) 86 | _onstack[v] = True 87 | 88 | for w in edges[v]: 89 | if w not in _indices: 90 | _strongconnect(w) 91 | _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) 92 | elif _onstack[w]: 93 | _lowlinks[v] = min(_lowlinks[v], _indices[w]) 94 | 95 | if _lowlinks[v] == _indices[v]: 96 | SCC = set() 97 | while True: 98 | w = _stack.pop() 99 | _onstack[w] = False 100 | SCC.add(w) 101 | if not (w != v): 102 | break 103 | _SCCs.append(SCC) 104 | 105 | for v in vertices: 106 | if v not in _indices: 107 | _strongconnect(v) 108 | 109 | return [SCC for SCC in _SCCs if len(SCC) > 1] 110 | -------------------------------------------------------------------------------- /nn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from numpy import prod 5 | 6 | from data import PAD_INDEX, wrap 7 | 8 | class MLP(nn.Module): 9 | """Module for an MLP with dropout.""" 10 | def __init__(self, input_size, layer_size, depth, activation, dropout): 11 | super(MLP, self).__init__() 12 | self.layers = nn.Sequential() 13 | act_fn = getattr(nn, activation) 14 | for i in range(depth): 15 | self.layers.add_module('fc_{}'.format(i), 16 | nn.Linear(input_size, layer_size)) 17 | if activation: 18 | self.layers.add_module('{}_{}'.format(activation, i), 19 | act_fn()) 20 | if dropout: 21 | self.layers.add_module('dropout_{}'.format(i), 22 | nn.Dropout(dropout)) 23 | input_size = layer_size 24 | 25 | def forward(self, x): 26 | return self.layers(x) 27 | 28 | @property 29 | def num_parameters(self): 30 | """Returns the number of trainable parameters of the model.""" 31 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 32 | 33 | 34 | class BiAffine(nn.Module): 35 | """Biaffine attention layer.""" 36 | def __init__(self, input_dim, output_dim): 37 | super(BiAffine, self).__init__() 38 | self.input_dim = input_dim 39 | self.output_dim = output_dim 40 | self.U = nn.Parameter(torch.FloatTensor(output_dim, input_dim, input_dim)) 41 | nn.init.xavier_uniform(self.U) 42 | 43 | def forward(self, Rh, Rd): 44 | Rh = Rh.unsqueeze(1) 45 | Rd = Rd.unsqueeze(1) 46 | S = Rh @ self.U @ Rd.transpose(-1, -2) 47 | return S.squeeze(1) 48 | 49 | # TODO: add collumns of ones to Rh and Rd for biases. 50 | 51 | @property 52 | def num_parameters(self): 53 | """Returns the number of trainable parameters of the model.""" 54 | return sum(prod(p.shape) for p in self.parameters() if p.requires_grad) 55 | 56 | 57 | class ResidualConnection(nn.Module): 58 | """A residual connection with dropout.""" 59 | def __init__(self, layer, dropout): 60 | super(ResidualConnection, self).__init__() 61 | self.layer = layer 62 | self.dropout = nn.Dropout(dropout) 63 | 64 | def forward(self, x): 65 | "Apply residual connection to any sublayer with the same size." 
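        # i.e. out = x + Dropout(layer(x)); `layer` must preserve the input shape,
        # as the size-preserving Conv1d blocks in encoder.py do
        # (odd kernel_size with padding = kernel_size // 2).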
66 | return x + self.dropout(self.layer(x)) 67 | 68 | 69 | class HighwayNetwork(nn.Module): 70 | """A highway network used in the character convolution word embeddings.""" 71 | def __init__(self, input_size, activation='ReLU'): 72 | super(HighwayNetwork, self).__init__() 73 | self.linear = nn.Linear(input_size, input_size) 74 | self.gate = nn.Linear(input_size, 1) 75 | self.act_fn = getattr(nn, activation)() 76 | self.sigmoid = nn.Sigmoid() 77 | 78 | def forward(self, x): 79 | t = self.sigmoid(self.gate(x)) 80 | out = self.act_fn(self.linear(x)) 81 | return t * out + (1 - t) * x 82 | -------------------------------------------------------------------------------- /optimizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Source: http://nlp.seas.harvard.edu/2018/04/03/attention.html. 3 | """ 4 | import torch 5 | 6 | 7 | class NoamOpt: 8 | """Optim wrapper that implements rate.""" 9 | def __init__(self, model_size, factor, warmup, optimizer): 10 | self.optimizer = optimizer 11 | self._step = 0 12 | self.warmup = warmup 13 | self.factor = factor 14 | self.model_size = model_size 15 | self._rate = 0 16 | 17 | def step(self): 18 | """Update parameters and rate.""" 19 | self._step += 1 20 | rate = self.rate() 21 | for p in self.optimizer.param_groups: 22 | p['lr'] = rate 23 | self._rate = rate 24 | self.optimizer.step() 25 | 26 | def zero_grad(self): 27 | """Delegate zero grad to underlying optimizer.""" 28 | self.optimizer.zero_grad() 29 | 30 | def rate(self, step = None): 31 | """Implement `lrate` above.""" 32 | if step is None: 33 | step = self._step 34 | return self.factor * \ 35 | (self.model_size ** (-0.5) * 36 | min(step ** (-0.5), step * self.warmup ** (-1.5))) 37 | 38 | 39 | def get_std_transformer_opt(args, model): 40 | return NoamOpt(args.d_model, 2, 4000, 41 | torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)) 42 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | import torch.nn.functional as F 6 | import matplotlib.pyplot as plt 7 | 8 | from data import Dictionary, Corpus, PAD_INDEX 9 | from mst import mst 10 | 11 | 12 | def plot(S_arc, heads): 13 | fig, ax = plt.subplots() 14 | # Make a 0/1 gold adjacency matrix. 15 | n = heads.size(1) 16 | G = np.zeros((n, n)) 17 | heads = heads.squeeze().data.numpy() 18 | G[heads, np.arange(n)] = 1. 19 | im = ax.imshow(G, vmin=0, vmax=1) 20 | fig.colorbar(im) 21 | plt.savefig('img/gold.pdf') 22 | plt.cla() 23 | # Plot the predicted adjacency matrix 24 | A = F.softmax(S_arc.squeeze(0), dim=0) 25 | fig, ax = plt.subplots() 26 | im = ax.imshow(A.data.numpy(), vmin=0, vmax=1) 27 | fig.colorbar(im) 28 | plt.savefig('img/a.pdf') 29 | plt.cla() 30 | plt.clf() 31 | 32 | 33 | def predict(model, words, tags): 34 | assert type(words) == type(tags) 35 | if type(words) == type(tags) == list: 36 | # Convert the lists into input for the PyTorch model. 37 | words = Variable(torch.LongTensor([words])) 38 | tags = Variable(torch.LongTensor([tags])) 39 | # Dissable dropout. 40 | model.eval() 41 | # Predict arc and label score matrices. 
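    # For a single sentence of length n (including the root token), S_arc is
    # expected to have shape (1, n, n) with head scores along dim -2 (compare
    # `arc_accuracy` in train.py), and S_lab shape (1, num_labels, n, n); both
    # come from one forward pass of the parser.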
42 | S_arc, S_lab = model(words, tags) 43 | 44 | # Predict heads 45 | S = S_arc[0].data.numpy() 46 | heads = mst(S) 47 | 48 | # Predict labels 49 | S_lab = S_lab[0] 50 | select = torch.LongTensor(heads).unsqueeze(0).expand(S_lab.size(0), -1) 51 | select = Variable(select) 52 | selected = torch.gather(S_lab, 1, select.unsqueeze(1)).squeeze(1) 53 | _, labels = selected.max(dim=0) 54 | labels = labels.data.numpy() 55 | return heads, labels 56 | 57 | 58 | def predict_batch(S_arc, S_lab, tags): 59 | # Predict heads 60 | S = S_arc.data.numpy() 61 | heads = mst(S) 62 | 63 | # Predict labels 64 | select = torch.LongTensor(heads).unsqueeze(0).expand(S_lab.size(0), -1) 65 | select = Variable(select) 66 | selected = torch.gather(S_lab, 1, select.unsqueeze(1)).squeeze(1) 67 | _, labels = selected.max(dim=0) 68 | labels = labels.data.numpy() 69 | return heads, labels 70 | 71 | 72 | if __name__ == '__main__': 73 | 74 | data_path = '../../stanford-ptb' 75 | vocab_path = 'vocab/train' 76 | model_path = 'models/model.pt' 77 | 78 | dictionary = Dictionary(vocab_path) 79 | corpus = Corpus(data_path=data_path, vocab_path=vocab_path) 80 | model = torch.load(model_path) 81 | batches = corpus.train.batches(1) 82 | 83 | words, tags, heads, labels = next(batches) 84 | S_arc, S_lab = model(words, tags) 85 | 86 | plot(S_arc, heads) 87 | words = tags = [1, 2, 3, 4] 88 | heads_pred, labels_pred = predict(model, words, tags) 89 | print(heads_pred, '\n', heads[0].data.numpy()) 90 | print(labels_pred, '\n', labels[0].data.numpy()) 91 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import os 4 | from collections import Counter 5 | import unicodedata 6 | 7 | import numpy as np 8 | 9 | 10 | def is_number(s): 11 | s = s.replace(',', '') # 10,000 -> 10000 12 | s = s.replace(':', '') # 5:30 -> 530 13 | s = s.replace('-', '') # 17-08 -> 1708 14 | s = s.replace('/', '') # 17/08/1992 -> 17081992 15 | try: 16 | float(s) 17 | return True 18 | except ValueError: 19 | pass 20 | try: 21 | unicodedata.numeric(s) 22 | return True 23 | except (TypeError, ValueError): 24 | pass 25 | return False 26 | 27 | 28 | def process_conll(in_path, out_path, lower=True, clean=True, p=0.1): 29 | word_counts = Counter() 30 | tag_counts = Counter() 31 | label_counts = Counter() 32 | with open(in_path, 'r') as f: 33 | for line in f: 34 | fields = line.split() 35 | if fields: 36 | word = fields[1].lower() if lower else fields[1] 37 | tag = fields[3] 38 | label = fields[7] 39 | word_counts.update([word]) 40 | tag_counts.update([tag]) 41 | label_counts.update([label]) 42 | with open(out_path + '.words.txt', 'w') as f: 43 | for word, count in word_counts.most_common(): 44 | processed = word 45 | if count == 1: 46 | if is_number(word) and clean: 47 | processed = '' 48 | elif np.random.random() < p: 49 | processed = '' 50 | print('{} {} {}'.format(word, processed, count), file=f) 51 | with open(out_path + '.tags.txt', 'w') as f: 52 | for tag, count in tag_counts.most_common(): 53 | print('{} {}'.format(tag, count), file=f) 54 | with open(out_path + '.labels.txt', 'w') as f: 55 | for label, count in label_counts.most_common(): 56 | print('{} {}'.format(label, count), file=f) 57 | 58 | 59 | def compare_vocabulary(train_path, dev_path, test_path): 60 | train_vocab = dict() 61 | dev_vocab = dict() 62 | test_vocab = dict() 63 | 64 | def read_dict(path, dict): 65 | with open(path, 'r') 
as f: 66 | for line in f: 67 | word, _, count = line.split() 68 | dict[word] = int(count) 69 | 70 | read_dict(train_path, train_vocab) 71 | read_dict(dev_path, dev_vocab) 72 | read_dict(test_path, test_vocab) 73 | 74 | nwords_train = len(train_vocab) 75 | ntokens_train = sum(train_vocab.values()) 76 | nwords_dev = len(dev_vocab) 77 | ntokens_dev = sum(dev_vocab.values()) 78 | nwords_test = len(test_vocab) 79 | ntokens_test = sum(test_vocab.values()) 80 | unseen_words = list(set(dev_vocab.keys()) - (set(train_vocab.keys()) & set(dev_vocab.keys()))) 81 | num_unseen_tokens = sum([dev_vocab[w] for w in unseen_words]) 82 | with open('vocab/data-statistics.csv', 'w') as g: 83 | print('dataset,nwords,ntokens', file=g) 84 | print('train,{},{}'.format(nwords_train, ntokens_train), file=g) 85 | print('dev,{},{}'.format(nwords_dev, ntokens_dev), file=g) 86 | print('test,{},{}'.format(nwords_test, ntokens_test), file=g) 87 | print('unseen,{},{}'.format(len(unseen_words), num_unseen_tokens), file=g) 88 | with open('vocab/unseen.txt', 'w') as f: 89 | for word in unseen_words: 90 | print('{} {}'.format(word, dev_vocab[word]), file=f) 91 | 92 | 93 | def main(args): 94 | data = os.path.expanduser(args.data) 95 | train_conll_path = os.path.join(data, 'train.conll') 96 | dev_conll_path = os.path.join(data, 'dev.conll') 97 | test_conll_path = os.path.join(data, 'test.conll') 98 | 99 | train_vocab_path = os.path.join(args.out, 'train') 100 | dev_vocab_path = os.path.join(args.out, 'dev') 101 | test_vocab_path = os.path.join(args.out, 'test') 102 | 103 | process_conll(train_conll_path, train_vocab_path, p=0.5, clean=False) 104 | process_conll(dev_conll_path, dev_vocab_path, p=0.0) 105 | process_conll(test_conll_path, test_vocab_path, p=0.0) 106 | 107 | compare_vocabulary( 108 | train_vocab_path + '.words.txt', dev_vocab_path + '.words.txt', test_vocab_path + '.words.txt') 109 | 110 | 111 | if __name__ == '__main__': 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument('--data', default='~/data/ptb-stanford') 114 | parser.add_argument('--out', default='vocab') 115 | args = parser.parse_args() 116 | 117 | main(args) 118 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | import torch 5 | from torch import nn 6 | from torch.autograd import Variable 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | from model import PAD_INDEX 11 | from data import Corpus 12 | from model import make_model 13 | from optimizer import get_std_transformer_opt 14 | from util import Timer, write_losses 15 | 16 | 17 | LOSSES = dict(train_loss=[], train_acc=[], val_acc=[], test_acc=[]) 18 | 19 | 20 | def arc_accuracy(S_arc, heads, eps=1e-10): 21 | """Accuracy of the arc predictions based on gready head prediction.""" 22 | _, pred = S_arc.max(dim=-2) 23 | mask = (heads != PAD_INDEX).float() 24 | accuracy = torch.sum((pred == heads).float() * mask, dim=-1) / (torch.sum(mask, dim=-1) + eps) 25 | return torch.mean(accuracy).data[0] 26 | 27 | 28 | def lab_accuracy(S_lab, heads, labels, eps=1e-10): 29 | """Accuracy of label predictions on the gold arcs.""" 30 | _, pred = S_lab.max(dim=1) 31 | pred = torch.gather(pred, 1, heads.unsqueeze(1)).squeeze(1) 32 | mask = (heads != PAD_INDEX).float() 33 | accuracy = torch.sum((pred == labels).float() * mask, dim=-1) / (torch.sum(mask, dim=-1) + eps) 34 | return torch.mean(accuracy).data[0] 35 | 36 | 37 | def 
evaluate(args, model, corpus): 38 | """Evaluate the arc and label accuracy of the model on the development corpus.""" 39 | # Turn on evaluation mode to disable dropout. 40 | model.eval() 41 | dev_batches = corpus.dev.batches(256, length_ordered=True) 42 | arc_acc, lab_acc = 0, 0 43 | for k, batch in enumerate(dev_batches, 1): 44 | words, tags, heads, labels = batch 45 | if args.cuda: 46 | words, tags, heads, labels = words.cuda(), tags.cuda(), heads.cuda(), labels.cuda() 47 | S_arc, S_lab = model(words=words, tags=tags) 48 | arc_acc += arc_accuracy(S_arc, heads) 49 | lab_acc += lab_accuracy(S_lab, heads, labels) 50 | arc_acc /= k 51 | lab_acc /= k 52 | return arc_acc, lab_acc 53 | 54 | 55 | class SimpleLossCompute: 56 | """A simple loss compute and train function on one device.""" 57 | def __init__(self, model, optimizer): 58 | self.model = model 59 | self.optimizer = optimizer 60 | 61 | def __call__(self, words, tags, heads, labels): 62 | # Forward pass. 63 | S_arc, S_lab = self.model(words=words, tags=tags) 64 | # Compute loss. 65 | arc_loss = self.model.arc_loss(S_arc, heads) 66 | lab_loss = self.model.lab_loss(S_lab, heads, labels) 67 | loss = arc_loss + lab_loss 68 | # Update parameters. 69 | self.optimizer.zero_grad() 70 | loss.backward() 71 | self.optimizer.step() 72 | loss_dict = dict(loss=loss.data[0], arc_loss=arc_loss.data[0], lab_loss=lab_loss.data[0]) 73 | return S_arc, S_lab, loss_dict 74 | 75 | 76 | class MultiGPULossCompute: 77 | """A multi-gpu loss compute and train function. 78 | 79 | Only difference with SimpleLossCompute is we need to access loss 80 | through model.module. 81 | """ 82 | def __init__(self, model, optimizer, devices, output_device=None): 83 | self.model = model 84 | self.optimizer = optimizer 85 | self.devices = devices 86 | self.output_device = output_device if output_device is not None else devices[0] 87 | 88 | def __call__(self, words, tags, heads, labels): 89 | # Forward pass. 90 | S_arc, S_lab = self.model(words=words, tags=tags) 91 | # Compute loss. 92 | arc_loss = self.model.module.arc_loss(S_arc, heads) 93 | lab_loss = self.model.module.lab_loss(S_lab, heads, labels) 94 | loss = arc_loss + lab_loss 95 | # Update parameters. 96 | self.optimizer.zero_grad() 97 | loss.backward() 98 | self.optimizer.step() 99 | loss_dict = dict(loss=loss.data[0], arc_loss=arc_loss.data[0], lab_loss=lab_loss.data[0]) 100 | return S_arc, S_lab, loss_dict 101 | 102 | 103 | def run_epoch(args, model, corpus, train_step): 104 | model.train() 105 | nbatches = len(corpus.train.words) // args.batch_size 106 | start_time = time.time() 107 | # Get a new set of shuffled training batches. 
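    # Each batch is a (words, tags, heads, labels) tuple of padded index
    # tensors produced by `corpus.train.batches`.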
108 | train_batches = corpus.train.batches(args.batch_size, length_ordered=args.disable_length_ordered) 109 | ntokens = 0 110 | for step, batch in enumerate(train_batches, 1): 111 | words, tags, heads, labels = batch 112 | if args.cuda: 113 | words, tags, heads, labels = words.cuda(), tags.cuda(), heads.cuda(), labels.cuda() 114 | S_arc, S_lab, loss_dict = train_step(words, tags, heads, labels) 115 | ntokens += words.size(0) * words.size(1) 116 | LOSSES['train_loss'].append(loss_dict['loss']) 117 | if step % args.print_every == 0: 118 | arc_train_acc = arc_accuracy(S_arc, heads) 119 | lab_train_acc = lab_accuracy(S_lab, heads, labels) 120 | LOSSES['train_acc'].append([arc_train_acc, lab_train_acc]) 121 | print( 122 | '| Step {:5d}/{:5d} ({:.0f}%)| Avg loss {:3.4f} | Arc acc {:4.2f}% ' 123 | '| Label acc {:4.2f}% | {:4.0f} tokens/sec |'.format( 124 | step, 125 | nbatches, 126 | 100*step/nbatches, 127 | np.mean(LOSSES['train_loss'][-args.print_every:]), 128 | 100*arc_train_acc, 129 | 100*lab_train_acc, 130 | ntokens/(time.time() - start_time)), 131 | ) 132 | 133 | 134 | def train(args): 135 | np.random.seed(args.seed) 136 | torch.manual_seed(args.seed) 137 | torch.cuda.manual_seed(args.seed) 138 | 139 | args.cuda = torch.cuda.is_available() 140 | print('Using cuda: {}'.format(args.cuda)) 141 | 142 | # Initialize the data, model, and optimizer. 143 | corpus = Corpus(data_path=args.data, vocab_path=args.vocab, char=args.use_chars) 144 | model = make_model( 145 | args, 146 | word_vocab_size=len(corpus.dictionary.w2i), 147 | tag_vocab_size=len(corpus.dictionary.t2i), 148 | num_labels=len(corpus.dictionary.l2i) 149 | ) 150 | print('Embedding parameters: {:,}'.format(model.embedding.num_parameters)) 151 | print('Encoder parameters: {:,}'.format(model.encoder.num_parameters)) 152 | print('Total model parameters: {:,}'.format(model.num_parameters)) 153 | if args.cuda: 154 | model.cuda() 155 | 156 | if args.encoder == 'transformer': 157 | optimizer = get_std_transformer_opt(args, model) 158 | else: 159 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 160 | 161 | if args.cuda: 162 | device_count = torch.cuda.device_count() 163 | if args.multi_gpu: 164 | devices = list(range(device_count)) 165 | model = nn.DataParallel(model, device_ids=devices) 166 | train_step = MultiGPULossCompute(model, optimizer, devices) 167 | print('Training on {} GPUs: {}.'.format(device_count, devices)) 168 | else: 169 | train_step = SimpleLossCompute(model, optimizer) 170 | print('Training on 1 device out of {} availlable.'.format(device_count)) 171 | else: 172 | train_step = SimpleLossCompute(model, optimizer) 173 | 174 | timer = Timer() 175 | best_val_acc = 0. 176 | best_epoch = 0 177 | print('Start of training..') 178 | try: 179 | for epoch in range(1, args.epochs+1): 180 | run_epoch(args, model, corpus, train_step) 181 | 182 | # Evaluate model on validation set. 183 | # TODO: replace this with a UAS and LAS eval instead of this proxy 184 | arc_val_acc, lab_val_acc = evaluate(args, model, corpus) 185 | LOSSES['val_acc'].append([arc_val_acc, lab_val_acc]) 186 | 187 | # Save model if it is the best so far. 188 | if arc_val_acc > best_val_acc: 189 | torch.save(model, args.checkpoints) 190 | best_val_acc = arc_val_acc 191 | best_epoch = epoch 192 | 193 | write_losses(LOSSES['train_loss'], LOSSES['train_acc'], LOSSES['val_acc'], args.logdir) 194 | # End epoch with some useful info in the terminal. 
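            # `timer.elapsed()` (see util.py) returns the time since its last
            # call and then resets itself, so the duration printed below is for
            # this epoch only, not for the whole run.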
195 | print('-' * 89) 196 | print( 197 | '| End of epoch {:3d}/{} | Time {:5.2f}s | Valid accuracy {:3.2f}% |' 198 | ' Best accuracy {:3.2f}% (epoch {:3d}) |'.format( 199 | epoch, 200 | args.epochs, 201 | timer.elapsed(), 202 | 100*arc_val_acc, 203 | 100*best_val_acc, 204 | best_epoch) 205 | ) 206 | print('-' * 89) 207 | except KeyboardInterrupt: 208 | print() 209 | print('-' * 89) 210 | print('Exiting from training early') 211 | 212 | write_losses(LOSSES['train_loss'], LOSSES['train_acc'], LOSSES['val_acc'], args.logdir) 213 | arc_val_acc, lab_val_acc = evaluate(args, model, corpus) 214 | if arc_val_acc > best_val_acc: 215 | torch.save(model, args.checkpoints) 216 | best_val_acc = arc_val_acc 217 | best_epoch = epoch 218 | 219 | print('=' * 89) 220 | print('| End of training | Best validation accuracy {:3.2f} (epoch {}) |'.format( 221 | 100*best_val_acc, best_epoch)) 222 | print('=' * 89) 223 | -------------------------------------------------------------------------------- /transformer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Source: http://nlp.seas.harvard.edu/2018/04/03/attention.html#model-architecture. 4 | """ 5 | import math, copy, time 6 | 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch.autograd import Variable 12 | 13 | 14 | def clones(module, N): 15 | """Produce N identical layers.""" 16 | return nn.ModuleList([copy.deepcopy(module) for _ in range(N)]) 17 | 18 | 19 | def attention(query, key, value, mask=None, dropout=None): 20 | """Compute 'Scaled Dot Product Attention'""" 21 | d_k = query.size(-1) 22 | scores = torch.matmul(query, key.transpose(-2, -1)) \ 23 | / math.sqrt(d_k) 24 | if mask is not None: 25 | scores = scores.masked_fill(mask == 0, -1e9) 26 | p_attn = F.softmax(scores, dim = -1) 27 | if dropout is not None: 28 | p_attn = dropout(p_attn) 29 | return torch.matmul(p_attn, value), p_attn 30 | 31 | 32 | class Encoder(nn.Module): 33 | """Core encoder is a stack of N layers""" 34 | def __init__(self, layer, N): 35 | super(Encoder, self).__init__() 36 | self.layers = clones(layer, N) 37 | self.norm = LayerNorm(layer.size) 38 | 39 | def forward(self, x, mask): 40 | """Pass the input (and mask) through each layer in turn.""" 41 | for layer in self.layers: 42 | x = layer(x, mask) 43 | return self.norm(x) 44 | 45 | 46 | class LayerNorm(nn.Module): 47 | """Construct a layernorm module (See citation for details).""" 48 | def __init__(self, features, eps=1e-6): 49 | super(LayerNorm, self).__init__() 50 | self.a_2 = nn.Parameter(torch.ones(features)) 51 | self.b_2 = nn.Parameter(torch.zeros(features)) 52 | self.eps = eps 53 | 54 | def forward(self, x): 55 | mean = x.mean(-1, keepdim=True) 56 | std = x.std(-1, keepdim=True) 57 | return self.a_2 * (x - mean) / (std + self.eps) + self.b_2 58 | 59 | 60 | class SublayerConnection(nn.Module): 61 | """A residual connection followed by a layer norm. 62 | 63 | Note for code simplicity the norm is first as opposed to last. 64 | """ 65 | def __init__(self, size, dropout): 66 | super(SublayerConnection, self).__init__() 67 | self.norm = LayerNorm(size) 68 | self.dropout = nn.Dropout(dropout) 69 | 70 | def forward(self, x, sublayer): 71 | "Apply residual connection to any sublayer with the same size." 
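        # Pre-norm residual: LayerNorm is applied to the sublayer input rather
        # than to the residual sum, and dropout is applied to the sublayer
        # output before it is added back to x.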
72 | return x + self.dropout(sublayer(self.norm(x))) 73 | 74 | 75 | class EncoderLayer(nn.Module): 76 | """Encoder is made up of self-attn and feed forward (defined below)""" 77 | def __init__(self, size, self_attn, feed_forward, dropout): 78 | super(EncoderLayer, self).__init__() 79 | self.self_attn = self_attn 80 | self.feed_forward = feed_forward 81 | self.sublayer = clones(SublayerConnection(size, dropout), 2) 82 | self.size = size 83 | 84 | def forward(self, x, mask): 85 | """Follow Figure 1 (left) for connections.""" 86 | x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask)) 87 | return self.sublayer[1](x, self.feed_forward) 88 | 89 | 90 | class MultiHeadedAttention(nn.Module): 91 | def __init__(self, h, d_model, dropout=0.1): 92 | """Take in model size and number of heads.""" 93 | super(MultiHeadedAttention, self).__init__() 94 | assert d_model % h == 0 95 | # We assume d_v always equals d_k 96 | self.d_k = d_model // h 97 | self.h = h 98 | self.linears = clones(nn.Linear(d_model, d_model), 4) 99 | self.attn = None 100 | self.dropout = nn.Dropout(p=dropout) 101 | 102 | def forward(self, query, key, value, mask=None): 103 | """Implements Figure 2.""" 104 | if mask is not None: 105 | # Same mask applied to all h heads. 106 | mask = mask.unsqueeze(1) 107 | nbatches = query.size(0) 108 | 109 | # 1) Do all the linear projections in batch from d_model => h x d_k 110 | query, key, value = \ 111 | [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) 112 | for l, x in zip(self.linears, (query, key, value))] 113 | 114 | # 2) Apply attention on all the projected vectors in batch. 115 | x, self.attn = attention(query, key, value, mask=mask, 116 | dropout=self.dropout) 117 | 118 | # 3) "Concat" using a view and apply a final linear. 119 | x = x.transpose(1, 2).contiguous() \ 120 | .view(nbatches, -1, self.h * self.d_k) 121 | return self.linears[-1](x) 122 | 123 | 124 | class PositionwiseFeedForward(nn.Module): 125 | """Implements FFN equation.""" 126 | def __init__(self, d_model, d_ff, dropout=0.1): 127 | super(PositionwiseFeedForward, self).__init__() 128 | self.w_1 = nn.Linear(d_model, d_ff) 129 | self.w_2 = nn.Linear(d_ff, d_model) 130 | self.dropout = nn.Dropout(dropout) 131 | 132 | def forward(self, x): 133 | return self.w_2(self.dropout(F.relu(self.w_1(x)))) 134 | 135 | 136 | class PositionalEncoding(nn.Module): 137 | """Implement the PE function.""" 138 | def __init__(self, d_model, dropout, max_len=5000): 139 | super(PositionalEncoding, self).__init__() 140 | self.dropout = nn.Dropout(p=dropout) 141 | 142 | # Compute the positional encodings once in log space. 
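        # pe[pos, 2i]   = sin(pos / 10000^(2i / d_model))
        # pe[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
        # The factor 10000^(2i / d_model) is computed below as
        # exp(-(2i) * ln(10000) / d_model), which avoids raising 10000 to a
        # large power directly.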
143 | pe = torch.zeros(max_len, d_model) 144 | position = torch.arange(0, max_len).unsqueeze(1) 145 | div_term = torch.exp(torch.arange(0, d_model, 2) * 146 | -(math.log(10000.0) / d_model)) 147 | pe[:, 0::2] = torch.sin(position * div_term) 148 | pe[:, 1::2] = torch.cos(position * div_term) 149 | pe = pe.unsqueeze(0) 150 | self.register_buffer('pe', pe) 151 | 152 | def forward(self, x): 153 | x = x + Variable(self.pe[:, :x.size(1)], 154 | requires_grad=False) 155 | return self.dropout(x) 156 | 157 | 158 | class TransformerEncoder(nn.Module): 159 | """A Transformer encoder.""" 160 | def __init__(self, input_size, N, d_model, d_ff, h, dropout): 161 | super(TransformerEncoder, self).__init__() 162 | attn = MultiHeadedAttention(h, d_model) 163 | ff = PositionwiseFeedForward(d_model, d_ff, dropout) 164 | self.d_model = d_model 165 | self.encoder = Encoder(EncoderLayer(d_model, attn, ff, dropout), N) 166 | self.projection = nn.Linear(input_size, d_model) # to get to the proper input size 167 | self.positional = PositionalEncoding(d_model, dropout) 168 | 169 | self.initialize_parameters() 170 | 171 | def initialize_parameters(self): 172 | """Initialize parameters with Glorot.""" 173 | for p in self.parameters(): 174 | if p.dim() > 1: 175 | nn.init.xavier_uniform(p) 176 | 177 | def forward(self, x, mask): 178 | """Take in and process masked src and target sequences.""" 179 | return self.encode(x, mask) 180 | 181 | def embed(self, x): 182 | x = self.projection(x) * math.sqrt(self.d_model) 183 | return self.positional(x) 184 | 185 | def encode(self, x, mask): 186 | return self.encoder(self.embed(x), mask) 187 | 188 | @property 189 | def num_parameters(self): 190 | """Returns the number of trainable parameters of the model.""" 191 | return sum(np.prod(p.shape) for p in self.parameters() if p.requires_grad) 192 | 193 | 194 | if __name__ == '__main__': 195 | encoder = TransformerEncoder(d_model=512) 196 | embedding = nn.Embedding(100, 512) 197 | 198 | x = Variable(torch.arange(1, 16).long().view(3, 5)) 199 | mask = (x != 0).unsqueeze(-2) # Why unsqueeze? 
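    # The unsqueeze gives the mask shape (batch, 1, seq_len) so that it
    # broadcasts over the query positions of the attention score matrix;
    # inside MultiHeadedAttention it is unsqueezed once more so that it also
    # broadcasts over the heads.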
200 | print(x) 201 | print(mask) 202 | out = encoder(embedding(x), mask) 203 | print(out) 204 | -------------------------------------------------------------------------------- /util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import csv 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.autograd import Variable 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | 13 | class Timer: 14 | """A simple timer to use during training.""" 15 | def __init__(self): 16 | self.time0 = time.time() 17 | 18 | def elapsed(self): 19 | time1 = time.time() 20 | elapsed = time1 - self.time0 21 | self.time0 = time1 22 | return elapsed 23 | 24 | 25 | def write_losses(train_loss, train_acc, val_acc, outdir): 26 | """Write out the loss and accuracy to CSV files.""" 27 | with open(os.path.join(outdir, 'loss.train.csv'), 'w') as f: 28 | writer = csv.writer(f) 29 | names = [['loss']] 30 | losses = [[l] for l in train_loss] 31 | writer.writerows(names + losses) 32 | with open(os.path.join(outdir, 'acc.train.csv'), 'w') as f: 33 | writer = csv.writer(f) 34 | names = [["train_arc_acc", "train_lab_acc"]] 35 | writer.writerows(names + train_acc) 36 | with open(os.path.join(outdir, 'acc.val.csv'), 'w') as f: 37 | writer = csv.writer(f) 38 | names = [["val_arc_acc", "val_lab_acc"]] 39 | writer.writerows(names + val_acc) 40 | 41 | 42 | # def plot(corpus, model, fig, ax, step, sent=2): 43 | # words = Variable(torch.LongTensor([corpus.train.words[sent]])) 44 | # tags = Variable(torch.LongTensor([corpus.train.tags[sent]])) 45 | # heads = Variable(torch.LongTensor([corpus.train.heads[sent]])) 46 | # labels = Variable(torch.LongTensor([corpus.train.labels[sent]])) 47 | # # Disable dropout. 48 | # model.eval() 49 | # S_arc, S_lab = model(words, tags) 50 | # # Turn dropout back on. 51 | # model.train() 52 | # # Plot the gold adjacency matrix, if does not yet exist. 53 | # if not os.path.exists('img/gold.pdf'): 54 | # # Make a 0/1 gold adjacency matrix. 55 | # n = words.size(1) 56 | # G = np.zeros((n, n)) 57 | # heads = heads.squeeze().data.numpy() 58 | # G[heads, np.arange(n)] = 1. 59 | # im = ax.imshow(G, vmin=0, vmax=1) 60 | # fig.colorbar(im) 61 | # plt.savefig('img/gold.pdf'.format(step)) 62 | # plt.cla() 63 | # plt.clf() 64 | # # Plot the predicted adjacency matrix 65 | # A = F.softmax(S_arc.squeeze(0), dim=0) 66 | # fig, ax = plt.subplots() 67 | # im = ax.imshow(A.data.numpy(), vmin=0, vmax=1) 68 | # fig.colorbar(im) 69 | # plt.savefig('img/a.{}.pdf'.format(step)) 70 | # plt.cla() 71 | # plt.clf() 72 | --------------------------------------------------------------------------------
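# A minimal plotting sketch, assuming matplotlib is installed and that training
# wrote its logs to ./log (the directory name and the `plot_train_loss` helper
# are illustrative, not part of the repository); it reads the loss.train.csv
# file produced by `write_losses` above, whose first row is the 'loss' header.
import csv
import os

import matplotlib.pyplot as plt


def plot_train_loss(logdir='log'):
    """Plot the per-step training loss written out by util.write_losses."""
    with open(os.path.join(logdir, 'loss.train.csv')) as f:
        rows = list(csv.reader(f))
    # Skip the header row and any blank rows the csv writer may have produced.
    losses = [float(row[0]) for row in rows[1:] if row]
    plt.plot(losses)
    plt.xlabel('step')
    plt.ylabel('training loss')
    plt.savefig(os.path.join(logdir, 'loss.train.pdf'))


if __name__ == '__main__':
    plot_train_loss()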