├── README.md
├── __init__.py
├── __init__.pyc
├── aaai19 sup.pdf
├── data
│   ├── cluster
│   │   └── fr_cluster
│   ├── pr_rules
│   │   └── ud_c
│   │       ├── fr_0.5.txt
│   │       └── fr_10_gt.txt
│   ├── raw_text
│   │   └── fr_train.txt
│   └── ud
│       ├── fr-ud-dev_clean.conllu
│       ├── fr-ud-test_clean.conllu
│       ├── fr-ud-train_clean.conllu
│       ├── fr_test_enhanced
│       ├── fr_train_enhanced
│       └── fr_valid_enhanced
└── vi
    ├── Session.py
    ├── __init__.py
    ├── __init__.pyc
    ├── modules
    │   ├── Decoder.py
    │   ├── Encoder.py
    │   ├── Encoder.pyc
    │   ├── PR.py
    │   ├── RNNLM.py
    │   ├── Utils.py
    │   ├── Utils.pyc
    │   ├── __init__.py
    │   └── __init__.pyc
    ├── myio
    │   ├── IO.py
    │   ├── IO.pyc
    │   ├── Preprocess.py
    │   ├── Preprocess.pyc
    │   ├── Tree.py
    │   ├── Tree.pyc
    │   ├── Utils.py
    │   ├── Utils.pyc
    │   ├── __init__.py
    │   ├── __init__.pyc
    │   └── ud_cleaner.py
    ├── nvil_ft.py
    ├── nvil_ft_ud.py
    ├── nvil_pre.py
    ├── nvil_pre_ud.py
    ├── rl_ft.py
    ├── test.py
    ├── test_ud.py
    ├── train_decoder.py
    ├── train_encoder.py
    ├── train_lm.py
    └── ud_scripts
        ├── ud_ft.sh
        ├── ud_pre.sh
        ├── ud_test.sh
        ├── ud_train_decoder.sh
        └── ud_train_encoder.sh
/README.md:
--------------------------------------------------------------------------------
1 | # Code for 'Dependency Grammar Induction with a Neural Variational Transition-based Parser' (AAAI2019)
2 |
3 | ## Preprocessing:
4 |
5 | [Brown Clustering](https://github.com/percyliang/brown-cluster)
6 | After clustering, add two extra fields (the cluster index and the token index inside the cluster) to the UD/WSJ dataset (a sketch of this step is given below)
7 | [Customized TorchText `0.2.3`](https://drive.google.com/file/d/1sDGfPwq0vNwSh-2JmXomi8GyTjAvOY81/view?usp=sharing)
8 |
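The enhancement step itself is implemented in `vi/myio/` (`Preprocess.py`, `ud_cleaner.py`). As a rough illustration only, the sketch below assumes the Brown-cluster `paths` output format (tab-separated bit-string, word, count per line) and a whitespace-tokenized sentence; the helper names are hypothetical, not the repo's API.

```python
# Illustrative sketch only -- the repo's own preprocessing lives in vi/myio/.
# Assumes a Brown-cluster "paths" file with lines: <bit-string>\t<word>\t<count>
from collections import defaultdict

def load_cluster_map(paths_fname):
    """Map word -> (cluster index, token index inside that cluster)."""
    bit2ci = {}                   # bit-string -> dense cluster index
    next_slot = defaultdict(int)  # cluster index -> next free token index
    word2fields = {}
    with open(paths_fname) as f:
        for line in f:
            bitstring, word, _count = line.rstrip('\n').split('\t')
            ci = bit2ci.setdefault(bitstring, len(bit2ci))
            word2fields[word] = (ci, next_slot[ci])
            next_slot[ci] += 1
    return word2fields

def enhance_tokens(tokens, word2fields, unk=(0, 0)):
    """Append (cluster index, index inside cluster) to every token of a sentence."""
    return [(tok,) + word2fields.get(tok, unk) for tok in tokens]
```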
9 | *Since the WSJ corpus is not publicly available, the training and evaluation scripts below are given for UD*.
10 |
11 | ## Supervised training (for UD)
12 | Train the encoder `./ud_scripts/ud_train_encoder.sh`
13 | Train the decoder `./ud_scripts/ud_train_decoder.sh`
14 | Note:
15 | Set no length limit during preprocessing to keep the full vocabulary;
16 | Set the random seed to -1
17 |
18 | ## Weakly-/Un-supervised training (for UD)
19 | Rule settings:
20 | Universal Rules: `--pr_fname "./data/pr_rules/ud_c/"$LANGUAGE"_0.5.txt"`
21 | Weakly Supervised: `--pr_fname "./data/pr_rules/ud_c/"$LANGUAGE"_10_gt.txt"`
22 |
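Each rule file under `data/pr_rules/ud_c/` (e.g. `fr_0.5.txt`, `fr_10_gt.txt`) contains one rule per line; the three whitespace-separated columns appear to be the head POS tag, the dependent POS tag, and the expected proportion used by the posterior regularizer. The file passed via `--pr_fname` is consumed by `vi/modules/PR.py`; the loader below is only a minimal sketch of that assumed format, not the repo's code.

```python
# Minimal sketch of the assumed rule-file format; the real loading is in vi/modules/PR.py.
def load_pr_rules(pr_fname):
    """Return {(head_pos, dependent_pos): expected_proportion}."""
    rules = {}
    with open(pr_fname) as f:
        for line in f:
            parts = line.split()
            if len(parts) != 3:
                continue
            head_pos, dep_pos, proportion = parts
            rules[(head_pos, dep_pos)] = float(proportion)
    return rules

# e.g. load_pr_rules("./data/pr_rules/ud_c/fr_0.5.txt")[("ROOT", "VERB")] == 0.5
```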
23 | Pretrain: `cd ud_scripts && ./ud_pre.sh`
24 | Finetune: `cd ud_scripts && ./ud_ft.sh`
25 |
26 | ## Evaluation (for UD)
27 | `cd ud_scripts && ./ud_test.sh`
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/__init__.py
--------------------------------------------------------------------------------
/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/__init__.pyc
--------------------------------------------------------------------------------
/aaai19 sup.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/aaai19 sup.pdf
--------------------------------------------------------------------------------
/data/pr_rules/ud_c/fr_0.5.txt:
--------------------------------------------------------------------------------
1 | ROOT VERB 0.5
2 | ROOT NOUN 0.5
3 | VERB NOUN 0.5
4 | VERB ADV 0.5
5 | VERB VERB 0.5
6 | VERB AUX 0.5
7 | NOUN ADJ 0.5
8 | NOUN DET 0.5
9 | NOUN NUM 0.5
10 | NOUN NOUN 0.5
11 | NOUN CONJ 0.5
12 | NOUN ADP 0.5
13 | ADJ ADV 0.5
--------------------------------------------------------------------------------
/data/pr_rules/ud_c/fr_10_gt.txt:
--------------------------------------------------------------------------------
1 | VERB AUX 0.828
2 | ROOT VERB 0.600
3 | VERB NOUN 0.512
4 | VERB ADV 0.488
5 | NOUN DET 0.484
6 | NOUN ADJ 0.412
7 | NOUN NUM 0.374
8 | ADJ ADV 0.370
9 | NOUN ADP 0.319
10 | VERB VERB 0.270
11 | NOUN NOUN 0.217
12 | NOUN CONJ 0.194
13 | ROOT NOUN 0.117
--------------------------------------------------------------------------------
/vi/Session.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | import os.path
4 | import os
5 | import time
6 | import sys
7 | import random
8 |
9 | import torch
10 | import torch.nn as nn
11 | import torch.nn.init as init
12 | import torch.optim as optim
13 | from torchtext.data import BucketIterator, Iterator
14 | from torchtext.vocab import FastText, GloVe
15 | from torch.nn.utils import clip_grad_norm_
16 |
17 | from vi_syntax.vi.modules.Decoder import Decoder
18 | from vi_syntax.vi.modules.Encoder import Encoder
19 | from vi_syntax.vi.modules.RNNLM import LanguageModel
20 | from vi_syntax.vi.modules.RNNLM import Baseline_linear
21 | from vi_syntax.vi.modules.PR import PR
22 | from vi_syntax.vi.modules.Utils import compute_dda, compute_rule_acc, compute_dda_long_dep
23 | from vi_syntax.vi.myio.IO import VIDataset
24 |
25 |
26 | class Session(object):
27 |
28 | def __init__(self, opt):
29 | self.opt = opt
30 | self.logger = self.create_logger()
31 | self.initializers = {'glorot': init.xavier_normal_,
32 | 'constant': lambda x: init.constant(x, 0.01),
33 | 'uniform': lambda x: init.uniform(x, a=-0.1, b=0.1),
34 | 'normal': lambda x: init.normal_(x, mean=0, std=1)
35 | }
36 |
37 | self.optimizers = {'sgd': lambda x: optim.SGD(x, lr=0.1, momentum=0.9),
38 | 'adam': lambda x: optim.Adam(x, lr=self.opt.lr),
39 | 'adadelta': lambda x: optim.Adadelta(x, lr=self.opt.lr),
40 | 'adagrad': lambda x: optim.Adagrad(x, lr=self.opt.lr, weight_decay=1e-4)
41 | }
42 |
43 | def _build_wsj_dataset(self):
44 | """
45 | load wsj dataset
46 | """
47 | self.dataset = VIDataset()
48 | self.dataset.build_wsj_enhanced_dataset(data_path=self.opt.data_path, train_fname=self.opt.train_fname,
49 | valid_fname = self.opt.valid_fname, test_fname=self.opt.test_fname,
50 | max_length=9999, cluster_fname=self.opt.cluster_fname,
51 | min_freq=self.opt.min_freq,
52 | vectors=[GloVe(name='6B', dim='300',
53 | cache = self.opt.word_vector_cache,
54 | unk_init = torch.Tensor.normal_)])
55 |
56 | train_iter, self.valid_iter, self.test_iter = BucketIterator.splits(
57 | (self.dataset.train, self.dataset.valid, self.dataset.test),
58 | batch_size=self.opt.batchsize, sort_within_batch=True, device=self.opt.gpu_id) # sort within batch
59 | self.train_sampler = train_iter.__iter__()
60 | self.valid_sampler = self.valid_iter.__iter__()
61 | self.test_sampler = self.test_iter.__iter__()
62 |
63 | def _build_ud_dataset(self):
64 | """
65 | load ud dataset
66 | """
67 | self.dataset = VIDataset()
68 | self.dataset.build_ud_enhanced_dataset(data_path=self.opt.data_path, train_fname=self.opt.train_fname,
69 | valid_fname=self.opt.valid_fname, test_fname=self.opt.test_fname,
70 | max_length=999,
71 | min_freq=self.opt.min_freq,
72 | vectors = [FastText(language=self.opt.language,
73 | cache=self.opt.word_vector_cache,
74 | unk_init=torch.Tensor.normal_)],
75 | cluster_fname=self.opt.cluster_fname)
76 |
77 | if self.opt.seed > 0:
78 | random.seed(self.opt.seed)
79 | train_iter, self.valid_iter, self.test_iter = BucketIterator.splits(
80 | (self.dataset.train, self.dataset.valid, self.dataset.test),
81 | batch_size=self.opt.batchsize,
82 | sort_within_batch=True,
83 | device=self.opt.gpu_id,
84 | random_state=random.getstate()) # sort within batch
85 | else:
86 | train_iter, self.valid_iter, self.test_iter = BucketIterator.splits(
87 | (self.dataset.train, self.dataset.valid, self.dataset.test),
88 | batch_size=self.opt.batchsize,
89 | sort_within_batch=True,
90 | device=self.opt.gpu_id) # sort within batch
91 | self.train_sampler = train_iter.__iter__()
92 | self.valid_sampler = self.valid_iter.__iter__()
93 | self.test_sampler = self.test_iter.__iter__()
94 |
95 | def train_encoder(self):
96 | """
97 | train the encoder in a supervised setting
98 | """
99 | if self.opt.language != '':
100 | self._build_ud_dataset()
101 | else:
102 | self._build_wsj_dataset()
103 | self.encoder = self.create_encoder()
104 | self.optim = self.optimizers[self.opt.optimizer](
105 | [{'params': [param for param in self.encoder.parameters() if param.requires_grad]}])
106 | self.encoder.train()
107 |
108 | self.len_train = len(self.dataset.train)
109 | self.len_real_train = 0
110 | for i in range(1, self.len_train + 1):
111 | sample = self.train_sampler.next()
112 | if sample.word[1].item() <= self.opt.train_max_length:
113 | self.len_real_train += 1
114 | total_loss_act = 0.
115 | for epoch in range(1, self.opt.epochs + 1):
116 | cur_time = time.time()
117 | cur_sample = 0
118 | i = 0
119 | for _ in range(1, self.len_train + 1):
120 | self.optim.zero_grad()
121 | sample = self.train_sampler.next()
122 | if sample.word[1].item() > self.opt.train_max_length:
123 | continue
124 | i += 1
125 | loss_act = self.encoder.train_parser(words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=sample.action)
126 | if loss_act is not None:
127 | total_loss_act += loss_act.data.item()
128 | loss_act.backward()
129 | self.optim.step()
130 | if i % self.opt.print_every == 0 or i == self.len_real_train:
131 | elapsed_time = time.time() - cur_time
132 | cur_time = time.time()
133 | elapsed_sample = i - cur_sample
134 | cur_sample = i
135 | self.logger.info('epoch {:3d} | {:5d}/{:5d} | avg loss act {:5.2f} | time {:5.2f}s'. \
136 | format(epoch, i, self.len_real_train,
137 | total_loss_act / elapsed_sample, elapsed_time))
138 | total_loss_act = 0.
139 | self.logger.info('=' * 80)
140 | valid_dda = self.parse(self.valid_sampler)
141 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
142 | self.logger.info('epoch {:3d} | valid dda {:5.2f}'.format(epoch, valid_dda))
143 | test_dda = self.parse(self.test_sampler)
144 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
145 | self.logger.info('epoch {:3d} | test dda {:5.2f}'.format(epoch, test_dda))
146 | self.logger.info('=' * 80)
147 |
148 | def train_decoder(self):
149 | """
150 | train the decoder in a supervised setting
151 | """
152 | if self.opt.language != '':
153 | self._build_ud_dataset()
154 | else:
155 | self._build_wsj_dataset()
156 | self.decoder = self.create_decoder()
157 | self.optim = self.optimizers[self.opt.optimizer](
158 | # [{'params': [param for name, param in self.decoder.named_parameters() if 'embedding' not in name]}])
159 | [{'params': [param for param in self.decoder.parameters() if param.requires_grad]}])
160 | self.decoder.train()
161 |
162 | self.len_train = len(self.dataset.train)
163 | self.len_real_train = 0
164 | for i in range(1, self.len_train + 1):
165 | sample = self.train_sampler.next()
166 | if sample.word[1].item() <= self.opt.train_max_length:
167 | self.len_real_train += 1
168 | total_loss_act = 0.
169 | total_loss_token = 0.
170 | total_loss = 0.
171 | for epoch in range(1, self.opt.epochs + 1):
172 | cur_time = time.time()
173 | cur_sample = 0
174 | i = 0
175 | for _ in range(1, self.len_train + 1):
176 | self.optim.zero_grad()
177 | sample = self.train_sampler.next()
178 | if sample.word[1].item() > self.opt.train_max_length:
179 | continue
180 | i += 1
181 | loss, loss_act, loss_token = self.decoder(words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=sample.action)
182 | loss.backward()
183 | self.optim.step()
184 | if loss_act is not None:
185 | total_loss_act += loss_act.data.item()
186 | total_loss += loss.data.item()
187 | total_loss_token += loss_token.data.item()
188 | if i % self.opt.print_every == 0 or i == self.len_real_train:
189 | elapsed_time = time.time() - cur_time
190 | cur_time = time.time()
191 | elapsed_sample = i - cur_sample
192 | cur_sample = i
193 | self.logger.info('epoch {:3d} | {:5d}/{:5d} | avg loss act {:5.2f} | avg loss token {:5.2f} | avg loss {:5.2f} | time {:5.2f}s'. \
194 | format(epoch, i, self.len_real_train, total_loss_act / elapsed_sample,
195 | total_loss_token / elapsed_sample, total_loss / elapsed_sample,
196 | elapsed_time))
197 | total_loss_act = 0.
198 | total_loss_token = 0.
199 | total_loss = 0.
200 | self.logger.info('=' * 80)
201 | valid_loss, valid_loss_act, valid_loss_token = self.compute_joint_loss(self.valid_sampler)
202 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
203 | self.logger.info('epoch {:3d} | avg valid loss act {:5.2f} | avg valid loss token {:5.2f} | avg valid loss {:5.2f}'
204 | .format(epoch, valid_loss_act, valid_loss_token, valid_loss))
205 | test_loss, test_loss_act, test_loss_token = self.compute_joint_loss(self.test_sampler)
206 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
207 | self.logger.info('epoch {:3d} | avg test loss act {:5.2f} | avg test loss token {:5.2f} | avg test loss {:5.2f}'
208 | .format(epoch, test_loss_act, test_loss_token, test_loss))
209 | self.logger.info('=' * 80)
210 |
211 | def train_lm(self):
212 | """
213 | train language model
214 | """
215 | self.opt.cluster_fname = None
216 | self._build_wsj_dataset()
217 | self.lm = self.create_lm()
218 | self.lm.train()
219 |
220 | self.optim = self.optimizers[self.opt.optimizer](
221 | [{'params': [param for param in self.lm.parameters() if param.requires_grad]}])
222 |
223 | self.len_train = len(self.dataset.train) / self.opt.batchsize + 1
224 | self.len_real_train = 0
225 | for i in range(1, self.len_train + 1):
226 | sample = self.train_sampler.next()
227 | if torch.sum(sample.word[1]).item() <= self.opt.train_max_length * self.opt.batchsize + 3:
228 | self.len_real_train += 1
229 | total_loss = 0.
230 | best_valid_ppl = 10.e9
231 | print 'real train length', self.len_real_train
232 | for epoch in range(1, self.opt.epochs + 1):
233 | cur_time = time.time()
234 | elapsed_token = 0
235 | for i in range(1, self.len_real_train + 1):
236 | self.optim.zero_grad()
237 | sample = self.train_sampler.next()
238 | while torch.sum(sample.word[1]).item() > (self.opt.train_max_length + 1) * self.opt.batchsize:
239 | sample = self.train_sampler.next()
240 | loss = self.lm(words=sample.word, pos_tags=sample.pos_tag)
241 | loss.backward()
242 | total_loss += loss.cpu().item()
243 | clip_grad_norm_([param for param in self.lm.parameters() if param.requires_grad], self.opt.clip)
244 | self.optim.step()
245 | elapsed_token += torch.sum(sample.word[1]).item()
246 | if i % self.opt.print_every == 0 or i == self.len_real_train:
247 | elapsed_time = time.time() - cur_time
248 | cur_time = time.time()
249 | self.logger.info('epoch {:3d} | {:5d}/{:5d} | ppl {:5.2f} | time {:5.2f}s'. \
250 | format(epoch, i, self.len_real_train,
251 | math.exp(total_loss / elapsed_token), elapsed_time))
252 | elapsed_token = 0
253 | total_loss = 0.
254 | self.logger.info('=' * 80)
255 | valid_ppl = self.compute_lm_ppl(self.valid_sampler)
256 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
257 | self.logger.info('epoch {:3d} | valid ppl {:5.2f}'.format(epoch, valid_ppl))
258 | test_ppl = self.compute_lm_ppl(self.test_sampler)
259 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
260 | self.logger.info('epoch {:3d} | test ppl {:5.2f}'.format(epoch, test_ppl))
261 | self.logger.info('=' * 80)
262 | if self.opt.save_model:
263 | if valid_ppl < best_valid_ppl:
264 | prev_model_fname = os.path.join(self.opt.result_dir,
265 | '{}_lm_valid-ppl-{:.2f}_len-{}.pt'
266 | .format(self.opt.log_name, best_valid_ppl, self.opt.train_max_length))
267 | if os.path.exists(prev_model_fname):
268 | os.remove(prev_model_fname)
269 | best_valid_ppl = valid_ppl
270 | cur_model_fname = os.path.join(self.opt.result_dir,
271 | '{}_lm_valid-ppl-{:.2f}_len-{}.pt'
272 | .format(self.opt.log_name, best_valid_ppl, self.opt.train_max_length))
273 | self.lm.save(cur_model_fname)
274 |
275 | def nvil_pr_pretrain(self):
276 | """
277 | Pretrain both encoder and decoder using posterior regularization as the direct reward
278 | batchsize here means the NVIL batch size (not an exact division here!)
279 | """
280 | self._build_wsj_dataset()
281 | self.encoder = self.create_encoder()
282 | self.decoder = self.create_decoder()
283 | self.pr = self.create_pr()
284 | enc_param = [param for param in self.encoder.parameters() if param.requires_grad]
285 | dec_param = [param for param in self.decoder.parameters() if param.requires_grad]
286 | pr_param = [param for param in self.pr.parameters() if param.requires_grad]
287 | self.enc_optim = self.optimizers[self.opt.optimizer](
288 | [{'params': enc_param}])
289 | self.dec_optim = self.optimizers[self.opt.optimizer](
290 | [{'params': dec_param}])
291 | self.pr_optim = self.optimizers[self.opt.pr_optimizer](
292 | [{'params': pr_param}])
293 | self.encoder.train()
294 | self.decoder.train()
295 | self.pr.train()
296 | self.enc_optim.zero_grad()
297 | self.dec_optim.zero_grad()
298 | self.pr_optim.zero_grad()
299 |
300 | self.logger.info('=' * 80)
301 | valid_dda = self.parse(self.valid_sampler)
302 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
303 | self.logger.info('epoch {:3d} | valid dda {:5.2f}'.format(0, valid_dda))
304 | self.logger.info('=' * 80)
305 |
306 | self.len_train = len(self.dataset.train)
307 | self.len_real_train = 0.
308 | for i in range(1, self.len_train + 1):
309 | sample = self.train_sampler.next()
310 | if sample.word[1].item() <= self.opt.train_max_length:
311 | self.len_real_train += 1
312 | self.len_real_train = int(math.ceil(self.len_real_train / self.opt.nvil_batchsize))
313 | total_enc_loss = 0.
314 | total_dec_loss = 0.
315 | total_score_mean = 0.
316 | total_score_var = 0.
317 | best_valid_dda = 0.
318 | for epoch in range(1, self.opt.epochs + 1):
319 | cur_time = time.time()
320 | cur_batch = 0
321 | i = 0
322 | for _ in range(1, self.len_real_train + 1):
323 | batch = []
324 | while len(batch) < self.opt.nvil_batchsize:
325 | sample = self.train_sampler.next()
326 | if sample.word[1].item() <= self.opt.train_max_length:
327 | batch.append(sample)
328 | i += 1
329 | for sample in batch:
330 | enc_loss_act_list = []
331 | dec_loss_list = []
332 | for mc in range(self.opt.mc_samples):
333 | enc_loss_act, predicted_act, feature = self.encoder.parse_pr(
334 | sample.word[0], sample.pos_tag, self.pr.rule2i, sample=True)
335 | self.pr.phi.data[mc] = feature
336 | enc_loss_act_list.append(enc_loss_act)
337 | dec_loss, _, _ = self.decoder(
338 | words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=predicted_act)
339 | dec_loss_list.append(dec_loss)
340 | if sample.word[0].size(1) == 1:
341 | continue # skip backward
342 |
343 | # update posterior regularizer
344 | pr_factor = torch.ones(self.opt.mc_samples)
345 | if self.opt.gpu_id > -1:
346 | pr_factor = pr_factor.cuda()
347 | if torch.sum(self.pr.phi).item() < 0:
348 | pr_loss, pr_factor = self.pr()
349 | pr_loss.backward()
350 | self.pr.reset_phi()
351 |
352 | # backward w.r.t. encoder and decoder
353 | for mc in range(self.opt.mc_samples):
354 | enc_loss_act = enc_loss_act_list[mc]
355 | dec_loss = dec_loss_list[mc]
356 | total_dec_loss += dec_loss.item()
357 | total_enc_loss += enc_loss_act.item()
358 |
359 | dec_loss = dec_loss * pr_factor[mc].item() / self.opt.mc_samples
360 | dec_loss.backward()
361 |
362 | enc_loss_act = enc_loss_act * pr_factor[mc].item() / self.opt.mc_samples
363 | enc_loss_act.backward()
364 |
365 | clip_grad_norm_(enc_param + dec_param, self.opt.clip)
366 | self.enc_optim.step()
367 | self.dec_optim.step()
368 | self.pr_optim.step()
369 | self.pr.project()
370 | self.enc_optim.zero_grad()
371 | self.dec_optim.zero_grad()
372 | self.pr_optim.zero_grad()
373 |
374 | if i % self.opt.print_every == 0 or i == self.len_real_train:
375 | elapsed_time = time.time() - cur_time
376 | cur_time = time.time()
377 | elapsed_batch = i - cur_batch
378 | cur_batch = i
379 | self.logger.info('epoch {:3d} | {:5d}/{:5d} | avg enc loss {:5.2f} | avg dec loss {:5.2f} | time {:5.2f}s'. \
380 | format(epoch, i, self.len_real_train, total_enc_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples,
381 | total_dec_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples, elapsed_time))
382 | total_enc_loss = 0.
383 | total_dec_loss = 0.
384 |
385 | if i % self.opt.save_every == 0 or i == self.len_real_train:
386 | # validate
387 | self.logger.info('=' * 80)
388 | valid_dda = self.parse(self.valid_sampler)
389 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
390 | self.logger.info('epoch {:3d} | valid dda {:5.4f}'.format(epoch, valid_dda))
391 |
392 | if valid_dda > best_valid_dda:
393 | # save encoder model
394 | prev_enc_fname = os.path.join(self.opt.result_dir,
395 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
396 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
397 | if os.path.exists(prev_enc_fname):
398 | os.remove(prev_enc_fname)
399 | cur_enc_fname = os.path.join(self.opt.result_dir,
400 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
401 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
402 | self.encoder.save(cur_enc_fname)
403 |
404 | # save decoder model
405 | prev_dec_fname = os.path.join(self.opt.result_dir,
406 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
407 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
408 | if os.path.exists(prev_dec_fname):
409 | os.remove(prev_dec_fname)
410 | cur_dec_fname = os.path.join(self.opt.result_dir,
411 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
412 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
413 | self.decoder.save(cur_dec_fname)
414 |
415 | # test short/long
416 | best_valid_dda = valid_dda
417 | max_length = 10
418 | test_dda = self.parse(self.test_sampler, max_length=max_length)
419 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
420 | self.logger.info('epoch {:3d} | test dda-{:2d} {:5.4f}'.format(epoch, max_length, test_dda))
421 | # max_length = 9999
422 | # valid_dda = self.parse(self.valid_sampler, max_length=max_length)
423 | # self.valid_sampler = self.test_iter.__iter__() # renew the iterator
424 | # self.logger.info('epoch {:3d} | valid dda-{:2d} {:5.4f}'.format(epoch, max_length, valid_dda))
425 | # test_dda = self.parse(self.test_sampler, max_length=max_length)
426 | # self.test_sampler = self.test_iter.__iter__() # renew the iterator
427 | # self.logger.info('epoch {:3d} | test dda-{:2d} {:5.4f}'.format(epoch, max_length, test_dda))
428 |
429 | self.logger.info('=' * 80)
430 |
431 | def nvil_pr_ft(self):
432 | """
433 | Finetune both the encoder and decoder from the pretrained model using different reward functions
434 | (RL-SN / RL-C / RL-PC; details can be found in the paper)
435 | batchsize here means the NVIL batch size (not an exact division here!)
436 | """
437 | self._build_wsj_dataset()
438 | self.encoder = self.create_encoder(self.opt.encoder_fname)
439 | self.decoder = self.create_decoder(self.opt.decoder_fname)
440 | self.pr = self.create_pr()
441 | enc_param = [param for param in self.encoder.parameters() if param.requires_grad]
442 | dec_param = [param for param in self.decoder.parameters() if param.requires_grad]
443 | pr_param = [param for param in self.pr.parameters() if param.requires_grad]
444 | self.enc_optim = self.optimizers[self.opt.optimizer](
445 | [{'params': enc_param}])
446 | self.dec_optim = self.optimizers[self.opt.optimizer](
447 | [{'params': dec_param}])
448 | self.pr_optim = self.optimizers[self.opt.pr_optimizer](
449 | [{'params': pr_param}])
450 | self.encoder.train()
451 | self.decoder.train()
452 | self.pr.train()
453 | self.enc_optim.zero_grad()
454 | self.dec_optim.zero_grad()
455 | self.pr_optim.zero_grad()
456 |
457 | self.logger.info('=' * 80)
458 | valid_dda = self.parse(self.valid_sampler)
459 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
460 | self.logger.info('initial | valid dda {:5.4f}'.format(valid_dda))
461 | self.logger.info('=' * 80)
462 |
463 | self.len_train = len(self.dataset.train)
464 | self.len_real_train = 0.
465 | for i in range(1, self.len_train + 1):
466 | sample = self.train_sampler.next()
467 | if sample.word[1].item() <= self.opt.train_max_length:
468 | self.len_real_train += 1
469 | self.len_real_train = int(math.ceil(self.len_real_train / self.opt.nvil_batchsize))
470 | total_enc_loss = 0.
471 | total_dec_loss = 0.
472 | best_valid_dda = 0.
473 | for epoch in range(1, self.opt.epochs + 1):
474 | cur_time = time.time()
475 | cur_batch = 0
476 | i = 0
477 | for _ in range(1, self.len_real_train + 1):
478 | batch = []
479 | while len(batch) < self.opt.nvil_batchsize:
480 | sample = self.train_sampler.next()
481 | if sample.word[1].item() <= self.opt.train_max_length:
482 | batch.append(sample)
483 | i += 1
484 | for sample in batch:
485 | enc_loss_act_list = []
486 | dec_loss_list = []
487 | for mc in range(self.opt.mc_samples):
488 | enc_loss_act, predicted_act, feature = self.encoder.parse_pr(
489 | sample.word[0], sample.pos_tag, self.pr.rule2i, sample=True)
490 | self.pr.phi.data[mc] = feature
491 | enc_loss_act_list.append(enc_loss_act)
492 | dec_loss, _, _ = self.decoder(
493 | words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=predicted_act)
494 | dec_loss_list.append(dec_loss)
495 |
496 | if sample.word[0].size(1) == 1:
497 | continue # skip backward
498 |
499 | # update posterior regularizer
500 | pr_factor = torch.ones(self.opt.mc_samples)
501 | if self.opt.gpu_id > -1:
502 | pr_factor = pr_factor.cuda()
503 | if torch.sum(self.pr.phi).item() < 0:
504 | pr_loss, pr_factor = self.pr()
505 | pr_loss.backward()
506 |
507 | phi = torch.sum(self.pr.phi, dim=1)
508 | normalized_phi = (phi - torch.mean(phi))
509 |
510 | # show SFE
511 | # print 'lambda', self.pr.Lambda.weight
512 | phi = phi.cpu().numpy() # for display
513 |
514 | self.pr.reset_phi()
515 | score_list = None
516 |
517 | # get score
518 | for mc in range(self.opt.mc_samples):
519 | enc_loss_act = enc_loss_act_list[mc]
520 | dec_loss = dec_loss_list[mc]
521 | total_dec_loss += dec_loss.item()
522 | total_enc_loss += enc_loss_act.item()
523 |
524 | score = - dec_loss + enc_loss_act
525 | score.unsqueeze_(0)
526 | if score_list is None:
527 | score_list = score
528 | else:
529 | score_list = torch.cat((score_list, score))
530 |
531 | # normalize scores
532 | score_mean = torch.mean(score_list)
533 | score_std = torch.std(score_list)
534 | nomalized_score_list = (score_list - score_mean) / score_std
535 |
536 | # backward w.r.t. encoder decoder
537 | for mc in range(self.opt.mc_samples):
538 | enc_loss_act = enc_loss_act_list[mc]
539 | dec_loss = dec_loss_list[mc]
540 | score = nomalized_score_list[mc]
541 |
542 | # RL-PC
543 | if normalized_phi[mc].item() < 0:
544 | score = abs(score.item())
545 | else:
546 | score = - abs(score.item())
547 | enc_loss_act = enc_loss_act * score * pr_factor[mc].item() / self.opt.mc_samples
548 | dec_loss = dec_loss * score * pr_factor[mc].item() / self.opt.mc_samples
549 |
550 | # # RL-SN
551 | # score = score.item()
552 | # enc_loss_act = enc_loss_act * score * pr_factor[mc].item() / self.opt.mc_samples
553 | # dec_loss = dec_loss * score * pr_factor[mc].item() / self.opt.mc_samples
554 |
555 | # # RL-C
556 | # enc_loss_act = enc_loss_act * (-normalized_phi[mc].item()) * pr_factor[mc].item() / self.opt.mc_samples
557 | # dec_loss = dec_loss * (-normalized_phi[mc].item()) * pr_factor[mc].item() / self.opt.mc_samples
558 |
559 | # backward
560 | enc_loss_act.backward()
561 | dec_loss.backward()
562 |
563 | clip_grad_norm_(enc_param + dec_param + pr_param, self.opt.clip)
564 | self.enc_optim.step()
565 | self.dec_optim.step()
566 | self.pr_optim.step()
567 | self.pr.project()
568 | self.enc_optim.zero_grad()
569 | self.dec_optim.zero_grad()
570 | self.pr_optim.zero_grad()
571 |
572 | if i % self.opt.print_every == 0 or i == self.len_real_train:
573 | elapsed_time = time.time() - cur_time
574 | cur_time = time.time()
575 | elapsed_batch = i - cur_batch
576 | cur_batch = i
577 | self.logger.info('epoch {:3d} | {:5d}/{:5d} | avg enc loss {:5.2f} | avg dec loss {:5.2f} | time {:5.2f}s'. \
578 | format(epoch, i, self.len_real_train, total_enc_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples,
579 | total_dec_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples, elapsed_time))
580 | total_enc_loss = 0.
581 | total_dec_loss = 0.
582 |
583 | if i % self.opt.save_every == 0 or i == self.len_train:
584 | # validate
585 | self.logger.info('=' * 80)
586 | valid_dda = self.parse(self.valid_sampler)
587 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
588 | self.logger.info('epoch {:3d} | valid dda {:5.4f}'.format(epoch, valid_dda))
589 |
590 | if valid_dda > best_valid_dda:
591 | # save encoder model
592 | prev_enc_fname = os.path.join(self.opt.result_dir,
593 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
594 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
595 | if os.path.exists(prev_enc_fname):
596 | os.remove(prev_enc_fname)
597 | cur_enc_fname = os.path.join(self.opt.result_dir,
598 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
599 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
600 | self.encoder.save(cur_enc_fname)
601 |
602 | # save decoder model
603 | prev_dec_fname = os.path.join(self.opt.result_dir,
604 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
605 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
606 | if os.path.exists(prev_dec_fname):
607 | os.remove(prev_dec_fname)
608 | cur_dec_fname = os.path.join(self.opt.result_dir,
609 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
610 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
611 | self.decoder.save(cur_dec_fname)
612 |
613 | # test
614 | best_valid_dda = valid_dda
615 | max_length = 10
616 | test_dda = self.parse(self.test_sampler, max_length=max_length)
617 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
618 | self.logger.info('epoch {:3d} | test dda-{:d} {:5.4f}'.format(epoch, max_length, test_dda))
619 |
620 | self.logger.info('=' * 80)
621 |
622 | def rl_pr_ft(self):
623 | """
624 | REINFORCE + baseline + posterior regularization (\cite{Miao})
625 | """
626 | self._build_wsj_dataset()
627 | self.lm = self.create_lm(self.opt.lm_fname)
628 | self.lm.eval()
629 | self.encoder = self.create_encoder(self.opt.encoder_fname)
630 | self.decoder = self.create_decoder(self.opt.decoder_fname)
631 | self.pr = self.create_pr()
632 | self.bl_linear = self.create_baseline_linear()
633 | bl_criterion = nn.MSELoss()
634 | enc_param = [param for param in self.encoder.parameters() if param.requires_grad]
635 | dec_param = [param for param in self.decoder.parameters() if param.requires_grad]
636 | pr_param = [param for param in self.pr.parameters() if param.requires_grad]
637 | bl_linear_param = [param for param in self.bl_linear.parameters()]
638 | self.enc_optim = self.optimizers[self.opt.optimizer](
639 | [{'params': enc_param}])
640 | self.dec_optim = self.optimizers[self.opt.optimizer](
641 | [{'params': dec_param}])
642 | self.pr_optim = self.optimizers[self.opt.pr_optimizer](
643 | [{'params': pr_param}])
644 | self.bl_linear_optim = self.optimizers[self.opt.pr_optimizer](
645 | [{'params': bl_linear_param}])
646 | self.encoder.train()
647 | self.decoder.train()
648 | self.pr.train()
649 | self.enc_optim.zero_grad()
650 | self.dec_optim.zero_grad()
651 | self.pr_optim.zero_grad()
652 | self.bl_linear_optim.zero_grad()
653 |
654 | self.logger.info('=' * 80)
655 | valid_dda = self.parse(self.valid_sampler)
656 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
657 | self.logger.info('initial | valid dda {:5.4f}'.format(valid_dda))
658 | self.logger.info('=' * 80)
659 |
660 | self.len_train = len(self.dataset.train)
661 | self.len_real_train = 0.
662 | for i in range(1, self.len_train + 1):
663 | sample = self.train_sampler.next()
664 | if sample.word[1].item() <= self.opt.train_max_length:
665 | self.len_real_train += 1
666 | self.len_real_train = int(math.ceil(self.len_real_train / self.opt.nvil_batchsize))
667 | total_enc_loss = 0.
668 | total_dec_loss = 0.
669 | best_valid_dda = 0.
670 | for epoch in range(1, self.opt.epochs + 1):
671 | cur_time = time.time()
672 | cur_batch = 0
673 | i = 0
674 | for _ in range(1, self.len_real_train + 1):
675 | batch = []
676 | while len(batch) < self.opt.nvil_batchsize:
677 | sample = self.train_sampler.next()
678 | if sample.word[1].item() <= self.opt.train_max_length:
679 | batch.append(sample)
680 | i += 1
681 | for sample in batch:
682 | enc_loss_act_list = []
683 | baseline_list = []
684 | dec_loss_list = []
685 | for mc in range(self.opt.mc_samples):
686 | enc_loss_act, predicted_act, feature = self.encoder.parse_pr(
687 | sample.word[0], sample.pos_tag, self.pr.rule2i, sample=True)
688 | self.pr.phi.data[mc] = feature
689 | enc_loss_act_list.append(enc_loss_act)
690 | dec_loss, _, _ = self.decoder(
691 | words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=predicted_act)
692 | dec_loss_list.append(dec_loss)
693 | lm_loss = self.lm(words=sample.word, pos_tags=sample.pos_tag).unsqueeze(0)
694 | baseline_list.append(self.bl_linear(lm_loss))
695 |
696 | if sample.word[0].size(1) == 1:
697 | continue # skip backward
698 |
699 | # update posterior regularizer
700 | pr_factor = torch.ones(self.opt.mc_samples)
701 | if self.opt.gpu_id > -1:
702 | pr_factor = pr_factor.cuda()
703 | if torch.sum(self.pr.phi).item() < 0:
704 | pr_loss, pr_factor = self.pr()
705 | pr_loss.backward()
706 |
707 | self.pr.reset_phi()
708 | score_list = None
709 |
710 | for mc in range(self.opt.mc_samples):
711 | enc_loss_act = enc_loss_act_list[mc]
712 | baseline = baseline_list[mc]
713 | dec_loss = dec_loss_list[mc]
714 | total_dec_loss += dec_loss.item()
715 | total_enc_loss += enc_loss_act.item()
716 |
717 | score = - dec_loss + enc_loss_act + baseline
718 | score.unsqueeze_(0)
719 | if score_list is None:
720 | score_list = score
721 | else:
722 | score_list = torch.cat((score_list, score))
723 | # backward w.r.t. baseline
724 | bl_loss = bl_criterion(baseline, torch.tensor((dec_loss - enc_loss_act).item()).unsqueeze_(0))
725 | bl_loss.backward()
726 |
727 | # backward w.r.t. encoder and decoder
728 | for mc in range(self.opt.mc_samples):
729 | enc_loss_act = enc_loss_act_list[mc]
730 | dec_loss = dec_loss_list[mc]
731 | score = score_list[mc]
732 | score = score.item()
733 |
734 | enc_loss_act = enc_loss_act * score * pr_factor[mc].item() / self.opt.mc_samples
735 | enc_loss_act.backward()
736 |
737 | dec_loss = dec_loss * score * pr_factor[mc].item() / self.opt.mc_samples
738 | dec_loss.backward()
739 |
740 | clip_grad_norm_(enc_param + dec_param + bl_linear_param + pr_param, self.opt.clip)
741 | self.enc_optim.step()
742 | self.dec_optim.step()
743 | self.pr_optim.step()
744 | self.pr.project()
745 | self.bl_linear_optim.step()
746 | self.enc_optim.zero_grad()
747 | self.dec_optim.zero_grad()
748 | self.pr_optim.zero_grad()
749 | self.bl_linear_optim.zero_grad()
750 |
751 | if i % self.opt.print_every == 0 or i == self.len_real_train:
752 | elapsed_time = time.time() - cur_time
753 | cur_time = time.time()
754 | elapsed_batch = i - cur_batch
755 | cur_batch = i
756 | self.logger.info('epoch {:3d} | {:5d}/{:5d} | avg enc loss {:5.2f} | avg dec loss {:5.2f} | time {:5.2f}s'. \
757 | format(epoch, i, self.len_real_train, total_enc_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples,
758 | total_dec_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples, elapsed_time))
759 | total_enc_loss = 0.
760 | total_dec_loss = 0.
761 |
762 | if i % self.opt.save_every == 0 or i == self.len_train:
763 | # validate
764 | self.logger.info('=' * 80)
765 | valid_dda = self.parse(self.valid_sampler)
766 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
767 | self.logger.info('epoch {:3d} | valid dda {:5.4f}'.format(epoch, valid_dda))
768 |
769 | if valid_dda > best_valid_dda:
770 | # save encoder model
771 | prev_enc_fname = os.path.join(self.opt.result_dir,
772 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
773 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
774 | if os.path.exists(prev_enc_fname):
775 | os.remove(prev_enc_fname)
776 | cur_enc_fname = os.path.join(self.opt.result_dir,
777 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
778 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
779 | self.encoder.save(cur_enc_fname)
780 |
781 | # save decoder model
782 | prev_dec_fname = os.path.join(self.opt.result_dir,
783 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
784 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
785 | if os.path.exists(prev_dec_fname):
786 | os.remove(prev_dec_fname)
787 | cur_dec_fname = os.path.join(self.opt.result_dir,
788 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
789 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
790 | self.decoder.save(cur_dec_fname)
791 |
792 | # test
793 | best_valid_dda = valid_dda
794 | max_length = 10
795 | test_dda = self.parse(self.test_sampler, max_length=max_length)
796 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
797 | self.logger.info('epoch {:3d} | test dda-{:d} {:5.4f}'.format(epoch, max_length, test_dda))
798 |
799 | self.logger.info('=' * 80)
800 |
801 | def test(self):
802 | """
803 | test the trained model (10/inf for wsj)
804 | Reranking by the decoder made no difference, so we only used the encoder for parsing
805 | """
806 | self._build_wsj_dataset()
807 | self.encoder = self.create_encoder(self.opt.encoder_fname)
808 | # self.decoder = self.create_decoder(self.opt.decoder_fname)
809 | self.logger.info('=' * 80)
810 | cur_time = time.time()
811 | valid_dda = self.parse(self.valid_sampler, nsample=self.opt.nsample)
812 | print 'time', time.time() - cur_time
813 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
814 | self.logger.info('nsample {:d} | valid dda {:5.4f}'.format(self.opt.nsample, valid_dda))
815 |
816 | max_length = 10
817 | cur_time = time.time()
818 | test_dda = self.parse(self.test_sampler, nsample=self.opt.nsample, max_length=max_length, get_rule_acc=True)
819 | print 'time', time.time() - cur_time
820 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
821 | self.logger.info('nsample {:d} | test dda-{:d} {:5.4f}'.format(self.opt.nsample, max_length, test_dda))
822 | self.logger.info('=' * 80)
823 |
824 | max_length = 9999
825 | cur_time = time.time()
826 | test_dda = self.parse(self.test_sampler, nsample=self.opt.nsample, max_length=max_length)
827 | print 'time', time.time() - cur_time
828 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
829 | self.logger.info('nsample {:d} | test dda-{:d} {:5.4f}'.format(self.opt.nsample, max_length, test_dda))
830 | self.logger.info('=' * 80)
831 |
832 | def test_ud(self):
833 | """
834 | test the trained model (15/40 for ud)
835 | Reranking by the decoder made no difference, so we only used the encoder for parsing
836 | """
837 | self._build_ud_dataset()
838 | self.encoder = self.create_encoder(self.opt.encoder_fname)
839 | # self.decoder = self.create_decoder(self.opt.decoder_fname)
840 | self.logger.info('=' * 80)
841 | cur_time = time.time()
842 | valid_dda = self.parse(self.valid_sampler, nsample=self.opt.nsample)
843 | print 'time', time.time() - cur_time
844 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
845 | self.logger.info('nsample {:d} | valid dda {:5.4f}'.format(self.opt.nsample, valid_dda))
846 |
847 | max_length = 15
848 | cur_time = time.time()
849 | test_dda = self.parse(self.test_sampler, nsample=self.opt.nsample, max_length=max_length)
850 | print 'time', time.time() - cur_time
851 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
852 | self.logger.info('nsample {:d} | test dda-{:d} {:5.4f}'.format(self.opt.nsample, max_length, test_dda))
853 |
854 | max_length = 40
855 | cur_time = time.time()
856 | test_dda = self.parse(self.test_sampler, nsample=self.opt.nsample, max_length=max_length)
857 | print 'time', time.time() - cur_time
858 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
859 | self.logger.info('nsample {:d} | test dda-{:d} {:5.4f}'.format(self.opt.nsample, max_length, test_dda))
860 |
861 | max_length = 999
862 | cur_time = time.time()
863 | test_dda = self.parse(self.test_sampler, nsample=self.opt.nsample, max_length=max_length)
864 | print 'time', time.time() - cur_time
865 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
866 | self.logger.info('nsample {:d} | test dda-{:d} {:5.4f}'.format(self.opt.nsample, max_length, test_dda))
867 | self.logger.info('=' * 80)
868 |
869 | def parse(self, data_sampler, max_length=10, nsample=1, output_tree=False, get_rule_acc=False):
870 | """
871 | parse and return dda
872 | Arguments:
873 | data_sampler (generator): valid/test
874 | nsample(int): parse by encoder if 1; reranking by decoder if >1
875 | output_tree(bool): whether to output parse trees
876 | get_rule_acc(bool): whether to output the parsing accuracy w.r.t. linguistic rules
877 | """
878 | self.encoder.eval()
879 | correct = 0.
880 | total = 0.
881 | total_word = 0.
882 | if get_rule_acc:
883 | correct_rule = [0.] * 13
884 | total_rule = [0.] * 13
885 | if nsample == 1:
886 | for sample in data_sampler:
887 | if sample.word[1].item() > max_length:
888 | continue
889 | total_word += sample.word[1].item()
890 | _, predicted_act = self.encoder(sample.word[0], sample.pos_tag)
891 | cur_correct, cur_total = compute_dda(oracle_heads=[sample.dep_head.data[0, i] for i in range(sample.dep_head.size(1))],
892 | act_seq=predicted_act)
893 | correct += cur_correct
894 | total += cur_total
895 | if get_rule_acc:
896 | c, t = compute_rule_acc(oracle_arcs=[(sample.dep_head.data[0, i], i+1) for i in range(sample.dep_head.size(1))],
897 | act_seq=predicted_act,
898 | tags=[self.dataset.POS_TAG.vocab.itos[sample.pos_tag[0,i].item()] for i in range(sample.word[1].item())])
899 | total_rule = [sum(x) for x in zip(total_rule, t)]
900 | correct_rule = [sum(x) for x in zip(correct_rule, c)]
901 | if get_rule_acc:
902 | self.logger.info('rule acc of coarse rules:')
903 | for i in range(13):
904 | if total_rule[i] < 1:
905 | self.logger.info('\t -')
906 | else:
907 | self.logger.info('\t{:.3f} {:.1f}|{:.1f} '.format(correct_rule[i] / total_rule[i], correct_rule[i], total_rule[i]))
908 |
909 | else:
910 | self.decoder.eval()
911 | for sample in data_sampler:
912 | if sample.word[1].item() > max_length:
913 | continue
914 | predicted_act_list = []
915 | _, predicted_act = self.encoder(sample.word[0], sample.pos_tag)
916 | predicted_act_list.append(predicted_act)
917 | for _ in range(nsample - 1):
918 | _, predicted_act = self.encoder(sample.word[0], sample.pos_tag, sample=True)
919 | predicted_act_list.append(predicted_act)
920 | assert len(predicted_act_list) == nsample
921 | dec_loss_list = []
922 | for act in predicted_act_list:
923 | loss, _, _ = self.decoder(words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=act)
924 | dec_loss_list.append(loss.item())
925 | best_act = predicted_act_list[torch.argmin(torch.FloatTensor(dec_loss_list)).item()]
926 | cur_correct, cur_total = compute_dda(oracle_heads=[sample.dep_head.data[0, i] for i in range(sample.dep_head.size(1))],
927 | act_seq=best_act)
928 | correct += cur_correct
929 | total += cur_total
930 | self.decoder.train()
931 | self.encoder.train()
932 | # print 'total word', total_word
933 | # print correct, total
934 | return correct / total
935 |
936 | def compute_joint_loss(self, data_sampler):
937 | """
938 | compute joint loss by the decoder
939 | Arguments:
940 | data_sampler (generator): valid/test
941 | Returns:
942 | average total loss/ act loss/ token loss
943 | """
944 | self.decoder.eval()
945 | total_loss_act = 0.
946 | total_loss_token = 0.
947 | total_loss = 0.
948 | i = 0
949 | for sample in data_sampler:
950 | i += 1
951 | loss, loss_act, loss_token = self.decoder(words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=sample.action)
952 | if loss_act is not None:
953 | total_loss_act += loss_act.data.item()
954 | total_loss += loss.data.item()
955 | total_loss_token += loss_token.data.item()
956 | self.decoder.train()
957 | return total_loss / i, total_loss_act / i, total_loss_token / i
958 |
959 | def compute_lm_ppl(self, data_sampler):
960 | """
961 | valid/test language model
962 | Arguments:
963 | data_sampler (generator): valid/test
964 | Returns:
965 | ppl(float): perplexity
966 | """
967 | self.lm.eval()
968 | total_loss = 0.
969 | total_token = 0
970 | for sample in data_sampler:
971 | if torch.sum(sample.word[1]).item() <= (self.opt.train_max_length + 1) * self.opt.batchsize:
972 | loss = self.lm(sample.word, sample.pos_tag)
973 | total_loss += loss.data.item()
974 | total_token += torch.sum(sample.word[1]).item()
975 | ppl = math.exp(total_loss / total_token)
976 | self.lm.train()
977 | return ppl
978 |
979 | def create_logger(self):
980 | # initialize logger
981 | # create logger
982 | logger_name = "mylog"
983 | logger = logging.getLogger(logger_name)
984 | logger.setLevel(logging.INFO)
985 | # file handler
986 | if not os.path.exists(self.opt.result_dir):
987 | os.makedirs(self.opt.result_dir)
988 | fh = logging.FileHandler(os.path.join(self.opt.result_dir, self.opt.log_name))
989 | fh.setLevel(logging.INFO)
990 | logger.addHandler(fh)
991 | # stream handler
992 | console = logging.StreamHandler()
993 | console.setLevel(logging.INFO)
994 | logger.addHandler(console)
995 |
996 | l = self.opt.__dict__.items()
997 | l.sort(key=lambda x: x[0])
998 | for opt, value in l:
999 | logger.info(str(opt) + '\t' + str(value))
1000 |
1001 | if self.opt.gpu_id >= 0:
1002 | torch.cuda.set_device(self.opt.gpu_id) # PyTorch GPU
1003 | return logger
1004 |
1005 | def create_encoder(self, fname=None):
1006 | """
1007 | create encoder
1008 | """
1009 | encoder = Encoder(nlayers=self.opt.nlayers,
1010 | pos_dim=self.opt.pos_dim,
1011 | action_dim=self.opt.action_dim,
1012 | lstm_dim=self.opt.enc_lstm_dim,
1013 | dropout=self.opt.enc_dropout,
1014 | word_dim=self.opt.word_dim,
1015 | pretrain_word_dim=self.opt.pretrain_word_dim,
1016 | pretrain_word_vectors=self.dataset.WORD.vocab.vectors,
1017 | w2i=self.dataset.WORD.vocab.stoi,
1018 | i2w=self.dataset.WORD.vocab.itos,
1019 | w2i_pos=self.dataset.POS_TAG.vocab.stoi,
1020 | i2w_pos=self.dataset.POS_TAG.vocab.itos,
1021 | para_init=self.initializers[self.opt.initializer],
1022 | init_name=self.opt.initializer,
1023 | gpu_id=self.opt.gpu_id,
1024 | seed=self.opt.seed)
1025 | if fname is not None:
1026 | encoder.load_state_dict(torch.load(fname, map_location=lambda storage, loc: storage))
1027 | return encoder
1028 |
1029 | def create_decoder(self, fname=None):
1030 | """
1031 | create decoder
1032 | """
1033 | decoder = Decoder(nlayers=self.opt.nlayers,
1034 | pos_dim=self.opt.pos_dim,
1035 | action_dim=self.opt.action_dim,
1036 | lstm_dim=self.opt.dec_lstm_dim,
1037 | dropout=self.opt.dec_dropout,
1038 | word_dim=self.opt.word_dim,
1039 | pretrain_word_dim=self.opt.pretrain_word_dim,
1040 | pretrain_word_vectors=self.dataset.WORD.vocab.vectors,
1041 | w2i=self.dataset.WORD.vocab.stoi,
1042 | i2w=self.dataset.WORD.vocab.itos,
1043 | w2i_pos=self.dataset.POS_TAG.vocab.stoi,
1044 | i2w_pos=self.dataset.POS_TAG.vocab.itos,
1045 | cluster=True,
1046 | wi2ci=self.dataset.wordi2ci,
1047 | wi2i=self.dataset.wordi2i,
1048 | ci2wi=self.dataset.ci2wordi,
1049 | para_init=self.initializers[self.opt.initializer],
1050 | init_name=self.opt.initializer,
1051 | gpu_id=self.opt.gpu_id,
1052 | seed=self.opt.seed)
1053 | if fname is not None:
1054 | decoder.load_state_dict(torch.load(fname, map_location=lambda storage, loc: storage))
1055 | return decoder
1056 |
1057 | def create_lm(self, fname=None):
1058 | """
1059 | create language model
1060 | """
1061 | lm = LanguageModel(word_dim=self.opt.lm_word_dim,
1062 | pos_dim=self.opt.lm_pos_dim,
1063 | lstm_dim=self.opt.lm_lstm_dim,
1064 | nlayers=self.opt.lm_nlayers,
1065 | dropout=self.opt.lm_dropout,
1066 | batchsize=self.opt.batchsize,
1067 | tie_weights=self.opt.tie_weights,
1068 | pretrain=self.opt.lm_pretrain,
1069 | pretrain_word_vectors=self.dataset.WORD.vocab.vectors,
1070 | w2i=self.dataset.WORD.vocab.stoi,
1071 | i2w=self.dataset.WORD.vocab.itos,
1072 | w2i_pos=self.dataset.POS_TAG.vocab.stoi,
1073 | i2w_pos=self.dataset.POS_TAG.vocab.itos,
1074 | para_init=self.initializers[self.opt.initializer],
1075 | init_name=self.opt.initializer,
1076 | gpu_id=self.opt.gpu_id)
1077 | if fname is not None:
1078 | lm.load_state_dict(torch.load(fname, map_location=lambda storage, loc: storage))
1079 | return lm
1080 |
1081 | def create_pr(self):
1082 | """
1083 | create posterior regularizer
1084 | """
1085 | pr = PR(
1086 | epsilon=self.opt.epsilon,
1087 | pr_fname=self.opt.pr_fname,
1088 | mc_samples=self.opt.mc_samples,
1089 | para_init=self.initializers[self.opt.pr_initializer],
1090 | gpu_id=self.opt.gpu_id)
1091 | return pr
1092 |
1093 | def create_baseline_linear(self):
1094 | """
1095 | """
1096 | baseline_linear = Baseline_linear(gpu_id=self.opt.gpu_id)
1097 | return baseline_linear
1098 |
1099 | def nvil_pr_pretrain_ud(self):
1100 | """
1101 | ud pretraining
1102 | """
1103 | self._build_ud_dataset()
1104 | self.encoder = self.create_encoder()
1105 | self.decoder = self.create_decoder()
1106 | self.pr = self.create_pr()
1107 | enc_param = [param for param in self.encoder.parameters() if param.requires_grad]
1108 | dec_param = [param for param in self.decoder.parameters() if param.requires_grad]
1109 | pr_param = [param for param in self.pr.parameters() if param.requires_grad]
1110 | self.enc_optim = self.optimizers[self.opt.optimizer](
1111 | [{'params': enc_param}])
1112 | self.dec_optim = self.optimizers[self.opt.optimizer](
1113 | [{'params': dec_param}])
1114 | self.pr_optim = self.optimizers[self.opt.pr_optimizer](
1115 | [{'params': pr_param}])
1116 | self.encoder.train()
1117 | self.decoder.train()
1118 | self.pr.train()
1119 | self.enc_optim.zero_grad()
1120 | self.dec_optim.zero_grad()
1121 | self.pr_optim.zero_grad()
1122 |
1123 | self.logger.info('=' * 80)
1124 | max_length = 15
1125 | valid_dda = self.parse(self.valid_sampler, max_length=max_length)
1126 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
1127 | self.logger.info('epoch {:3d} | valid dda {:5.2f}'.format(0, valid_dda))
1128 | self.logger.info('=' * 80)
1129 |
1130 | self.len_train = len(self.dataset.train)
1131 | self.len_real_train = 0.
1132 | for i in range(1, self.len_train + 1):
1133 | sample = self.train_sampler.next()
1134 | if sample.word[1].item() <= self.opt.train_max_length:
1135 | self.len_real_train += 1
1136 | self.len_real_train = int(math.ceil(self.len_real_train / self.opt.nvil_batchsize))
1137 | total_enc_loss = 0.
1138 | total_dec_loss = 0.
1139 | total_score_mean = 0.
1140 | total_score_var = 0.
1141 | best_valid_dda = 0.
1142 | for epoch in range(1, self.opt.epochs + 1):
1143 | cur_time = time.time()
1144 | cur_batch = 0
1145 | i = 0
1146 | for _ in range(1, self.len_real_train + 1):
1147 | batch = []
1148 | while len(batch) < self.opt.nvil_batchsize:
1149 | sample = self.train_sampler.next()
1150 | if sample.word[1].item() <= self.opt.train_max_length:
1151 | batch.append(sample)
1152 | i += 1
1153 | for sample in batch:
1154 | enc_loss_act_list = []
1155 | dec_loss_list = []
1156 | for mc in range(self.opt.mc_samples):
1157 | enc_loss_act, predicted_act, feature = self.encoder.parse_pr(
1158 | sample.word[0], sample.pos_tag, self.pr.rule2i, sample=True)
1159 | self.pr.phi.data[mc] = feature
1160 | enc_loss_act_list.append(enc_loss_act)
1161 | dec_loss, _, _ = self.decoder(
1162 | words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=predicted_act)
1163 | dec_loss_list.append(dec_loss)
1164 | if sample.word[0].size(1) == 1:
1165 | continue # skip backward
1166 |
1167 | # update posterior regularizer
1168 | pr_factor = torch.ones(self.opt.mc_samples)
1169 | if self.opt.gpu_id > -1:
1170 | pr_factor = pr_factor.cuda()
1171 | if torch.sum(self.pr.phi).item() < 0:
1172 | pr_loss, pr_factor = self.pr()
1173 | pr_loss.backward()
1174 | self.pr.reset_phi()
1175 |
1176 | # backward w.r.t. encoder and decoder
1177 | for mc in range(self.opt.mc_samples):
1178 | enc_loss_act = enc_loss_act_list[mc]
1179 | dec_loss = dec_loss_list[mc]
1180 | total_dec_loss += dec_loss.item()
1181 | total_enc_loss += enc_loss_act.item()
1182 |
1183 | dec_loss = dec_loss * pr_factor[mc].item() / self.opt.mc_samples
1184 | dec_loss.backward()
1185 |
1186 | enc_loss_act = enc_loss_act * pr_factor[mc].item() / self.opt.mc_samples
1187 | enc_loss_act.backward()
1188 |
1189 | clip_grad_norm_(enc_param + dec_param, self.opt.clip)
1190 | self.enc_optim.step()
1191 | self.dec_optim.step()
1192 | self.pr_optim.step()
1193 | self.pr.project()
1194 | self.enc_optim.zero_grad()
1195 | self.dec_optim.zero_grad()
1196 | self.pr_optim.zero_grad()
1197 |
1198 | if i % self.opt.print_every == 0 or i == self.len_real_train:
1199 | elapsed_time = time.time() - cur_time
1200 | cur_time = time.time()
1201 | elapsed_batch = i - cur_batch
1202 | cur_batch = i
1203 | self.logger.info('epoch {:3d} | {:5d}/{:5d} | avg enc loss {:5.2f} | avg dec loss {:5.2f} | time {:5.2f}s'. \
1204 | format(epoch, i, self.len_real_train, total_enc_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples,
1205 | total_dec_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples, elapsed_time))
1206 | total_enc_loss = 0.
1207 | total_dec_loss = 0.
1208 |
1209 | if i % self.opt.save_every == 0 or i == self.len_real_train:
1210 | # validate
1211 | self.logger.info('=' * 80)
1212 | max_length = 15
1213 | valid_dda = self.parse(self.valid_sampler, max_length=max_length)
1214 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
1215 | self.logger.info('epoch {:3d} | valid dda {:5.4f}'.format(epoch, valid_dda))
1216 |
1217 | if valid_dda > best_valid_dda:
1218 | # save encoder model
1219 | prev_enc_fname = os.path.join(self.opt.result_dir,
1220 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
1221 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
1222 | if os.path.exists(prev_enc_fname):
1223 | os.remove(prev_enc_fname)
1224 | cur_enc_fname = os.path.join(self.opt.result_dir,
1225 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
1226 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
1227 | self.encoder.save(cur_enc_fname)
1228 |
1229 | # save decoder model
1230 | prev_dec_fname = os.path.join(self.opt.result_dir,
1231 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
1232 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
1233 | if os.path.exists(prev_dec_fname):
1234 | os.remove(prev_dec_fname)
1235 | cur_dec_fname = os.path.join(self.opt.result_dir,
1236 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
1237 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
1238 | self.decoder.save(cur_dec_fname)
1239 |
1240 | # test short/long
1241 | best_valid_dda = valid_dda
1242 | max_length = 15
1243 | test_dda = self.parse(self.test_sampler, max_length=max_length)
1244 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
1245 | self.logger.info('epoch {:3d} | test dda-{:2d} {:5.4f}'.format(epoch, max_length, test_dda))
1246 | self.logger.info('=' * 80)
1247 |
1248 | def nvil_pr_ft_ud(self):
1249 | """
1250 | ud finetuning
1251 | """
1252 | self._build_ud_dataset()
1253 | if os.path.isfile(self.opt.encoder_fname) and os.path.isfile(self.opt.decoder_fname):
1254 | self.encoder = self.create_encoder(self.opt.encoder_fname)
1255 | self.decoder = self.create_decoder(self.opt.decoder_fname)
1256 | else:
1257 | self.encoder = self.create_encoder()
1258 | self.decoder = self.create_decoder()
1259 | self.pr = self.create_pr()
1260 | enc_param = [param for param in self.encoder.parameters() if param.requires_grad]
1261 | dec_param = [param for param in self.decoder.parameters() if param.requires_grad]
1262 | pr_param = [param for param in self.pr.parameters() if param.requires_grad]
1263 | self.enc_optim = self.optimizers[self.opt.optimizer](
1264 | [{'params': enc_param}])
1265 | self.dec_optim = self.optimizers[self.opt.optimizer](
1266 | [{'params': dec_param}])
1267 | self.pr_optim = self.optimizers[self.opt.pr_optimizer](
1268 | [{'params': pr_param}])
1269 | self.encoder.train()
1270 | self.decoder.train()
1271 | self.pr.train()
1272 | self.enc_optim.zero_grad()
1273 | self.dec_optim.zero_grad()
1274 | self.pr_optim.zero_grad()
1275 |
1276 | self.logger.info('=' * 80)
1277 | max_length = 15
1278 | valid_dda = self.parse(self.valid_sampler, max_length=max_length)
1279 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
1280 | self.logger.info('initial | valid dda {:5.4f}'.format(valid_dda))
1281 | self.logger.info('=' * 80)
1282 |
1283 | self.len_train = len(self.dataset.train)
1284 | self.len_real_train = 0.
1285 | for i in range(1, self.len_train + 1):
1286 | sample = self.train_sampler.next()
1287 | if sample.word[1].item() <= self.opt.train_max_length:
1288 | self.len_real_train += 1
1289 | self.len_real_train = int(math.ceil(self.len_real_train / self.opt.nvil_batchsize))
1290 | total_enc_loss = 0.
1291 | total_dec_loss = 0.
1292 | best_valid_dda = 0.
1293 | for epoch in range(1, self.opt.epochs + 1):
1294 | cur_time = time.time()
1295 | cur_batch = 0
1296 | i = 0
1297 | for _ in range(1, self.len_real_train + 1):
1298 | batch = []
1299 | while len(batch) < self.opt.nvil_batchsize:
1300 | sample = self.train_sampler.next()
1301 | if sample.word[1].item() <= self.opt.train_max_length:
1302 | batch.append(sample)
1303 | i += 1
1304 | for sample in batch:
1305 | enc_loss_act_list = []
1306 | dec_loss_list = []
1307 | for mc in range(self.opt.mc_samples):
1308 | enc_loss_act, predicted_act, feature = self.encoder.parse_pr(
1309 | sample.word[0], sample.pos_tag, self.pr.rule2i, sample=True)
1310 | self.pr.phi.data[mc] = feature
1311 | enc_loss_act_list.append(enc_loss_act)
1312 | dec_loss, _, _ = self.decoder(
1313 | words=sample.word[0], pos_tags=sample.pos_tag, oracle_actions=predicted_act)
1314 | dec_loss_list.append(dec_loss)
1315 |
1316 | if sample.word[0].size(1) == 1:
1317 | continue # skip backward
1318 |
1319 | # update posterior regularizer
1320 | pr_factor = torch.ones(self.opt.mc_samples)
1321 | if self.opt.gpu_id > -1:
1322 | pr_factor = pr_factor.cuda()
1323 | if torch.sum(self.pr.phi).item() < 0:
1324 | pr_loss, pr_factor = self.pr()
1325 | pr_loss.backward()
1326 |
1327 | phi = torch.sum(self.pr.phi, dim=1)
1328 | normalized_phi = (phi - torch.mean(phi))
1329 | phi = phi.cpu().numpy()
1330 |
1331 | self.pr.reset_phi()
1332 | score_list = None
1333 |
1334 | # backward w.r.t. decoder
1335 | for mc in range(self.opt.mc_samples):
1336 | enc_loss_act = enc_loss_act_list[mc]
1337 | # baseline = baseline_list[mc]
1338 | dec_loss = dec_loss_list[mc]
1339 | total_dec_loss += dec_loss.item()
1340 | total_enc_loss += enc_loss_act.item()
1341 |
1342 | # score = - dec_loss + enc_loss_act + baseline
1343 | score = - dec_loss + enc_loss_act
1344 | score.unsqueeze_(0)
1345 | if score_list is None:
1346 | score_list = score
1347 | else:
1348 | score_list = torch.cat((score_list, score))
1349 |
1350 | # normalize scores
1351 | score_mean = torch.mean(score_list)
1352 | score_std = torch.std(score_list)
1353 | normalized_score_list = (score_list - score_mean) / score_std
1354 |
1355 | # backward w.r.t. encoder
1356 | for mc in range(self.opt.mc_samples):
1357 | enc_loss_act = enc_loss_act_list[mc]
1358 | score = normalized_score_list[mc]
1359 | if normalized_phi[mc].item() < 0:
1360 | score = abs(score.item())
1361 | else:
1362 | score = - abs(score.item())
1363 | enc_loss_act = enc_loss_act * score * pr_factor[mc].item() / self.opt.mc_samples # ft4
1364 | enc_loss_act.backward()
1365 |
1366 | dec_loss = dec_loss_list[mc]
1367 | dec_loss = dec_loss * score * pr_factor[mc].item() / self.opt.mc_samples
1368 | dec_loss.backward()
1369 |
1370 | clip_grad_norm_(enc_param + dec_param + pr_param, self.opt.clip)
1371 | self.enc_optim.step()
1372 | self.dec_optim.step()
1373 | self.pr_optim.step()
1374 | self.pr.project()
1375 | self.enc_optim.zero_grad()
1376 | self.dec_optim.zero_grad()
1377 | self.pr_optim.zero_grad()
1378 |
1379 | if i % self.opt.print_every == 0 or i == self.len_real_train:
1380 | elapsed_time = time.time() - cur_time
1381 | cur_time = time.time()
1382 | elapsed_batch = i - cur_batch
1383 | cur_batch = i
1384 | self.logger.info('epoch {:3d} | {:5d}/{:5d} | avg enc loss {:5.2f} | avg dec loss {:5.2f} | time {:5.2f}s'. \
1385 | format(epoch, i, self.len_real_train, total_enc_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples,
1386 | total_dec_loss / elapsed_batch / self.opt.nvil_batchsize / self.opt.mc_samples, elapsed_time))
1387 | total_enc_loss = 0.
1388 | total_dec_loss = 0.
1389 |
1390 | if i % self.opt.save_every == 0 or i == self.len_real_train:
1391 | # validate
1392 | self.logger.info('=' * 80)
1393 | max_length = 15
1394 | valid_dda = self.parse(self.valid_sampler, max_length=max_length)
1395 | self.valid_sampler = self.valid_iter.__iter__() # renew the iterator
1396 | self.logger.info('epoch {:3d} | valid dda {:5.4f}'.format(epoch, valid_dda))
1397 |
1398 | if valid_dda > best_valid_dda:
1399 | # save encoder model
1400 | prev_enc_fname = os.path.join(self.opt.result_dir,
1401 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
1402 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
1403 | if os.path.exists(prev_enc_fname):
1404 | os.remove(prev_enc_fname)
1405 | cur_enc_fname = os.path.join(self.opt.result_dir,
1406 | '{}_enc_valid-dda-{:.4f}_len-{}.pt'
1407 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
1408 | self.encoder.save(cur_enc_fname)
1409 |
1410 | # save decoder model
1411 | prev_dec_fname = os.path.join(self.opt.result_dir,
1412 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
1413 | .format(self.opt.log_name, best_valid_dda, self.opt.train_max_length))
1414 | if os.path.exists(prev_dec_fname):
1415 | os.remove(prev_dec_fname)
1416 | cur_dec_fname = os.path.join(self.opt.result_dir,
1417 | '{}_dec_valid-dda-{:.4f}_len-{}.pt'
1418 | .format(self.opt.log_name, valid_dda, self.opt.train_max_length))
1419 | self.decoder.save(cur_dec_fname)
1420 |
1421 | # test
1422 | best_valid_dda = valid_dda
1423 | max_length = 15
1424 | test_dda = self.parse(self.test_sampler, max_length=max_length)
1425 | self.test_sampler = self.test_iter.__iter__() # renew the iterator
1426 | self.logger.info('epoch {:3d} | test dda-{:d} {:5.4f}'.format(epoch, max_length, test_dda))
1427 |
1428 | self.logger.info('=' * 80)
1429 |
--------------------------------------------------------------------------------
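A note on the score handling in the fine-tuning loops above: each Monte-Carlo sample's learning signal (- dec_loss + enc_loss_act) is standardized across the samples, and its sign is then taken from the centred posterior-regularization feature, so samples whose arcs match more rules are reinforced. The sketch below replays only that arithmetic on hypothetical loss and feature values; it is not part of the repository.

    import torch

    mc_samples = 4
    # hypothetical per-sample signals: - dec_loss + enc_loss_act
    scores = torch.tensor([-12.3, -9.8, -15.1, -11.0])
    # hypothetical summed PR features per sample (more negative = more arcs matching rules)
    phi = torch.tensor([-2.0, 0.0, -1.0, 0.0])

    normalized_scores = (scores - scores.mean()) / scores.std()
    normalized_phi = phi - phi.mean()

    weights = []
    for m in range(mc_samples):
        magnitude = abs(normalized_scores[m].item())
        # keep the magnitude of the standardized score, take the sign from the centred feature
        weights.append(magnitude if normalized_phi[m].item() < 0 else -magnitude)
    print(weights)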
/vi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/__init__.py
--------------------------------------------------------------------------------
/vi/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/__init__.pyc
--------------------------------------------------------------------------------
/vi/modules/Decoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | from vi_syntax.vi.myio.Utils import SHIFT, REDUCE_L, REDUCE_R
6 |
7 | class Decoder(nn.Module):
8 |
9 | def __init__(self,
10 | nlayers,
11 | pos_dim,
12 | action_dim,
13 | lstm_dim,
14 | dropout,
15 | word_dim,
16 | pretrain_word_dim,
17 | pretrain_word_vectors=None,
18 | w2i=None,
19 | i2w=None,
20 | w2i_pos=None,
21 | i2w_pos=None,
22 | # H1=False,
23 | # H1_decay=5.,
24 | # H2=False,
25 | # H2_decay=5.,
26 | cluster=True,
27 | wi2ci=None,
28 | wi2i=None,
29 | ci2wi=None,
30 | para_init=None,
31 | init_name=None,
32 | gpu_id=-1,
33 | seed=-1):
34 |
35 | super(Decoder, self).__init__()
36 |
37 | if seed > 0:
38 | torch.manual_seed(seed)
39 | torch.cuda.manual_seed(seed)
40 | self.i2w = i2w
41 | self.w2i = w2i
42 | self.i2w_pos = i2w_pos
43 | self.w2i_pos = w2i_pos
44 | self.pos_dim = pos_dim
45 | self.word_dim = word_dim
46 | self.pretrain_word_dim = pretrain_word_dim
47 | self.nlayers = nlayers
48 | self.action_dim = action_dim
49 | self.len_word = len(self.i2w)
50 | self.len_pos = len(self.i2w_pos)
51 | self.lstm_dim = lstm_dim
52 | self.len_act = 3
53 | self.parser_state_dim = 3 * self.lstm_dim if self.action_dim > 0 else 2 * self.lstm_dim
54 | self.dropout = dropout
55 | self.cluster = cluster
56 | self.para_init = para_init
57 | self.init_name = init_name
58 | self.gpu_id = gpu_id
59 |
60 | self.SHIFT = SHIFT # GEN for decoder
61 | self.REDUCE_L = REDUCE_L
62 | self.REDUCE_R = REDUCE_R
63 |
64 | # nn blocks
65 | self.input_proj = nn.Linear(self.word_dim + self.pos_dim, self.lstm_dim, False)
66 | self.comp = nn.Linear(2 * self.lstm_dim, self.lstm_dim)
67 | self.mlp = nn.Linear(self.parser_state_dim, self.lstm_dim)
68 | self.act_output = nn.Linear(self.lstm_dim, self.len_act)
69 |
70 | if self.dropout > 0:
71 | self.dp = nn.Dropout(p=self.dropout)
72 |
73 | self.buffer_rnn = nn.LSTM(input_size=self.lstm_dim, hidden_size=self.lstm_dim, num_layers=nlayers)
74 | self.stack_rnn = nn.LSTM(input_size=self.lstm_dim, hidden_size=self.lstm_dim, num_layers=nlayers)
75 | self.stack_initial_state = self.get_initial_state(num_layers=self.nlayers, direction=1,
76 | hidden_size=self.lstm_dim, batch_size=1)
77 |
78 | if self.word_dim > 0:
79 | self.word_proj = nn.Linear(self.pretrain_word_dim, self.word_dim)
80 | self.word_embedding = nn.Embedding(self.len_word, self.pretrain_word_dim)
81 | if not self.cluster:
82 | self.word_output = nn.Linear(self.lstm_dim, self.len_word)
83 | else:
84 | self.wordi2i = wi2i
85 | self.wordi2ci = wi2ci
86 | self.ci2wordi = ci2wi
87 | self.len_cluster = len(self.ci2wordi)
88 | self.cluster_output = nn.Linear(self.lstm_dim, self.len_cluster)
89 | self.word_output_l = []
90 | for c in self.ci2wordi:
91 | if len(self.ci2wordi[c]) < 1:
92 | self.word_output_l.append(None)
93 | else:
94 | self.word_output_l.append(nn.Linear(self.lstm_dim, len(self.ci2wordi[c])))
95 | if self.gpu_id > -1:
96 | self.word_output_l[-1].cuda()
97 |
98 | if self.pos_dim > 0:
99 | self.pos_output = nn.Linear(self.lstm_dim, self.len_pos)
100 | self.pos_embedding = nn.Embedding(self.len_pos, pos_dim)
101 | if self.action_dim > 0:
102 | self.act_rnn = nn.LSTM(input_size=self.lstm_dim, hidden_size=self.lstm_dim, num_layers=nlayers)
103 | self.act_proj = nn.Linear(action_dim, self.lstm_dim)
104 | self.act_embedding = nn.Embedding(self.len_act, action_dim)
105 | self.act_initial_state = self.get_initial_state(num_layers=self.nlayers, direction=1,
106 | hidden_size=self.lstm_dim, batch_size=1)
107 |
108 | self.empty_buffer_emb = torch.randn(1, self.lstm_dim)
109 | self.empty_buffer_emb = nn.Parameter(self.empty_buffer_emb, requires_grad=False)
110 | self.empty_stack_emb = torch.randn(1, self.lstm_dim)
111 | self.empty_stack_emb = nn.Parameter(self.empty_stack_emb, requires_grad=False)
112 | if self.action_dim > 0:
113 | self.empty_act_emb = torch.randn(1, self.lstm_dim)
114 | self.empty_act_emb = nn.Parameter(self.empty_act_emb, requires_grad=False)
115 |
116 | if self.para_init is not None:
117 | if self.init_name == 'glorot':
118 | for name, para in self.named_parameters():
119 | # print 'initializing', name
120 | if len(para.size()) < 2:
121 | para.data.zero_()
122 | else:
123 | self.para_init(para)
124 | else:
125 | for name, para in self.named_parameters():
126 | # print 'initializing', name
127 | self.para_init(para)
128 |
129 | if self.word_dim > 0 and self.pretrain_word_dim > 0:
130 | self.load_embeddings(pretrain_word_vectors)
131 |
132 | if self.gpu_id > -1:
133 | self.cuda()
134 |
135 | def load_embeddings(self, pretrain_word_vectors):
136 | self.word_embedding.weight.data.copy_(pretrain_word_vectors)
137 |
138 | def save(self, filename):
139 | torch.save(self.state_dict(), filename)
140 |
141 | def get_initial_state(self, num_layers, direction, hidden_size, batch_size):
142 | '''
143 | initial states for LSTMs
144 | '''
145 | if self.gpu_id > -1:
146 | h0_tensor = torch.zeros(num_layers * direction, batch_size, hidden_size).cuda()
147 | c0_tensor = torch.zeros(num_layers * direction, batch_size, hidden_size).cuda()
148 | else:
149 | h0_tensor = torch.zeros(num_layers * direction, batch_size, hidden_size)
150 | c0_tensor = torch.zeros(num_layers * direction, batch_size, hidden_size)
151 | return (Variable(h0_tensor), Variable(c0_tensor))
152 |
153 | def encode_sentence(self, words, pos_tags):
154 | '''
155 | To score a sentence or parse tree, all tokens can be encoded in a single forward pass.
156 | This is not used in generation mode.
157 | Arguments:
158 | words(Variable):
159 | pos_tags(Variable):
160 | '''
161 | buffer_initial_state = self.get_initial_state(self.nlayers, 1, self.lstm_dim, 1)
162 | input_sent = None # input embeddings
163 | len_tokens = words.size(1)
164 |
165 | if self.word_dim > 0:
166 | input_word = self.word_proj(self.word_embedding(words).view(-1, self.pretrain_word_dim)) \
167 | .view(len_tokens, 1, self.word_dim) # length x 1 x word_dim
168 | input_word = F.relu(input_word)
169 | input_sent = input_word
170 |
171 | if self.pos_dim > 0:
172 | input_pos = self.pos_embedding(pos_tags).view(len_tokens, 1, self.pos_dim) # length x 1 x pos_dim
173 | if input_sent is not None:
174 | input_sent = torch.cat((input_sent, input_pos), 2) # length x 1 x dim
175 | else:
176 | input_sent = input_pos
177 |
178 | input_sent = F.relu(self.input_proj(input_sent.view(len_tokens, self.word_dim + self.pos_dim))) \
179 | .view(len_tokens, 1, self.lstm_dim) # len_tokens x 1 x self.lstm_dim
180 |
181 | buffer_sent, _ = self.buffer_rnn(input_sent, buffer_initial_state) # len_tokens x 1 x self.lstm_dim
182 |
183 | input_sent = [input_sent[idx] for idx in range(len_tokens)] # 1 x self.lstm_dim
184 | buffer_sent = [buffer_sent[idx] for idx in range(len_tokens)] # 1 x self.lstm_dim
185 |
186 | return input_sent, buffer_sent
187 |
188 | def forward(self, words, pos_tags, oracle_actions):
189 | '''
190 | compute loss of the given actions
191 | Arguments:
192 | words(Variable):
193 | pos_tags(Variable):
194 | oracle_actions(Variable):
195 | '''
196 | if isinstance(oracle_actions, list):
197 | if self.gpu_id > -1:
198 | oracle_actions = Variable(torch.LongTensor(oracle_actions).unsqueeze(0).cuda())
199 | else:
200 | oracle_actions = Variable(torch.LongTensor(oracle_actions).unsqueeze(0))
201 | oracle_actions = [oracle_actions[0, i] for i in range(oracle_actions.size(1))] # copy list (Variable)
202 |
203 | tokens = [(words.data[0,i], pos_tags.data[0,i]) for i in range(words.size(1))]
204 |
205 | stack = []
206 | if self.action_dim > 0:
207 | act_state = self.act_initial_state # state of act rnn
208 | act_summary = self.empty_act_emb # output of act rnn
209 |
210 | input_sent, buffer_sent = self.encode_sentence(words, pos_tags) # token_embeddings and buffer in original order
211 | loss_act = None
212 | loss_token = None
213 | input_sent.reverse() # in reverse order
214 | buffer_sent.reverse() # in reverse order
215 | tokens.reverse() # in reverse order
216 |
217 | buffer_embedding = self.empty_buffer_emb
218 |
219 | while not (len(stack) == 1 and len(buffer_sent) == 0):
220 | valid_actions = []
221 |
222 | if len(buffer_sent) > 0:
223 | valid_actions += [self.SHIFT]
224 | if len(stack) >= 2:
225 | valid_actions += [self.REDUCE_L, self.REDUCE_R]
226 |
227 | action = oracle_actions.pop(0) # Variable
228 |
229 | parser_state_l = []
230 | stack_embedding = self.empty_stack_emb if len(stack) == 0 else stack[-1][0]
231 | parser_state_l.append(stack_embedding.view(1, self.lstm_dim))
232 | parser_state_l.append(buffer_embedding)
233 | if self.action_dim > 0:
234 | act_summary = act_summary.view(1, self.lstm_dim)
235 | parser_state_l.append(act_summary)
236 | parser_state = torch.cat(parser_state_l, 1)
237 | h = F.relu(self.mlp(parser_state)) # 1 x self.lstm_dim
238 | if self.dropout > 0:
239 | h = self.dp(h)
240 |
241 | if len(valid_actions) > 1:
242 | log_probs = F.log_softmax(self.act_output(h), dim=1) # 1 x out_dim
243 | cur_loss_act = - log_probs[0, action.data.item()]
244 | if loss_act is None:
245 | loss_act = cur_loss_act
246 | else:
247 | loss_act += cur_loss_act
248 |
249 | if self.action_dim > 0:
250 | act_embedding = self.act_proj(self.act_embedding(action).view(1, self.action_dim)).view(1, 1, self.action_dim) # 1 x 1 x dim
251 | act_summary, act_state = self.act_rnn(act_embedding, act_state)
252 |
253 | # execute the action to update the parser state
254 | if action.data.item() == self.SHIFT:
255 |
256 | token = tokens.pop()
257 | if self.pos_dim > 0:
258 | log_probs_pos = F.log_softmax(self.pos_output(h), dim=1)
259 | cur_loss_pos = - log_probs_pos[0, token[1]]
260 | if loss_token is None:
261 | loss_token = cur_loss_pos
262 | else:
263 | loss_token += cur_loss_pos
264 | if self.word_dim > 0:
265 | cur_word_idx = token[0].item()
266 | if not self.cluster:
267 | log_probs_word = F.log_softmax(self.word_output(h), dim=1)
268 | cur_loss_word = - log_probs_word[0, cur_word_idx]
269 | if loss_token is None:
270 | loss_token = cur_loss_word
271 | else:
272 | loss_token += cur_loss_word
273 | else:
274 | cur_word_intra_idx = self.wordi2i[cur_word_idx]
275 | cur_c = self.wordi2ci[cur_word_idx] # cluster idx
276 | log_probs_cluster = F.log_softmax(self.cluster_output(h), dim=1)
277 | log_probs_word = F.log_softmax(self.word_output_l[cur_c](h), dim=1) # given c
278 | cur_loss_cluster = - log_probs_cluster[0, cur_c]
279 | cur_loss_intra_cluster = - log_probs_word[0, cur_word_intra_idx]
280 | if loss_token is None:
281 | loss_token = cur_loss_cluster + cur_loss_intra_cluster
282 | else:
283 | loss_token += (cur_loss_cluster + cur_loss_intra_cluster)
284 |
285 | buffer_embedding = buffer_sent.pop() # 1 x self.lstm_dim
286 | token_embedding = input_sent.pop()
287 | stack_state = stack[-1][1] if stack else self.stack_initial_state
288 | output, stack_state = self.stack_rnn(token_embedding.view(1, 1, self.lstm_dim), stack_state)
289 | stack.append((output, stack_state, token))
290 |
291 | else:
292 | right = stack.pop()
293 | left = stack.pop()
294 | head, modifier = (left, right) if action.data.item() == self.REDUCE_R else (right, left)
295 | top_stack_state = stack[-1][1] if len(stack) > 0 else self.stack_initial_state
296 | head_rep, head_tok = head[0], head[2]
297 | mod_rep, mod_tok = modifier[0], modifier[2]
298 | composed_rep = F.relu(self.comp(torch.cat([head_rep, mod_rep], 2).view(1, 2 * self.lstm_dim)))
299 | output, top_stack_state = self.stack_rnn(composed_rep.view(1, 1, self.lstm_dim), top_stack_state)
300 | stack.append((output, top_stack_state, head_tok))
301 |
302 | loss = loss_token if loss_act is None else loss_token + loss_act
303 | return loss, loss_act, loss_token
304 |
--------------------------------------------------------------------------------
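The decoder above factors word generation through Brown clusters: it first predicts the cluster, then the word inside that cluster, and sums the two negative log-probabilities. Below is a minimal, self-contained sketch of that factorization with hypothetical dimensions and indices; it mirrors the cluster_output / word_output_l logic but is not the module itself.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    lstm_dim, n_clusters = 8, 3
    cluster_sizes = [4, 5, 2]                               # hypothetical cluster sizes
    cluster_output = nn.Linear(lstm_dim, n_clusters)        # scores p(cluster | state)
    word_output_l = [nn.Linear(lstm_dim, s) for s in cluster_sizes]  # p(word | cluster, state)

    h = torch.randn(1, lstm_dim)                            # a hypothetical parser state
    c, w_in_c = 1, 3                                        # gold cluster / intra-cluster index

    log_p_cluster = F.log_softmax(cluster_output(h), dim=1)
    log_p_word = F.log_softmax(word_output_l[c](h), dim=1)
    loss = - log_p_cluster[0, c] - log_p_word[0, w_in_c]    # summed negative log-likelihood
    print(loss.item())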
/vi/modules/Encoder.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import dill
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 |
7 | from vi_syntax.vi.myio.Utils import REDUCE_L, REDUCE_R, SHIFT
8 |
9 |
10 | class Encoder(nn.Module):
11 | '''
12 | encoder (inference network)
13 | '''
14 |
15 | def __init__(self,
16 | nlayers,
17 | pos_dim,
18 | action_dim,
19 | lstm_dim,
20 | dropout,
21 | word_dim,
22 | pretrain_word_dim,
23 | pretrain_word_vectors=None,
24 | w2i=None,
25 | i2w=None,
26 | w2i_pos=None,
27 | i2w_pos=None,
28 | # H1=False,
29 | # H1_decay=5.,
30 | # H2=False,
31 | # H2_decay=5.,
32 | para_init=None,
33 | init_name=None,
34 | gpu_id=-1,
35 | seed=-1):
36 |
37 | super(Encoder, self).__init__()
38 |
39 | if seed > 0:
40 | torch.manual_seed(seed)
41 | torch.cuda.manual_seed(seed)
42 | self.i2w = i2w
43 | self.w2i = w2i
44 | self.i2w_pos = i2w_pos
45 | self.w2i_pos = w2i_pos
46 | self.word_dim = word_dim
47 | self.pretrain_word_dim = pretrain_word_dim
48 | self.pos_dim = pos_dim
49 | self.nlayers = nlayers
50 | self.action_dim = action_dim
51 | self.len_word = len(self.i2w)
52 | self.len_pos = len(self.i2w_pos)
53 | self.len_act = 3
54 | self.dropout = dropout
55 | self.lstm_dim = lstm_dim
56 | self.parser_state_dim = 4 * self.lstm_dim if self.action_dim > 0 else 3 * self.lstm_dim
57 | self.para_init = para_init
58 | self.init_name = init_name
59 | self.gpu_id = gpu_id
60 |
61 | self.SHIFT = SHIFT
62 | self.REDUCE_L = REDUCE_L
63 | self.REDUCE_R = REDUCE_R
64 |
65 | # build nn blocks
66 | self.input_proj = nn.Linear(word_dim + pos_dim, self.lstm_dim, False)
67 | self.buffer_rnn = nn.LSTM(input_size=self.lstm_dim, hidden_size=self.lstm_dim,
68 | num_layers=nlayers, bidirectional=True)
69 | self.comp = nn.Linear(2 * self.lstm_dim, self.lstm_dim)
70 | self.stack_rnn = nn.LSTM(input_size=self.lstm_dim, hidden_size=self.lstm_dim,
71 | num_layers=nlayers)
72 | if dropout > 0:
73 | self.dp = nn.Dropout(p=self.dropout)
74 | self.mlp = nn.Linear(self.parser_state_dim, self.lstm_dim)
75 | self.act_output = nn.Linear(self.lstm_dim, self.len_act)
76 |
77 | if self.word_dim > 0:
78 | self.pretrain_proj = nn.Linear(pretrain_word_dim, word_dim)
79 | self.word_embedding = nn.Embedding(self.len_word, pretrain_word_dim)
80 | if self.action_dim > 0:
81 | self.act_rnn = nn.LSTM(input_size=self.lstm_dim, hidden_size=self.lstm_dim,
82 | num_layers=nlayers)
83 | self.act_proj = nn.Linear(action_dim, self.lstm_dim)
84 | self.act_embedding = nn.Embedding(self.len_act, action_dim)
85 | self.act_initial_state = self.get_initial_state(num_layers=self.nlayers, direction=1,
86 | hidden_size=self.lstm_dim, batch_size=1)
87 | if self.pos_dim > 0:
88 | self.pos_embedding = nn.Embedding(self.len_pos, pos_dim)
89 | self.stack_initial_state = self.get_initial_state(num_layers=self.nlayers, direction=1,
90 | hidden_size=self.lstm_dim, batch_size=1)
91 |
92 | self.empty_buffer_emb = torch.randn(1, 2 * self.lstm_dim)
93 | self.empty_buffer_emb = nn.Parameter(self.empty_buffer_emb, requires_grad=False)
94 | if self.action_dim > 0:
95 | self.empty_act_emb = torch.randn(1, self.lstm_dim)
96 | self.empty_act_emb = nn.Parameter(self.empty_act_emb, requires_grad=False)
97 |
98 | if self.para_init is not None:
99 | if self.init_name == 'glorot':
100 | for name, para in self.named_parameters():
101 | # print 'initializing', name
102 | if len(para.size()) < 2:
103 | para.data.zero_()
104 | else:
105 | self.para_init(para)
106 | else:
107 | for name, para in self.named_parameters():
108 | # print 'initializing', name
109 | self.para_init(para)
110 |
111 | if self.word_dim > 0 and self.pretrain_word_dim > 0:
112 | self.load_embeddings(pretrain_word_vectors)
113 |
114 | if self.gpu_id > -1:
115 | self.cuda()
116 |
117 | def load_embeddings(self, pretrain_word_vectors):
118 | self.word_embedding.weight.data.copy_(pretrain_word_vectors)
119 | for para in self.word_embedding.parameters():
120 | para.requires_grad = False
121 |
122 | def save(self, filename):
123 | torch.save(self.state_dict(), filename)
124 |
125 | def get_initial_state(self, num_layers, direction, hidden_size, batch_size):
126 | """
127 | initial states for LSTMs
128 | """
129 | if self.gpu_id > -1:
130 | h0_tensor = torch.zeros(num_layers * direction, batch_size, hidden_size).cuda()
131 | c0_tensor = torch.zeros(num_layers * direction, batch_size, hidden_size).cuda()
132 | else:
133 | h0_tensor = torch.zeros(num_layers * direction, batch_size, hidden_size)
134 | c0_tensor = torch.zeros(num_layers * direction, batch_size, hidden_size)
135 | return (Variable(h0_tensor), Variable(c0_tensor))
136 |
137 | def encode_sentence(self, inv_words, inv_pos_tags):
138 | """
139 | sentence encoding for bi-directional buffer
140 | Arguments:
141 | inv_words([Variable]):
142 | inv_pos_tags([Variable]):
143 | Returns:
144 | input_sent([Variable]):
145 | buffer_sent([Variable]):
146 | """
147 | buffer_initial_state = self.get_initial_state(self.nlayers, 2, self.lstm_dim, 1)
148 | input_sent = None # input embeddings
149 | len_tokens = inv_words.size(1)
150 |
151 | if self.word_dim > 0:
152 | input_word = self.pretrain_proj(self.word_embedding(inv_words).view(-1, self.pretrain_word_dim)) \
153 | .view(len_tokens, 1, self.word_dim) # length x 1 x word_dim
154 | input_word = F.relu(input_word)
155 | input_sent = input_word
156 | if self.pos_dim > 0:
157 | input_pos = self.pos_embedding(inv_pos_tags).view(len_tokens, 1, self.pos_dim) # length x 1 x pos_dim
158 | if input_sent is not None:
159 | input_sent = torch.cat((input_sent, input_pos), 2) # length x 1 x dim
160 | else:
161 | input_sent = input_pos
162 |
163 | input_sent = F.relu(self.input_proj(input_sent.view(len_tokens, self.word_dim + self.pos_dim))) \
164 | .view(len_tokens, 1, self.lstm_dim) # length x 1 x lstm_dim
165 |
166 | # len_tokens x 1 x 2*lstm_dim
167 | buffer_sent, _ = self.buffer_rnn(input_sent, buffer_initial_state)
168 |
169 | input_sent = [input_sent[idx] for idx in range(len_tokens)] # 1 x lstm_dim
170 | buffer_sent = [buffer_sent[idx] for idx in range(len_tokens)] # 1 x 2*lstm_dim
171 |
172 | return input_sent, buffer_sent
173 |
174 | def train_parser(self, words, pos_tags, oracle_actions):
175 | """
176 | train encoder
177 | Arguments:
178 | words(Variable):
179 | pos_tags(Variable):
180 | oracle_actions(Variable):
181 | Returns:
182 | loss_act(Variable):
183 | """
184 |
185 | oracle_actions = [oracle_actions[0, i] for i in range(oracle_actions.size(1))] # copy list
186 | # print [oracle_actions[i].data[0] for i in range(len(oracle_actions))]
187 |
188 | if self.gpu_id > -1:
189 | word_inv_idx = Variable(torch.arange(words.size(1) - 1, -1, -1).long().cuda())
190 | pos_inv_idx = Variable(torch.arange(pos_tags.size(1) - 1, -1, -1).long().cuda())
191 | else:
192 | word_inv_idx = Variable(torch.arange(words.size(1) - 1, -1, -1).long())
193 | pos_inv_idx = Variable(torch.arange(pos_tags.size(1) - 1, -1, -1).long())
194 | # get token_embeddings and buffer (in reversed order)
195 | input_sent, buffer_sent=self.encode_sentence(words.index_select(1, word_inv_idx),
196 | pos_tags.index_select(1, pos_inv_idx))
197 | assert len(buffer_sent) * 2 - 1 == len(oracle_actions)
198 | stack = [] # stack LSTM
199 | if self.action_dim > 0:
200 | act_state = self.act_initial_state # state of act rnn
201 | act_summary = self.empty_act_emb # output of act rnn
202 |
203 | loss_act = None
204 |
205 | while not (len(stack) == 1 and len(buffer_sent) == 0):
206 | valid_actions = [] # based on parser state, get valid actions
207 |
208 | if len(buffer_sent) > 0:
209 | valid_actions += [self.SHIFT]
210 | if len(stack) >= 2:
211 | valid_actions += [self.REDUCE_L, self.REDUCE_R]
212 |
213 | action = oracle_actions.pop(0)
214 | if len(valid_actions) > 1:
215 | parser_state_l = []
216 | stack_embedding = stack[-1][0].view(1, self.lstm_dim)
217 | buffer_embedding = buffer_sent[-1] if buffer_sent else self.empty_buffer_emb # 1 x 2*lstm_dim
218 | parser_state_l.append(stack_embedding)
219 | parser_state_l.append(buffer_embedding)
220 | if self.action_dim > 0:
221 | parser_state_l.append(act_summary.view(1, self.lstm_dim))
222 | parser_state = torch.cat(parser_state_l, 1)
223 | h = F.relu(self.mlp(parser_state)) # 1 x lstm_dim
224 | if self.dropout > 0:
225 | h = self.dp(h)
226 | f = self.act_output(h) # 1 x out_dim
227 | log_probs = F.log_softmax(f, dim=1)
228 | cur_loss_act = - log_probs[0, action.data[0]]
229 |
230 | if loss_act is None:
231 | loss_act = cur_loss_act
232 | else:
233 | loss_act += cur_loss_act
234 |
235 | if self.action_dim > 0:
236 | act_embedding = F.relu(self.act_proj(self.act_embedding(action).view(1, self.action_dim))).view(1, 1, self.action_dim) # 1 x 1 x dim
237 | act_summary, act_state = self.act_rnn(act_embedding, act_state)
238 |
239 | # execute the action to update the parser state
240 | if action.data[0] == self.SHIFT:
241 | buffer_sent.pop()
242 | token_embedding = input_sent.pop() # 1 x lstm_dim
243 | stack_state = stack[-1][1] if len(stack) > 0 else self.stack_initial_state
244 | output, stack_state = self.stack_rnn(token_embedding.view(1, 1, self.lstm_dim), stack_state)
245 | stack.append((output, stack_state))
246 | else:
247 | right = stack.pop()
248 | left = stack.pop()
249 | head, modifier = (left, right) if action.data[0] == self.REDUCE_R else (right, left)
250 | top_stack_state = stack[-1][1] if len(stack) > 0 else self.stack_initial_state
251 | head_rep = head[0]
252 | mod_rep = modifier[0]
253 | composed_rep = F.relu(self.comp(torch.cat([head_rep, mod_rep], 2) \
254 | .view(1, 2 * self.lstm_dim)))
255 | output, top_stack_state = self.stack_rnn(composed_rep.view(1, 1, self.lstm_dim), top_stack_state)
256 | stack.append((output, top_stack_state))
257 |
258 | return loss_act
259 |
260 | def forward(self, words, pos_tags, sample=False):
261 | """
262 | parse
263 | Arguments:
264 | words(Variable):
265 | pos_tags(Variable):
266 | sample(bool): sample an action sequence if True; greedy parse if False
267 | Return:
268 | loss_act(Variable):
269 | act_sequence([]):
270 | """
271 | act_sequence = []
272 |
273 | if self.gpu_id > -1:
274 | word_inv_idx = Variable(torch.arange(words.size(1) - 1, -1, -1).long().cuda())
275 | pos_inv_idx = Variable(torch.arange(pos_tags.size(1) - 1, -1, -1).long().cuda())
276 | else:
277 | word_inv_idx = Variable(torch.arange(words.size(1) - 1, -1, -1).long())
278 | pos_inv_idx = Variable(torch.arange(pos_tags.size(1) - 1, -1, -1).long())
279 | # get token_embeddings and buffer (in reversed order)
280 | input_sent, buffer_sent=self.encode_sentence(words.index_select(1, word_inv_idx),
281 | pos_tags.index_select(1, pos_inv_idx))
282 | stack = [] # stack LSTM
283 | if self.action_dim > 0:
284 | act_summary = self.empty_act_emb # output of act rnn
285 | act_state = self.act_initial_state # state of act rnn
286 |
287 | loss_act = None
288 |
289 | while not (len(stack) == 1 and len(buffer_sent) == 0):
290 | valid_actions = [] # based on parser state, get valid actions
291 |
292 | if len(buffer_sent) > 0:
293 | valid_actions += [self.SHIFT]
294 | if len(stack) >= 2:
295 | valid_actions += [self.REDUCE_L, self.REDUCE_R]
296 |
297 | action = valid_actions[0]
298 |
299 | if len(valid_actions) > 1:
300 | parser_state_l = []
301 | stack_embedding = stack[-1][0].view(1, self.lstm_dim)
302 | # 1 x 2*lstm_dim
303 | buffer_embedding = buffer_sent[-1] if buffer_sent else self.empty_buffer_emb
304 | parser_state_l.append(stack_embedding)
305 | parser_state_l.append(buffer_embedding)
306 | if self.action_dim > 0:
307 | parser_state_l.append(act_summary.view(1, self.lstm_dim))
308 | parser_state = torch.cat(parser_state_l, 1)
309 | h = F.relu(self.mlp(parser_state)) # 1 x lstm_dim
310 | if self.dropout > 0:
311 | h = self.dp(h)
312 | f = self.act_output(h) # 1 x out_dim
313 | log_probs = F.log_softmax(f, dim=1)
314 |
315 | probs = torch.exp(log_probs * 0.8).data # 1 x len_act
316 | for act in (self.SHIFT, self.REDUCE_L, self.REDUCE_R):
317 | if act not in valid_actions:
318 | probs[0, act] = 0.
319 | if sample:
320 | action = torch.multinomial(probs, 1, replacement=True)[0,0]
321 | else:
322 | action = torch.max(probs, 1)[1][0] # int
323 | assert action in valid_actions
324 |
325 | cur_loss_act = - log_probs[0, action]
326 | if loss_act is None:
327 | loss_act = cur_loss_act
328 | else:
329 | loss_act += cur_loss_act
330 |
331 | if self.action_dim > 0:
332 | if self.gpu_id > -1:
333 | act_idx = Variable(torch.LongTensor([[action]]).cuda())
334 | else:
335 | act_idx = Variable(torch.LongTensor([[action]]))
336 | act_embedding = F.relu(self.act_proj(self.act_embedding(act_idx).view(1, self.action_dim)) \
337 | .view(1, 1, self.action_dim)) # 1 x 1 x dim
338 | act_summary, act_state = self.act_rnn(act_embedding, act_state)
339 |
340 | # execute the action to update the parser state
341 | if action == self.SHIFT:
342 | buffer_sent.pop()
343 | token_embedding = input_sent.pop() # 1 x lstm_dim
344 | stack_state = stack[-1][1] if len(stack) > 0 else self.stack_initial_state
345 | output, stack_state = self.stack_rnn(token_embedding.view(1, 1, self.lstm_dim), stack_state)
346 | stack.append((output, stack_state))
347 | else:
348 | right = stack.pop()
349 | left = stack.pop()
350 | head, modifier = (left, right) if action == self.REDUCE_R else (right, left)
351 | top_stack_state = stack[-1][1] if len(stack) > 0 else self.stack_initial_state
352 | head_rep = head[0]
353 | mod_rep = modifier[0]
354 | composed_rep = F.relu(self.comp(torch.cat([head_rep, mod_rep], 2).view(1, 2 * self.lstm_dim)))
355 | output, top_stack_state = self.stack_rnn(composed_rep.view(1, 1, self.lstm_dim), top_stack_state)
356 | stack.append((output, top_stack_state))
357 | act_sequence.append(action)
358 |
359 | return loss_act, act_sequence
360 |
361 | def parse_pr(self, words, pos_tags, rule2i, sample=False):
362 | """
363 | parse for posterior regularization
364 | Arguments:
365 | words(Variable):
366 | pos_tags(Variable):
367 | rule2i(dict):
368 | sample(bool): sample an action sequence if True; greedy parse if False
369 | Return:
370 | loss_act(Variable):
371 | act_sequence([]):
372 | feature(FloatTensor):
373 | """
374 | act_sequence = []
375 |
376 | n = len(rule2i)
377 | feature = torch.zeros(n)
378 |
379 | tokens = [(words.data[0,i], pos_tags.data[0,i]) for i in range(words.size(1))]
380 | tokens.reverse()
381 |
382 | stack = [] # stack LSTM
383 | if self.action_dim > 0:
384 | act_summary = self.empty_act_emb # output of act rnn
385 | act_state = self.act_initial_state # state of act rnn
386 |
387 | if self.gpu_id > -1:
388 | word_inv_idx = Variable(torch.arange(words.size(1) - 1, -1, -1).long().cuda())
389 | pos_inv_idx = Variable(torch.arange(pos_tags.size(1) - 1, -1, -1).long().cuda())
390 | else:
391 | word_inv_idx = Variable(torch.arange(words.size(1) - 1, -1, -1).long())
392 | pos_inv_idx = Variable(torch.arange(pos_tags.size(1) - 1, -1, -1).long())
393 | # get token_embeddings and buffer (in reversed order)
394 | input_sent, buffer_sent = self.encode_sentence(words.index_select(1, word_inv_idx),
395 | pos_tags.index_select(1, pos_inv_idx))
396 | loss_act = None
397 |
398 | while not (len(stack) == 1 and len(buffer_sent) == 0):
399 | valid_actions = [] # based on parser state, get valid actions
400 |
401 | if len(buffer_sent) > 0:
402 | valid_actions += [self.SHIFT]
403 | if len(stack) >= 2:
404 | valid_actions += [self.REDUCE_L, self.REDUCE_R]
405 |
406 | action = valid_actions[0]
407 |
408 | if len(valid_actions) > 1:
409 | parser_state_l = []
410 | stack_embedding = stack[-1][0].view(1, self.lstm_dim)
411 | buffer_embedding = buffer_sent[-1] if buffer_sent else self.empty_buffer_emb # 1 x 2*lstm_dim
412 | parser_state_l.append(stack_embedding)
413 | parser_state_l.append(buffer_embedding)
414 | if self.action_dim > 0:
415 | parser_state_l.append(act_summary.view(1, self.lstm_dim))
416 | parser_state = torch.cat(parser_state_l, 1)
417 | h = F.relu(self.mlp(parser_state)) # 1 x lstm_dim
418 | if self.dropout > 0:
419 | h = self.dp(h)
420 | f = self.act_output(h) # 1 x out_dim
421 | log_probs = F.log_softmax(f, dim=1)
422 |
423 | probs = torch.exp(log_probs * 0.8).data # 1 x len_act
424 | for act in (self.SHIFT, self.REDUCE_L, self.REDUCE_R):
425 | if act not in valid_actions:
426 | probs[0, act] = 0.
427 | if sample:
428 | action = torch.multinomial(probs, 1, replacement=True)[0][0]
429 | else:
430 | action = torch.max(probs, 1)[1][0] # int
431 | assert action in valid_actions
432 |
433 | cur_loss_act = - log_probs[0, action]
434 | if loss_act is None:
435 | loss_act = cur_loss_act
436 | else:
437 | loss_act += cur_loss_act
438 |
439 | if self.action_dim > 0:
440 | if self.gpu_id > -1:
441 | act_idx = Variable(torch.LongTensor([[action]]).cuda())
442 | else:
443 | act_idx = Variable(torch.LongTensor([[action]]))
444 | act_embedding = F.relu(self.act_proj(self.act_embedding(act_idx).view(1, self.action_dim)). \
445 | view(1, 1, self.action_dim)) # 1 x 1 x dim
446 | act_summary, act_state = self.act_rnn(act_embedding, act_state)
447 |
448 | # execute the action to update the parser state
449 | if action == self.SHIFT:
450 | token = tokens.pop()
451 | buffer_sent.pop()
452 | token_embedding = input_sent.pop() # 1 x lstm_dim
453 | stack_state = stack[-1][1] if len(stack) > 0 else self.stack_initial_state
454 | output, stack_state = self.stack_rnn(token_embedding.view(1, 1, self.lstm_dim), stack_state)
455 | stack.append((output, stack_state, token))
456 | else:
457 | right = stack.pop()
458 | left = stack.pop()
459 | head, modifier = (
460 | left, right) if action == self.REDUCE_R else (right, left)
461 | top_stack_state = stack[-1][1] if stack else self.stack_initial_state
462 | head_rep, head_tok = head[0], head[2]
463 | mod_rep, mod_tok = modifier[0], modifier[2]
464 | composed_rep = F.relu(self.comp(torch.cat([head_rep, mod_rep], 2).view(1, 2 * self.lstm_dim)))
465 | output, top_stack_state = self.stack_rnn(composed_rep.view(1, 1, self.lstm_dim), top_stack_state)
466 | stack.append((output, top_stack_state, head_tok))
467 |
468 | head_pos = self.i2w_pos[head_tok[1]]
469 | mod_pos = self.i2w_pos[mod_tok[1]]
470 | if (head_pos, mod_pos) in rule2i:
471 | # print (head_pos, mod_pos)
472 | feature[rule2i[(head_pos, mod_pos)]] -= 1
473 |
474 | act_sequence.append(action)
475 |
476 | tok_mod = stack.pop()
477 | mod_pos = self.i2w_pos[tok_mod[2][1]]
478 | head_pos = 'ROOT'
479 | if (head_pos, mod_pos) in rule2i:
480 | # print (head_pos, mod_pos)
481 | feature[rule2i[(head_pos, mod_pos)]] -= 1
482 |
483 | return loss_act, act_sequence, feature
484 |
--------------------------------------------------------------------------------
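In both forward and parse_pr above, the next transition is chosen by masking invalid actions and either sampling from a temperature-flattened distribution or taking the argmax. The following is a minimal sketch of that selection step; the action indices and logits are hypothetical placeholders, not the repository's values.

    import torch
    import torch.nn.functional as F

    SHIFT, REDUCE_L, REDUCE_R = 0, 1, 2                  # hypothetical action indices
    logits = torch.tensor([[1.2, -0.3, 0.4]])            # stands in for act_output(h)
    valid_actions = [SHIFT, REDUCE_L, REDUCE_R]          # all actions legal in this state

    log_probs = F.log_softmax(logits, dim=1)
    probs = torch.exp(log_probs * 0.8)                   # flatten with temperature 0.8
    for act in (SHIFT, REDUCE_L, REDUCE_R):
        if act not in valid_actions:
            probs[0, act] = 0.                           # mask out illegal transitions

    sampled_action = torch.multinomial(probs, 1).item()  # the sample=True branch
    greedy_action = torch.max(probs, 1)[1].item()        # the sample=False branch
    print(sampled_action, greedy_action)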
/vi/modules/Encoder.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/modules/Encoder.pyc
--------------------------------------------------------------------------------
/vi/modules/PR.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import math
6 |
7 | class PR(nn.Module):
8 |
9 | def __init__(self, epsilon, pr_fname, mc_samples, para_init=None, gpu_id=-1):
10 | """
11 | Arguments:
12 | epsilon(float):
13 | pr_fname(str):
14 | mc_samples(int):
15 | """
16 | super(PR, self).__init__()
17 | self.epsilon = epsilon
18 | self.rules = [] # [(head, mod), ...]
19 | self.mc_samples = mc_samples
20 | self.gpu_id = gpu_id
21 |
22 | b = [] # expectations (b must be negative here)
23 | with open(pr_fname, 'r') as f:
24 | for line in f:
25 | head, mod, expectation = line.split()
26 | self.rules.append((head, mod))
27 | b.append( - float(expectation))
28 | self.rule2i = {self.rules[i]: i for i in range(len(self.rules))}
29 | n = len(self.rules)
30 | M = self.mc_samples
31 | self.phi = torch.zeros(M, n) # features
32 | if self.gpu_id > -1:
33 | self.phi = self.phi.cuda()
34 | self.phi = Variable(self.phi)
35 | self.Lambda = nn.Linear(n, 1, False) # no bias
36 |
37 | if self.gpu_id > -1:
38 | self.b = Variable(torch.FloatTensor(b).cuda())
39 | self.cuda()
40 | else:
41 | self.b = Variable(torch.FloatTensor(b))
42 |
43 | print 'initializing PR'
44 | if para_init is None:
45 | raise ValueError('No initializer')
46 | else:
47 | para_init(self.Lambda.weight)
48 |
49 | self.Lambda.weight = nn.Parameter(-self.b.data.clone().unsqueeze_(0))
50 |
51 | self.project()
52 |
53 | def forward(self):
54 | '''
55 | Uses:
56 | self.phi(Variable): non-positive feature counts, M x n
57 | Return:
58 | objective(Variable): pr loss
59 | pr_factor(Variable): M x 1
60 | '''
61 | M = self.phi.size(0)
62 | temp = - self.Lambda(self.phi) # M x 1
63 | temp = temp.squeeze(1)
64 | log_Z = - math.log(M) + log_sum_exp(temp, dim=0)
65 | objective = torch.mv(self.Lambda.weight, self.b) + log_Z + torch.norm(self.Lambda.weight, p=2) * self.epsilon
66 | pr_factor = temp - log_Z # M
67 | pr_factor = torch.exp(pr_factor) # M
68 | return objective, pr_factor
69 |
70 | def project(self):
71 | """
72 | constrain lambda to be elementwise non-negative
73 | """
74 | self.Lambda.weight.data.copy_(F.relu(self.Lambda.weight).data)
75 |
76 | def reset_phi(self):
77 | """
78 | reset phi
79 | """
80 | self.phi.zero_()
81 |
82 | def check_total(self, pos, rule_total):
83 | """
84 | count the total number of POS-tag occurrences that are involved in rules
85 | Arguments:
86 | pos([str]): pos tags of sentences
87 | rule_total(dict):
88 | """
89 | for i in range(len(self.rules)):
90 | head, mod = self.rules[i]
91 | if head in pos + ['ROOT'] and mod in pos:
92 | rule_total[i] += 1
93 | continue
94 |
95 | # def check_occ(self, phi, rule_occ):
96 | # """
97 | # deprecated
98 | # """
99 | # rule_occ -= np.sum(phi, 1)
100 |
101 |
102 | def log_sum_exp(value, dim=None, keepdim=False):
103 | """
104 | Numerically stable implementation of the operation value.exp().sum(dim, keepdim).log()
105 | """
106 | if dim is None:
107 | raise ValueError('Please specify dim')
108 | m, _ = torch.max(value, dim=dim, keepdim=True)
109 | value0 = value - m
110 | if keepdim is False:
111 | m = m.squeeze(dim)
112 | return m + torch.log(torch.sum(torch.exp(value0), dim=dim, keepdim=keepdim))
--------------------------------------------------------------------------------
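The log_sum_exp helper above exists to keep log Z finite when the exponents are large. A small self-contained check of that behaviour on toy values, re-implementing the same max-shift trick independently of the module:

    import torch

    def log_sum_exp(value, dim):
        # same max-shift trick as PR.log_sum_exp above
        m, _ = torch.max(value, dim=dim, keepdim=True)
        return m.squeeze(dim) + torch.log(torch.sum(torch.exp(value - m), dim=dim))

    x = torch.tensor([1000., 1001., 999.])
    print(log_sum_exp(x, dim=0))               # ~1001.41
    print(torch.log(torch.exp(x).sum()))       # inf: the naive version overflows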
/vi/modules/RNNLM.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.init as init
4 | import torch.nn.functional as F
5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
6 | from torch.autograd import Variable
7 |
8 | import numpy as np
9 |
10 | class LanguageModel(nn.Module):
11 |
12 | def __init__(self,
13 | word_dim,
14 | pos_dim,
15 | lstm_dim,
16 | nlayers,
17 | dropout,
18 | batchsize=1,
19 | tie_weights=False,
20 | pretrain=False,
21 | pretrain_word_vectors=None,
22 | w2i=None,
23 | i2w=None,
24 | w2i_pos=None,
25 | i2w_pos=None,
26 | para_init=None,
27 | init_name=None,
28 | gpu_id=-1):
29 | super(LanguageModel, self).__init__()
30 | self.word_dim = word_dim
31 | self.pos_dim = pos_dim
32 | self.lstm_dim = lstm_dim
33 | self.nlayers = nlayers
34 | self.dropout = dropout
35 | self.batchsize = batchsize
36 | self.tie_weights = tie_weights
37 | self.pretrain = pretrain
38 | self.i2w = i2w
39 | self.w2i = w2i
40 | self.i2w_pos = i2w_pos
41 | self.w2i_pos = w2i_pos
42 | self.len_word = len(self.i2w)
43 | self.len_pos = len(self.i2w_pos)
44 | self.para_init = para_init
45 | self.init_name = init_name
46 | self.gpu_id = gpu_id
47 |
48 | assert self.word_dim * self.pos_dim == 0 # only one feature
49 |
50 | # build nn blocks
51 | if self.w2i['<pad>'] != self.w2i_pos['<pad>']:
52 | raise ValueError('index of <pad> is not consistent')
53 | self.pad_idx = self.w2i['<pad>']
54 | if self.word_dim > 0:
55 | self.embedding = nn.Embedding(self.len_word, self.word_dim, padding_idx=self.pad_idx)
56 | self.decoder = nn.Linear(self.lstm_dim, self.len_word)
57 | self.input_dim = self.word_dim
58 | if self.pos_dim > 0:
59 | self.embedding = nn.Embedding(self.len_pos, self.pos_dim, padding_idx=self.pad_idx)
60 | self.decoder = nn.Linear(self.lstm_dim, self.len_pos)
61 | self.input_dim = self.pos_dim
62 | self.criterion = nn.CrossEntropyLoss(size_average=False, ignore_index=self.pad_idx)
63 | if self.dropout > 0:
64 | self.dp = nn.Dropout(p=self.dropout)
65 | self.lm_rnn = nn.LSTM(input_size=self.input_dim, hidden_size=self.lstm_dim,
66 | num_layers=nlayers, dropout=dropout)
67 |
68 | # initialize
69 | if self.para_init is not None:
70 | if self.init_name == 'glorot':
71 | for name, para in self.named_parameters():
72 | print 'initializing', name
73 | if len(para.size()) < 2:
74 | para.data.zero_()
75 | else:
76 | self.para_init(para)
77 | else:
78 | for name, para in self.named_parameters():
79 | print 'initializing', name
80 | self.para_init(para)
81 |
82 | # load pretrain word embeddings
83 | if self.pretrain:
84 | # only for word embeddings
85 | self.load_embeddings(pretrain_word_vectors)
86 |
87 | # tie weights
88 | if self.tie_weights:
89 | if self.input_dim != self.lstm_dim:
90 | raise ValueError('When using the tied flag, input dim must be equal to lstm hidden dim')
91 | self.decoder.weight = self.embedding.weight
92 | if self.pretrain:
93 | for para in self.decoder.parameters():
94 | para.requires_grad = False
95 |
96 | self.hidden0 = self.init_hidden()
97 | self.empty_emb = torch.randn(1, self.batchsize, self.input_dim) # start-of-sequence embedding prepended to the input
98 | if self.gpu_id > -1:
99 | self.empty_emb = self.empty_emb.cuda()
100 | self.empty_emb = Variable(self.empty_emb)
101 |
102 | if gpu_id > -1:
103 | self.cuda()
104 |
105 | def init_hidden(self):
106 | if self.gpu_id > -1:
107 | return (Variable(torch.zeros(self.nlayers, self.batchsize, self.input_dim).cuda()),
108 | Variable(torch.zeros(self.nlayers, self.batchsize, self.input_dim).cuda()))
109 | else:
110 | return (Variable(torch.zeros(self.nlayers, self.batchsize, self.input_dim)),
111 | Variable(torch.zeros(self.nlayers, self.batchsize, self.input_dim)))
112 |
113 | def load_embeddings(self, pretrain_word_vectors):
114 | self.embedding.weight.data.copy_(pretrain_word_vectors)
115 | for para in self.embedding.parameters():
116 | para.requires_grad = False
117 |
118 | def save(self, filename):
119 | torch.save(self.state_dict(), filename)
120 |
121 | def forward(self, words, pos_tags):
122 | """
123 | language model with a batch of variable lengths
124 | Arguments:
125 | words((Variable, Variable)): padded token indices and
126 | the corresponding sequence lengths
127 | pos_tags(Variable): padded POS-tag indices
128 | Returns:
129 | loss(Variable): - log prob
130 | """
131 | temp = self.batchsize
132 |
133 | if self.word_dim > 0:
134 | tokens, seq_lengths = words
135 | else:
136 | _, seq_lengths = words
137 | tokens = pos_tags
138 | seq_size = tokens.size(1)
139 | self.batchsize = tokens.size(0)
140 |
141 | input_seq = tokens.clone()
142 | target = tokens.clone()
143 | # remove the last token
144 | if input_seq.size(1) == 1:
145 | input_seq = self.empty_emb
146 | else:
147 | for i in range(self.batchsize):
148 | input_seq[i, seq_lengths[i] - 1] = self.pad_idx
149 | input_seq = input_seq[:, : - 1]
150 | input_seq = self.embedding(input_seq)
151 | input_seq = torch.transpose(input_seq, 0, 1).contiguous() # seq_len x batchsize x embed_dim
152 | input_seq = torch.cat((self.empty_emb[:, :self.batchsize, :], input_seq), 0)
153 | input_seq = self.dp(input_seq)
154 |
155 | packed_input = pack_padded_sequence(input_seq, seq_lengths.cpu().numpy())
156 | packed_output, _ = self.lm_rnn(packed_input,
157 | (self.hidden0[0][:, :self.batchsize, :].contiguous(),
158 | self.hidden0[1][:, :self.batchsize, :].contiguous()))
159 | output, _ = pad_packed_sequence(packed_output)
160 | output = self.dp(output) # seq_size x batchsize x lstm_dim
161 | output = output.view(seq_size * self.batchsize, self.lstm_dim)
162 | output = self.decoder(output) # seq_size * batchsize x len_token
163 | output = output.view(seq_size, self.batchsize, -1)
164 | output = torch.transpose(output, 0, 1).contiguous() # batch first
165 | loss = self.criterion(output.view(self.batchsize * seq_size, -1), target.view(-1))
166 |
167 | self.batchsize = temp
168 |
169 | return loss
170 |
171 | # def generate(self, length, temperature):
172 | # sent = ''
173 | # input = self.empty_emb
174 | # hidden = self.hidden0
175 | # for i in range(length):
176 | # output, hidden = self.lm_rnn(input, hidden)
177 | # output_weights = output.squeeze().data.div(temperature).exp()
178 | # token_idx = torch.multinomial(output_weights, 1)[0]
179 | # if self.word_dim > 0:
180 | # token = self.i2w[token_idx]
181 | # else:
182 | # token = self.i2w_pos[token_idx]
183 | # sent += token
184 | # sent += ' '
185 | # if self.gpu_id > -1:
186 | # token_idx = Variable(torch.LongTensor([[token_idx]]).cuda())
187 | # else:
188 | # token_idx = Variable(torch.LongTensor([[token_idx]]))
189 | # input = self.embedding(token_idx)
190 | # return sent
191 |
192 | class Baseline_linear(nn.Module):
193 | """
194 | a 1-to-1 linear layer applied on top of the language-model baseline
195 | """
196 | def __init__(self, gpu_id=-1):
197 | super(Baseline_linear, self).__init__()
198 | self.a = nn.Linear(1, 1)
199 | if gpu_id > -1:
200 | self.a.cuda()
201 | init.constant_(self.a.weight, 1.)
202 | init.constant_(self.a.bias, 0.)
203 |
204 | def forward(self, x):
205 | return self.a(x)
206 |
207 |
208 |
209 |
210 |
--------------------------------------------------------------------------------
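The language model's forward relies on the pack_padded_sequence / pad_packed_sequence pattern to run an LSTM over a batch of variable-length sentences. A minimal, self-contained sketch of that pattern with hypothetical dimensions, not tied to the model above:

    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

    embed_dim, hidden_dim, batchsize, max_len = 5, 7, 3, 4
    rnn = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim, num_layers=1)

    lengths = [4, 3, 2]                              # sorted, longest sentence first
    x = torch.randn(max_len, batchsize, embed_dim)   # seq_len x batch x embed_dim (padded)

    packed = pack_padded_sequence(x, lengths)        # padding positions are skipped by the LSTM
    packed_out, _ = rnn(packed)
    out, out_lengths = pad_packed_sequence(packed_out)
    print(out.size(), out_lengths)                   # back to seq_len x batch x hidden_dim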
/vi/modules/Utils.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from vi_syntax.vi.myio.Utils import SHIFT, REDUCE_L, REDUCE_R
3 |
4 | def compute_dda(oracle_heads, act_seq, output=False, words=None, pos_tags=None, logger=None):
5 | '''
6 | compute directed dependency accuracy (correct and total)
7 | Arguments:
8 | oracle_heads(list): length = n
9 | act_seq(list): length = 2n - 1
10 | output(bool): whether to output tree
11 | words([str]):
12 | pos_tags([str]):
13 | logger():
14 | Return:
15 | correct(int): number of correct heads
16 | n(int): number of total heads
17 | '''
18 | act_seq = act_seq[:]
19 | n = len(oracle_heads)
20 | assert len(act_seq) == 2 * n - 1
21 |
22 | stack = []
23 | buffer = range(1, n + 1)
24 | buffer.reverse()
25 | heads = [-1] * n
26 |
27 | while not (len(stack) == 1 and len(buffer) == 0 ):
28 | action = act_seq.pop(0)
29 | # execute the action
30 | if action == SHIFT:
31 | stack.append(buffer.pop())
32 | else:
33 | right = stack.pop()
34 | left = stack.pop()
35 | head, modifier = (left, right) if action == REDUCE_R else (right, left)
36 | stack.append(head)
37 | heads[modifier - 1] = head
38 |
39 | head = stack.pop()
40 | heads[head - 1] = 0 # head is ROOT
41 |
42 | correct = sum([1 if x == y else 0 for x,y in zip(oracle_heads, heads)])
43 |
44 | # output the parsing trees
45 | if output:
46 | logger.info(' '.join(words))
47 | logger.info('oracle')
48 | for i in range(n):
49 | mod_pos = pos_tags[i]
50 | head_idx = oracle_heads[i]
51 | if head_idx == 0:
52 | head_pos = 'ROOT'
53 | else:
54 | head_pos = pos_tags[head_idx - 1]
55 | logger.info('{:d} {} -> {:d} {}'.format(head_idx, head_pos, i + 1, mod_pos))
56 | logger.info('induced')
57 | for i in range(n):
58 | mod_pos = pos_tags[i]
59 | head_idx = heads[i]
60 | if head_idx == 0:
61 | head_pos = 'ROOT'
62 | else:
63 | head_pos = pos_tags[head_idx - 1]
64 | logger.info('{:d} {} -> {:d} {}'.format(head_idx, head_pos, i + 1, mod_pos))
65 | logger.info('correct: {:d}/{:d}'.format(correct, n))
66 | logger.info('')
67 |
68 | return correct, n
69 |
70 |
71 | def compute_dda_long_dep(oracle_heads,
72 | act_seq,
73 | output=False,
74 | words=None,
75 | pos_tags=None,
76 | logger=None):
77 | '''
78 | compute directed dependency accuracy (correct and total), restricted to long-distance dependencies
79 | Arguments:
80 | oracle_heads(list): length = n
81 | act_seq(list): length = 2n - 1
82 | output(bool): whether to output tree
83 | words([str]):
84 | pos_tags([str]):
85 | logger():
86 | Return:
87 | correct(int): number of correct heads
88 | n(int): number of total heads
89 | '''
90 | act_seq = act_seq[:]
91 | n = len(oracle_heads)
92 | assert len(act_seq) == 2 * n - 1
93 |
94 | stack = []
95 | buffer = range(1, n + 1)
96 | buffer.reverse()
97 | heads = [-1] * n
98 |
99 | while not (len(stack) == 1 and len(buffer) == 0):
100 | action = act_seq.pop(0)
101 | # execute the action
102 | if action == SHIFT:
103 | stack.append(buffer.pop())
104 | else:
105 | right = stack.pop()
106 | left = stack.pop()
107 | head, modifier = (left, right) if action == REDUCE_R else (right,
108 | left)
109 | stack.append(head)
110 | heads[modifier - 1] = head
111 |
112 | head = stack.pop()
113 | heads[head - 1] = 0 # head is ROOT
114 |
115 | correct = 0
116 | n = 0
117 |
118 | for i in range(len(oracle_heads)):
119 | if abs(oracle_heads[i] - i) >= 7:
120 | n += 1
121 | if oracle_heads[i] == heads[i]:
122 | correct += 1
123 |
124 |
125 | return correct, n
126 |
127 | # rule translation for English WSJ
128 | ROOT_SET = ['ROOT']
129 | VERB_SET = ['VB','VBD','VBG','VBN','VBP', 'VBZ']
130 | AUX_SET = ['MD']
131 | ADV_SET = ['RB', 'RBR', 'RBS', 'WRB']
132 | NOUN_SET = ['NN', 'NNS', 'NNP', 'NNPS']
133 | PRON_SET = ['PRP', 'PRP$', 'WP', 'WP$']
134 | ADJ_SET = ['JJ', 'JJR', 'JJS']
135 | ART_SET = ['DT', 'PDT', 'WDT']
136 | NUM_SET = ['CD']
137 | PREP_SET = ['IN']
138 |
139 |
140 | def get_rule_idx(pos_l, pos_r):
141 | if pos_l in ROOT_SET:
142 | if pos_r in AUX_SET:
143 | return 0
144 | if pos_l in ROOT_SET:
145 | if pos_r in VERB_SET:
146 | return 1
147 | if pos_l in VERB_SET:
148 | if pos_r in NOUN_SET:
149 | return 2
150 | if pos_l in VERB_SET:
151 | if pos_r in PRON_SET:
152 | return 3
153 | if pos_l in VERB_SET:
154 | if pos_r in ADV_SET:
155 | return 4
156 | if pos_l in VERB_SET:
157 | if pos_r in VERB_SET:
158 | return 5
159 | if pos_l in AUX_SET:
160 | if pos_r in VERB_SET:
161 | return 6
162 | if pos_l in NOUN_SET:
163 | if pos_r in ADJ_SET:
164 | return 7
165 | if pos_l in NOUN_SET:
166 | if pos_r in ART_SET:
167 | return 8
168 | if pos_l in NOUN_SET:
169 | if pos_r in NOUN_SET:
170 | return 9
171 | if pos_l in NOUN_SET:
172 | if pos_r in NUM_SET:
173 | return 10
174 | if pos_l in PREP_SET:
175 | if pos_r in NOUN_SET:
176 | return 11
177 | if pos_l in ADJ_SET:
178 | if pos_r in ADV_SET:
179 | return 12
180 | return -1
181 |
182 | def compute_rule_acc(oracle_arcs, act_seq, tags):
183 | '''
184 | compute per-rule accuracy: correct and total arc counts for each hand-written rule
185 | '''
186 | oracle_arcs = [(x if isinstance(x, int) else x.item(), y) for x,y in oracle_arcs]
187 |
188 | act_seq = act_seq[:]
189 | n = (len(act_seq) + 1) / 2
190 |
191 | stack = []
192 | buffer = range(1, n + 1)
193 | buffer.reverse()
194 | arcs = set()
195 |
196 | while not (len(stack) == 1 and len(buffer) == 0 ):
197 | action = act_seq.pop(0)
198 | # execute the action
199 | if action == SHIFT:
200 | stack.append(buffer.pop())
201 | else:
202 | right = stack.pop()
203 | left = stack.pop()
204 | head, modifier = (left, right) if action == REDUCE_R else (right, left)
205 | stack.append(head)
206 | arcs.add((head, modifier))
207 |
208 | head = stack.pop()
209 | arcs.add((0, head))
210 |
211 | total_rule = [0.] * 13
212 | correct_rule = [0.] * 13
213 |
214 | assert len(arcs) == len(oracle_arcs)
215 | for arc in oracle_arcs:
216 | if arc[0] == 0:
217 | head_pos = 'ROOT'
218 | else:
219 | head_pos = tags[arc[0] - 1]
220 | mod_pos = tags[arc[1] - 1]
221 | rule_idx = get_rule_idx(head_pos, mod_pos)
222 | if rule_idx > -1:
223 | total_rule[rule_idx] += 1
224 | if arc in arcs:
225 | correct_rule[rule_idx] += 1
226 |
227 | return correct_rule, total_rule
228 |
229 | if __name__ == '__main__':
230 | oracle_heads = [2, 0, 5, 5, 2]
231 | act_seq = [SHIFT, SHIFT, REDUCE_L, SHIFT, SHIFT, REDUCE_L, SHIFT, REDUCE_L, REDUCE_R]
232 | print compute_dda(oracle_heads, act_seq)
233 | else:
234 | pass
--------------------------------------------------------------------------------
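compute_dda above recovers head indices by replaying the arc-standard action sequence. The standalone sketch below re-derives those heads for a 5-token toy sentence whose gold heads are [2, 0, 5, 5, 2]; the action indices are hypothetical placeholders, not the repository's SHIFT/REDUCE constants.

    SHIFT, REDUCE_L, REDUCE_R = 0, 1, 2       # hypothetical action indices

    def heads_from_actions(act_seq, n):
        stack, buf, heads = [], list(range(n, 0, -1)), [-1] * n
        for action in act_seq:
            if action == SHIFT:
                stack.append(buf.pop())
            else:
                right, left = stack.pop(), stack.pop()
                head, mod = (left, right) if action == REDUCE_R else (right, left)
                stack.append(head)
                heads[mod - 1] = head
        heads[stack.pop() - 1] = 0            # the last item on the stack attaches to ROOT
        return heads

    acts = [SHIFT, SHIFT, REDUCE_L, SHIFT, SHIFT, SHIFT, REDUCE_L, REDUCE_L, REDUCE_R]
    print(heads_from_actions(acts, 5))        # [2, 0, 5, 5, 2]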
/vi/modules/Utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/modules/Utils.pyc
--------------------------------------------------------------------------------
/vi/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/modules/__init__.py
--------------------------------------------------------------------------------
/vi/modules/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/modules/__init__.pyc
--------------------------------------------------------------------------------
/vi/myio/IO.py:
--------------------------------------------------------------------------------
1 | from vi_syntax.vi.myio.Utils import check_projective, get_dep_oracle
2 | from vi_syntax.vi.myio.Utils import read_cluster
3 | from torchtext.data import Iterator, Batch
4 | from torchtext import data
5 | from torchtext import datasets
6 | import sys
7 |
8 |
9 | class VIDataset(object):
10 |
11 | def __init__(self):
12 | pass
13 |
14 | def build_wsj_enhanced_dataset(self, data_path=None, train_fname=None,
15 | valid_fname=None, test_fname=None, categorical_unk=True,
16 | filter_non_proj=True, min_length=0, max_length=9999,
17 | min_freq=2, vectors=None, vectors_cache=None, cluster_fname=None):
18 | """
19 | for generating enhanced dataset
20 | """
21 | if data_path is None or train_fname is None or valid_fname is None or test_fname is None:
22 | raise ValueError('missing data path/filename')
23 |
24 | intify = lambda x, *args: [int(token) for token in x]
25 |
26 | # Define the fields associated with the sequences.
27 | self.ID = data.Field(use_vocab=False, batch_first=True,
28 | postprocessing = data.Pipeline(intify),
29 | pad_token='-1')
30 | self.WORD = data.CategoricalUnkField(batch_first=True, include_lengths=True) \
31 | if categorical_unk else data.Field(batch_first=True, include_lengths=True)
32 | self.POS_TAG = data.Field(batch_first=True)
33 | self.DEP_HEAD = data.Field(use_vocab=False, batch_first=True,
34 | postprocessing = data.Pipeline(intify),
35 | pad_token='-1')
36 | self.CLUSTER_IDX = data.Field(use_vocab=False, batch_first=True,
37 | postprocessing = data.Pipeline(intify),
38 | pad_token='-1')
39 | self.INTRA_CLUSTER_IDX = data.Field(use_vocab=False, batch_first=True,
40 | postprocessing = data.Pipeline(intify),
41 | pad_token='-1')
42 | self.ACTION = data.Field(use_vocab=False, batch_first=True,
43 | postprocessing = data.Pipeline(intify),
44 | pad_token='-1')
45 |
46 | wsj_fields = [('id', self.ID), ('word', self.WORD), (None, None),
47 | (None, None), ('pos_tag', self.POS_TAG), (None, None),
48 | ('dep_head', self.DEP_HEAD), (None, None), (None, None),
49 | (None, None), ('cluster_idx', self.CLUSTER_IDX),
50 | ('intra_cluster_idx', self.INTRA_CLUSTER_IDX)]
51 |
52 | def length_filter(x): return len(
53 | x.word) >= min_length and len(x.word) <= max_length
54 |
55 | def sample_filter(x):
56 | if not filter_non_proj:
57 | return length_filter(x)
58 | else:
59 | return length_filter(x) and check_projective(x.dep_head)
60 |
61 | self.train, self.valid, self.test = datasets.SequenceTaggingDataset.splits(path=data_path, train=train_fname,
62 | validation=valid_fname, test=test_fname,
63 | fields=wsj_fields, filter_pred=sample_filter)
64 |
65 | # add shift-reduce action field
66 | for subdataset in(self.train, self.valid, self.test):
67 | subdataset.fields['action'] = self.ACTION
68 | for i in range(len(subdataset)):
69 | setattr(subdataset.examples[i], 'action', get_dep_oracle(
70 | subdataset.examples[i].dep_head))
71 |
72 | print 'train: {:d}, valid: {:d}, test: {:d}'.format(
73 | len(self.train), len(self.valid), len(self.test))
74 |
75 | self.WORD.build_vocab(self.train.word, min_freq=min_freq, vectors=vectors, vectors_cache=vectors_cache)
76 | self.POS_TAG.build_vocab(self.train.pos_tag)
77 |
78 | if cluster_fname is not None:
79 | self.wordi2ci, self.wordi2i, _, self.ci2wordi = read_cluster(cluster_fname, self.WORD.vocab.stoi)
80 |
81 | def build_ud_enhanced_dataset(self, data_path=None, train_fname=None,
82 | valid_fname=None, test_fname=None,
83 | filter_non_proj=True, min_length=0, max_length=9999,
84 | min_freq=2, vectors=None, vectors_cache=None, cluster_fname=None):
85 |         """
86 |         load the enhanced UD dataset and attach shift-reduce oracle actions
87 |         """
88 | if data_path is None or train_fname is None or valid_fname is None or test_fname is None:
89 | raise ValueError('missing data path/filename')
90 |
91 | def intify(x, *args):
92 | return [int(token) for token in x]
93 |
94 | def mylower(x, *args):
95 | return x.lower()
96 |
97 | # Define the fields associated with the sequences.
98 | self.ID = data.Field(use_vocab=False, batch_first=True,
99 | postprocessing = data.Pipeline(intify),
100 | pad_token='-1')
101 | self.WORD = data.Field(batch_first=True, include_lengths=True, preprocessing = data.Pipeline(mylower)) # lower for UD
102 | self.POS_TAG = data.Field(batch_first=True)
103 | self.DEP_HEAD = data.Field(use_vocab=False, batch_first=True,
104 | postprocessing = data.Pipeline(intify),
105 | pad_token='-1')
106 | self.CLUSTER_IDX = data.Field(use_vocab=False, batch_first=True,
107 | postprocessing = data.Pipeline(intify),
108 | pad_token='-1')
109 | self.INTRA_CLUSTER_IDX = data.Field(use_vocab=False, batch_first=True,
110 | postprocessing = data.Pipeline(intify),
111 | pad_token='-1')
112 | self.ACTION = data.Field(use_vocab=False, batch_first=True,
113 | postprocessing = data.Pipeline(intify),
114 | pad_token='-1')
115 |
116 | ud_fields = [('id', self.ID), ('word', self.WORD), (None, None),
117 | ('pos_tag', self.POS_TAG), (None, None), (None, None),
118 | ('dep_head', self.DEP_HEAD), (None, None), (None, None),
119 | (None, None), ('cluster_idx', self.CLUSTER_IDX),
120 | ('intra_cluster_idx', self.INTRA_CLUSTER_IDX)]
121 |
122 | def length_filter(x): return len(
123 | x.word) >= min_length and len(x.word) <= max_length
124 |
125 | def sample_filter(x):
126 | if not filter_non_proj:
127 | return length_filter(x)
128 | else:
129 | return length_filter(x) and check_projective(x.dep_head)
130 |
131 | self.train, self.valid, self.test = datasets.SequenceTaggingDataset.splits(path=data_path, train=train_fname,
132 | validation=valid_fname, test=test_fname,
133 | fields=ud_fields, filter_pred=sample_filter)
134 |
135 | # add shift-reduce action field
136 | for subdataset in(self.train, self.valid, self.test):
137 | subdataset.fields['action'] = self.ACTION
138 | for i in range(len(subdataset)):
139 | setattr(subdataset.examples[i], 'action', get_dep_oracle(
140 | subdataset.examples[i].dep_head))
141 |
142 | print 'train: {:d}, valid: {:d}, test: {:d}'.format(
143 | len(self.train), len(self.valid), len(self.test))
144 |
145 | self.WORD.build_vocab(self.train.word, min_freq=min_freq, vectors=vectors, vectors_cache=vectors_cache)
146 | self.POS_TAG.build_vocab(self.train.pos_tag)
147 |
148 | if cluster_fname is not None:
149 | self.wordi2ci, self.wordi2i, _, self.ci2wordi = read_cluster(cluster_fname, self.WORD.vocab.stoi)
150 |
151 | def build_wsj_dataset(self, data_path=None, train_fname=None,
152 | valid_fname=None, test_fname=None, categorical_unk=True,
153 | filter_non_proj=True, min_length=0, max_length=9999,
154 | min_freq=2):
155 |         """
156 |         load the raw WSJ treebank (used by Preprocess to generate the enhanced dataset)
157 |         """
158 | if data_path is None or train_fname is None or valid_fname is None or test_fname is None:
159 | raise ValueError('missing data path/filename')
160 |
161 | # Define the fields associated with the sequences.
162 | self.WORD = data.CategoricalUnkField() if categorical_unk else data.Field()
163 | self.CPOS_TAG = data.Field()
164 | self.POS_TAG = data.Field()
165 | self.DEP_HEAD = data.Field(use_vocab=False)
166 | self.CLUSTER_IDX = data.Field(use_vocab=False)
167 | self.INTRA_CLUSTER_IDX = data.Field(use_vocab=False)
168 |
169 | wsj_fields = [(None, None), ('word', self.WORD), (None, None),
170 | ('cpos_tag', self.CPOS_TAG), ('pos_tag', self.POS_TAG), (None, None),
171 | ('dep_head', self.DEP_HEAD)]
172 |
173 | def length_filter(x): return len(
174 | x.word) >= min_length and len(x.word) <= max_length
175 |
176 | def sample_filter(x):
177 | if not filter_non_proj:
178 | return length_filter(x)
179 | else:
180 | return length_filter(x) and check_projective(x.dep_head)
181 |
182 | self.train, self.valid, self.test = datasets.SequenceTaggingDataset.splits(path=data_path, train=train_fname,
183 | validation=valid_fname, test=test_fname,
184 | fields=wsj_fields, filter_pred=sample_filter)
185 | print 'train: {:d}, valid: {:d}, test: {:d}'.format(len(self.train), len(self.valid), len(self.test))
186 |
187 | self.WORD.build_vocab(self.train.word, min_freq=min_freq)
188 | self.POS_TAG.build_vocab(self.train.pos_tag)
189 | print 'vocab: {:d}'.format(len(self.WORD.vocab))
190 |
191 | def build_ud_dataset(self, data_path=None, train_fname=None,
192 | valid_fname=None, test_fname=None, categorical_unk=True,
193 | filter_non_proj=True, min_length=0, max_length=9999,
194 | min_freq=2):
195 |         """
196 |         load the raw UD treebank (used by Preprocess to generate the enhanced dataset)
197 |         """
198 | if data_path is None or train_fname is None or valid_fname is None or test_fname is None:
199 | raise ValueError('missing data path/filename')
200 |
201 | def mylower(x, *args):
202 | return x.lower()
203 |
204 | # Define the fields associated with the sequences.
205 | self.ID = data.Field(use_vocab=False)
206 | self.WORD = data.Field(preprocessing=data.Pipeline(mylower))
207 | self.CPOS_TAG = data.Field()
208 | self.POS_TAG = data.Field()
209 | self.DEP_HEAD = data.Field(use_vocab=False)
210 | self.DEP_REL = data.Field()
211 |
212 | ud_fields = [('id', self.ID), ('word', self.WORD), (None, None),
213 | ('cpos_tag', self.CPOS_TAG), ('pos_tag',self.POS_TAG), (None, None),
214 | ('dep_head', self.DEP_HEAD), ('dep_rel',self.DEP_REL), (None, None),
215 | (None, None)]
216 |
217 | def length_filter(x): return len(
218 | x.word) >= min_length and len(x.word) <= max_length
219 |
220 | def sample_filter(x):
221 | if not filter_non_proj:
222 | return length_filter(x)
223 | else:
224 | return length_filter(x) and check_projective(x.dep_head)
225 |
226 | self.train, self.valid, self.test = datasets.SequenceTaggingDataset.splits(path=data_path, train=train_fname,
227 | validation=valid_fname, test=test_fname,
228 | fields=ud_fields, filter_pred=sample_filter)
229 |
230 |
231 | self.WORD.build_vocab(self.train.word, min_freq=min_freq)
232 | self.CPOS_TAG.build_vocab(self.train.cpos_tag)
233 | self.POS_TAG.build_vocab(self.train.pos_tag)
234 |
--------------------------------------------------------------------------------
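Both enhanced builders above are normally driven from the training scripts; below is a minimal loading sketch for the French UD files shipped under data/ud. Paths are assumed to be repo-relative, the customized torchtext build from the README is assumed to be installed, and the length cap is only illustrative:

    from vi_syntax.vi.myio.IO import VIDataset

    dataset = VIDataset()
    dataset.build_ud_enhanced_dataset(
        data_path='./data/ud',                      # assumed repo-relative layout
        train_fname='fr_train_enhanced',
        valid_fname='fr_valid_enhanced',
        test_fname='fr_test_enhanced',
        max_length=10,                              # illustrative length cap, not the tuned setting
        cluster_fname='./data/cluster/fr_cluster')  # optional Brown clusters
    # dataset.train / dataset.valid / dataset.test now carry word, pos_tag, dep_head,
    # the cluster indices and the shift-reduce oracle actions added at load time.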
/vi/myio/IO.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/myio/IO.pyc
--------------------------------------------------------------------------------
/vi/myio/Preprocess.py:
--------------------------------------------------------------------------------
1 | """
2 | preprocess the WSJ and UD treebanks and add extra fields:
3 | shift-reduce actions, cluster index, intra-cluster index
4 | """
5 | import sys
6 | from IO import VIDataset
7 | from Utils import read_cluster
8 | import os.path
9 |
10 | def preprocess_wsj(data_path, train_fname, valid_fname,
11 | test_fname, target_path, train_enhanced_fname,
12 | valid_enhanced_fname, test_enhanced_fname, cluster_fname=None):
13 | dataset = VIDataset()
14 | dataset.build_wsj_dataset(data_path=data_path, train_fname=train_fname, valid_fname=valid_fname,
15 | test_fname=test_fname)
16 |
17 | if cluster_fname is not None:
18 | wordi2ci, wordi2i, _, _ = read_cluster(cluster_fname, dataset.WORD.vocab.stoi)
19 |
20 | for fname, subdataset in zip([train_enhanced_fname, valid_enhanced_fname, test_enhanced_fname],
21 | [dataset.train, dataset.valid, dataset.test]):
22 | with open(os.path.join(target_path, fname), 'w') as f:
23 | for i in range(len(subdataset.examples)):
24 | example = subdataset.examples[i]
25 | cluster_indices = [wordi2ci[dataset.WORD.vocab.stoi[token]] for token in example.word]
26 | intra_cluster_indices = [
27 | wordi2i[dataset.WORD.vocab.stoi[token]] for token in example.word]
28 | lines = get_conll_sample_str(range(1, len(example.word)+1), example.word,
29 | ['_'] * len(example.word), ['_'] * len(example.word),
30 | example.pos_tag, ['_'] * len(example.word),
31 | example.dep_head, ['_'] * len(example.word),
32 | ['_'] * len(example.word), ['_'] * len(example.word),
33 | cluster_indices, intra_cluster_indices)
34 | f.write(lines)
35 | f.write('\n')
36 |
37 |
38 | def preprocess_ud(data_path, train_fname, valid_fname,
39 | test_fname, target_path, train_enhanced_fname,
40 | valid_enhanced_fname, test_enhanced_fname, cluster_fname=None):
41 | dataset = VIDataset()
42 | dataset.build_ud_dataset(data_path=data_path, train_fname=train_fname, valid_fname=valid_fname,
43 | test_fname=test_fname)
44 |
45 | if cluster_fname is not None:
46 | wordi2ci, wordi2i, _, _ = read_cluster(cluster_fname, dataset.WORD.vocab.stoi)
47 |
48 | for fname, subdataset in zip([train_enhanced_fname, valid_enhanced_fname, test_enhanced_fname],
49 | [dataset.train, dataset.valid, dataset.test]):
50 | with open(os.path.join(target_path, fname), 'w') as f:
51 | for i in range(len(subdataset.examples)):
52 | example = subdataset.examples[i]
53 | cluster_indices = [wordi2ci[dataset.WORD.vocab.stoi[token]] for token in example.word]
54 | intra_cluster_indices = [wordi2i[dataset.WORD.vocab.stoi[token]] for token in example.word]
55 | lines = get_conll_sample_str(range(1, len(example.word)+1), example.word,
56 | ['_'] * len(example.word), example.cpos_tag,
57 | example.pos_tag, ['_'] * len(example.word),
58 | example.dep_head, example.dep_rel,
59 | ['_'] * len(example.word), ['_'] * len(example.word),
60 | cluster_indices, intra_cluster_indices)
61 | f.write(lines)
62 | f.write('\n')
63 | return dataset
64 |
65 | def get_conll_sample_str(*args):
66 |     """
67 |     Arguments:
68 |         *args: one list per CoNLL column (all of equal length)
69 |     Returns a tab-separated multi-line string, one row per token.
70 |     """
71 | column = len(args)
72 | lines = ''
73 | for x in zip(*args):
74 | for i in range(column):
75 | lines += str(x[i]) if not isinstance(x[i], str) else x[i]
76 | lines += '\t'
77 | lines += '\n'
78 | return lines
79 |
80 |
81 | if __name__ == '__main__':
82 | # # generate enhanced wsj conll dataset
83 | # data_path = '/Users/boon/code/study/corpora/wsj dependency/'
84 | # train_fname = 'wsj-inf_2-21_dep'
85 | # valid_fname = 'wsj-inf_22_dep'
86 | # test_fname = 'wsj-inf_23_dep'
87 | # target_path = '/Users/boon/Dropbox/code/study/pytorch/vi_syntax/data/wsj'
88 | # train_enhanced_fname = 'wsj_train_enhanced'
89 | # valid_enhanced_fname = 'wsj_valid_enhanced'
90 | # test_enhanced_fname = 'wsj_test_enhanced'
91 | # cluster_fname = '../../data/cluster/clusters-train-berk.txt'
92 | # preprocess_wsj(data_path, train_fname, valid_fname,
93 | # test_fname, target_path, train_enhanced_fname,
94 | # valid_enhanced_fname, test_enhanced_fname, cluster_fname)
95 |
96 | # generate enhanced ud conll dataset
97 | language_id = 'pt'
98 | data_path = '/Users/boon/Dropbox/code/study/pytorch/vi_syntax/data/ud'
99 | train_fname = language_id + '-ud-train_clean.conllu'
100 | valid_fname = language_id + '-ud-dev_clean.conllu'
101 | test_fname = language_id + '-ud-test_clean.conllu'
102 | target_path = '/Users/boon/Dropbox/code/study/pytorch/vi_syntax/data/ud'
103 | train_enhanced_fname = language_id + '_train_enhanced'
104 | valid_enhanced_fname = language_id + '_valid_enhanced'
105 | test_enhanced_fname = language_id + '_test_enhanced'
106 | cluster_fname = '../../data/cluster/' + language_id + '_cluster'
107 | dataset = preprocess_ud(data_path, train_fname, valid_fname,
108 | test_fname, target_path, train_enhanced_fname,
109 | valid_enhanced_fname, test_enhanced_fname, cluster_fname)
110 | pass
111 | else:
112 | pass
113 |
--------------------------------------------------------------------------------
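For reference, get_conll_sample_str emits one tab-separated row per token and one column per argument list; a toy call is sketched below, with every value invented purely for illustration and the import assuming the same package layout used in IO.py:

    from vi_syntax.vi.myio.Preprocess import get_conll_sample_str

    words = ['le', 'chat']                          # hypothetical two-token sentence
    block = get_conll_sample_str(
        range(1, len(words) + 1), words,            # id, word
        ['_'] * 2, ['DET', 'NOUN'],                 # placeholder, coarse POS
        ['DET', 'NOUN'], ['_'] * 2,                 # fine POS, placeholder
        [2, 0], ['det', 'root'],                    # head, relation
        ['_'] * 2, ['_'] * 2,                       # unused CoNLL columns
        [7, 3], [42, 5])                            # cluster idx, intra-cluster idx (made up)
    print(block)                                    # two tab-separated rows, one per token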
/vi/myio/Preprocess.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/myio/Preprocess.pyc
--------------------------------------------------------------------------------
/vi/myio/Tree.py:
--------------------------------------------------------------------------------
1 | '''
2 | a simple tree structure to help get shift-reduce actions
3 | '''
4 | class tree(object):
5 |
6 | def __init__(self, tokens, arcs):
7 | self.nodes = {token: node(name=token) for token in tokens}
8 | for arc in arcs:
9 | i, j = arc
10 | self.nodes[j].set_parent(self.nodes[i])
11 |
12 | def remove_node(self, idx):
13 | if len(self.nodes[idx].children) > 0 or not self.nodes[idx].parent:
14 | return False
15 | else:
16 | self.nodes[idx].parent.children.remove(self.nodes[idx])
17 | del self.nodes[idx]
18 | return True
19 |
20 | def get_nodes(self):
21 | return self.nodes
22 |
23 | def show(self):
24 | for name in self.nodes:
25 | print self.nodes[name], '->', self.nodes[name].children
26 |
27 | def _depth(self, n):
28 | if len(n.children) == 0:
29 | return 1
30 | else:
31 | return max([1 + self._depth(child) for child in n.children])
32 |
33 | def get_depth(self):
34 | root = None
35 | for token in self.nodes:
36 | if not self.nodes[token].parent:
37 | root = self.nodes[token]
38 | break
39 | if not root:
40 | raise ValueError('no root node!')
41 | return self._depth(root)
42 |
43 |
44 | class node(object):
45 | def __init__(self, name):
46 | self.name = name
47 | self.children = []
48 | self.parent = None
49 |
50 | def set_parent(self, parent):
51 | self.parent = parent
52 | self.parent.children.append(self)
53 |
54 | def get_name(self):
55 | return self.name
56 |
57 | def get_parent(self):
58 | return self.parent
59 |
60 | def get_children(self):
61 | return self.children
62 |
63 | def __str__(self):
64 | return str(self.name)
65 |
66 | def __repr__(self):
67 | return self.__str__()
68 |
69 |
70 | if __name__ == '__main__':
71 | tokens = range(1, 6)
72 | arcs = [(2, 1), (2, 4), (4, 3), (4, 5)]
73 |
74 | t = tree(tokens, arcs)
75 | t.show()
76 | print 'tree depth', t.get_depth()
77 |
78 | print '*' * 10
79 | t.remove_node(3)
80 | t.show()
81 |
82 | print '*' * 10
83 | t.remove_node(5)
84 | t.show()
85 |
86 | print '*' * 10
87 | t.remove_node(4)
88 | t.show()
89 |
90 | print '*' * 10
91 | t.remove_node(1)
92 | t.show()
93 |
94 | print '*' * 10
95 | t.remove_node(2)
96 | t.show()
97 |
98 | else:
99 | pass
100 |
--------------------------------------------------------------------------------
/vi/myio/Tree.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/myio/Tree.pyc
--------------------------------------------------------------------------------
/vi/myio/Utils.py:
--------------------------------------------------------------------------------
1 | from Tree import tree
2 | import time
3 |
4 | SHIFT = 0
5 | REDUCE_L = 1
6 | REDUCE_R = 2
7 |
8 | def check_projective(heads):
9 |     """
10 |     brute-force projectivity check: returns True iff the tree is projective
11 |     """
12 | if len(heads) < 1:
13 | raise ValueError('length of heads should be larger than 0')
14 | if not isinstance(heads[0], int):
15 | heads = [int(head) for head in heads]
16 | arcs = [ (heads[mod], mod+1) for mod in range(len(heads)) ]
17 | for p in range(len(arcs)):
18 | for q in range(len(arcs)):
19 | if p != q:
20 | a = min(arcs[p])
21 | b = max(arcs[p])
22 | c = min(arcs[q])
23 | d = max(arcs[q])
24 | if a in range(c + 1, d) and b > d or b in range(c + 1, d) and a < c:
25 | return False
26 | return True
27 |
28 | def get_dep_oracle(heads):
29 | """
30 | get shift-reduce actions
31 | """
32 | if len(heads) < 1:
33 | raise ValueError('length of heads should be larger than 0')
34 | if not isinstance(heads[0], int):
35 | heads = [int(head) for head in heads]
36 | arcs = [(heads[mod], mod + 1) for mod in range(len(heads))]
37 | action = []
38 | t = tree(range(len(heads) + 1), arcs)
39 |
40 | buffer = range(1, len(arcs) + 1)
41 | buffer.reverse()
42 | stack = []
43 |
44 | while not (len(stack) == 1 and len(buffer) == 0):
45 | if len(stack) < 2:
46 | # shift
47 | stack.append(buffer.pop())
48 | action.append(SHIFT)
49 | else:
50 | i = stack[-2]
51 | j = stack[-1]
52 | child = None
53 | if (i, j) in arcs and t.remove_node(j):
54 | # reduce_r
55 | action.append(REDUCE_R)
56 | child = j
57 | stack.remove(child)
58 | continue
59 | if (j, i) in arcs and t.remove_node(i):
60 | # reduce_l
61 | action.append(REDUCE_L)
62 | child = i
63 | stack.remove(child)
64 | continue
65 | if not child:
66 | # shift
67 | if len(buffer) == 0:
68 | # non projective dependency tree
69 |                     raise ValueError('Encountered a non-projective or non-single-head tree')
70 | stack.append(buffer.pop())
71 | action.append(SHIFT)
72 | assert len(heads) * 2 - 1 == len(action)
73 | return action
74 |
75 | def read_cluster(cluster_fname, w2i_word):
76 |     """
77 |     Arguments:
78 |         cluster_fname (str): path to a Brown cluster file (binary path, word, count per line)
79 |         w2i_word (dict): word-to-index vocabulary mapping
80 |     """
81 | wordi2ci = {} # word 2 cluster idx
82 | wordi2i = {} # word 2 intra cluster idx
83 | cw = set()
84 | ci2wordi = {} # cluster idx 2 word idx list
85 | c_set = set()
86 | c_list = []
87 | with open(cluster_fname) as f:
88 | for line in f:
89 | binary, word, _ = line.split()
90 | c = int(binary, 2)
91 | if c not in c_set:
92 | c_list.append(len(c_list))
93 | ci2wordi[c_list[-1]] = []
94 | c_set.add(c)
95 | if word in w2i_word:
96 | wordi2i[w2i_word[word]] = len(ci2wordi[c_list[-1]])
97 | ci2wordi[c_list[-1]].append(w2i_word[word])
98 | wordi2ci[w2i_word[word]] = c_list[-1]
99 | cw.add(word)
100 |
101 | extra = set(w2i_word.keys()) - cw
102 | if len(extra) > 0: # add one more cluster for extra words
103 | c_list.append(len(c_list))
104 | ci2wordi[c_list[-1]] = []
105 | for word in extra:
106 | wordi2i[w2i_word[word]] = len(ci2wordi[c_list[-1]])
107 | ci2wordi[c_list[-1]].append(w2i_word[word])
108 | wordi2ci[w2i_word[word]] = c_list[-1]
109 | return wordi2ci, wordi2i, c_list, ci2wordi
110 |
111 | if __name__ == '__main__':
112 | # test check_projective
113 | assert check_projective([0, 0, 2, 2]) == True
114 | assert check_projective([0, 0, 1, 3]) == False
115 |
116 | # test get dependency oracle
117 | cur_time = time.time()
118 | heads = [2, 0, 4, 5, 2, 2, 8, 6, 8, 9, 10, 13, 10, 13, 13, 8, 16, 17, 21, 21, 23, 23, 18] # should be single head trees
119 | print len(heads)
120 | print get_dep_oracle(heads)
121 | print time.time() - cur_time
122 |
123 | else:
124 | pass
125 |
--------------------------------------------------------------------------------
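read_cluster expects the output of the Brown clustering tool linked in the README: one whitespace-separated "binary-path word count" triple per line, grouped by cluster. A small usage sketch (the vocabulary below is made up; the cluster file is the French one shipped with the repo):

    from vi_syntax.vi.myio.Utils import read_cluster

    w2i = {'le': 0, 'chat': 1, 'dort': 2, '<unk>': 3}    # hypothetical word -> index vocab
    wordi2ci, wordi2i, clusters, ci2wordi = read_cluster('./data/cluster/fr_cluster', w2i)
    # wordi2ci: word index -> cluster index
    # wordi2i : word index -> position inside its cluster
    # vocabulary entries missing from the cluster file all land in one extra cluster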
/vi/myio/Utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/myio/Utils.pyc
--------------------------------------------------------------------------------
/vi/myio/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/myio/__init__.py
--------------------------------------------------------------------------------
/vi/myio/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/libowen2121/VI-dependency-syntax/b9853a32fbfd7810ef03b5728fb1e01941504d96/vi/myio/__init__.pyc
--------------------------------------------------------------------------------
/vi/myio/ud_cleaner.py:
--------------------------------------------------------------------------------
1 | '''
2 | remove multiword-token lines (IDs like 1-2) from the UD treebank
3 | '''
4 | import os.path
5 |
6 | lang_dir = 'UD_Spanish'
7 | lang = 'es'
8 |
9 | data_dir = '/Users/boon/code/study/corpora/Universal Dependencies 1.4/ud-treebanks-v1.4'
10 | target_dir = '../../data/ud'
11 |
12 | train_fr = os.path.join(data_dir, lang_dir, lang+'-ud-train.conllu')
13 | dev_fr = os.path.join(data_dir, lang_dir, lang+'-ud-dev.conllu')
14 | test_fr = os.path.join(data_dir, lang_dir, lang + '-ud-test.conllu')
15 |
16 | train_fw = os.path.join(target_dir, lang+'-ud-train_clean.conllu')
17 | dev_fw = os.path.join(target_dir, lang+'-ud-dev_clean.conllu')
18 | test_fw = os.path.join(target_dir, lang + '-ud-test_clean.conllu')
19 |
20 | for fr, fw in zip((train_fr, dev_fr, test_fr), (train_fw, dev_fw, test_fw)):
21 | fpr = open(fr)
22 | fpw = open(fw, 'w')
23 | for line in fpr:
24 | if len(line.split()) > 0 and '-' in line.split()[0]:
25 | continue
26 | fpw.write(line)
27 | fpr.close()
28 | fpw.close()
--------------------------------------------------------------------------------
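The only rows dropped are CoNLL-U multiword-token ranges, whose first (ID) column contains a dash; every other line is copied through unchanged. An invented example:

    8-9	du	_	_	_	_	_	_	_	_      <- removed: ID contains a dash
    8	de	_	_	_	_	_	_	_	_      <- kept
    9	le	_	_	_	_	_	_	_	_      <- kept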
/vi/nvil_ft.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--cluster', dest='cluster', action='store_true',
16 | default=False)
17 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
18 | default='', help='dir pretrained word cluster files')
19 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
20 | default='', help='dir for caching pretrained word vectors')
21 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
22 | default='', help='dir to store results')
23 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
24 | default='', help='dir to store encoder models') # for loading
25 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
26 | default='', help='dir to store decoder models') # for loading
27 | parser.add_option('--lm_fname', dest='lm_fname', metavar='FILE',
28 | default='', help='dir to store baseline models') # for loading
29 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
30 | default='nvil_test.log')
31 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
32 |
33 |     # optimizer & initializer
34 | parser.add_option('--initializer', dest='initializer',
35 | default='glorot', help='[glorot,constant,uniform,normal]')
36 | parser.add_option('--optimizer', dest='optimizer',
37 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
38 | # model params
39 | parser.add_option("--pretrain_word_dim",type="int", dest="pretrain_word_dim", default=300)
40 | parser.add_option('--word_dim', type='int', dest='word_dim', default=80)
41 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=80)
42 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
43 | parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=64)
44 | parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=64)
45 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
46 | # training options
47 | # parser.add_option('--decay_every', type='int', dest='decay_every', default=5000)
48 | parser.add_option('--epochs', type='int', dest='epochs', default=10)
49 | parser.add_option('--print_every', type='int', dest='print_every', default=10)
50 | parser.add_option('--save_every', type='int', dest='save_every', default=10)
51 | # optimization misc
52 | parser.add_option('--lr', type='float', dest='lr', default=0.01) # 0.01
53 |
54 | # parser.add_option('--lrdecay', type='float', dest='decay', default=0.75)
55 | parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.5)
56 | parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.5)
57 | # parser.add_option('--enc_reg', type='float', dest='enc_reg', default=0.1)
58 | # parser.add_option('--dec_reg', type='float', dest='dec_reg', default=1.0)
59 | # parser.add_option('--enc_update', type='float', dest='enc_update', default=1.0)
60 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
61 | # limit length options
62 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
63 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
64 | # test options
65 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
66 |
67 | # for lm
68 | parser.add_option("--lm_pretrain", action='store_true', dest="lm_pretrain", default=False)
69 | parser.add_option("--tie_weights", action='store_true', dest="tie_weights", default=False)
70 | parser.add_option('--lm_word_dim', type='int', dest='lm_word_dim', default=100)
71 | parser.add_option('--lm_pos_dim', type='int', dest='lm_pos_dim', default=0) # 50
72 | parser.add_option('--lm_lstm_dim', type='int', dest='lm_lstm_dim', default=100)
73 | parser.add_option('--lm_nlayers', type='int', dest='lm_nlayers', default=2)
74 | parser.add_option('--lm_dropout', type='float', dest='lm_dropout', default=0.5)
75 | parser.add_option('--clip', type='float', dest='clip', default=0.25)
76 |
77 | # for pr
78 | parser.add_option('--pr_initializer', dest='pr_initializer',
79 | default='normal', help='[glorot,constant,uniform,normal]')
80 | parser.add_option('--pr_optimizer', dest='pr_optimizer',
81 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
82 | parser.add_option('--epsilon', type='float', dest='epsilon', default=0.1)
83 | parser.add_option('--pr_fname', dest='pr_fname', metavar='FILE',
84 | default='', help='')
85 | parser.add_option('--mc_samples', type='int', dest='mc_samples', default=20)
86 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
87 | parser.add_option('--nvil_batchsize', type='int', dest='nvil_batchsize', default=1)
88 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
89 | (options, args) = parser.parse_args()
90 | s = Session(options)
91 | s.nvil_pr_ft()
--------------------------------------------------------------------------------
/vi/nvil_ft_ud.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--cluster', dest='cluster', action='store_true',
16 | default=False)
17 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
18 | default='', help='dir pretrained word cluster files')
19 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
20 | default='', help='dir for caching pretrained word vectors')
21 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
22 | default='', help='dir to store results')
23 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
24 | default='', help='dir to store encoder models') # for loading
25 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
26 | default='', help='dir to store decoder models') # for loading
27 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
28 | default='nvil_test.log')
29 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
30 |
31 |     # optimizer & initializer
32 | parser.add_option('--initializer', dest='initializer',
33 | default='glorot', help='[glorot,constant,uniform,normal]')
34 | parser.add_option('--optimizer', dest='optimizer',
35 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
36 | # model params
37 | parser.add_option("--pretrain_word_dim",type="int", dest="pretrain_word_dim", default=300)
38 | parser.add_option('--word_dim', type='int', dest='word_dim', default=80)
39 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=80)
40 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
41 | parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=64)
42 | parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=64)
43 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
44 | # training options
45 | parser.add_option('--epochs', type='int', dest='epochs', default=10)
46 | parser.add_option('--print_every', type='int', dest='print_every', default=10)
47 | parser.add_option('--save_every', type='int', dest='save_every', default=10)
48 | # optimization misc
49 | parser.add_option('--lr', type='float', dest='lr', default=0.01) # 0.01
50 |
51 | parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.5)
52 | parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.5)
53 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
54 | # limit length options
55 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
56 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
57 | # test options
58 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
59 |
60 | # for lm
61 | parser.add_option('--clip', type='float', dest='clip', default=0.25)
62 | parser.add_option('--seed', type='int', dest='seed', default=-1)
63 |
64 | # for pr
65 | parser.add_option('--pr_initializer', dest='pr_initializer',
66 | default='normal', help='[glorot,constant,uniform,normal]')
67 | parser.add_option('--pr_optimizer', dest='pr_optimizer',
68 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
69 | parser.add_option('--epsilon', type='float', dest='epsilon', default=0.1)
70 | parser.add_option('--pr_fname', dest='pr_fname', metavar='FILE',
71 | default='', help='')
72 | parser.add_option('--mc_samples', type='int', dest='mc_samples', default=20)
73 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
74 | parser.add_option('--nvil_batchsize', type='int', dest='nvil_batchsize', default=1)
75 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
76 | parser.add_option('--language', dest='language', metavar='FILE',
77 | default='', help='')
78 | (options, args) = parser.parse_args()
79 | s = Session(options)
80 | s.nvil_pr_ft_ud()
--------------------------------------------------------------------------------
/vi/nvil_pre.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--cluster', dest='cluster', action='store_true',
16 | default=False)
17 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
18 | default='', help='dir pretrained word cluster files')
19 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
20 | default='', help='dir for caching pretrained word vectors')
21 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
22 | default='', help='dir to store results')
23 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
24 | default='', help='dir to store encoder models') # for loading
25 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
26 | default='', help='dir to store decoder models') # for loading
27 | parser.add_option('--lm_fname', dest='lm_fname', metavar='FILE',
28 | default='', help='dir to store baseline models') # for loading
29 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
30 | default='nvil_test.log')
31 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
32 |
33 |     # optimizer & initializer
34 | parser.add_option('--initializer', dest='initializer',
35 | default='glorot', help='[glorot,constant,uniform,normal]')
36 | parser.add_option('--optimizer', dest='optimizer',
37 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
38 | # model params
39 | parser.add_option("--pretrain_word_dim",type="int", dest="pretrain_word_dim", default=300)
40 | parser.add_option('--word_dim', type='int', dest='word_dim', default=80)
41 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=80)
42 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
43 | parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=64)
44 | parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=64)
45 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
46 | # training options
47 | # parser.add_option('--decay_every', type='int', dest='decay_every', default=5000)
48 | parser.add_option('--epochs', type='int', dest='epochs', default=50)
49 | parser.add_option('--print_every', type='int', dest='print_every', default=200)
50 | parser.add_option('--save_every', type='int', dest='save_every', default=1000)
51 | # optimization misc
52 | parser.add_option('--lr', type='float', dest='lr', default=0.01) # 0.01
53 |
54 | # parser.add_option('--lrdecay', type='float', dest='decay', default=0.75)
55 | parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.5)
56 | parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.5)
57 | # parser.add_option('--enc_reg', type='float', dest='enc_reg', default=0.1)
58 | # parser.add_option('--dec_reg', type='float', dest='dec_reg', default=1.0)
59 | # parser.add_option('--enc_update', type='float', dest='enc_update', default=1.0)
60 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
61 | # limit length options
62 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
63 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
64 | # test options
65 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
66 |
67 | # for lm
68 | parser.add_option("--lm_pretrain", action='store_true', dest="lm_pretrain", default=False)
69 | parser.add_option("--tie_weights", action='store_true', dest="tie_weights", default=False)
70 | parser.add_option('--lm_word_dim', type='int', dest='lm_word_dim', default=100)
71 | parser.add_option('--lm_pos_dim', type='int', dest='lm_pos_dim', default=0) # 50
72 | parser.add_option('--lm_lstm_dim', type='int', dest='lm_lstm_dim', default=100)
73 | parser.add_option('--lm_nlayers', type='int', dest='lm_nlayers', default=2)
74 | parser.add_option('--lm_dropout', type='float', dest='lm_dropout', default=0.2)
75 | parser.add_option('--clip', type='float', dest='clip', default=0.25)
76 |
77 | # for pr
78 | parser.add_option('--pr_initializer', dest='pr_initializer',
79 | default='normal', help='[glorot,constant,uniform,normal]')
80 | parser.add_option('--pr_optimizer', dest='pr_optimizer',
81 | default='sgd', help='[sgd,momentum,adam,adadelta,adagrad]')
82 | parser.add_option('--epsilon', type='float', dest='epsilon', default=0.1)
83 | parser.add_option('--pr_fname', dest='pr_fname', metavar='FILE',
84 | default='', help='')
85 | parser.add_option('--mc_samples', type='int', dest='mc_samples', default=20)
86 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
87 | parser.add_option('--nvil_batchsize', type='int', dest='nvil_batchsize', default=1)
88 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
89 | (options, args) = parser.parse_args()
90 | s = Session(options)
91 | s.nvil_pr_pretrain()
--------------------------------------------------------------------------------
/vi/nvil_pre_ud.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--cluster', dest='cluster', action='store_true',
16 | default=False)
17 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
18 | default='', help='dir pretrained word cluster files')
19 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
20 | default='', help='dir for caching pretrained word vectors')
21 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
22 | default='', help='dir to store results')
23 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
24 | default='', help='dir to store encoder models') # for loading
25 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
26 | default='', help='dir to store decoder models') # for loading
27 | parser.add_option('--lm_fname', dest='lm_fname', metavar='FILE',
28 | default='', help='dir to store baseline models') # for loading
29 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
30 | default='nvil_test.log')
31 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
32 |
33 |     # optimizer & initializer
34 | parser.add_option('--initializer', dest='initializer',
35 | default='glorot', help='[glorot,constant,uniform,normal]')
36 | parser.add_option('--optimizer', dest='optimizer',
37 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
38 | # model params
39 | parser.add_option("--pretrain_word_dim",type="int", dest="pretrain_word_dim", default=300)
40 | parser.add_option('--word_dim', type='int', dest='word_dim', default=80)
41 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=80)
42 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
43 | parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=64)
44 | parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=64)
45 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
46 | # training options
47 | parser.add_option('--epochs', type='int', dest='epochs', default=10)
48 | parser.add_option('--print_every', type='int', dest='print_every', default=10)
49 | parser.add_option('--save_every', type='int', dest='save_every', default=10)
50 | # optimization misc
51 | parser.add_option('--lr', type='float', dest='lr', default=0.01) # 0.01
52 | parser.add_option('--clip', type='float', dest='clip', default=0.5)
53 |
54 | parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.5)
55 | parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.5)
56 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
57 | # limit length options
58 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
59 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
60 | # test options
61 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
62 |
63 | # for pr
64 | parser.add_option('--pr_initializer', dest='pr_initializer',
65 | default='normal', help='[glorot,constant,uniform,normal]')
66 | parser.add_option('--pr_optimizer', dest='pr_optimizer',
67 | default='sgd', help='[sgd,momentum,adam,adadelta,adagrad]')
68 | parser.add_option('--epsilon', type='float', dest='epsilon', default=0.1)
69 | parser.add_option('--pr_fname', dest='pr_fname', metavar='FILE',
70 | default='', help='')
71 | parser.add_option('--mc_samples', type='int', dest='mc_samples', default=20)
72 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
73 | parser.add_option('--nvil_batchsize', type='int', dest='nvil_batchsize', default=1)
74 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
75 | parser.add_option('--seed', type='int', dest='seed', default=1000)
76 |
77 | parser.add_option('--language', dest='language', metavar='FILE',
78 | default='', help='')
79 | (options, args) = parser.parse_args()
80 | s = Session(options)
81 | s.nvil_pr_pretrain_ud()
--------------------------------------------------------------------------------
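These flags are normally filled in by ud_scripts/ud_pre.sh; a hand-written invocation would look roughly as follows, with every value illustrative rather than the tuned configuration, paths following the data layout above, and the working directory / PYTHONPATH set up as in the shipped scripts:

    python vi/nvil_pre_ud.py \
        --data_path ./data/ud \
        --train_fname fr_train_enhanced --valid_fname fr_valid_enhanced --test_fname fr_test_enhanced \
        --cluster --cluster_fname ./data/cluster/fr_cluster \
        --pr_fname ./data/pr_rules/ud_c/fr_0.5.txt \
        --language fr --train_max_length 10 --gpu_id 0 \
        --result_dir ./results --save_model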
/vi/rl_ft.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--cluster', dest='cluster', action='store_true',
16 | default=False)
17 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
18 | default='', help='dir pretrained word cluster files')
19 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
20 | default='', help='dir for caching pretrained word vectors')
21 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
22 | default='', help='dir to store results')
23 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
24 | default='', help='dir to store encoder models') # for loading
25 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
26 | default='', help='dir to store decoder models') # for loading
27 | parser.add_option('--lm_fname', dest='lm_fname', metavar='FILE',
28 | default='', help='dir to store baseline models') # for loading
29 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
30 | default='nvil_test.log')
31 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
32 |
33 |     # optimizer & initializer
34 | parser.add_option('--initializer', dest='initializer',
35 | default='glorot', help='[glorot,constant,uniform,normal]')
36 | parser.add_option('--optimizer', dest='optimizer',
37 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
38 | # model params
39 | parser.add_option("--pretrain_word_dim",type="int", dest="pretrain_word_dim", default=300)
40 | parser.add_option('--word_dim', type='int', dest='word_dim', default=80)
41 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=80)
42 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
43 | parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=64)
44 | parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=64)
45 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
46 | # training options
47 | # parser.add_option('--decay_every', type='int', dest='decay_every', default=5000)
48 | parser.add_option('--epochs', type='int', dest='epochs', default=10)
49 | parser.add_option('--print_every', type='int', dest='print_every', default=10)
50 | parser.add_option('--save_every', type='int', dest='save_every', default=10)
51 | # optimization misc
52 | parser.add_option('--lr', type='float', dest='lr', default=0.005) # 0.01
53 |
54 | # parser.add_option('--lrdecay', type='float', dest='decay', default=0.75)
55 | parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.5)
56 | parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.5)
57 | # parser.add_option('--enc_reg', type='float', dest='enc_reg', default=0.1)
58 | # parser.add_option('--dec_reg', type='float', dest='dec_reg', default=1.0)
59 | # parser.add_option('--enc_update', type='float', dest='enc_update', default=1.0)
60 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
61 | # limit length options
62 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
63 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
64 | # test options
65 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
66 |
67 | # for lm
68 | parser.add_option("--lm_pretrain", action='store_true', dest="lm_pretrain", default=False)
69 | parser.add_option("--tie_weights", action='store_true', dest="tie_weights", default=False)
70 | parser.add_option('--lm_word_dim', type='int', dest='lm_word_dim', default=100)
71 | parser.add_option('--lm_pos_dim', type='int', dest='lm_pos_dim', default=0) # 50
72 | parser.add_option('--lm_lstm_dim', type='int', dest='lm_lstm_dim', default=100)
73 | parser.add_option('--lm_nlayers', type='int', dest='lm_nlayers', default=2)
74 | parser.add_option('--lm_dropout', type='float', dest='lm_dropout', default=0.5)
75 | parser.add_option('--clip', type='float', dest='clip', default=0.05)
76 |
77 | # for pr
78 | parser.add_option('--pr_initializer', dest='pr_initializer',
79 | default='normal', help='[glorot,constant,uniform,normal]')
80 | parser.add_option('--pr_optimizer', dest='pr_optimizer',
81 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
82 | parser.add_option('--epsilon', type='float', dest='epsilon', default=0.1)
83 | parser.add_option('--pr_fname', dest='pr_fname', metavar='FILE',
84 | default='', help='')
85 | parser.add_option('--mc_samples', type='int', dest='mc_samples', default=20)
86 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
87 | parser.add_option('--nvil_batchsize', type='int', dest='nvil_batchsize', default=1)
88 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
89 | (options, args) = parser.parse_args()
90 | s = Session(options)
91 | s.rl_pr_ft()
--------------------------------------------------------------------------------
/vi/test.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--cluster', dest='cluster', action='store_true',
16 | default=False)
17 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
18 | default='', help='dir pretrained word cluster files')
19 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
20 | default='', help='dir for caching pretrained word vectors')
21 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
22 | default='', help='dir to store results')
23 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
24 | default='', help='dir to store encoder models') # for loading
25 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
26 | default='', help='dir to store decoder models') # for loading
27 | parser.add_option('--lm_fname', dest='lm_fname', metavar='FILE',
28 | default='', help='dir to store baseline models') # for loading
29 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
30 | default='nvil_test.log')
31 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
32 |
33 |     # optimizer & initializer
34 | parser.add_option('--initializer', dest='initializer',
35 | default='glorot', help='[glorot,constant,uniform,normal]')
36 | parser.add_option('--optimizer', dest='optimizer',
37 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
38 | # model params
39 | parser.add_option("--pretrain_word_dim",type="int", dest="pretrain_word_dim", default=300)
40 | parser.add_option('--word_dim', type='int', dest='word_dim', default=80)
41 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=80)
42 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
43 | parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=64)
44 | parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=64)
45 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
46 | # training options
47 | # parser.add_option('--decay_every', type='int', dest='decay_every', default=5000)
48 | parser.add_option('--epochs', type='int', dest='epochs', default=50)
49 | parser.add_option('--print_every', type='int', dest='print_every', default=200)
50 | parser.add_option('--save_every', type='int', dest='save_every', default=1000)
51 | # optimization misc
52 | parser.add_option('--lr', type='float', dest='lr', default=0.01) # 0.01
53 |
54 | # parser.add_option('--lrdecay', type='float', dest='decay', default=0.75)
55 | parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.5)
56 | parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.5)
57 | # parser.add_option('--enc_reg', type='float', dest='enc_reg', default=0.1)
58 | # parser.add_option('--dec_reg', type='float', dest='dec_reg', default=1.0)
59 | # parser.add_option('--enc_update', type='float', dest='enc_update', default=1.0)
60 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
61 | # limit length options
62 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
63 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
64 | # test options
65 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
66 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
67 | parser.add_option('--nsample', type='int', dest='nsample', default=1)
68 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
69 | parser.add_option('--seed', type='int', dest='seed', default=1000)
70 | (options, args) = parser.parse_args()
71 | s = Session(options)
72 | s.test()
--------------------------------------------------------------------------------
/vi/test_ud.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--cluster', dest='cluster', action='store_true',
16 | default=False)
17 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
18 | default='', help='dir pretrained word cluster files')
19 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
20 | default='', help='dir for caching pretrained word vectors')
21 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
22 | default='', help='dir to store results')
23 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
24 | default='', help='dir to store encoder models') # for loading
25 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
26 | default='', help='dir to store decoder models') # for loading
27 | parser.add_option('--lm_fname', dest='lm_fname', metavar='FILE',
28 | default='', help='dir to store baseline models') # for loading
29 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
30 | default='nvil_test.log')
31 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
32 |
33 |     # optimizer & initializer
34 | parser.add_option('--initializer', dest='initializer',
35 | default='glorot', help='[glorot,constant,uniform,normal]')
36 | parser.add_option('--optimizer', dest='optimizer',
37 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
38 | # model params
39 | parser.add_option("--pretrain_word_dim",type="int", dest="pretrain_word_dim", default=300)
40 | parser.add_option('--word_dim', type='int', dest='word_dim', default=80)
41 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=80)
42 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
43 | parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=64)
44 | parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=64)
45 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
46 | # training options
47 | # parser.add_option('--decay_every', type='int', dest='decay_every', default=5000)
48 | parser.add_option('--epochs', type='int', dest='epochs', default=50)
49 | parser.add_option('--print_every', type='int', dest='print_every', default=200)
50 | parser.add_option('--save_every', type='int', dest='save_every', default=1000)
51 | # optimization misc
52 | parser.add_option('--lr', type='float', dest='lr', default=0.01) # 0.01
53 |
54 | parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.5)
55 | parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.5)
56 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
57 | # limit length options
58 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
59 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
60 | # test options
61 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
62 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
63 | parser.add_option('--nsample', type='int', dest='nsample', default=1)
64 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
65 | parser.add_option('--seed', type='int', dest='seed', default=1000)
66 | parser.add_option('--language', dest='language', metavar='FILE',
67 | default='', help='')
68 | (options, args) = parser.parse_args()
69 | s = Session(options)
70 | s.test_ud()
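71 | # Typically invoked via ud_scripts/ud_test.sh, which supplies the data paths, the trained
72 | # encoder checkpoint and the language code.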
--------------------------------------------------------------------------------
/vi/train_decoder.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--cluster', dest='cluster', action='store_true',
16 | default=False)
17 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
18 |                       default='', help='dir of pretrained word cluster files')
19 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
20 | default='', help='dir for caching pretrained word vectors')
21 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
22 | default='', help='dir to store results')
23 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
24 | default='', help='dir to store encoder models') # for loading
25 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
26 | default='', help='dir to store decoder models') # for loading
27 | parser.add_option('--baseline_fname', dest='baseline_fname', metavar='FILE',
28 | default='', help='dir to store baseline models') # for loading
29 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
30 | default='pre_de0.log')
31 |
32 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
33 |
34 |     # optimizer & initializer
35 | parser.add_option('--initializer', dest='initializer',
36 | default='glorot', help='[glorot,constant,uniform,normal]')
37 | parser.add_option('--optimizer', dest='optimizer',
38 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
39 | # model params
40 |     parser.add_option('--pretrain_word_dim', type='int', dest='pretrain_word_dim', default=300)
41 | parser.add_option('--word_dim', type='int', dest='word_dim', default=50)
42 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=50)
43 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
44 | # parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=32)
45 | parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=32)
46 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
47 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
48 | # training options
49 | parser.add_option('--decay_every', type='int', dest='decay_every', default=5000)
50 | parser.add_option('--epochs', type='int', dest='epochs', default=50)
51 | parser.add_option('--print_every', type='int', dest='print_every', default=200)
52 | # parser.add_option('--save_every', type='int', dest='save_every', default=1000)
53 | # optimization misc
54 | parser.add_option('--lr', type='float', dest='lr', default=0.005) # 1e-3
55 | # parser.add_option('--lrdecay', type='float', dest='decay', default=0.75)
56 | # parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.3)
57 | parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.3)
58 | # parser.add_option('--enc_reg', type='float', dest='enc_reg', default=0.1)
59 | # parser.add_option('--dec_reg', type='float', dest='dec_reg', default=1.0)
60 | # parser.add_option('--enc_update', type='float', dest='enc_update', default=1.0)
61 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
62 | # limit length options
63 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
64 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
65 | # test options
66 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
67 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
68 | parser.add_option('--seed', type='int', dest='seed', default=2)
69 | parser.add_option('--language', dest='language', metavar='FILE', default='', help='')
70 |
71 | (options, args) = parser.parse_args()
72 | s = Session(options)
73 | s.train_decoder()
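74 | # Typically invoked via ud_scripts/ud_train_decoder.sh.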
--------------------------------------------------------------------------------
/vi/train_encoder.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 | from vi_syntax.vi.myio.IO import VIDataset
4 |
5 |
6 | if __name__ == '__main__':
7 | parser = OptionParser()
8 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
9 | default='', help='')
10 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
11 | default='', help='')
12 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
13 | default='', help='')
14 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
15 | default='', help='')
16 | parser.add_option('--cluster', dest='cluster', action='store_true',
17 | default=False)
18 | parser.add_option('--cluster_fname', dest='cluster_fname', metavar='FILE',
19 |                       default='', help='dir of pretrained word cluster files')
20 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
21 | default='', help='dir for caching pretrained word vectors')
22 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
23 | default='', help='dir to store results')
24 | parser.add_option('--encoder_fname', dest='encoder_fname', metavar='FILE',
25 | default='', help='dir to store encoder models') # for loading
26 | parser.add_option('--decoder_fname', dest='decoder_fname', metavar='FILE',
27 | default='', help='dir to store decoder models') # for loading
28 | parser.add_option('--baseline_fname', dest='baseline_fname', metavar='FILE',
29 | default='', help='dir to store baseline models') # for loading
30 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
31 | default='pre_de0.log')
32 |
33 | parser.add_option('--save_model', action='store_true', dest='save_model', default=False)
34 |
35 |     # optimizer & initializer
36 | parser.add_option('--initializer', dest='initializer',
37 | default='glorot', help='[glorot,constant,uniform,normal]')
38 | parser.add_option('--optimizer', dest='optimizer',
39 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
40 | # model params
41 |     parser.add_option('--pretrain_word_dim', type='int', dest='pretrain_word_dim', default=300)
42 | parser.add_option('--word_dim', type='int', dest='word_dim', default=50)
43 | parser.add_option('--pos_dim', type='int', dest='pos_dim', default=50)
44 | parser.add_option('--action_dim', type='int', dest='action_dim', default=0) # 32
45 | parser.add_option('--enc_lstm_dim', type='int', dest='enc_lstm_dim', default=32)
46 | # parser.add_option('--dec_lstm_dim', type='int', dest='dec_lstm_dim', default=32)
47 | parser.add_option('--nlayers', type='int', dest='nlayers', default=1)
48 | parser.add_option('--batchsize', type='int', dest='batchsize', default=1)
49 | # training options
50 | parser.add_option('--decay_every', type='int', dest='decay_every', default=5000)
51 | parser.add_option('--epochs', type='int', dest='epochs', default=50)
52 | parser.add_option('--print_every', type='int', dest='print_every', default=200)
53 | # parser.add_option('--save_every', type='int', dest='save_every', default=1000)
54 | # optimization misc
55 | parser.add_option('--lr', type='float', dest='lr', default=0.005) # 1e-3
56 | # parser.add_option('--lrdecay', type='float', dest='decay', default=0.75)
57 | parser.add_option('--enc_dropout', type='float', dest='enc_dropout', default=0.3)
58 | # parser.add_option('--dec_dropout', type='float', dest='dec_dropout', default=0.5)
59 | # parser.add_option('--enc_reg', type='float', dest='enc_reg', default=0.1)
60 | # parser.add_option('--dec_reg', type='float', dest='dec_reg', default=1.0)
61 | # parser.add_option('--enc_update', type='float', dest='enc_update', default=1.0)
62 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
63 | # limit length options
64 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=10)
65 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
66 | # test options
67 | parser.add_option('--output_tree', action='store_true', dest='output_tree', default=False)
68 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
69 | parser.add_option('--seed', type='int', dest='seed', default=2)
70 | parser.add_option('--language', dest='language', metavar='FILE', default='', help='')
71 |
72 |
73 | (options, args) = parser.parse_args()
74 | s = Session(options)
75 | s.train_encoder()
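76 | # Typically invoked via ud_scripts/ud_train_encoder.sh.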
--------------------------------------------------------------------------------
/vi/train_lm.py:
--------------------------------------------------------------------------------
1 | from optparse import OptionParser
2 | from vi_syntax.vi.Session import Session
3 |
4 |
5 | if __name__ == '__main__':
6 | parser = OptionParser()
7 | parser.add_option('--data_path', dest='data_path', metavar='FILE',
8 | default='', help='')
9 | parser.add_option('--train_fname', dest='train_fname', metavar='FILE',
10 | default='', help='')
11 | parser.add_option('--valid_fname', dest='valid_fname', metavar='FILE',
12 | default='', help='')
13 | parser.add_option('--test_fname', dest='test_fname', metavar='FILE',
14 | default='', help='')
15 | parser.add_option('--word_vector_cache', dest='word_vector_cache', metavar='FILE',
16 | default='', help='dir for caching pretrained word vectors')
17 | parser.add_option('--result_dir', dest='result_dir', metavar='FILE',
18 | default='', help='dir to store results')
19 | parser.add_option('--log_name', dest='log_name', metavar='FILE',
20 | default='pre_lm0.log')
21 | parser.add_option('--save_model', action='store_true', dest='save_model',
22 | default=False)
23 |
24 |     # optimizer & initializer
25 | parser.add_option('--initializer', dest='initializer',
26 | default='normal', help='[glorot,constant,uniform,normal]')
27 | parser.add_option('--optimizer', dest='optimizer',
28 | default='adagrad', help='[sgd,momentum,adam,adadelta,adagrad]')
29 | # model params
30 |     parser.add_option('--lm_pretrain', action='store_true', dest='lm_pretrain', default=False)
31 |     parser.add_option('--tie_weights', action='store_true', dest='tie_weights', default=False)
32 | parser.add_option('--lm_word_dim', type='int', dest='lm_word_dim', default=100)
33 | parser.add_option('--lm_pos_dim', type='int', dest='lm_pos_dim', default=0) # 50
34 | parser.add_option('--lm_lstm_dim', type='int', dest='lm_lstm_dim', default=100)
35 | parser.add_option('--lm_nlayers', type='int', dest='lm_nlayers', default=2)
36 | parser.add_option('--batchsize', type='int', dest='batchsize', default=64)
37 | # training options
38 | parser.add_option('--decay_every', type='int', dest='decay_every', default=5000)
39 | parser.add_option('--epochs', type='int', dest='epochs', default=50)
40 | parser.add_option('--print_every', type='int', dest='print_every', default=20)
41 | # optimization misc
42 | parser.add_option('--lr', type='float', dest='lr', default=0.01) # 1e-3
43 | # parser.add_option('--lrdecay', type='float', dest='decay', default=0.75)
44 | parser.add_option('--lm_dropout', type='float', dest='lm_dropout', default=0.2)
45 | parser.add_option('--l2_reg', type='float', dest='l2_reg', default=1e-4)
46 | parser.add_option('--clip', type='float', dest='clip', default=0.25)
47 | # limit length options
48 | parser.add_option('--train_max_length', type='int', dest='train_max_length', default=20)
49 | parser.add_option('--min_freq', type='int', dest='min_freq', default=2)
50 | parser.add_option('--gpu_id', type='int', dest='gpu_id', default=-1)
51 |
52 | (options, args) = parser.parse_args()
53 | s = Session(options)
54 | s.train_lm()
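55 | # Trains a standalone RNN language model; the saved checkpoint is presumably what the
56 | # --lm_fname / --baseline_fname options of the other entry points load as the baseline model.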
--------------------------------------------------------------------------------
/vi/ud_scripts/ud_ft.sh:
--------------------------------------------------------------------------------
1 | CODE_ROOT="PATH_TO_THIS_CODE_REPO/"
2 | RESULT_DIR="PATH_TO_RESULT_DIR/"
3 | WORD_VEC_DIR="PATH_TO_WORD_VECTOR_DIR/"
4 | LANGUAGE="fr"
5 | ENC_CKPT="PATH_TO_ENCODER_CHECKPOINT"
6 | DEC_CKPT="PATH_TO_DECODER_CHECKPOINT"
7 | source /PATH_TO_PYTHON_ENV/activate &&
8 | OMP_NUM_THREADS=1 python ../nvil_ft_ud.py \
9 | --data_path $CODE_ROOT"data/ud" \
10 | --train_fname $LANGUAGE"_train_enhanced" \
11 | --valid_fname $LANGUAGE"_valid_enhanced" \
12 | --test_fname $LANGUAGE"_test_enhanced" \
13 | --cluster \
14 | --cluster_fname $CODE_ROOT"data/cluster/"$LANGUAGE"_cluster" \
15 | --word_vector_cache $WORD_VEC_DIR \
16 | --result_dir $RESULT_DIR \
17 | --log_name $LANGUAGE"_LOG_NAME.log" \
18 | --encoder_fname $RESULT_DIR$ENC_CKPT \
19 | --decoder_fname $RESULT_DIR$DEC_CKPT \
20 | --pr_fname $CODE_ROOT"data/pr_rules/ud_c/"$LANGUAGE"_10_gt.txt" \
21 | --word_dim 0 \
22 | --language $LANGUAGE
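23 | # ENC_CKPT / DEC_CKPT are assumed to be encoder and decoder checkpoints saved under
24 | # $RESULT_DIR by an earlier run (e.g. pretraining via ud_pre.sh).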
--------------------------------------------------------------------------------
/vi/ud_scripts/ud_pre.sh:
--------------------------------------------------------------------------------
1 | CODE_ROOT="PATH_TO_THIS_CODE_REPO/"
2 | RESULT_DIR="PATH_TO_RESULT_DIR/"
3 | WORD_VEC_DIR="PATH_TO_WORD_VECTOR_DIR/"
4 | LANGUAGE="fr"
5 | source /PATH_TO_PYTHON_ENV/activate &&
6 | python ../nvil_pre_ud.py \
7 | --data_path $CODE_ROOT"data/ud" \
8 | --train_fname $LANGUAGE"_train_enhanced" \
9 | --valid_fname $LANGUAGE"_valid_enhanced" \
10 | --test_fname $LANGUAGE"_test_enhanced" \
11 | --cluster \
12 | --cluster_fname $CODE_ROOT"data/cluster/"$LANGUAGE"_cluster" \
13 | --word_vector_cache $WORD_VEC_DIR \
14 | --result_dir $RESULT_DIR \
15 | --log_name $LANGUAGE"_LOG_NAME.log" \
16 | --pr_fname $CODE_ROOT"data/pr_rules/ud_c/"$LANGUAGE"_10_gt.txt" \
17 | --word_dim 0 \
18 | --language $LANGUAGE \
19 | --gpu_id -1 \
20 | --seed -1
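21 | # Assumption: --gpu_id -1 runs on CPU; set it to a device index (as in
22 | # ud_train_encoder.sh / ud_train_decoder.sh) to use a GPU.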
--------------------------------------------------------------------------------
/vi/ud_scripts/ud_test.sh:
--------------------------------------------------------------------------------
1 | CODE_ROOT="PATH_TO_THIS_CODE_REPO/"
2 | RESULT_DIR="PATH_TO_RESULT_DIR/"
3 | WORD_VEC_DIR="PATH_TO_WORD_VECTOR_DIR/"
4 | LANGUAGE='fr'
5 | ENC_CKPT="PATH_TO_ENCODER_CHECKPOINT"
6 | source /PATH_TO_PYTHON_ENV/activate &&
7 | python ../test_ud.py \
8 | --data_path $CODE_ROOT"data/ud" \
9 | --train_fname $LANGUAGE"_train_enhanced" \
10 | --valid_fname $LANGUAGE"_valid_enhanced" \
11 | --test_fname $LANGUAGE"_test_enhanced" \
12 | --cluster \
13 | --cluster_fname $CODE_ROOT"data/cluster/"$LANGUAGE"_cluster" \
14 | --word_vector_cache $WORD_VEC_DIR \
15 | --result_dir $RESULT_DIR \
16 | --encoder_fname $RESULT_DIR$ENC_CKPT \
17 | --log_name $LANGUAGE"_LOG_NAME.log" \
18 | --word_dim 0 \
19 | --language $LANGUAGE \
20 | --gpu_id -1 \
21 | --seed -1
22 |
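23 | # ENC_CKPT should point to the encoder checkpoint to be evaluated; passing --output_tree
24 | # to test_ud.py presumably also writes out the predicted parse trees.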
--------------------------------------------------------------------------------
/vi/ud_scripts/ud_train_decoder.sh:
--------------------------------------------------------------------------------
1 | CODE_ROOT="PATH_TO_THIS_CODE_REPO/"
2 | RESULT_DIR="PATH_TO_RESULT_DIR/"
3 | WORD_VEC_DIR="PATH_TO_WORD_VECTOR_DIR/"
4 | LANGUAGE="fr"
5 | source /PATH_TO_PYTHON_ENV/activate &&
6 | CUDA_VISIBLE_DEVICES=0 python ../train_decoder.py \
7 | --data_path $CODE_ROOT"data/ud" \
8 | --train_fname $LANGUAGE"_train_enhanced" \
9 | --valid_fname $LANGUAGE"_valid_enhanced" \
10 | --test_fname $LANGUAGE"_test_enhanced" \
11 | --cluster \
12 | --cluster_fname $CODE_ROOT"data/cluster/"$LANGUAGE"_cluster" \
13 | --word_vector_cache $WORD_VEC_DIR \
14 | --result_dir $RESULT_DIR \
15 | --log_name $LANGUAGE"_LOG_NAME.log" \
16 | --language $LANGUAGE \
17 | --seed -1 \
18 | --gpu_id 0
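19 | # The resulting decoder checkpoint is assumed to be what DEC_CKPT in ud_ft.sh points to.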
--------------------------------------------------------------------------------
/vi/ud_scripts/ud_train_encoder.sh:
--------------------------------------------------------------------------------
1 | CODE_ROOT="PATH_TO_THIS_CODE_REPO/"
2 | RESULT_DIR="PATH_TO_RESULT_DIR/"
3 | WORD_VEC_DIR="PATH_TO_WORD_VECTOR_DIR/"
4 | LANGUAGE="fr"
5 | source /PATH_TO_PYTHON_ENV/activate &&
6 | CUDA_VISIBLE_DEVICES=0 python ../train_encoder.py \
7 | --data_path $CODE_ROOT"data/ud" \
8 | --train_fname $LANGUAGE"_train_enhanced" \
9 | --valid_fname $LANGUAGE"_valid_enhanced" \
10 | --test_fname $LANGUAGE"_test_enhanced" \
11 | --cluster \
12 | --cluster_fname $CODE_ROOT"data/cluster/"$LANGUAGE"_cluster" \
13 | --word_vector_cache $WORD_VEC_DIR \
14 | --result_dir $RESULT_DIR \
15 | --log_name $LANGUAGE"_LOG_NAME.log" \
16 | --language $LANGUAGE \
17 | --seed -1 \
18 | --gpu_id 0
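19 | # The resulting encoder checkpoint is assumed to be what ENC_CKPT in ud_ft.sh and ud_test.sh points to.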
--------------------------------------------------------------------------------