├── README.md
├── data.py
├── data
│   ├── data-goes-here.md
│   ├── outputs
│   │   └── results-go-here.md
│   ├── sample
│   │   ├── test.txt
│   │   ├── train.txt
│   │   └── valid.txt
│   ├── test.txt
│   ├── train.txt
│   └── valid.txt
├── main.py
├── model.py
├── remote.sh
├── run.sh
├── run_sample.sh
├── show_result.py
└── util.py

/README.md:
--------------------------------------------------------------------------------
# nlp_multi_task_learning_pytorch

A basic multi-task learning architecture for Natural Language Processing, implemented in PyTorch.

The two tasks we use here are POS Tagging and Chunking.

As shown below, the model stacks several RNN layers on top of an embedding layer. Each task is then supervised at a different layer, according to its complexity. For example, we treat Chunking as a higher-level task than POS Tagging, inspired by *[Deep multi-task learning with low level tasks supervised at lower layers](http://anthology.aclweb.org/P16-2038)*. A minimal construction sketch is given at the end of this README.

![img](https://ws3.sinaimg.cn/large/006tNbRwgy1fuchyzqmynj30ik0aogm6.jpg)

## Running Examples

You can check several running examples in `run.sh`. I will explain one here.

```
echo "Joint Training on the different level"
python main.py --data './data' \    # directory containing the training data
    --emsize 256 \                  # embedding size
    --npos_layers 1 \               # number of layers for the POS tagging task
    --nchunk_layers 2 \             # number of layers for the chunking task
    --nhid 128 \                    # number of RNN hidden units
    --batch_size 128 \
    --seq_len 10 \                  # sequence length
    --cuda \                        # enable GPU
    --train_mode 'Joint' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/joint_diff'
```

## Done

- A basic architecture for POS tagging and chunking
- Explored the best hyperparameters for the NN

## Todo

- [ ] Chunking exploration
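## Model Sketch

In code, the joint setup described above corresponds roughly to the following (a minimal sketch; the embedding and hidden sizes are taken from the example run above, and the tag counts come from the corpus dictionaries):

```
from data import Corpus
from model import JointModel

corpus = Corpus('./data')       # builds the word, POS, and chunk dictionaries

# POS tagging supervised at layer 1, chunking at layer 2, as in the figure
model = JointModel(corpus.word_dict.nwords, 256, 128,
                   corpus.pos_dict.nwords, 1,     # POS: number of tags, layer
                   corpus.chunk_dict.nwords, 2,   # Chunk: number of tags, layer
                   dropout=0.2, rnn_type='LSTM', train_mode='Joint')
```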
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
import os
import torch


class Dictionary(object):
    def __init__(self, name):
        self.name = name
        self.word2idx = {}
        self.idx2word = []
        self.nwords = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.nwords
            self.idx2word.append(word)
            self.nwords += 1

    def __str__(self):
        return "%s dictionary has %d kinds of tokens." \
               % (self.name, self.nwords)


class Corpus(object):
    def __init__(self, path):
        self.word_dict = Dictionary('Word')
        self.pos_dict = Dictionary('POS')
        self.chunk_dict = Dictionary('Chunk')

        self.word_train, self.pos_train, self.chunk_train = self.tokenize(os.path.join(path, 'train.txt'))
        self.word_valid, self.pos_valid, self.chunk_valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.word_test, self.pos_test, self.chunk_test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text data file."""
        assert os.path.exists(path)
        # First pass: build the dictionaries from the corpus
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                try:
                    word, pos, chunk = line.strip().split()
                except ValueError:
                    # Skip blank lines and malformed rows
                    continue
                tokens += 1
                self.word_dict.add_word(word)
                self.pos_dict.add_word(pos)
                self.chunk_dict.add_word(chunk)

        # Second pass: encode every token as an index into its dictionary
        with open(path, 'r') as f:
            word_ids = torch.LongTensor(tokens)
            pos_ids = torch.LongTensor(tokens)
            chunk_ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                try:
                    word, pos, chunk = line.strip().split()
                except ValueError:
                    continue
                word_ids[token] = self.word_dict.word2idx[word]
                pos_ids[token] = self.pos_dict.word2idx[pos]
                chunk_ids[token] = self.chunk_dict.word2idx[chunk]
                token += 1

        return word_ids, pos_ids, chunk_ids
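
# A sketch of how this module is used (paths assume the bundled sample data):
#   corpus = Corpus('./data/sample')
#   print(corpus.word_dict)       # "Word dictionary has N kinds of tokens."
# corpus.word_train, corpus.pos_train, and corpus.chunk_train are parallel
# 1-D LongTensors of token, POS-tag, and chunk-tag indices.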
--------------------------------------------------------------------------------
/data/data-goes-here.md:
--------------------------------------------------------------------------------
# Your Data Goes In This Folder
--------------------------------------------------------------------------------
/data/outputs/results-go-here.md:
--------------------------------------------------------------------------------
# Results Go In This Folder (Automatically)
--------------------------------------------------------------------------------
/data/sample/test.txt:
--------------------------------------------------------------------------------
Rockwell NNP B-NP
International NNP I-NP
Corp. NNP I-NP
's POS B-NP
Tulsa NNP I-NP
unit NN I-NP
said VBD B-VP
it PRP B-NP
signed VBD B-VP
a DT B-NP
tentative JJ I-NP
agreement NN I-NP
extending VBG B-VP
its PRP$ B-NP
contract NN I-NP
with IN B-PP
Boeing NNP B-NP
Co. NNP I-NP
to TO B-VP
provide VB I-VP
--------------------------------------------------------------------------------
/data/sample/train.txt:
--------------------------------------------------------------------------------
Confidence NN B-NP
in IN B-PP
the DT B-NP
pound NN I-NP
is VBZ B-VP
widely RB I-VP
expected VBN I-VP
to TO I-VP
take VB I-VP
another DT B-NP
sharp JJ I-NP
Dive NN I-NP
if IN B-SBAR
trade NN B-NP
figures NNS I-NP
for IN B-PP
September NNP B-NP
, , O
due JJ B-ADJP
for IN B-PP
release NN B-NP
tomorrow NN B-NP
, , O
fail VB B-VP
to TO I-VP
show VB I-VP
a DT B-NP
substantial JJ I-NP
improvement NN I-NP
from IN B-PP
July NNP B-NP
and CC I-NP
August NNP I-NP
's POS B-NP
near-record JJ I-NP
deficits NNS I-NP
. . O
Chancellor NNP O
of IN B-PP
the DT B-NP
Exchequer NNP I-NP
Nigel NNP B-NP
Lawson NNP I-NP
's POS B-NP
restated VBN I-NP
commitment NN I-NP
to TO B-PP
a DT B-NP
firm NN I-NP
monetary JJ I-NP
policy NN I-NP
has VBZ B-VP
helped VBN I-VP
to TO I-VP
prevent VB I-VP
a DT B-NP
freefall NN I-NP
in IN B-PP
sterling NN B-NP
over IN B-PP
the DT B-NP
past JJ I-NP
week NN I-NP
. . O
But CC O
analysts NNS B-NP
reckon VBP B-VP
underlying VBG B-NP
support NN I-NP
for IN B-PP
sterling NN B-NP
has VBZ B-VP
been VBN I-VP
eroded VBN I-VP
by IN B-PP
the DT B-NP
chancellor NN I-NP
's POS B-NP
failure NN I-NP
to TO B-VP
announce VB I-VP
any DT B-NP
new JJ I-NP
policy NN I-NP
measures NNS I-NP
in IN B-PP
his PRP$ B-NP
Mansion NNP I-NP
House NNP I-NP
speech NN I-NP
last JJ B-NP
Thursday NNP I-NP
. . O
This DT B-NP
has VBZ B-VP
increased VBN I-VP
the DT B-NP
risk NN I-NP
of IN B-PP
the DT B-NP
--------------------------------------------------------------------------------
/data/sample/valid.txt:
--------------------------------------------------------------------------------
a DT B-NP
trade NN I-NP
group NN I-NP
, , O
will MD B-VP
sell VB I-VP
access NN B-NP
to TO B-PP
the DT B-NP
package NN I-NP
to TO B-PP
its PRP$ B-NP
180 CD I-NP
airline NN I-NP
members NNS I-NP
world-wide JJ B-ADVP
. . O
Control NNP B-NP
Data NNP I-NP
will MD B-VP
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import argparse
import os
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim

from data import Corpus
from util import *
from model import *

###############################################################################
# Set Parameters
###############################################################################
parser = argparse.ArgumentParser(description='PyTorch NLP multi-task learning for POS tagging and Chunking.')
parser.add_argument('--data', type=str, default='./data',
                    help='location of the data directory')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--npos_layers', type=int, default=1,
                    help='number of POS tagging layers')
parser.add_argument('--nchunk_layers', type=int, default=1,
                    help='number of chunking layers')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units')
parser.add_argument('--lr', type=float, default=0.0001,
                    help='learning rate')
parser.add_argument('--clip', type=float, default=1,
                    help='gradient clipping threshold')
parser.add_argument('--epochs', type=int, default=40,
                    help='number of epochs')
parser.add_argument('--batch_size', type=int, default=32, metavar='N',
                    help='batch size')
parser.add_argument('--seq_len', type=int, default=15,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout rate')
parser.add_argument('--rnn_type', type=str, default='LSTM',
                    help='RNN cell type: one of LSTM, GRU, and Elman')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--bi', action='store_true',
                    help='use a bidirectional RNN')
parser.add_argument('--log_interval', type=int, default=200, metavar='N',
                    help='report interval')
parser.add_argument('--train_mode', type=str, default='Joint',
                    help="training mode of the model: 'POS', 'Chunk', or 'Joint'")
parser.add_argument('--test_times', type=int, default=1,
                    help='number of independent runs, for a more trustworthy result')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
args = parser.parse_args()


###############################################################################
# Load Data
###############################################################################
corpus_path = args.save.strip() + '_corpus.pt'
print('Loading corpus...')
if os.path.exists(corpus_path):
    corpus = torch.load(corpus_path)
else:
    corpus = Corpus(args.data)
    torch.save(corpus, corpus_path)

###############################################################################
# Training Functions
###############################################################################
def train(loss_log):
    if args.train_mode == 'Joint':
        target_data = (corpus.pos_train, corpus.chunk_train)
    elif args.train_mode == 'POS':
        target_data = (corpus.pos_train, )
    elif args.train_mode == 'Chunk':
        target_data = (corpus.chunk_train, )

    # Turn on training mode (re-enables dropout after evaluation)
    model.train()
    total_loss = 0
    start_time = time.time()
    n_iteration = corpus.word_train.size(0) // (args.batch_size*args.seq_len)
    iteration = 0
    for X, ys in get_batch(corpus.word_train, *target_data, batch_size=args.batch_size,
                           seq_len=args.seq_len, cuda=args.cuda):
        iteration += 1
        model.zero_grad()
        if args.train_mode == 'Joint':
            if args.npos_layers == args.nchunk_layers:
                hidden = model.rnn.init_hidden(args.batch_size)
                outputs1, outputs2, hidden = model(X, hidden)
            else:
                hidden1 = model.rnn1.init_hidden(args.batch_size)
                hidden2 = model.init_rnn2_hidden(args.batch_size)
                outputs1, outputs2, hidden1, hidden2 = model(X, hidden1, hidden2)
            loss1 = criterion(outputs1.view(-1, npos_tags), ys[0].view(-1))
            loss2 = criterion(outputs2.view(-1, nchunk_tags), ys[1].view(-1))
            loss = loss1 + loss2
        else:
            hidden = model.rnn.init_hidden(args.batch_size)
            outputs, hidden = model(X, hidden)
            loss = criterion(outputs.view(-1, ntags), ys[0].view(-1))

        loss.backward()

        # Prevent exploding gradients
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.data

        if iteration % args.log_interval == 0:
            cur_loss = total_loss / args.log_interval
            cur_loss = cur_loss.cpu().numpy()[0]
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} iteration | {:5.2f} ms/batch | loss {:5.2f} |'.format(
                epoch, iteration, n_iteration,
                elapsed*1000/args.log_interval,
                cur_loss))
            loss_log.append(cur_loss)
            total_loss = 0
            start_time = time.time()
    return loss_log

def evaluate(source, target):
    model.eval()
    n_iteration = source.size(0) // (args.batch_size*args.seq_len)
    total_loss = 0
    for X_val, y_vals in get_batch(source, *target, batch_size=args.batch_size,
                                   seq_len=args.seq_len, cuda=args.cuda, evalu=True):
        if args.train_mode == 'Joint':
            if args.npos_layers == args.nchunk_layers:
                hidden = model.rnn.init_hidden(args.batch_size)
                outputs1, outputs2, hidden = model(X_val, hidden)
            else:
                hidden1 = model.rnn1.init_hidden(args.batch_size)
                hidden2 = model.init_rnn2_hidden(args.batch_size)
                outputs1, outputs2, hidden1, hidden2 = model(X_val, hidden1, hidden2)
            loss1 = criterion(outputs1.view(-1, npos_tags), y_vals[0].view(-1))
            loss2 = criterion(outputs2.view(-1, nchunk_tags), y_vals[1].view(-1))
            loss = loss1 + loss2
            # Make predictions and calculate accuracy
            _, pred1 = outputs1.data.topk(1)
            _, pred2 = outputs2.data.topk(1)
            accuracy1 = torch.sum(pred1.squeeze(2) == y_vals[0].data) / (y_vals[0].size(0) * y_vals[0].size(1))
            accuracy2 = torch.sum(pred2.squeeze(2) == y_vals[1].data) / (y_vals[1].size(0) * y_vals[1].size(1))
            accuracy = (accuracy1, accuracy2)
        else:
            hidden = model.rnn.init_hidden(args.batch_size)
            outputs, hidden = model(X_val, hidden)
            loss = criterion(outputs.view(-1, ntags), y_vals[0].view(-1))
            _, pred = outputs.data.topk(1)
            accuracy = torch.sum(pred.squeeze(2) == y_vals[0].data) / (y_vals[0].size(0) * y_vals[0].size(1))
        total_loss += loss

    return total_loss/n_iteration, accuracy

best_val_accuracies = []
test_accuracies = []
best_epoches = []
patience = 25  # Stop early if the validation loss has not improved for this many epochs
for i in range(args.test_times):
    ###########################################################################
    # Build Model
    ###########################################################################
    nwords = corpus.word_dict.nwords
    npos_tags = corpus.pos_dict.nwords
    nchunk_tags = corpus.chunk_dict.nwords

    # Note: dropout and rnn_type are keyword-only in JointModel (they follow
    # *args), so they must be passed by keyword to take effect.
    if args.train_mode == 'Joint':
        model = JointModel(nwords, args.emsize, args.nhid, npos_tags, args.npos_layers,
                           nchunk_tags, args.nchunk_layers, dropout=args.dropout,
                           rnn_type=args.rnn_type, bi=args.bi,
                           train_mode=args.train_mode)
    else:
        if args.train_mode == 'POS':
            ntags = npos_tags
            nlayers = args.npos_layers
        elif args.train_mode == 'Chunk':
            ntags = nchunk_tags
            nlayers = args.nchunk_layers
        model = JointModel(nwords, args.emsize, args.nhid, ntags, nlayers,
                           dropout=args.dropout, rnn_type=args.rnn_type,
                           bi=args.bi, train_mode=args.train_mode)
    if args.cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    # Loop over epochs
    best_val_loss = None
    best_accuracy = None
    best_epoch = 0
    early_stop_count = 0
    loss_log = []
    # You can break out of training early with Ctrl+C
    try:
        for epoch in range(1, args.epochs+1):
            epoch_start_time = time.time()
            print('Begin training...')
            loss_log = train(loss_log)
            # Evaluation
            print('Evaluating on the validation data')
            if args.train_mode == 'Joint':
                valid_target_data = (corpus.pos_valid, corpus.chunk_valid)
            elif args.train_mode == 'POS':
                valid_target_data = (corpus.pos_valid, )
            elif args.train_mode == 'Chunk':
                valid_target_data = (corpus.chunk_valid, )

            val_loss, accuracy = evaluate(corpus.word_valid, valid_target_data)
            print('-'*50)
            if args.train_mode == 'Joint':
                print('| end of epoch {:3d} | valid loss {:5.3f} | POS accuracy {:5.3f} | Chunk accuracy {:5.3f} |'.format(
                    epoch, val_loss.data.cpu().numpy()[0], accuracy[0], accuracy[1]
                ))
            else:
                print('| end of epoch {:3d} | valid loss {:5.3f} | accuracy {:5.3f} |'.format(
                    epoch, val_loss.data.cpu().numpy()[0], accuracy
                ))
            if not best_val_loss or (val_loss.data[0] < best_val_loss):
                with open(args.save.strip() + '.pt', 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss.data[0]
                best_accuracy = accuracy
                best_epoch = epoch
                early_stop_count = 0
            else:
                early_stop_count += 1
                if early_stop_count >= patience:
                    print('\nEarly stopping: no improvement on the validation set for %d epochs.' % patience)
                    break
    except KeyboardInterrupt:
        print('-'*50)
        print('Exiting from training early.')

    ###########################################################################
    # Test Model
    ###########################################################################
    # Load the best saved model
    with open(args.save.strip() + '.pt', 'rb') as f:
        model = torch.load(f)

    if args.train_mode == 'Joint':
        test_target_data = (corpus.pos_test, corpus.chunk_test)
    elif args.train_mode == 'POS':
        test_target_data = (corpus.pos_test, )
    elif args.train_mode == 'Chunk':
        test_target_data = (corpus.chunk_test, )
    test_loss, test_accuracy = evaluate(corpus.word_test, test_target_data)
    print('='*50)
    print('Evaluating on the test data.')
    if args.train_mode == 'Joint':
        print('| end of epoch {:3d} | test loss {:5.3f} | POS test accuracy {:5.3f} | Chunk test accuracy {:5.3f} |'.format(
            epoch, test_loss.data.cpu().numpy()[0], test_accuracy[0], test_accuracy[1]
        ))
    else:
        print('| end of epoch {:3d} | test loss {:5.3f} | accuracy {:5.3f} |'.format(
            epoch, test_loss.data.cpu().numpy()[0], test_accuracy
        ))

    # Log accuracy
    best_val_accuracies.append(best_accuracy)
    test_accuracies.append(test_accuracy)
    best_epoches.append(best_epoch)


# Save results
results = {
    'corpus': corpus,
    'best_epoch': best_epoch,
    'best_val_loss': best_val_loss,
    'best_accuracy': best_accuracy,
    'test_accuracy': test_accuracy,
    'loss_log': loss_log,

    'best_val_accuracies': best_val_accuracies,
    'test_accuracies': test_accuracies,
    'best_epoches': best_epoches
}
torch.save(results, '%s_emsize%d_npos_layers%d_nchunk_layers%d_nhid%d_dropout%3.1f_seqlen%d_bi%d_%s_result.pt'
           % (args.save.strip(), args.emsize, args.npos_layers, args.nchunk_layers,
              args.nhid, args.dropout, args.seq_len, args.bi, args.rnn_type))
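
# For example, the './result/joint_diff' run from run.sh would produce a file
# named like this (hypothetical values shown for illustration):
#   ./result/joint_diff_emsize256_npos_layers1_nchunk_layers2_nhid128_dropout0.2_seqlen10_bi0_LSTM_result.pt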
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from torch.autograd import Variable


class EncoderModel(nn.Module):
    """RNN encoder with a transducer that makes a prediction at each time step."""

    def __init__(self, ntoken, emsize, nhid,
                 nlayers=1, dropout=0.2, rnn_type='LSTM', bi=False):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.embed = nn.Embedding(ntoken, emsize)
        self.rnn_type = rnn_type

        # Select the RNN cell type from LSTM, GRU, and Elman
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(emsize, nhid, nlayers, bidirectional=bi)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(emsize, nhid, nlayers, bidirectional=bi)
        else:
            self.rnn = nn.RNN(emsize, nhid, nlayers, bidirectional=bi)

        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers
        self.bi = bi

    def init_weights(self):
        init_range = 0.1
        self.embed.weight.data.uniform_(-init_range, init_range)

    def forward(self, input, hidden):
        embedded = self.drop(self.embed(input))
        self.rnn.flatten_parameters()
        output, hidden = self.rnn(embedded, hidden)
        output = self.drop(output)
        return output, hidden

    def init_hidden(self, batch_size):
        # NOTE: returns an (h, c) pair, which matches the LSTM cell type
        weight = next(self.parameters()).data
        return (Variable(weight.new(self.nlayers*(1+int(self.bi)),
                                    batch_size, self.nhid).zero_()),
                Variable(weight.new(self.nlayers*(1+int(self.bi)),
                                    batch_size, self.nhid).zero_()))


class LinearDecoder(nn.Module):
    """Linear decoder that maps the outputs of the RNN encoder to
    per-task predictions."""

    def __init__(self, nhid, ntags, bi=False):
        super().__init__()
        self.linear = nn.Linear(nhid*(1+int(bi)), ntags)
        self.init_weights()
        self.nin = nhid
        self.nout = ntags
        self.bi = bi

    def init_weights(self):
        init_range = 0.1
        self.linear.bias.data.fill_(0)
        self.linear.weight.data.uniform_(-init_range, init_range)

    def forward(self, input):
        logit = self.linear(input.view(input.size(0)*input.size(1), input.size(2)))
        return logit.view(input.size(0), input.size(1), logit.size(1))


class JointModel(nn.Module):
    """Joint model for training two tasks together.
    You can also select a single-task train mode to train one task alone.
    The positional *args specify, for each task, its output size and the
    layer it is supervised at: the number of tags first, and then the
    number of layers."""
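    # For example (hypothetical sizes, following the convention above),
    #     JointModel(nwords, 256, 128, npos_tags, 1, nchunk_tags, 2,
    #                dropout=0.2, rnn_type='LSTM', train_mode='Joint')
    # supervises POS tagging at layer 1 and chunking at layer 2.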
    def __init__(self, ntoken, emsize, nhid, *args,
                 dropout=0.2, rnn_type='LSTM', bi=False, train_mode='Joint'):
        super().__init__()
        self.ntoken = ntoken
        self.emsize = emsize
        self.nhid = nhid
        self.dropout = dropout
        self.rnn_type = rnn_type
        self.bi = bi
        self.train_mode = train_mode
        # Unpack the task arguments according to the train mode
        if train_mode == 'Joint':
            self.ntags1 = args[0]
            self.nlayers1 = args[1]
            self.ntags2 = args[2]
            self.nlayers2 = args[3]
            if self.nlayers1 == self.nlayers2:
                self.rnn = EncoderModel(ntoken, emsize, nhid, self.nlayers1,
                                        dropout, rnn_type, bi)
            else:
                # Lower layers
                self.rnn1 = EncoderModel(ntoken, emsize, nhid, self.nlayers1,
                                         dropout, rnn_type, bi)
                # Higher layers
                if rnn_type == 'LSTM':
                    self.rnn2 = nn.LSTM(nhid*(1+int(bi)), nhid,
                                        self.nlayers2 - self.nlayers1,
                                        bidirectional=bi)
                elif rnn_type == 'GRU':
                    self.rnn2 = nn.GRU(nhid*(1+int(bi)), nhid,
                                       self.nlayers2 - self.nlayers1,
                                       bidirectional=bi)
                else:
                    self.rnn2 = nn.RNN(nhid*(1+int(bi)), nhid,
                                       self.nlayers2 - self.nlayers1,
                                       bidirectional=bi)

            # Decoders for the two tasks
            self.linear1 = LinearDecoder(nhid, self.ntags1, bi)
            self.linear2 = LinearDecoder(nhid, self.ntags2, bi)

        else:
            self.ntags = args[0]
            self.nlayers = args[1]
            self.rnn = EncoderModel(ntoken, emsize, nhid, self.nlayers,
                                    dropout, rnn_type, bi)
            self.linear = LinearDecoder(nhid, self.ntags, bi)

    def forward(self, input, *hidden):
        if self.train_mode == 'Joint':
            if self.nlayers1 == self.nlayers2:
                logits, hidden = self.rnn(input, hidden[0])
                outputs1 = self.linear1(logits)
                outputs2 = self.linear2(logits)
                return outputs1, outputs2, hidden
            else:
                logits1, hidden1 = self.rnn1(input, hidden[0])
                self.rnn2.flatten_parameters()
                logits2, hidden2 = self.rnn2(logits1, hidden[1])
                outputs1 = self.linear1(logits1)
                outputs2 = self.linear2(logits2)
                return outputs1, outputs2, hidden1, hidden2
        else:
            logits, hidden = self.rnn(input, hidden[0])
            outputs = self.linear(logits)
            return outputs, hidden

    def init_rnn2_hidden(self, batch_size):
        # NOTE: returns an (h, c) pair, which matches the LSTM cell type
        weight = next(self.rnn2.parameters()).data
        return (Variable(weight.new((self.nlayers2 - self.nlayers1)*(1+int(self.bi)),
                                    batch_size, self.nhid).zero_()),
                Variable(weight.new((self.nlayers2 - self.nlayers1)*(1+int(self.bi)),
                                    batch_size, self.nhid).zero_()))
--------------------------------------------------------------------------------
/remote.sh:
--------------------------------------------------------------------------------
rsync -zaP -e ssh . andy@150.65.242.87:~/MOOCs/proj/pytorch_mtl/
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
echo "Running Model"

echo "POS"
python main.py --data './data' \
    --emsize 256 \
    --npos_layers 2 \
    --nchunk_layers 0 \
    --nhid 128 \
    --batch_size 128 \
    --seq_len 10 \
    --cuda \
    --train_mode 'POS' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/pos_model'

echo "Chunk"
python main.py --data './data' \
    --emsize 256 \
    --npos_layers 0 \
    --nchunk_layers 2 \
    --nhid 128 \
    --batch_size 128 \
    --seq_len 10 \
    --cuda \
    --train_mode 'Chunk' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/chunk_model'

echo "Joint Training on the same level"
python main.py --data './data' \
    --emsize 256 \
    --npos_layers 2 \
    --nchunk_layers 2 \
    --nhid 128 \
    --batch_size 128 \
    --seq_len 10 \
    --cuda \
    --train_mode 'Joint' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/joint_same'

echo "Joint Training on the different level"
python main.py --data './data' \
    --emsize 256 \
    --npos_layers 1 \
    --nchunk_layers 2 \
    --nhid 128 \
    --batch_size 128 \
    --seq_len 10 \
    --cuda \
    --train_mode 'Joint' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/joint_diff'



# echo "Embedding size"
# for emsize in 128 256 512
# do
#     echo "Embedding size $emsize"
#     python main.py --data './data' \
#         --emsize $emsize \
#         --nlayers 1 \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len 15 \
#         --cuda \
#         --epochs 300 \
#         --log_interval 20 \
#         --save './result/pos_model'
# done

# echo "Number of Layers"
# for nlayers in 2 3
# do
#     echo "Number of layers $nlayers"
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers $nlayers \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len 15 \
#         --cuda \
#         --epochs 300 \
#         --log_interval 20 \
#         --save './result/pos_model'
# done

# echo "Number of hidden units"
# for nhid in 256 512
# do
#     echo "Number of hidden units $nhid"
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers 1 \
#         --nhid $nhid \
#         --batch_size 128 \
#         --seq_len 15 \
#         --train_mode 'POS' \
#         --cuda \
#         --epochs 300 \
#         --log_interval 10 \
#         --save './result/pos_model'
# done

# echo "Sequence Length"
# for seq_len in 10 20
# do
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers 1 \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len $seq_len \
#         --train_mode 'POS' \
#         --cuda \
#         --epochs 300 \
#         --log_interval 10 \
#         --save './result/pos_model'
# done

# for dropout in 0.4 0.6
# do
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers 1 \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len 15 \
#         --dropout $dropout \
#         --train_mode 'POS' \
#         --cuda \
#         --epochs 300 \
#         --log_interval 10 \
#         --save './result/pos_model'
# done

# for rnn_type in 'GRU' 'Elman'
# do
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers 1 \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len 15 \
#         --rnn_type $rnn_type \
#         --train_mode 'POS' \
#         --cuda \
#         --epochs 300 \
#         --log_interval 10 \
#         --save './result/pos_model'
# done
--------------------------------------------------------------------------------
/run_sample.sh:
--------------------------------------------------------------------------------
echo "Running Model on samples"

python main.py --data './data/sample' \
    --emsize 20 \
    --npos_layers 1 \
    --nchunk_layers 2 \
    --train_mode 'Joint' \
    --nhid 20 \
    --batch_size 2 \
    --epochs 10 \
    --seq_len 10 \
    --cuda \
    --log_interval 2
--------------------------------------------------------------------------------
/show_result.py:
--------------------------------------------------------------------------------
import os
import torch

f = []
path = './result'
# Collect the file names in the result directory (top level only)
for (dirpath, dirnames, filenames) in os.walk(path):
    f.extend(filenames)
    break

results = []
for file in f:
    if 'result' in file:
        name = file.split('/')[-1].split('.')[0]
        # Each result file holds the dict saved at the end of main.py
        best_accuracy = torch.load(os.path.join(path, file))['best_accuracy']
        results.append([name, best_accuracy])

print(results)
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
import torch
from torch.autograd import Variable


def get_batch(source, *targets, batch_size, seq_len=10, cuda=False, evalu=False):
    """Generate batches from the raw data."""
    nbatch = source.size(0) // batch_size
    shuffle_mask = torch.randperm(batch_size)
    # Trim off extra elements that don't fit into whole batches
    source = source.narrow(0, 0, nbatch*batch_size)
    # Reshape into (time steps, batch_size)
    source = source.view(batch_size, -1).t().contiguous()
    # Shuffle the batch columns
    source = source[:, shuffle_mask]
    if cuda:
        source = source.cuda()

    # Apply the same trimming, reshaping, and shuffling to each target stream
    targets = list(targets)
    for i in range(len(targets)):
        targets[i] = targets[i].narrow(0, 0, nbatch*batch_size)
        targets[i] = targets[i].view(batch_size, -1).t().contiguous()
        targets[i] = targets[i][:, shuffle_mask]
        if cuda:
            targets[i] = targets[i].cuda()

    for i in range(source.size(0) // seq_len):
        ys = []
        X = Variable(source[i*seq_len:(i+1)*seq_len], volatile=evalu)
        for target in targets:
            ys.append(Variable(target[i*seq_len:(i+1)*seq_len]))
        yield X, ys
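
# A sketch of how get_batch is consumed (shapes assume batch_size=128, seq_len=10):
#   for X, ys in get_batch(corpus.word_train, corpus.pos_train,
#                          batch_size=128, seq_len=10):
#       ...  # X is a (10, 128) Variable of word indices; ys is a list holding
#            # one (10, 128) Variable of tag indices per target task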

def repackage_hidden(h):
    """Wrap hidden states in new Variables to detach them from their old history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)
--------------------------------------------------------------------------------