├── README.md
├── data.py
├── data
│   ├── data-goes-here.md
│   ├── outputs
│   │   └── results-go-here.md
│   ├── sample
│   │   ├── test.txt
│   │   ├── train.txt
│   │   └── valid.txt
│   ├── test.txt
│   ├── train.txt
│   └── valid.txt
├── main.py
├── model.py
├── remote.sh
├── run.sh
├── run_sample.sh
├── show_result.py
└── util.py

/README.md:
--------------------------------------------------------------------------------
# nlp_multi_task_learning_pytorch

A basic multi-task learning architecture for Natural Language Processing, implemented in PyTorch.

The two tasks we use here are POS Tagging and Chunking.

As shown below, the model stacks several RNN layers on top of an embedding layer. Each task is then supervised at a different layer, according to its complexity. For example, we treat Chunking as a higher-level task than POS Tagging, inspired by *[Deep multi-task learning with low level tasks supervised at lower layers](http://anthology.aclweb.org/P16-2038)*. A minimal construction sketch is given at the end of this README.

![img](https://ws3.sinaimg.cn/large/006tNbRwgy1fuchyzqmynj30ik0aogm6.jpg)

## Running Examples

You can check several running examples in `run.sh`. I will explain one here.

```
echo "Joint Training on the different level"
python main.py --data './data' \    # directory containing the training data
    --emsize 256 \                  # embedding size
    --npos_layers 1 \               # number of layers for the POS tagging task
    --nchunk_layers 2 \             # number of layers for the chunking task
    --nhid 128 \                    # number of RNN hidden units
    --batch_size 128 \
    --seq_len 10 \                  # sequence length
    --cuda \                        # enable GPU
    --train_mode 'Joint' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/joint_diff'
```

## Done

- A basic architecture for POS tagging and chunking
- Explored the best hyperparameters for the NN

## Todo

- [ ] Chunking exploration
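## Model Sketch

In code, the joint setup described above corresponds roughly to the following (a minimal sketch; the embedding and hidden sizes are taken from the example run above, and the tag counts come from the corpus dictionaries):

```
from data import Corpus
from model import JointModel

corpus = Corpus('./data')       # builds the word, POS, and chunk dictionaries

# POS tagging supervised at layer 1, chunking at layer 2, as in the figure
model = JointModel(corpus.word_dict.nwords, 256, 128,
                   corpus.pos_dict.nwords, 1,     # POS: number of tags, layer
                   corpus.chunk_dict.nwords, 2,   # Chunk: number of tags, layer
                   dropout=0.2, rnn_type='LSTM', train_mode='Joint')
```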
--------------------------------------------------------------------------------
/data.py:
--------------------------------------------------------------------------------
import os
import torch


class Dictionary(object):
    def __init__(self, name):
        self.name = name
        self.word2idx = {}
        self.idx2word = []
        self.nwords = 0

    def add_word(self, word):
        if word not in self.word2idx:
            self.word2idx[word] = self.nwords
            self.idx2word.append(word)
            self.nwords += 1

    def __str__(self):
        return "%s dictionary has %d kinds of tokens." \
               % (self.name, self.nwords)


class Corpus(object):
    def __init__(self, path):
        self.word_dict = Dictionary('Word')
        self.pos_dict = Dictionary('POS')
        self.chunk_dict = Dictionary('Chunk')

        self.word_train, self.pos_train, self.chunk_train = self.tokenize(os.path.join(path, 'train.txt'))
        self.word_valid, self.pos_valid, self.chunk_valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.word_test, self.pos_test, self.chunk_test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """Tokenizes a text data file."""
        assert os.path.exists(path)
        # First pass: build the dictionaries from the corpus
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                try:
                    word, pos, chunk = line.strip().split()
                except ValueError:
                    # Skip blank lines and malformed rows
                    continue
                tokens += 1
                self.word_dict.add_word(word)
                self.pos_dict.add_word(pos)
                self.chunk_dict.add_word(chunk)

        # Second pass: encode every token as an index into its dictionary
        with open(path, 'r') as f:
            word_ids = torch.LongTensor(tokens)
            pos_ids = torch.LongTensor(tokens)
            chunk_ids = torch.LongTensor(tokens)
            token = 0
            for line in f:
                try:
                    word, pos, chunk = line.strip().split()
                except ValueError:
                    continue
                word_ids[token] = self.word_dict.word2idx[word]
                pos_ids[token] = self.pos_dict.word2idx[pos]
                chunk_ids[token] = self.chunk_dict.word2idx[chunk]
                token += 1

        return word_ids, pos_ids, chunk_ids
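
# A sketch of how this module is used (paths assume the bundled sample data):
#   corpus = Corpus('./data/sample')
#   print(corpus.word_dict)       # "Word dictionary has N kinds of tokens."
# corpus.word_train, corpus.pos_train, and corpus.chunk_train are parallel
# 1-D LongTensors of token, POS-tag, and chunk-tag indices.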
--------------------------------------------------------------------------------
/data/data-goes-here.md:
--------------------------------------------------------------------------------
# Your Data Goes In This Folder
--------------------------------------------------------------------------------
/data/outputs/results-go-here.md:
--------------------------------------------------------------------------------
# Results Go In This Folder (Automatically)
--------------------------------------------------------------------------------
/data/sample/test.txt:
--------------------------------------------------------------------------------
Rockwell NNP B-NP
International NNP I-NP
Corp. NNP I-NP
's POS B-NP
Tulsa NNP I-NP
unit NN I-NP
said VBD B-VP
it PRP B-NP
signed VBD B-VP
a DT B-NP
tentative JJ I-NP
agreement NN I-NP
extending VBG B-VP
its PRP$ B-NP
contract NN I-NP
with IN B-PP
Boeing NNP B-NP
Co. NNP I-NP
to TO B-VP
provide VB I-VP
--------------------------------------------------------------------------------
/data/sample/train.txt:
--------------------------------------------------------------------------------
Confidence NN B-NP
in IN B-PP
the DT B-NP
pound NN I-NP
is VBZ B-VP
widely RB I-VP
expected VBN I-VP
to TO I-VP
take VB I-VP
another DT B-NP
sharp JJ I-NP
Dive NN I-NP
if IN B-SBAR
trade NN B-NP
figures NNS I-NP
for IN B-PP
September NNP B-NP
, , O
due JJ B-ADJP
for IN B-PP
release NN B-NP
tomorrow NN B-NP
, , O
fail VB B-VP
to TO I-VP
show VB I-VP
a DT B-NP
substantial JJ I-NP
improvement NN I-NP
from IN B-PP
July NNP B-NP
and CC I-NP
August NNP I-NP
's POS B-NP
near-record JJ I-NP
deficits NNS I-NP
. . O
Chancellor NNP O
of IN B-PP
the DT B-NP
Exchequer NNP I-NP
Nigel NNP B-NP
Lawson NNP I-NP
's POS B-NP
restated VBN I-NP
commitment NN I-NP
to TO B-PP
a DT B-NP
firm NN I-NP
monetary JJ I-NP
policy NN I-NP
has VBZ B-VP
helped VBN I-VP
to TO I-VP
prevent VB I-VP
a DT B-NP
freefall NN I-NP
in IN B-PP
sterling NN B-NP
over IN B-PP
the DT B-NP
past JJ I-NP
week NN I-NP
. . O
But CC O
analysts NNS B-NP
reckon VBP B-VP
underlying VBG B-NP
support NN I-NP
for IN B-PP
sterling NN B-NP
has VBZ B-VP
been VBN I-VP
eroded VBN I-VP
by IN B-PP
the DT B-NP
chancellor NN I-NP
's POS B-NP
failure NN I-NP
to TO B-VP
announce VB I-VP
any DT B-NP
new JJ I-NP
policy NN I-NP
measures NNS I-NP
in IN B-PP
his PRP$ B-NP
Mansion NNP I-NP
House NNP I-NP
speech NN I-NP
last JJ B-NP
Thursday NNP I-NP
. . O
This DT B-NP
has VBZ B-VP
increased VBN I-VP
the DT B-NP
risk NN I-NP
of IN B-PP
the DT B-NP
--------------------------------------------------------------------------------
/data/sample/valid.txt:
--------------------------------------------------------------------------------
a DT B-NP
trade NN I-NP
group NN I-NP
, , O
will MD B-VP
sell VB I-VP
access NN B-NP
to TO B-PP
the DT B-NP
package NN I-NP
to TO B-PP
its PRP$ B-NP
180 CD I-NP
airline NN I-NP
members NNS I-NP
world-wide JJ B-ADVP
. . O
Control NNP B-NP
Data NNP I-NP
will MD B-VP
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import argparse
import os
import time
import math
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim

from data import Corpus
from util import *
from model import *

###############################################################################
# Set Parameters
###############################################################################
parser = argparse.ArgumentParser(description='PyTorch NLP multi-task learning for POS tagging and Chunking.')
parser.add_argument('--data', type=str, default='./data',
                    help='location of the data directory')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--npos_layers', type=int, default=1,
                    help='number of POS tagging layers')
parser.add_argument('--nchunk_layers', type=int, default=1,
                    help='number of chunking layers')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units')
parser.add_argument('--lr', type=float, default=0.0001,
                    help='learning rate')
parser.add_argument('--clip', type=float, default=1,
                    help='gradient clipping threshold')
parser.add_argument('--epochs', type=int, default=40,
                    help='number of epochs')
parser.add_argument('--batch_size', type=int, default=32, metavar='N',
                    help='batch size')
parser.add_argument('--seq_len', type=int, default=15,
                    help='sequence length')
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout rate')
parser.add_argument('--rnn_type', type=str, default='LSTM',
                    help='RNN cell type: one of LSTM, GRU, and Elman')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--bi', action='store_true',
                    help='use a bidirectional RNN')
parser.add_argument('--log_interval', type=int, default=200, metavar='N',
                    help='report interval')
parser.add_argument('--train_mode', type=str, default='Joint',
                    help="training mode of the model: 'POS', 'Chunk', or 'Joint'")
parser.add_argument('--test_times', type=int, default=1,
                    help='number of independent runs, for a more trustworthy result')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
args = parser.parse_args()


###############################################################################
# Load Data
###############################################################################
corpus_path = args.save.strip() + '_corpus.pt'
print('Loading corpus...')
if os.path.exists(corpus_path):
    corpus = torch.load(corpus_path)
else:
    corpus = Corpus(args.data)
    torch.save(corpus, corpus_path)

###############################################################################
# Training Functions
###############################################################################
def train(loss_log):
    if args.train_mode == 'Joint':
        target_data = (corpus.pos_train, corpus.chunk_train)
    elif args.train_mode == 'POS':
        target_data = (corpus.pos_train, )
    elif args.train_mode == 'Chunk':
        target_data = (corpus.chunk_train, )

    # Turn on training mode (re-enables dropout after evaluation)
    model.train()
    total_loss = 0
    start_time = time.time()
    n_iteration = corpus.word_train.size(0) // (args.batch_size*args.seq_len)
    iteration = 0
    for X, ys in get_batch(corpus.word_train, *target_data, batch_size=args.batch_size,
                           seq_len=args.seq_len, cuda=args.cuda):
        iteration += 1
        model.zero_grad()
        if args.train_mode == 'Joint':
            if args.npos_layers == args.nchunk_layers:
                hidden = model.rnn.init_hidden(args.batch_size)
                outputs1, outputs2, hidden = model(X, hidden)
            else:
                hidden1 = model.rnn1.init_hidden(args.batch_size)
                hidden2 = model.init_rnn2_hidden(args.batch_size)
                outputs1, outputs2, hidden1, hidden2 = model(X, hidden1, hidden2)
            loss1 = criterion(outputs1.view(-1, npos_tags), ys[0].view(-1))
            loss2 = criterion(outputs2.view(-1, nchunk_tags), ys[1].view(-1))
            loss = loss1 + loss2
        else:
            hidden = model.rnn.init_hidden(args.batch_size)
            outputs, hidden = model(X, hidden)
            loss = criterion(outputs.view(-1, ntags), ys[0].view(-1))

        loss.backward()

        # Prevent exploding gradients
        torch.nn.utils.clip_grad_norm(model.parameters(), args.clip)
        optimizer.step()
        total_loss += loss.data

        if iteration % args.log_interval == 0:
            cur_loss = total_loss / args.log_interval
            cur_loss = cur_loss.cpu().numpy()[0]
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} iteration | {:5.2f} ms/batch | loss {:5.2f} |'.format(
                epoch, iteration, n_iteration,
                elapsed*1000/args.log_interval,
                cur_loss))
            loss_log.append(cur_loss)
            total_loss = 0
            start_time = time.time()
    return loss_log

def evaluate(source, target):
    model.eval()
    n_iteration = source.size(0) // (args.batch_size*args.seq_len)
    total_loss = 0
    for X_val, y_vals in get_batch(source, *target, batch_size=args.batch_size,
                                   seq_len=args.seq_len, cuda=args.cuda, evalu=True):
        if args.train_mode == 'Joint':
            if args.npos_layers == args.nchunk_layers:
                hidden = model.rnn.init_hidden(args.batch_size)
                outputs1, outputs2, hidden = model(X_val, hidden)
            else:
                hidden1 = model.rnn1.init_hidden(args.batch_size)
                hidden2 = model.init_rnn2_hidden(args.batch_size)
                outputs1, outputs2, hidden1, hidden2 = model(X_val, hidden1, hidden2)
            loss1 = criterion(outputs1.view(-1, npos_tags), y_vals[0].view(-1))
            loss2 = criterion(outputs2.view(-1, nchunk_tags), y_vals[1].view(-1))
            loss = loss1 + loss2
            # Make predictions and calculate accuracy
            _, pred1 = outputs1.data.topk(1)
            _, pred2 = outputs2.data.topk(1)
            accuracy1 = torch.sum(pred1.squeeze(2) == y_vals[0].data) / (y_vals[0].size(0) * y_vals[0].size(1))
            accuracy2 = torch.sum(pred2.squeeze(2) == y_vals[1].data) / (y_vals[1].size(0) * y_vals[1].size(1))
            accuracy = (accuracy1, accuracy2)
        else:
            hidden = model.rnn.init_hidden(args.batch_size)
            outputs, hidden = model(X_val, hidden)
            loss = criterion(outputs.view(-1, ntags), y_vals[0].view(-1))
            _, pred = outputs.data.topk(1)
            accuracy = torch.sum(pred.squeeze(2) == y_vals[0].data) / (y_vals[0].size(0) * y_vals[0].size(1))
        total_loss += loss

    return total_loss/n_iteration, accuracy

best_val_accuracies = []
test_accuracies = []
best_epoches = []
patience = 25  # Stop early if the validation loss has not improved for this many epochs
for i in range(args.test_times):
    ###########################################################################
    # Build Model
    ###########################################################################
    nwords = corpus.word_dict.nwords
    npos_tags = corpus.pos_dict.nwords
    nchunk_tags = corpus.chunk_dict.nwords

    # Note: dropout and rnn_type are keyword-only in JointModel (they follow
    # *args), so they must be passed by keyword to take effect.
    if args.train_mode == 'Joint':
        model = JointModel(nwords, args.emsize, args.nhid, npos_tags, args.npos_layers,
                           nchunk_tags, args.nchunk_layers, dropout=args.dropout,
                           rnn_type=args.rnn_type, bi=args.bi,
                           train_mode=args.train_mode)
    else:
        if args.train_mode == 'POS':
            ntags = npos_tags
            nlayers = args.npos_layers
        elif args.train_mode == 'Chunk':
            ntags = nchunk_tags
            nlayers = args.nchunk_layers
        model = JointModel(nwords, args.emsize, args.nhid, ntags, nlayers,
                           dropout=args.dropout, rnn_type=args.rnn_type,
                           bi=args.bi, train_mode=args.train_mode)
    if args.cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss()

    # Loop over epochs
    best_val_loss = None
    best_accuracy = None
    best_epoch = 0
    early_stop_count = 0
    loss_log = []
    # You can break out of training early with Ctrl+C
    try:
        for epoch in range(1, args.epochs+1):
            epoch_start_time = time.time()
            print('Begin training...')
            loss_log = train(loss_log)
            # Evaluation
            print('Evaluating on the validation data')
            if args.train_mode == 'Joint':
                valid_target_data = (corpus.pos_valid, corpus.chunk_valid)
            elif args.train_mode == 'POS':
                valid_target_data = (corpus.pos_valid, )
            elif args.train_mode == 'Chunk':
                valid_target_data = (corpus.chunk_valid, )

            val_loss, accuracy = evaluate(corpus.word_valid, valid_target_data)
            print('-'*50)
            if args.train_mode == 'Joint':
                print('| end of epoch {:3d} | valid loss {:5.3f} | POS accuracy {:5.3f} | Chunk accuracy {:5.3f} |'.format(
                    epoch, val_loss.data.cpu().numpy()[0], accuracy[0], accuracy[1]
                ))
            else:
                print('| end of epoch {:3d} | valid loss {:5.3f} | accuracy {:5.3f} |'.format(
                    epoch, val_loss.data.cpu().numpy()[0], accuracy
                ))
            if not best_val_loss or (val_loss.data[0] < best_val_loss):
                with open(args.save.strip() + '.pt', 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss.data[0]
                best_accuracy = accuracy
                best_epoch = epoch
                early_stop_count = 0
            else:
                early_stop_count += 1
                if early_stop_count >= patience:
                    print('\nEarly stopping: no improvement on the validation set for %d epochs.' % patience)
                    break
    except KeyboardInterrupt:
        print('-'*50)
        print('Exiting from training early.')

    ###########################################################################
    # Test Model
    ###########################################################################
    # Load the best saved model
    with open(args.save.strip() + '.pt', 'rb') as f:
        model = torch.load(f)

    if args.train_mode == 'Joint':
        test_target_data = (corpus.pos_test, corpus.chunk_test)
    elif args.train_mode == 'POS':
        test_target_data = (corpus.pos_test, )
    elif args.train_mode == 'Chunk':
        test_target_data = (corpus.chunk_test, )
    test_loss, test_accuracy = evaluate(corpus.word_test, test_target_data)
    print('='*50)
    print('Evaluating on the test data.')
    if args.train_mode == 'Joint':
        print('| end of epoch {:3d} | test loss {:5.3f} | POS test accuracy {:5.3f} | Chunk test accuracy {:5.3f} |'.format(
            epoch, test_loss.data.cpu().numpy()[0], test_accuracy[0], test_accuracy[1]
        ))
    else:
        print('| end of epoch {:3d} | test loss {:5.3f} | accuracy {:5.3f} |'.format(
            epoch, test_loss.data.cpu().numpy()[0], test_accuracy
        ))

    # Log accuracy
    best_val_accuracies.append(best_accuracy)
    test_accuracies.append(test_accuracy)
    best_epoches.append(best_epoch)


# Save results
results = {
    'corpus': corpus,
    'best_epoch': best_epoch,
    'best_val_loss': best_val_loss,
    'best_accuracy': best_accuracy,
    'test_accuracy': test_accuracy,
    'loss_log': loss_log,

    'best_val_accuracies': best_val_accuracies,
    'test_accuracies': test_accuracies,
    'best_epoches': best_epoches
}
torch.save(results, '%s_emsize%d_npos_layers%d_nchunk_layers%d_nhid%d_dropout%3.1f_seqlen%d_bi%d_%s_result.pt'
           % (args.save.strip(), args.emsize, args.npos_layers, args.nchunk_layers,
              args.nhid, args.dropout, args.seq_len, args.bi, args.rnn_type))
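
# For example, the './result/joint_diff' run from run.sh would produce a file
# named like this (hypothetical values shown for illustration):
#   ./result/joint_diff_emsize256_npos_layers1_nchunk_layers2_nhid128_dropout0.2_seqlen10_bi0_LSTM_result.pt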
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from torch.autograd import Variable


class EncoderModel(nn.Module):
    """RNN encoder with a transducer that makes a prediction at each time step."""

    def __init__(self, ntoken, emsize, nhid,
                 nlayers=1, dropout=0.2, rnn_type='LSTM', bi=False):
        super().__init__()
        self.drop = nn.Dropout(dropout)
        self.embed = nn.Embedding(ntoken, emsize)
        self.rnn_type = rnn_type

        # Select the RNN cell type from LSTM, GRU, and Elman
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(emsize, nhid, nlayers, bidirectional=bi)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(emsize, nhid, nlayers, bidirectional=bi)
        else:
            self.rnn = nn.RNN(emsize, nhid, nlayers, bidirectional=bi)

        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers
        self.bi = bi

    def init_weights(self):
        init_range = 0.1
        self.embed.weight.data.uniform_(-init_range, init_range)

    def forward(self, input, hidden):
        embedded = self.drop(self.embed(input))
        self.rnn.flatten_parameters()
        output, hidden = self.rnn(embedded, hidden)
        output = self.drop(output)
        return output, hidden

    def init_hidden(self, batch_size):
        # NOTE: returns an (h, c) pair, which matches the LSTM cell type
        weight = next(self.parameters()).data
        return (Variable(weight.new(self.nlayers*(1+int(self.bi)),
                                    batch_size, self.nhid).zero_()),
                Variable(weight.new(self.nlayers*(1+int(self.bi)),
                                    batch_size, self.nhid).zero_()))


class LinearDecoder(nn.Module):
    """Linear decoder that maps the outputs of the RNN encoder to
    per-task predictions."""

    def __init__(self, nhid, ntags, bi=False):
        super().__init__()
        self.linear = nn.Linear(nhid*(1+int(bi)), ntags)
        self.init_weights()
        self.nin = nhid
        self.nout = ntags
        self.bi = bi

    def init_weights(self):
        init_range = 0.1
        self.linear.bias.data.fill_(0)
        self.linear.weight.data.uniform_(-init_range, init_range)

    def forward(self, input):
        logit = self.linear(input.view(input.size(0)*input.size(1), input.size(2)))
        return logit.view(input.size(0), input.size(1), logit.size(1))


class JointModel(nn.Module):
    """Joint model for training two tasks together.
    You can also select a single-task train mode to train one task alone.
    The positional *args specify, for each task, its output size and the
    layer it is supervised at: the number of tags first, and then the
    number of layers."""
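    # For example (hypothetical sizes, following the convention above),
    #     JointModel(nwords, 256, 128, npos_tags, 1, nchunk_tags, 2,
    #                dropout=0.2, rnn_type='LSTM', train_mode='Joint')
    # supervises POS tagging at layer 1 and chunking at layer 2.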
    def __init__(self, ntoken, emsize, nhid, *args,
                 dropout=0.2, rnn_type='LSTM', bi=False, train_mode='Joint'):
        super().__init__()
        self.ntoken = ntoken
        self.emsize = emsize
        self.nhid = nhid
        self.dropout = dropout
        self.rnn_type = rnn_type
        self.bi = bi
        self.train_mode = train_mode
        # Unpack the task arguments according to the train mode
        if train_mode == 'Joint':
            self.ntags1 = args[0]
            self.nlayers1 = args[1]
            self.ntags2 = args[2]
            self.nlayers2 = args[3]
            if self.nlayers1 == self.nlayers2:
                self.rnn = EncoderModel(ntoken, emsize, nhid, self.nlayers1,
                                        dropout, rnn_type, bi)
            else:
                # Lower layers
                self.rnn1 = EncoderModel(ntoken, emsize, nhid, self.nlayers1,
                                         dropout, rnn_type, bi)
                # Higher layers
                if rnn_type == 'LSTM':
                    self.rnn2 = nn.LSTM(nhid*(1+int(bi)), nhid,
                                        self.nlayers2 - self.nlayers1,
                                        bidirectional=bi)
                elif rnn_type == 'GRU':
                    self.rnn2 = nn.GRU(nhid*(1+int(bi)), nhid,
                                       self.nlayers2 - self.nlayers1,
                                       bidirectional=bi)
                else:
                    self.rnn2 = nn.RNN(nhid*(1+int(bi)), nhid,
                                       self.nlayers2 - self.nlayers1,
                                       bidirectional=bi)

            # Decoders for the two tasks
            self.linear1 = LinearDecoder(nhid, self.ntags1, bi)
            self.linear2 = LinearDecoder(nhid, self.ntags2, bi)

        else:
            self.ntags = args[0]
            self.nlayers = args[1]
            self.rnn = EncoderModel(ntoken, emsize, nhid, self.nlayers,
                                    dropout, rnn_type, bi)
            self.linear = LinearDecoder(nhid, self.ntags, bi)

    def forward(self, input, *hidden):
        if self.train_mode == 'Joint':
            if self.nlayers1 == self.nlayers2:
                logits, hidden = self.rnn(input, hidden[0])
                outputs1 = self.linear1(logits)
                outputs2 = self.linear2(logits)
                return outputs1, outputs2, hidden
            else:
                logits1, hidden1 = self.rnn1(input, hidden[0])
                self.rnn2.flatten_parameters()
                logits2, hidden2 = self.rnn2(logits1, hidden[1])
                outputs1 = self.linear1(logits1)
                outputs2 = self.linear2(logits2)
                return outputs1, outputs2, hidden1, hidden2
        else:
            logits, hidden = self.rnn(input, hidden[0])
            outputs = self.linear(logits)
            return outputs, hidden

    def init_rnn2_hidden(self, batch_size):
        # NOTE: returns an (h, c) pair, which matches the LSTM cell type
        weight = next(self.rnn2.parameters()).data
        return (Variable(weight.new((self.nlayers2 - self.nlayers1)*(1+int(self.bi)),
                                    batch_size, self.nhid).zero_()),
                Variable(weight.new((self.nlayers2 - self.nlayers1)*(1+int(self.bi)),
                                    batch_size, self.nhid).zero_()))
--------------------------------------------------------------------------------
/remote.sh:
--------------------------------------------------------------------------------
rsync -zaP -e ssh . andy@150.65.242.87:~/MOOCs/proj/pytorch_mtl/
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
echo "Running Model"

echo "POS"
python main.py --data './data' \
    --emsize 256 \
    --npos_layers 2 \
    --nchunk_layers 0 \
    --nhid 128 \
    --batch_size 128 \
    --seq_len 10 \
    --cuda \
    --train_mode 'POS' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/pos_model'

echo "Chunk"
python main.py --data './data' \
    --emsize 256 \
    --npos_layers 0 \
    --nchunk_layers 2 \
    --nhid 128 \
    --batch_size 128 \
    --seq_len 10 \
    --cuda \
    --train_mode 'Chunk' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/chunk_model'

echo "Joint Training on the same level"
python main.py --data './data' \
    --emsize 256 \
    --npos_layers 2 \
    --nchunk_layers 2 \
    --nhid 128 \
    --batch_size 128 \
    --seq_len 10 \
    --cuda \
    --train_mode 'Joint' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/joint_same'

echo "Joint Training on the different level"
python main.py --data './data' \
    --emsize 256 \
    --npos_layers 1 \
    --nchunk_layers 2 \
    --nhid 128 \
    --batch_size 128 \
    --seq_len 10 \
    --cuda \
    --train_mode 'Joint' \
    --epochs 300 \
    --log_interval 20 \
    --save './result/joint_diff'



# echo "Embedding size"
# for emsize in 128 256 512
# do
#     echo "Embedding size $emsize"
#     python main.py --data './data' \
#         --emsize $emsize \
#         --nlayers 1 \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len 15 \
#         --cuda \
#         --epochs 300 \
#         --log_interval 20 \
#         --save './result/pos_model'
# done

# echo "Number of Layers"
# for nlayers in 2 3
# do
#     echo "Number of layers $nlayers"
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers $nlayers \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len 15 \
#         --cuda \
#         --epochs 300 \
#         --log_interval 20 \
#         --save './result/pos_model'
# done

# echo "Number of hidden units"
# for nhid in 256 512
# do
#     echo "Number of hidden units $nhid"
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers 1 \
#         --nhid $nhid \
#         --batch_size 128 \
#         --seq_len 15 \
#         --train_mode 'POS' \
#         --cuda \
#         --epochs 300 \
#         --log_interval 10 \
#         --save './result/pos_model'
# done

# echo "Sequence Length"
# for seq_len in 10 20
# do
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers 1 \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len $seq_len \
#         --train_mode 'POS' \
#         --cuda \
#         --epochs 300 \
#         --log_interval 10 \
#         --save './result/pos_model'
# done

# for dropout in 0.4 0.6
# do
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers 1 \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len 15 \
#         --dropout $dropout \
#         --train_mode 'POS' \
#         --cuda \
#         --epochs 300 \
#         --log_interval 10 \
#         --save './result/pos_model'
# done

# for rnn_type in 'GRU' 'Elman'
# do
#     python main.py --data './data' \
#         --emsize 128 \
#         --nlayers 1 \
#         --nhid 128 \
#         --batch_size 128 \
#         --seq_len 15 \
#         --rnn_type $rnn_type \
#         --train_mode 'POS' \
#         --cuda \
#         --epochs 300 \
#         --log_interval 10 \
#         --save './result/pos_model'
# done
--------------------------------------------------------------------------------
/run_sample.sh:
--------------------------------------------------------------------------------
echo "Running Model on samples"

python main.py --data './data/sample' \
    --emsize 20 \
    --npos_layers 1 \
    --nchunk_layers 2 \
    --train_mode 'Joint' \
    --nhid 20 \
    --batch_size 2 \
    --epochs 10 \
    --seq_len 10 \
    --cuda \
    --log_interval 2
--------------------------------------------------------------------------------
/show_result.py:
--------------------------------------------------------------------------------
import os
import torch

f = []
path = './result'
# Collect the file names in the result directory (top level only)
for (dirpath, dirnames, filenames) in os.walk(path):
    f.extend(filenames)
    break

results = []
for file in f:
    if 'result' in file:
        name = file.split('/')[-1].split('.')[0]
        # Each result file holds the dict saved at the end of main.py
        best_accuracy = torch.load(os.path.join(path, file))['best_accuracy']
        results.append([name, best_accuracy])

print(results)
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
import torch
from torch.autograd import Variable


def get_batch(source, *targets, batch_size, seq_len=10, cuda=False, evalu=False):
    """Generate batches from the raw data."""
    nbatch = source.size(0) // batch_size
    shuffle_mask = torch.randperm(batch_size)
    # Trim off extra elements that don't fit into whole batches
    source = source.narrow(0, 0, nbatch*batch_size)
    # Reshape into (time steps, batch_size)
    source = source.view(batch_size, -1).t().contiguous()
    # Shuffle the batch columns
    source = source[:, shuffle_mask]
    if cuda:
        source = source.cuda()

    # Apply the same trimming, reshaping, and shuffling to each target stream
    targets = list(targets)
    for i in range(len(targets)):
        targets[i] = targets[i].narrow(0, 0, nbatch*batch_size)
        targets[i] = targets[i].view(batch_size, -1).t().contiguous()
        targets[i] = targets[i][:, shuffle_mask]
        if cuda:
            targets[i] = targets[i].cuda()

    for i in range(source.size(0) // seq_len):
        ys = []
        X = Variable(source[i*seq_len:(i+1)*seq_len], volatile=evalu)
        for target in targets:
            ys.append(Variable(target[i*seq_len:(i+1)*seq_len]))
        yield X, ys
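
# A sketch of how get_batch is consumed (shapes assume batch_size=128, seq_len=10):
#   for X, ys in get_batch(corpus.word_train, corpus.pos_train,
#                          batch_size=128, seq_len=10):
#       ...  # X is a (10, 128) Variable of word indices; ys is a list holding
#            # one (10, 128) Variable of tag indices per target task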

def repackage_hidden(h):
    """Wrap hidden states in new Variables to detach them from their old history."""
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)
--------------------------------------------------------------------------------