├── .gitignore ├── README.md ├── model ├── __init__.py ├── batch_stack_lstm.py ├── evaluate.py ├── stack_lstm.py └── utils.py ├── predict.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | .DS_Store 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # IDEA 101 | .idea/ 102 | 103 | # 104 | data 105 | embedding -------------------------------------------------------------------------------- 
/README.md: -------------------------------------------------------------------------------- 1 | # stack-lstm-ner 2 | PyTorch implementation of Transition-based NER system [1]. 3 | 4 | ### Requirements 5 | * Python 3.x 6 | * PyTorch 0.3.0 7 | 8 | 9 | ### Task 10 | 11 | Given a sentence, give a tag to each word. A classical application is Named Entity Recognition (NER). Here is an example: 12 | 13 | ``` 14 | John lives in New York 15 | B-PER O O B-LOC I-LOC 16 | ``` 17 | Corresponding sequence of actions: 18 | 19 | ``` 20 | SHIFT 21 | REDUCE(PER) 22 | OUT 23 | OUT 24 | SHIFT 25 | SHIFT 26 | REDUCE(LOC) 27 | ``` 28 | 40 | 41 | ### Data format 42 | 43 | 44 | The training data must be in the following format (identical to the CoNLL2003 dataset). 45 | 46 | A default test file is provided to help you get started. 47 | 48 | 49 | ``` 50 | John B-PER 51 | lives O 52 | in O 53 | New B-LOC 54 | York I-LOC 55 | . O 56 | ``` 57 | 58 | ### Training 59 | 60 | To train the model, run ```train.py``` with the following parameters: 61 | ``` 62 | --rand_embedding # use this if you want to randomly initialize the embeddings 63 | --emb_file # file dir for word embedding 64 | --char_structure # choose 'lstm' or 'cnn' 65 | --train_file # path to training file 66 | --dev_file # path to development file 67 | --test_file # path to test file 68 | --gpu # gpu id, set to -1 to use CPU mode 69 | --update # choose from 'sgd' or 'adam' 70 | --batch_size # batch size, default=100 71 | --singleton_rate # the rate for changing the words with low frequency to '' 72 | --checkpoint # path to checkpoint and saved model 73 | ``` 74 | ### Decoding 75 | 76 | To tag a raw file, simply run ```predict.py``` with the following parameters: 77 | ``` 78 | --load_arg # path to saved json file with all args 79 | --load_check_point # path to saved model 80 | --test_file # path to test file 81 | --test_file_out # path to test file output 82 | --batch_size # batch size 83 | --gpu # gpu id, set to -1 to use CPU mode 84 
| ``` 85 | Please be aware that when using the model in ```stack_lstm.py```, ```--batch_size``` must be 1. 86 | 87 | ### Result 88 | 89 | When models are only trained on the CoNLL 2003 English NER dataset, the results are summarized as below. 90 | 91 | |Model | Variant| F1 | Time(h) | 92 | | ------------- |-------------| -----| -----| 93 | | [Lample et al. 2016](https://github.com/clab/stack-lstm-ner) | pretrain | 86.67 | 94 | | | pretrain + dropout | 87.96 | 95 | | | pretrain + dropout + char | 90.33 | 96 | | Our Implementation | pretrain + dropout | | | 97 | | | pretrain + dropout + char (BiLSTM) | | | 98 | | | pretrain + dropout + char (CNN) | | | 99 | 100 | ### Author 101 | Huimeng Zhang: zhang_huimeng@foxmail.com 102 | 103 | ## References 104 | 105 | [1] [ Lample et al., Neural Architectures for Named Entity Recognition, 2016](http://www.aclweb.org/anthology/N16-1030.pdf) 106 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tianlinyang/stack-lstm-ner/09bb77ad4407433d583e55aa9badb5032c2eaadd/model/__init__.py -------------------------------------------------------------------------------- /model/batch_stack_lstm.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import torch.autograd as autograd 4 | import torch.nn as nn 5 | import numpy as np 6 | 7 | import model.utils as utils 8 | 9 | 10 | class TransitionNER(nn.Module): 11 | def __init__(self, mode, action2idx, word2idx, label2idx, char2idx, ner_map, vocab_size, action_size, embedding_dim, 12 | action_embedding_dim, char_embedding_dim, 13 | hidden_dim, char_hidden_dim, rnn_layers, dropout_ratio, use_spelling, char_structure, is_cuda): 14 | super(TransitionNER, self).__init__() 15 | self.embedding_dim = embedding_dim 16 | self.mode = mode 17 | self.hidden_dim = 
hidden_dim 18 | self.vocab_size = vocab_size 19 | self.action2idx = action2idx 20 | self.label2idx = label2idx 21 | self.char2idx = char2idx 22 | self.use_spelling = use_spelling 23 | self.char_structure = char_structure 24 | if is_cuda >= 0: 25 | self.gpu_triger = True 26 | else: 27 | self.gpu_triger = False 28 | self.idx2label = {v: k for k, v in label2idx.items()} 29 | self.idx2action = {v: k for k, v in action2idx.items()} 30 | self.idx2word = {v: k for k, v in word2idx.items()} 31 | self.idx2char = {v: k for k, v in char2idx.items()} 32 | self.ner_map = ner_map 33 | 34 | self.word_embeds = nn.Embedding(vocab_size, embedding_dim) 35 | self.action_embeds = nn.Embedding(action_size, action_embedding_dim) 36 | self.relation_embeds = nn.Embedding(action_size, action_embedding_dim) 37 | 38 | if self.use_spelling: 39 | self.char_embeds = nn.Embedding(len(self.char2idx), char_embedding_dim) 40 | if self.char_structure == 'lstm': 41 | self.tok_embedding_dim = self.embedding_dim + char_hidden_dim * 2 42 | self.unk_char_embeds = nn.Parameter(torch.randn(1, char_hidden_dim * 2), requires_grad=True) 43 | self.pad_char_embeds = nn.Parameter(torch.zeros(1, char_hidden_dim * 2)) 44 | self.char_bi_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim, num_layers=rnn_layers, 45 | bidirectional=True, dropout=dropout_ratio) 46 | elif self.char_structure == 'cnn': 47 | self.tok_embedding_dim = self.embedding_dim + char_hidden_dim 48 | self.pad_char_embeds = nn.Parameter(torch.zeros(1, char_hidden_dim)) 49 | self.unk_char_embeds = nn.Parameter(torch.randn(1, char_hidden_dim), requires_grad=True) 50 | self.conv1d = nn.Conv1d(char_embedding_dim, char_hidden_dim, 3, padding=2) 51 | else: 52 | self.tok_embedding_dim = self.embedding_dim 53 | 54 | self.buffer_lstm = nn.LSTMCell(self.tok_embedding_dim, hidden_dim) 55 | self.stack_lstm = nn.LSTMCell(self.tok_embedding_dim, hidden_dim) 56 | self.action_lstm = nn.LSTMCell(action_embedding_dim, hidden_dim) 57 | self.output_lstm = 
def get_possible_actions(self, stack, buffer):
    """Return the ids of the actions allowed for one parser configuration.

    SHIFT is allowed whenever ``buffer`` is non-empty.  With a non-empty
    ``stack`` every action named by a key of ``self.ner_map`` is allowed;
    with an empty ``stack`` the OUT action is allowed instead.

    Args:
        stack: sized container holding the tokens shifted so far.
        buffer: sized container holding the tokens not yet consumed.

    Returns:
        list: action ids, SHIFT first (if present), looked up in
        ``self.action2idx``.
    """
    allowed = [self.action2idx["SHIFT"]] if len(buffer) > 0 else []
    if len(stack) == 0:
        # Nothing to reduce: the only non-shift move is OUT.
        allowed.append(self.action2idx["OUT"])
        return allowed
    # One reduce action per entry of the NER map, in the map's key order.
    for reduce_name in self.ner_map.keys():
        allowed.append(self.action2idx[reduce_name])
    return allowed
def getloss_batch(self, have_action_batch, batch_buffer, batch_stack, batch_action, batch_output,
                  batch_valid_actions, batch_real_actions=None):
    """Score the valid actions of every active sentence and pick the best one.

    For each sentence index in ``have_action_batch`` four vectors are
    concatenated along dim 1: the first element of the buffer entry, the
    hidden state of the stack entry, the hidden state of the output entry
    and the action representation.  The concatenation goes through dropout,
    ``lstms_output_2_softmax``, ``tanh`` and ``output_2_act``; the resulting
    logits are then restricted to that sentence's valid actions and passed
    through ``log_softmax``.

    Args:
        have_action_batch: indices of the sentences that need an action.
        batch_buffer: per-sentence buffer entries; element ``[0]`` is used.
        batch_stack: per-sentence stack entries; ``[0][0]`` is used.
        batch_action: per-sentence action representation.  In 'train' mode
            the item is concatenated directly; in 'predict' mode ``[0][0]``
            of the item is used.
        batch_output: per-sentence output entries; ``[0][0]`` is used.
        batch_valid_actions: per-sentence lists of valid action ids.
        batch_real_actions: per-sentence gold action ids; only read in
            'train' mode.

    Returns:
        tuple: ``(predict_actions, losses)``.  ``predict_actions`` holds one
        action id per entry of ``have_action_batch`` (same order).
        ``losses`` holds the log-probability of each gold action in 'train'
        mode and is ``None`` in 'predict' mode.

    Raises:
        KeyError: in 'train' mode, when a gold action is not among the
            sentence's valid actions.
    """
    predict_actions = []
    losses = []
    # NOTE(review): if self.mode is neither 'train' nor 'predict',
    # lstms_output is never bound and the torch.cat below raises NameError.
    if self.mode == 'train':
        lstms_output = [torch.cat(
            [batch_buffer[batch_idx][0], batch_stack[batch_idx][0][0], batch_output[batch_idx][0][0],
             batch_action[batch_idx]], 1)
            for batch_idx in have_action_batch]
    elif self.mode == 'predict':
        lstms_output = [torch.cat(
            [batch_buffer[batch_idx][0], batch_stack[batch_idx][0][0], batch_output[batch_idx][0][0],
             batch_action[batch_idx][0][0]], 1)
            for batch_idx in have_action_batch]
    # Stack the per-sentence rows so the linear layers run once per step.
    lstms_output = torch.cat([i for i in lstms_output], 0)
    hidden_output = torch.tanh(self.lstms_output_2_softmax(self.dropout(lstms_output)))
    logits = self.output_2_act(hidden_output)
    for idx in range(len(have_action_batch)):
        # Row idx of logits belongs to sentence have_action_batch[idx];
        # keep only the columns of that sentence's valid actions.
        logit = logits[idx][
            utils.variable(torch.LongTensor(batch_valid_actions[have_action_batch[idx]]), self.gpu_triger)]
        # Maps an action id to its position inside the restricted logits.
        valid_action_tbl = {a: i for i, a in enumerate(batch_valid_actions[have_action_batch[idx]])}
        # NOTE(review): log_softmax is called without an explicit dim.
        log_probs = torch.nn.functional.log_softmax(logit)
        # Position of the best valid action, translated back to an action id.
        # NOTE(review): the [1][0].data.numpy()[0] chain presumably relies on
        # PyTorch 0.3 returning 1-element results here -- confirm before upgrading.
        action_idx = torch.max(log_probs.cpu(), 0)[1][0].data.numpy()[0]
        action_predict = batch_valid_actions[have_action_batch[idx]][action_idx]
        predict_actions.append(action_predict)
        if self.mode == 'train':
            # NOTE(review): this check appears to always hold at this point.
            if log_probs is not None:
                losses.append(log_probs[valid_action_tbl[batch_real_actions[have_action_batch[idx]]]])

    if self.mode == 'predict':
        losses = None

    return predict_actions, losses
def load_pretrained_embedding(self, pre_embeddings):
    """Replace the word-embedding table with pre-trained vectors.

    The tensor is wrapped in a new ``nn.Parameter`` and assigned to
    ``self.word_embeds.weight``.

    Args:
        pre_embeddings: 2-D tensor whose second dimension equals
            ``self.embedding_dim``.

    Raises:
        ValueError: if ``pre_embeddings`` is not 2-D or its second
            dimension differs from ``self.embedding_dim``.
    """
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, which would let a mis-shaped table through silently.
    if pre_embeddings.dim() != 2 or pre_embeddings.size(1) != self.embedding_dim:
        raise ValueError(
            "pre_embeddings must have shape (num_words, %d), got %s"
            % (self.embedding_dim, tuple(pre_embeddings.size())))
    self.word_embeds.weight = nn.Parameter(pre_embeddings)
def batch_reduce(self, stack, output, batch_relation, batch_reduce_idx):
    """Apply a REDUCE action to the selected sentences of the batch.

    For every sentence index in ``batch_reduce_idx`` all entries above the
    sentinel (the entry whose word is ``''``) are popped from that
    sentence's stack, encoded with ``entity_forward_lstm`` and
    ``entity_backward_lstm``, combined with the sentence's relation
    embedding through ``entity_2_output`` and pushed onto the sentence's
    output list after one ``output_lstm`` step.

    Args:
        stack: per-sentence lists of ``[state, word, token_embedding]``
            entries; mutated in place.
        output: per-sentence lists of ``[(h, c), text, embedding]`` entries;
            mutated in place.
        batch_relation: per-sentence relation embeddings, indexed by
            sentence index.
        batch_reduce_idx: indices of the sentences to reduce.

    Returns:
        tuple: the same ``stack`` and ``output`` objects.
    """
    if not batch_reduce_idx:
        # Nothing to reduce; torch.cat below would fail on empty lists.
        return stack, output

    reduced = []  # one (entity_embedding, entity_text) pair per reduced sentence
    for idx in batch_reduce_idx:
        # Fresh initial states for both entity encoders (same call order as
        # before so the random stream is consumed identically).
        ent_f_h, ent_f_c = (utils.xavier_init(self.gpu_triger, 1, self.hidden_dim),
                            utils.xavier_init(self.gpu_triger, 1, self.hidden_dim))
        ent_b_h, ent_b_c = (utils.xavier_init(self.gpu_triger, 1, self.hidden_dim),
                            utils.xavier_init(self.gpu_triger, 1, self.hidden_dim))

        # Pop until the sentinel; `entity` therefore holds the tokens in
        # pop order, i.e. the most recently shifted token first.
        entity = []
        while stack[idx][-1][1] != '':
            _, word, tok_emb = stack[idx].pop()
            entity.append((tok_emb, word))

        # Fix: build the text from every popped word, in sentence order.
        # Previously the loop reused the last value of `word`, repeating a
        # single token once per entity token.
        ent = ' '.join(tok_word for _, tok_word in reversed(entity))

        # The order in which each encoder sees the tokens is kept exactly
        # as before (forward encoder: pop order; backward encoder: the
        # reverse) so the computation matches already-trained weights.
        for (fwd_emb, _), (bwd_emb, _) in zip(entity, reversed(entity)):
            ent_f_h, ent_f_c = self.entity_forward_lstm(fwd_emb, (ent_f_h, ent_f_c))
            ent_b_h, ent_b_c = self.entity_backward_lstm(bwd_emb, (ent_b_h, ent_b_c))

        entity_input = self.dropout(torch.cat([ent_b_h, ent_f_h], 1))
        entity_emb = self.entity_2_output(torch.cat([entity_input, batch_relation[idx]], 1))
        reduced.append((entity_emb, ent))

    # One batched output-LSTM step over all reduced sentences.
    lstm_in = torch.cat([entity_emb for entity_emb, _ in reduced])
    lstm_h = torch.cat([output[i][-1][0][0] for i in batch_reduce_idx], 0)
    lstm_c = torch.cat([output[i][-1][0][1] for i in batch_reduce_idx], 0)
    h, c = self.output_lstm(lstm_in, (lstm_h, lstm_c))
    h = self.dropout(h)

    # Row `row` of h/c belongs to sentence batch_reduce_idx[row].
    for row, sent_idx in enumerate(batch_reduce_idx):
        entity_emb, ent = reduced[row]
        output[sent_idx].append([(h[row].unsqueeze(0), c[row].unsqueeze(0)), ent, entity_emb])

    return stack, output
hidden) 284 | char_out = torch.chunk(hidden[0].squeeze(1), 2, 0) 285 | tok_rep = torch.cat( 286 | [word_embeds[sent_idx][word_idx].unsqueeze(0), char_out[0], char_out[1]], 1) 287 | elif self.char_structure == 'cnn': 288 | char, _ = self.conv1d(chars_embeds.unsqueeze(0).transpose(1, 2)).max( 289 | dim=2) # [batch_size, Embedding_sie, sentence_len] --> [batch_size, output_dim, sentence_len+padding_num*2 - kernel_num + 1] 290 | char = torch.tanh(char) 291 | tok_rep = torch.cat([word_embeds[sent_idx][word_idx].unsqueeze(0), char], 1) 292 | else: 293 | if sentence_array[sent_idx][word_idx] != 1: 294 | count_words += 1 295 | tok_rep = word_embeds[sent_idx][word_idx].unsqueeze(0) 296 | if token_embedding is None: 297 | token_embedding = tok_rep 298 | else: 299 | token_embedding = torch.cat([token_embedding, tok_rep], 0) 300 | 301 | sents_len.append(count_words) 302 | if token_embedds is None: 303 | token_embedds = token_embedding.unsqueeze(0) 304 | else: 305 | token_embedds = torch.cat([token_embedds, token_embedding.unsqueeze(0)], 0) 306 | 307 | tokens = token_embedds.transpose(0, 1) 308 | tok_output, hidden = self.lstm(tokens) # [max_len, batch_size, hidden_dim] 309 | tok_output = tok_output.transpose(0, 1) 310 | 311 | buffer = [[] for i in range(self.batch_size)] 312 | losses = [[] for i in range(self.batch_size)] 313 | right = [0 for i in range(self.batch_size)] 314 | predict_actions = [[] for i in range(self.batch_size)] 315 | output = [[[lstm_initial, ""]] for i in range(self.batch_size)] 316 | if self.mode == 'predict': 317 | action = [[[lstm_initial, ""]] for i in range(self.batch_size)] 318 | 319 | for idx in range(tok_output.size(0)): 320 | for word_idx in range(tok_output.size(1)): 321 | buffer[idx].append([tok_output[idx][word_idx].unsqueeze(0), token_embedds[idx][word_idx].unsqueeze(0), 322 | self.idx2word[sentence_array[idx][tok_output.size(1) - 1 - word_idx]]]) 323 | 324 | stack = [[[lstm_initial, ""]] for i in range(self.batch_size)] 325 | for act_idx in 
range(self.seq_length): 326 | batch_buffer = [b[-1] for b in buffer] 327 | if self.mode == 'train': 328 | if act_idx == 0: 329 | batch_action = [lstm_initial[0] for a in range(self.batch_size)] 330 | else: 331 | batch_action = [a[act_idx - 1].unsqueeze(0) for a in action_output] 332 | batch_relation = [r[act_idx].unsqueeze(0) for r in relation_embeds] 333 | elif self.mode == 'predict': 334 | batch_action = [a[-1] for a in action] 335 | batch_output = [o[-1] for o in output] 336 | batch_stack = [s[-1] for s in stack] 337 | 338 | have_action_batch_1 = [i for i in range(len(sents_len)) if sents_len[i] > 0] 339 | have_action_batch_2 = [i for i in range(len(batch_stack)) if batch_stack[i][1] != ''] 340 | have_action_batch = list(set(have_action_batch_1).union(set(have_action_batch_2))) 341 | 342 | if len(have_action_batch) > 0: 343 | batch_valid_actions = self.get_possible_actions_batch(batch_stack, sents_len, have_action_batch) 344 | if self.mode == 'train': 345 | batch_real_action = [ac[act_idx] for ac in actions.data] 346 | batch_pred, batch_loss = self.getloss_batch(have_action_batch, batch_buffer, batch_stack, 347 | batch_action, batch_output, batch_valid_actions, 348 | batch_real_action) 349 | batch_real_action = [self.idx2action[ac] for ac in batch_real_action] 350 | elif self.mode == 'predict': 351 | batch_pred, batch_loss = self.getloss_batch(have_action_batch, batch_buffer, batch_stack, 352 | batch_action, batch_output, batch_valid_actions) 353 | pred_action_tensor = utils.variable(torch.from_numpy(np.array(batch_pred)), self.gpu_triger) 354 | predict_actions_embed = self.dropout_e(self.action_embeds(pred_action_tensor)) 355 | ac_lstm_h, ac_lstm_c = self.action_lstm(predict_actions_embed, (torch.cat( 356 | [action[ac_idx][-1][0][0] for ac_idx in range(len(action)) if ac_idx in have_action_batch]), 357 | torch.cat( 358 | [action[ac_idx][-1][0][1] for 359 | ac_idx in range(len(action)) if 360 | ac_idx in have_action_batch]))) 361 | 362 | i = 0 363 | for 
batch_idx in range(self.batch_size): 364 | if batch_idx in have_action_batch: 365 | predict_actions[batch_idx].append(batch_pred[i]) 366 | if self.mode == 'train': 367 | losses[batch_idx].append(batch_loss[i]) 368 | elif self.mode == 'predict': 369 | action[batch_idx].append([(ac_lstm_h[i].unsqueeze(0), ac_lstm_c[i].unsqueeze(0)), 370 | self.idx2action[batch_pred[i]]]) 371 | i += 1 372 | else: 373 | if self.mode == 'predict': 374 | action[batch_idx].append([lstm_initial, ""]) 375 | 376 | if self.mode == 'predict': 377 | batch_real_action = [ac[-1][1] for ac in action] 378 | relation_embeds = self.dropout_e(self.relation_embeds( 379 | utils.variable(torch.from_numpy(np.array([self.action2idx[a] for a in batch_real_action])), 380 | self.gpu_triger))) 381 | batch_relation = [relation_embed.unsqueeze(0) for relation_embed in relation_embeds] 382 | 383 | batch_shift_idx = [idx for idx in range(len(batch_real_action)) if 384 | batch_real_action[idx].startswith('S')] 385 | batch_out_idx = [idx for idx in range(len(batch_real_action)) if batch_real_action[idx].startswith('O')] 386 | batch_reduce_idx = [idx for idx in range(len(batch_real_action)) if 387 | batch_real_action[idx].startswith('R')] 388 | 389 | # batch_relation = [batch_relation[i] for i in batch_reduce_idx] 390 | if len(batch_shift_idx) > 0: 391 | buffer, stack = self.batch_shift_out('S', buffer, stack, batch_shift_idx) 392 | for i in range(len(sents_len)): 393 | if i in batch_shift_idx: 394 | sents_len[i] -= 1 395 | if len(batch_out_idx) > 0: 396 | buffer, output = self.batch_shift_out('O', buffer, output, batch_out_idx) 397 | for i in range(len(sents_len)): 398 | if i in batch_out_idx: 399 | sents_len[i] -= 1 400 | if len(batch_reduce_idx) > 0: 401 | stack, output = self.batch_reduce(stack, output, batch_relation, batch_reduce_idx) 402 | loss = 0 403 | if self.mode == 'train': 404 | for idx in range(self.batch_size): 405 | loss += -torch.sum(torch.cat(losses[idx])) 406 | 407 | return loss, predict_actions 
def calc_score(ner_model, dataset_loader, if_cuda):
    """Return the action-level accuracy of ``ner_model`` on a dataset.

    The model is switched to evaluation mode, run on every batch with the
    gold actions supplied, and each predicted action is compared with the
    gold action at the same position.

    Args:
        ner_model: model exposing ``eval()`` and
            ``forward(features, actions)`` returning ``(loss, predictions)``.
        dataset_loader: iterable of iterables yielding
            ``(feature, label, action)`` triples.
        if_cuda: forwarded to ``utils.repack_vb``.

    Returns:
        float: number of matching actions divided by the number of
        predicted actions, or ``0.0`` when nothing was predicted.
    """
    ner_model.eval()
    correct = 0
    total_act = 0
    for feature, label, action in itertools.chain.from_iterable(dataset_loader):
        fea_v, _, ac_v = utils.repack_vb(if_cuda, feature, label, action)
        _, pre_action = ner_model.forward(fea_v, ac_v)
        # NOTE(review): assumes pre_action is a flat sequence aligned with
        # the squeezed gold actions -- confirm for the batched model.
        gold = ac_v.squeeze(0).data
        for idx, predicted in enumerate(pre_action):
            if predicted == gold[idx]:
                correct += 1
        total_act += len(pre_action)

    # Guard the empty case, like the denominators in calc_f1_score.
    if total_act == 0:
        return 0.0
    return correct / float(total_act)
def to_entity(real_action, predict_action, idx2action):
    """Count gold, predicted and correctly predicted entities.

    Both action sequences are replayed.  Actions whose name starts with
    'S' (shift) or 'O' (out) each consume one word; an action whose name
    starts with 'R' (reduce) closes the entity opened by the preceding
    shifts.  Entities are identified by their *word* span plus the reduce
    action name, so a gold and a predicted entity match exactly when they
    cover the same words with the same label.

    Spans used to be keyed by action index.  Because a reduce consumes no
    word, one missing or spurious entity shifted the action indices of
    every later entity and made correct predictions count as errors.

    Args:
        real_action: sequence of gold action ids.
        predict_action: sequence of predicted action ids.
        idx2action: mapping from action id to action name.

    Returns:
        tuple: ``(num_gold_entities, num_predicted_entities, num_correct)``.

    Raises:
        KeyError: if an action id is missing from ``idx2action``.
    """
    entity_lists = []
    for action_seq in (real_action, predict_action):
        entities = []
        start_word = -1  # first word of the open entity, -1 when none is open
        next_word = 0    # index of the next word to be consumed
        for action_id in action_seq:
            name = idx2action[action_id]
            if name.startswith('S'):
                if start_word < 0:
                    start_word = next_word
                next_word += 1
            elif name.startswith('O'):
                # An OUT discards any entity that was left open.
                start_word = -1
                next_word += 1
            elif name.startswith('R') and start_word >= 0:
                entities.append('%d-%d%s' % (start_word, next_word - 1, name))
                start_word = -1
        entity_lists.append(entities)

    gold, predicted = entity_lists
    return len(gold), len(predicted), len(set(gold) & set(predicted))
class StackRNN(object):
    """A stack whose entries carry the recurrent state after each push.

    ``self.s`` is a list of ``(state, extra)`` pairs.  The first pair holds
    the initial state and is not counted by ``len()``.
    """

    def __init__(self, cell, initial_state, dropout, get_output, p_empty_embedding=None):
        """Store the collaborators and seed the stack with ``initial_state``.

        Args:
            cell: callable ``cell(expr, state) -> new_state``.
            initial_state: state stored in the bottom entry.
            dropout: callable applied to the top state's first element on push.
            get_output: callable turning a state into the stack's embedding.
            p_empty_embedding: value returned by ``embedding()`` while the
                stack is empty; defaults to ``None``.
        """
        self.cell = cell
        self.dropout = dropout
        self.get_output = get_output
        self.empty = p_empty_embedding if p_empty_embedding is not None else None
        self.s = [(initial_state, None)]

    def __len__(self):
        """Return the number of pushed entries (the bottom entry is excluded)."""
        return len(self.s) - 1

    def push(self, expr, extra=None):
        """Advance the state with ``expr`` and push it together with ``extra``."""
        top_state = self.s[-1][0]
        # NOTE(review): the value returned by dropout is discarded; the call
        # is kept only because the original performs it.
        self.dropout(top_state[0])
        new_state = self.cell(expr, top_state)
        self.s.append((new_state, extra))

    def pop(self):
        """Remove the top entry and return its ``extra`` payload."""
        _, extra = self.s.pop()
        return extra

    def embedding(self):
        """Return the output of the top state, or ``self.empty`` when empty."""
        if len(self.s) > 1:
            return self.get_output(self.s[-1][0])
        return self.empty

    def back_to_init(self):
        """Pop entries until ``len(self)`` is zero."""
        while len(self) > 0:
            self.pop()

    def clear(self):
        """Reverse the entry list in place, then pop down to one entry.

        NOTE(review): because of the reversal, the entry that survives is
        the one that was on top before the call, not the initial one.
        """
        self.s[:] = self.s[::-1]
        self.back_to_init()
78 | self.pad_char_embeds = nn.Parameter(torch.zeros(1, char_hidden_dim )) 79 | self.unk_char_embeds = nn.Parameter(torch.randn(1, char_hidden_dim), requires_grad=True) 80 | self.conv1d = nn.Conv1d(char_embedding_dim, char_hidden_dim, 3, padding=2) 81 | else: 82 | self.tok_embedding_dim = self.embedding_dim 83 | 84 | self.buffer_lstm = nn.LSTMCell(self.tok_embedding_dim, hidden_dim) 85 | self.stack_lstm = nn.LSTMCell(self.tok_embedding_dim, hidden_dim) 86 | self.action_lstm = nn.LSTMCell(action_embedding_dim, hidden_dim) 87 | self.output_lstm = nn.LSTMCell(self.tok_embedding_dim, hidden_dim) 88 | self.entity_forward_lstm = nn.LSTMCell(self.tok_embedding_dim, hidden_dim) 89 | self.entity_backward_lstm = nn.LSTMCell(self.tok_embedding_dim, hidden_dim) 90 | 91 | self.rnn_layers = rnn_layers 92 | 93 | self.dropout_e = nn.Dropout(p=dropout_ratio) 94 | self.dropout = nn.Dropout(p=dropout_ratio) 95 | 96 | self.init_buffer = utils.xavier_init(self.gpu_triger,1,hidden_dim) 97 | self.empty_emb = nn.Parameter(torch.randn(1, hidden_dim)) 98 | self.lstms_output_2_softmax = nn.Linear(hidden_dim * 4, hidden_dim) 99 | self.output_2_act = nn.Linear(hidden_dim, len(ner_map)+2) 100 | self.entity_2_output = nn.Linear(hidden_dim*2 + action_embedding_dim, self.tok_embedding_dim) 101 | 102 | self.lstm_initial = ( 103 | utils.xavier_init(self.gpu_triger, 1, self.hidden_dim), utils.xavier_init(self.gpu_triger, 1, self.hidden_dim)) 104 | 105 | self.batch_size = 1 106 | self.seq_length = 1 107 | 108 | def _rnn_get_output(self, state): 109 | return state[0] 110 | 111 | def get_possible_actions(self, stack, buffer): 112 | valid_actions = [] 113 | if len(buffer) > 0: 114 | valid_actions.append(self.action2idx["SHIFT"]) 115 | if len(stack) > 0: 116 | valid_actions += [self.action2idx[ner_action] for ner_action in self.ner_map.keys()] 117 | else: 118 | valid_actions.append(self.action2idx["OUT"]) 119 | return valid_actions 120 | 121 | def rand_init_hidden(self): 122 | 123 | if self.gpu_triger is 
def set_seq_size(self, sentence):
    """Record sizes for an unbatched input.

    ``seq_length`` becomes the size of dimension 0 of ``sentence`` and
    ``batch_size`` is fixed to 1.
    """
    self.seq_length = sentence.size()[0]
    self.batch_size = 1


def set_batch_seq_size(self, sentence):
    """Record sizes for a batched input.

    ``seq_length`` is taken from dimension 1 of ``sentence`` and
    ``batch_size`` from dimension 0.
    """
    dims = sentence.size()
    self.seq_length, self.batch_size = dims[1], dims[0]


def load_pretrained_embedding(self, pre_embeddings):
    """Replace the word-embedding weight with ``pre_embeddings``.

    The second dimension of ``pre_embeddings`` must equal
    ``self.embedding_dim`` (checked with an ``assert``).
    """
    width = pre_embeddings.size()[1]
    assert (width == self.embedding_dim)
    self.word_embeds.weight = nn.Parameter(pre_embeddings)


def rand_init(self, init_word_embedding=False, init_action_embedding=True, init_relation_embedding=True):
    """Re-initialise the model's parameters through the ``utils.init_*`` helpers.

    Args:
        init_word_embedding: also re-initialise ``self.word_embeds``.
        init_action_embedding: also re-initialise ``self.action_embeds``.
        init_relation_embedding: also re-initialise ``self.relation_embeds``.

    Character embeddings (and the character bi-LSTM when
    ``char_structure == 'lstm'``) are touched only when
    ``self.use_spelling`` is set.  The three linear layers and the six
    LSTM cells are always re-initialised, in the order listed below.
    """
    optional_tables = (
        (init_word_embedding, 'word_embeds'),
        (init_action_embedding, 'action_embeds'),
        (init_relation_embedding, 'relation_embeds'),
    )
    for wanted, attr_name in optional_tables:
        if wanted:
            utils.init_embedding(getattr(self, attr_name).weight)

    if self.use_spelling:
        utils.init_embedding(self.char_embeds.weight)
        if self.char_structure == 'lstm':
            utils.init_lstm(self.char_bi_lstm)

    for linear in (self.lstms_output_2_softmax, self.output_2_act, self.entity_2_output):
        utils.init_linear(linear)

    # Order is kept as-is so any random draws happen in the same sequence.
    for cell in (self.buffer_lstm, self.action_lstm, self.stack_lstm,
                 self.output_lstm, self.entity_forward_lstm, self.entity_backward_lstm):
        utils.init_lstm_cell(cell)
def forward(self, sentence, actions=None, hidden=None):
    """Run the transition system over one sentence.

    When ``actions`` is given ('train' mode) the supplied action sequence
    is followed and the log-probability of each supplied action is
    collected.  When it is ``None`` ('predict' mode) the highest-scoring
    legal action is applied at every step.

    Args:
        sentence: word-index tensor; dimension 0 is squeezed away after
            embedding lookup.
            # presumably shaped (1, seq_len) -- TODO confirm with caller
        actions: action-index tensor to follow, or ``None`` to decode.
        hidden: state passed to ``self.char_bi_lstm`` for the first word.
            NOTE(review): the state returned for one word is fed to the
            next word's call as well -- confirm this carry-over is wanted.

    Returns:
        ``(loss, predict_actions)``.  ``loss`` is the negated sum of the
        collected log-probabilities, or the int ``-1`` when none were
        collected.  ``predict_actions`` is a one-element list holding the
        argmax action index of every step that was actually scored.
    """
    mode = 'predict' if actions is None else 'train'

    self.set_seq_size(sentence)
    word_embeds = self.dropout_e(self.word_embeds(sentence))
    word_embeds = word_embeds.squeeze(0)
    if mode == 'train':
        action_embeds = self.dropout_e(self.action_embeds(actions))
        action_embeds = action_embeds.squeeze(0)
        relation_embeds = self.dropout_e(self.relation_embeds(actions))
        relation_embeds = relation_embeds.squeeze(0)
        actions = actions.squeeze(0)

    sentence = sentence.squeeze(0)
    action_count = 0

    def new_stack(cell):
        # Every structure shares the same initial state, dropout and
        # empty-embedding; only the LSTM cell differs.
        return StackRNN(cell, self.lstm_initial, self.dropout, self._rnn_get_output, self.empty_emb)

    buffer = new_stack(self.buffer_lstm)
    stack = new_stack(self.stack_lstm)
    action = new_stack(self.action_lstm)
    output = new_stack(self.output_lstm)
    ent_f = new_stack(self.entity_forward_lstm)
    ent_b = new_stack(self.entity_backward_lstm)

    predict_actions = []
    pre_actions = []
    losses = []

    sentence_array = sentence.data.tolist()
    token_embedding = list()

    # Build one representation per token.  Word index 0 gets the learned
    # ``unk_char_embeds`` as its character part; word index 1 is never
    # concatenated (except at position 0) -- presumably padding, confirm.
    for word_idx in range(len(sentence_array)):
        if self.use_spelling:
            if sentence_array[word_idx] == 0:
                tok_rep = torch.cat([word_embeds[word_idx].unsqueeze(0), self.unk_char_embeds], 1)
            elif sentence_array[word_idx] != 1:
                word = sentence_array[word_idx]
                chars_in_word = [self.char2idx[char] for char in self.idx2word[word]]
                chars_Tensor = utils.variable(torch.from_numpy(np.array(chars_in_word)), self.gpu_triger)
                chars_embeds = self.dropout_e(self.char_embeds(chars_Tensor.unsqueeze(0)))
                if self.char_structure == 'lstm':
                    char_o, hidden = self.char_bi_lstm(chars_embeds.transpose(0, 1), hidden)
                    # Split the final hidden state into its two halves along dim 0.
                    char_out = torch.chunk(hidden[0].squeeze(1), 2, 0)
                    tok_rep = torch.cat([word_embeds[word_idx].unsqueeze(0), char_out[0], char_out[1]], 1)
                elif self.char_structure == 'cnn':
                    char = chars_embeds.unsqueeze(0)
                    char = char.transpose(1, 2)
                    char, _ = self.conv1d(char).max(dim=2)
                    char = torch.tanh(char)
                    tok_rep = torch.cat([word_embeds[word_idx].unsqueeze(0), char], 1)
        else:
            tok_rep = word_embeds[word_idx].unsqueeze(0)
        if word_idx == 0:
            token_embedding = tok_rep
        elif sentence_array[word_idx] != 1:
            token_embedding = torch.cat([token_embedding, tok_rep], 0)

    # Push tokens last-to-first so the first token ends up on top of the buffer.
    token_total = token_embedding.size()[0]
    for i in range(token_total):
        position = token_total - 1 - i
        tok_embed = token_embedding[position].unsqueeze(0)
        tok = sentence.data[position]
        buffer.push(tok_embed, (tok_embed, self.idx2word[tok]))

    while len(buffer) > 0 or len(stack) > 0:
        valid_actions = self.get_possible_actions(stack, buffer)
        log_probs = None
        if len(valid_actions) > 1:
            lstms_output = torch.cat([buffer.embedding(), stack.embedding(), output.embedding(), action.embedding()], 1)
            hidden_output = torch.tanh(self.lstms_output_2_softmax(self.dropout(lstms_output)))
            action_index = torch.autograd.Variable(torch.LongTensor(valid_actions))
            if self.gpu_triger is True:
                action_index = action_index.cuda()
            # Score only the legal actions.
            logits = self.output_2_act(hidden_output)[0][action_index]
            valid_action_tbl = {a: i for i, a in enumerate(valid_actions)}
            log_probs = torch.nn.functional.log_softmax(logits, dim=0)
            # NOTE(review): this indexing chain follows the tensor semantics
            # of the PyTorch release named in the README; re-check on upgrade.
            action_idx = torch.max(log_probs.cpu(), 0)[1][0].data.numpy()[0]
            action_predict = valid_actions[action_idx]
            pre_actions.append(action_predict)
        elif valid_actions:
            # Exactly one legal action: nothing to score, so take it.
            # Previously ``action_predict`` kept its value from the prior
            # step (or was undefined), which in predict mode replayed the
            # old action and could trip the buffer assertion below.
            # NOTE(review): forced actions are still not recorded in
            # ``pre_actions``, matching the old return value -- confirm
            # whether consumers expect them.
            action_predict = valid_actions[0]

        if mode == 'train' and log_probs is not None:
            losses.append(log_probs[valid_action_tbl[actions.data[action_count]]])

        if mode == 'train':
            real_action = self.idx2action[actions.data[action_count]]
            act_embedding = action_embeds[action_count].unsqueeze(0)
            rel_embedding = relation_embeds[action_count].unsqueeze(0)
        elif mode == 'predict':
            real_action = self.idx2action[action_predict]
            action_predict_tensor = utils.variable(torch.from_numpy(np.array([action_predict])), self.gpu_triger)
            action_embeds = self.dropout_e(self.action_embeds(action_predict_tensor))
            relation_embeds = self.dropout_e(self.relation_embeds(action_predict_tensor))
            act_embedding = action_embeds[0].unsqueeze(0)
            rel_embedding = relation_embeds[0].unsqueeze(0)

        action.push(act_embedding, (act_embedding, real_action))
        # Actions are dispatched on the first letter of their name.
        if real_action.startswith('S'):
            # Move the next buffer token onto the stack.
            assert len(buffer) > 0
            tok_buffer_embedding, buffer_token = buffer.pop()
            stack.push(tok_buffer_embedding, (tok_buffer_embedding, buffer_token))
        elif real_action.startswith('O'):
            # Move the next buffer token straight to the output.
            assert len(buffer) > 0
            tok_buffer_embedding, buffer_token = buffer.pop()
            output.push(tok_buffer_embedding, (tok_buffer_embedding, buffer_token))
        elif real_action.startswith('R'):
            # Empty the stack and compose its tokens into one output item.
            ent = ''
            entity = []
            assert len(stack) > 0
            while len(stack) > 0:
                tok_stack_embedding, stack_token = stack.pop()
                entity.append([tok_stack_embedding, stack_token])
            if len(entity) > 1:
                for i in range(len(entity)):
                    mirrored = entity[len(entity) - i - 1]
                    ent_f.push(entity[i][0], (entity[i][0], entity[i][1]))
                    ent_b.push(mirrored[0], (mirrored[0], mirrored[1]))
                    ent += entity[i][1]
                    ent += ' '
            else:
                ent_f.push(entity[0][0], (entity[0][0], entity[0][1]))
                ent_b.push(entity[0][0], (entity[0][0], entity[0][1]))
                ent = entity[0][1]
            entity_input = self.dropout(torch.cat([ent_f.embedding(), ent_b.embedding()], 1))
            ent_f.clear()
            ent_b.clear()
            output_input = self.entity_2_output(torch.cat([entity_input, rel_embedding], 1))
            output.push(output_input, (entity_input, ent))
        action_count += 1

    if len(losses) > 0:
        loss = -torch.sum(torch.cat(losses))
    else:
        # No step was scored; callers receive a plain int here.
        loss = -1
    predict_actions.append(pre_actions)

    return loss, predict_actions
class TransitionDataset_P(Dataset):
    """Dataset over a single indexable collection of inputs."""

    def __init__(self, data_tensor):
        self.data_tensor = data_tensor

    def __len__(self):
        return len(self.data_tensor)

    def __getitem__(self, index):
        return self.data_tensor[index]


class TransitionDataset(Dataset):
    """Dataset yielding ``(data, label, action)`` triples by position.

    The length reported is that of ``data_tensor``; the three containers
    are not checked against each other.
    """

    def __init__(self, data_tensor, label_tensor, action_tensor):
        self.data_tensor = data_tensor
        self.label_tensor = label_tensor
        self.action_tensor = action_tensor

    def __len__(self):
        return len(self.data_tensor)

    def __getitem__(self, index):
        item = (self.data_tensor[index], self.label_tensor[index], self.action_tensor[index])
        return item


# Use ``itertools.izip`` where it exists, otherwise the builtin ``zip``.
zip = getattr(itertools, 'izip', zip)


def variable(tensor, gpu):
    """Wrap ``tensor`` in an autograd ``Variable``; move it to CUDA when ``gpu`` is truthy."""
    wrapped = torch.autograd.Variable(tensor)
    return wrapped.cuda() if gpu else wrapped
def xavier_init(gpu, *size):
    """Return a float Variable of the given size filled by Xavier-normal init.

    The tensor is allocated uninitialised, wrapped by ``variable`` (which
    moves it to CUDA when ``gpu`` is truthy) and then filled in place.
    """
    # Use the underscore-suffixed initializer when the installed PyTorch
    # provides it, otherwise the plain name the file was written against.
    init_fn = getattr(nn.init, 'xavier_normal_', None) or nn.init.xavier_normal
    return init_fn(variable(torch.FloatTensor(*size), gpu))


def init_varaible_zero(gpu, *size):
    """Return an all-zero Variable of the given size (on CUDA when ``gpu`` is truthy)."""
    return variable(torch.zeros(*size), gpu)


def to_scalar(var):
    """Return the first element of ``var`` (flattened) as a Python number."""
    return var.view(-1).data.tolist()[0]


def argmax(vec):
    """Return the index of the maximum along dimension 1, as a Python int.

    Only the first entry of the resulting index tensor is returned.
    """
    _, idx = torch.max(vec, 1)
    return to_scalar(idx)


def log_sum_exp(vec, m_size):
    """Compute ``log(sum(exp(vec)))`` over dimension 1, stabilised by the max.

    Args:
        vec: tensor that is reduced over dimension 1 and whose last
            dimension has size ``m_size``.
        m_size: size of the last dimension.

    Returns:
        tensor viewed as ``(-1, m_size)``.
    """
    _, idx = torch.max(vec, 1)
    max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size)
    shifted = torch.exp(vec - max_score.expand_as(vec))
    return max_score.view(-1, m_size) + torch.log(torch.sum(shifted, 1)).view(-1, m_size)


def encode2char_safe(input_lines, char_dict):
    """Map every character of every word to its index, with a fallback.

    Characters missing from ``char_dict`` receive the index stored under
    the fallback key.
    """
    # NOTE(review): the fallback key appears as an empty string in this
    # view of the file; presumably a stripped placeholder token -- confirm.
    unk = char_dict['']
    return [[[char_dict.get(t, unk) for t in m] for m in line] for line in input_lines]


def encode_safe_predict(input_lines, word_dict, unk):
    """Map words to indices, using ``unk`` for words missing from ``word_dict``."""
    return [[word_dict.get(m, unk) for m in t] for t in input_lines]


def encode_safe(input_lines, word_dict, unk, singleton, singleton_rate):
    """Map words to indices, randomly replacing singleton words by ``unk``.

    Args:
        input_lines: list of sentences, each a list of words.
        word_dict: word -> index mapping.
        unk: index used for unknown words.
        singleton: collection of words eligible for random replacement.
        singleton_rate: replacement probability; with a value ``<= 0`` no
            random draw is made at all.

    Returns:
        list of lists of indices, parallel to ``input_lines``.
    """
    if not singleton_rate > 0:
        return encode_safe_predict(input_lines, word_dict, unk)

    # Build the lookup once: membership on the original list is linear per token.
    rare_words = set(singleton)
    lines = []
    for sentence in input_lines:
        line = []
        for word in sentence:
            # The random draw happens only for singleton words.
            if word in rare_words and torch.rand(1).numpy()[0] < singleton_rate:
                line.append(unk)
            else:
                line.append(word_dict.get(word, unk))
        lines.append(line)
    return lines


def encode(input_lines, word_dict):
    """Map words to indices; raises ``KeyError`` for a word not in ``word_dict``."""
    return [[word_dict[m] for m in t] for t in input_lines]


def shrink_features(feature_map, features, thresholds):
    """Rebuild ``feature_map`` keeping only features seen at least ``thresholds`` times.

    Kept features are numbered from 1 in the iteration order of the
    original ``feature_map``.  Raises ``KeyError`` if ``features``
    contains an item that is not a key of ``feature_map``.
    """
    feature_count = {k: 0 for k in feature_map}
    for feature_list in features:
        for feature in feature_list:
            feature_count[feature] += 1
    kept = [k for k, v in feature_count.items() if v >= thresholds]
    feature_map = {k: ind + 1 for ind, k in enumerate(kept)}

    # NOTE(review): both special keys below appear as empty strings in this
    # view of the file (so the second assignment overwrites the first);
    # presumably two distinct placeholder tokens were stripped -- confirm.
    #inserting unk to be 0 encoded
    feature_map[''] = 0
    #inserting eof
    feature_map[''] = len(feature_map)
    return feature_map
def generate_corpus(lines: object, word_count, use_spelling, if_shrink_feature: object = False, thresholds: object = 1) -> object:
    """Parse tagged lines into sentences, gold action sequences and vocabularies.

    Each non-blank line is split on whitespace; the first field is the
    word and the last field is the tag.  Blank lines and lines starting
    with ``-DOCSTART-`` end the current sentence.  A tag containing ``-``
    produces ``SHIFT`` and opens (or continues) an entity whose
    ``REDUCE-<type>`` action is emitted when the entity ends; any other
    tag produces ``OUT``.

    Args:
        lines: iterable of text lines.
        word_count: dict updated in place with per-word counts.
        use_spelling: build a character map when true, else return ``None`` for it.
        if_shrink_feature: rebuild the word map via ``shrink_features``.
        thresholds: minimum count passed to ``shrink_features``.

    Returns:
        ``(features, labels, actions, feature_map, label_map, action_map,
        ner_map, singleton, char_map)`` where ``singleton`` lists the words
        whose count in ``word_count`` is exactly 1.
    """
    feature_map = dict()
    if use_spelling:
        # NOTE(review): all four keys appear as empty strings in this view
        # of the file; presumably placeholder tokens were stripped -- confirm.
        char_map = {"": 0, "": 1, "": 2, "": 3}
    else:
        char_map = None
    label_map = dict()
    action_map = {"OUT": 0, "SHIFT": 1}
    ner_map = dict()

    features, labels, actions = [], [], []
    tmp_fl, tmp_ll, tmp_al = [], [], []
    count_ner = 0
    ner_label = ""

    # A blank sentinel line is appended so the last sentence is closed by
    # the same code path as every other one.  Previously an input without
    # a trailing blank line lost the final entity's REDUCE action.
    for line in itertools.chain(lines, ("\n",)):
        is_docstart = len(line) > 10 and line[0:10] == '-DOCSTART-'
        # ``line.strip()`` also treats an empty string as blank, which used
        # to fall through and fail on ``line[0]``.
        if line.strip() and not is_docstart:
            fields = line.rstrip('\n').split()
            word, tag = fields[0], fields[-1]
            tmp_fl.append(word)
            word_count[word] = word_count.get(word, 0) + 1
            if use_spelling:
                for char in word:
                    if char not in char_map:
                        char_map[char] = len(char_map)
            tmp_ll.append(tag)

            if word not in feature_map:
                feature_map[word] = len(feature_map) + 1  # 0 is for unk
            if tag not in label_map:
                label_map[tag] = len(label_map)

            tag_parts = tag.split('-')
            if len(tag_parts) > 1:
                # A "B" tag directly after another entity closes that entity first.
                if tag_parts[0] == "B" and ner_label != "":
                    tmp_al.append(ner_label)
                    count_ner += 1
                ner_label = "REDUCE-" + tag_parts[1]
                if ner_label not in action_map:
                    ner_map[ner_label] = len(ner_map)
                    action_map[ner_label] = len(action_map)
                tmp_al.append("SHIFT")
            else:
                if ner_label != "":
                    tmp_al.append(ner_label)
                    count_ner += 1
                    ner_label = ""
                tmp_al.append("OUT")

        elif len(tmp_fl) > 0:
            # Sentence boundary: close a still-open entity, then store the sentence.
            if ner_label != "":
                tmp_al.append(ner_label)
                count_ner += 1
                ner_label = ""
            assert len(tmp_ll) == len(tmp_fl)
            assert len(tmp_al) == len(tmp_fl) + count_ner
            features.append(tmp_fl)
            labels.append(tmp_ll)
            actions.append(tmp_al)
            count_ner = 0
            tmp_fl, tmp_ll, tmp_al = [], [], []

    if if_shrink_feature:
        feature_map = shrink_features(feature_map, features, thresholds)
    else:
        # NOTE(review): the special keys here and below appear as empty
        # strings in this view of the file -- confirm the intended tokens.
        #inserting unk to be 0 encoded
        feature_map[''] = 0
        #inserting eof
        feature_map[''] = len(feature_map)
    action_map[''] = len(action_map)
    label_map[''] = len(label_map)

    singleton = [k for k, v in word_count.items() if v == 1]

    return features, labels, actions, feature_map, label_map, action_map, ner_map, singleton, char_map
def read_corpus_ner(lines, word_count):
    """Parse tagged lines into sentences and their gold action sequences.

    Each non-blank line is split on whitespace; the first field is the
    word and the last field is the tag.  Blank lines and lines starting
    with ``-DOCSTART-`` end the current sentence.  A tag containing ``-``
    produces ``SHIFT`` and opens (or continues) an entity whose
    ``REDUCE-<type>`` action is emitted when the entity ends; any other
    tag produces ``OUT``.

    Args:
        lines: iterable of text lines.
        word_count: dict updated in place with per-word counts.

    Returns:
        ``(features, labels, actions, word_count)``.
    """
    features, labels, actions = [], [], []
    tmp_fl, tmp_ll, tmp_al = [], [], []
    count_ner = 0
    ner_label = ""

    # A blank sentinel line is appended so the last sentence is closed by
    # the same code path as every other one.  Previously an input without
    # a trailing blank line lost the final entity's REDUCE action.
    for line in itertools.chain(lines, ("\n",)):
        is_docstart = len(line) > 10 and line[0:10] == '-DOCSTART-'
        # ``line.strip()`` also treats an empty string as blank, which used
        # to fall through and fail on ``line[0]``.
        if line.strip() and not is_docstart:
            fields = line.rstrip('\n').split()
            word, tag = fields[0], fields[-1]
            tmp_fl.append(word)
            word_count[word] = word_count.get(word, 0) + 1
            tmp_ll.append(tag)

            tag_parts = tag.split('-')
            if len(tag_parts) > 1:
                # A "B" tag directly after another entity closes that entity first.
                if tag_parts[0] == "B" and ner_label != "":
                    tmp_al.append(ner_label)
                    count_ner += 1
                ner_label = "REDUCE-" + tag_parts[1]
                tmp_al.append("SHIFT")
            else:
                if ner_label != "":
                    tmp_al.append(ner_label)
                    count_ner += 1
                    ner_label = ""
                tmp_al.append("OUT")

        elif len(tmp_fl) > 0:
            # Sentence boundary: close a still-open entity, then store the sentence.
            if ner_label != "":
                tmp_al.append(ner_label)
                count_ner += 1
                ner_label = ""
            assert len(tmp_ll) == len(tmp_fl)
            assert len(tmp_al) == len(tmp_fl) + count_ner
            features.append(tmp_fl)
            labels.append(tmp_ll)
            actions.append(tmp_al)
            count_ner = 0
            tmp_fl, tmp_ll, tmp_al = [], [], []

    return features, labels, actions, word_count
def read_corpus_predict(lines):
    """Split every line into whitespace-separated tokens (one token list per line)."""
    return [line.rstrip('\n').split() for line in lines]


def shrink_embedding(feature_map, word_dict, word_embedding, caseless):
    """Keep only the embedding rows whose word occurs in ``feature_map``.

    Args:
        feature_map: mapping whose keys are the words to keep.
        word_dict: word -> row index into ``word_embedding``.
        word_embedding: tensor indexed by row.
        caseless: compare against lower-cased ``feature_map`` keys.

    Returns:
        ``(new_word_dict, new_embedding)`` with rows renumbered from 0 in
        the iteration order of ``word_dict``.
    """
    if caseless:
        feature_map = {k.lower() for k in feature_map.keys()}
    new_word_list = [k for k in word_dict.keys() if (k in feature_map)]
    new_word_dict = {k: v for (v, k) in enumerate(new_word_list)}
    new_word_list_ind = torch.LongTensor([word_dict[k] for k in new_word_list])
    new_embedding = word_embedding[new_word_list_ind]
    return new_word_dict, new_embedding


def load_embedding_wlm(emb_file, delimiter, feature_map, full_feature_set, caseless, unk, emb_len, shrink_to_train=False, shrink_to_corpus=False):
    """Load pre-trained vectors from a text file and build a word dictionary.

    Rows for the words of ``feature_map`` come first (randomly initialised
    via ``init_embedding`` and overwritten when the file has a vector),
    followed by file words found only in ``full_feature_set`` and, unless
    ``shrink_to_corpus`` is set, by all remaining file words.

    Args:
        emb_file: path of the embedding text file.
        delimiter: field separator within a line.
        feature_map: words that always receive a row.
        full_feature_set: wider word collection.
        caseless: lower-case both word collections before matching.
        unk: the token that names the unknown-word vector in the file.
        emb_len: expected vector length.
        shrink_to_train: ignore file words outside ``feature_map``.
        shrink_to_corpus: drop file words outside both collections.

    Returns:
        ``(word_dict, embedding_tensor)``.
    """
    if caseless:
        feature_set = {key.lower() for key in feature_map}
        full_feature_set = {key.lower() for key in full_feature_set}
    else:
        feature_set = set(feature_map)
        full_feature_set = set(full_feature_set)

    # Reserve index 0 for the special key.
    # NOTE(review): that key appears as an empty string in this view of the
    # file; presumably a stripped placeholder token -- confirm.
    word_dict = {v: (k + 1) for (k, v) in enumerate(feature_set - set(['']))}
    word_dict[''] = 0

    in_doc_freq_num = len(word_dict)
    rand_embedding_tensor = torch.FloatTensor(in_doc_freq_num, emb_len)
    init_embedding(rand_embedding_tensor)

    indoc_embedding_array = []
    indoc_word_array = []
    outdoc_embedding_array = []
    outdoc_word_array = []

    # The file is opened in a ``with`` block so the handle is always closed.
    with open(emb_file, 'r') as emb_handle:
        for line in emb_handle:
            line = line.split(delimiter)
            if len(line) <= 2:
                # Too short to hold a word plus a vector; skip the line.
                continue
            vector = [float(t) for t in line[1:] if t and not t.isspace()]

            if shrink_to_train and line[0] not in feature_set:
                continue

            if line[0] == unk:
                rand_embedding_tensor[0] = torch.FloatTensor(vector)  # unk is 0
            elif line[0] in word_dict:
                rand_embedding_tensor[word_dict[line[0]]] = torch.FloatTensor(vector)
            elif line[0] in full_feature_set:
                indoc_embedding_array.append(vector)
                indoc_word_array.append(line[0])
            elif not shrink_to_corpus:
                outdoc_word_array.append(line[0])
                outdoc_embedding_array.append(vector)

    # Only non-empty groups are concatenated; an empty group used to fail
    # on ``size(1)`` before any tensor could be built.
    pieces = [rand_embedding_tensor]
    if indoc_embedding_array:
        embedding_tensor_0 = torch.FloatTensor(np.asarray(indoc_embedding_array))
        assert (embedding_tensor_0.size(1) == emb_len)
        pieces.append(embedding_tensor_0)
    if not shrink_to_corpus and outdoc_embedding_array:
        pieces.append(torch.FloatTensor(np.asarray(outdoc_embedding_array)))
    embedding_tensor = torch.cat(pieces, 0)

    for word in indoc_word_array:
        word_dict[word] = len(word_dict)
    if not shrink_to_corpus:
        for word in outdoc_word_array:
            word_dict[word] = len(word_dict)

    return word_dict, embedding_tensor
def calc_threshold_mean(features):
    """Derive four length thresholds from a list of sequences.

    Every length is counted as ``len(sequence) + 1``.  The result is
    ``[mean of lengths below the mean, mean, mean of lengths at or above
    the mean, maximum]``, each mean truncated to ``int``.

    When no length lies below the overall mean (for example a single
    sequence, or all sequences equally long) the overall mean is used for
    that slot instead of dividing by zero.
    """
    lines_len = [len(t) + 1 for t in features]
    average = int(sum(lines_len) / len(lines_len))
    lower_line = [t for t in lines_len if t < average]
    upper_line = [t for t in lines_len if t >= average]
    lower_average = int(sum(lower_line) / len(lower_line)) if lower_line else average
    upper_average = int(sum(upper_line) / len(upper_line)) if upper_line else average
    max_len = max(lines_len)
    return [lower_average, average, upper_average, max_len]


def construct_dataset(input_features, input_label, input_action, word_dict, label_dict, action_dict, singleton, singleton_rate, caseless):
    """Encode a corpus and group it into padded, length-bucketed datasets.

    Buckets are chosen by action-sequence length using the thresholds
    from ``calc_threshold_mean``; within a bucket, words, labels and
    actions are padded up to the bucket's threshold.

    Returns:
        list of ``TransitionDataset`` objects, one per threshold.
    """
    if caseless:
        # NOTE(review): this only copies the word lists -- no case folding
        # is applied here even though the flag is named ``caseless``; confirm.
        input_features = [list(t) for t in input_features]
    # NOTE(review): the special keys used for the unknown word and for
    # padding appear as empty strings in this view of the file -- confirm.
    features = encode_safe(input_features, word_dict, word_dict[''], singleton, singleton_rate)
    labels = encode(input_label, label_dict)
    actions = encode(input_action, action_dict)
    thresholds = calc_threshold_mean(actions)

    buckets = [[[], [], []] for _ in range(len(thresholds))]
    for feature, label, action in zip(features, labels, actions):
        cur_len = len(action)
        cur_sent_len = len(feature)
        # First bucket whose threshold is at least the action length + 1.
        idx = 0
        while thresholds[idx] < cur_len + 1:
            idx += 1
        buckets[idx][0].append(feature + [word_dict['']] * (thresholds[idx] - cur_sent_len))
        buckets[idx][1].append(label + [label_dict['']] * (thresholds[idx] - cur_sent_len))
        buckets[idx][2].append(action + [action_dict['']] * (thresholds[idx] - cur_len))

    dataset = [TransitionDataset(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]), torch.LongTensor(bucket[2]))
               for bucket in buckets]

    return dataset


def construct_dataset_predict(input_features, word_dict, caseless):
    """Encode sentences for prediction as one ``LongTensor`` per sentence.

    Returns:
        a ``TransitionDataset_P`` over the list of tensors (no padding).
    """
    if caseless:
        # NOTE(review): copies only, no case folding -- confirm.
        input_features = [list(t) for t in input_features]
    # NOTE(review): the unknown-word key appears as an empty string in this
    # view of the file -- confirm.
    features = encode_safe_predict(input_features, word_dict, word_dict[''])
    feature_tensor = [torch.LongTensor(feature) for feature in features]
    dataset = TransitionDataset_P(feature_tensor)

    return dataset
def _init_fn(name):
    """Return ``nn.init.<name>_`` when the installed PyTorch has it, else ``nn.init.<name>``."""
    return getattr(nn.init, name + '_', None) or getattr(nn.init, name)


def save_checkpoint(state, track_list, filename):
    """Write ``track_list`` to ``<filename>.json`` and ``state`` to ``<filename>.model``."""
    with open(filename + '.json', 'w') as f:
        json.dump(track_list, f)
    torch.save(state, filename + '.model')


def adjust_learning_rate(optimizer, lr):
    """Set the ``'lr'`` entry of every parameter group of ``optimizer`` to ``lr``."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def init_embedding(input_embedding):
    """Fill a 2-D tensor uniformly in ``[-b, b]`` with ``b = sqrt(3 / size(1))``."""
    bias = np.sqrt(3.0 / input_embedding.size(1))
    _init_fn('uniform')(input_embedding, -bias, bias)


def init_linear(input_linear):
    """Orthogonally initialise a linear layer's weight and zero its bias (if any)."""
    _init_fn('orthogonal')(input_linear.weight)
    if input_linear.bias is not None:
        input_linear.bias.data.zero_()


def init_lstm(input_lstm):
    """Initialise the per-layer ``*_l<k>`` parameters of an LSTM module.

    Weights are made orthogonal.  Biases are zeroed, then the slice
    ``[hidden_size : 2 * hidden_size]`` is set to 1.

    NOTE(review): only parameters named ``weight_ih_l<k>`` /
    ``weight_hh_l<k>`` / ``bias_*_l<k>`` are touched; any differently
    named parameters of the module keep their existing values -- confirm
    that is intended.
    """
    orthogonal = _init_fn('orthogonal')
    # Attributes are looked up with ``getattr`` instead of ``eval``.
    for ind in range(0, input_lstm.num_layers):
        orthogonal(getattr(input_lstm, 'weight_ih_l' + str(ind)))
        orthogonal(getattr(input_lstm, 'weight_hh_l' + str(ind)))

    if input_lstm.bias:
        lo, hi = input_lstm.hidden_size, 2 * input_lstm.hidden_size
        for ind in range(0, input_lstm.num_layers):
            for prefix in ('bias_ih_l', 'bias_hh_l'):
                weight = getattr(input_lstm, prefix + str(ind))
                weight.data.zero_()
                weight.data[lo:hi] = 1


def init_lstm_cell(input_lstm):
    """Initialise an LSTM cell's weights and biases.

    Each weight matrix is filled uniformly in ``[-b, b]`` with
    ``b = sqrt(6 / (rows / 4 + cols))``.  Biases are zeroed, then the
    slice ``[hidden_size : 2 * hidden_size]`` is set to 1.
    """
    uniform = _init_fn('uniform')
    for weight in (input_lstm.weight_ih, input_lstm.weight_hh):
        bias = np.sqrt(6.0 / (weight.size(0) / 4 + weight.size(1)))
        uniform(weight, -bias, bias)

    if input_lstm.bias:
        lo, hi = input_lstm.hidden_size, 2 * input_lstm.hidden_size
        for weight in (input_lstm.bias_ih, input_lstm.bias_hh):
            weight.data.zero_()
            weight.data[lo:hi] = 1
def repack_vb(if_cuda, feature, label, action):
    """Wrap a batch's three tensors in autograd Variables.

    With ``if_cuda`` truthy all three are moved to CUDA.  Otherwise they
    stay where they are, and ``.contiguous()`` is applied to the label
    and action Variables (not to the feature Variable).

    Returns:
        ``(fea_v, label_v, action_v)``.
    """
    wrap = torch.autograd.Variable
    if if_cuda:
        return wrap(feature).cuda(), wrap(label).cuda(), wrap(action).cuda()
    return wrap(feature), wrap(label).contiguous(), wrap(action).contiguous()


def generate_char(char2idx, train_features, dev_features, test_features):
    """Convert every word of three corpora into lists of character indices.

    Raises ``KeyError`` for a character missing from ``char2idx``.

    Returns:
        ``(train_char, dev_char, test_char)``.
    """
    def to_indices(corpus):
        return [[[char2idx[c] for c in word] for word in sent] for sent in corpus]

    # Conversion order (dev, test, train) is kept from the original.
    dev_char = to_indices(dev_features)
    test_char = to_indices(test_features)
    train_char = to_indices(train_features)

    return train_char, dev_char, test_char
if __name__ == "__main__":
    # Command-line entry point: load a saved model and tag a raw text file.
    parser = argparse.ArgumentParser(description='Evaluating Stack-LSTM')
    parser.add_argument('--load_arg', default='./checkpoint/ner_2018-06-22/dev=68.33.json',
                        help='arg json file path')
    # NOTE(review): no ``type`` is given, so any value passed on the command
    # line arrives as a non-empty (truthy) string -- confirm intended usage.
    parser.add_argument('--spelling', default=True, help='use spelling or not')
    parser.add_argument('--load_check_point', default='./checkpoint/ner_2018-06-22/dev=68.33.model',
                        help='checkpoint path')
    parser.add_argument('--gpu', type=int, default=0, help='gpu id')
    parser.add_argument('--batch_size', type=int, default=10, help='batch size')
    parser.add_argument('--mode', choices=['train', 'predict'], default='predict', help='mode selection')
    parser.add_argument('--test_file', default='test.txt',
                        help='path to test file, if set to none, would use test_file path in the checkpoint file')
    parser.add_argument('--test_file_out', default='test_out.txt',
                        help='path to test file output, if set to none, would use test_file path in the checkpoint file')
    args = parser.parse_args()

    with open(args.load_arg, 'r') as f:
        jd = json.load(f)
    jd = jd['args']

    # Load onto CPU storage first; the model is moved to the GPU below if requested.
    checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage)
    f_map = checkpoint_file['f_map']
    l_map = checkpoint_file['l_map']
    a_map = checkpoint_file['a_map']
    char_map = checkpoint_file['char_map']
    ner_map = checkpoint_file['ner_map']

    if_cuda = args.gpu >= 0
    if if_cuda:
        # Select the device once, before the model is built.
        torch.cuda.set_device(args.gpu)

    # load corpus
    with codecs.open(args.test_file, 'r', 'utf-8') as f:
        test_lines = f.readlines()

    # converting format
    test_features = utils.read_corpus_predict(test_lines)

    # construct dataset
    test_dataset = utils.construct_dataset_predict(test_features, f_map, jd['caseless'])

    test_dataset_loader = [torch.utils.data.DataLoader(test_dataset, args.batch_size, shuffle=False, drop_last=False)]

    # build model
    ner_model = TransitionNER(args.mode, a_map, f_map, l_map, char_map, ner_map, len(f_map), len(a_map),
                              jd['embedding_dim'], jd['action_embedding_dim'], jd['char_embedding_dim'],
                              jd['hidden'], jd['char_hidden'], jd['layers'], jd['drop_out'], args.spelling,
                              jd['char_structure'], is_cuda=args.gpu)
    print("loading model")
    ner_model.load_state_dict(checkpoint_file['state_dict'])

    if if_cuda:
        ner_model.cuda()

    # The output file is opened in a ``with`` block so it is flushed and
    # closed even if decoding raises; it was previously never closed.
    with codecs.open(args.test_file_out, "w+", encoding="utf-8") as file_out:
        print("decoding")
        evaluate.generate_ner(ner_model, file_out, test_dataset_loader, a_map, f_map, if_cuda)
jd['char_hidden'], 57 | jd['layers'], jd['drop_out'], args.spelling, jd['char_structure'], is_cuda=args.gpu) 58 | print("loading model") 59 | ner_model.load_state_dict(checkpoint_file['state_dict']) 60 | 61 | if args.gpu >= 0: 62 | if_cuda = True 63 | torch.cuda.set_device(args.gpu) 64 | ner_model.cuda() 65 | else: 66 | if_cuda = False 67 | file_out = codecs.open(args.test_file_out, "w+", encoding="utf-8") 68 | print("decoding") 69 | evaluate.generate_ner(ner_model, file_out, test_dataset_loader, a_map, f_map, if_cuda) 70 | 71 | 72 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import time 3 | import torch.optim as optim 4 | import codecs 5 | # from model.stack_lstm import * 6 | from model.batch_stack_lstm import * 7 | import model.utils as utils 8 | import model.evaluate as evaluate 9 | 10 | import argparse 11 | import os 12 | import sys 13 | from tqdm import tqdm 14 | import itertools 15 | import functools 16 | 17 | 18 | if __name__ == "__main__": 19 | parser = argparse.ArgumentParser(description='Training transition-based NER system') 20 | parser.add_argument('--batch', action='store_true') 21 | parser.add_argument('--rand_embedding', action='store_true', help='random initialize word embedding') 22 | parser.add_argument('--emb_file', default='embedding/sskip.100.vectors', 23 | help='path to pre-trained embedding') 24 | parser.add_argument('--train_file', default='data/conll2003/train.txt', help='path to training file') 25 | parser.add_argument('--dev_file', default='data/conll2003/dev.txt', help='path to development file') 26 | parser.add_argument('--test_file', default='data/conll2003/test.txt', help='path to test file') 27 | parser.add_argument('--batch_size', type=int, default=100, help='batch size (10)') 28 | parser.add_argument('--gpu', type=int, default=0, help='gpu id, set to -1 if 
use cpu mode') 29 | parser.add_argument('--unk', default='unk', help='unknow-token in pre-trained embedding') 30 | parser.add_argument('--checkpoint', default='./checkpoint/ner_', help='path to checkpoint prefix') 31 | parser.add_argument('--hidden', type=int, default=100, help='hidden dimension') 32 | parser.add_argument('--char_hidden', type=int, default=50, help='hidden dimension for character') 33 | parser.add_argument('--char_structure', choices=['lstm', 'cnn'], default='lstm', help='') 34 | parser.add_argument('--drop_out', type=float, default=0.5, help='dropout ratio') 35 | parser.add_argument('--epoch', type=int, default=50, help='maximum epoch number') 36 | parser.add_argument('--start_epoch', type=int, default=0, help='start epoch idx') 37 | parser.add_argument('--caseless', default=True, help='caseless or not') 38 | parser.add_argument('--spelling', default=True, help='use spelling or not') 39 | parser.add_argument('--embedding_dim', type=int, default=100, help='dimension for word embedding') 40 | parser.add_argument('--char_embedding_dim', type=int, default=50, help='dimension for char embedding') 41 | parser.add_argument('--action_embedding_dim', type=int, default=20, help='dimension for action embedding') 42 | parser.add_argument('--layers', type=int, default=1, help='number of lstm layers') 43 | parser.add_argument('--lr', type=float, default=0.001, help='initial learning rate') 44 | parser.add_argument('--singleton_rate', type=float, default=0.2, help='initial singleton rate') 45 | parser.add_argument('--lr_decay', type=float, default=0.75, help='decay ratio of learning rate') 46 | parser.add_argument('--load_check_point', default='', help='path of checkpoint') 47 | parser.add_argument('--load_opt', action='store_true', help='load optimizer from ') 48 | parser.add_argument('--update', choices=['sgd', 'adam'], default='adam', help='optimizer method') 49 | parser.add_argument('--mode', choices=['train', 'predict'], default='train', help='mode 
selection') 50 | parser.add_argument('--momentum', type=float, default=0.9, help='momentum for sgd') 51 | parser.add_argument('--clip_grad', type=float, default=5.0, help='grad clip at') 52 | parser.add_argument('--mini_count', type=float, default=1, help='thresholds to replace rare words with ') 53 | parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or accuracy alone') 54 | parser.add_argument('--patience', type=int, default=15, help='patience for early stop') 55 | parser.add_argument('--least_iters', type=int, default=50, help='at least train how many epochs before stop') 56 | parser.add_argument('--shrink_embedding', action='store_true', 57 | help='shrink the embedding dictionary to corpus (open this if pre-trained embedding dictionary is too large, but disable this may yield better results on external corpus)') 58 | args = parser.parse_args() 59 | 60 | print('setting:') 61 | print(args) 62 | 63 | date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 64 | args.checkpoint = args.checkpoint + date.split(' ')[0] 65 | if not os.path.exists(args.checkpoint): 66 | os.makedirs(args.checkpoint) 67 | 68 | if_cuda = True if args.gpu >= 0 else False 69 | 70 | # load corpus 71 | print('loading corpus') 72 | with codecs.open(args.train_file, 'r', 'utf-8') as f: 73 | lines = f.readlines() 74 | with codecs.open(args.dev_file, 'r', 'utf-8') as f: 75 | dev_lines = f.readlines() 76 | with codecs.open(args.test_file, 'r', 'utf-8') as f: 77 | test_lines = f.readlines() 78 | 79 | # converting format 80 | word_count = dict() 81 | dev_features, dev_labels, dev_actions, word_count = utils.read_corpus_ner(dev_lines, word_count) 82 | test_features, test_labels, test_actions, word_count = utils.read_corpus_ner(test_lines, word_count) 83 | 84 | 85 | if args.load_check_point: 86 | if os.path.isfile(args.load_check_point): 87 | print("loading checkpoint: '{}'".format(args.load_check_point)) 88 | checkpoint_file = 
torch.load(args.load_check_point) 89 | args.start_epoch = checkpoint_file['epoch'] 90 | f_map = checkpoint_file['f_map'] 91 | l_map = checkpoint_file['l_map'] 92 | a_map = checkpoint_file['a_map'] 93 | ner_map = checkpoint_file['ner_map'] 94 | char_map = checkpoint_file['char_map'] 95 | singleton = checkpoint_file['singleton'] 96 | train_features, train_labels, train_actions, word_count = utils.read_corpus_ner(lines, word_count) 97 | else: 98 | print("no checkpoint found at: '{}'".format(args.load_check_point)) 99 | else: 100 | print('constructing coding table') 101 | 102 | train_features, train_labels, train_actions, f_map, l_map, a_map, ner_map, singleton, char_map = utils.generate_corpus(lines, word_count, args.spelling, 103 | if_shrink_feature=True, 104 | thresholds=0) 105 | f_set = {v for v in f_map} 106 | 107 | dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_features), 108 | f_set) # Add word in dev and in test into feature_map 109 | dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_features), dt_f_set) 110 | dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), train_features), dt_f_set) 111 | 112 | if not args.rand_embedding: 113 | print("feature size: '{}'".format(len(f_map))) 114 | print('loading embedding') 115 | f_map = {'': 0} 116 | f_map, embedding_tensor= utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set, 117 | args.caseless, args.unk, 118 | args.embedding_dim, 119 | shrink_to_corpus=args.shrink_embedding) 120 | print("embedding size: '{}'".format(len(f_map))) 121 | 122 | l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels)) 123 | l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set) 124 | for label in l_set: 125 | if label not in l_map: 126 | l_map[label] = len(l_map) 127 | 128 | print("%d train sentences" % len(train_features)) 129 | print("%d dev sentences" % len(dev_features)) 130 | print("%d test sentences" % 
len(test_features)) 131 | 132 | # construct dataset 133 | singleton = list(functools.reduce(lambda x, y: x & y, map(lambda t: set(t), [singleton, f_map]))) 134 | dataset = utils.construct_dataset(train_features, train_labels, train_actions, f_map, l_map, a_map, singleton, args.singleton_rate, args.caseless) 135 | dev_dataset = utils.construct_dataset(dev_features, dev_labels, dev_actions, f_map, l_map, a_map, singleton, args.singleton_rate, args.caseless) 136 | test_dataset = utils.construct_dataset(test_features, test_labels, test_actions, f_map, l_map, a_map, singleton, args.singleton_rate, args.caseless) 137 | 138 | dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset] 139 | dev_dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=False, drop_last=False) for tup in dev_dataset] 140 | test_dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=False, drop_last=False) for tup in test_dataset] 141 | 142 | # build model 143 | print('building model') 144 | ner_model = TransitionNER(args.mode, a_map, f_map, l_map, char_map, ner_map, len(f_map), len(a_map), args.embedding_dim, args.action_embedding_dim, args.char_embedding_dim, args.hidden, args.char_hidden, args.layers, args.drop_out, 145 | args.spelling, args.char_structure, is_cuda=args.gpu) 146 | 147 | if args.load_check_point: 148 | ner_model.load_state_dict(checkpoint_file['state_dict']) 149 | else: 150 | if not args.rand_embedding: 151 | ner_model.load_pretrained_embedding(embedding_tensor) 152 | print('random initialization') 153 | ner_model.rand_init(init_word_embedding=args.rand_embedding) 154 | 155 | if args.update == 'sgd': 156 | optimizer = optim.SGD(ner_model.parameters(), lr=args.lr, momentum=args.momentum, nesterov=True) 157 | elif args.update == 'adam': 158 | optimizer = optim.Adam(ner_model.parameters(), lr=args.lr, betas=(0.9, 0.9)) 159 | 160 | if args.load_check_point and args.load_opt: 161 | 
optimizer.load_state_dict(checkpoint_file['optimizer']) 162 | 163 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=args.lr_decay, patience=0, 164 | verbose=True) 165 | 166 | if if_cuda: 167 | print('device: ' + str(args.gpu)) 168 | torch.cuda.set_device(args.gpu) 169 | ner_model.cuda(device=args.gpu) 170 | else: 171 | if_cuda = False 172 | 173 | tot_length = sum(map(lambda t: len(t), dataset_loader)) 174 | best_f1 = float('-inf') 175 | best_acc = float('-inf') 176 | track_list = list() 177 | start_time = time.time() 178 | epoch_list = range(args.start_epoch, args.start_epoch + args.epoch) 179 | patience_count = 0 180 | 181 | for epoch_idx, args.start_epoch in enumerate(epoch_list): 182 | 183 | epoch_loss = 0 184 | ner_model.train() 185 | 186 | for feature, label, action in tqdm( 187 | itertools.chain.from_iterable(dataset_loader), mininterval=2, 188 | desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch), leave=False, file=sys.stdout): 189 | 190 | fea_v, la_v, ac_v = utils.repack_vb(if_cuda, feature, label, action) 191 | ner_model.zero_grad() # zeroes the gradient of all parameters 192 | # loss, _, _ = ner_model.forward(fea_v, ac_v) 193 | loss, _ = ner_model.forward(fea_v, ac_v) 194 | loss.backward() 195 | nn.utils.clip_grad_norm(ner_model.parameters(), args.clip_grad) 196 | optimizer.step() 197 | epoch_loss += utils.to_scalar(loss) 198 | 199 | # update lr 200 | scheduler.step(epoch_loss) 201 | dev_f1, dev_pre, dev_rec = evaluate.calc_f1_score(ner_model, dev_dataset_loader, a_map, if_cuda) 202 | 203 | if dev_f1 > best_f1: 204 | patience_count = 0 205 | if epoch_idx > 0: 206 | try: 207 | os.remove(args.checkpoint + '/dev=' + str(best_f1) + '.json') 208 | os.remove(args.checkpoint + '/dev=' + str(best_f1) + '.model') 209 | except Exception as inst: 210 | print(inst) 211 | 212 | best_f1 = dev_f1 213 | test_f1, test_pre, test_rec = evaluate.calc_f1_score(ner_model, test_dataset_loader, a_map, if_cuda) 214 | 215 | 
track_list.append( 216 | {'loss': epoch_loss, 'dev_f1': dev_f1, 'test_f1': test_f1}) 217 | 218 | print( 219 | '(loss: %.4f, epoch: %d, dev F1 = %.4f, dev pre = %.4f, dev rec = %.4f, F1 on test = %.4f, pre on test = %.4f, rec on test = %.4f), saving...' % 220 | (epoch_loss, 221 | args.start_epoch, 222 | dev_f1, 223 | dev_pre, 224 | dev_rec, 225 | test_f1, 226 | test_pre, 227 | test_rec)) 228 | 229 | try: 230 | utils.save_checkpoint({ 231 | 'epoch': args.start_epoch, 232 | 'state_dict': ner_model.state_dict(), 233 | 'optimizer': optimizer.state_dict(), 234 | 'f_map': f_map, 235 | 'l_map': l_map, 236 | 'a_map': a_map, 237 | 'ner_map': ner_map, 238 | 'char_map': char_map, 239 | 'singleton': singleton 240 | }, {'track_list': track_list, 241 | 'args': vars(args) 242 | }, args.checkpoint + '/dev=' + str(round(best_f1*100,2))) 243 | except Exception as inst: 244 | print(inst) 245 | 246 | else: 247 | patience_count += 1 248 | print('(loss: %.4f, epoch: %d, dev F1 = %.4f)' % 249 | (epoch_loss, 250 | args.start_epoch, 251 | dev_f1)) 252 | track_list.append({'loss': epoch_loss, 'dev_f1': dev_f1}) 253 | 254 | print('epoch: ' + str(args.start_epoch) + '\t in ' + str(args.epoch) + ' take: ' + str( 255 | time.time() - start_time) + ' s') 256 | 257 | if patience_count >= args.patience and args.start_epoch >= args.least_iters: 258 | break 259 | 260 | # print best 261 | print( 262 | args.checkpoint + ' dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f test_f1: %.4f test_rec: %.4f test_pre: %.4f\n' % ( 263 | dev_f1, dev_rec, dev_pre, test_f1, test_rec, test_pre)) 264 | 265 | # printing summary 266 | print('setting:') 267 | print(args) 268 | 269 | # log_file.close() 270 | --------------------------------------------------------------------------------