├── ckpts
│   └── 1
├── Pretraining_Bert
│   ├── README.md
│   ├── lm_bert
│   │   ├── bert_config.json
│   │   ├── pretraining_args.py
│   │   ├── utils.py
│   │   └── run_pretraining.py
│   └── temp.py
├── model
│   ├── __init__.py
│   ├── bert_lstm_crf.py
│   └── crf.py
├── data
│   └── tag.txt
├── README.md
├── bert_config.json
├── pretraining_args.py
├── config.py
├── submit.py
├── main.py
└── utils.py

/ckpts/1:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/Pretraining_Bert/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | from model.crf import CRF
3 | from model.bert_lstm_crf import BERT_LSTM_CRF
4 | 
--------------------------------------------------------------------------------
/data/tag.txt:
--------------------------------------------------------------------------------
1 | 
2 | o
3 | a-B
4 | a-E
5 | a-S
6 | a-M
7 | b-B
8 | b-E
9 | b-S
10 | b-M
11 | c-B
12 | c-E
13 | c-S
14 | c-M
15 | 
16 | 
17 | 
--------------------------------------------------------------------------------
/Pretraining_Bert/lm_bert/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "attention_probs_dropout_prob": 0.1,
3 |   "directionality": "bidi",
4 |   "hidden_act": "gelu",
5 |   "hidden_dropout_prob": 0.1,
6 |   "hidden_size": 384,
7 |   "initializer_range": 0.02,
8 |   "intermediate_size": 3072,
9 |   "max_position_embeddings": 384,
10 |   "num_attention_heads": 8,
11 |   "num_hidden_layers": 12,
12 |   "vocab_size": 21154
13 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # daguan_bert_ner
2 | The trained BERT weights are uploaded to: https://pan.baidu.com/s/1anf4K_nv-LhTeMnV69oBCw
3 | Download them and place them in the './ckpts' directory to start training.
4 | 
5 | 
6 | The language-model code is in the Pretraining_Bert directory.
7 | The pretraining data is also uploaded to: https://pan.baidu.com/s/19mnQ9lDvQhBniBU5SBVN9Q ; download it and place it in the lm_bert directory.
8 | temp.py provides two helper scripts:
9 | (1) create_bert_datas() builds the different training data sets,
10 | (2) model_transfer() removes the weights used only for language-model training and keeps just the model weights needed for named entity recognition.
11 | Place the converted model in the './ckpts' directory to train the NER model with your own pretrained weights.
--------------------------------------------------------------------------------
/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "attention_probs_dropout_prob": 0.1,
3 |   "directionality": "bidi",
4 |   "hidden_act": "gelu",
5 |   "hidden_dropout_prob": 0.1,
6 |   "hidden_size": 384,
7 |   "initializer_range": 0.02,
8 |   "intermediate_size": 3072,
9 |   "max_position_embeddings": 384,
10 |   "num_attention_heads": 8,
11 |   "num_hidden_layers": 12,
12 |   "pooler_fc_size": 384,
13 |   "pooler_num_attention_heads": 12,
14 |   "pooler_num_fc_layers": 3,
15 |   "pooler_size_per_head": 128,
16 |   "pooler_type": "first_token_transform",
17 |   "type_vocab_size": 2,
18 |   "vocab_size": 21154
19 | }
20 | 
--------------------------------------------------------------------------------
/pretraining_args.py:
--------------------------------------------------------------------------------
1 | # -----------ARGS---------------------
2 | pretrain_train_path = "./data/train.txt"
3 | pretrain_dev_path = "./data/dev.txt"
4 | pretrain_test_path = "./data/test.txt"
5 | submit_test_path = "./data/normal_daguan_test.txt"
6 | 
7 | 
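# Note: everything in this module is a flat, import-time argument namespace; the
# training code reads these constants via `import pretraining_args as args` (see
# main.py) rather than parsing command-line flags.  A minimal usage sketch, using
# only names defined in this file:
#
#     import pretraining_args as args
#     print(args.pretrain_train_path)   # "./data/train.txt"
#     print(args.max_seq_length)        # 200
#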
8 | max_seq_length = 200 9 | do_train = True 10 | do_lower_case = True 11 | train_batch_size = 16 12 | eval_batch_size = 16 13 | learning_rate = 2e-4 14 | num_train_epochs = 100 15 | warmup_proportion = 0.1 16 | use_cuda = True 17 | local_rank = -1 18 | seed = 42 19 | gradient_accumulation_steps = 1 20 | bert_config_json = "./bert_config.json" 21 | vocab_file = "./bert_vocab.txt" 22 | output_dir = "outputs" 23 | masked_lm_prob = 0.15 24 | max_predictions_per_seq = 20 25 | weight='ckpts/bert_weight.bin' -------------------------------------------------------------------------------- /Pretraining_Bert/lm_bert/pretraining_args.py: -------------------------------------------------------------------------------- 1 | # -----------ARGS--------------------- 2 | pretrain_train_path = "/home/hing/bert/Pretraining_Bert_From_Scratch/lm_smallBert/data/pretrain_train.txt" 3 | pretrain_dev_path = "/home/hing/bert/Pretraining_Bert_From_Scratch/lm_smallBert/data/pretrain_dev.txt" 4 | 5 | max_seq_length = 200 6 | do_train = True 7 | do_lower_case = True 8 | train_batch_size = 20 9 | eval_batch_size = 16 10 | learning_rate = 2e-5 11 | num_train_epochs = 100 12 | warmup_proportion = 0.1 13 | no_cuda = False 14 | local_rank = -1 15 | seed = 42 16 | gradient_accumulation_steps = 1 17 | fp16 = False 18 | loss_scale = 0. 19 | bert_config_json = "/home/hing/bert/Pretraining_Bert_From_Scratch/lm_smallBert/bert_config.json" 20 | vocab_file = "/home/hing/bert/Pretraining_Bert_From_Scratch/lm_smallBert/bert_vocab.txt" 21 | output_dir = "outputs" 22 | masked_lm_prob = 0.15 23 | max_predictions_per_seq = 20 -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | class Config(object): 5 | def __init__(self): 6 | self.label_file = './data/tag.txt' 7 | self.vocab = 'bert_vocab.txt' 8 | self.use_cuda = True 9 | self.gpu = 0 10 | self.rnn_hidden = 128 11 | self.bert_embedding = 384 12 | self.dropout1 = 0.1 13 | self.dropout_ratio = 0.1 14 | self.rnn_layer = 1 15 | self.lr = 2e-5 16 | self.checkpoint = 'result/' 17 | self.optim = 'Adam' 18 | self.load_model = False 19 | self.load_path = 'result' 20 | self.base_epoch = 100 21 | self.flag='submit' 22 | def update(self, **kwargs): 23 | for k, v in kwargs.items(): 24 | setattr(self, k, v) 25 | 26 | def __str__(self): 27 | 28 | return '\n'.join(['%s:%s' % item for item in self.__dict__.items()]) 29 | 30 | 31 | if __name__ == '__main__': 32 | 33 | con = Config() 34 | con.update(gpu=1) 35 | # print(con.gpu) 36 | # print(con) 37 | -------------------------------------------------------------------------------- /submit.py: -------------------------------------------------------------------------------- 1 | def submitFormat(): 2 | f=open('predict.txt') 3 | labels=f.readlines() 4 | f.close() 5 | f=open('./data/daguan_test.txt') 6 | texts=f.readlines() 7 | f.close() 8 | f=open('submit.txt','w',encoding='utf8') 9 | for i,label in enumerate(labels): 10 | 11 | label=label.strip().split(' ') 12 | text=texts[i].strip().split('_') 13 | if len(label)!=len(text): 14 | print(i,len(label),len(text)) 15 | length=len(label) 16 | start=0 17 | end=0 18 | result=[] 19 | print(i,label) 20 | while length>0: 21 | if 'M' in label[end]: 22 | label[end]=label[end][:2]+'B' 23 | continue 24 | if label[end] == 'o': 25 | while label[end] == 'o': 26 | end+=1 27 | length-=1 28 | if length==0: 29 | break 30 | result.append('_'.join(text[start:end])+'/o') 31 | start=end 
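                # At this point a run of 'o' tags has just been emitted as a single
                # '_'-joined chunk tagged '/o'.  The branches below emit entities:
                # 'S' tags become single-token spans and 'B'...'E' tags become
                # multi-token spans; an 'M' tag seen at the start of a span (an
                # inconsistent prediction) is rewritten to 'B' at the top of the
                # loop so the 'B' branch picks it up on the next iteration.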
32 | if start==len(label): 33 | break 34 | if 'S' in label[end]: 35 | # print(text[start:end] +'/'+label[end][0]) 36 | result.append('_'.join(text[start:end+1]) +'/'+label[end][0]) 37 | end+=1 38 | length-=1 39 | start=end 40 | if length == 0: 41 | break 42 | if 'B' in label[end]: 43 | while 'E' not in label[end]: 44 | end+=1 45 | length-=1 46 | if length==0: 47 | break 48 | end += 1 49 | length -= 1 50 | result.append('_'.join(text[start:end])+'/'+label[start][0]) 51 | start=end 52 | if start==len(label): 53 | break 54 | # if '' in label[end]: 55 | f.write(' '.join(result)+'\n') 56 | f.close() 57 | submitFormat() 58 | # import torch 59 | # model=torch.load('./ckpts/bert_weight.bin') 60 | # for k,v in model.items(): 61 | # print(k,v) -------------------------------------------------------------------------------- /model/bert_lstm_crf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # coding=utf-8 3 | import torch.nn as nn 4 | from pytorch_pretrained_bert import BertModel,BertConfig 5 | from model import CRF 6 | from torch.autograd import Variable 7 | import torch 8 | 9 | class BERT_LSTM_CRF(nn.Module): 10 | """ 11 | bert_lstm_crf model 12 | bert_model=BertModel(config=BertConfig.from_json_file(args.bert_config_json)) 13 | """ 14 | def __init__(self, args, tagset_size, embedding_dim, hidden_dim, rnn_layers, dropout_ratio, dropout1, use_cuda=False): 15 | super(BERT_LSTM_CRF, self).__init__() 16 | self.embedding_dim = embedding_dim 17 | self.hidden_dim = hidden_dim 18 | self.word_embeds = BertModel(config=BertConfig.from_json_file(args.bert_config_json)) 19 | # print(self.word_embeds) 20 | self.word_embeds.load_state_dict(torch.load('./ckpts/bert_weight.bin')) 21 | self.lstm = nn.LSTM(embedding_dim, hidden_dim, 22 | num_layers=rnn_layers, bidirectional=True, dropout=dropout_ratio, batch_first=True) 23 | self.rnn_layers = rnn_layers 24 | self.dropout1 = nn.Dropout(p=dropout1) 25 | self.crf = CRF(target_size=tagset_size, average_batch=True, use_cuda=use_cuda) 26 | self.liner = nn.Linear(hidden_dim*2, tagset_size+2) 27 | self.tagset_size = tagset_size 28 | 29 | def rand_init_hidden(self, batch_size): 30 | """ 31 | random initialize hidden variable 32 | """ 33 | return Variable( 34 | torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)), Variable( 35 | torch.randn(2 * self.rnn_layers, batch_size, self.hidden_dim)) 36 | 37 | def forward(self, sentence, attention_mask=None): 38 | ''' 39 | args: 40 | sentence (word_seq_len, batch_size) : word-level representation of sentence 41 | hidden: initial hidden state 42 | 43 | return: 44 | crf output (word_seq_len, batch_size, tag_size, tag_size), hidden 45 | ''' 46 | batch_size = sentence.size(0) 47 | seq_length = sentence.size(1) 48 | embeds, _ = self.word_embeds(sentence, attention_mask=attention_mask, output_all_encoded_layers=False) 49 | # print(embeds,_) 50 | hidden = self.rand_init_hidden(batch_size) 51 | # if embeds.is_cuda: 52 | # hidden = (i.cuda() for i in hidden) 53 | # embeds=(embeds,dim=0,keepdim=True) 54 | lstm_out, hidden = self.lstm(embeds) 55 | lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim*2) 56 | d_lstm_out = self.dropout1(lstm_out) 57 | l_out = self.liner(d_lstm_out) 58 | lstm_feats = l_out.contiguous().view(batch_size, seq_length, -1) 59 | return lstm_feats 60 | 61 | def loss(self, feats, mask, tags): 62 | """ 63 | feats: size=(batch_size, seq_len, tag_size) 64 | mask: size=(batch_size, seq_len) 65 | tags: size=(batch_size, seq_len) 66 | :return: 67 | """ 68 
| loss_value = self.crf.neg_log_likelihood_loss(feats, mask, tags) 69 | batch_size = feats.size(0) 70 | loss_value /= float(batch_size) 71 | return loss_value 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /Pretraining_Bert/temp.py: -------------------------------------------------------------------------------- 1 | def create_bert_datas(): 2 | f=open('/home/hing/Desktop/named_entity_recognition/language_model/corpus_no_space.txt','r') 3 | datas=f.readlines() 4 | f.close() 5 | length=[] 6 | f = open('pretrain_train.txt', 'w') 7 | for data in datas[:-100000]: 8 | length.append(len(data.strip().split())) 9 | f.write(data) 10 | f.close() 11 | f.close() 12 | length=sorted(length) 13 | count=0 14 | for len_ in length: 15 | if len_<=156: 16 | count+=1 17 | print(count) 18 | print(len(length)) 19 | print(count/len(length)) 20 | # f=open('vocab.txt','w') 21 | # vocabs=set() 22 | # for data in datas: 23 | # words=data.strip().split() 24 | # for word in words: 25 | # vocabs.add(word) 26 | # for vocab in vocabs: 27 | # f.write(vocab+'\n') 28 | # f.close() 29 | # # f = open('/home/hing/Desktop/named_entity_recognition/normal_daguan_test.txt', 'r') 30 | # # datas = f.readlines() 31 | # # f.close() 32 | f = open('pretrain_dev.txt', 'w') 33 | for data in datas[-100000:]: 34 | f.write(data) 35 | f.close() 36 | # create_bert_datas() 37 | # f = open('/home/hing/bert/Pretraining_Bert_From_Scratch/lm_smallBert/data/pretrain_dev.txt',) 38 | # datas=f.readlines() 39 | # f.close() 40 | # print(len(datas)) 41 | # f = open('/home/hing/bert/Pretraining_Bert_From_Scratch/lm_smallBert/data/pretrain_train.txt',) 42 | # datas=f.readlines() 43 | # f.close() 44 | # print(len(datas)) 45 | from pytorch_pretrained_bert.modeling import BertForMaskedLM, BertConfig 46 | import torch 47 | import lm_smallBert.pretraining_args as args 48 | from pytorch_pretrained_bert.modeling import BertModel 49 | def model_transfer(): 50 | model = BertForMaskedLM(config=BertConfig.from_json_file(args.bert_config_json)) 51 | # print('language_model',model.state_dict()['bert.embeddings.word_embeddings.weight']) 52 | # print('language_model',model.state_dict()['bert.embeddings.LayerNorm.weight']) 53 | # print('language_model',model.state_dict()['bert.encoder.layer.0.attention.self.key.weight']) 54 | model = model.bert 55 | # print('bert_model',model.state_dict()['embeddings.word_embeddings.weight']) 56 | # print('bert_model',model.state_dict()['embeddings.LayerNorm.weight']) 57 | # print('bert_model',model.state_dict()['encoder.layer.0.attention.self.key.weight']) 58 | model_dict = model.state_dict() 59 | lm_dict = torch.load('./lm_smallBert/outputs/1.41_150000_step') 60 | for k,v in lm_dict.items(): 61 | print(k,v) 62 | # print('lm_dict',lm_dict['bert.embeddings.word_embeddings.weight']) 63 | # print('lm_dict',lm_dict['bert.embeddings.LayerNorm.weight']) 64 | # print('lm_dict',lm_dict['bert.encoder.layer.0.attention.self.key.weight']) 65 | pretrained_dict = {k[5:]: v for k, v in lm_dict.items() if k[5:] in model_dict.keys()} 66 | # print('pretrained_dict',pretrained_dict) 67 | model.load_state_dict(pretrained_dict) 68 | torch.save(model.state_dict(),'1.41_bert_weight.bin') 69 | model_transfer() 70 | # bert_model=BertModel(config=BertConfig.from_json_file(args.bert_config_json)) 71 | # # bert_model_weight=torch.load('bert_weight.bin') 72 | # # print(bert_model) 73 | # # bert_model.load_state_dict(bert_model_weight) 74 | # for k,v in bert_model.named_parameters(): 75 | # print(k,v) 76 | # 
lm_dict = torch.load('./lm_smallBert/outputs/60000_pytorch_model.bin') 77 | # for k, v in lm_dict.items(): 78 | # print(k, v) 79 | -------------------------------------------------------------------------------- /Pretraining_Bert/lm_bert/utils.py: -------------------------------------------------------------------------------- 1 | 2 | def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list): 3 | """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but 4 | with several refactors to clean it up and remove a lot of unnecessary variables.""" 5 | cand_indices = [] 6 | for (i, token) in enumerate(tokens): 7 | if token == "[CLS]" or token == "[SEP]": 8 | continue 9 | cand_indices.append(i) 10 | 11 | num_to_mask = min(max_predictions_per_seq, 12 | max(1, int(round(len(tokens) * masked_lm_prob)))) 13 | # print(num_to_mask) 14 | # print("tokens", len(tokens)) 15 | # print("cand", len(cand_indices)) 16 | shuffle(cand_indices) 17 | mask_indices = sorted(sample(cand_indices, num_to_mask)) 18 | masked_token_labels = [] 19 | for index in mask_indices: 20 | # 80% of the time, replace with [MASK] 21 | if random() < 0.8: 22 | masked_token = "[MASK]" 23 | else: 24 | # 10% of the time, keep original 25 | if random() < 0.5: 26 | masked_token = tokens[index] 27 | # 10% of the time, replace with random word 28 | else: 29 | masked_token = choice(vocab_list) 30 | masked_token_labels.append(tokens[index]) 31 | # Once we've saved the true label for that token, we can overwrite it with the masked version 32 | tokens[index] = masked_token 33 | 34 | return tokens, mask_indices, masked_token_labels 35 | 36 | class InputFeatures(object): 37 | """A single set of features of data.""" 38 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 39 | self.input_ids = input_ids 40 | self.input_mask = input_mask 41 | self.segment_ids = segment_ids 42 | self.label_id = label_id 43 | def create_examples(data_path, max_seq_length, masked_lm_prob, max_predictions_per_seq, vocab_list): 44 | """Creates examples for the training and dev sets.""" 45 | examples = [] 46 | max_num_tokens = max_seq_length - 2 47 | fr = open(data_path, "r") 48 | for (i, line) in tqdm(enumerate(fr), desc="Creating Example"): 49 | tokens_a = line.strip("\n").split()[:max_num_tokens] 50 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 51 | segment_ids = [0 for _ in range(len(tokens_a) + 2)] 52 | # remove too short sample 53 | if len(tokens_a) < 5: 54 | continue 55 | tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions( 56 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_list) 57 | example = { 58 | "tokens": tokens, 59 | "segment_ids": segment_ids, 60 | "masked_lm_positions": masked_lm_positions, 61 | "masked_lm_labels": masked_lm_labels} 62 | examples.append(example) 63 | fr.close() 64 | return examples 65 | 66 | 67 | def convert_examples_to_features(examples, max_seq_length, tokenizer): 68 | features = [] 69 | for i, example in tqdm(enumerate(examples), desc="Converting Feature"): 70 | tokens = example["tokens"] 71 | segment_ids = example["segment_ids"] 72 | masked_lm_positions = example["masked_lm_positions"] 73 | masked_lm_labels = example["masked_lm_labels"] 74 | assert len(tokens) == len(segment_ids) <= max_seq_length # The preprocessed data should be already truncated 75 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 76 | masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) 77 | 78 | input_array = 
np.zeros(max_seq_length, dtype=np.int) 79 | input_array[:len(input_ids)] = input_ids 80 | 81 | mask_array = np.zeros(max_seq_length, dtype=np.bool) 82 | mask_array[:len(input_ids)] = 1 83 | 84 | segment_array = np.zeros(max_seq_length, dtype=np.bool) 85 | segment_array[:len(segment_ids)] = segment_ids 86 | 87 | lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1) 88 | lm_label_array[masked_lm_positions] = masked_label_ids 89 | 90 | feature = InputFeatures(input_ids=input_array, 91 | input_mask=mask_array, 92 | segment_ids=segment_array, 93 | label_id=lm_label_array) 94 | features.append(feature) 95 | # if i < 10: 96 | # logger.info("input_ids: %s\ninput_mask:%s\nsegment_ids:%s\nlabel_id:%s" %(input_array, mask_array, segment_array, lm_label_array)) 97 | return features 98 | # from __future__ import absolute_import, division, print_function 99 | 100 | import pretraining_args as args 101 | import csv 102 | import logging 103 | import os 104 | import random 105 | random.seed(args.seed) 106 | import sys 107 | from glob import glob 108 | import numpy as np 109 | import torch 110 | 111 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 112 | TensorDataset) 113 | from torch.utils.data.distributed import DistributedSampler 114 | from tqdm import tqdm, trange 115 | from random import random, randrange, randint, shuffle, choice, sample 116 | from torch.nn import CrossEntropyLoss, MSELoss 117 | from scipy.stats import pearsonr, spearmanr 118 | from sklearn.metrics import matthews_corrcoef, f1_score 119 | 120 | from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME 121 | from pytorch_pretrained_bert.modeling import BertForMaskedLM, BertConfig 122 | from pytorch_pretrained_bert.tokenization import BertTokenizer 123 | from pytorch_pretrained_bert.optimization import BertAdam 124 | print(args.bert_config_json) 125 | vocab_list = [] 126 | with open(args.vocab_file, 'r') as fr: 127 | for line in fr: 128 | vocab_list.append(line.strip("\n")) 129 | tokenizer = BertTokenizer(vocab_file=args.vocab_file) 130 | 131 | model = BertForMaskedLM(config=BertConfig.from_json_file(args.bert_config_json)) 132 | model.load_state_dict(torch.load('/home/hing/bert/Pretraining_Bert_From_Scratch/lm_smallBert/outputs/60000_pytorch_model.bin')) 133 | for k,v in model.named_parameters(): 134 | print(k,v) 135 | pretrain_=BertForMaskedLM(args.bert_config_json) 136 | eval_examples = create_examples(data_path=args.pretrain_dev_path, 137 | max_seq_length=args.max_seq_length, 138 | masked_lm_prob=args.masked_lm_prob, 139 | max_predictions_per_seq=args.max_predictions_per_seq, 140 | vocab_list=vocab_list) 141 | eval_features = convert_examples_to_features( 142 | eval_examples, args.max_seq_length, tokenizer) 143 | all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) 144 | all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) 145 | all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) 146 | all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) 147 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) 148 | # Run prediction for full data 149 | eval_sampler = SequentialSampler(eval_data) 150 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 151 | model.eval() 152 | eval_loss = 0 153 | nb_eval_steps = 0 154 | # device = torch.device("cuda" if torch.cuda.is_available() 
and not args.no_cuda else "cpu") 155 | device = torch.device("cpu") 156 | 157 | model.to(device) 158 | for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): 159 | input_ids = input_ids.to(device) 160 | input_mask = input_mask.to(device) 161 | segment_ids = segment_ids.to(device) 162 | label_ids = label_ids.to(device) 163 | 164 | with torch.no_grad(): 165 | loss = model(input_ids, segment_ids, input_mask, label_ids) 166 | print(loss) -------------------------------------------------------------------------------- /model/crf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import torch 3 | from torch.autograd import Variable 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | def log_sum_exp(vec, m_size): 8 | """ 9 | Args: 10 | vec: size=(batch_size, vanishing_dim, hidden_dim) 11 | m_size: hidden_dim 12 | 13 | Returns: 14 | size=(batch_size, hidden_dim) 15 | """ 16 | _, idx = torch.max(vec, 1) # B * 1 * M 17 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 18 | return max_score.view(-1, m_size) + torch.log(torch.sum( 19 | torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, m_size) 20 | 21 | 22 | class CRF(nn.Module): 23 | 24 | def __init__(self, **kwargs): 25 | """ 26 | Args: 27 | target_size: int, target size 28 | use_cuda: bool, 是否使用gpu, default is True 29 | average_batch: bool, loss是否作平均, default is True 30 | """ 31 | super(CRF, self).__init__() 32 | for k in kwargs: 33 | self.__setattr__(k, kwargs[k]) 34 | self.START_TAG_IDX, self.END_TAG_IDX = -2, -1 35 | init_transitions = torch.zeros(self.target_size+2, self.target_size+2) 36 | init_transitions[:, self.START_TAG_IDX] = -1000. 37 | init_transitions[self.END_TAG_IDX, :] = -1000. 38 | if self.use_cuda: 39 | init_transitions = init_transitions.cuda() 40 | self.transitions = nn.Parameter(init_transitions) 41 | 42 | def _forward_alg(self, feats, mask=None): 43 | """ 44 | Do the forward algorithm to compute the partition function (batched). 
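        Emission scores are expanded to shape (batch_size*seq_len, tag_size, tag_size)
        and added to the transition matrix; the running partition is then updated step
        by step with a batched log-sum-exp, and padded positions (mask == 0) keep the
        previous partition value.  Returns the summed log-partition over the batch
        together with the per-step score tensor, which _score_sentence reuses when
        computing the gold-path score.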
45 | 46 | Args: 47 | feats: size=(batch_size, seq_len, self.target_size+2) 48 | mask: size=(batch_size, seq_len) 49 | 50 | Returns: 51 | xxx 52 | """ 53 | batch_size = feats.size(0) 54 | seq_len = feats.size(1) 55 | tag_size = feats.size(-1) 56 | 57 | mask = mask.transpose(1, 0).contiguous() 58 | ins_num = batch_size * seq_len 59 | feats = feats.transpose(1, 0).contiguous().view( 60 | ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 61 | 62 | scores = feats + self.transitions.view( 63 | 1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 64 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 65 | seq_iter = enumerate(scores) 66 | try: 67 | _, inivalues = seq_iter.__next__() 68 | except: 69 | _, inivalues = seq_iter.next() 70 | 71 | partition = inivalues[:, self.START_TAG_IDX, :].clone().view(batch_size, tag_size, 1) 72 | for idx, cur_values in seq_iter: 73 | cur_values = cur_values + partition.contiguous().view( 74 | batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 75 | cur_partition = log_sum_exp(cur_values, tag_size) 76 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 77 | masked_cur_partition = cur_partition.masked_select(mask_idx.byte()) 78 | if masked_cur_partition.dim() != 0: 79 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 80 | partition.masked_scatter_(mask_idx.byte(), masked_cur_partition) 81 | cur_values = self.transitions.view(1, tag_size, tag_size).expand( 82 | batch_size, tag_size, tag_size) + partition.contiguous().view( 83 | batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 84 | cur_partition = log_sum_exp(cur_values, tag_size) 85 | final_partition = cur_partition[:, self.END_TAG_IDX] 86 | return final_partition.sum(), scores 87 | 88 | def _viterbi_decode(self, feats, mask=None): 89 | """ 90 | Args: 91 | feats: size=(batch_size, seq_len, self.target_size+2) 92 | mask: size=(batch_size, seq_len) 93 | 94 | Returns: 95 | decode_idx: (batch_size, seq_len), viterbi decode结果 96 | path_score: size=(batch_size, 1), 每个句子的得分 97 | """ 98 | batch_size = feats.size(0) 99 | seq_len = feats.size(1) 100 | tag_size = feats.size(-1) 101 | 102 | length_mask = torch.sum(mask, dim=1).view(batch_size, 1).long() 103 | mask = mask.transpose(1, 0).contiguous() 104 | ins_num = seq_len * batch_size 105 | feats = feats.transpose(1, 0).contiguous().view( 106 | ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 107 | 108 | scores = feats + self.transitions.view( 109 | 1, tag_size, tag_size).expand(ins_num, tag_size, tag_size) 110 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 111 | 112 | seq_iter = enumerate(scores) 113 | # record the position of the best score 114 | back_points = list() 115 | partition_history = list() 116 | mask = (1 - mask.long()).byte() 117 | try: 118 | _, inivalues = seq_iter.__next__() 119 | except: 120 | _, inivalues = seq_iter.next() 121 | partition = inivalues[:, self.START_TAG_IDX, :].clone().view(batch_size, tag_size, 1) 122 | partition_history.append(partition) 123 | 124 | for idx, cur_values in seq_iter: 125 | cur_values = cur_values + partition.contiguous().view( 126 | batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 127 | partition, cur_bp = torch.max(cur_values, 1) 128 | partition_history.append(partition.unsqueeze(-1)) 129 | 130 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 131 | back_points.append(cur_bp) 132 | 133 | partition_history = torch.cat(partition_history).view( 134 | seq_len, 
batch_size, -1).transpose(1, 0).contiguous() 135 | 136 | last_position = length_mask.view(batch_size, 1, 1).expand(batch_size, 1, tag_size) - 1 137 | last_partition = torch.gather( 138 | partition_history, 1, last_position).view(batch_size, tag_size, 1) 139 | 140 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + \ 141 | self.transitions.view(1, tag_size, tag_size).expand(batch_size, tag_size, tag_size) 142 | _, last_bp = torch.max(last_values, 1) 143 | pad_zero = Variable(torch.zeros(batch_size, tag_size)).long() 144 | if self.use_cuda: 145 | pad_zero = pad_zero.cuda() 146 | back_points.append(pad_zero) 147 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 148 | 149 | pointer = last_bp[:, self.END_TAG_IDX] 150 | insert_last = pointer.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, tag_size) 151 | back_points = back_points.transpose(1, 0).contiguous() 152 | 153 | back_points.scatter_(1, last_position, insert_last) 154 | 155 | back_points = back_points.transpose(1, 0).contiguous() 156 | 157 | decode_idx = Variable(torch.LongTensor(seq_len, batch_size)) 158 | if self.use_cuda: 159 | decode_idx = decode_idx.cuda() 160 | decode_idx[-1] = pointer.data 161 | for idx in range(len(back_points)-2, -1, -1): 162 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 163 | decode_idx[idx] = pointer.view(-1).data 164 | path_score = None 165 | decode_idx = decode_idx.transpose(1, 0) 166 | return path_score, decode_idx 167 | 168 | def forward(self, feats, mask=None): 169 | path_score, best_path = self._viterbi_decode(feats, mask) 170 | return path_score, best_path 171 | 172 | def _score_sentence(self, scores, mask, tags): 173 | """ 174 | Args: 175 | scores: size=(seq_len, batch_size, tag_size, tag_size) 176 | mask: size=(batch_size, seq_len) 177 | tags: size=(batch_size, seq_len) 178 | 179 | Returns: 180 | score: 181 | """ 182 | batch_size = scores.size(1) 183 | seq_len = scores.size(0) 184 | tag_size = scores.size(-1) 185 | 186 | new_tags = Variable(torch.LongTensor(batch_size, seq_len)) 187 | if self.use_cuda: 188 | new_tags = new_tags.cuda() 189 | for idx in range(seq_len): 190 | if idx == 0: 191 | new_tags[:, 0] = (tag_size - 2) * tag_size + tags[:, 0] 192 | else: 193 | new_tags[:, idx] = tags[:, idx-1] * tag_size + tags[:, idx] 194 | 195 | end_transition = self.transitions[:, self.END_TAG_IDX].contiguous().view( 196 | 1, tag_size).expand(batch_size, tag_size) 197 | length_mask = torch.sum(mask, dim=1).view(batch_size, 1).long() 198 | end_ids = torch.gather(tags, 1, length_mask-1) 199 | 200 | end_energy = torch.gather(end_transition, 1, end_ids) 201 | 202 | new_tags = new_tags.transpose(1, 0).contiguous().view(seq_len, batch_size, 1) 203 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view( 204 | seq_len, batch_size) 205 | tg_energy = tg_energy.masked_select(mask.transpose(1, 0)) 206 | 207 | gold_score = tg_energy.sum() + end_energy.sum() 208 | 209 | return gold_score 210 | 211 | def neg_log_likelihood_loss(self, feats, mask, tags): 212 | """ 213 | Args: 214 | feats: size=(batch_size, seq_len, tag_size) 215 | mask: size=(batch_size, seq_len) 216 | tags: size=(batch_size, seq_len) 217 | """ 218 | batch_size = feats.size(0) 219 | mask = mask.byte() 220 | forward_score, scores = self._forward_alg(feats, mask) 221 | gold_score = self._score_sentence(scores, mask, tags) 222 | if self.average_batch: 223 | return (forward_score - gold_score) / batch_size 224 | return forward_score - 
gold_score 225 | 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | from config import Config 6 | from model import BERT_LSTM_CRF 7 | import torch.optim as optim 8 | from utils import load_vocab, read_corpus, load_model, save_model,get_ner_fmeasure 9 | from torch.utils.data import TensorDataset 10 | from torch.utils.data import DataLoader 11 | import pretraining_args as args 12 | from pytorch_pretrained_bert.optimization import BertAdam 13 | def train(**kwargs): 14 | config = Config() 15 | config.update(**kwargs) 16 | print('当前设置为:\n', config) 17 | if args.use_cuda: 18 | torch.cuda.set_device(config.gpu) 19 | print('loading corpus') 20 | vocab = load_vocab(args.vocab_file) 21 | label_dic = load_vocab(config.label_file) 22 | index2label={v:k for k,v in label_dic.items()} 23 | tagset_size = len(label_dic) 24 | train_data,_ = read_corpus(args.pretrain_train_path, max_length=args.max_seq_length, label_dic=label_dic, vocab=vocab) 25 | dev_data,dev_len = read_corpus(args.pretrain_dev_path, max_length=args.max_seq_length, label_dic=label_dic, vocab=vocab) 26 | num_train_optimization_steps = int( 27 | len(train_data) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs 28 | if args.local_rank != -1: 29 | num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() 30 | 31 | train_ids = torch.LongTensor([temp.input_id for temp in train_data]) 32 | train_masks = torch.LongTensor([temp.input_mask for temp in train_data]) 33 | train_tags = torch.LongTensor([temp.label_id for temp in train_data]) 34 | 35 | train_dataset = TensorDataset(train_ids, train_masks, train_tags) 36 | train_loader = DataLoader(train_dataset, shuffle=True, batch_size=args.train_batch_size) 37 | 38 | dev_ids = torch.LongTensor([temp.input_id for temp in dev_data]) 39 | dev_masks = torch.LongTensor([temp.input_mask for temp in dev_data]) 40 | dev_tags = torch.LongTensor([temp.label_id for temp in dev_data]) 41 | dev_dataset = TensorDataset(dev_ids, dev_masks, dev_tags) 42 | dev_loader = DataLoader(dev_dataset, shuffle=False, batch_size=args.eval_batch_size) 43 | model = BERT_LSTM_CRF(args, tagset_size, config.bert_embedding, config.rnn_hidden, config.rnn_layer, dropout_ratio=config.dropout_ratio, dropout1=config.dropout1, use_cuda=config.use_cuda) 44 | 45 | if config.use_cuda: 46 | model=model.cuda() 47 | if config.load_model: 48 | if config.flag=='submit': 49 | assert config.load_path is not None 50 | test_data, test_len = read_corpus(args.submit_test_path, max_length=args.max_seq_length, label_dic=label_dic, 51 | vocab=vocab,flag='submit') 52 | test_ids = torch.LongTensor([temp.input_id for temp in test_data]) 53 | test_masks = torch.LongTensor([temp.input_mask for temp in test_data]) 54 | test_dataset = TensorDataset(test_ids, test_masks) 55 | test_loader = DataLoader(test_dataset, shuffle=False, batch_size=args.eval_batch_size) 56 | model = load_model(model, name=None) 57 | test(model, test_loader, config, index2label, test_len) 58 | # dev(model, test_loader, None, config) 59 | if config.flag=='test': 60 | assert config.load_path is not None 61 | test_data, test_len = read_corpus(args.pretrain_test_path, max_length=args.max_seq_length, label_dic=label_dic, 62 | vocab=vocab) 63 | test_ids = 
torch.LongTensor([temp.input_id for temp in test_data]) 64 | test_masks = torch.LongTensor([temp.input_mask for temp in test_data]) 65 | test_tags = torch.LongTensor([temp.label_id for temp in test_data]) 66 | test_dataset = TensorDataset(test_ids, test_masks, test_tags) 67 | test_loader = DataLoader(test_dataset, shuffle=False, batch_size=args.eval_batch_size) 68 | model = load_model(model, name=None) 69 | # test(model, test_loader, config, index2label, test_len) 70 | dev(model, test_loader, 0, config, index2label, dev_len) 71 | 72 | else: 73 | # print(model) 74 | model.train() 75 | bert_param_optimizer = list(model.word_embeds.named_parameters()) 76 | lstm_param_optimizer = list(model.lstm.named_parameters()) 77 | liner_param_optimizer = list(model.liner.named_parameters()) 78 | crf_param_optimizer = list(model.crf.named_parameters()) 79 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 80 | optimizer_grouped_parameters = [ 81 | {'params': [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01,'lr':config.lr}, 82 | {'params': [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0,'lr':config.lr}, 83 | {'params': [p for n, p in lstm_param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay': 0.001, 'lr': config.lr*5}, 84 | {'params': [p for n, p in lstm_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0,'lr': config.lr*5}, 85 | {'params': [p for n, p in liner_param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay': 0.001, 'lr': config.lr * 2}, 86 | {'params': [p for n, p in liner_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr':config.lr * 2}, 87 | {'params': [p for n, p in crf_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001, 'lr': config.lr * 3}, 88 | {'params': [p for n, p in crf_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0,'lr': config.lr * 3}, 89 | 90 | ] 91 | # print(optimizer_grouped_parameters) 92 | optimizer = BertAdam(optimizer_grouped_parameters, 93 | # lr=config.lr, 94 | warmup=args.warmup_proportion, 95 | t_total=num_train_optimization_steps) 96 | # optimizer = optimizer(model.parameters(), lr=config.lr, weight_decay=config.weight_decay) 97 | eval_f1 = 0.0 98 | for epoch in range(config.base_epoch): 99 | print(optimizer.get_lr()) 100 | step = 0 101 | for i, batch in enumerate(train_loader): 102 | step += 1 103 | model.zero_grad() 104 | inputs, masks, tags = batch 105 | inputs, masks, tags = Variable(inputs), Variable(masks), Variable(tags) 106 | if config.use_cuda: 107 | inputs, masks, tags = inputs.cuda(), masks.cuda(), tags.cuda() 108 | 109 | feats = model(inputs, masks) 110 | loss = model.loss(feats, masks,tags) 111 | loss.backward() 112 | optimizer.step() 113 | if step % 50 == 0: 114 | print('step: {} | epoch: {}| loss: {}'.format(step, epoch, loss.item())) 115 | f_measure = dev(model, dev_loader, epoch, config,index2label,dev_len) 116 | if eval_f1 < f_measure: 117 | eval_f1=f_measure 118 | save_model(model,epoch,f_measure) 119 | 120 | 121 | def dev(model, dev_loader, epoch, config,index2label,dev_lens): 122 | model.eval() 123 | eval_loss = 0 124 | trues = [] 125 | preds = [] 126 | length = 0 127 | for i, batch in enumerate(dev_loader): 128 | inputs, masks, tags = batch 129 | length += inputs.size(0) 130 | inputs, masks, tags = Variable(inputs), Variable(masks), Variable(tags) 131 | if config.use_cuda: 132 | inputs, masks, tags = inputs.cuda(), 
masks.cuda(), tags.cuda() 133 | feats = model(inputs, masks) 134 | path_score, best_path = model.crf(feats, masks.byte()) 135 | loss = model.loss(feats, masks, tags) 136 | eval_loss += loss.item() 137 | nums=len(best_path) 138 | for i in range(nums): 139 | pred_result=[] 140 | true_result=[] 141 | for index in list(best_path[i].cpu().numpy()): 142 | pred_result.append(index2label[index]) 143 | for index in list(tags[i].cpu().numpy()): 144 | true_result.append(index2label[index]) 145 | preds.append(pred_result) 146 | trues.append(true_result) 147 | # print(len(dev_lens)) 148 | pred_tag_lists = [preds[i][1:dev_lens[i]+1] for i in range(len(dev_lens))] 149 | tag_lists = [trues[i][1:dev_lens[i]+1] for i in range(len(dev_lens))] 150 | accuracy, precision, recall, f_measure=get_ner_fmeasure(tag_lists,pred_tag_lists) 151 | def calculate_category_f1(): 152 | print(pred_tag_lists[:25]) 153 | print(tag_lists[:25]) 154 | labels=[v for k,v in index2label.items()] 155 | truth_label_count={} 156 | predict_label_count = {} 157 | label_count={} 158 | count=0 159 | for pred,true in zip(preds,trues): 160 | for i,t in enumerate(true): 161 | if t=='' and pred[i]=='': 162 | count=count+1 163 | break 164 | else: 165 | if t not in ['', 'o', '']: 166 | if t==pred[i]: 167 | if t not in label_count: 168 | label_count[t]=1 169 | else: 170 | label_count[t] +=1 171 | if t not in truth_label_count: 172 | truth_label_count[t]=1 173 | else: 174 | truth_label_count[t]+=1 175 | if pred[i] not in predict_label_count: 176 | predict_label_count[pred[i]]=1 177 | else: 178 | predict_label_count[pred[i]] += 1 179 | precision={} 180 | recall={} 181 | f1={} 182 | # print(label_count.keys()) 183 | # print(predict_label_count.keys()) 184 | # print(truth_label_count.keys()) 185 | for label in labels: 186 | if label in label_count: 187 | precision[label]=label_count[label]/predict_label_count[label] 188 | recall[label]=label_count[label]/truth_label_count[label] 189 | f1[label]=2*precision[label]*recall[label]/(precision[label]+recall[label]) 190 | 191 | # print(sum(precision.values())/len(truth_label_count)) 192 | # print(sum(recall.values())/len(truth_label_count)) 193 | 194 | print(precision) 195 | print(recall) 196 | print(f1) 197 | # print(truth_label_count) 198 | print('eval epoch: {}| loss: {}'.format(epoch, eval_loss/length)) 199 | model.train() 200 | return f_measure 201 | def test(model, test_loader, config,index2label,dev_lens): 202 | model.eval() 203 | preds = [] 204 | length = 0 205 | for i, batch in enumerate(test_loader): 206 | inputs, masks = batch 207 | length += inputs.size(0) 208 | inputs, masks = Variable(inputs), Variable(masks) 209 | if config.use_cuda: 210 | inputs, masks = inputs.cuda(), masks.cuda() 211 | feats = model(inputs, masks) 212 | path_score, best_path = model.crf(feats, masks.byte()) 213 | nums=len(best_path) 214 | for i in range(nums): 215 | pred_result=[] 216 | for index in list(best_path[i].cpu().numpy()): 217 | pred_result.append(index2label[index]) 218 | preds.append(pred_result) 219 | # print(len(dev_lens)) 220 | pred_tag_lists = [preds[i][1:dev_lens[i]+1] for i in range(len(dev_lens))] 221 | f=open('predict.txt','w') 222 | for pred_tag_list in pred_tag_lists: 223 | f.write(' '.join(pred_tag_list) + '\n') 224 | f.close() 225 | 226 | if __name__ == '__main__': 227 | train() 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | -------------------------------------------------------------------------------- /utils.py: 
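
A note on the corpus format consumed by read_corpus below: each labelled line is
expected to hold the space-separated tokens and the space-separated tags joined by
'|||', aligned one-to-one.  The token values in this illustration are invented, since
the Daguan data uses anonymised word ids:

    103 4587 221 9 7756|||o a-B a-M a-E o

Submission files read with flag='submit' contain only the space-separated tokens.
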
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import torch 3 | import os 4 | import datetime 5 | import unicodedata 6 | 7 | 8 | class InputFeatures(object): 9 | def __init__(self, input_id, label_id, input_mask): 10 | self.input_id = input_id 11 | self.label_id = label_id 12 | self.input_mask = input_mask 13 | 14 | 15 | def load_vocab(vocab_file): 16 | """Loads a vocabulary file into a dictionary.""" 17 | vocab = {} 18 | index = 0 19 | with open(vocab_file, "r", encoding="utf-8") as reader: 20 | while True: 21 | token = reader.readline() 22 | if not token: 23 | break 24 | token = token.strip() 25 | vocab[token] = index 26 | index += 1 27 | return vocab 28 | 29 | 30 | def read_corpus(path, max_length, label_dic, vocab,flag=None): 31 | """ 32 | :param path:数据文件路径 33 | :param max_length: 最大长度 34 | :param label_dic: 标签字典 35 | :return: 36 | """ 37 | file = open(path, encoding='utf-8') 38 | content = file.readlines() 39 | file.close() 40 | result = [] 41 | lengths = [] 42 | if flag=='submit': 43 | for line in content: 44 | tokens = line.strip().split(' ') 45 | length=len(tokens) 46 | if len(tokens) > max_length-2: 47 | length=max_length-2 48 | tokens = tokens[0:(max_length-2)] 49 | lengths.append(length) 50 | tokens_f =['[CLS]'] + tokens + ['[SEP]'] 51 | input_ids = [int(vocab[i]) if i in vocab else int(vocab['[UNK]']) for i in tokens_f] 52 | input_mask = [1] * len(input_ids) 53 | while len(input_ids) < max_length: 54 | input_ids.append(0) 55 | input_mask.append(0) 56 | assert len(input_ids) == max_length 57 | assert len(input_mask) == max_length 58 | feature = InputFeatures(input_id=input_ids, input_mask=input_mask, label_id=None) 59 | result.append(feature) 60 | return result,lengths 61 | else: 62 | for line in content: 63 | text, label = line.strip().split('|||') 64 | tokens = text.split() 65 | label = label.split() 66 | length=len(tokens) 67 | if len(tokens) > max_length-2: 68 | length=max_length-2 69 | tokens = tokens[0:(max_length-2)] 70 | label = label[0:(max_length-2)] 71 | lengths.append(length) 72 | tokens_f =['[CLS]'] + tokens + ['[SEP]'] 73 | label_f = [""] + label + [''] 74 | input_ids = [int(vocab[i]) if i in vocab else int(vocab['[UNK]']) for i in tokens_f] 75 | label_ids = [label_dic[i] for i in label_f] 76 | input_mask = [1] * len(input_ids) 77 | while len(input_ids) < max_length: 78 | input_ids.append(0) 79 | input_mask.append(0) 80 | label_ids.append(label_dic['']) 81 | assert len(input_ids) == max_length 82 | assert len(input_mask) == max_length 83 | assert len(label_ids) == max_length 84 | feature = InputFeatures(input_id=input_ids, input_mask=input_mask, label_id=label_ids) 85 | result.append(feature) 86 | return result,lengths 87 | 88 | 89 | def save_model(model, epoch,eval_f1, path='result', **kwargs): 90 | """ 91 | 默认保留所有模型 92 | :param model: 模型 93 | :param path: 保存路径 94 | :param loss: 校验损失 95 | :param last_loss: 最佳epoch损失 96 | :param kwargs: every_epoch or best_epoch 97 | :return: 98 | """ 99 | if not os.path.exists(path): 100 | os.mkdir(path) 101 | if kwargs.get('name', None) is None: 102 | cur_time = datetime.datetime.now().strftime('%Y-%m-%d#%H:%M:%S') 103 | name ='epoch_{}'.format(epoch)+'_'+str(eval_f1)[2:6] 104 | full_name = os.path.join(path, name) 105 | torch.save(model.state_dict(), full_name) 106 | print('Saved model at epoch {} successfully'.format(epoch)) 107 | with open('{}/checkpoint'.format(path), 'w') as file: 108 | file.write(name) 109 | print('Write to checkpoint') 110 | 111 | 112 | def 
load_model(model, path='result/8918', **kwargs): 113 | if kwargs.get('name', None) is None: 114 | with open('{}/checkpoint'.format(path)) as file: 115 | content = file.read().strip() 116 | name = os.path.join(path, content) 117 | else: 118 | name=kwargs['name'] 119 | name = os.path.join(path,name) 120 | model.load_state_dict(torch.load(name, map_location=lambda storage, loc: storage)) 121 | print('load model {} successfully'.format(name)) 122 | return model 123 | 124 | def data_split(): 125 | def train_count(): 126 | #16941 17000 0.9965294117647059 127 | f=open('./data/normal_daguan_train.txt') 128 | datas=f.readlines() 129 | f.close() 130 | count=0 131 | for data in datas: 132 | words=data.split('|||')[0].split(' ') 133 | if len(words)<=256: 134 | count+=1 135 | print(count,len(datas),count/len(datas)) 136 | def test_count(): 137 | #2989 3000 0.9963333333333333 138 | f=open('./data/normal_daguan_test.txt') 139 | datas=f.readlines() 140 | f.close() 141 | count=0 142 | for data in datas: 143 | words=data.split(' ') 144 | if len(words)<=256: 145 | count+=1 146 | print(count,len(datas),count/len(datas)) 147 | 148 | def split(): 149 | # 16941 17000 0.9965294117647059 150 | f = open('./data/normal_daguan_train.txt') 151 | datas = f.readlines() 152 | f.close() 153 | f = open('./data/train.txt','w') 154 | for data in datas[:14000]: 155 | f.write(data) 156 | f.close() 157 | f = open('./data/dev.txt', 'w') 158 | for data in datas[14000:15500]: 159 | f.write(data) 160 | f.close() 161 | f = open('./data/test.txt', 'w') 162 | for data in datas[15500:]: 163 | f.write(data) 164 | f.close() 165 | train_count() 166 | test_count() 167 | # split() 168 | data_split() 169 | def get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"): 170 | sent_num = len(golden_lists) 171 | golden_full = [] 172 | predict_full = [] 173 | right_full = [] 174 | right_tag = 0 175 | all_tag = 0 176 | for idx in range(0,sent_num): 177 | # word_list = sentence_lists[idx] 178 | golden_list = golden_lists[idx] 179 | predict_list = predict_lists[idx] 180 | for idy in range(len(golden_list)): 181 | if golden_list[idy] == predict_list[idy]: 182 | right_tag += 1 183 | all_tag += len(golden_list) 184 | if label_type == "BMES": 185 | gold_matrix = get_ner_BMES(golden_list) 186 | pred_matrix = get_ner_BMES(predict_list) 187 | else: 188 | gold_matrix = get_ner_BIO(golden_list) 189 | pred_matrix = get_ner_BIO(predict_list) 190 | # print "gold", gold_matrix 191 | # print "pred", pred_matrix 192 | right_ner = list(set(gold_matrix).intersection(set(pred_matrix))) 193 | golden_full += gold_matrix 194 | predict_full += pred_matrix 195 | right_full += right_ner 196 | right_num = len(right_full) 197 | golden_num = len(golden_full) 198 | predict_num = len(predict_full) 199 | if predict_num == 0: 200 | precision = -1 201 | else: 202 | precision = (right_num+0.0)/predict_num 203 | if golden_num == 0: 204 | recall = -1 205 | else: 206 | recall = (right_num+0.0)/golden_num 207 | if (precision == -1) or (recall == -1) or (precision+recall) <= 0.: 208 | f_measure = -1 209 | else: 210 | f_measure = 2*precision*recall/(precision+recall) 211 | accuracy = (right_tag+0.0)/all_tag 212 | # print "Accuracy: ", right_tag,"/",all_tag,"=",accuracy 213 | print ("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num) 214 | # print('accuracy=',accuracy,' precision=',precision,' recall=',recall,' f_measure=',f_measure) 215 | print('acc=',round(accuracy,4),' p=',round(precision,4),' r=',round(recall,4),' f1=',round(f_measure,4)) 216 | 
return accuracy, precision, recall, f_measure 217 | 218 | 219 | def get_ner_BMES(label_list): 220 | # list_len = len(word_list) 221 | # assert(list_len == len(label_list)), "word list size unmatch with label list" 222 | list_len = len(label_list) 223 | begin_label = 'B-' 224 | end_label = 'E-' 225 | single_label = 'S-' 226 | whole_tag = '' 227 | index_tag = '' 228 | tag_list = [] 229 | stand_matrix = [] 230 | for i in range(0, list_len): 231 | # wordlabel = word_list[i] 232 | current_label = label_list[i].upper() 233 | if begin_label in current_label: 234 | if index_tag != '': 235 | tag_list.append(whole_tag + ',' + str(i - 1)) 236 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i) 237 | index_tag = current_label.replace(begin_label, "", 1) 238 | 239 | elif single_label in current_label: 240 | if index_tag != '': 241 | tag_list.append(whole_tag + ',' + str(i - 1)) 242 | whole_tag = current_label.replace(single_label, "", 1) + '[' + str(i) 243 | tag_list.append(whole_tag) 244 | whole_tag = "" 245 | index_tag = "" 246 | elif end_label in current_label: 247 | if index_tag != '': 248 | tag_list.append(whole_tag + ',' + str(i)) 249 | whole_tag = '' 250 | index_tag = '' 251 | else: 252 | continue 253 | if (whole_tag != '') & (index_tag != ''): 254 | tag_list.append(whole_tag) 255 | tag_list_len = len(tag_list) 256 | 257 | for i in range(0, tag_list_len): 258 | if len(tag_list[i]) > 0: 259 | tag_list[i] = tag_list[i] + ']' 260 | insert_list = reverse_style(tag_list[i]) 261 | stand_matrix.append(insert_list) 262 | # print stand_matrix 263 | return stand_matrix 264 | def get_ner_BIO(label_list): 265 | # list_len = len(word_list) 266 | # assert(list_len == len(label_list)), "word list size unmatch with label list" 267 | list_len = len(label_list) 268 | begin_label = 'B-' 269 | inside_label = 'I-' 270 | whole_tag = '' 271 | index_tag = '' 272 | tag_list = [] 273 | stand_matrix = [] 274 | for i in range(0, list_len): 275 | # wordlabel = word_list[i] 276 | current_label = label_list[i].upper() 277 | if begin_label in current_label: 278 | if index_tag == '': 279 | whole_tag = current_label.replace(begin_label,"",1) +'[' +str(i) 280 | index_tag = current_label.replace(begin_label,"",1) 281 | else: 282 | tag_list.append(whole_tag + ',' + str(i-1)) 283 | whole_tag = current_label.replace(begin_label,"",1) + '[' + str(i) 284 | index_tag = current_label.replace(begin_label,"",1) 285 | 286 | elif inside_label in current_label: 287 | if current_label.replace(inside_label,"",1) == index_tag: 288 | whole_tag = whole_tag 289 | else: 290 | if (whole_tag != '')&(index_tag != ''): 291 | tag_list.append(whole_tag +',' + str(i-1)) 292 | whole_tag = '' 293 | index_tag = '' 294 | else: 295 | if (whole_tag != '')&(index_tag != ''): 296 | tag_list.append(whole_tag +',' + str(i-1)) 297 | whole_tag = '' 298 | index_tag = '' 299 | 300 | if (whole_tag != '')&(index_tag != ''): 301 | tag_list.append(whole_tag) 302 | tag_list_len = len(tag_list) 303 | 304 | for i in range(0, tag_list_len): 305 | if len(tag_list[i]) > 0: 306 | tag_list[i] = tag_list[i]+ ']' 307 | insert_list = reverse_style(tag_list[i]) 308 | stand_matrix.append(insert_list) 309 | return stand_matrix 310 | def reverse_style(input_string): 311 | target_position = input_string.index('[') 312 | input_len = len(input_string) 313 | output_string = input_string[target_position:input_len] + input_string[0:target_position] 314 | return output_string -------------------------------------------------------------------------------- 
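
A hedged worked example of the span-level metrics defined in utils.py above, assuming
tags in the prefix form 'B-a'/'M-a'/'E-a'/'S-a', which is what the substring checks in
get_ner_BMES match on (data/tag.txt spells its tags the other way round, e.g. 'a-B', so
some label-format conversion appears to be expected before scoring).  Importing utils
also runs data_split() at module level, which needs the files under ./data to exist:

    from utils import get_ner_fmeasure

    gold = [['B-a', 'M-a', 'E-a', 'o', 'S-b']]
    pred = [['B-a', 'M-a', 'E-a', 'o', 'o']]
    acc, p, r, f1 = get_ner_fmeasure(gold, pred, label_type='BMES')
    # one of the two gold entities is recovered:
    # acc = 0.8, p = 1.0, r = 0.5, f1 ≈ 0.667
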
/Pretraining_Bert/lm_bert/run_pretraining.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import pretraining_args as args 4 | import csv 5 | import logging 6 | import os 7 | import random 8 | random.seed(args.seed) 9 | import sys 10 | from glob import glob 11 | import numpy as np 12 | import torch 13 | 14 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 15 | TensorDataset) 16 | from torch.utils.data.distributed import DistributedSampler 17 | from tqdm import tqdm, trange 18 | from random import random, randrange, randint, shuffle, choice, sample 19 | from torch.nn import CrossEntropyLoss, MSELoss 20 | from scipy.stats import pearsonr, spearmanr 21 | from sklearn.metrics import matthews_corrcoef, f1_score 22 | 23 | from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME 24 | from pytorch_pretrained_bert.modeling import BertForMaskedLM, BertConfig 25 | from pytorch_pretrained_bert.tokenization import BertTokenizer 26 | from pytorch_pretrained_bert.optimization import BertAdam 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def warmup_linear(x, warmup=0.002): 32 | if x < warmup: 33 | return x/warmup 34 | return 1.0 - x 35 | 36 | 37 | class InputFeatures(object): 38 | """A single set of features of data.""" 39 | def __init__(self, input_ids, input_mask, segment_ids, label_id): 40 | self.input_ids = input_ids 41 | self.input_mask = input_mask 42 | self.segment_ids = segment_ids 43 | self.label_id = label_id 44 | 45 | 46 | def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, vocab_list): 47 | """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but 48 | with several refactors to clean it up and remove a lot of unnecessary variables.""" 49 | cand_indices = [] 50 | for (i, token) in enumerate(tokens): 51 | if token == "[CLS]" or token == "[SEP]": 52 | continue 53 | cand_indices.append(i) 54 | 55 | num_to_mask = min(max_predictions_per_seq, 56 | max(1, int(round(len(tokens) * masked_lm_prob)))) 57 | # print(num_to_mask) 58 | # print("tokens", len(tokens)) 59 | # print("cand", len(cand_indices)) 60 | shuffle(cand_indices) 61 | mask_indices = sorted(sample(cand_indices, num_to_mask)) 62 | masked_token_labels = [] 63 | for index in mask_indices: 64 | # 80% of the time, replace with [MASK] 65 | if random() < 0.8: 66 | masked_token = "[MASK]" 67 | else: 68 | # 10% of the time, keep original 69 | if random() < 0.5: 70 | masked_token = tokens[index] 71 | # 10% of the time, replace with random word 72 | else: 73 | masked_token = choice(vocab_list) 74 | masked_token_labels.append(tokens[index]) 75 | # Once we've saved the true label for that token, we can overwrite it with the masked version 76 | tokens[index] = masked_token 77 | 78 | return tokens, mask_indices, masked_token_labels 79 | 80 | 81 | def create_examples(data_path, max_seq_length, masked_lm_prob, max_predictions_per_seq, vocab_list): 82 | """Creates examples for the training and dev sets.""" 83 | examples = [] 84 | max_num_tokens = max_seq_length - 2 85 | fr = open(data_path, "r") 86 | for (i, line) in tqdm(enumerate(fr), desc="Creating Example"): 87 | # if i>=1000: 88 | # break 89 | tokens_a = line.strip("\n").split()[:max_num_tokens] 90 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 91 | segment_ids = [0 for _ in range(len(tokens_a) + 2)] 92 | # remove too short sample 93 | if len(tokens_a) < 5: 
94 | continue 95 | tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions( 96 | tokens, masked_lm_prob, max_predictions_per_seq, vocab_list) 97 | example = { 98 | "tokens": tokens, 99 | "segment_ids": segment_ids, 100 | "masked_lm_positions": masked_lm_positions, 101 | "masked_lm_labels": masked_lm_labels} 102 | examples.append(example) 103 | fr.close() 104 | return examples 105 | 106 | 107 | def convert_examples_to_features(examples, max_seq_length, tokenizer): 108 | features = [] 109 | for i, example in tqdm(enumerate(examples), desc="Converting Feature"): 110 | tokens = example["tokens"] 111 | segment_ids = example["segment_ids"] 112 | masked_lm_positions = example["masked_lm_positions"] 113 | masked_lm_labels = example["masked_lm_labels"] 114 | assert len(tokens) == len(segment_ids) <= max_seq_length # The preprocessed data should be already truncated 115 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 116 | masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels) 117 | 118 | input_array = np.zeros(max_seq_length, dtype=np.int) 119 | input_array[:len(input_ids)] = input_ids 120 | 121 | mask_array = np.zeros(max_seq_length, dtype=np.bool) 122 | mask_array[:len(input_ids)] = 1 123 | 124 | segment_array = np.zeros(max_seq_length, dtype=np.bool) 125 | segment_array[:len(segment_ids)] = segment_ids 126 | 127 | lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1) 128 | lm_label_array[masked_lm_positions] = masked_label_ids 129 | 130 | feature = InputFeatures(input_ids=input_array, 131 | input_mask=mask_array, 132 | segment_ids=segment_array, 133 | label_id=lm_label_array) 134 | features.append(feature) 135 | # if i < 10: 136 | # logger.info("input_ids: %s\ninput_mask:%s\nsegment_ids:%s\nlabel_id:%s" %(input_array, mask_array, segment_array, lm_label_array)) 137 | return features 138 | 139 | 140 | 141 | def main(): 142 | if args.local_rank == -1 or args.no_cuda: 143 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 144 | n_gpu = torch.cuda.device_count() 145 | else: 146 | torch.cuda.set_device(args.local_rank) 147 | device = torch.device("cuda", args.local_rank) 148 | n_gpu = 1 149 | # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 150 | torch.distributed.init_process_group(backend='nccl') 151 | 152 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 153 | datefmt='%m/%d/%Y %H:%M:%S', 154 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 155 | 156 | logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( 157 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) 158 | 159 | if args.gradient_accumulation_steps < 1: 160 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 161 | args.gradient_accumulation_steps)) 162 | 163 | args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps 164 | 165 | 166 | np.random.seed(args.seed) 167 | torch.manual_seed(args.seed) 168 | if n_gpu > 0: 169 | torch.cuda.manual_seed_all(args.seed) 170 | 171 | # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: 172 | # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) 173 | if not os.path.exists(args.output_dir): 174 | os.makedirs(args.output_dir) 175 | 176 | tokenizer = BertTokenizer(vocab_file=args.vocab_file) 177 | 178 | train_examples = None 179 
def main():
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer(vocab_file=args.vocab_file)

    train_examples = None
    num_train_optimization_steps = None
    vocab_list = []
    with open(args.vocab_file, 'r') as fr:
        for line in fr:
            vocab_list.append(line.strip("\n"))

    if args.do_train:
        train_examples = create_examples(data_path=args.pretrain_train_path,
                                         max_seq_length=args.max_seq_length,
                                         masked_lm_prob=args.masked_lm_prob,
                                         max_predictions_per_seq=args.max_predictions_per_seq,
                                         vocab_list=vocab_list)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    model = BertForMaskedLM(config=BertConfig.from_json_file(args.bert_config_json))
    # model.load_state_dict(torch.load('./outputs/60000_pytorch_model.bin'))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay for biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    iter_nums = 0
    best_loss = 100000

    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        global_step = 0
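        # Training-loop notes:
        #   * On multi-GPU (DataParallel) runs the per-GPU losses are averaged, and the loss
        #     is divided by gradient_accumulation_steps, so gradients accumulate across
        #     micro-batches and optimizer.step() runs once every gradient_accumulation_steps batches.
        #   * With --fp16 the learning rate warmup is applied manually via warmup_linear();
        #     otherwise BertAdam handles the warmup schedule internally.
        #   * iter_nums counts raw batches (used for the periodic logging/evaluation below),
        #     while global_step counts optimizer updates.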
        for e in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                iter_nums += 1
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                # masked_lm_loss
                loss = model(input_ids, segment_ids, input_mask, label_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_optimization_steps,
                                                                          args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    # global_step is only advanced on real optimizer updates; incrementing it
                    # once per batch as well would double-count and distort the warmup schedule.
                    global_step += 1

                if nb_tr_steps > 0 and nb_tr_steps % 10000 == 0:
                    logger.info("===================== -epoch %d -train_step %d -iter_nums %d -train_loss %.4f\n"
                                % (e, nb_tr_steps, iter_nums, tr_loss / nb_tr_steps))

                if nb_tr_steps > 0 and iter_nums % 50000 == 0:
                    eval_examples = create_examples(data_path=args.pretrain_dev_path,
                                                    max_seq_length=args.max_seq_length,
                                                    masked_lm_prob=args.masked_lm_prob,
                                                    max_predictions_per_seq=args.max_predictions_per_seq,
                                                    vocab_list=vocab_list)
                    eval_features = convert_examples_to_features(
                        eval_examples, args.max_seq_length, tokenizer)
                    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
                    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
                    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
                    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
                    # Run prediction for full data
                    eval_sampler = SequentialSampler(eval_data)
                    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                    model.eval()
                    eval_loss = 0
                    nb_eval_steps = 0
                    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            loss = model(input_ids, segment_ids, input_mask, label_ids)

                        eval_loss += loss.item()
                        nb_eval_steps += 1

                    eval_loss = eval_loss / nb_eval_steps
                    # if eval_loss < best_loss:
                    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

                    output_model_file = os.path.join(args.output_dir, str(eval_loss)[:4] + '_' + str(iter_nums) + '_iter_nums')
                    torch.save(model_to_save.state_dict(), output_model_file)
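                    # Note: the file name above is loss/iteration based rather than the
                    # `pytorch_model.bin` name that from_pretrained() looks for, so a saved
                    # checkpoint has to be reloaded with load_state_dict() (as the else
                    # branch below does for its own checkpoint file).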
                    best_loss = eval_loss
                    logger.info("============================ -iter_nums %d -train_loss %.4f -eval_loss %.4f\n"
                                % (iter_nums, tr_loss / nb_tr_steps, eval_loss))
                    # Switch back to training mode so dropout is re-enabled for the
                    # remaining training steps (model.eval() above would otherwise persist).
                    model.train()
    else:
        model = BertForMaskedLM(config=BertConfig.from_json_file(args.bert_config_json))
        model.load_state_dict(torch.load('./outputs/60000_pytorch_model.bin'))

        if args.fp16:
            model.half()
        model.to(device)
        if args.local_rank != -1:
            try:
                from apex.parallel import DistributedDataParallel as DDP
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
            model = DDP(model)
        elif n_gpu > 1:
            model = torch.nn.DataParallel(model)

        eval_examples = create_examples(data_path=args.pretrain_dev_path,
                                        max_seq_length=args.max_seq_length,
                                        masked_lm_prob=args.masked_lm_prob,
                                        max_predictions_per_seq=args.max_predictions_per_seq,
                                        vocab_list=vocab_list)
        eval_features = convert_examples_to_features(
            eval_examples, args.max_seq_length, tokenizer)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                loss = model(input_ids, segment_ids, input_mask, label_ids)

            eval_loss += loss.item()
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps

        logger.info("============================ -eval_loss %.4f\n" % (eval_loss))


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------