├── .DS_Store ├── 07-named-entity-recognition ├── .DS_Store ├── __init__.py ├── __pycache__ │ └── data_preprocess.cpython-36.pyc ├── cnn-bilstm-crf.py ├── data_preprocess.py ├── evaluating.py └── predict.py └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZephyrChenzf/NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch/6c8a1d1c145be5f2b37a8a603bdc06f4fbc8b3c1/.DS_Store -------------------------------------------------------------------------------- /07-named-entity-recognition/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZephyrChenzf/NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch/6c8a1d1c145be5f2b37a8a603bdc06f4fbc8b3c1/07-named-entity-recognition/.DS_Store -------------------------------------------------------------------------------- /07-named-entity-recognition/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZephyrChenzf/NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch/6c8a1d1c145be5f2b37a8a603bdc06f4fbc8b3c1/07-named-entity-recognition/__init__.py -------------------------------------------------------------------------------- /07-named-entity-recognition/__pycache__/data_preprocess.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZephyrChenzf/NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch/6c8a1d1c145be5f2b37a8a603bdc06f4fbc8b3c1/07-named-entity-recognition/__pycache__/data_preprocess.cpython-36.pyc -------------------------------------------------------------------------------- /07-named-entity-recognition/cnn-bilstm-crf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import Variable 4 | from torch.utils.data import DataLoader 5 | from torch import optim,nn 6 | import data_preprocess 7 | import os 8 | torch.manual_seed(1) 9 | 10 | os.environ['CUDA_VISIBLE_DEVICES']='0' 11 | use_cuda=torch.cuda.is_available() 12 | 13 | word2index,index2word,tag2index,index2tag=data_preprocess.get_dic() 14 | train_x,val_x,train_y,val_y,train_mask,val_mask=data_preprocess.get_data() 15 | trainDataSet=data_preprocess.TextDataSet(train_x,train_y,train_mask) 16 | valDataSet=data_preprocess.TextDataSet(val_x,val_y,val_mask) 17 | trainDataLoader=DataLoader(trainDataSet,batch_size=16,shuffle=True) 18 | valDataLoader=DataLoader(valDataSet,batch_size=16,shuffle=False) 19 | 20 | MAXLEN=100 21 | vcab_size=len(word2index) 22 | emb_dim=128 23 | hidden_dim=256 24 | num_epoches=20 25 | batch_size=16 26 | 27 | 28 | class BILSTM_CRF(nn.Module): 29 | def __init__(self,vcab_size,tag2index,emb_dim,hidden_dim,batch_size): 30 | super(BILSTM_CRF,self).__init__() 31 | self.vcab_size=vcab_size 32 | self.tag2index=tag2index 33 | self.num_tags=len(tag2index) 34 | self.emb_dim=emb_dim 35 | self.hidden_dim=hidden_dim 36 | self.batch_size=batch_size 37 | self.use_cuda=torch.cuda.is_available() 38 | self.embed=nn.Embedding(num_embeddings=vcab_size,embedding_dim=emb_dim)#b,100,128 39 | #->100,b,128 40 | self.bilstm=nn.LSTM(input_size=emb_dim,hidden_size=hidden_dim,num_layers=1,bidirectional=True,dropout=0.1)#100,b,256*2 41 | self.conv1 = nn.Sequential( 42 | #b,1,100,128 43 | nn.Conv2d(1, 128, (1,emb_dim),padding=0), # b,128,100,1 44 | nn.BatchNorm2d(128), 45 | nn.ReLU(True), 46 | ) 47 | self.conv2 = 
nn.Sequential( 48 | nn.Conv2d(1, 128, (3,emb_dim+2), padding=1), # b,128,100,1 49 | nn.BatchNorm2d(128), 50 | nn.ReLU(True), 51 | ) 52 | self.conv3 = nn.Sequential( 53 | nn.Conv2d(1, 128, (5,emb_dim+4), padding=2), # b,128,100,1 54 | nn.BatchNorm2d(128), 55 | nn.ReLU(True), 56 | ) 57 | #b,128*3,100,1->100,b,128*3 58 | self.linear1 = nn.Linear(hidden_dim * 2+128*3,hidden_dim) 59 | self.drop=nn.Dropout(0.2) 60 | self.classfy=nn.Linear(hidden_dim,self.num_tags)#100*b,10 61 | #->100,b,10 62 | # init transitions 63 | self.start_transitions = nn.Parameter(torch.Tensor(self.num_tags))# 64 | self.end_transitions = nn.Parameter(torch.Tensor(self.num_tags))# 65 | self.transitions = nn.Parameter(torch.Tensor(self.num_tags, self.num_tags))# 66 | nn.init.uniform(self.start_transitions, -0.1, 0.1) 67 | nn.init.uniform(self.end_transitions, -0.1, 0.1) 68 | nn.init.uniform(self.transitions, -0.1, 0.1) 69 | 70 | def init_hidden(self,batch_size):# 71 | h_h=Variable(torch.randn(2,batch_size,self.hidden_dim)) 72 | h_c=Variable(torch.randn(2,batch_size,self.hidden_dim)) 73 | if use_cuda: 74 | h_h=h_h.cuda() 75 | h_c=h_c.cuda() 76 | return (h_h,h_c) 77 | 78 | def get_bilstm_out(self,x):# 79 | batch_size = x.size(0) 80 | emb=self.embed(x) 81 | 82 | #cnn输出 83 | emb_cnn=emb.unsqueeze(1) 84 | cnn1=self.conv1(emb_cnn) 85 | cnn2=self.conv2(emb_cnn) 86 | cnn3=self.conv3(emb_cnn) 87 | cnn_cat=torch.cat((cnn1,cnn2,cnn3),1) 88 | cnn_out=cnn_cat.squeeze().permute(2,0,1)#100,b,128*3 89 | 90 | emb_rnn=emb.permute(1,0,2) 91 | init_hidden=self.init_hidden(batch_size) 92 | lstm_out,hidden=self.bilstm(emb_rnn,init_hidden) 93 | 94 | cat_out=torch.cat((cnn_out,lstm_out),2)#100,b,128*3+256*2 95 | s,b,h=cat_out.size() 96 | cat_out=cat_out.view(s*b,h) 97 | cat_out=self.linear1(cat_out) 98 | cat_out=self.drop(cat_out) 99 | cat_out=self.classfy(cat_out) 100 | cat_out=cat_out.view(s,b,-1) 101 | # out=out.permute(1,0,2) 102 | return cat_out 103 | 104 | def _log_sum_exp(self,tensor,dim): 105 | # Find the max value along `dim` 106 | offset, _ = tensor.max(dim)#b,m 107 | # Make offset broadcastable 108 | broadcast_offset = offset.unsqueeze(dim)#b,1,m 109 | # Perform log-sum-exp safely 110 | safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor - broadcast_offset), dim))#b,m 111 | # Add offset back 112 | return offset + safe_log_sum_exp 113 | 114 | def get_all_score(self,emissions,mask):# 115 | # emissions: (seq_length, batch_size, num_tags) 116 | # mask: (batch_size,seq_length) 117 | seq_length = emissions.size(0) 118 | mask = mask.permute(1,0).contiguous().float() 119 | 120 | log_prob = self.start_transitions.view(1, -1) + emissions[0] # b,m, 121 | 122 | for i in range(1, seq_length): 123 | broadcast_log_prob = log_prob.unsqueeze(2) # b,m,1 124 | broadcast_transitions = self.transitions.unsqueeze(0) #1,m,m 125 | broadcast_emissions = emissions[i].unsqueeze(1) # b,1,m 126 | 127 | score = broadcast_log_prob + broadcast_transitions \ 128 | + broadcast_emissions # b,m,m 129 | 130 | score = self._log_sum_exp(score, 1) # 131 | 132 | log_prob = score * mask[i].unsqueeze(1) + log_prob * (1. 
- mask[i]).unsqueeze( 133 | 1) 134 | 135 | # End transition score 136 | log_prob += self.end_transitions.view(1, -1) 137 | # Sum (log-sum-exp) over all possible tags 138 | return self._log_sum_exp(log_prob, 1) # (batch_size,) 139 | 140 | def get_real_score(self,emissions,mask,tags):# 141 | # emissions: (seq_length, batch_size, num_tags) 142 | # tags: (batch_size,seq_length) 143 | # mask: (batch_size,seq_length) 144 | seq_length = emissions.size(0)#s 145 | mask = mask.permute(1,0).contiguous().float() 146 | tags=tags.permute(1,0).contiguous() 147 | 148 | # Start transition score 149 | llh = self.start_transitions[tags[0]] # (batch_size,),T(start->firstTag) 150 | 151 | for i in range(seq_length - 1): 152 | cur_tag, next_tag = tags[i], tags[i+1] 153 | # Emission score for current tag 154 | llh += emissions[i].gather(1, cur_tag.view(-1, 1)).squeeze(1) * mask[i]#(b,1)->b->b*mask 155 | # Transition score to next tag 156 | transition_score = self.transitions[cur_tag.data, next_tag.data]# 157 | # Only add transition score if the next tag is not masked (mask == 1) 158 | llh += transition_score * mask[i+1]# 159 | 160 | # Find last tag index 161 | last_tag_indices = mask.long().sum(0) - 1 # (batch_size,) 162 | last_tags = tags.gather(0, last_tag_indices.view(1, -1)).squeeze(0)#b, 163 | 164 | # End transition score 165 | llh += self.end_transitions[last_tags]# 166 | # Emission score for the last tag, if mask is valid (mask == 1) 167 | llh += emissions[-1].gather(1, last_tags.view(-1, 1)).squeeze(1) * mask[-1]# 168 | 169 | return llh#b 170 | 171 | def neg_log_likelihood(self,feats,tags,mask): 172 | #feats: bilstm#100,b,10 173 | batch_size=feats.size(1) 174 | all_score=self.get_all_score(feats,mask)# 175 | real_score=self.get_real_score(feats,mask,tags)#b 176 | loss=(all_score.view(batch_size,1)-real_score.view(batch_size,1)).sum()/batch_size 177 | return loss # 178 | 179 | def viterbi_decode(self, emissions,mask): 180 | # emissions: (seq_length, batch_size, num_tags) 181 | # mask: (batch_size,seq_length) 182 | seq_length=emissions.size(0) 183 | batch_size=emissions.size(1) 184 | num_tags=emissions.size(2) 185 | length_mask = torch.sum(mask, dim=1).view(batch_size, 1).long() # b,1 186 | mask=mask.permute(1,0).contiguous().float()#s,b 187 | 188 | viterbi_history=[] 189 | viterbi_score = self.start_transitions.view(1, -1) + emissions[0] # b,m, 190 | 191 | for i in range(1, seq_length): 192 | broadcast_viterbi_score = viterbi_score.unsqueeze(2) # b,m,1 193 | broadcast_transitions = self.transitions.unsqueeze(0) #1,m,m 194 | broadcast_emissions = emissions[i].unsqueeze(1) # b,1,m 195 | 196 | score = broadcast_viterbi_score + broadcast_transitions \ 197 | + broadcast_emissions # b,m,m 198 | 199 | best_score,best_path = torch.max(score, 1) # b,m 200 | viterbi_history.append(best_path*mask[i].long().unsqueeze(1))# 201 | viterbi_score = best_score * mask[i].unsqueeze(1) + viterbi_score * (1. 
- mask[i]).unsqueeze( 202 | 1) # 203 | viterbi_score+=self.end_transitions.view(1,-1)#b,m 204 | best_score,last_path=torch.max(viterbi_score,1)#b 205 | last_path=last_path.view(-1,1)#b,1 206 | last_position = (length_mask.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, num_tags) - 1).contiguous() # b,1->b,1,m 207 | pad_zero = Variable(torch.zeros(batch_size, num_tags)).long() 208 | if use_cuda: 209 | pad_zero = pad_zero.cuda() 210 | viterbi_history.append(pad_zero)#(s-1,b,m)->(s,b,m) 211 | viterbi_history = torch.cat(viterbi_history).view(-1, batch_size, num_tags) # s,b,m 212 | insert_last = last_path.view(batch_size, 1, 1).expand(batch_size, 1, num_tags) # 213 | viterbi_history = viterbi_history.transpose(1, 0).contiguous() # b,s,m 214 | viterbi_history.scatter_(1, last_position, insert_last) # 215 | viterbi_history = viterbi_history.transpose(1, 0).contiguous() # s,b,m 216 | decode_idx = Variable(torch.LongTensor(seq_length, batch_size))#s,b 217 | if use_cuda: 218 | decode_idx = decode_idx.cuda() 219 | # decode_idx[-1] = 0 220 | for idx in range(len(viterbi_history)-2,-1,-1): 221 | last_path=torch.gather(viterbi_history[idx],1,last_path) 222 | decode_idx[idx]=last_path.data 223 | decode_idx=decode_idx.transpose(1,0)#b,s 224 | return decode_idx 225 | 226 | def forward(self, feats,mask): 227 | #feats #bilstm#100.b.10 228 | best_path=self.viterbi_decode(feats,mask)#b,s 229 | return best_path 230 | 231 | 232 | if use_cuda: 233 | model=BILSTM_CRF(vcab_size,tag2index,emb_dim,hidden_dim,batch_size).cuda() 234 | else: 235 | model=BILSTM_CRF(vcab_size,tag2index,emb_dim,hidden_dim,batch_size) 236 | 237 | optimzier=optim.Adam(model.parameters(),lr=1e-3) 238 | 239 | best_acc=0 240 | best_model=None 241 | for epoch in range(num_epoches): 242 | train_loss=0 243 | train_acc=0 244 | batch_len_all=0 245 | # model.train() 246 | for i,data in enumerate(trainDataLoader): 247 | x,y,mask=data 248 | batch_len = len(x) 249 | batch_len_all += batch_len 250 | if use_cuda: 251 | x=Variable(x).cuda() 252 | y=Variable(y).cuda() 253 | mask=Variable(mask).cuda() 254 | else: 255 | x=Variable(x) 256 | y=Variable(y) 257 | mask=Variable(mask) 258 | feats=model.get_bilstm_out(x) 259 | loss=model.neg_log_likelihood(feats,y,mask) 260 | train_loss+=loss.data[0] 261 | prepath=model(feats,mask)#b,s 262 | pre_y=prepath.masked_select(mask) 263 | true_y=y.masked_select(mask) 264 | acc_num=(pre_y==true_y).data.sum() 265 | # acc_num=(pre_y==true_y).sum() 266 | acc_pro=float(acc_num)/len(pre_y) 267 | train_acc+=acc_pro 268 | #backward 269 | optimzier.zero_grad() 270 | loss.backward() 271 | optimzier.step() 272 | if (i + 1) % 100 == 0: 273 | print('[{}/{}],train loss is:{:.6f},train acc is:{:.6f}'.format(i+1, len(trainDataLoader), 274 | train_loss / (i+1), 275 | train_acc / (i+1))) 276 | print( 277 | 'epoch:[{}],train loss is:{:.6f},train acc is:{:.6f}'.format(epoch, 278 | train_loss / (len(trainDataLoader)), 279 | train_acc / (len(trainDataLoader)))) 280 | # model.eval() 281 | eval_loss = 0 282 | eval_acc = 0 283 | batch_len_all = 0 284 | for i, data in enumerate(valDataLoader): 285 | x, y,mask = data 286 | batch_len = len(x) 287 | batch_len_all += batch_len 288 | if use_cuda: 289 | x = Variable(x, volatile=True).cuda() 290 | y = Variable(y, volatile=True).cuda() 291 | mask=Variable(mask,volatile=True).cuda() 292 | else: 293 | x = Variable(x, volatile=True) 294 | y = Variable(y, volatile=True) 295 | mask = Variable(mask, volatile=True) 296 | feats=model.get_bilstm_out(x) 297 | loss=model.neg_log_likelihood(feats,y,mask) 298 
| eval_loss += loss.data[0] 299 | prepath = model(feats, mask) # b,s 300 | pre_y = prepath.masked_select(mask) 301 | true_y = y.masked_select(mask) 302 | acc_num = (pre_y == true_y).data.sum() 303 | acc_pro = float(acc_num) / len(pre_y) 304 | eval_acc += acc_pro 305 | print('val loss is:{:.6f},val acc is:{:.6f}'.format( 306 | eval_loss / (len(valDataLoader) ), 307 | eval_acc / (len(valDataLoader)))) 308 | if best_acc < (eval_acc / (len(valDataLoader))): 309 | best_acc = eval_acc / (len(valDataLoader)) 310 | best_model = model.state_dict() 311 | print('best acc is {:.6f},best model is changed'.format(best_acc)) 312 | 313 | torch.save(best_model,'./model/best_model.pth') 314 | torch.save(model.state_dict(),'./model/last_model.pth') 315 | -------------------------------------------------------------------------------- /07-named-entity-recognition/data_preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.utils.data import Dataset 4 | 5 | 6 | word2index={'pad':0,'unknow':1} 7 | index2word={0:'pad',1:'unknow'} 8 | tag2index={'pad':0} 9 | index2tag={0:'pad'} 10 | MAX_LEN=100 11 | 12 | data=open('./data/train_data','r') 13 | train_x=[]#总x训练集 14 | train_y=[]#总y训练集 15 | sen_x=[]#每次存一句话的id组 16 | sen_y=[]#每次存一句话的标签id组 17 | 18 | #将数据按每句话分出来 19 | for line in data: 20 | line=line.strip() 21 | if(line=="" or line=="\n" or line=="\r\n"):#一句话结束了 22 | train_x.append(sen_x) 23 | sen_x=[] 24 | train_y.append(sen_y) 25 | sen_y=[] 26 | continue 27 | line=line.split(' ') 28 | if(len(line)<2): 29 | continue 30 | if line[0] in word2index:#如果在词典中有该词,将id给sen_x 31 | sen_x.append(word2index[line[0]]) 32 | else:#如果没有则加入字典,并将id给sen_x 33 | word2index[line[0]]=len(word2index) 34 | index2word[len(index2word)]=line[0] 35 | sen_x.append(word2index[line[0]]) 36 | if line[1] in tag2index:#同理,注意不同标签对应的id与初始碰到的标签有关 37 | sen_y.append((tag2index[line[1]])) 38 | else: 39 | tag2index[line[1]]=len(tag2index) 40 | index2tag[len(index2tag)]=line[1] 41 | sen_y.append(tag2index[line[1]]) 42 | 43 | #开始对每句话进行裁剪,主要是最大长度的限制 44 | train_x_cut=[] 45 | train_y_cut=[] 46 | train_mask=[] 47 | for i in range(len(train_x)): 48 | if len(train_x[i])<=MAX_LEN:#如果句子长度小于max_sen_len 49 | train_x_cut.append(train_x[i]) 50 | train_y_cut.append(train_y[i]) 51 | train_mask.append([1]*len(train_x[i])) 52 | continue 53 | while len(train_x[i])>MAX_LEN:#超过100,使用标点符号拆分句子,将前面部分加入训练集,若后面部分仍超过100,继续拆分 54 | flag=False 55 | for j in reversed(range(MAX_LEN)):#反向访问,99、98、97... 56 | if train_x[i][j]==word2index[','] or train_x[i][j]==word2index['、']: 57 | train_x_cut.append(train_x[i][:j+1]) 58 | train_y_cut.append(train_y[i][:j+1]) 59 | train_mask.append([1]*(j+1)) 60 | train_x[i]=train_x[i][j+1:] 61 | train_y[i]=train_y[i][j+1:] 62 | break 63 | if j==0: 64 | flag=True 65 | if flag: 66 | train_x_cut.append(train_x[i][:MAX_LEN]) 67 | train_y_cut.append(train_y[i][:MAX_LEN]) 68 | train_mask.append([1]*MAX_LEN) 69 | train_x[i]=train_x[i][MAX_LEN:] 70 | train_y[i]=train_y[i][MAX_LEN:] 71 | if len(train_x[i])<=MAX_LEN:#如果句子长度小于max_sen_len,最后没有超过100的直接加入 72 | train_x_cut.append(train_x[i]) 73 | train_y_cut.append(train_y[i]) 74 | train_mask.append([1]*len(train_x[i])) 75 | 76 | #给每段分割填充0 77 | for i in range(len(train_x_cut)): 78 | if len(train_x_cut[i]) MAX_LEN: # 超过100,使用标点符号拆分句子,将前面部分加入训练集,若后面部分仍超过100,继续拆分 163 | flag = False 164 | for j in reversed(range(MAX_LEN)): # 反向访问,99、98、97... 
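# scan backwards from position MAX_LEN-1 for the nearest ',' or '、', cut the chunk right after that punctuation, and repeat until every piece fits within MAX_LEN; if no such punctuation is found, cut hard at MAX_LEN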
165 | if test_x[i][j] == word2index[','] or test_x[i][j] == word2index['、']: 166 | test_x_cut.append(test_x[i][:j + 1]) 167 | test_mask.append([1]*(j+1)) 168 | test_x_len.append(j+1) 169 | test_x_cut_word.append(test_word[i][:j+1]) 170 | test_x[i] = test_x[i][j + 1:] 171 | test_x_cut_word[i]=test_word[i][j+1:] 172 | test_x_fenge.append(count) 173 | count+=1 174 | break 175 | if j == 0: 176 | flag = True 177 | if flag: 178 | test_x_cut.append(test_x[i][:MAX_LEN]) 179 | test_mask.append([1]*MAX_LEN) 180 | test_x_len.append(MAX_LEN) 181 | test_x_cut_word.append(test_word[i][:MAX_LEN]) 182 | test_x[i] = test_x[i][MAX_LEN:] 183 | test_x_cut_word[i]=test_word[i][MAX_LEN:] 184 | test_x_fenge.append(count) 185 | count+=1 186 | if len(test_x[i]) <= MAX_LEN: # 如果句子长度小于max_sen_len,最后没有超过100的直接加入 187 | test_x_cut.append(test_x[i]) 188 | test_mask.append([1]*len(test_x[i])) 189 | test_x_len.append(len(test_x[i])) 190 | test_x_cut_word.append(test_word[i]) 191 | count += 1 192 | 193 | # 给每段分割填充0 194 | for i in range(len(test_x_cut)): 195 | if len(test_x_cut[i]) < MAX_LEN: 196 | tlen = len(test_x_cut[i]) 197 | for j in range(MAX_LEN - tlen): 198 | test_x_cut[i].append(0) 199 | for i in range(len(test_mask)): 200 | if len(test_mask[i]) < MAX_LEN: 201 | tlen = len(test_mask[i]) 202 | for j in range(MAX_LEN - tlen): 203 | test_mask[i].append(0) 204 | #转化LongTensor 205 | test_x_cut=torch.LongTensor(test_x_cut) 206 | test_mask=torch.ByteTensor(test_mask) 207 | return test_x_cut,test_mask,test_x_len,test_x_cut_word,test_x_fenge 208 | 209 | #对测试集处理的函数 210 | def getTest_xy(filepath): 211 | data=open(filepath,'r') 212 | test_x = [] # 总x测试集 213 | test_y=[]#总y测试集 214 | test_word=[] #所有句话的词 215 | sen_x = [] # 每次存一句话的id组 216 | sen_y=[] #每次存一句话的标签id组 217 | sen_word=[]# 一句话的词 218 | 219 | # 将数据按每句话分出来 220 | for line in data: 221 | line = line.strip() 222 | if (line == "" or line == "\n" or line == "\r\n"): # 一句话结束了 223 | test_x.append(sen_x) 224 | sen_x = [] 225 | test_y.append(sen_y) 226 | sen_y=[] 227 | test_word.append(sen_word) 228 | sen_word=[] 229 | continue 230 | line = line.split(' ') 231 | sen_word.append(line[0]) 232 | if line[0] in word2index: # 如果在词典中有该词,将id给sen_x 233 | sen_x.append(word2index[line[0]]) 234 | sen_y.append(tag2index[line[1]]) 235 | else: # 如果没有则设为未识别 236 | sen_x.append(1) 237 | sen_y.append(tag2index[line[1]]) 238 | 239 | # 开始对每句话进行裁剪,主要是最大长度的限制 240 | test_x_cut = []#每个分割的词id 241 | test_y_cut=[]#每个分割的标签id 242 | test_mask=[] 243 | test_x_len=[]#每句话本身的长度(不填充的长度) 244 | test_x_cut_word=[]#所有分割出的词 245 | count=0#用于样本计数 246 | test_x_fenge=[]#用于记分割了的样本序号 247 | for i in range(len(test_x)): 248 | if len(test_x[i]) <= MAX_LEN: # 如果句子长度小于max_sen_len 249 | test_x_cut.append(test_x[i]) 250 | test_y_cut.append(test_y[i]) 251 | test_mask.append([1]*len(test_x[i])) 252 | test_x_len.append(len(test_x[i])) 253 | test_x_cut_word.append(test_word[i]) 254 | count+=1 255 | continue 256 | while len(test_x[i]) > MAX_LEN: # 超过100,使用标点符号拆分句子,将前面部分加入训练集,若后面部分仍超过100,继续拆分 257 | flag = False 258 | for j in reversed(range(MAX_LEN)): # 反向访问,99、98、97... 
259 | if test_x[i][j] == word2index[','] or test_x[i][j] == word2index['、']: 260 | test_x_cut.append(test_x[i][:j + 1]) 261 | test_y_cut.append(test_y[i][:j+1]) 262 | test_mask.append([1]*(j+1)) 263 | test_x_len.append(j+1) 264 | test_x_cut_word.append(test_word[i][:j+1]) 265 | test_x[i] = test_x[i][j + 1:] 266 | test_y[i]=test_y[i][j+1:] 267 | test_x_cut_word[i]=test_word[i][j+1:] 268 | test_x_fenge.append(count) 269 | count+=1 270 | break 271 | if j == 0: 272 | flag = True 273 | if flag: 274 | test_x_cut.append(test_x[i][:MAX_LEN]) 275 | test_y_cut.append(test_y[i][:MAX_LEN]) 276 | test_mask.append([1]*MAX_LEN) 277 | test_x_len.append(MAX_LEN) 278 | test_x_cut_word.append(test_word[i][:MAX_LEN]) 279 | test_x[i] = test_x[i][MAX_LEN:] 280 | test_y[i]=test_y[i][MAX_LEN:] 281 | test_x_cut_word[i]=test_word[i][MAX_LEN:] 282 | test_x_fenge.append(count) 283 | count+=1 284 | if len(test_x[i]) <= MAX_LEN: # 如果句子长度小于max_sen_len,最后没有超过100的直接加入 285 | test_x_cut.append(test_x[i]) 286 | test_y_cut.append(test_y[i]) 287 | test_mask.append([1]*len(test_x[i])) 288 | test_x_len.append(len(test_x[i])) 289 | test_x_cut_word.append(test_word[i]) 290 | count += 1 291 | 292 | # 给每段分割填充0 293 | # 给每段分割填充0 294 | for i in range(len(test_x_cut)): 295 | if len(test_x_cut[i]) < MAX_LEN: 296 | tlen = len(test_x_cut[i]) 297 | for j in range(MAX_LEN - tlen): 298 | test_x_cut[i].append(0) 299 | 300 | for i in range(len(test_y_cut)): 301 | if len(test_y_cut[i]) < MAX_LEN: 302 | tlen = len(test_y_cut[i]) 303 | for j in range(MAX_LEN - tlen): 304 | test_y_cut[i].append(0) 305 | 306 | for i in range(len(test_mask)): 307 | if len(test_mask[i]) < MAX_LEN: 308 | tlen = len(test_mask[i]) 309 | for j in range(MAX_LEN - tlen): 310 | test_mask[i].append(0) 311 | #转化LongTensor 312 | test_x_cut=torch.LongTensor(test_x_cut) 313 | test_y_cut=torch.LongTensor(test_y_cut) 314 | test_mask=torch.ByteTensor(test_mask) 315 | return test_x_cut,test_y_cut,test_mask,test_x_len,test_x_cut_word,test_x_fenge 316 | 317 | def write_result_to_file(filepath,y_pred,test_x_len,test_x_cut_word,test_x_fenge): 318 | f=open(filepath,'w') 319 | for i1 in range(y_pred.shape[0]):#样本数 320 | for i2 in range(test_x_len[i1]):#每个样本的真实长度 321 | tag_id=y_pred[i1][i2] 322 | word=test_x_cut_word[i1][i2] 323 | if tag_id in index2tag: 324 | tag=index2tag[tag_id] 325 | else: 326 | tag='o' 327 | f.write(word+' '+tag+'\n') 328 | if i1 not in test_x_fenge: 329 | f.write('\n') 330 | f.close() -------------------------------------------------------------------------------- /07-named-entity-recognition/evaluating.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import data_preprocess 3 | 4 | 5 | word2index,index2word,tag2index,index2tag=data_preprocess.get_dic() 6 | 7 | def evaluate(sourcePath,resultPath): 8 | f_s=open(sourcePath,'r') 9 | f_r=open(resultPath,'r') 10 | source_data=[] 11 | result_data=[] 12 | table_eval=np.zeros((len(tag2index),len(tag2index))) #横轴表示真实值,纵轴表示预测值 13 | for line in f_s: 14 | source_data.append(line) 15 | for line in f_r: 16 | result_data.append(line) 17 | length=len(source_data) 18 | for i in range(length): 19 | if source_data[i]=='\n': 20 | continue 21 | tag_t=source_data[i].split()[1] 22 | tag_p=result_data[i].split()[1] 23 | tag_t_inx=tag2index[tag_t] 24 | tag_p_inx=tag2index[tag_p] 25 | table_eval[tag_p_inx][tag_t_inx]+=1 26 | #print(table_eval) 27 | # 评测 28 | all_p_numerator=0 29 | all_p_denominator=0 30 | all_r_denominator = 0 31 | #具体评测内容自行添加 32 | # for i in 
range(2,len(tag2index)-1,2): 33 | # print('############'+index2tag[i]+'##############') 34 | # precision=(table_eval[i,i]+table_eval[i+1,i+1])/(table_eval[i,:].sum()+table_eval[i+1,:].sum())#precision 35 | # recall=(table_eval[i,i]+table_eval[i+1,i+1])/(table_eval[:,i].sum()+table_eval[:,i+1].sum())#recall 36 | # f1=2*precision*recall/(precision+recall)#f1 37 | # print("num: "+str(table_eval[i,i])+' '+str(table_eval[i+1,i+1])) 38 | # print(precision) 39 | # print(recall) 40 | # print(f1) 41 | # print("#########################") 42 | # all_p_numerator+=table_eval[i,i]+table_eval[i+1,i+1] 43 | # all_p_denominator+=table_eval[i,:].sum()+table_eval[i+1,:].sum() 44 | # all_r_denominator+=table_eval[:,i].sum()+table_eval[:,i+1].sum() 45 | # print("##########all################") 46 | all_p=all_p_numerator/all_p_denominator 47 | all_r=all_p_numerator/all_r_denominator 48 | all_f1=2*all_p*all_r/(all_p+all_r) 49 | print(all_p) 50 | print(all_r) 51 | print(all_f1) 52 | 53 | evaluate('./data/test_data','./data/result_data') 54 | print(tag2index) 55 | -------------------------------------------------------------------------------- /07-named-entity-recognition/predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import Variable 4 | from torch.utils.data import DataLoader 5 | from torch import optim, nn 6 | import data_preprocess 7 | import os 8 | 9 | torch.manual_seed(1) 10 | 11 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 12 | use_cuda = torch.cuda.is_available() 13 | 14 | word2index, index2word, tag2index, index2tag = data_preprocess.get_dic() 15 | test_x_cut, test_y_cut, test_mask, test_x_len, test_x_cut_word, test_x_fenge = data_preprocess.getTest_xy( 16 | './data/test_data') 17 | testDataSet = data_preprocess.TextDataSet(test_x_cut, test_y_cut, test_mask) 18 | 19 | testDataLoader = DataLoader(testDataSet, batch_size=16, shuffle=False) 20 | 21 | MAXLEN = 100 22 | vcab_size = len(word2index) 23 | emb_dim = 128 24 | hidden_dim = 256 25 | num_epoches = 20 26 | batch_size = 16 27 | 28 | 29 | class BILSTM_CRF(nn.Module): 30 | def __init__(self,vcab_size,tag2index,emb_dim,hidden_dim,batch_size): 31 | super(BILSTM_CRF,self).__init__() 32 | self.vcab_size=vcab_size 33 | self.tag2index=tag2index 34 | self.num_tags=len(tag2index) 35 | self.emb_dim=emb_dim 36 | self.hidden_dim=hidden_dim 37 | self.batch_size=batch_size 38 | self.use_cuda=torch.cuda.is_available() 39 | self.embed=nn.Embedding(num_embeddings=vcab_size,embedding_dim=emb_dim)#b,100,128 40 | #->100,b,128 41 | self.bilstm=nn.LSTM(input_size=emb_dim,hidden_size=hidden_dim,num_layers=1,bidirectional=True,dropout=0.1)#100,b,256*2 42 | self.conv1 = nn.Sequential( 43 | #b,1,100,128 44 | nn.Conv2d(1, 128, (1,emb_dim),padding=0), # b,128,100,1 45 | nn.BatchNorm2d(128), 46 | nn.ReLU(True), 47 | ) 48 | self.conv2 = nn.Sequential( 49 | nn.Conv2d(1, 128, (3,emb_dim+2), padding=1), # b,128,100,1 50 | nn.BatchNorm2d(128), 51 | nn.ReLU(True), 52 | ) 53 | self.conv3 = nn.Sequential( 54 | nn.Conv2d(1, 128, (5,emb_dim+4), padding=2), # b,128,100,1 55 | nn.BatchNorm2d(128), 56 | nn.ReLU(True), 57 | ) 58 | #b,128*3,100,1->100,b,128*3 59 | self.linear1 = nn.Linear(hidden_dim * 2+128*3,hidden_dim) 60 | self.drop=nn.Dropout(0.2) 61 | self.classfy=nn.Linear(hidden_dim,self.num_tags)#100*b,10 62 | #->100,b,10 63 | # init transitions 64 | self.start_transitions = nn.Parameter(torch.Tensor(self.num_tags))# 65 | self.end_transitions = 
nn.Parameter(torch.Tensor(self.num_tags))# 66 | self.transitions = nn.Parameter(torch.Tensor(self.num_tags, self.num_tags))# 67 | nn.init.uniform(self.start_transitions, -0.1, 0.1) 68 | nn.init.uniform(self.end_transitions, -0.1, 0.1) 69 | nn.init.uniform(self.transitions, -0.1, 0.1) 70 | 71 | def init_hidden(self,batch_size):# 72 | h_h=Variable(torch.randn(2,batch_size,self.hidden_dim)) 73 | h_c=Variable(torch.randn(2,batch_size,self.hidden_dim)) 74 | if use_cuda: 75 | h_h=h_h.cuda() 76 | h_c=h_c.cuda() 77 | return (h_h,h_c) 78 | 79 | def get_bilstm_out(self,x):# 80 | batch_size = x.size(0) 81 | emb=self.embed(x) 82 | 83 | #cnn输出 84 | emb_cnn=emb.unsqueeze(1) 85 | cnn1=self.conv1(emb_cnn) 86 | cnn2=self.conv2(emb_cnn) 87 | cnn3=self.conv3(emb_cnn) 88 | cnn_cat=torch.cat((cnn1,cnn2,cnn3),1) 89 | cnn_out=cnn_cat.squeeze().permute(2,0,1)#100,b,128*3 90 | 91 | emb_rnn=emb.permute(1,0,2) 92 | init_hidden=self.init_hidden(batch_size) 93 | lstm_out,hidden=self.bilstm(emb_rnn,init_hidden) 94 | 95 | cat_out=torch.cat((cnn_out,lstm_out),2)#100,b,128*3+256*2 96 | s,b,h=cat_out.size() 97 | cat_out=cat_out.view(s*b,h) 98 | cat_out=self.linear1(cat_out) 99 | cat_out=self.drop(cat_out) 100 | cat_out=self.classfy(cat_out) 101 | cat_out=cat_out.view(s,b,-1) 102 | # out=out.permute(1,0,2) 103 | return cat_out 104 | 105 | def _log_sum_exp(self,tensor,dim): 106 | # Find the max value along `dim` 107 | offset, _ = tensor.max(dim)#b,m 108 | # Make offset broadcastable 109 | broadcast_offset = offset.unsqueeze(dim)#b,1,m 110 | # Perform log-sum-exp safely 111 | safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor - broadcast_offset), dim))#b,m 112 | # Add offset back 113 | return offset + safe_log_sum_exp 114 | 115 | def get_all_score(self,emissions,mask):# 116 | # emissions: (seq_length, batch_size, num_tags) 117 | # mask: (batch_size,seq_length) 118 | seq_length = emissions.size(0) 119 | mask = mask.permute(1,0).contiguous().float() 120 | 121 | log_prob = self.start_transitions.view(1, -1) + emissions[0] # b,m, 122 | 123 | for i in range(1, seq_length): 124 | broadcast_log_prob = log_prob.unsqueeze(2) # b,m,1 125 | broadcast_transitions = self.transitions.unsqueeze(0) #1,m,m 126 | broadcast_emissions = emissions[i].unsqueeze(1) # b,1,m 127 | 128 | score = broadcast_log_prob + broadcast_transitions \ 129 | + broadcast_emissions # b,m,m 130 | 131 | score = self._log_sum_exp(score, 1) # 132 | 133 | log_prob = score * mask[i].unsqueeze(1) + log_prob * (1. 
- mask[i]).unsqueeze( 134 | 1) # 135 | 136 | # End transition score 137 | log_prob += self.end_transitions.view(1, -1) 138 | # Sum (log-sum-exp) over all possible tags 139 | return self._log_sum_exp(log_prob, 1) # (batch_size,) 140 | 141 | def get_real_score(self,emissions,mask,tags):# 142 | # emissions: (seq_length, batch_size, num_tags) 143 | # tags: (batch_size,seq_length) 144 | # mask: (batch_size,seq_length) 145 | seq_length = emissions.size(0)#s 146 | mask = mask.permute(1,0).contiguous().float() 147 | tags=tags.permute(1,0).contiguous() 148 | 149 | # Start transition score 150 | llh = self.start_transitions[tags[0]] # (batch_size,),T(start->firstTag) 151 | 152 | for i in range(seq_length - 1): 153 | cur_tag, next_tag = tags[i], tags[i+1] 154 | # Emission score for current tag 155 | llh += emissions[i].gather(1, cur_tag.view(-1, 1)).squeeze(1) * mask[i]#(b,1)->b->b*mask, 156 | # Transition score to next tag 157 | transition_score = self.transitions[cur_tag.data, next_tag.data]# 158 | # Only add transition score if the next tag is not masked (mask == 1) 159 | llh += transition_score * mask[i+1]# 160 | 161 | # Find last tag index 162 | last_tag_indices = mask.long().sum(0) - 1 # (batch_size,) 163 | last_tags = tags.gather(0, last_tag_indices.view(1, -1)).squeeze(0)#b 164 | 165 | # End transition score 166 | llh += self.end_transitions[last_tags]# 167 | # Emission score for the last tag, if mask is valid (mask == 1) 168 | llh += emissions[-1].gather(1, last_tags.view(-1, 1)).squeeze(1) * mask[-1]# 169 | 170 | return llh#b 171 | 172 | def neg_log_likelihood(self,feats,tags,mask): 173 | #feats: 174 | batch_size=feats.size(1) 175 | all_score=self.get_all_score(feats,mask)# 176 | real_score=self.get_real_score(feats,mask,tags)# 177 | loss=(all_score.view(batch_size,1)-real_score.view(batch_size,1)).sum()/batch_size 178 | return loss # 179 | 180 | def viterbi_decode(self, emissions,mask): 181 | # emissions: (seq_length, batch_size, num_tags) 182 | # mask: (batch_size,seq_length) 183 | seq_length=emissions.size(0) 184 | batch_size=emissions.size(1) 185 | num_tags=emissions.size(2) 186 | length_mask = torch.sum(mask, dim=1).view(batch_size, 1).long() # 187 | mask=mask.permute(1,0).contiguous().float()#s,b 188 | 189 | viterbi_history=[] 190 | viterbi_score = self.start_transitions.view(1, -1) + emissions[0] # 191 | 192 | for i in range(1, seq_length): 193 | broadcast_viterbi_score = viterbi_score.unsqueeze(2) # b,m,1 194 | broadcast_transitions = self.transitions.unsqueeze(0) #1,m,m 195 | broadcast_emissions = emissions[i].unsqueeze(1) # b,1,m 196 | 197 | score = broadcast_viterbi_score + broadcast_transitions \ 198 | + broadcast_emissions # b,m,m 199 | 200 | best_score,best_path = torch.max(score, 1) # 201 | viterbi_history.append(best_path*mask[i].long().unsqueeze(1))# 202 | viterbi_score = best_score * mask[i].unsqueeze(1) + viterbi_score * (1. 
- mask[i]).unsqueeze( 203 | 1) # 204 | viterbi_score+=self.end_transitions.view(1,-1)#b,m 205 | best_score,last_path=torch.max(viterbi_score,1)#b 206 | last_path=last_path.view(-1,1)#b,1 207 | last_position = (length_mask.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, num_tags) - 1).contiguous() # 208 | pad_zero = Variable(torch.zeros(batch_size, num_tags)).long() 209 | if use_cuda: 210 | pad_zero = pad_zero.cuda() 211 | viterbi_history.append(pad_zero)#(s-1,b,m)->(s,b,m) 212 | viterbi_history = torch.cat(viterbi_history).view(-1, batch_size, num_tags) # s,b,m 213 | insert_last = last_path.view(batch_size, 1, 1).expand(batch_size, 1, num_tags) # 214 | viterbi_history = viterbi_history.transpose(1, 0).contiguous() # b,s,m 215 | viterbi_history.scatter_(1, last_position, insert_last) # 216 | viterbi_history = viterbi_history.transpose(1, 0).contiguous() # s,b,m 217 | decode_idx = Variable(torch.LongTensor(seq_length, batch_size))# 218 | if use_cuda: 219 | decode_idx = decode_idx.cuda() 220 | # decode_idx[-1] = 0 221 | for idx in range(len(viterbi_history)-2,-1,-1): 222 | last_path=torch.gather(viterbi_history[idx],1,last_path) 223 | decode_idx[idx]=last_path.data 224 | decode_idx=decode_idx.transpose(1,0)#b,s 225 | return decode_idx 226 | 227 | def forward(self, feats,mask): 228 | #feats #bilstm的输出#100.b.10 229 | best_path=self.viterbi_decode(feats,mask)#最佳路径b,s 230 | return best_path 231 | 232 | if use_cuda: 233 | model = BILSTM_CRF(vcab_size, tag2index, emb_dim, hidden_dim, batch_size).cuda() 234 | else: 235 | model = BILSTM_CRF(vcab_size, tag2index, emb_dim, hidden_dim, batch_size) 236 | 237 | model.load_state_dict(torch.load('./model/best_model.pth')) 238 | 239 | # model.eval() 240 | test_loss = 0 241 | test_acc = 0 242 | batch_len_all = 0 243 | prepath_all=[]# 244 | for i, data in enumerate(testDataLoader): 245 | x, y, mask = data 246 | batch_len = len(x) 247 | batch_len_all += batch_len 248 | if use_cuda: 249 | x = Variable(x, volatile=True).cuda() 250 | y = Variable(y, volatile=True).cuda() 251 | mask = Variable(mask, volatile=True).cuda() 252 | else: 253 | x = Variable(x, volatile=True) 254 | y = Variable(y, volatile=True) 255 | mask = Variable(mask, volatile=True) 256 | feats = model.get_bilstm_out(x) 257 | loss = model.neg_log_likelihood(feats, y, mask) 258 | test_loss += loss.data[0] 259 | prepath = model(feats, mask) # b,s 260 | prepath_all.append(prepath) 261 | pre_y = prepath.masked_select(mask) 262 | true_y = y.masked_select(mask) 263 | acc_num = (pre_y == true_y).data.sum() 264 | acc_pro = float(acc_num) / len(pre_y) 265 | test_acc += acc_pro 266 | print('test loss is:{:.6f},test acc is:{:.6f}'.format(test_loss / (len(testDataLoader)),test_acc / (len(testDataLoader)))) 267 | 268 | #写入结果文件 269 | prepath_all=torch.cat(prepath_all).data 270 | data_preprocess.write_result_to_file('./data/result_data',prepath_all,test_x_len,test_x_cut_word,test_x_fenge) 271 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch 2 | ====== 3 | pytorch用Textcnn-bilstm-crf模型实现命名实体识别
4 | --------- 5 | Data preprocessing 6 | ------ 7 | The data preprocessing file is 'data_preprocess.py' 8 | 9 | Model and training 10 | -------- 11 | The model and the training procedure are both in a single file, 'cnn-bilstm-crf.py' 12 | 13 | Prediction 14 | -------- 15 | The prediction script is 'predict.py' 16 | 17 | ------------ 18 | 19 | Data 20 | -------- 21 | The data is stored in the 'data' folder
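
Data format
--------
'data_preprocess.py' expects './data/train_data' (and './data/test_data') to hold one token and its tag per line, separated by a single space, with a blank line between sentences; sentences longer than 100 tokens are split at ',' or '、', and shorter ones are padded with index 0. The tokens and tag names below are only illustrative — the actual tagset is whatever appears in the second column of the data files:

```
北 B-LOC
京 I-LOC
今 O
天 O
下 O
雨 O
。 O

明 O
天 O
晴 O
。 O
```

Tagging raw text
--------
Neither the training nor the prediction script shows how to tag a sentence outside the test-file pipeline. The sketch below is not part of the repository: it assumes the BILSTM_CRF class has been copied from 'predict.py' into the current scope, that a trained checkpoint exists at './model/best_model.pth', that './data/train_data' is unchanged so 'data_preprocess.get_dic()' rebuilds the same dictionaries used during training, and that the data is character-level; the example sentences are made up.

```python
import torch
from torch.autograd import Variable  # the repo targets the old (pre-0.4) PyTorch API
import data_preprocess

MAX_LEN = 100
use_cuda = False  # the copied BILSTM_CRF class reads this module-level flag; keep everything on the CPU here

# rebuild the dictionaries exactly as during training
word2index, index2word, tag2index, index2tag = data_preprocess.get_dic()

# same hyper-parameters as in cnn-bilstm-crf.py / predict.py
model = BILSTM_CRF(len(word2index), tag2index, emb_dim=128, hidden_dim=256, batch_size=16)
model.load_state_dict(torch.load('./model/best_model.pth',
                                 map_location=lambda storage, loc: storage))
model.eval()  # the original scripts leave this commented out, but BatchNorm/Dropout need it for inference

# get_bilstm_out() squeezes the CNN output, so the batch must hold at least two sentences
sentences = ['这是一个测试句子。', '另一个句子。']  # hypothetical input

ids, masks = [], []
for sent in sentences:
    # map each character to its id (assumes character-level tokens);
    # unseen characters fall back to index 1 ('unknow'), as in getTest_xy()
    sen = [word2index.get(ch, 1) for ch in sent][:MAX_LEN]
    masks.append([1] * len(sen) + [0] * (MAX_LEN - len(sen)))
    ids.append(sen + [0] * (MAX_LEN - len(sen)))

x = Variable(torch.LongTensor(ids), volatile=True)       # b, 100
mask = Variable(torch.ByteTensor(masks), volatile=True)  # b, 100

feats = model.get_bilstm_out(x)   # 100, b, num_tags emission scores
paths = model(feats, mask).data   # b, 100 tag indices from the Viterbi decoder

for sent, path in zip(sentences, paths):
    tags = [index2tag[int(t)] for t in path[:len(sent)]]
    print(list(zip(sent, tags)))
```

Because get_dic() re-reads './data/train_data' at import time, the word and tag indices only line up with the saved checkpoint if that file has not changed since training.

--------------------------------------------------------------------------------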