├── .DS_Store ├── 07-named-entity-recognition ├── .DS_Store ├── __init__.py ├── __pycache__ │ └── data_preprocess.cpython-36.pyc ├── cnn-bilstm-crf.py ├── data_preprocess.py ├── evaluating.py └── predict.py └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZephyrChenzf/NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch/6c8a1d1c145be5f2b37a8a603bdc06f4fbc8b3c1/.DS_Store -------------------------------------------------------------------------------- /07-named-entity-recognition/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZephyrChenzf/NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch/6c8a1d1c145be5f2b37a8a603bdc06f4fbc8b3c1/07-named-entity-recognition/.DS_Store -------------------------------------------------------------------------------- /07-named-entity-recognition/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZephyrChenzf/NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch/6c8a1d1c145be5f2b37a8a603bdc06f4fbc8b3c1/07-named-entity-recognition/__init__.py -------------------------------------------------------------------------------- /07-named-entity-recognition/__pycache__/data_preprocess.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZephyrChenzf/NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch/6c8a1d1c145be5f2b37a8a603bdc06f4fbc8b3c1/07-named-entity-recognition/__pycache__/data_preprocess.cpython-36.pyc -------------------------------------------------------------------------------- /07-named-entity-recognition/cnn-bilstm-crf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import Variable 4 | from torch.utils.data import DataLoader 5 | from torch import optim,nn 6 | import data_preprocess 7 | import os 8 | torch.manual_seed(1) 9 | 10 | os.environ['CUDA_VISIBLE_DEVICES']='0' 11 | use_cuda=torch.cuda.is_available() 12 | 13 | word2index,index2word,tag2index,index2tag=data_preprocess.get_dic() 14 | train_x,val_x,train_y,val_y,train_mask,val_mask=data_preprocess.get_data() 15 | trainDataSet=data_preprocess.TextDataSet(train_x,train_y,train_mask) 16 | valDataSet=data_preprocess.TextDataSet(val_x,val_y,val_mask) 17 | trainDataLoader=DataLoader(trainDataSet,batch_size=16,shuffle=True) 18 | valDataLoader=DataLoader(valDataSet,batch_size=16,shuffle=False) 19 | 20 | MAXLEN=100 21 | vcab_size=len(word2index) 22 | emb_dim=128 23 | hidden_dim=256 24 | num_epoches=20 25 | batch_size=16 26 | 27 | 28 | class BILSTM_CRF(nn.Module): 29 | def __init__(self,vcab_size,tag2index,emb_dim,hidden_dim,batch_size): 30 | super(BILSTM_CRF,self).__init__() 31 | self.vcab_size=vcab_size 32 | self.tag2index=tag2index 33 | self.num_tags=len(tag2index) 34 | self.emb_dim=emb_dim 35 | self.hidden_dim=hidden_dim 36 | self.batch_size=batch_size 37 | self.use_cuda=torch.cuda.is_available() 38 | self.embed=nn.Embedding(num_embeddings=vcab_size,embedding_dim=emb_dim)#b,100,128 39 | #->100,b,128 40 | self.bilstm=nn.LSTM(input_size=emb_dim,hidden_size=hidden_dim,num_layers=1,bidirectional=True,dropout=0.1)#100,b,256*2 41 | self.conv1 = nn.Sequential( 42 | #b,1,100,128 43 | nn.Conv2d(1, 128, (1,emb_dim),padding=0), # b,128,100,1 44 | nn.BatchNorm2d(128), 45 | nn.ReLU(True), 46 | ) 47 | self.conv2 = 
nn.Sequential( 48 | nn.Conv2d(1, 128, (3,emb_dim+2), padding=1), # b,128,100,1 49 | nn.BatchNorm2d(128), 50 | nn.ReLU(True), 51 | ) 52 | self.conv3 = nn.Sequential( 53 | nn.Conv2d(1, 128, (5,emb_dim+4), padding=2), # b,128,100,1 54 | nn.BatchNorm2d(128), 55 | nn.ReLU(True), 56 | ) 57 | #b,128*3,100,1->100,b,128*3 58 | self.linear1 = nn.Linear(hidden_dim * 2+128*3,hidden_dim) 59 | self.drop=nn.Dropout(0.2) 60 | self.classfy=nn.Linear(hidden_dim,self.num_tags)#100*b,10 61 | #->100,b,10 62 | # init transitions 63 | self.start_transitions = nn.Parameter(torch.Tensor(self.num_tags))# 64 | self.end_transitions = nn.Parameter(torch.Tensor(self.num_tags))# 65 | self.transitions = nn.Parameter(torch.Tensor(self.num_tags, self.num_tags))# 66 | nn.init.uniform(self.start_transitions, -0.1, 0.1) 67 | nn.init.uniform(self.end_transitions, -0.1, 0.1) 68 | nn.init.uniform(self.transitions, -0.1, 0.1) 69 | 70 | def init_hidden(self,batch_size):# 71 | h_h=Variable(torch.randn(2,batch_size,self.hidden_dim)) 72 | h_c=Variable(torch.randn(2,batch_size,self.hidden_dim)) 73 | if use_cuda: 74 | h_h=h_h.cuda() 75 | h_c=h_c.cuda() 76 | return (h_h,h_c) 77 | 78 | def get_bilstm_out(self,x):# 79 | batch_size = x.size(0) 80 | emb=self.embed(x) 81 | 82 | #cnn输出 83 | emb_cnn=emb.unsqueeze(1) 84 | cnn1=self.conv1(emb_cnn) 85 | cnn2=self.conv2(emb_cnn) 86 | cnn3=self.conv3(emb_cnn) 87 | cnn_cat=torch.cat((cnn1,cnn2,cnn3),1) 88 | cnn_out=cnn_cat.squeeze().permute(2,0,1)#100,b,128*3 89 | 90 | emb_rnn=emb.permute(1,0,2) 91 | init_hidden=self.init_hidden(batch_size) 92 | lstm_out,hidden=self.bilstm(emb_rnn,init_hidden) 93 | 94 | cat_out=torch.cat((cnn_out,lstm_out),2)#100,b,128*3+256*2 95 | s,b,h=cat_out.size() 96 | cat_out=cat_out.view(s*b,h) 97 | cat_out=self.linear1(cat_out) 98 | cat_out=self.drop(cat_out) 99 | cat_out=self.classfy(cat_out) 100 | cat_out=cat_out.view(s,b,-1) 101 | # out=out.permute(1,0,2) 102 | return cat_out 103 | 104 | def _log_sum_exp(self,tensor,dim): 105 | # Find the max value along `dim` 106 | offset, _ = tensor.max(dim)#b,m 107 | # Make offset broadcastable 108 | broadcast_offset = offset.unsqueeze(dim)#b,1,m 109 | # Perform log-sum-exp safely 110 | safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor - broadcast_offset), dim))#b,m 111 | # Add offset back 112 | return offset + safe_log_sum_exp 113 | 114 | def get_all_score(self,emissions,mask):# 115 | # emissions: (seq_length, batch_size, num_tags) 116 | # mask: (batch_size,seq_length) 117 | seq_length = emissions.size(0) 118 | mask = mask.permute(1,0).contiguous().float() 119 | 120 | log_prob = self.start_transitions.view(1, -1) + emissions[0] # b,m, 121 | 122 | for i in range(1, seq_length): 123 | broadcast_log_prob = log_prob.unsqueeze(2) # b,m,1 124 | broadcast_transitions = self.transitions.unsqueeze(0) #1,m,m 125 | broadcast_emissions = emissions[i].unsqueeze(1) # b,1,m 126 | 127 | score = broadcast_log_prob + broadcast_transitions \ 128 | + broadcast_emissions # b,m,m 129 | 130 | score = self._log_sum_exp(score, 1) # 131 | 132 | log_prob = score * mask[i].unsqueeze(1) + log_prob * (1. 
- mask[i]).unsqueeze( 133 | 1) 134 | 135 | # End transition score 136 | log_prob += self.end_transitions.view(1, -1) 137 | # Sum (log-sum-exp) over all possible tags 138 | return self._log_sum_exp(log_prob, 1) # (batch_size,) 139 | 140 | def get_real_score(self,emissions,mask,tags):# 141 | # emissions: (seq_length, batch_size, num_tags) 142 | # tags: (batch_size,seq_length) 143 | # mask: (batch_size,seq_length) 144 | seq_length = emissions.size(0)#s 145 | mask = mask.permute(1,0).contiguous().float() 146 | tags=tags.permute(1,0).contiguous() 147 | 148 | # Start transition score 149 | llh = self.start_transitions[tags[0]] # (batch_size,),T(start->firstTag) 150 | 151 | for i in range(seq_length - 1): 152 | cur_tag, next_tag = tags[i], tags[i+1] 153 | # Emission score for current tag 154 | llh += emissions[i].gather(1, cur_tag.view(-1, 1)).squeeze(1) * mask[i]#(b,1)->b->b*mask 155 | # Transition score to next tag 156 | transition_score = self.transitions[cur_tag.data, next_tag.data]# 157 | # Only add transition score if the next tag is not masked (mask == 1) 158 | llh += transition_score * mask[i+1]# 159 | 160 | # Find last tag index 161 | last_tag_indices = mask.long().sum(0) - 1 # (batch_size,) 162 | last_tags = tags.gather(0, last_tag_indices.view(1, -1)).squeeze(0)#b, 163 | 164 | # End transition score 165 | llh += self.end_transitions[last_tags]# 166 | # Emission score for the last tag, if mask is valid (mask == 1) 167 | llh += emissions[-1].gather(1, last_tags.view(-1, 1)).squeeze(1) * mask[-1]# 168 | 169 | return llh#b 170 | 171 | def neg_log_likelihood(self,feats,tags,mask): 172 | #feats: bilstm#100,b,10 173 | batch_size=feats.size(1) 174 | all_score=self.get_all_score(feats,mask)# 175 | real_score=self.get_real_score(feats,mask,tags)#b 176 | loss=(all_score.view(batch_size,1)-real_score.view(batch_size,1)).sum()/batch_size 177 | return loss # 178 | 179 | def viterbi_decode(self, emissions,mask): 180 | # emissions: (seq_length, batch_size, num_tags) 181 | # mask: (batch_size,seq_length) 182 | seq_length=emissions.size(0) 183 | batch_size=emissions.size(1) 184 | num_tags=emissions.size(2) 185 | length_mask = torch.sum(mask, dim=1).view(batch_size, 1).long() # b,1 186 | mask=mask.permute(1,0).contiguous().float()#s,b 187 | 188 | viterbi_history=[] 189 | viterbi_score = self.start_transitions.view(1, -1) + emissions[0] # b,m, 190 | 191 | for i in range(1, seq_length): 192 | broadcast_viterbi_score = viterbi_score.unsqueeze(2) # b,m,1 193 | broadcast_transitions = self.transitions.unsqueeze(0) #1,m,m 194 | broadcast_emissions = emissions[i].unsqueeze(1) # b,1,m 195 | 196 | score = broadcast_viterbi_score + broadcast_transitions \ 197 | + broadcast_emissions # b,m,m 198 | 199 | best_score,best_path = torch.max(score, 1) # b,m 200 | viterbi_history.append(best_path*mask[i].long().unsqueeze(1))# 201 | viterbi_score = best_score * mask[i].unsqueeze(1) + viterbi_score * (1. 
- mask[i]).unsqueeze( 202 | 1) # 203 | viterbi_score+=self.end_transitions.view(1,-1)#b,m 204 | best_score,last_path=torch.max(viterbi_score,1)#b 205 | last_path=last_path.view(-1,1)#b,1 206 | last_position = (length_mask.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, num_tags) - 1).contiguous() # b,1->b,1,m 207 | pad_zero = Variable(torch.zeros(batch_size, num_tags)).long() 208 | if use_cuda: 209 | pad_zero = pad_zero.cuda() 210 | viterbi_history.append(pad_zero)#(s-1,b,m)->(s,b,m) 211 | viterbi_history = torch.cat(viterbi_history).view(-1, batch_size, num_tags) # s,b,m 212 | insert_last = last_path.view(batch_size, 1, 1).expand(batch_size, 1, num_tags) # 213 | viterbi_history = viterbi_history.transpose(1, 0).contiguous() # b,s,m 214 | viterbi_history.scatter_(1, last_position, insert_last) # 215 | viterbi_history = viterbi_history.transpose(1, 0).contiguous() # s,b,m 216 | decode_idx = Variable(torch.LongTensor(seq_length, batch_size))#s,b 217 | if use_cuda: 218 | decode_idx = decode_idx.cuda() 219 | # decode_idx[-1] = 0 220 | for idx in range(len(viterbi_history)-2,-1,-1): 221 | last_path=torch.gather(viterbi_history[idx],1,last_path) 222 | decode_idx[idx]=last_path.data 223 | decode_idx=decode_idx.transpose(1,0)#b,s 224 | return decode_idx 225 | 226 | def forward(self, feats,mask): 227 | #feats #bilstm#100.b.10 228 | best_path=self.viterbi_decode(feats,mask)#b,s 229 | return best_path 230 | 231 | 232 | if use_cuda: 233 | model=BILSTM_CRF(vcab_size,tag2index,emb_dim,hidden_dim,batch_size).cuda() 234 | else: 235 | model=BILSTM_CRF(vcab_size,tag2index,emb_dim,hidden_dim,batch_size) 236 | 237 | optimzier=optim.Adam(model.parameters(),lr=1e-3) 238 | 239 | best_acc=0 240 | best_model=None 241 | for epoch in range(num_epoches): 242 | train_loss=0 243 | train_acc=0 244 | batch_len_all=0 245 | # model.train() 246 | for i,data in enumerate(trainDataLoader): 247 | x,y,mask=data 248 | batch_len = len(x) 249 | batch_len_all += batch_len 250 | if use_cuda: 251 | x=Variable(x).cuda() 252 | y=Variable(y).cuda() 253 | mask=Variable(mask).cuda() 254 | else: 255 | x=Variable(x) 256 | y=Variable(y) 257 | mask=Variable(mask) 258 | feats=model.get_bilstm_out(x) 259 | loss=model.neg_log_likelihood(feats,y,mask) 260 | train_loss+=loss.data[0] 261 | prepath=model(feats,mask)#b,s 262 | pre_y=prepath.masked_select(mask) 263 | true_y=y.masked_select(mask) 264 | acc_num=(pre_y==true_y).data.sum() 265 | # acc_num=(pre_y==true_y).sum() 266 | acc_pro=float(acc_num)/len(pre_y) 267 | train_acc+=acc_pro 268 | #backward 269 | optimzier.zero_grad() 270 | loss.backward() 271 | optimzier.step() 272 | if (i + 1) % 100 == 0: 273 | print('[{}/{}],train loss is:{:.6f},train acc is:{:.6f}'.format(i+1, len(trainDataLoader), 274 | train_loss / (i+1), 275 | train_acc / (i+1))) 276 | print( 277 | 'epoch:[{}],train loss is:{:.6f},train acc is:{:.6f}'.format(epoch, 278 | train_loss / (len(trainDataLoader)), 279 | train_acc / (len(trainDataLoader)))) 280 | # model.eval() 281 | eval_loss = 0 282 | eval_acc = 0 283 | batch_len_all = 0 284 | for i, data in enumerate(valDataLoader): 285 | x, y,mask = data 286 | batch_len = len(x) 287 | batch_len_all += batch_len 288 | if use_cuda: 289 | x = Variable(x, volatile=True).cuda() 290 | y = Variable(y, volatile=True).cuda() 291 | mask=Variable(mask,volatile=True).cuda() 292 | else: 293 | x = Variable(x, volatile=True) 294 | y = Variable(y, volatile=True) 295 | mask = Variable(mask, volatile=True) 296 | feats=model.get_bilstm_out(x) 297 | loss=model.neg_log_likelihood(feats,y,mask) 298 
| eval_loss += loss.data[0] 299 | prepath = model(feats, mask) # b,s 300 | pre_y = prepath.masked_select(mask) 301 | true_y = y.masked_select(mask) 302 | acc_num = (pre_y == true_y).data.sum() 303 | acc_pro = float(acc_num) / len(pre_y) 304 | eval_acc += acc_pro 305 | print('val loss is:{:.6f},val acc is:{:.6f}'.format( 306 | eval_loss / (len(valDataLoader) ), 307 | eval_acc / (len(valDataLoader)))) 308 | if best_acc < (eval_acc / (len(valDataLoader))): 309 | best_acc = eval_acc / (len(valDataLoader)) 310 | best_model = model.state_dict() 311 | print('best acc is {:.6f},best model is changed'.format(best_acc)) 312 | 313 | torch.save(best_model,'./model/best_model.pth') 314 | torch.save(model.state_dict(),'./model/last_model.pth') 315 | -------------------------------------------------------------------------------- /07-named-entity-recognition/data_preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.utils.data import Dataset 4 | 5 | 6 | word2index={'pad':0,'unknow':1} 7 | index2word={0:'pad',1:'unknow'} 8 | tag2index={'pad':0} 9 | index2tag={0:'pad'} 10 | MAX_LEN=100 11 | 12 | data=open('./data/train_data','r') 13 | train_x=[]#总x训练集 14 | train_y=[]#总y训练集 15 | sen_x=[]#每次存一句话的id组 16 | sen_y=[]#每次存一句话的标签id组 17 | 18 | #将数据按每句话分出来 19 | for line in data: 20 | line=line.strip() 21 | if(line=="" or line=="\n" or line=="\r\n"):#一句话结束了 22 | train_x.append(sen_x) 23 | sen_x=[] 24 | train_y.append(sen_y) 25 | sen_y=[] 26 | continue 27 | line=line.split(' ') 28 | if(len(line)<2): 29 | continue 30 | if line[0] in word2index:#如果在词典中有该词,将id给sen_x 31 | sen_x.append(word2index[line[0]]) 32 | else:#如果没有则加入字典,并将id给sen_x 33 | word2index[line[0]]=len(word2index) 34 | index2word[len(index2word)]=line[0] 35 | sen_x.append(word2index[line[0]]) 36 | if line[1] in tag2index:#同理,注意不同标签对应的id与初始碰到的标签有关 37 | sen_y.append((tag2index[line[1]])) 38 | else: 39 | tag2index[line[1]]=len(tag2index) 40 | index2tag[len(index2tag)]=line[1] 41 | sen_y.append(tag2index[line[1]]) 42 | 43 | #开始对每句话进行裁剪,主要是最大长度的限制 44 | train_x_cut=[] 45 | train_y_cut=[] 46 | train_mask=[] 47 | for i in range(len(train_x)): 48 | if len(train_x[i])<=MAX_LEN:#如果句子长度小于max_sen_len 49 | train_x_cut.append(train_x[i]) 50 | train_y_cut.append(train_y[i]) 51 | train_mask.append([1]*len(train_x[i])) 52 | continue 53 | while len(train_x[i])>MAX_LEN:#超过100,使用标点符号拆分句子,将前面部分加入训练集,若后面部分仍超过100,继续拆分 54 | flag=False 55 | for j in reversed(range(MAX_LEN)):#反向访问,99、98、97... 56 | if train_x[i][j]==word2index[','] or train_x[i][j]==word2index['、']: 57 | train_x_cut.append(train_x[i][:j+1]) 58 | train_y_cut.append(train_y[i][:j+1]) 59 | train_mask.append([1]*(j+1)) 60 | train_x[i]=train_x[i][j+1:] 61 | train_y[i]=train_y[i][j+1:] 62 | break 63 | if j==0: 64 | flag=True 65 | if flag: 66 | train_x_cut.append(train_x[i][:MAX_LEN]) 67 | train_y_cut.append(train_y[i][:MAX_LEN]) 68 | train_mask.append([1]*MAX_LEN) 69 | train_x[i]=train_x[i][MAX_LEN:] 70 | train_y[i]=train_y[i][MAX_LEN:] 71 | if len(train_x[i])<=MAX_LEN:#如果句子长度小于max_sen_len,最后没有超过100的直接加入 72 | train_x_cut.append(train_x[i]) 73 | train_y_cut.append(train_y[i]) 74 | train_mask.append([1]*len(train_x[i])) 75 | 76 | #给每段分割填充0 77 | for i in range(len(train_x_cut)): 78 | if len(train_x_cut[i]) MAX_LEN: # 超过100,使用标点符号拆分句子,将前面部分加入训练集,若后面部分仍超过100,继续拆分 163 | flag = False 164 | for j in reversed(range(MAX_LEN)): # 反向访问,99、98、97... 
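# scan backwards from position MAX_LEN-1 for the nearest ',' or '、', cut the chunk right after that punctuation, and repeat until every piece fits within MAX_LEN; if no such punctuation is found, cut hard at MAX_LEN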
165 | if test_x[i][j] == word2index[','] or test_x[i][j] == word2index['、']: 166 | test_x_cut.append(test_x[i][:j + 1]) 167 | test_mask.append([1]*(j+1)) 168 | test_x_len.append(j+1) 169 | test_x_cut_word.append(test_word[i][:j+1]) 170 | test_x[i] = test_x[i][j + 1:] 171 | test_x_cut_word[i]=test_word[i][j+1:] 172 | test_x_fenge.append(count) 173 | count+=1 174 | break 175 | if j == 0: 176 | flag = True 177 | if flag: 178 | test_x_cut.append(test_x[i][:MAX_LEN]) 179 | test_mask.append([1]*MAX_LEN) 180 | test_x_len.append(MAX_LEN) 181 | test_x_cut_word.append(test_word[i][:MAX_LEN]) 182 | test_x[i] = test_x[i][MAX_LEN:] 183 | test_x_cut_word[i]=test_word[i][MAX_LEN:] 184 | test_x_fenge.append(count) 185 | count+=1 186 | if len(test_x[i]) <= MAX_LEN: # 如果句子长度小于max_sen_len,最后没有超过100的直接加入 187 | test_x_cut.append(test_x[i]) 188 | test_mask.append([1]*len(test_x[i])) 189 | test_x_len.append(len(test_x[i])) 190 | test_x_cut_word.append(test_word[i]) 191 | count += 1 192 | 193 | # 给每段分割填充0 194 | for i in range(len(test_x_cut)): 195 | if len(test_x_cut[i]) < MAX_LEN: 196 | tlen = len(test_x_cut[i]) 197 | for j in range(MAX_LEN - tlen): 198 | test_x_cut[i].append(0) 199 | for i in range(len(test_mask)): 200 | if len(test_mask[i]) < MAX_LEN: 201 | tlen = len(test_mask[i]) 202 | for j in range(MAX_LEN - tlen): 203 | test_mask[i].append(0) 204 | #转化LongTensor 205 | test_x_cut=torch.LongTensor(test_x_cut) 206 | test_mask=torch.ByteTensor(test_mask) 207 | return test_x_cut,test_mask,test_x_len,test_x_cut_word,test_x_fenge 208 | 209 | #对测试集处理的函数 210 | def getTest_xy(filepath): 211 | data=open(filepath,'r') 212 | test_x = [] # 总x测试集 213 | test_y=[]#总y测试集 214 | test_word=[] #所有句话的词 215 | sen_x = [] # 每次存一句话的id组 216 | sen_y=[] #每次存一句话的标签id组 217 | sen_word=[]# 一句话的词 218 | 219 | # 将数据按每句话分出来 220 | for line in data: 221 | line = line.strip() 222 | if (line == "" or line == "\n" or line == "\r\n"): # 一句话结束了 223 | test_x.append(sen_x) 224 | sen_x = [] 225 | test_y.append(sen_y) 226 | sen_y=[] 227 | test_word.append(sen_word) 228 | sen_word=[] 229 | continue 230 | line = line.split(' ') 231 | sen_word.append(line[0]) 232 | if line[0] in word2index: # 如果在词典中有该词,将id给sen_x 233 | sen_x.append(word2index[line[0]]) 234 | sen_y.append(tag2index[line[1]]) 235 | else: # 如果没有则设为未识别 236 | sen_x.append(1) 237 | sen_y.append(tag2index[line[1]]) 238 | 239 | # 开始对每句话进行裁剪,主要是最大长度的限制 240 | test_x_cut = []#每个分割的词id 241 | test_y_cut=[]#每个分割的标签id 242 | test_mask=[] 243 | test_x_len=[]#每句话本身的长度(不填充的长度) 244 | test_x_cut_word=[]#所有分割出的词 245 | count=0#用于样本计数 246 | test_x_fenge=[]#用于记分割了的样本序号 247 | for i in range(len(test_x)): 248 | if len(test_x[i]) <= MAX_LEN: # 如果句子长度小于max_sen_len 249 | test_x_cut.append(test_x[i]) 250 | test_y_cut.append(test_y[i]) 251 | test_mask.append([1]*len(test_x[i])) 252 | test_x_len.append(len(test_x[i])) 253 | test_x_cut_word.append(test_word[i]) 254 | count+=1 255 | continue 256 | while len(test_x[i]) > MAX_LEN: # 超过100,使用标点符号拆分句子,将前面部分加入训练集,若后面部分仍超过100,继续拆分 257 | flag = False 258 | for j in reversed(range(MAX_LEN)): # 反向访问,99、98、97... 
259 | if test_x[i][j] == word2index[','] or test_x[i][j] == word2index['、']: 260 | test_x_cut.append(test_x[i][:j + 1]) 261 | test_y_cut.append(test_y[i][:j+1]) 262 | test_mask.append([1]*(j+1)) 263 | test_x_len.append(j+1) 264 | test_x_cut_word.append(test_word[i][:j+1]) 265 | test_x[i] = test_x[i][j + 1:] 266 | test_y[i]=test_y[i][j+1:] 267 | test_x_cut_word[i]=test_word[i][j+1:] 268 | test_x_fenge.append(count) 269 | count+=1 270 | break 271 | if j == 0: 272 | flag = True 273 | if flag: 274 | test_x_cut.append(test_x[i][:MAX_LEN]) 275 | test_y_cut.append(test_y[i][:MAX_LEN]) 276 | test_mask.append([1]*MAX_LEN) 277 | test_x_len.append(MAX_LEN) 278 | test_x_cut_word.append(test_word[i][:MAX_LEN]) 279 | test_x[i] = test_x[i][MAX_LEN:] 280 | test_y[i]=test_y[i][MAX_LEN:] 281 | test_x_cut_word[i]=test_word[i][MAX_LEN:] 282 | test_x_fenge.append(count) 283 | count+=1 284 | if len(test_x[i]) <= MAX_LEN: # 如果句子长度小于max_sen_len,最后没有超过100的直接加入 285 | test_x_cut.append(test_x[i]) 286 | test_y_cut.append(test_y[i]) 287 | test_mask.append([1]*len(test_x[i])) 288 | test_x_len.append(len(test_x[i])) 289 | test_x_cut_word.append(test_word[i]) 290 | count += 1 291 | 292 | # 给每段分割填充0 293 | # 给每段分割填充0 294 | for i in range(len(test_x_cut)): 295 | if len(test_x_cut[i]) < MAX_LEN: 296 | tlen = len(test_x_cut[i]) 297 | for j in range(MAX_LEN - tlen): 298 | test_x_cut[i].append(0) 299 | 300 | for i in range(len(test_y_cut)): 301 | if len(test_y_cut[i]) < MAX_LEN: 302 | tlen = len(test_y_cut[i]) 303 | for j in range(MAX_LEN - tlen): 304 | test_y_cut[i].append(0) 305 | 306 | for i in range(len(test_mask)): 307 | if len(test_mask[i]) < MAX_LEN: 308 | tlen = len(test_mask[i]) 309 | for j in range(MAX_LEN - tlen): 310 | test_mask[i].append(0) 311 | #转化LongTensor 312 | test_x_cut=torch.LongTensor(test_x_cut) 313 | test_y_cut=torch.LongTensor(test_y_cut) 314 | test_mask=torch.ByteTensor(test_mask) 315 | return test_x_cut,test_y_cut,test_mask,test_x_len,test_x_cut_word,test_x_fenge 316 | 317 | def write_result_to_file(filepath,y_pred,test_x_len,test_x_cut_word,test_x_fenge): 318 | f=open(filepath,'w') 319 | for i1 in range(y_pred.shape[0]):#样本数 320 | for i2 in range(test_x_len[i1]):#每个样本的真实长度 321 | tag_id=y_pred[i1][i2] 322 | word=test_x_cut_word[i1][i2] 323 | if tag_id in index2tag: 324 | tag=index2tag[tag_id] 325 | else: 326 | tag='o' 327 | f.write(word+' '+tag+'\n') 328 | if i1 not in test_x_fenge: 329 | f.write('\n') 330 | f.close() -------------------------------------------------------------------------------- /07-named-entity-recognition/evaluating.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import data_preprocess 3 | 4 | 5 | word2index,index2word,tag2index,index2tag=data_preprocess.get_dic() 6 | 7 | def evaluate(sourcePath,resultPath): 8 | f_s=open(sourcePath,'r') 9 | f_r=open(resultPath,'r') 10 | source_data=[] 11 | result_data=[] 12 | table_eval=np.zeros((len(tag2index),len(tag2index))) #横轴表示真实值,纵轴表示预测值 13 | for line in f_s: 14 | source_data.append(line) 15 | for line in f_r: 16 | result_data.append(line) 17 | length=len(source_data) 18 | for i in range(length): 19 | if source_data[i]=='\n': 20 | continue 21 | tag_t=source_data[i].split()[1] 22 | tag_p=result_data[i].split()[1] 23 | tag_t_inx=tag2index[tag_t] 24 | tag_p_inx=tag2index[tag_p] 25 | table_eval[tag_p_inx][tag_t_inx]+=1 26 | #print(table_eval) 27 | # 评测 28 | all_p_numerator=0 29 | all_p_denominator=0 30 | all_r_denominator = 0 31 | #具体评测内容自行添加 32 | # for i in 
range(2,len(tag2index)-1,2): 33 | # print('############'+index2tag[i]+'##############') 34 | # precision=(table_eval[i,i]+table_eval[i+1,i+1])/(table_eval[i,:].sum()+table_eval[i+1,:].sum())#precision 35 | # recall=(table_eval[i,i]+table_eval[i+1,i+1])/(table_eval[:,i].sum()+table_eval[:,i+1].sum())#recall 36 | # f1=2*precision*recall/(precision+recall)#f1 37 | # print("num: "+str(table_eval[i,i])+' '+str(table_eval[i+1,i+1])) 38 | # print(precision) 39 | # print(recall) 40 | # print(f1) 41 | # print("#########################") 42 | # all_p_numerator+=table_eval[i,i]+table_eval[i+1,i+1] 43 | # all_p_denominator+=table_eval[i,:].sum()+table_eval[i+1,:].sum() 44 | # all_r_denominator+=table_eval[:,i].sum()+table_eval[:,i+1].sum() 45 | # print("##########all################") 46 | all_p=all_p_numerator/all_p_denominator 47 | all_r=all_p_numerator/all_r_denominator 48 | all_f1=2*all_p*all_r/(all_p+all_r) 49 | print(all_p) 50 | print(all_r) 51 | print(all_f1) 52 | 53 | evaluate('./data/test_data','./data/result_data') 54 | print(tag2index) 55 | -------------------------------------------------------------------------------- /07-named-entity-recognition/predict.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.autograd import Variable 4 | from torch.utils.data import DataLoader 5 | from torch import optim, nn 6 | import data_preprocess 7 | import os 8 | 9 | torch.manual_seed(1) 10 | 11 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 12 | use_cuda = torch.cuda.is_available() 13 | 14 | word2index, index2word, tag2index, index2tag = data_preprocess.get_dic() 15 | test_x_cut, test_y_cut, test_mask, test_x_len, test_x_cut_word, test_x_fenge = data_preprocess.getTest_xy( 16 | './data/test_data') 17 | testDataSet = data_preprocess.TextDataSet(test_x_cut, test_y_cut, test_mask) 18 | 19 | testDataLoader = DataLoader(testDataSet, batch_size=16, shuffle=False) 20 | 21 | MAXLEN = 100 22 | vcab_size = len(word2index) 23 | emb_dim = 128 24 | hidden_dim = 256 25 | num_epoches = 20 26 | batch_size = 16 27 | 28 | 29 | class BILSTM_CRF(nn.Module): 30 | def __init__(self,vcab_size,tag2index,emb_dim,hidden_dim,batch_size): 31 | super(BILSTM_CRF,self).__init__() 32 | self.vcab_size=vcab_size 33 | self.tag2index=tag2index 34 | self.num_tags=len(tag2index) 35 | self.emb_dim=emb_dim 36 | self.hidden_dim=hidden_dim 37 | self.batch_size=batch_size 38 | self.use_cuda=torch.cuda.is_available() 39 | self.embed=nn.Embedding(num_embeddings=vcab_size,embedding_dim=emb_dim)#b,100,128 40 | #->100,b,128 41 | self.bilstm=nn.LSTM(input_size=emb_dim,hidden_size=hidden_dim,num_layers=1,bidirectional=True,dropout=0.1)#100,b,256*2 42 | self.conv1 = nn.Sequential( 43 | #b,1,100,128 44 | nn.Conv2d(1, 128, (1,emb_dim),padding=0), # b,128,100,1 45 | nn.BatchNorm2d(128), 46 | nn.ReLU(True), 47 | ) 48 | self.conv2 = nn.Sequential( 49 | nn.Conv2d(1, 128, (3,emb_dim+2), padding=1), # b,128,100,1 50 | nn.BatchNorm2d(128), 51 | nn.ReLU(True), 52 | ) 53 | self.conv3 = nn.Sequential( 54 | nn.Conv2d(1, 128, (5,emb_dim+4), padding=2), # b,128,100,1 55 | nn.BatchNorm2d(128), 56 | nn.ReLU(True), 57 | ) 58 | #b,128*3,100,1->100,b,128*3 59 | self.linear1 = nn.Linear(hidden_dim * 2+128*3,hidden_dim) 60 | self.drop=nn.Dropout(0.2) 61 | self.classfy=nn.Linear(hidden_dim,self.num_tags)#100*b,10 62 | #->100,b,10 63 | # init transitions 64 | self.start_transitions = nn.Parameter(torch.Tensor(self.num_tags))# 65 | self.end_transitions = 
nn.Parameter(torch.Tensor(self.num_tags))# 66 | self.transitions = nn.Parameter(torch.Tensor(self.num_tags, self.num_tags))# 67 | nn.init.uniform(self.start_transitions, -0.1, 0.1) 68 | nn.init.uniform(self.end_transitions, -0.1, 0.1) 69 | nn.init.uniform(self.transitions, -0.1, 0.1) 70 | 71 | def init_hidden(self,batch_size):# 72 | h_h=Variable(torch.randn(2,batch_size,self.hidden_dim)) 73 | h_c=Variable(torch.randn(2,batch_size,self.hidden_dim)) 74 | if use_cuda: 75 | h_h=h_h.cuda() 76 | h_c=h_c.cuda() 77 | return (h_h,h_c) 78 | 79 | def get_bilstm_out(self,x):# 80 | batch_size = x.size(0) 81 | emb=self.embed(x) 82 | 83 | #cnn输出 84 | emb_cnn=emb.unsqueeze(1) 85 | cnn1=self.conv1(emb_cnn) 86 | cnn2=self.conv2(emb_cnn) 87 | cnn3=self.conv3(emb_cnn) 88 | cnn_cat=torch.cat((cnn1,cnn2,cnn3),1) 89 | cnn_out=cnn_cat.squeeze().permute(2,0,1)#100,b,128*3 90 | 91 | emb_rnn=emb.permute(1,0,2) 92 | init_hidden=self.init_hidden(batch_size) 93 | lstm_out,hidden=self.bilstm(emb_rnn,init_hidden) 94 | 95 | cat_out=torch.cat((cnn_out,lstm_out),2)#100,b,128*3+256*2 96 | s,b,h=cat_out.size() 97 | cat_out=cat_out.view(s*b,h) 98 | cat_out=self.linear1(cat_out) 99 | cat_out=self.drop(cat_out) 100 | cat_out=self.classfy(cat_out) 101 | cat_out=cat_out.view(s,b,-1) 102 | # out=out.permute(1,0,2) 103 | return cat_out 104 | 105 | def _log_sum_exp(self,tensor,dim): 106 | # Find the max value along `dim` 107 | offset, _ = tensor.max(dim)#b,m 108 | # Make offset broadcastable 109 | broadcast_offset = offset.unsqueeze(dim)#b,1,m 110 | # Perform log-sum-exp safely 111 | safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor - broadcast_offset), dim))#b,m 112 | # Add offset back 113 | return offset + safe_log_sum_exp 114 | 115 | def get_all_score(self,emissions,mask):# 116 | # emissions: (seq_length, batch_size, num_tags) 117 | # mask: (batch_size,seq_length) 118 | seq_length = emissions.size(0) 119 | mask = mask.permute(1,0).contiguous().float() 120 | 121 | log_prob = self.start_transitions.view(1, -1) + emissions[0] # b,m, 122 | 123 | for i in range(1, seq_length): 124 | broadcast_log_prob = log_prob.unsqueeze(2) # b,m,1 125 | broadcast_transitions = self.transitions.unsqueeze(0) #1,m,m 126 | broadcast_emissions = emissions[i].unsqueeze(1) # b,1,m 127 | 128 | score = broadcast_log_prob + broadcast_transitions \ 129 | + broadcast_emissions # b,m,m 130 | 131 | score = self._log_sum_exp(score, 1) # 132 | 133 | log_prob = score * mask[i].unsqueeze(1) + log_prob * (1. 
- mask[i]).unsqueeze( 134 | 1) # 135 | 136 | # End transition score 137 | log_prob += self.end_transitions.view(1, -1) 138 | # Sum (log-sum-exp) over all possible tags 139 | return self._log_sum_exp(log_prob, 1) # (batch_size,) 140 | 141 | def get_real_score(self,emissions,mask,tags):# 142 | # emissions: (seq_length, batch_size, num_tags) 143 | # tags: (batch_size,seq_length) 144 | # mask: (batch_size,seq_length) 145 | seq_length = emissions.size(0)#s 146 | mask = mask.permute(1,0).contiguous().float() 147 | tags=tags.permute(1,0).contiguous() 148 | 149 | # Start transition score 150 | llh = self.start_transitions[tags[0]] # (batch_size,),T(start->firstTag) 151 | 152 | for i in range(seq_length - 1): 153 | cur_tag, next_tag = tags[i], tags[i+1] 154 | # Emission score for current tag 155 | llh += emissions[i].gather(1, cur_tag.view(-1, 1)).squeeze(1) * mask[i]#(b,1)->b->b*mask, 156 | # Transition score to next tag 157 | transition_score = self.transitions[cur_tag.data, next_tag.data]# 158 | # Only add transition score if the next tag is not masked (mask == 1) 159 | llh += transition_score * mask[i+1]# 160 | 161 | # Find last tag index 162 | last_tag_indices = mask.long().sum(0) - 1 # (batch_size,) 163 | last_tags = tags.gather(0, last_tag_indices.view(1, -1)).squeeze(0)#b 164 | 165 | # End transition score 166 | llh += self.end_transitions[last_tags]# 167 | # Emission score for the last tag, if mask is valid (mask == 1) 168 | llh += emissions[-1].gather(1, last_tags.view(-1, 1)).squeeze(1) * mask[-1]# 169 | 170 | return llh#b 171 | 172 | def neg_log_likelihood(self,feats,tags,mask): 173 | #feats: 174 | batch_size=feats.size(1) 175 | all_score=self.get_all_score(feats,mask)# 176 | real_score=self.get_real_score(feats,mask,tags)# 177 | loss=(all_score.view(batch_size,1)-real_score.view(batch_size,1)).sum()/batch_size 178 | return loss # 179 | 180 | def viterbi_decode(self, emissions,mask): 181 | # emissions: (seq_length, batch_size, num_tags) 182 | # mask: (batch_size,seq_length) 183 | seq_length=emissions.size(0) 184 | batch_size=emissions.size(1) 185 | num_tags=emissions.size(2) 186 | length_mask = torch.sum(mask, dim=1).view(batch_size, 1).long() # 187 | mask=mask.permute(1,0).contiguous().float()#s,b 188 | 189 | viterbi_history=[] 190 | viterbi_score = self.start_transitions.view(1, -1) + emissions[0] # 191 | 192 | for i in range(1, seq_length): 193 | broadcast_viterbi_score = viterbi_score.unsqueeze(2) # b,m,1 194 | broadcast_transitions = self.transitions.unsqueeze(0) #1,m,m 195 | broadcast_emissions = emissions[i].unsqueeze(1) # b,1,m 196 | 197 | score = broadcast_viterbi_score + broadcast_transitions \ 198 | + broadcast_emissions # b,m,m 199 | 200 | best_score,best_path = torch.max(score, 1) # 201 | viterbi_history.append(best_path*mask[i].long().unsqueeze(1))# 202 | viterbi_score = best_score * mask[i].unsqueeze(1) + viterbi_score * (1. 
- mask[i]).unsqueeze( 203 | 1) # 204 | viterbi_score+=self.end_transitions.view(1,-1)#b,m 205 | best_score,last_path=torch.max(viterbi_score,1)#b 206 | last_path=last_path.view(-1,1)#b,1 207 | last_position = (length_mask.contiguous().view(batch_size, 1, 1).expand(batch_size, 1, num_tags) - 1).contiguous() # 208 | pad_zero = Variable(torch.zeros(batch_size, num_tags)).long() 209 | if use_cuda: 210 | pad_zero = pad_zero.cuda() 211 | viterbi_history.append(pad_zero)#(s-1,b,m)->(s,b,m) 212 | viterbi_history = torch.cat(viterbi_history).view(-1, batch_size, num_tags) # s,b,m 213 | insert_last = last_path.view(batch_size, 1, 1).expand(batch_size, 1, num_tags) # 214 | viterbi_history = viterbi_history.transpose(1, 0).contiguous() # b,s,m 215 | viterbi_history.scatter_(1, last_position, insert_last) # 216 | viterbi_history = viterbi_history.transpose(1, 0).contiguous() # s,b,m 217 | decode_idx = Variable(torch.LongTensor(seq_length, batch_size))# 218 | if use_cuda: 219 | decode_idx = decode_idx.cuda() 220 | # decode_idx[-1] = 0 221 | for idx in range(len(viterbi_history)-2,-1,-1): 222 | last_path=torch.gather(viterbi_history[idx],1,last_path) 223 | decode_idx[idx]=last_path.data 224 | decode_idx=decode_idx.transpose(1,0)#b,s 225 | return decode_idx 226 | 227 | def forward(self, feats,mask): 228 | #feats #bilstm的输出#100.b.10 229 | best_path=self.viterbi_decode(feats,mask)#最佳路径b,s 230 | return best_path 231 | 232 | if use_cuda: 233 | model = BILSTM_CRF(vcab_size, tag2index, emb_dim, hidden_dim, batch_size).cuda() 234 | else: 235 | model = BILSTM_CRF(vcab_size, tag2index, emb_dim, hidden_dim, batch_size) 236 | 237 | model.load_state_dict(torch.load('./model/best_model.pth')) 238 | 239 | # model.eval() 240 | test_loss = 0 241 | test_acc = 0 242 | batch_len_all = 0 243 | prepath_all=[]# 244 | for i, data in enumerate(testDataLoader): 245 | x, y, mask = data 246 | batch_len = len(x) 247 | batch_len_all += batch_len 248 | if use_cuda: 249 | x = Variable(x, volatile=True).cuda() 250 | y = Variable(y, volatile=True).cuda() 251 | mask = Variable(mask, volatile=True).cuda() 252 | else: 253 | x = Variable(x, volatile=True) 254 | y = Variable(y, volatile=True) 255 | mask = Variable(mask, volatile=True) 256 | feats = model.get_bilstm_out(x) 257 | loss = model.neg_log_likelihood(feats, y, mask) 258 | test_loss += loss.data[0] 259 | prepath = model(feats, mask) # b,s 260 | prepath_all.append(prepath) 261 | pre_y = prepath.masked_select(mask) 262 | true_y = y.masked_select(mask) 263 | acc_num = (pre_y == true_y).data.sum() 264 | acc_pro = float(acc_num) / len(pre_y) 265 | test_acc += acc_pro 266 | print('test loss is:{:.6f},test acc is:{:.6f}'.format(test_loss / (len(testDataLoader)),test_acc / (len(testDataLoader)))) 267 | 268 | #写入结果文件 269 | prepath_all=torch.cat(prepath_all).data 270 | data_preprocess.write_result_to_file('./data/result_data',prepath_all,test_x_len,test_x_cut_word,test_x_fenge) 271 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | NER-Sequence-labeling--Textcnn-bilstm-crf-pytorch 2 | ====== 3 | pytorch用Textcnn-bilstm-crf模型实现命名实体识别
4 | --------- 5 | Data preprocessing 6 | ------ 7 | The data preprocessing file is 'data_preprocess.py' 8 | 9 | Model and training 10 | -------- 11 | The model and the training procedure are both in a single file, 'cnn-bilstm-crf.py' 12 | 13 | Prediction 14 | -------- 15 | The prediction script is 'predict.py' 16 | 17 | ------------ 18 | 19 | Data 20 | -------- 21 | The data is stored in the 'data' folder
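
Data format
--------
'data_preprocess.py' expects './data/train_data' (and './data/test_data') to hold one token and its tag per line, separated by a single space, with a blank line between sentences; sentences longer than 100 tokens are split at ',' or '、', and shorter ones are padded with index 0. The tokens and tag names below are only illustrative — the actual tagset is whatever appears in the second column of the data files:

```
北 B-LOC
京 I-LOC
今 O
天 O
下 O
雨 O
。 O

明 O
天 O
晴 O
。 O
```

Tagging raw text
--------
Neither the training nor the prediction script shows how to tag a sentence outside the test-file pipeline. The sketch below is not part of the repository: it assumes the BILSTM_CRF class has been copied from 'predict.py' into the current scope, that a trained checkpoint exists at './model/best_model.pth', that './data/train_data' is unchanged so 'data_preprocess.get_dic()' rebuilds the same dictionaries used during training, and that the data is character-level; the example sentences are made up.

```python
import torch
from torch.autograd import Variable  # the repo targets the old (pre-0.4) PyTorch API
import data_preprocess

MAX_LEN = 100
use_cuda = False  # the copied BILSTM_CRF class reads this module-level flag; keep everything on the CPU here

# rebuild the dictionaries exactly as during training
word2index, index2word, tag2index, index2tag = data_preprocess.get_dic()

# same hyper-parameters as in cnn-bilstm-crf.py / predict.py
model = BILSTM_CRF(len(word2index), tag2index, emb_dim=128, hidden_dim=256, batch_size=16)
model.load_state_dict(torch.load('./model/best_model.pth',
                                 map_location=lambda storage, loc: storage))
model.eval()  # the original scripts leave this commented out, but BatchNorm/Dropout need it for inference

# get_bilstm_out() squeezes the CNN output, so the batch must hold at least two sentences
sentences = ['这是一个测试句子。', '另一个句子。']  # hypothetical input

ids, masks = [], []
for sent in sentences:
    # map each character to its id (assumes character-level tokens);
    # unseen characters fall back to index 1 ('unknow'), as in getTest_xy()
    sen = [word2index.get(ch, 1) for ch in sent][:MAX_LEN]
    masks.append([1] * len(sen) + [0] * (MAX_LEN - len(sen)))
    ids.append(sen + [0] * (MAX_LEN - len(sen)))

x = Variable(torch.LongTensor(ids), volatile=True)       # b, 100
mask = Variable(torch.ByteTensor(masks), volatile=True)  # b, 100

feats = model.get_bilstm_out(x)   # 100, b, num_tags emission scores
paths = model(feats, mask).data   # b, 100 tag indices from the Viterbi decoder

for sent, path in zip(sentences, paths):
    tags = [index2tag[int(t)] for t in path[:len(sent)]]
    print(list(zip(sent, tags)))
```

Because get_dic() re-reads './data/train_data' at import time, the word and tag indices only line up with the saved checkpoint if that file has not changed since training.

--------------------------------------------------------------------------------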