├── 人体阴阳与电能.txt ├── README.md ├── code_10_CrossEntropy.py ├── code_13_pooling.py ├── code_09_BNdetail.py ├── code_20_GPT2Test.py ├── code_19_BERTTest.py ├── code_02_moons.py ├── code_18_pipline.py ├── code_05_L2.py ├── code_06_Dropout.py ├── code_08_BN.py ├── code_25_BERT_NoPUNC.py ├── code_03_moons_fun.py ├── code_12_CONV.py ├── code_24_BERT_PROPN.py ├── code_07_Multi-sampleDropout.py ├── code_27_spellgcn.py ├── code_04_use_module.py ├── code_22_TextCNNInterpret.py ├── code_01_subtraction.py ├── code_29_serving.py ├── code_15_rnnwordtest.py ├── code_21_BERT_CH.py ├── code_28_CDial.py ├── code_11_skip-gram.py ├── code_14_TextCNN.py ├── code_16_AttLSTMModel.py ├── code_23_GNN_BERT.py ├── code_26_RGCNDGL.py └── code_17_Transformer.py /人体阴阳与电能.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/darvincy/Bert_based_book_code/HEAD/人体阴阳与电能.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 《基于BERT模型的自然语言处理实战》随书代码 2 | 3 | 随书数据资源可在官网下载:https://www.aianaconda.com/index/bert 4 | -------------------------------------------------------------------------------- /code_10_CrossEntropy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Mon Apr 8 22:19:48 2019 8 | """ 9 | 10 | import torch 11 | logits = torch.autograd.Variable(torch.tensor([[2, 0.5,6], [0.1,0, 3]])) 12 | labels = torch.autograd.Variable(torch.LongTensor([2,1])) 13 | print(logits) 14 | print(labels) 15 | print('Softmax:',torch.nn.Softmax(dim=1)(logits)) 16 | logsoftmax = torch.nn.LogSoftmax(dim=1)(logits) 17 | print('logsoftmax:',logsoftmax) 18 | output = torch.nn.NLLLoss()(logsoftmax, labels) 19 | print('NLLLoss:',output) 20 | print ( 'CrossEntropyLoss:', torch.nn.CrossEntropyLoss()(logits, labels) ) -------------------------------------------------------------------------------- /code_13_pooling.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Sat Apr 27 07:04:02 2019 8 | """ 9 | 10 | import torch 11 | 12 | img=torch.tensor([ [ [0.,0.,0.,0.],[1.,1.,1.,1.],[2.,2.,2.,2.],[3.,3.,3.,3.] ], 13 | [ [4.,4.,4.,4.],[5.,5.,5.,5.],[6.,6.,6.,6.],[7.,7.,7.,7.] 
] 14 | ]).reshape([1,2,4,4]) 15 | print(img) 16 | print(img[0][0]) 17 | print(img[0][1]) 18 | 19 | #torch.nn.functional.avg_pool2d(input, kernel_size, stride=None, padding=0, ceil_mode=False, count_include_pad=True) 20 | pooling=torch.nn.functional.max_pool2d(img,kernel_size =2) 21 | print("pooling:\n",pooling) 22 | pooling1=torch.nn.functional.max_pool2d(img,kernel_size =2,stride=1) 23 | print("pooling1:\n",pooling1) 24 | pooling2=torch.nn.functional.avg_pool2d(img,kernel_size =4,stride=1,padding=1) 25 | print("pooling2:\n",pooling2) 26 | pooling3=torch.nn.functional.avg_pool2d(img,kernel_size =4) 27 | print("pooling3:\n",pooling3) 28 | 29 | m1 = img.mean(3) 30 | print("第1次平均值结果:\n",m1) 31 | print("第2次平均值结果:\n",m1.mean(2)) 32 | -------------------------------------------------------------------------------- /code_09_BNdetail.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Tue Jan 21 19:34:35 2020 8 | """ 9 | 10 | 11 | import torch 12 | import torch.nn as nn 13 | data=torch.randn(2,2,2,1) 14 | print(data) 15 | obn=nn.BatchNorm2d(2,affine=True) #实例化自适应BN对象 16 | output=obn(data) 17 | 18 | 19 | print(obn.weight) 20 | print(obn.bias) 21 | print(obn.eps) 22 | print(output,output.size()) 23 | 24 | 25 | print("第1通道的数据:",data[:,0]) 26 | 27 | #计算第1通道数据的均值和方差 28 | Mean=torch.Tensor.mean(data[:,0]) 29 | Var=torch.Tensor.var(data[:,0],False) #False表示不使用贝塞尔校正,方差按N而非N-1计算,与BN的统计方式一致 30 | print(Mean) 31 | print(Var) 32 | 33 | #计算第1通道中第一个数据的BN:(x-Mean)/sqrt(Var+eps),eps加在开方之内,与nn.BatchNorm2d的实现一致 34 | batchnorm=((data[0][0][0][0]-Mean)/torch.pow(Var+obn.eps,0.5))\ 35 | *obn.weight[0]+obn.bias[0] 36 | print(batchnorm) 37 | 38 | 39 | 40 | 41 | #附带的小例子:expand/repeat与multinomial的用法 42 | import torch 43 | data=torch.randn(1,1,1)#tensor([[[1.3868]]]) 44 | data.expand(1, 1, 2)#tensor([[[1.3868, 1.3868]]]) 45 | data.repeat(1,1,2) 46 | import torch 47 | data=torch.rand(2,4)#tensor([[0.2316, 0.3987, 0.6225, 0.5304], 48 | # [0.7686, 0.3504, 0.8837, 0.7697]]) 49 | torch.multinomial(data, 1)#tensor([[1], [2]]) 50 | torch.multinomial(data, 1)#tensor([[1], [0]]) -------------------------------------------------------------------------------- /code_20_GPT2Test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Mar 20 11:10:34 2020 4 | 5 | @author: ljh 6 | """ 7 | 8 | import torch 9 | from transformers import GPT2Tokenizer, GPT2LMHeadModel 10 | 11 | # 加载预训练模型(权重) 12 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 13 | 14 | 15 | #编码输入 16 | indexed_tokens = tokenizer.encode("Who is Li Jinhong ? 
Li Jinhong is a") 17 | 18 | print( tokenizer.decode(indexed_tokens)) 19 | 20 | tokens_tensor = torch.tensor([indexed_tokens])#转换为张量 21 | 22 | # 加载预训练模型(权重) 23 | model = GPT2LMHeadModel.from_pretrained('gpt2') 24 | 25 | #将模型设置为评估模式 26 | model.eval() 27 | 28 | tokens_tensor = tokens_tensor.to('cuda') 29 | model.to('cuda') 30 | 31 | # 预测所有标记 32 | with torch.no_grad(): 33 | outputs = model(tokens_tensor) 34 | predictions = outputs[0] 35 | 36 | # 得到预测的下一词 37 | predicted_index = torch.argmax(predictions[0, -1, :]).item() 38 | predicted_text = tokenizer.decode(indexed_tokens + [predicted_index]) 39 | print(predicted_text) 40 | 41 | 42 | #生成一段完整的话 43 | stopids = tokenizer.convert_tokens_to_ids(["."])[0] 44 | past = None 45 | for i in range(100): 46 | with torch.no_grad(): 47 | output, past = model(tokens_tensor, past=past) 48 | token = torch.argmax(output[..., -1, :]) 49 | 50 | indexed_tokens += [token.tolist()] 51 | 52 | if stopids== token.tolist(): 53 | break 54 | tokens_tensor = token.unsqueeze(0) 55 | 56 | sequence = tokenizer.decode(indexed_tokens) 57 | 58 | print(sequence) 59 | 60 | 61 | -------------------------------------------------------------------------------- /code_19_BERTTest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Mar 18 10:43:59 2020 4 | 5 | @author: ljh 6 | """ 7 | 8 | 9 | import torch 10 | from transformers import BertTokenizer, BertForMaskedLM 11 | 12 | #加载预训练模型 tokenizer (vocabulary) 13 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 14 | 15 | #输入文本 16 | text = "[CLS] Who is Li Jinhong ? [SEP] Li Jinhong is a programmer [SEP]" 17 | tokenized_text = tokenizer.tokenize(text) 18 | print(tokenized_text) 19 | 20 | masked_index = 8 #掩码一个标记,用' BertForMaskedLM '预测回来 21 | tokenized_text[masked_index] = '[MASK]' 22 | print(tokenized_text) 23 | 24 | # 将标记转换为词汇表索引 25 | indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) 26 | # 将输入转换为PyTorch张量 27 | tokens_tensor = torch.tensor([indexed_tokens]) 28 | 29 | 30 | #指定设备 31 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 32 | print(device) 33 | 34 | # 加载预训练模型 (weights) 35 | model = BertForMaskedLM.from_pretrained('bert-base-uncased') 36 | model.eval() 37 | model.to(device) 38 | 39 | 40 | segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] 41 | segments_tensors = torch.tensor([segments_ids]).to(device) 42 | 43 | tokens_tensor = tokens_tensor.to(device) 44 | # 预测所有的tokens 45 | with torch.no_grad(): 46 | outputs = model(tokens_tensor, token_type_ids=segments_tensors) 47 | 48 | predictions = outputs[0] #[1, 15, 30522] 49 | 50 | predicted_index = torch.argmax(predictions[0, masked_index]).item() 51 | predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] #转成单词 52 | print('Predicted token is:',predicted_token) 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /code_02_moons.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Fri Feb 1 00:07:25 2019 8 | """ 9 | 10 | import sklearn.datasets #引入数据集 11 | import torch 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from code_03_moons_fun import LogicNet,plot_losses,predict,plot_decision_boundary 15 | 16 | 17 | 18 | 
torch.manual_seed(0) 19 | torch.cuda.manual_seed_all(0) 20 | 21 | torch.backends.cudnn.deterministic = True 22 | torch.backends.cudnn.benchmark = False 23 | 24 | np.random.seed(0) #设置随机数种子 25 | X, Y = sklearn.datasets.make_moons(200,noise=0.2) #生成2组半圆形数据 26 | 27 | arg = np.squeeze(np.argwhere(Y==0),axis = 1) #获取第1组数据索引 28 | arg2 = np.squeeze(np.argwhere(Y==1),axis = 1)#获取第2组数据索引 29 | 30 | plt.title("moons data") 31 | plt.scatter(X[arg,0], X[arg,1], s=100,c='b',marker='+',label='data1') 32 | plt.scatter(X[arg2,0], X[arg2,1],s=40, c='r',marker='o',label='data2') 33 | plt.legend() 34 | plt.show() 35 | 36 | 37 | 38 | model = LogicNet(inputdim=2,hiddendim=3,outputdim=2)#初始化模型 39 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01)#定义优化器 40 | 41 | 42 | xt = torch.from_numpy(X).type(torch.FloatTensor)#将Numpy数据转化为张量 43 | yt = torch.from_numpy(Y).type(torch.LongTensor) 44 | epochs = 1000#定义迭代次数 45 | losses = []#定义列表,用于接收每一步的损失值 46 | for i in range(epochs): 47 | loss = model.getloss(xt,yt) 48 | losses.append(loss.item()) 49 | optimizer.zero_grad()#清空之前的梯度 50 | loss.backward()#反向传播损失值 51 | optimizer.step()#更新参数 52 | 53 | 54 | 55 | plot_losses(losses) 56 | 57 | 58 | from sklearn.metrics import accuracy_score 59 | print(accuracy_score(model.predict(xt),yt)) 60 | 61 | 62 | plot_decision_boundary(lambda x : predict(model,x) ,xt.numpy(), yt.numpy()) 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /code_18_pipline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 30 10:22:55 2020 4 | 5 | @author: ljh 6 | """ 7 | 8 | from transformers import * 9 | 10 | nlp = pipeline("sentiment-analysis") 11 | print(nlp("I like this book!")) 12 | 13 | ##########################################feature-extraction 14 | import numpy as np 15 | nlp_features = pipeline('feature-extraction') 16 | output = nlp_features('Code Doctor Studio is a Chinese company based in BeiJing.') 17 | print(np.array(output).shape) # (Samples, Tokens, Vector Size)(1, 16, 768) 18 | 19 | 20 | ############################掩码语言建模 21 | nlp_fill = pipeline("fill-mask") 22 | print(nlp_fill.tokenizer.mask_token) 23 | print(nlp_fill(f"Li Jinhong wrote many {nlp_fill.tokenizer.mask_token} about artificial intelligence technology and helped many people.")) 24 | 25 | 26 | 27 | 28 | ############################抽取式问答 29 | 30 | 31 | nlp_qa = pipeline("question-answering") 32 | print(nlp_qa(context='Code Doctor Studio is a Chinese company based in BeiJing.', 33 | question='Where is Code Doctor Studio?') ) 34 | 35 | 36 | 37 | 38 | ###################################摘要 39 | 40 | TEXT_TO_SUMMARIZE = ''' 41 | In this notebook we will be using the transformer model, first introduced in this paper. Specifically, we will be using the BERT (Bidirectional Encoder Representations from Transformers) model from this paper. 42 | Transformer models are considerably larger than anything else covered in these tutorials. As such we are going to use the transformers library to get pre-trained transformers and use them as our embedding layers. We will freeze (not train) the transformer and only train the remainder of the model which learns from the representations produced by the transformer. In this case we will be using a multi-layer bi-directional GRU, however any model can learn from these representations. 
43 | ''' 44 | summarizer = pipeline('summarization') 45 | print(summarizer(TEXT_TO_SUMMARIZE)) 46 | 47 | 48 | # #################命名实体识别 49 | 50 | nlp_token_class = pipeline("ner") 51 | print(nlp_token_class( 52 | 'Code Doctor Studio is a Chinese company based in BeiJing.')) 53 | 54 | -------------------------------------------------------------------------------- /code_05_L2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Tue Apr 30 08:15:15 2019 8 | """ 9 | 10 | import sklearn.datasets #引入数据集 11 | import torch 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from code_03_moons_fun import LogicNet,moving_average,predict,plot_decision_boundary 15 | 16 | np.random.seed(0) #设置随机数种子 17 | X, Y = sklearn.datasets.make_moons(40,noise=0.2) #生成2组半圆形数据 18 | 19 | arg = np.squeeze(np.argwhere(Y==0),axis = 1) #获取第1组数据索引 20 | arg2 = np.squeeze(np.argwhere(Y==1),axis = 1)#获取第2组数据索引 21 | 22 | plt.title("train moons data") 23 | plt.scatter(X[arg,0], X[arg,1], s=100,c='b',marker='+',label='data1') 24 | plt.scatter(X[arg2,0], X[arg2,1],s=40, c='r',marker='o',label='data2') 25 | plt.legend() 26 | plt.show() 27 | 28 | 29 | model = LogicNet(inputdim=2,hiddendim=500,outputdim=2)#初始化模型 30 | #添加正则化处理 31 | weight_p, bias_p = [],[] 32 | for name, p in model.named_parameters(): 33 | if 'bias' in name: 34 | bias_p += [p] 35 | else: 36 | weight_p += [p] 37 | optimizer = torch.optim.Adam([{'params': weight_p, 'weight_decay':0.001}, 38 | {'params': bias_p, 'weight_decay':0}], 39 | lr=0.01) 40 | 41 | 42 | 43 | xt = torch.from_numpy(X).type(torch.FloatTensor)#将Numpy数据转化为张量 44 | yt = torch.from_numpy(Y).type(torch.LongTensor) 45 | epochs = 1000#定义迭代次数 46 | losses = []#定义列表,用于接收每一步的损失值 47 | for i in range(epochs): 48 | loss = model.getloss(xt,yt) 49 | losses.append(loss.item()) 50 | optimizer.zero_grad()#清空之前的梯度 51 | loss.backward()#反向传播损失值 52 | optimizer.step()#更新参数 53 | 54 | 55 | avgloss= moving_average(losses) #获得损失值的移动平均值 56 | plt.figure(1) 57 | plt.subplot(211) 58 | plt.plot(range(len(avgloss)), avgloss, 'b--') 59 | plt.xlabel('step number') 60 | plt.ylabel('Training loss') 61 | plt.title('step number vs. 
Training loss') 62 | plt.show() 63 | 64 | 65 | plot_decision_boundary(lambda x : predict(model,x) ,X, Y) 66 | from sklearn.metrics import accuracy_score 67 | print("训练时的准确率:",accuracy_score(model.predict(xt),yt)) 68 | 69 | Xtest, Ytest = sklearn.datasets.make_moons(80,noise=0.2) #生成2组半圆形数据 70 | plot_decision_boundary(lambda x : predict(model,x) ,Xtest, Ytest) 71 | Xtest_t = torch.from_numpy(Xtest).type(torch.FloatTensor)#将Numpy数据转化为张量 72 | Ytest_t = torch.from_numpy(Ytest).type(torch.LongTensor) 73 | print("测试时的准确率:",accuracy_score(model.predict(Xtest_t),Ytest_t)) 74 | -------------------------------------------------------------------------------- /code_06_Dropout.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Fri Feb 1 00:07:25 2019 8 | """ 9 | 10 | import sklearn.datasets #引入数据集 11 | import torch 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from code_03_moons_fun import LogicNet,moving_average,predict,plot_decision_boundary 15 | import torch.nn as nn 16 | 17 | 18 | 19 | 20 | np.random.seed(0) #设置随机数种子 21 | X, Y = sklearn.datasets.make_moons(40,noise=0.2) #生成2组半圆形数据 22 | 23 | arg = np.squeeze(np.argwhere(Y==0),axis = 1) #获取第1组数据索引 24 | arg2 = np.squeeze(np.argwhere(Y==1),axis = 1)#获取第2组数据索引 25 | 26 | plt.title("train moons data") 27 | plt.scatter(X[arg,0], X[arg,1], s=100,c='b',marker='+',label='data1') 28 | plt.scatter(X[arg2,0], X[arg2,1],s=40, c='r',marker='o',label='data2') 29 | plt.legend() 30 | plt.show() 31 | 32 | #继承LogicNet类,构建网络模型 33 | class Logic_Dropout_Net(LogicNet): 34 | def __init__(self,inputdim,hiddendim,outputdim):#初始化网络结构 35 | super(Logic_Dropout_Net,self).__init__(inputdim,hiddendim,outputdim) 36 | 37 | def forward(self,x): #搭建用两层全连接组成的网络模型 38 | x = self.Linear1(x)#将输入数据传入第1层 39 | x = torch.tanh(x)#对第一层的结果进行非线性变换 40 | x = nn.functional.dropout(x, p=0.07, training=self.training) 41 | x = self.Linear2(x)#再将数据传入第2层 42 | return x 43 | 44 | 45 | 46 | 47 | model = Logic_Dropout_Net(inputdim=2,hiddendim=500,outputdim=2)#初始化模型 48 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01)#定义优化器 49 | 50 | 51 | xt = torch.from_numpy(X).type(torch.FloatTensor)#将Numpy数据转化为张量 52 | yt = torch.from_numpy(Y).type(torch.LongTensor) 53 | epochs = 1000#定义迭代次数 54 | losses = []#定义列表,用于接收每一步的损失值 55 | for i in range(epochs): 56 | loss = model.getloss(xt,yt) 57 | losses.append(loss.item()) 58 | optimizer.zero_grad()#清空之前的梯度 59 | loss.backward()#反向传播损失值 60 | optimizer.step()#更新参数 61 | 62 | 63 | avgloss= moving_average(losses) #获得损失值的移动平均值 64 | plt.figure(1) 65 | plt.subplot(211) 66 | plt.plot(range(len(avgloss)), avgloss, 'b--') 67 | plt.xlabel('step number') 68 | plt.ylabel('Training loss') 69 | plt.title('step number vs. 
Training loss') 70 | plt.show() 71 | 72 | 73 | plot_decision_boundary(lambda x : predict(model,x) ,X, Y) 74 | from sklearn.metrics import accuracy_score 75 | print("训练时的准确率:",accuracy_score(model.predict(xt),yt)) 76 | 77 | Xtest, Ytest = sklearn.datasets.make_moons(80,noise=0.2) #生成2组半圆形数据 78 | plot_decision_boundary(lambda x : predict(model,x) ,Xtest, Ytest) 79 | Xtest_t = torch.from_numpy(Xtest).type(torch.FloatTensor)#将Numpy数据转化为张量 80 | Ytest_t = torch.from_numpy(Ytest).type(torch.LongTensor) 81 | print("测试时的准确率:",accuracy_score(model.predict(Xtest_t),Ytest_t)) 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /code_08_BN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Fri Feb 1 00:07:25 2019 8 | """ 9 | 10 | import sklearn.datasets #引入数据集 11 | import torch 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from code_03_moons_fun import LogicNet,moving_average,predict,plot_decision_boundary 15 | import torch.nn as nn 16 | 17 | 18 | 19 | 20 | np.random.seed(0) #设置随机数种子 21 | X, Y = sklearn.datasets.make_moons(40,noise=0.2) #生成2组半圆形数据 22 | 23 | arg = np.squeeze(np.argwhere(Y==0),axis = 1) #获取第1组数据索引 24 | arg2 = np.squeeze(np.argwhere(Y==1),axis = 1)#获取第2组数据索引 25 | 26 | plt.title("train moons data") 27 | plt.scatter(X[arg,0], X[arg,1], s=100,c='b',marker='+',label='data1') 28 | plt.scatter(X[arg2,0], X[arg2,1],s=40, c='r',marker='o',label='data2') 29 | plt.legend() 30 | plt.show() 31 | 32 | #继承LogicNet类,构建网络模型 33 | class Logic_BN_Net(LogicNet): 34 | def __init__(self,inputdim,hiddendim,outputdim):#初始化网络结构 35 | super(Logic_BN_Net,self).__init__(inputdim,hiddendim,outputdim) 36 | self.BN = nn.BatchNorm1d(hiddendim) #定义BN层 37 | def forward(self,x): #搭建用两层全连接组成的网络模型 38 | x = self.Linear1(x)#将输入数据传入第1层 39 | x = torch.tanh(x)#对第一层的结果进行非线性变换 40 | x = self.BN(x)#将第一层的数据做BN处理 41 | x = self.Linear2(x)#再将数据传入第2层 42 | return x 43 | 44 | 45 | 46 | 47 | model = Logic_BN_Net(inputdim=2,hiddendim=500,outputdim=2)#初始化模型 48 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01)#定义优化器 49 | 50 | 51 | xt = torch.from_numpy(X).type(torch.FloatTensor)#将Numpy数据转化为张量 52 | yt = torch.from_numpy(Y).type(torch.LongTensor) 53 | epochs = 200#定义迭代次数 54 | losses = []#定义列表,用于接收每一步的损失值 55 | for i in range(epochs): 56 | loss = model.getloss(xt,yt) 57 | losses.append(loss.item()) 58 | optimizer.zero_grad()#清空之前的梯度 59 | loss.backward()#反向传播损失值 60 | optimizer.step()#更新参数 61 | 62 | 63 | avgloss= moving_average(losses) #获得损失值的移动平均值 64 | plt.figure(1) 65 | plt.subplot(211) 66 | plt.plot(range(len(avgloss)), avgloss, 'b--') 67 | plt.xlabel('step number') 68 | plt.ylabel('Training loss') 69 | plt.title('step number vs. Training loss') 70 | plt.show() 71 | 72 | 73 | plot_decision_boundary(lambda x : predict(model,x) ,X, Y) 74 | from sklearn.metrics import accuracy_score 75 | print("训练时的准确率:",accuracy_score(model.predict(xt),yt)) 76 | 77 | Xtest, Ytest = sklearn.datasets.make_moons(80,noise=0.2) #生成2组半圆形数据 78 | plot_decision_boundary(lambda x : predict(model,x) ,Xtest, Ytest) 79 | Xtest_t = torch.from_numpy(Xtest).type(torch.FloatTensor)#将Numpy数据转化为张量 80 | Ytest_t = torch.from_numpy(Ytest).type(torch.LongTensor) 81 | print("测试时的准确率:",accuracy_score(model.predict(Xtest_t),Ytest_t)) 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /code_25_BERT_NoPUNC.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 10 07:10:37 2020 4 | 5 | @author: ljh 6 | """ 7 | 8 | #没标点 9 | 10 | import re 11 | import pickle 12 | import torch 13 | from tqdm import tqdm 14 | 15 | from code_24_BERT_PROPN import (device,df_test,df_train_val, 16 | getmodel,insert_tag,tokenize) 17 | 18 | def clean_and_replace_target_name(row): #去掉标点符号 19 | text = row['TextClean'] 20 | text = re.sub("[^a-zA-Z]"," ",text) #只保留英文字符,去掉标点及数字 21 | A = re.sub("[^a-zA-Z]"," ",row['A']) #只保留英文字符 22 | B = re.sub("[^a-zA-Z]"," ",row['B']) #只保留英文字符 23 | 24 | # replace names # 先分词,再取第一个,Dehner--》 ['de', '##hner']--》de 确保不被分成2个词 25 | text = re.sub(str(A), tokenizer.tokenize(A)[0], text) #将名称只换做一个词Bob Suter--》bob 26 | text = re.sub(str(B), tokenizer.tokenize(B)[0], text) 27 | 28 | text = re.sub(r"THISISA", r"[THISISA]", text) 29 | text = re.sub(r"THISISB", r"[THISISB]", text) 30 | text = re.sub(r"THISISP", r"[THISISP]", text) 31 | 32 | text = re.sub(' +', ' ', text) #去掉多个空格 33 | return text 34 | 35 | 36 | def savepkl(df,prename=''): 37 | offsets_lst = [] 38 | tokens_lst = [] 39 | max_len=269 #设置处理文本的最大长度 40 | bert_prediction = [] 41 | for _, row in tqdm(df.iterrows(),total=len(df)): 42 | 43 | row.loc['TextClean'] = insert_tag(row,hasbrack= False)#插入标签,防止去标点时,一起被去掉 44 | text = clean_and_replace_target_name(row)#去除标点、空格,并压缩被指代的名词 45 | 46 | encode_rel= tokenizer.encode_plus(text,max_length=max_len,pad_to_max_length=True)#向量化 len=90 47 | 48 | tokens, offsets ,masks= tokenize(encode_rel['input_ids'] , 49 | tokenizer,encode_rel['attention_mask'])#获取标签偏移 50 | offsets_lst.append(offsets) 51 | tokens_lst.append(tokens) 52 | #验证代词位置 53 | # print( tokenizer.decode(tokens),len(tokens)) 54 | # print( tokenizer.decode(np.asarray(tokens)[list(offsets)])) 55 | token_tensor = torch.LongTensor([tokens]).to(device) 56 | masks_tensor = torch.LongTensor([masks]).to(device) 57 | #输入BERT模型 58 | bert_outputs,bert_last_outputs= model(token_tensor,attention_mask =masks_tensor) #[1, 107, 768] , [1, 768] 59 | bert_prediction.append(bert_outputs.cpu().numpy())#([1, 266, 768]) 60 | 61 | pickle.dump(offsets_lst, open(prename+'offsets_NoPUNC.pkl', "wb")) 62 | pickle.dump(tokens_lst, open(prename+'tokens_NoPUNC_padding.pkl', "wb")) 63 | pickle.dump(bert_prediction, open(prename+'bert_outputs_forNoPUNC.pkl', "wb")) 64 | 65 | if __name__ == '__main__': 66 | 67 | tokenizer,model = getmodel() 68 | model.to(device) 69 | torch.set_grad_enabled(False) 70 | 71 | savepkl(df_test, 'test_') 72 | savepkl(df_train_val, ) -------------------------------------------------------------------------------- /code_03_moons_fun.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 
""" 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Fri Feb 1 00:07:25 2019 8 | """ 9 | 10 | 11 | import torch 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | import torch.nn as nn 15 | 16 | #继承nn.Module类,构建网络模型 17 | class LogicNet(nn.Module): 18 | def __init__(self,inputdim,hiddendim,outputdim):#初始化网络结构 19 | super(LogicNet,self).__init__() 20 | self.Linear1 = nn.Linear(inputdim,hiddendim) #定义全连接层 21 | self.Linear2 = nn.Linear(hiddendim,outputdim)#定义全连接层 22 | self.criterion = nn.CrossEntropyLoss() #定义交叉熵函数 23 | 24 | def forward(self,x): #搭建用两层全连接组成的网络模型 25 | x = self.Linear1(x)#将输入数据传入第1层 26 | x = torch.tanh(x)#对第一层的结果进行非线性变换 27 | x = self.Linear2(x)#再将数据传入第2层 28 | # print("LogicNet") 29 | return x 30 | 31 | def predict(self,x):#实现LogicNet类的预测接口 32 | #调用自身网络模型,并对结果进行softmax处理,分别得出预测数据属于每一类的概率 33 | pred = torch.softmax(self.forward(x),dim=1) 34 | return torch.argmax(pred,dim=1) #返回每组预测概率中最大的索引 35 | 36 | def getloss(self,x,y): #实现LogicNet类的损失值计算接口 37 | y_pred = self.forward(x) 38 | loss = self.criterion(y_pred,y)#计算损失值得交叉熵 39 | return loss 40 | 41 | 42 | 43 | 44 | def moving_average(a, w=10):#定义函数计算移动平均损失值 45 | if len(a) < w: 46 | return a[:] 47 | return [val if idx < w else sum(a[(idx-w):idx])/w for idx, val in enumerate(a)] 48 | 49 | def plot_losses(losses): 50 | avgloss= moving_average(losses) #获得损失值的移动平均值 51 | plt.figure(1) 52 | plt.subplot(211) 53 | plt.plot(range(len(avgloss)), avgloss, 'b--') 54 | plt.xlabel('step number') 55 | plt.ylabel('Training loss') 56 | plt.title('step number vs. Training loss') 57 | plt.show() 58 | 59 | def predict(model,x): #封装支持Numpy的预测接口 60 | x = torch.from_numpy(x).type(torch.FloatTensor) 61 | ans = model.predict(x) 62 | return ans.numpy() 63 | 64 | def plot_decision_boundary(pred_func,X,Y):#在直角坐标系中可视化模型能力 65 | #计算取值范围 66 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 67 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 68 | h = 0.01 69 | #在坐标系中采用数据,生成网格矩阵,用于输入模型 70 | xx,yy=np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 71 | #将数据输入并进行预测 72 | Z = pred_func(np.c_[xx.ravel(), yy.ravel()]) 73 | Z = Z.reshape(xx.shape) 74 | #将预测的结果可视化 75 | plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral) 76 | plt.title("Linear predict") 77 | arg = np.squeeze(np.argwhere(Y==0),axis = 1) 78 | arg2 = np.squeeze(np.argwhere(Y==1),axis = 1) 79 | plt.scatter(X[arg,0], X[arg,1], s=100,c='b',marker='+') 80 | plt.scatter(X[arg2,0], X[arg2,1],s=40, c='r',marker='o') 81 | plt.show() 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /code_12_CONV.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Thu Apr 25 15:18:57 2019 8 | """ 9 | 10 | import torch 11 | # [batch, in_channels, in_height, in_width] [训练时一个batch的图片数量, 图像通道数, 图片高度, 图片宽度] 12 | input1 = torch.ones([1, 1, 5, 5]) 13 | input2 = torch.ones([1, 2, 5, 5]) 14 | input3 = torch.ones([1, 1, 4, 4]) 15 | # [ out_channels, in_channels,filter_height, filter_width] [卷积核个数,图像通道数,卷积核的高度,卷积核的宽度] 16 | filter1 = torch.tensor([-1.0,0,0,-1]).reshape([2, 2, 1, 1]) 17 | filter2 = torch.tensor([-1.0,0,0,-1,-1.0,0,0,-1]).reshape([2,1,2, 2]) 18 | filter3 = torch.tensor([-1.0,0,0,-1,-1.0,0,0,-1,-1.0,0,0,-1]).reshape([3,1,2, 2]) 19 | filter4 = 
torch.tensor([-1.0,0,0,-1,-1.0,0,0,-1, 20 | -1.0,0,0,-1, 21 | -1.0,0,0,-1]).reshape([2, 2, 2, 2]) 22 | filter5 = torch.tensor([-1.0,0,0,-1,-1.0,0,0,-1]).reshape([1,2, 2, 2]) 23 | 24 | #class torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True) 25 | #condv = torch.nn.Conv2d(1,1,kernel_size=1,padding=1, bias=False) 26 | #condv.weight = torch.nn.Parameter(torch.ones([1,1,1,1])) 27 | #padding1 = condv(input1) 28 | #print(padding1) 29 | 30 | #验证padding补0的规则 ——上下左右都补0 31 | padding1 = torch.nn.functional.conv2d(input1, torch.ones([1,1,1,1]), stride=1, padding=1) 32 | print(padding1) 33 | 34 | 35 | padding2 = torch.nn.functional.conv2d(input1, torch.ones([1,1,1,1]), stride=1, padding=(1,2)) 36 | print(padding2) 37 | 38 | ##1个通道输入,生成1个feature map 39 | #filter1 = torch.tensor([-1.0,0,0,-1]).reshape([1, 1, 2, 2]) 40 | #op1 = torch.nn.functional.conv2d(input1, filter1, stride=2, padding=1) 41 | #print('\n') 42 | #print(padding1) 43 | #print(filter1) 44 | #print(op1) 45 | 46 | #torch.nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5) 47 | #torch.nn.functional.conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) 48 | #torch.nn.functional.conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) 49 | 50 | 51 | 52 | 53 | 54 | op1 = torch.nn.functional.conv2d(input1, filter1, stride=2, padding=1) #1个通道输入,生成1个feature map 55 | op2 = torch.nn.functional.conv2d(input1, filter2, stride=2, padding=1) #1个通道输入,生成2个feature map 56 | op3 = torch.nn.functional.conv2d(input1, filter3, stride=2, padding=1) #1个通道输入,生成3个feature map 57 | 58 | op4 = torch.nn.functional.conv2d(input2, filter4, stride=2, padding=1) # 2个通道输入,生成2个feature 59 | op5 = torch.nn.functional.conv2d(input2, filter5, stride=2, padding=1) # 2个通道输入,生成一个feature map 60 | 61 | op6 = torch.nn.functional.conv2d(input1, filter1, stride=2, padding=0) # 5*5 对于pading不同而不同 62 | 63 | 64 | print("op1:\n",op1,filter1)#1-1 后面补0 65 | print("------------------") 66 | 67 | print("op2:\n",op2,filter2) #1-2多卷积核 按列取 68 | print("op3:\n",op3,filter3) #1-3 69 | print("------------------") 70 | 71 | print("op4:\n",op4,filter4)#2-2 通道叠加 72 | print("op5:\n",op5,filter5)#2-1 73 | print("------------------") 74 | 75 | print("op1:\n",op1,filter1)#1-1 76 | print("op6:\n",op6,filter1) 77 | 78 | -------------------------------------------------------------------------------- /code_24_BERT_PROPN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 10 07:10:37 2020 4 | 5 | @author: ljh 6 | """ 7 | 8 | #提取代词特征 9 | 10 | import pandas as pd 11 | import pickle 12 | import torch 13 | from tqdm import tqdm 14 | from transformers import BertTokenizer,BertModel,BertConfig 15 | 16 | #指定设备 17 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 18 | print(device) 19 | 20 | #读取数据 21 | df_test = pd.read_csv("gap-development.tsv", delimiter="\t") 22 | df_train_val = pd.concat([ 23 | pd.read_csv("gap-test.tsv", delimiter="\t"), 24 | pd.read_csv("gap-validation.tsv", delimiter="\t") 25 | ], axis=0) 26 | 27 | 28 | def getmodel(): 29 | #加载词表文件tokenizer 30 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 31 | 32 | #添加特殊词 33 | special_tokens_dict = {'additional_special_tokens': ["[THISISA]","[THISISB]","[THISISP]"]} 34 | tokenizer.add_special_tokens(special_tokens_dict) #添加特殊词 35 | print(tokenizer.additional_special_tokens,tokenizer.additional_special_tokens_ids) 36 | 37 | 38 | 
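    #说明:上面add_special_tokens会扩充tokenizer的词表,但下面的tokenize()
    #在送入BERT之前会把这些特殊词的id从序列中剥离(只记录其偏移位置),
    #因此这里不必调用model.resize_token_embeddings()扩充词嵌入矩阵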
model = BertModel.from_pretrained('bert-base-uncased')#加载模型 39 | return tokenizer,model 40 | 41 | 42 | 43 | 44 | ############################ 45 | 46 | 47 | def insert_tag(row,hasbrack=True):#按照插入的位置,从大到小排序[(383, ' THISISP '), (366, ' THISISB '), (352, ' THISISA ')] 48 | orgtag=[" [THISISA] "," [THISISB] "," [THISISP] "] 49 | if hasbrack==False: 50 | orgtag=[" THISISA "," THISISB "," THISISP "] 51 | 52 | to_be_inserted = sorted([ 53 | (row["A-offset"], orgtag[0]), 54 | (row["B-offset"], orgtag[1]), 55 | (row["Pronoun-offset"], orgtag[2])], key=lambda x: x[0], reverse=True) 56 | 57 | text = row["Text"]#len 443 58 | for offset, tag in to_be_inserted:#先插最后的,不会影响前面 59 | text = text[:offset] + tag + text[offset:]#(插到每个代词的前面) 60 | return text#len 470 (443+3*9) 61 | 62 | 63 | 64 | def tokenize(sequence_ind, tokenizer,sequence_mask= None):#将标签分离,并返回标签偏移位置 65 | entries = {} 66 | final_tokens=[] 67 | final_mask=[] 68 | 69 | for i,one in enumerate(sequence_ind): 70 | if one in tokenizer.additional_special_tokens_ids: 71 | tokenstr = tokenizer.convert_ids_to_tokens(one) 72 | entries[tokenstr] = len(final_tokens) 73 | continue 74 | final_tokens.append(one) 75 | if sequence_mask is not None: 76 | final_mask.append(sequence_mask[i]) 77 | return final_tokens, (entries["[THISISA]"], entries["[THISISB]"], entries["[THISISP]"]) ,final_mask 78 | 79 | 80 | 81 | def savepkl(df,name): 82 | bert_prediction = [] 83 | for _, row in tqdm(df.iterrows(),total=len(df)): 84 | #循环内部 85 | text = insert_tag(row)#插入标签 86 | sequence_ind = tokenizer.encode(text)#向量化 87 | tokens, offsets,_ = tokenize(sequence_ind, tokenizer)#获取标签偏移 88 | token_tensor = torch.LongTensor([tokens]).to(device) 89 | bert_outputs,bert_last_outputs= model(token_tensor) #[1, 107, 768] , [1, 768] 90 | extracted_outputs = bert_outputs[:,offsets,:]#根据偏移位置抽取特征向量 91 | bert_prediction.append(extracted_outputs.cpu().numpy()) 92 | pickle.dump(bert_prediction, open(name, "wb")) 93 | 94 | 95 | if __name__ == '__main__': 96 | 97 | tokenizer,model = getmodel() 98 | model.to(device) 99 | torch.set_grad_enabled(False) 100 | 101 | savepkl(df_test, 'test_bert_outputs_forPROPN.pkl') 102 | savepkl(df_train_val, 'bert_outputs_forPROPN.pkl') 103 | 104 | -------------------------------------------------------------------------------- /code_07_Multi-sampleDropout.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 5 12:11:47 2020 4 | 5 | @author: ljh 6 | """ 7 | 8 | 9 | import sklearn.datasets #引入数据集 10 | import torch 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from code_03_moons_fun import LogicNet,moving_average,predict,plot_decision_boundary 14 | import torch.nn as nn 15 | 16 | 17 | 18 | 19 | np.random.seed(0) #设置随机数种子 20 | X, Y = sklearn.datasets.make_moons(40,noise=0.2) #生成2组半圆形数据 21 | 22 | arg = np.squeeze(np.argwhere(Y==0),axis = 1) #获取第1组数据索引 23 | arg2 = np.squeeze(np.argwhere(Y==1),axis = 1)#获取第2组数据索引 24 | 25 | plt.title("train moons data") 26 | plt.scatter(X[arg,0], X[arg,1], s=100,c='b',marker='+',label='data1') 27 | plt.scatter(X[arg2,0], X[arg2,1],s=40, c='r',marker='o',label='data2') 28 | plt.legend() 29 | plt.show() 30 | 31 | #继承LogicNet类,构建网络模型 32 | class Logic_Dropout_Net(LogicNet): 33 | def __init__(self,inputdim,hiddendim,outputdim):#初始化网络结构 34 | super(Logic_Dropout_Net,self).__init__(inputdim,hiddendim,outputdim) 35 | 36 | self.drop = nn.Dropout(0.07, inplace=False) 37 | 38 | def forward(self,x): #搭建用两层全连接组成的网络模型 39 | x = 
self.Linear1(x)#将输入数据传入第1层 40 | x = torch.tanh(x)#对第一层的结果进行非线性变换 41 | # x = nn.functional.dropout(x, p=0.07, training=self.training) 42 | x = self.drop(x) 43 | x = self.Linear2(x)#再将数据传入第2层 44 | return x 45 | 46 | class Logic_TDropout_Net(LogicNet): 47 | def __init__(self,inputdim,hiddendim,outputdim, dropout_num=8,dropout_p=0.5):#初始化网络结构 48 | super(Logic_TDropout_Net,self).__init__(inputdim,hiddendim,outputdim) 49 | 50 | self.dropouts = nn.ModuleList([nn.Dropout(dropout_p, inplace=False) for _ in range(dropout_num)]) 51 | 52 | 53 | def forward(self,x): #搭建用两层全连接组成的网络模型 54 | x = self.Linear1(x)#将输入数据传入第1层 55 | x = torch.tanh(x)#对第一层的结果进行非线性变换 56 | 57 | if len(self.dropouts) == 0: 58 | return self.Linear2(x)#再将数据传入第2层 59 | else: 60 | for i,dropout in enumerate(self.dropouts): 61 | if i== 0: 62 | out = dropout(x) 63 | out = self.Linear2(out) 64 | else: 65 | temp_out = dropout(x) 66 | out =out+ self.Linear2(temp_out)#再将数据传入第2层 67 | return out 68 | 69 | #model = Logic_Dropout_Net(inputdim=2,hiddendim=500,outputdim=2)#初始化模型 70 | model = Logic_TDropout_Net(inputdim=2,hiddendim=500,outputdim=2,dropout_num=8,dropout_p=0.1)#初始化模型 71 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01)#定义优化器 72 | 73 | 74 | xt = torch.from_numpy(X).type(torch.FloatTensor)#将Numpy数据转化为张量 75 | yt = torch.from_numpy(Y).type(torch.LongTensor) 76 | epochs = 300#定义迭代次数 77 | losses = []#定义列表,用于接收每一步的损失值 78 | for i in range(epochs): 79 | loss = model.getloss(xt,yt) 80 | losses.append(loss.item()) 81 | optimizer.zero_grad()#清空之前的梯度 82 | loss.backward()#反向传播损失值 83 | optimizer.step()#更新参数 84 | 85 | 86 | avgloss= moving_average(losses) #获得损失值的移动平均值 87 | plt.figure(1) 88 | plt.subplot(211) 89 | plt.plot(range(len(avgloss)), avgloss, 'b--') 90 | plt.xlabel('step number') 91 | plt.ylabel('Training loss') 92 | plt.title('step number vs. 
Training loss') 93 | plt.show() 94 | 95 | 96 | plot_decision_boundary(lambda x : predict(model,x) ,X, Y) 97 | from sklearn.metrics import accuracy_score 98 | print("训练时的准确率:",accuracy_score(model.predict(xt),yt)) 99 | 100 | Xtest, Ytest = sklearn.datasets.make_moons(80,noise=0.2) #生成2组半圆形数据 101 | plot_decision_boundary(lambda x : predict(model,x) ,Xtest, Ytest) 102 | Xtest_t = torch.from_numpy(Xtest).type(torch.FloatTensor)#将Numpy数据转化为张量 103 | Ytest_t = torch.from_numpy(Ytest).type(torch.LongTensor) 104 | print("测试时的准确率:",accuracy_score(model.predict(Xtest_t),Ytest_t)) 105 | 106 | 107 | 108 | model.eval() -------------------------------------------------------------------------------- /code_27_spellgcn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Dec 1 20:29:59 2020 4 | 5 | @author: ljh 6 | """ 7 | from dgl.nn import GraphConv 8 | from transformers import BertTokenizer, BertModel, BertConfig,BertLMHeadModel 9 | import dgl 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | def load_graph(graph_dir): 15 | nodes_vocab = {} 16 | with open("%s/nodes_vocab.txt"%(graph_dir),encoding="UTF-8") as f: 17 | for i, line in enumerate(f): 18 | nodes_vocab.setdefault(line.strip(), i) 19 | 20 | node1s,node2s = [],[] 21 | with open("%s/spellGraphs.txt"%(graph_dir),encoding="UTF-8") as f: 22 | for i, line in enumerate(f): 23 | e1,e2, rel = line.strip().split("|") 24 | node1s.append(nodes_vocab[e1]) 25 | node2s.append(nodes_vocab[e2]) 26 | 27 | g1 = dgl.graph((node1s, node2s),num_nodes=len(nodes_vocab)) 28 | 29 | w2n = [] 30 | vocab = {} 31 | with open("%s/vocab.txt"%(graph_dir),encoding="UTF-8") as f: 32 | for i, line in enumerate(f): 33 | word = line.strip() 34 | vocab.setdefault(word, i) 35 | if word in nodes_vocab: 36 | w2n.append(nodes_vocab[word]) 37 | else: 38 | w2n.append(0) 39 | n2w = [] 40 | with open("%s/nodes_vocab.txt"%(graph_dir),encoding="UTF-8") as f: 41 | for i, line in enumerate(f): 42 | word = line.strip() 43 | if word in vocab: 44 | n2w.append(vocab[word]) 45 | else: 46 | n2w.append(0) 47 | return g1,w2n,n2w 48 | 49 | graph_dir = r'./gcn_graph' 50 | 51 | config = BertConfig.from_pretrained(r'./bert-base-chinese') 52 | config.is_decoder = True 53 | 54 | g1,w2n,n2w = load_graph(graph_dir) 55 | w2n=torch.tensor(w2n) 56 | n2w =torch.tensor(n2w) 57 | g = dgl.add_self_loop(g1) 58 | 59 | mask_nodes_ids = torch.where( w2n !=0)[0] #找到不为0的id 60 | maskbase = torch.zeros( (config.vocab_size,config.hidden_size) ) 61 | maskbase[mask_nodes_ids] =1. 
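#maskbase是与BERT词表对齐的0/1矩阵:出现在拼写图中的字符所在行为1,其余行为0。
#下面spellgcnBert.getgnnemb()用它把GCN精调后的字向量拼回原词嵌入表:
#  gcn_embedding = maskbase*expanded_node_embedding + (1-maskbase)*rest_embedding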
62 | 63 | class MGCNNet(nn.Module): 64 | def __init__(self): 65 | super(MGCNNet, self).__init__() 66 | self.gcn1 = GraphConv(config.hidden_size, config.hidden_size) 67 | self.dropout = nn.Dropout(0.1) 68 | self.gcn2 = GraphConv(config.hidden_size, config.hidden_size) 69 | 70 | def forward(self, g, features): 71 | gcn1out = self.gcn1(g, features) 72 | x = self.dropout(gcn1out) 73 | gcn2out = self.gcn2(g, x) 74 | return features+gcn1out+gcn2out 75 | 76 | class spellgcnBert(nn.Module): 77 | def __init__(self, MLbert): 78 | super(spellgcnBert, self).__init__() 79 | self.MLbert = MLbert 80 | self.gnnmodel = MGCNNet() 81 | 82 | 83 | def getgnnemb(self): 84 | feat = self.MLbert.bert.embeddings.word_embeddings( n2w )#( input_ids=torch.tensor([n2w]).to(device) ) 85 | node_embedding = self.gnnmodel(g, feat) #[4755, 768] 86 | expanded_node_embedding = node_embedding[w2n]#21128, 768 87 | rest_embedding = self.MLbert.bert.get_input_embeddings().weight#21128, 768] 88 | gcn_embedding = maskbase * expanded_node_embedding + (1 - maskbase) * rest_embedding 89 | return gcn_embedding 90 | 91 | 92 | def forward(self, input_ids, input_mask, segment_ids): 93 | gcn_embedding = self.getgnnemb() 94 | outputs = self.MLbert.bert(input_ids, input_mask, segment_ids) #outputs[0]为序列特征[batch_size, seq_len, hidden_size] 95 | sequence_output = outputs[0] 96 | hidden_states = self.MLbert.cls.predictions.transform(sequence_output) 97 | prediction_scores =F.linear(hidden_states, gcn_embedding, self.MLbert.cls.predictions.bias) 98 | return prediction_scores 99 | 100 | tokenizer = BertTokenizer.from_pretrained(r'./bert-base-chinese') 101 | MLbert = BertLMHeadModel.from_pretrained(r'./bert-base-chinese', config=config) 102 | spellgcnBertmodel = spellgcnBert(MLbert) 103 | -------------------------------------------------------------------------------- /code_04_use_module.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Wed Apr 3 06:12:15 2019 8 | """ 9 | 10 | 11 | import sklearn.datasets #引入数据集 12 | import torch 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | 16 | np.random.seed(0) #设置随机数种子 17 | X, Y = sklearn.datasets.make_moons(200,noise=0.2) #生成2组半圆形数据 18 | 19 | arg = np.squeeze(np.argwhere(Y==0),axis = 1) #获取第1组数据索引 20 | arg2 = np.squeeze(np.argwhere(Y==1),axis = 1)#获取第2组数据索引 21 | 22 | plt.title("moons data") 23 | plt.scatter(X[arg,0], X[arg,1], s=100,c='b',marker='+',label='data1') 24 | plt.scatter(X[arg2,0], X[arg2,1],s=40, c='r',marker='o',label='data2') 25 | plt.legend() 26 | plt.show() 27 | 28 | 29 | 30 | import torch.nn as nn 31 | 32 | #继承nn.Module类,构建网络模型 33 | class LogicNet(nn.Module): 34 | def __init__(self,inputdim,hiddendim,outputdim):#初始化网络结构 35 | super(LogicNet,self).__init__() 36 | # self.Linear1 = nn.Linear(inputdim,hiddendim) #定义全连接层 37 | # self.Linear2 = nn.Linear(hiddendim,outputdim)#定义全连接层 38 | self.add_module("Linear1", nn.Linear(inputdim,hiddendim))#定义全连接层 39 | self.add_module("Linear2", nn.Linear(hiddendim,outputdim))#定义全连接层 40 | self.criterion = nn.CrossEntropyLoss() #定义交叉熵函数 41 | 42 | def forward(self,x): #搭建用两层全连接组成的网络模型 43 | x = self.Linear1(x)#将输入数据传入第1层 44 | x = torch.tanh(x)#对第一层的结果进行非线性变换 45 | x = self.Linear2(x)#再将数据传入第2层 46 | return x 47 | 48 | def predict(self,x):#实现LogicNet类的预测接口 49 | #调用自身网络模型,并对结果进行softmax处理,分别得出预测数据属于每一类的概率 50 | pred = torch.softmax(self.forward(x),dim=1) 51 | return 
torch.argmax(pred,dim=1) #返回每组预测概率中最大的索引 52 | 53 | def getloss(self,x,y): #实现LogicNet类的损失值计算接口 54 | y_pred = self.forward(x) 55 | loss = self.criterion(y_pred,y)#计算损失值得交叉熵 56 | return loss 57 | 58 | 59 | 60 | 61 | model = LogicNet(inputdim=2,hiddendim=3,outputdim=2)#初始化模型 62 | optimizer = torch.optim.Adam(model.parameters(), lr=0.01)#定义优化器 63 | #model = model.cuda() 64 | for sub_module in model.children(): 65 | print(sub_module) 66 | 67 | 68 | for name, module in model.named_children(): 69 | print(name,"is:",module) 70 | 71 | 72 | for module in model.modules(): 73 | print(module) 74 | 75 | for param in model.parameters(): 76 | print(type(param.data), param.size()) 77 | 78 | for name,param in model.named_parameters(): 79 | print(type(param.data), param.size(),name) 80 | 81 | 82 | 83 | xt = torch.from_numpy(X).type(torch.FloatTensor)#将Numpy数据转化为张量 84 | yt = torch.from_numpy(Y).type(torch.LongTensor) 85 | epochs = 1000#定义迭代次数 86 | losses = []#定义列表,用于接收每一步的损失值 87 | for i in range(epochs): 88 | loss = model.getloss(xt,yt) 89 | losses.append(loss.item()) 90 | optimizer.zero_grad()#清空之前的梯度 91 | loss.backward()#反向传播损失值 92 | optimizer.step()#更新参数 93 | 94 | def moving_average(a, w=10):#定义函数计算移动平均损失值 95 | if len(a) < w: 96 | return a[:] 97 | return [val if idx < w else sum(a[(idx-w):idx])/w for idx, val in enumerate(a)] 98 | 99 | avgloss= moving_average(losses) #获得损失值的移动平均值 100 | plt.figure(1) 101 | plt.subplot(211) 102 | plt.plot(range(len(avgloss)), avgloss, 'b--') 103 | plt.xlabel('step number') 104 | plt.ylabel('Training loss') 105 | plt.title('step number vs. Training loss') 106 | plt.show() 107 | 108 | 109 | from sklearn.metrics import accuracy_score 110 | print(accuracy_score(model.predict(xt),yt)) 111 | 112 | 113 | def predict(x): #封装支持Numpy的预测接口 114 | x = torch.from_numpy(x).type(torch.FloatTensor) 115 | ans = model.predict(x) 116 | return ans.numpy() 117 | 118 | def plot_decision_boundary(pred_func,X,Y):#在直角坐标系中可视化模型能力 119 | #计算取值范围 120 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 121 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 122 | h = 0.01 123 | #在坐标系中采用数据,生成网格矩阵,用于输入模型 124 | xx,yy=np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 125 | #将数据输入并进行预测 126 | Z = pred_func(np.c_[xx.ravel(), yy.ravel()]) 127 | Z = Z.reshape(xx.shape) 128 | #将预测的结果可视化 129 | plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral) 130 | plt.title("Linear predict") 131 | arg = np.squeeze(np.argwhere(Y==0),axis = 1) 132 | arg2 = np.squeeze(np.argwhere(Y==1),axis = 1) 133 | plt.scatter(X[arg,0], X[arg,1], s=100,c='b',marker='+') 134 | plt.scatter(X[arg2,0], X[arg2,1],s=40, c='r',marker='o') 135 | 136 | 137 | plot_decision_boundary(lambda x : predict(x) ,xt.numpy(), yt.numpy()) -------------------------------------------------------------------------------- /code_22_TextCNNInterpret.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Nov 9 10:13:29 2019 4 | 5 | @author: ljh 6 | """ 7 | 8 | 9 | 10 | import spacy #引入分词库 11 | import torch#引入PyTorch库 12 | import torch.nn.functional as F 13 | #引入解释库 14 | from captum.attr import (IntegratedGradients,TokenReferenceBase,visualization, 15 | configure_interpretable_embedding_layer, remove_interpretable_embedding_layer) 16 | 17 | #引入本地代码库 18 | from code_14_TextCNN import TextCNN, TEXT,LABEL 19 | 20 | class TextCNNInterpret(TextCNN):#定义TextCNN的子类 21 | def __init__(self, *args,**kwargs):#透传参数 22 | super().__init__(*args,**kwargs) 23 | def forward(self, text): 
#重载模型处理方法 24 | embedded = self.embedding(text)#从词嵌入开始处理 25 | #后面的代码与TextCNN一样 26 | embedded = embedded.unsqueeze(1) 27 | conved = [self.mish(conv(embedded)).squeeze(3) for conv in self.convs] 28 | pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] 29 | cat = self.dropout(torch.cat(pooled, dim = 1)) 30 | return self.fc(cat) 31 | 32 | ########################## 33 | #定义模型参数 34 | INPUT_DIM = len(TEXT.vocab)#25002 35 | EMBEDDING_DIM = TEXT.vocab.vectors.size()[1] #100 36 | N_FILTERS = 100 37 | FILTER_SIZES = [3,4,5] 38 | OUTPUT_DIM = 1 39 | DROPOUT = 0.5 40 | PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] 41 | #实例化模型 42 | model = TextCNNInterpret(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX) 43 | 44 | #加载模型权重 45 | model.load_state_dict(torch.load('textcnn-model.pt') ) 46 | print('Vocabulary Size: ', len(TEXT.vocab)) 47 | #对嵌入层进行封装并提取 48 | interpretable_embedding = configure_interpretable_embedding_layer (model, 'embedding') 49 | 50 | ########################## 51 | 52 | 53 | 54 | ig = IntegratedGradients(model)#创建梯度积分算法对象 55 | 56 | #定义列表,存放可视化记录 57 | vis_data_records_ig = [] 58 | 59 | nlp = spacy.load('en') #为分词库加载英文语言包 60 | 61 | 62 | #定义函数对句子进行可解释性分析 63 | def interpret_sentence(model, sentence, min_len = 7, label = 0): 64 | 65 | sentence=sentence.lower() #将句子转为小写 66 | 67 | model.eval() 68 | #分词处理 69 | text = [tok.text for tok in nlp.tokenizer(sentence)] 70 | if len(text) < min_len: #对小于指定长度的句子进行 填充 71 | text += [TEXT.pad_token] * (min_len - len(text)) 72 | #将句子中的单词转为索引 73 | indexed = [TEXT.vocab.stoi[t] for t in text] 74 | 75 | model.zero_grad() #将模型中的梯度清0 76 | 77 | input_indices = torch.LongTensor(indexed) #转为张量 78 | input_indices = input_indices.unsqueeze(0) #增加维度 79 | 80 | #转为词嵌入 81 | input_embedding = interpretable_embedding.indices_to_embeddings(input_indices) 82 | 83 | #将词嵌入输入模型,进行预测 84 | pred = torch.sigmoid(model(input_embedding)).item() 85 | pred_ind = round(pred) #计算输出结果 86 | 87 | #创建梯度积分的初始输入值 88 | PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] #获得填充字符的索引 89 | token_reference = TokenReferenceBase(reference_token_idx=PAD_IDX) 90 | #制作初始输入索引:复制指定长度个token_reference,并扩展维度 91 | reference_indices = token_reference.generate_reference(len(indexed), device='cpu').unsqueeze(0) 92 | print("reference_indices",reference_indices) 93 | #将制作好的输入索引转成词嵌入 94 | reference_embedding = interpretable_embedding.indices_to_embeddings(reference_indices) 95 | 96 | 97 | #用梯度积分的方法计算可解释性 98 | attributions_ig, delta = ig.attribute(input_embedding, reference_embedding, n_steps=500, return_convergence_delta=True) 99 | #输出可解释性结果 100 | print('attributions_ig, delta',attributions_ig.size(), delta.size()) 101 | print('pred: ', LABEL.vocab.itos[pred_ind], '(', '%.2f'%pred, ')', ', delta: ', abs(delta)) 102 | #加入可视化记录中 103 | add_attributions_to_visualizer(attributions_ig, text, pred, pred_ind, label, delta, vis_data_records_ig) 104 | 105 | #定义函数,将解释性结果放入可视化记录中 106 | def add_attributions_to_visualizer(attributions, text, pred, pred_ind, label, delta, vis_data_records): 107 | attributions = attributions.sum(dim=2).squeeze(0) 108 | attributions = attributions / torch.norm(attributions) 109 | attributions = attributions.detach().numpy() 110 | 111 | # storing couple samples in an array for visualization purposes 112 | vis_data_records.append(visualization.VisualizationDataRecord( 113 | attributions, 114 | pred, 115 | LABEL.vocab.itos[pred_ind], 116 | LABEL.vocab.itos[label], 117 | LABEL.vocab.itos[1], 118 | attributions.sum(), 119 | text[:len(attributions)], 120 | delta)) 121 | 
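#对几条示例影评运行积分梯度归因:每次调用都会向vis_data_records_ig追加一条记录,
#最后由visualize_text()渲染成按词着色的HTML热力图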
122 | interpret_sentence(model, 'It was a fantastic performance !', label=1) 123 | 124 | interpret_sentence(model, 'The film is very good!', label=1) 125 | 126 | interpret_sentence(model, 'I think this film is not very bad!', label=1) 127 | 128 | 129 | #根据可视化记录生成网页 130 | visualization.visualize_text(vis_data_records_ig) 131 | 132 | #还原模型的词嵌入层 133 | remove_interpretable_embedding_layer(model, interpretable_embedding) 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | -------------------------------------------------------------------------------- /code_01_subtraction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: 代码医生工作室 4 | @公众号:xiangyuejiqiren (内有更多优秀文章及学习资料) 5 | @来源: 配套代码 6 | @配套代码技术支持:bbs.aianaconda.com 7 | Created on Thu Mar 30 09:43:58 2017 8 | """ 9 | 10 | import copy, numpy as np 11 | np.random.seed(0) #随机数生成器的种子,可以每次得到一样的值 12 | # compute sigmoid nonlinearity 13 | def sigmoid(x): #激活函数 14 | output = 1/(1+np.exp(-x)) 15 | return output 16 | # convert output of sigmoid function to its derivative 17 | def sigmoid_output_to_derivative(output):#激活函数的导数 18 | return output*(1-output) 19 | 20 | 21 | int2binary = {} #整数到其二进制表示的映射 22 | binary_dim = 8 #暂时制作256以内的减法 23 | ## 计算0-256的二进制表示 24 | largest_number = pow(2,binary_dim) 25 | binary = np.unpackbits( 26 | np.array([range(largest_number)],dtype=np.uint8).T,axis=1) 27 | for i in range(largest_number): 28 | int2binary[i] = binary[i] 29 | 30 | # input variables 31 | alpha = 0.9 #学习速率 32 | input_dim = 2 #输入的维度是2 33 | hidden_dim = 16 34 | output_dim = 1 #输出维度为1 35 | 36 | # initialize neural network weights 37 | synapse_0 = (2*np.random.random((input_dim,hidden_dim)) - 1)*0.05 #维度为2*16, 2是输入维度,16是隐藏层维度 38 | synapse_1 = (2*np.random.random((hidden_dim,output_dim)) - 1)*0.05 39 | synapse_h = (2*np.random.random((hidden_dim,hidden_dim)) - 1)*0.05 40 | # => [-0.05, 0.05), 41 | 42 | # 用于存放反向传播的权重更新值 43 | synapse_0_update = np.zeros_like(synapse_0) 44 | synapse_1_update = np.zeros_like(synapse_1) 45 | synapse_h_update = np.zeros_like(synapse_h) 46 | 47 | # training 48 | for j in range(10000): 49 | 50 | #生成一个数字a 51 | a_int = np.random.randint(largest_number) 52 | #生成一个数字b,b的最大值取的是largest_number/2,作为被减数,让它小一点。 53 | b_int = np.random.randint(largest_number/2) 54 | #如果生成的b大了,那么交换一下 55 | if a_int