├── data
│   └── Twitter2013
│       ├── twitter-2013dev-A.tsv
│       ├── twitter-2013test-A.tsv
│       └── twitter-2013train-A.tsv
├── dataset
│   └── dataloader.py
├── models
│   └── bertmask.py
├── prompt_class.py
├── readme.md
└── tb_log
    └── readme.md

--------------------------------------------------------------------------------
/dataset/dataloader.py:
--------------------------------------------------------------------------------
# Build the prompt-based dataset.
import torch
import torch.utils.data as Data
import pandas as pd

PREFIX = 'It was [MASK]. '  # use the tokenizer's literal mask token so it is not split into word pieces
MASK_POS = 3  # index of [MASK] in "[CLS] it was [MASK] ."


class MyDataSet(Data.Dataset):
    def __init__(self, sen, mask, typ, label):
        super(MyDataSet, self).__init__()
        self.sen = torch.tensor(sen, dtype=torch.long)
        self.mask = torch.tensor(mask, dtype=torch.long)
        self.typ = torch.tensor(typ, dtype=torch.long)
        self.label = torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return self.sen.shape[0]

    def __getitem__(self, idx):
        return self.sen[idx], self.mask[idx], self.typ[idx], self.label[idx]


# Load a SemEval Twitter TSV file, keeping only positive/negative samples.
def load_data(tsvpath):
    data = pd.read_csv(tsvpath, sep="\t", header=None, names=["sn", "polarity", "text"])
    data = data[data["polarity"] != "neutral"]
    yy = data["polarity"].replace({"negative": 0, "positive": 1})
    return data.values[:, 2:3].tolist(), yy.tolist()


def ProcessData(filepath, tokenizer):
    # Verbalizer: the label at [MASK] is the id of "good" for positive and "bad" for negative.
    pos_id = tokenizer.convert_tokens_to_ids("good")
    neg_id = tokenizer.convert_tokens_to_ids("bad")
    x_train, y_train = load_data(filepath)

    Inputid = []
    Labelid = []
    typeid = []
    attenmask = []

    for i in range(len(x_train)):
        text_ = PREFIX + x_train[i][0]

        encode_dict = tokenizer.encode_plus(text_, max_length=60, padding="max_length", truncation=True)
        input_ids = encode_dict["input_ids"]
        type_ids = encode_dict["token_type_ids"]
        atten_mask = encode_dict["attention_mask"]
        labelid, inputid = input_ids[:], input_ids[:]

        # Only the [MASK] position carries a label (the verbalizer id);
        # every other position is set to -1 and ignored by the loss.
        labelid[MASK_POS] = neg_id if y_train[i] == 0 else pos_id
        labelid[:MASK_POS] = [-1] * len(labelid[:MASK_POS])
        labelid[MASK_POS + 1:] = [-1] * len(labelid[MASK_POS + 1:])
        inputid[MASK_POS] = tokenizer.mask_token_id

        Labelid.append(labelid)
        Inputid.append(inputid)
        typeid.append(type_ids)
        attenmask.append(atten_mask)

    return Inputid, Labelid, typeid, attenmask
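

# ---------------------------------------------------------------------------
# Added example (not part of the original file): a quick sanity check of the
# prompt encoding. The checkpoint name and the dev-file path are assumptions
# for illustration; any BERT checkpoint with a [MASK] token behaves the same.
if __name__ == "__main__":
    from transformers import BertTokenizerFast

    tok = BertTokenizerFast.from_pretrained("bert-base-uncased")  # assumed checkpoint
    ids, labels, _, _ = ProcessData("data/Twitter2013/twitter-2013dev-A.tsv", tok)
    # Expect [MASK] at MASK_POS in the input, and a label row that is -1
    # everywhere except MASK_POS, which holds the verbalizer id (good/bad).
    print(tok.convert_ids_to_tokens(ids[0])[:8])
    print(labels[0][:8])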
--------------------------------------------------------------------------------
/models/bertmask.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from transformers import BertForMaskedLM


class Bert_Model(nn.Module):
    def __init__(self, bert_path, config_file):
        super(Bert_Model, self).__init__()
        # Load the pretrained masked-LM weights.
        self.bert = BertForMaskedLM.from_pretrained(bert_path, config=config_file)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask, token_type_ids)
        # The masked-LM head returns prediction scores over the whole vocabulary
        # for every position: [batch_size, seq_len, vocab_size].
        logit = outputs[0]
        return logit
--------------------------------------------------------------------------------
/prompt_class.py:
--------------------------------------------------------------------------------
'''
Prompt-based sentiment classification.

Reference blog: https://blog.csdn.net/wf19971210/article/details/120543015

Data download: https://pan.baidu.com/s/1Nx7htUBWKBZfo3QPPty3mw
Extraction code: 1234
'''
from datetime import datetime
import os
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
from torch.utils.tensorboard import SummaryWriter
from transformers import BertConfig, BertTokenizerFast, get_cosine_schedule_with_warmup

from models.bertmask import Bert_Model
from dataset.dataloader import MyDataSet, ProcessData, MASK_POS

# hyperparameters
EPOCH = 200
RANDOM_SEED = 2022
TRAIN_BATCH_SIZE = 32  # small training batches; larger batches call for a higher learning rate, see https://zhuanlan.zhihu.com/p/413656738
TEST_BATCH_SIZE = 96   # larger batches for evaluation
EVAL_PERIOD = 20
MODEL_NAME = "bert-large-uncased"  # e.g. bert-base-chinese for Chinese data
DATA_PATH = "data/Twitter2013"
NUM_WORKERS = 10

train_file = "twitter-2013train-A.tsv"
dev_file = "twitter-2013dev-A.tsv"
test_file = "twitter-2013test-A.tsv"

# environment variables
os.environ['TOKENIZERS_PARALLELISM'] = "false"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

writer = SummaryWriter('./tb_log')

pd.options.display.max_columns = None
pd.options.display.max_rows = None

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
config = BertConfig.from_pretrained(MODEL_NAME)
model = Bert_Model(bert_path=MODEL_NAME, config_file=config).to(device)

# Encode the prompt-wrapped tweets and their [MASK] labels.
Inputid_train, Labelid_train, typeids_train, inputnmask_train = ProcessData(DATA_PATH + os.sep + train_file, tokenizer)
Inputid_dev, Labelid_dev, typeids_dev, inputnmask_dev = ProcessData(DATA_PATH + os.sep + dev_file, tokenizer)
Inputid_test, Labelid_test, typeids_test, inputnmask_test = ProcessData(DATA_PATH + os.sep + test_file, tokenizer)

train_dataset = Data.DataLoader(MyDataSet(Inputid_train, inputnmask_train, typeids_train, Labelid_train), TRAIN_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
valid_dataset = Data.DataLoader(MyDataSet(Inputid_dev, inputnmask_dev, typeids_dev, Labelid_dev), TRAIN_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
test_dataset = Data.DataLoader(MyDataSet(Inputid_test, inputnmask_test, typeids_test, Labelid_test), TEST_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

train_data_num = len(Inputid_train)
test_data_num = len(Inputid_test)

optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)  # AdamW optimizer
# Positions labelled -1 (everything except the [MASK] slot) are ignored by the loss.
loss_func = nn.CrossEntropyLoss(ignore_index=-1)
schedule = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=len(train_dataset), num_training_steps=EPOCH * len(train_dataset))
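
# ---------------------------------------------------------------------------
# Added notes (not in the original script):
# * The masked-LM logits are scored against label rows that are -1 everywhere
#   except MASK_POS, so with ignore_index=-1 the cross-entropy reduces to a
#   binary choice between the "good" and "bad" verbalizer tokens at [MASK].
# * valid_dataset (the dev split) is built above but never used; the loop
#   below evaluates on the test split every epoch.
# * RANDOM_SEED is defined above but never applied; a minimal sketch to make
#   shuffling and dropout reproducible from this point on:
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)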

print("Training...")
totaltime = 0
for epoch in range(EPOCH):

    starttime_train = datetime.now()

    start = time.time()
    correct = 0
    train_loss_sum = 0
    model.train()

    for idx, (ids, att_mask, token_type, y) in enumerate(train_dataset):
        ids, att_mask, token_type, y = ids.to(device), att_mask.to(device), token_type.to(device), y.to(device)
        out_train = model(ids, att_mask, token_type)
        loss = loss_func(out_train.view(-1, tokenizer.vocab_size), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        schedule.step()
        train_loss_sum += loss.item()

        if (idx + 1) % EVAL_PERIOD == 0:
            print("Epoch {:04d} | Step {:06d}/{:06d} | Loss {:.4f} | Time {:.0f}".format(
                epoch + 1, idx + 1, len(train_dataset), train_loss_sum / (idx + 1), time.time() - start))
            writer.add_scalar('loss/train_loss', train_loss_sum / (idx + 1), epoch)

        # Training accuracy is measured at the [MASK] position only.
        truelabel = y[:, MASK_POS]
        out_train_mask = out_train[:, MASK_POS, :]
        predicted = torch.max(out_train_mask, 1)[1]
        correct += (predicted == truelabel).sum()
        correct = float(correct)

    acc = float(correct / train_data_num)

    eval_loss_sum = 0.0
    model.eval()
    correct_test = 0
    with torch.no_grad():
        for ids, att, tpe, y in test_dataset:
            ids, att, tpe, y = ids.to(device), att.to(device), tpe.to(device), y.to(device)
            out_test = model(ids, att, tpe)
            loss_eval = loss_func(out_test.view(-1, tokenizer.vocab_size), y.view(-1))
            eval_loss_sum += loss_eval.item()
            ttruelabel = y[:, MASK_POS]
            tout_train_mask = out_test[:, MASK_POS, :]
            predicted_test = torch.max(tout_train_mask.data, 1)[1]
            correct_test += (predicted_test == ttruelabel).sum()
            correct_test = float(correct_test)
        acc_test = float(correct_test / test_data_num)

    if epoch % 1 == 0:
        out = ("epoch {}, train_loss {}, train_acc {} , eval_loss {} ,acc_test {}"
               .format(epoch + 1, train_loss_sum / (len(train_dataset)), acc, eval_loss_sum / (len(test_dataset)),
                       acc_test))
        writer.add_scalar('loss/test_loss', eval_loss_sum / (len(test_dataset)), epoch)
        print(out)
    end = time.time()

    print("epoch {} duration:".format(epoch + 1), end - start)
    totaltime += end - start

print("total training time: ", totaltime)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
Sentiment classification based on prompt learning.

Blog: https://blog.csdn.net/znsoft/article/details/123960223

Accuracy: above 92% (Twitter2013, positive/negative only).


## Prompt template

The only template is: `It was [MASK].`
--------------------------------------------------------------------------------
/tb_log/readme.md:
--------------------------------------------------------------------------------
Placeholder for the TensorBoard logs written by `SummaryWriter('./tb_log')` in `prompt_class.py`.
--------------------------------------------------------------------------------