├── data
│   └── Twitter2013
│       ├── twitter-2013dev-A.tsv
│       ├── twitter-2013test-A.tsv
│       └── twitter-2013train-A.tsv
├── dataset
│   └── dataloader.py
├── models
│   └── bertmask.py
├── prompt_class.py
├── readme.md
└── tb_log
    └── readme.md

--------------------------------------------------------------------------------
/dataset/dataloader.py:
--------------------------------------------------------------------------------
# Build the prompt-based dataset.
import torch
import torch.utils.data as Data
import pandas as pd

PREFIX = 'It was [MASK]. '  # use the tokenizer's literal mask token so it is not split into word pieces
MASK_POS = 3  # index of [MASK] in "[CLS] it was [MASK] ."


class MyDataSet(Data.Dataset):
    def __init__(self, sen, mask, typ, label):
        super(MyDataSet, self).__init__()
        self.sen = torch.tensor(sen, dtype=torch.long)
        self.mask = torch.tensor(mask, dtype=torch.long)
        self.typ = torch.tensor(typ, dtype=torch.long)
        self.label = torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return self.sen.shape[0]

    def __getitem__(self, idx):
        return self.sen[idx], self.mask[idx], self.typ[idx], self.label[idx]


# Load a SemEval Twitter TSV file, keeping only positive/negative samples.
def load_data(tsvpath):
    data = pd.read_csv(tsvpath, sep="\t", header=None, names=["sn", "polarity", "text"])
    data = data[data["polarity"] != "neutral"]
    yy = data["polarity"].replace({"negative": 0, "positive": 1})
    return data.values[:, 2:3].tolist(), yy.tolist()


def ProcessData(filepath, tokenizer):
    # Verbalizer: the label at [MASK] is the id of "good" for positive and "bad" for negative.
    pos_id = tokenizer.convert_tokens_to_ids("good")
    neg_id = tokenizer.convert_tokens_to_ids("bad")
    x_train, y_train = load_data(filepath)

    Inputid = []
    Labelid = []
    typeid = []
    attenmask = []

    for i in range(len(x_train)):
        text_ = PREFIX + x_train[i][0]

        encode_dict = tokenizer.encode_plus(text_, max_length=60, padding="max_length", truncation=True)
        input_ids = encode_dict["input_ids"]
        type_ids = encode_dict["token_type_ids"]
        atten_mask = encode_dict["attention_mask"]
        labelid, inputid = input_ids[:], input_ids[:]

        # Only the [MASK] position carries a label (the verbalizer id);
        # every other position is set to -1 and ignored by the loss.
        labelid[MASK_POS] = neg_id if y_train[i] == 0 else pos_id
        labelid[:MASK_POS] = [-1] * len(labelid[:MASK_POS])
        labelid[MASK_POS + 1:] = [-1] * len(labelid[MASK_POS + 1:])
        inputid[MASK_POS] = tokenizer.mask_token_id

        Labelid.append(labelid)
        Inputid.append(inputid)
        typeid.append(type_ids)
        attenmask.append(atten_mask)

    return Inputid, Labelid, typeid, attenmask
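

# ---------------------------------------------------------------------------
# Added example (not part of the original file): a quick sanity check of the
# prompt encoding. The checkpoint name and the dev-file path are assumptions
# for illustration; any BERT checkpoint with a [MASK] token behaves the same.
if __name__ == "__main__":
    from transformers import BertTokenizerFast

    tok = BertTokenizerFast.from_pretrained("bert-base-uncased")  # assumed checkpoint
    ids, labels, _, _ = ProcessData("data/Twitter2013/twitter-2013dev-A.tsv", tok)
    # Expect [MASK] at MASK_POS in the input, and a label row that is -1
    # everywhere except MASK_POS, which holds the verbalizer id (good/bad).
    print(tok.convert_ids_to_tokens(ids[0])[:8])
    print(labels[0][:8])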
--------------------------------------------------------------------------------
/models/bertmask.py:
--------------------------------------------------------------------------------
import torch.nn as nn
from transformers import BertForMaskedLM


class Bert_Model(nn.Module):
    def __init__(self, bert_path, config_file):
        super(Bert_Model, self).__init__()
        # Load the pretrained masked-LM weights.
        self.bert = BertForMaskedLM.from_pretrained(bert_path, config=config_file)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask, token_type_ids)
        # The masked-LM head returns prediction scores over the whole vocabulary
        # for every position: [batch_size, seq_len, vocab_size].
        logit = outputs[0]
        return logit
--------------------------------------------------------------------------------
/prompt_class.py:
--------------------------------------------------------------------------------
'''
Prompt-based sentiment classification.

Reference blog: https://blog.csdn.net/wf19971210/article/details/120543015

Data download: https://pan.baidu.com/s/1Nx7htUBWKBZfo3QPPty3mw
Extraction code: 1234
'''
from datetime import datetime
import os
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
from torch.utils.tensorboard import SummaryWriter
from transformers import BertConfig, BertTokenizerFast, get_cosine_schedule_with_warmup

from models.bertmask import Bert_Model
from dataset.dataloader import MyDataSet, ProcessData, MASK_POS

# hyperparameters
EPOCH = 200
RANDOM_SEED = 2022
TRAIN_BATCH_SIZE = 32  # small training batches; larger batches call for a higher learning rate, see https://zhuanlan.zhihu.com/p/413656738
TEST_BATCH_SIZE = 96   # larger batches for evaluation
EVAL_PERIOD = 20
MODEL_NAME = "bert-large-uncased"  # e.g. bert-base-chinese for Chinese data
DATA_PATH = "data/Twitter2013"
NUM_WORKERS = 10

train_file = "twitter-2013train-A.tsv"
dev_file = "twitter-2013dev-A.tsv"
test_file = "twitter-2013test-A.tsv"

# environment variables
os.environ['TOKENIZERS_PARALLELISM'] = "false"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

writer = SummaryWriter('./tb_log')

pd.options.display.max_columns = None
pd.options.display.max_rows = None

tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
config = BertConfig.from_pretrained(MODEL_NAME)
model = Bert_Model(bert_path=MODEL_NAME, config_file=config).to(device)

# Encode the prompt-wrapped tweets and their [MASK] labels.
Inputid_train, Labelid_train, typeids_train, inputnmask_train = ProcessData(DATA_PATH + os.sep + train_file, tokenizer)
Inputid_dev, Labelid_dev, typeids_dev, inputnmask_dev = ProcessData(DATA_PATH + os.sep + dev_file, tokenizer)
Inputid_test, Labelid_test, typeids_test, inputnmask_test = ProcessData(DATA_PATH + os.sep + test_file, tokenizer)

train_dataset = Data.DataLoader(MyDataSet(Inputid_train, inputnmask_train, typeids_train, Labelid_train), TRAIN_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
valid_dataset = Data.DataLoader(MyDataSet(Inputid_dev, inputnmask_dev, typeids_dev, Labelid_dev), TRAIN_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
test_dataset = Data.DataLoader(MyDataSet(Inputid_test, inputnmask_test, typeids_test, Labelid_test), TEST_BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)

train_data_num = len(Inputid_train)
test_data_num = len(Inputid_test)

optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)  # AdamW optimizer
# Positions labelled -1 (everything except the [MASK] slot) are ignored by the loss.
loss_func = nn.CrossEntropyLoss(ignore_index=-1)
schedule = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=len(train_dataset), num_training_steps=EPOCH * len(train_dataset))
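
# ---------------------------------------------------------------------------
# Added notes (not in the original script):
# * The masked-LM logits are scored against label rows that are -1 everywhere
#   except MASK_POS, so with ignore_index=-1 the cross-entropy reduces to a
#   binary choice between the "good" and "bad" verbalizer tokens at [MASK].
# * valid_dataset (the dev split) is built above but never used; the loop
#   below evaluates on the test split every epoch.
# * RANDOM_SEED is defined above but never applied; a minimal sketch to make
#   shuffling and dropout reproducible from this point on:
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)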

print("Training...")
totaltime = 0
for epoch in range(EPOCH):

    starttime_train = datetime.now()

    start = time.time()
    correct = 0
    train_loss_sum = 0
    model.train()

    for idx, (ids, att_mask, token_type, y) in enumerate(train_dataset):
        ids, att_mask, token_type, y = ids.to(device), att_mask.to(device), token_type.to(device), y.to(device)
        out_train = model(ids, att_mask, token_type)
        loss = loss_func(out_train.view(-1, tokenizer.vocab_size), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        schedule.step()
        train_loss_sum += loss.item()

        if (idx + 1) % EVAL_PERIOD == 0:
            print("Epoch {:04d} | Step {:06d}/{:06d} | Loss {:.4f} | Time {:.0f}".format(
                epoch + 1, idx + 1, len(train_dataset), train_loss_sum / (idx + 1), time.time() - start))
            writer.add_scalar('loss/train_loss', train_loss_sum / (idx + 1), epoch)

        # Training accuracy is measured at the [MASK] position only.
        truelabel = y[:, MASK_POS]
        out_train_mask = out_train[:, MASK_POS, :]
        predicted = torch.max(out_train_mask, 1)[1]
        correct += (predicted == truelabel).sum()
        correct = float(correct)

    acc = float(correct / train_data_num)

    eval_loss_sum = 0.0
    model.eval()
    correct_test = 0
    with torch.no_grad():
        for ids, att, tpe, y in test_dataset:
            ids, att, tpe, y = ids.to(device), att.to(device), tpe.to(device), y.to(device)
            out_test = model(ids, att, tpe)
            loss_eval = loss_func(out_test.view(-1, tokenizer.vocab_size), y.view(-1))
            eval_loss_sum += loss_eval.item()
            ttruelabel = y[:, MASK_POS]
            tout_train_mask = out_test[:, MASK_POS, :]
            predicted_test = torch.max(tout_train_mask.data, 1)[1]
            correct_test += (predicted_test == ttruelabel).sum()
            correct_test = float(correct_test)
        acc_test = float(correct_test / test_data_num)

    if epoch % 1 == 0:
        out = ("epoch {}, train_loss {}, train_acc {} , eval_loss {} ,acc_test {}"
               .format(epoch + 1, train_loss_sum / (len(train_dataset)), acc, eval_loss_sum / (len(test_dataset)),
                       acc_test))
        writer.add_scalar('loss/test_loss', eval_loss_sum / (len(test_dataset)), epoch)
        print(out)
    end = time.time()

    print("epoch {} duration:".format(epoch + 1), end - start)
    totaltime += end - start

print("total training time: ", totaltime)
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
Sentiment classification based on prompt learning.

Blog: https://blog.csdn.net/znsoft/article/details/123960223

Accuracy: above 92% (Twitter2013, positive/negative only).


## Prompt template

The only template is: `It was [MASK].`
--------------------------------------------------------------------------------
/tb_log/readme.md:
--------------------------------------------------------------------------------
Placeholder for the TensorBoard logs written by `SummaryWriter('./tb_log')` in `prompt_class.py`.
--------------------------------------------------------------------------------