├── README.md
├── bert.py
├── cross_validation.py
├── data_processor.py
├── predict.py
├── train_ernie.py
├── train_roberta.py
├── train_valid.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
1 | # 2021-sohu-textmatch
2 | 2021 Sohu Campus Text Matching Algorithm Competition (2021搜狐校园文本匹配算法大赛)
3 | 
4 | # Modeling approach
5 | Load a pretrained BERT model and mean-pool the last-layer sequence output to obtain the sentence-pair embedding. The task type is also embedded and added to this sentence-pair embedding, and the sum is fed into a fully connected classifier. A single model can therefore predict both task A and task B.
6 | 
7 | # Results
8 | | Pretrained model | Offline | Offline task A | Offline task B | Online | Online task A | Online task B |
9 | |:-----------------------:|--------|-----------|-----------|--------------|--------------|--------------|
10 | | ernie | 0.7815 | 0.847 | 0.7161 | 0.7822752226 | 0.8473297289 | 0.7172207162 |
11 | | chinese-roberta-wwm-ext | 0.7815 | 0.8468 | 0.7162 | 0.7824321441 | 0.8499669016 | 0.7148973867 |
12 | | ernie + roberta (equal-weight ensemble) | | | | 0.7841945647 | 0.8509693997 | 0.7174197298 |
13 | 
--------------------------------------------------------------------------------
/bert.py:
--------------------------------------------------------------------------------
1 | import copy
2 | 
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from transformers import AutoModel
7 | 
8 | 
9 | class FocalLoss(nn.Module):
10 |     def __init__(self, alpha=0.25, gamma=2, size_average=True):
11 |         super(FocalLoss, self).__init__()
12 |         self.alpha = torch.tensor(alpha).cuda()
13 |         self.gamma = gamma
14 |         self.size_average = size_average
15 | 
16 |     def forward(self, pred, target):
17 |         # If the model does not end in nn.Sigmoid(), apply a sigmoid to the predictions here
18 |         pred = nn.Sigmoid()(pred)
19 | 
20 |         # Flatten pred and target; now pred.size = target.size = (BatchSize, 1)
21 |         pred = pred.view(-1, 1)
22 |         target = target.view(-1, 1)
23 | 
24 |         # Compute both the negative- and positive-class probabilities; now pred.size = (BatchSize, 2)
25 |         pred = torch.cat((1 - pred, pred), dim=1)
26 | 
27 |         # Build a mask from target, i.e. select the required probability according to the ground truth:
28 |         # in plain words,
29 |         # when the label is 1, the predicted positive-class probability goes into the formula;
30 |         # when the label is 0, the predicted negative-class probability goes into the formula.
31 |         class_mask = torch.zeros(pred.shape[0], pred.shape[1]).cuda()
32 |         # scatter_ is not used very often; its signature is:
33 |         # scatter_(dim, index, src) -> Tensor
34 |         # Writes all values from the tensor src into self at the indices specified in the index tensor.
35 |         # For each value in src, its output index is specified by its index in src for dimension != dim and by the corresponding value in index for dimension = dim.
36 |         class_mask.scatter_(1, target.view(-1, 1).long(), 1.)
37 | 
38 |         # Use the mask to pick out the required probabilities
39 |         probs = (pred * class_mask).sum(dim=1).view(-1, 1)
40 |         probs = probs.clamp(min=0.0001, max=1.0)
41 | 
42 |         # Take the log of the probabilities
43 |         log_p = probs.log()
44 | 
45 |         # Set alpha as described in the paper (it compensates for the imbalance between positive and negative samples)
46 |         alpha = torch.ones(pred.shape[0], pred.shape[1]).cuda()
47 |         alpha[:, 0] = alpha[:, 0] * (1 - self.alpha)
48 |         alpha[:, 1] = alpha[:, 1] * self.alpha
49 |         alpha = (alpha * class_mask).sum(dim=1).view(-1, 1)
50 | 
51 |         # Compute the loss with the Focal Loss formula
52 |         batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p
53 | 
54 |         # Standard reduction; mean vs. sum makes little practical difference, it mostly amounts to a different effective learning rate
55 |         if self.size_average:
56 |             loss = batch_loss.mean()
57 |         else:
58 |             loss = batch_loss.sum()
59 | 
60 |         return loss
61 | 
62 | 
63 | def compute_loss(outputs, labels, loss_method='binary'):
64 |     loss = 0.
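    # FocalLoss above implements FL(p_t) = -alpha_t * (1 - p_t) ** gamma * log(p_t) from
    # Lin et al., "Focal Loss for Dense Object Detection"; note its .cuda() calls mean it
    # only runs on a GPU as written.
    # Of the branches below, 'binary' and 'focal_loss' expect the 1-unit classifier head
    # built in Bert (a single logit per example), while 'cross_entropy' expects the
    # 2-unit head and integer class labels.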
65 | if loss_method == 'binary': 66 | labels = labels.unsqueeze(1) 67 | loss = F.binary_cross_entropy(torch.sigmoid(outputs), labels) 68 | elif loss_method == 'cross_entropy': 69 | loss = F.cross_entropy(outputs, labels) 70 | elif loss_method == 'focal_loss': 71 | focal_loss = FocalLoss() 72 | loss = focal_loss(outputs, labels) 73 | else: 74 | raise Exception('loss_method {binary or cross_entropy} error. ') 75 | return loss 76 | 77 | 78 | class Bert(nn.Module): 79 | def __init__(self, config): 80 | super(Bert, self).__init__() 81 | 82 | # 计算loss的方法 83 | self.loss_method = config.loss_method 84 | self.pool_method = config.pool_method 85 | 86 | self.bert = AutoModel.from_pretrained(config.pretrain_path) 87 | 88 | if config.requires_grad: 89 | for param in self.bert.parameters(): 90 | param.requires_grad = True 91 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 92 | 93 | last_layer_dim = self.bert.config.hidden_size 94 | 95 | self.task_type_embedding = nn.Embedding(2, last_layer_dim) 96 | self.w2v_linear = nn.Linear(200, last_layer_dim) 97 | 98 | self.ln = torch.nn.LayerNorm(last_layer_dim) 99 | 100 | hidden_size = [last_layer_dim] + copy.deepcopy(config.hidden_size) 101 | 102 | self.classifier = nn.Sequential() 103 | 104 | for i in range(len(hidden_size) - 1): 105 | self.classifier.add_module( 106 | 'classifier_{}'.format(i), 107 | nn.Linear(hidden_size[i], hidden_size[i + 1])) 108 | 109 | if self.loss_method in ['binary', 'focal_loss', 'ghmc']: 110 | self.classifier.add_module( 111 | 'classifier_output', 112 | nn.Linear(hidden_size[len(hidden_size) - 1], 1)) 113 | else: 114 | self.classifier.add_module( 115 | 'classifier_output', 116 | nn.Linear(hidden_size[len(hidden_size) - 1], 2)) 117 | 118 | def forward(self, 119 | task_type=None, 120 | input_ids=None, 121 | attention_mask=None, 122 | token_type_ids=None, 123 | labels=None): 124 | outputs = self.bert( 125 | input_ids, 126 | attention_mask=attention_mask, 127 | token_type_ids=token_type_ids, 128 | ) 129 | if self.pool_method == 'first': 130 | pooled_output = outputs[0][:, 0] 131 | else: 132 | pooled_output = torch.mean(outputs[0], dim=1) 133 | 134 | # bert 输出和 task_type_embedding 相加 135 | task_type_embedding = self.task_type_embedding(task_type % 2) 136 | 137 | # sentence_w2v_embedding0 = self.w2v_linear(sentence_w2v_embedding0) 138 | # sentence_w2v_embedding1 = self.w2v_linear(sentence_w2v_embedding1) 139 | 140 | pooled_output = self.ln(pooled_output + task_type_embedding) 141 | pooled_output = self.dropout(pooled_output) 142 | out = self.classifier(pooled_output) 143 | 144 | # pooled_output = self.dropout(pooled_output) 145 | # out_list = [] 146 | # for i in range(2): 147 | # out = self.classifier_list[i](pooled_output) 148 | # out_list.append(out) 149 | # out = torch.cat(out_list, 1) 150 | # output_weight = (task_type % 2).view(-1, 1) 151 | # out = out.gather(1, output_weight) 152 | 153 | loss = 0 154 | if labels is not None: 155 | loss = compute_loss(out, labels, loss_method=self.loss_method) 156 | 157 | if self.loss_method in ['binary', 'focal_loss', 'ghmc']: 158 | out = torch.sigmoid(out).flatten() 159 | 160 | return out, loss 161 | -------------------------------------------------------------------------------- /cross_validation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pandas as pd 5 | from torch.utils.data import DataLoader 6 | from transformers import AutoTokenizer, BertTokenizer 7 | 8 | from data_processor import BuildDataSet, 
convert2features 9 | from train_valid import model_evaluate, model_save, model_train 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def hold_out(config, model, train_data, valid_data, train_enhancement=None): 15 | logger.debug('训练集维度:{},验证集维度:{}'.format(len(train_data), len(valid_data))) 16 | # 数据增强 17 | if train_enhancement: 18 | logger.debug('通过数据增强后,新增数据: %d', len(train_enhancement)) 19 | train_data.extend(train_enhancement) 20 | 21 | # 读取 Tokenizer 22 | if config.pretrain_model_name == 'albert_chinese_large': 23 | tokenizer = BertTokenizer.from_pretrained( 24 | config.pretrain_path, do_lower_case=config.do_lower_case) 25 | else: 26 | tokenizer = AutoTokenizer.from_pretrained( 27 | config.pretrain_path, do_lower_case=config.do_lower_case) 28 | 29 | # 训练集数据加载 30 | train_features = convert2features(examples=train_data, 31 | tokenizer=tokenizer, 32 | max_length=config.pad_size) 33 | train_dataset = BuildDataSet(train_features) 34 | train_loader = DataLoader(train_dataset, 35 | batch_size=config.batch_size, 36 | shuffle=True) 37 | 38 | # 验证集数据加载 39 | valid_features = convert2features(examples=valid_data, 40 | tokenizer=tokenizer, 41 | max_length=config.pad_size) 42 | valid_dataset = BuildDataSet(valid_features) 43 | valid_loader = DataLoader(valid_dataset, 44 | batch_size=config.batch_size, 45 | shuffle=True) 46 | 47 | # 模型训练保存 48 | model = model.to(config.device) 49 | best_model = model_train(config, model, train_loader, valid_loader) 50 | 51 | metrics, valid_loss, total_inputs_error = model_evaluate( 52 | config, best_model, valid_loader) 53 | 54 | valid_f1 = metrics['f1'] 55 | valid_f1_a = metrics['f1_a'] 56 | valid_f1_b = metrics['f1_b'] 57 | 58 | logger.info( 59 | 'evaluate: f1: {0:>6.2%}, f1_a: {1:>6.2%}, f1_b: {2:>6.2%}, loss: {3:>.6f}' 60 | .format(valid_f1, valid_f1_a, valid_f1_b, valid_loss)) 61 | model_save(config, best_model) 62 | 63 | # 保存 bad_case 64 | for example in total_inputs_error: 65 | tokens = tokenizer.convert_ids_to_tokens(example['sentence_ids']) 66 | if config.pretrain_model_name == 'chinese-xlnet-base': 67 | tokens = ''.join(x for x in tokens if x not in ['', '[pad]']) 68 | source, target, _ = tokens.split('') 69 | else: 70 | tokens = ''.join(x for x in tokens if x not in ['[CLS]', '[PAD]']) 71 | source, target, _ = tokens.split('[SEP]') 72 | example['source'] = source 73 | example['target'] = target 74 | 75 | bad_case = pd.DataFrame(total_inputs_error) 76 | 77 | os.makedirs('user_data/bad_case', exist_ok=True) 78 | bad_case.to_csv('user_data/bad_case/{}.csv'.format(config.model_name), 79 | index=False) 80 | -------------------------------------------------------------------------------- /data_processor.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import gc 3 | import json 4 | import logging 5 | import os 6 | import random 7 | 8 | import numpy as np 9 | import torch.utils.data as Data 10 | from tqdm import tqdm 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class InputFeatures(object): 16 | def __init__(self, 17 | task_type, 18 | input_ids, 19 | attention_mask=None, 20 | token_type_ids=None, 21 | label=None, 22 | sentence_w2v_embedding0=None, 23 | sentence_w2v_embedding1=None): 24 | self.task_type = task_type 25 | self.input_ids = input_ids 26 | self.attention_mask = attention_mask 27 | self.token_type_ids = token_type_ids 28 | self.label = label 29 | 30 | def __repr__(self): 31 | return str(self.to_json_string()) 32 | 33 | def to_dict(self): 34 | """Serializes this instance 
to a Python dictionary.""" 35 | output = copy.deepcopy(self.__dict__) 36 | return output 37 | 38 | def to_json_string(self): 39 | """Serializes this instance to a JSON string.""" 40 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 41 | 42 | 43 | def convert2features(examples, 44 | tokenizer, 45 | max_length=512, 46 | pad_token=0, 47 | pad_token_segment_id=0): 48 | features = [] 49 | for example in tqdm(examples): 50 | inputs = tokenizer.encode_plus(example[1], 51 | example[2], 52 | add_special_tokens=True, 53 | max_length=max_length, 54 | truncation=True) 55 | input_ids, token_type_ids = inputs['input_ids'], inputs[ 56 | 'token_type_ids'] 57 | attention_mask = [1] * len(input_ids) 58 | 59 | # Zero-pad up to the sequence length. 60 | padding_length = max_length - len(input_ids) 61 | 62 | input_ids = input_ids + ([pad_token] * padding_length) 63 | attention_mask = attention_mask + ([0] * padding_length) 64 | token_type_ids = token_type_ids + ([pad_token_segment_id] * 65 | padding_length) 66 | 67 | if example[3] is not None: 68 | label = example[3] 69 | else: 70 | label = 0 71 | 72 | features.append( 73 | InputFeatures(example[0], input_ids, attention_mask, 74 | token_type_ids, label)) 75 | 76 | return features 77 | 78 | 79 | class BuildDataSet(Data.Dataset): 80 | def __init__(self, features): 81 | self.features = features 82 | 83 | def __getitem__(self, index): 84 | feature = self.features[index] 85 | task_type = np.array(feature.task_type) 86 | input_ids = np.array(feature.input_ids) 87 | attention_mask = np.array(feature.attention_mask) 88 | token_type_ids = np.array(feature.token_type_ids) 89 | label = np.array(feature.label) 90 | 91 | return task_type, input_ids, attention_mask, token_type_ids, label 92 | 93 | def __len__(self): 94 | return len(self.features) 95 | 96 | 97 | class DataProcessor: 98 | def __init__(self, config): 99 | self.data_dir = config.data_dir 100 | self.seed = config.seed 101 | self.variants = config.variants 102 | self.stop_word_list = None 103 | 104 | def get_train_examples(self): 105 | train_examples = [] 106 | 107 | for i, var in enumerate(self.variants): 108 | key = 'labelA' if 'A' in var else 'labelB' 109 | fs = [ 110 | os.path.join(self.data_dir, 'sohu2021_open_data', var, 111 | 'train.txt'), 112 | os.path.join(self.data_dir, 'round2', f'{var}.txt'), 113 | os.path.join(self.data_dir, 'divided_20210419', var, 'train.txt'), 114 | ] 115 | 116 | for f in fs: 117 | with open(f) as f: 118 | # for line in list(f)[:10]: 119 | for line in f: 120 | line = json.loads(line) 121 | train_examples.append((i, line['source'], 122 | line['target'], int(line[key]))) 123 | 124 | return train_examples 125 | 126 | def get_valid_examples(self): 127 | valid_examples = [] 128 | 129 | for i, var in enumerate(self.variants): 130 | key = 'labelA' if 'A' in var else 'labelB' 131 | f = os.path.join(self.data_dir, 'sohu2021_open_data', var, 132 | 'valid.txt') 133 | 134 | with open(f) as f: 135 | for line in f: 136 | line = json.loads(line) 137 | valid_examples.append( 138 | (i, line['source'], line['target'], int(line[key]))) 139 | # break 140 | 141 | return valid_examples 142 | 143 | def get_test_examples(self): 144 | test_ids = [] 145 | test_examples = [] 146 | 147 | for i, var in enumerate(self.variants): 148 | f = os.path.join(self.data_dir, 'sohu2021_open_data', var, 149 | 'test_with_id.txt') 150 | 151 | with open(f) as f: 152 | for line in f: 153 | line = json.loads(line) 154 | test_examples.append( 155 | (i, line['source'], line['target'], -1)) 156 | 
test_ids.append(line['id']) 157 | 158 | return test_ids, test_examples 159 | 160 | def read_data_augment(self, augment_list): 161 | data_augment = [] 162 | 163 | for augment in augment_list: 164 | for type in self.data_files: 165 | examples = self._read_data( 166 | os.path.join('data/augment_data', type, 167 | '{}.pkl'.format(augment))) 168 | data_augment += examples 169 | 170 | random.seed(self.seed) 171 | random.shuffle(data_augment) 172 | return data_augment 173 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from torch.utils.data import DataLoader 9 | from transformers import AutoTokenizer 10 | 11 | from bert import Bert 12 | from data_processor import BuildDataSet, DataProcessor, convert2features 13 | from train_valid import model_evaluate, model_load 14 | from utils import combined_result, random_seed 15 | 16 | 17 | class ModelConfig(): 18 | def __init__(self): 19 | pass 20 | 21 | 22 | class TestConfig: 23 | def __init__(self): 24 | self.device = torch.device( 25 | 'cuda' if torch.cuda.is_available() else 'cpu') 26 | 27 | self.data_dir = 'raw_data'.replace('/', os.path.sep) 28 | 29 | self.model_path = 'user_data/model'.replace('/', os.path.sep) 30 | self.model_names = ['erniev8'] 31 | self.variants = [ 32 | '短短匹配A类', 33 | '短短匹配B类', 34 | '短长匹配A类', 35 | '短长匹配B类', 36 | '长长匹配A类', 37 | '长长匹配B类', 38 | ] 39 | 40 | os.makedirs('result', exist_ok=True) 41 | self.output_path = os.path.join( 42 | 'result', 43 | time.strftime('%Y-%m-%d_%H-%M-%S') + '.csv') 44 | 45 | self.prob_threshold = 0.5 46 | self.seed = 2021 47 | 48 | 49 | def model_predict(model, config, examples): 50 | # 读取 Tokenizer 51 | tokenizer = AutoTokenizer.from_pretrained( 52 | config.pretrain_path, do_lower_case=config.do_lower_case) 53 | 54 | test_features = convert2features(examples=examples, 55 | tokenizer=tokenizer, 56 | max_length=config.pad_size) 57 | test_dataset = BuildDataSet(test_features) 58 | test_loader = DataLoader(test_dataset, 59 | batch_size=config.batch_size, 60 | shuffle=False) 61 | test_prob, _ = model_evaluate(config, model, test_loader, test=True) 62 | return test_prob 63 | 64 | 65 | def predict_task(config): 66 | processor = DataProcessor(config) 67 | 68 | ids, examples = processor.get_test_examples() 69 | all_predict = [] 70 | 71 | for model_name in config.model_names: 72 | with open(os.path.join(config.model_path, model_name, 'config.json'), 73 | 'r') as f: 74 | model_conf_json = json.load(f) 75 | 76 | # 模型定义 77 | model_conf = ModelConfig() 78 | model_conf.__dict__.update(model_conf_json) 79 | model = Bert(model_conf) 80 | 81 | # 加载模型 82 | model_path = os.path.join(config.model_path, model_name) 83 | print(f'load model from {model_path}') 84 | model = model_load(model_path, model, device='cpu') 85 | model.to(config.device) 86 | 87 | # 模型预测 88 | predict_prob = model_predict(model, model_conf, examples) 89 | all_predict.append(predict_prob) 90 | 91 | final_predict = combined_result(all_predict, pattern='average') 92 | final_predict_label = np.asarray(final_predict >= config.prob_threshold, 93 | dtype=np.int) 94 | 95 | submit = pd.DataFrame() 96 | submit['id'] = ids 97 | submit['label'] = final_predict_label 98 | return submit 99 | 100 | 101 | def predict(config): 102 | start_time = time.time() 103 | 104 | submit = predict_task(config) 105 | 106 | end_time = 
time.time() 107 | time_dif = end_time - start_time 108 | print(time_dif * 1000 // (14913 + 14909)) 109 | 110 | submit.to_csv(config.output_path, index=False) 111 | 112 | 113 | if __name__ == '__main__': 114 | config = TestConfig() 115 | random_seed(config.seed) 116 | predict(config) 117 | -------------------------------------------------------------------------------- /train_ernie.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import warnings 5 | 6 | import torch 7 | 8 | from bert import Bert 9 | from cross_validation import hold_out 10 | from data_processor import DataProcessor 11 | from utils import config_to_json_string, random_seed 12 | 13 | warnings.filterwarnings('ignore') 14 | 15 | 16 | class TrainConfig: 17 | def __init__(self): 18 | # 预训练模型相关 19 | self.pretrain_model_name = 'ernie' 20 | self.pretrain_path = 'data/pretrain_models/{}'.format( 21 | self.pretrain_model_name).replace('/', os.path.sep) 22 | self.device = torch.device( 23 | 'cuda' if torch.cuda.is_available() else 'cpu') 24 | self.do_lower_case = True 25 | self.requires_grad = True 26 | 27 | # 模型相关 28 | self.pad_size = 512 # 每句话处理成的长度 29 | self.batch_size = 36 30 | self.learning_rate = 2e-5 # 学习率 31 | self.head_learning_rate = 1e-4 # 后面的分类层的学习率 32 | self.weight_decay = 0.01 # 权重衰减因子 33 | self.warmup_proportion = 0.1 # Proportion of training to perform linear learning rate warmup for. 34 | self.num_train_epochs = 3 # epoch数 35 | self.prob_threshold = 0.5 36 | self.loss_method = 'binary' # [ binary, cross_entropy] 37 | self.hidden_dropout_prob = 0.1 38 | self.hidden_size = [] 39 | self.diff_learning_rate = False 40 | self.early_stop = True 41 | self.require_improvement = 3000 42 | self.FGM = True 43 | self.pool_method = 'mean' 44 | self.multi_drop = 1 45 | 46 | self.variants = [ 47 | '短短匹配A类', 48 | '短短匹配B类', 49 | '短长匹配A类', 50 | '短长匹配B类', 51 | '长长匹配A类', 52 | '长长匹配B类', 53 | ] 54 | 55 | # 数据路径 56 | is_abstract = True 57 | if is_abstract: 58 | self.data_dir = 'abstract_data'.replace('/', os.path.sep) 59 | else: 60 | self.data_dir = 'raw_data'.replace('/', os.path.sep) 61 | self.model_name = f'{self.pretrain_model_name}v8' 62 | self.model_path = 'user_data/model/{}'.format(self.model_name).replace( 63 | '/', os.path.sep) 64 | os.makedirs(self.model_path, exist_ok=True) 65 | 66 | # logging 67 | self.logging_dir = 'user_data/logging/{}'.format( 68 | self.model_name).replace('/', os.path.sep) 69 | os.makedirs(self.logging_dir, exist_ok=True) 70 | self.seed = 2021 71 | 72 | # 数据增强 73 | self.data_augment = None 74 | 75 | 76 | def train_model(config): 77 | logging.debug('config {}'.format(config_to_json_string(config))) 78 | 79 | # 读取数据 80 | processor = DataProcessor(config) 81 | train_examples = processor.get_train_examples() 82 | valid_examples = processor.get_valid_examples() 83 | 84 | if config.data_augment: 85 | augment_examples = processor.read_data_augment(config.data_augment) 86 | else: 87 | augment_examples = None 88 | 89 | logging.info(train_examples[:1]) 90 | logging.info(valid_examples[:1]) 91 | 92 | model = Bert(config) 93 | hold_out(config=config, 94 | model=model, 95 | train_data=train_examples, 96 | valid_data=valid_examples, 97 | train_enhancement=augment_examples) 98 | 99 | 100 | if __name__ == '__main__': 101 | config = TrainConfig() 102 | 103 | random_seed(config.seed) 104 | 105 | # 定义日志 106 | file = time.strftime('%Y-%m-%d_%H-%M-%S') + '.log' 107 | logging_filename = os.path.join(config.logging_dir, file) 108 | 
logging.basicConfig(filename=logging_filename, 109 | format='%(levelname)s: %(message)s', 110 | level=logging.DEBUG) 111 | 112 | # 运行模型 113 | train_model(config) 114 | -------------------------------------------------------------------------------- /train_roberta.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import warnings 5 | 6 | import torch 7 | 8 | from bert import Bert 9 | from cross_validation import hold_out 10 | from data_processor import DataProcessor 11 | from utils import config_to_json_string, random_seed 12 | 13 | warnings.filterwarnings('ignore') 14 | 15 | 16 | class TrainConfig: 17 | def __init__(self): 18 | # 预训练模型相关 19 | self.pretrain_model_name = 'chinese-roberta-wwm-ext' 20 | self.pretrain_path = 'data/pretrain_models/{}'.format( 21 | self.pretrain_model_name).replace('/', os.path.sep) 22 | self.device = torch.device( 23 | 'cuda' if torch.cuda.is_available() else 'cpu') 24 | self.do_lower_case = True 25 | self.requires_grad = True 26 | 27 | # 模型相关 28 | self.pad_size = 512 # 每句话处理成的长度 29 | self.batch_size = 32 30 | self.learning_rate = 2e-5 # 学习率 31 | self.head_learning_rate = 1e-4 # 后面的分类层的学习率 32 | self.weight_decay = 0.01 # 权重衰减因子 33 | self.warmup_proportion = 0.1 # Proportion of training to perform linear learning rate warmup for. 34 | self.num_train_epochs = 3 # epoch数 35 | self.prob_threshold = 0.5 36 | self.loss_method = 'binary' # [ binary, cross_entropy] 37 | self.hidden_dropout_prob = 0.1 38 | self.hidden_size = [] 39 | self.diff_learning_rate = False 40 | self.early_stop = True 41 | self.require_improvement = 3000 42 | self.FGM = True 43 | self.pool_method = 'mean' 44 | self.multi_drop = 1 45 | 46 | self.variants = [ 47 | '短短匹配A类', 48 | '短短匹配B类', 49 | '短长匹配A类', 50 | '短长匹配B类', 51 | '长长匹配A类', 52 | '长长匹配B类', 53 | ] 54 | 55 | # 数据路径 56 | self.data_dir = 'raw_data'.replace('/', os.path.sep) 57 | self.model_name = f'{self.pretrain_model_name}v2' 58 | self.model_path = 'user_data/model/{}'.format(self.model_name).replace( 59 | '/', os.path.sep) 60 | os.makedirs(self.model_path, exist_ok=True) 61 | 62 | # logging 63 | self.logging_dir = 'user_data/logging/{}'.format( 64 | self.model_name).replace('/', os.path.sep) 65 | os.makedirs(self.logging_dir, exist_ok=True) 66 | self.seed = 2021 67 | 68 | # 数据增强 69 | self.data_augment = None 70 | 71 | 72 | def train_model(config): 73 | logging.debug('config {}'.format(config_to_json_string(config))) 74 | 75 | # 读取数据 76 | processor = DataProcessor(config) 77 | train_examples = processor.get_train_examples() 78 | valid_examples = processor.get_valid_examples() 79 | 80 | if config.data_augment: 81 | augment_examples = processor.read_data_augment(config.data_augment) 82 | else: 83 | augment_examples = None 84 | 85 | logging.info(train_examples[:1]) 86 | logging.info(valid_examples[:1]) 87 | 88 | model = Bert(config) 89 | hold_out(config=config, 90 | model=model, 91 | train_data=train_examples, 92 | valid_data=valid_examples, 93 | train_enhancement=augment_examples) 94 | 95 | 96 | if __name__ == '__main__': 97 | config = TrainConfig() 98 | 99 | random_seed(config.seed) 100 | 101 | # 定义日志 102 | file = time.strftime('%Y-%m-%d_%H-%M-%S') + '.log' 103 | logging_filename = os.path.join(config.logging_dir, file) 104 | logging.basicConfig(filename=logging_filename, 105 | format='%(levelname)s: %(message)s', 106 | level=logging.DEBUG) 107 | 108 | # 运行模型 109 | train_model(config) 110 | 
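# Note: train_ernie.py and train_roberta.py differ only in pretrain_model_name, batch_size,
# data_dir (abstract_data vs. raw_data) and the model_name suffix. The equal-weight fusion
# row in the README is presumably obtained by listing both trained models (e.g. 'erniev8'
# and 'chinese-roberta-wwm-extv2') in TestConfig.model_names in predict.py, where their
# probabilities are averaged by combined_result(pattern='average').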
-------------------------------------------------------------------------------- /train_valid.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import copy 3 | import logging 4 | import os 5 | import time 6 | 7 | import numpy as np 8 | import torch 9 | from sklearn.metrics import f1_score 10 | from transformers import AdamW, get_linear_schedule_with_warmup 11 | 12 | from utils import config_to_json_string 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class FGM(): 18 | ''' 19 | Example 20 | # 初始化 21 | fgm = FGM(model,epsilon=1, emb_name='word_embeddings.') 22 | for batch_input, batch_label in data: 23 | # 正常训练 24 | loss = model(batch_input, batch_label) 25 | loss.backward() # 反向传播,得到正常的grad 26 | # 对抗训练 27 | fgm.attack() # 在embedding上添加对抗扰动 28 | loss_adv = model(batch_input, batch_label) 29 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 30 | fgm.restore() # 恢复embedding参数 31 | # 梯度下降,更新参数 32 | optimizer.step() 33 | model.zero_grad() 34 | ''' 35 | def __init__(self, model, emb_name, epsilon=1.0): 36 | # emb_name这个参数要换成你模型中embedding的参数名 37 | self.model = model 38 | self.epsilon = epsilon 39 | self.emb_name = emb_name 40 | self.backup = {} 41 | 42 | def attack(self): 43 | for name, param in self.model.named_parameters(): 44 | if param.requires_grad and self.emb_name in name: 45 | self.backup[name] = param.data.clone() 46 | norm = torch.norm(param.grad) 47 | if norm != 0 and not torch.isnan(norm): 48 | r_at = self.epsilon * param.grad / norm 49 | param.data.add_(r_at) 50 | 51 | def restore(self): 52 | for name, param in self.model.named_parameters(): 53 | if param.requires_grad and self.emb_name in name: 54 | assert name in self.backup 55 | param.data = self.backup[name] 56 | self.backup = {} 57 | 58 | 59 | def model_train(config, model, train_iter, valid_iter): 60 | start_time = time.time() 61 | 62 | # Prepare optimizer and schedule (linear warmup and decay) 63 | no_decay = ["bias", "LayerNorm.weight"] 64 | diff_part = ["bert.embeddings", "bert.encoder"] 65 | if not config.diff_learning_rate: 66 | optimizer_grouped_parameters = [ 67 | { 68 | "params": [ 69 | p for n, p in model.named_parameters() 70 | if not any(nd in n for nd in no_decay) 71 | ], 72 | "weight_decay": 73 | config.weight_decay, 74 | }, 75 | { 76 | "params": [ 77 | p for n, p in model.named_parameters() 78 | if any(nd in n for nd in no_decay) 79 | ], 80 | "weight_decay": 81 | 0.0 82 | }, 83 | ] 84 | optimizer = AdamW(optimizer_grouped_parameters, 85 | lr=config.learning_rate) 86 | else: 87 | logger.info("use the diff learning rate") 88 | # the formal is basic_bert part, not include the pooler 89 | optimizer_grouped_parameters = [ 90 | { 91 | # weight 衰减 92 | "params": [ 93 | p for n, p in model.named_parameters() 94 | if not any(nd in n 95 | for nd in no_decay) and any(nd in n 96 | for nd in diff_part) 97 | ], 98 | "weight_decay": 99 | config.weight_decay, 100 | "lr": 101 | config.learning_rate 102 | }, 103 | { 104 | # weight 不衰减 105 | "params": [ 106 | p for n, p in model.named_parameters() 107 | if any(nd in n 108 | for nd in no_decay) and any(nd in n 109 | for nd in diff_part) 110 | ], 111 | "weight_decay": 112 | 0.0, 113 | "lr": 114 | config.learning_rate 115 | }, 116 | { 117 | "params": [ 118 | p for n, p in model.named_parameters() 119 | if not any(nd in n for nd in no_decay) and not any( 120 | nd in n for nd in diff_part) 121 | ], 122 | "weight_decay": 123 | config.weight_decay, 124 | "lr": 125 | config.head_learning_rate 126 | }, 127 | { 128 | 
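                # remaining (non-encoder, i.e. head) parameters that match the no_decay
                # list: no weight decay, trained at head_learning_rate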
"params": [ 129 | p for n, p in model.named_parameters() 130 | if any(nd in n 131 | for nd in no_decay) and not any(nd in n 132 | for nd in diff_part) 133 | ], 134 | "weight_decay": 135 | 0.0, 136 | "lr": 137 | config.head_learning_rate 138 | }, 139 | ] 140 | optimizer = AdamW(optimizer_grouped_parameters) 141 | 142 | t_total = len(train_iter) * config.num_train_epochs 143 | scheduler = get_linear_schedule_with_warmup(optimizer, 144 | num_warmup_steps=t_total * 145 | config.warmup_proportion, 146 | num_training_steps=t_total) 147 | 148 | # Train! 149 | logger.info("***** Running training *****") 150 | logger.info(" Num Epochs = %d", config.num_train_epochs) 151 | logger.info(" Instantaneous batch size GPU/CPU = %d", config.batch_size) 152 | logger.info(" Total optimization steps = %d", t_total) 153 | logger.info(" Train device:%s", config.device) 154 | 155 | global_batch = 0 # 记录进行到多少batch 156 | valid_best_f1 = 0 157 | last_improve = 0 # 记录上次验证集loss下降的batch数 158 | flag = False # 记录是否很久没有效果提升 159 | 160 | predict_all = [] 161 | labels_all = [] 162 | best_model = copy.deepcopy(model) 163 | 164 | if config.FGM: 165 | fgm = FGM(model, epsilon=1, emb_name='word_embeddings.') 166 | 167 | for epoch in range(config.num_train_epochs): 168 | logger.info('Epoch [{}/{}]'.format(epoch + 1, config.num_train_epochs)) 169 | for _, (task_type, input_ids, attention_mask, token_type_ids, labels) in enumerate(train_iter): 170 | global_batch += 1 171 | model.train() 172 | 173 | task_type = torch.tensor(task_type).type(torch.LongTensor).to( 174 | config.device) 175 | input_ids = torch.tensor(input_ids).type(torch.LongTensor).to( 176 | config.device) 177 | attention_mask = torch.tensor(attention_mask).type( 178 | torch.LongTensor).to(config.device) 179 | token_type_ids = torch.tensor(token_type_ids).type( 180 | torch.LongTensor).to(config.device) 181 | 182 | if config.loss_method in ['binary']: 183 | labels_tensor = torch.tensor(labels).type( 184 | torch.FloatTensor).to(config.device) 185 | else: 186 | labels_tensor = torch.tensor(labels).type(torch.LongTensor).to( 187 | config.device) 188 | 189 | outputs, loss = model(task_type, input_ids, attention_mask, 190 | token_type_ids, labels_tensor) 191 | 192 | loss.backward() 193 | 194 | # 对抗训练 195 | if config.FGM: 196 | fgm.attack() # 在embedding上添加对抗扰动 197 | _, loss_adv = model(task_type, input_ids, attention_mask, token_type_ids, 198 | labels_tensor) 199 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 200 | fgm.restore() # 恢复embedding参数 201 | 202 | optimizer.step() 203 | scheduler.step() # Update learning rate schedule 204 | model.zero_grad() 205 | 206 | outputs = outputs.cpu().detach().numpy() 207 | predic = list( 208 | np.array(outputs >= config.prob_threshold, dtype='int')) 209 | labels_all.extend(labels) 210 | predict_all.extend(predic) 211 | 212 | if global_batch % 100 == 0: 213 | train_f1 = f1_score(labels_all, predict_all) 214 | predict_all = [] 215 | labels_all = [] 216 | 217 | metrics, valid_loss, _ = model_evaluate( 218 | config, model, valid_iter) 219 | valid_f1 = metrics['f1'] 220 | valid_f1_a = metrics['f1_a'] 221 | valid_f1_b = metrics['f1_b'] 222 | 223 | if valid_f1 > valid_best_f1: 224 | valid_best_f1 = valid_f1 225 | improve = '*' 226 | last_improve = global_batch 227 | best_model = copy.deepcopy(model) 228 | else: 229 | improve = '' 230 | 231 | time_dif = time.time() - start_time 232 | msg = 'Iter: {0:>6}, Train Loss: {1:>5.6f}, Train F1: {2:>6.2%}, Val Loss: {3:>5.6f}, Val F1: {4:>6.2%}, Val F1_a: {5:>6.2%}, Val F1_b: {6:>6.2%}, Time: {7} {8}' 
233 | logger.info( 234 | msg.format(global_batch, 235 | loss.cpu().data.item(), train_f1, 236 | valid_loss.cpu().data.item(), valid_f1, 237 | valid_f1_a, valid_f1_b, time_dif, improve)) 238 | 239 | if config.early_stop and global_batch - last_improve > config.require_improvement: 240 | logger.info( 241 | "No optimization for a long time, auto-stopping...") 242 | flag = True 243 | break 244 | if flag: 245 | break 246 | 247 | if config.early_stop: 248 | return best_model 249 | else: 250 | return model 251 | 252 | def model_evaluate(config, model, data_iter, test=False): 253 | model.eval() 254 | 255 | # loss 总和 256 | loss_total = 0 257 | # 预测的全部 label 258 | predict_label_all = [] 259 | predict_label_taskA = [] 260 | predict_label_taskB = [] 261 | 262 | # 预测的全部概率 263 | predict_prob_all = [] 264 | 265 | # 真实的全部 label 266 | true_label_all = [] 267 | true_label_taskA = [] 268 | true_label_taskB = [] 269 | 270 | # 全部的task_type 271 | task_type_all = [] 272 | 273 | total_inputs_error = [] 274 | with torch.no_grad(): 275 | for i, (task_type, input_ids, attention_mask, token_type_ids, labels) in enumerate(data_iter): 276 | 277 | task_type = torch.tensor(task_type).type(torch.LongTensor).to( 278 | config.device) 279 | input_ids = torch.tensor(input_ids).type(torch.LongTensor).to( 280 | config.device) 281 | attention_mask = torch.tensor(attention_mask).type( 282 | torch.LongTensor).to(config.device) 283 | token_type_ids = torch.tensor(token_type_ids).type( 284 | torch.LongTensor).to(config.device) 285 | 286 | if config.loss_method in ['binary']: 287 | labels = torch.tensor(labels).type(torch.FloatTensor).to( 288 | config.device) if not test else None 289 | else: 290 | labels = torch.tensor(labels).type(torch.LongTensor).to( 291 | config.device) if not test else None 292 | 293 | predict_prob, loss = model(task_type, input_ids, attention_mask, 294 | token_type_ids, labels) 295 | 296 | predict_prob = predict_prob.cpu().detach().numpy() 297 | predict_label = list( 298 | np.array(predict_prob >= config.prob_threshold, dtype='int')) 299 | 300 | predict_prob_all.extend(list(predict_prob)) 301 | predict_label_all.extend(predict_label) 302 | task_type_all.extend(list(task_type.cpu().detach().numpy())) 303 | 304 | if not test: 305 | labels = labels.data.cpu().numpy() 306 | true_label_all.extend(list(labels)) 307 | loss_total += loss 308 | 309 | input_ids = input_ids.data.cpu().detach().numpy() 310 | classify_error = get_classify_error(input_ids, predict_label, 311 | labels, predict_prob) 312 | total_inputs_error.extend(classify_error) 313 | 314 | if test: 315 | return predict_prob_all, predict_label_all 316 | 317 | for task_type, predict_label, true_label in zip(task_type_all, 318 | predict_label_all, 319 | true_label_all): 320 | if task_type % 2 == 0: 321 | predict_label_taskA.append(predict_label) 322 | true_label_taskA.append(true_label) 323 | else: 324 | predict_label_taskB.append(predict_label) 325 | true_label_taskB.append(true_label) 326 | 327 | f1_a = f1_score(true_label_taskA, predict_label_taskA) 328 | f1_b = f1_score(true_label_taskB, predict_label_taskB) 329 | f1 = (f1_a + f1_b) / 2 330 | 331 | return { 332 | 'f1': f1, 333 | 'f1_a': f1_a, 334 | 'f1_b': f1_b 335 | }, loss_total / len(data_iter), total_inputs_error 336 | 337 | 338 | def get_classify_error(input_ids, predict, labels, proba, input_ids_pair=None): 339 | error_list = [] 340 | error_idx = predict != labels 341 | error_sentences = input_ids[error_idx] 342 | total_sentences = [] 343 | if input_ids_pair is not None: 344 | error_sentences_pair = 
input_ids_pair[error_idx]
345 |         for sentence1, sentence2 in zip(error_sentences, error_sentences_pair):
346 |             total_sentences.append(
347 |                 np.array(sentence1.tolist() + [117] + sentence2.tolist(),
348 |                          dtype=int))
349 |     else:
350 |         total_sentences = error_sentences
351 | 
352 |     true_label = labels[error_idx]
353 |     pred_proba = proba[error_idx]
354 |     for sentences, label, prob in zip(total_sentences, true_label, pred_proba):
355 |         error_dict = {}
356 |         error_dict['sentence_ids'] = sentences
357 |         error_dict['true_label'] = label
358 |         error_dict['proba'] = prob
359 |         error_list.append(error_dict)
360 | 
361 |     return error_list
362 | 
363 | 
364 | def model_save(config, model, num=-1):
365 |     if num == -1:
366 |         file_name = os.path.join(config.model_path, 'model.pkl')
367 | 
368 |     with open(os.path.join(config.model_path, 'config.json'), 'w') as f:
369 |         f.write(config_to_json_string(config))
370 | 
371 |     torch.save(model.state_dict(), file_name)
372 |     logger.info('model saved, path: %s', file_name)
373 | 
374 | 
375 | def model_load(model_path, model, device='cpu'):
376 |     file_name = os.path.join(model_path, 'model.pkl')
377 |     model.load_state_dict(
378 |         torch.load(file_name,
379 |                    map_location=device if device == 'cpu' else "{}:{}".format(
380 |                        device, 0)))
381 |     return model
382 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | 
4 | import numpy as np
5 | import torch
6 | 
7 | 
8 | def random_seed(seed):
9 |     np.random.seed(seed)
10 |     torch.manual_seed(seed)
11 |     torch.cuda.manual_seed_all(seed)
12 | 
13 |     torch.backends.cudnn.deterministic = True
14 |     torch.backends.cudnn.benchmark = False
15 | 
16 | 
17 | def config_to_dict(config):
18 |     output = copy.deepcopy(config.__dict__)
19 |     output['device'] = config.device.type
20 |     return output
21 | 
22 | 
23 | def config_to_json_string(config):
24 |     return json.dumps(config_to_dict(config), indent=2, sort_keys=True)
25 | 
26 | 
27 | def combined_result(all_result, weight=None, pattern='average'):
28 |     def average_result(all_result):  # shape: [num_model, axis]
29 |         all_result = np.asarray(all_result, dtype=float)
30 |         return np.mean(all_result, axis=0)
31 | 
32 |     def weighted_result(all_result, weight):
33 |         all_result = np.asarray(all_result, dtype=float)
34 |         return np.average(all_result, axis=0, weights=weight)
35 | 
36 |     if pattern == 'weighted':
37 |         return weighted_result(all_result, weight)
38 |     elif pattern == 'average':
39 |         return average_result(all_result)
40 |     else:
41 |         raise ValueError("the combined type is incorrect")
42 | 
43 | def sentence_reverse(test_examples):
44 |     """
45 |     Swap source and target in the test examples
46 |     :param test_examples:
47 |     :return:
48 |     """
49 |     reverse_test_examples = []
50 |     for example in test_examples:
51 |         try_example = [example[1], example[0], example[2]]
52 |         reverse_test_examples.append(try_example)
53 |     return reverse_test_examples
54 | 
--------------------------------------------------------------------------------
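The equal-weight fusion reported in the README is implemented by combined_result above and applied in predict_task in predict.py. A minimal usage sketch follows, with made-up probability vectors standing in for the outputs of the two trained models (in practice these come from model_evaluate(..., test=True)):

import numpy as np

from utils import combined_result

# Hypothetical per-example probabilities from the ernie and roberta models.
ernie_prob = np.array([0.91, 0.12, 0.55])
roberta_prob = np.array([0.88, 0.30, 0.47])

# Equal-weight average, as used for the submitted ensemble.
fused = combined_result([ernie_prob, roberta_prob], pattern='average')
# fused == [0.895, 0.21, 0.51]

# Threshold at TestConfig.prob_threshold (0.5) to obtain the submitted labels.
labels = (fused >= 0.5).astype(int)
# labels == [1, 0, 1]

Passing pattern='weighted' together with a weight list would produce a non-equal blend of the two models instead.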