├── README.md
├── bert.py
├── cross_validation.py
├── data_processor.py
├── predict.py
├── train_ernie.py
├── train_roberta.py
├── train_valid.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
1 | # 2021-sohu-textmatch
2 | 2021 Sohu Campus Text Matching Algorithm Competition (2021搜狐校园文本匹配算法大赛)
3 | 
4 | # Modeling approach
5 | Load a pretrained BERT model and mean-pool the last-layer sequence output to obtain the sentence-pair embedding. The task type is also embedded and added to this sentence-pair embedding, and the sum is fed into a fully connected classifier. A single model can therefore predict both task A and task B.
6 | 
7 | # Results
8 | | Pretrained model | Offline | Offline task A | Offline task B | Online | Online task A | Online task B |
9 | |:-----------------------:|--------|-----------|-----------|--------------|--------------|--------------|
10 | | ernie | 0.7815 | 0.847 | 0.7161 | 0.7822752226 | 0.8473297289 | 0.7172207162 |
11 | | chinese-roberta-wwm-ext | 0.7815 | 0.8468 | 0.7162 | 0.7824321441 | 0.8499669016 | 0.7148973867 |
12 | | ernie + roberta (equal-weight ensemble) | | | | 0.7841945647 | 0.8509693997 | 0.7174197298 |
13 | 
--------------------------------------------------------------------------------
/bert.py:
--------------------------------------------------------------------------------
1 | import copy
2 | 
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from transformers import AutoModel
7 | 
8 | 
9 | class FocalLoss(nn.Module):
10 |     def __init__(self, alpha=0.25, gamma=2, size_average=True):
11 |         super(FocalLoss, self).__init__()
12 |         self.alpha = torch.tensor(alpha).cuda()
13 |         self.gamma = gamma
14 |         self.size_average = size_average
15 | 
16 |     def forward(self, pred, target):
17 |         # If the model does not end in nn.Sigmoid(), apply a sigmoid to the predictions here
18 |         pred = nn.Sigmoid()(pred)
19 | 
20 |         # Flatten pred and target; now pred.size = target.size = (BatchSize, 1)
21 |         pred = pred.view(-1, 1)
22 |         target = target.view(-1, 1)
23 | 
24 |         # Compute both the negative- and positive-class probabilities; now pred.size = (BatchSize, 2)
25 |         pred = torch.cat((1 - pred, pred), dim=1)
26 | 
27 |         # Build a mask from target, i.e. select the required probability according to the ground truth:
28 |         # in plain words,
29 |         # when the label is 1, the predicted positive-class probability goes into the formula;
30 |         # when the label is 0, the predicted negative-class probability goes into the formula.
31 |         class_mask = torch.zeros(pred.shape[0], pred.shape[1]).cuda()
32 |         # scatter_ is not used very often; its signature is:
33 |         # scatter_(dim, index, src) -> Tensor
34 |         # Writes all values from the tensor src into self at the indices specified in the index tensor.
35 |         # For each value in src, its output index is specified by its index in src for dimension != dim and by the corresponding value in index for dimension = dim.
36 |         class_mask.scatter_(1, target.view(-1, 1).long(), 1.)
37 | 
38 |         # Use the mask to pick out the required probabilities
39 |         probs = (pred * class_mask).sum(dim=1).view(-1, 1)
40 |         probs = probs.clamp(min=0.0001, max=1.0)
41 | 
42 |         # Take the log of the probabilities
43 |         log_p = probs.log()
44 | 
45 |         # Set alpha as described in the paper (it compensates for the imbalance between positive and negative samples)
46 |         alpha = torch.ones(pred.shape[0], pred.shape[1]).cuda()
47 |         alpha[:, 0] = alpha[:, 0] * (1 - self.alpha)
48 |         alpha[:, 1] = alpha[:, 1] * self.alpha
49 |         alpha = (alpha * class_mask).sum(dim=1).view(-1, 1)
50 | 
51 |         # Compute the loss with the Focal Loss formula
52 |         batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p
53 | 
54 |         # Standard reduction; mean vs. sum makes little practical difference, it mostly amounts to a different effective learning rate
55 |         if self.size_average:
56 |             loss = batch_loss.mean()
57 |         else:
58 |             loss = batch_loss.sum()
59 | 
60 |         return loss
61 | 
62 | 
63 | def compute_loss(outputs, labels, loss_method='binary'):
64 |     loss = 0.
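    # FocalLoss above implements FL(p_t) = -alpha_t * (1 - p_t) ** gamma * log(p_t) from
    # Lin et al., "Focal Loss for Dense Object Detection"; note its .cuda() calls mean it
    # only runs on a GPU as written.
    # Of the branches below, 'binary' and 'focal_loss' expect the 1-unit classifier head
    # built in Bert (a single logit per example), while 'cross_entropy' expects the
    # 2-unit head and integer class labels.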
65 | if loss_method == 'binary': 66 | labels = labels.unsqueeze(1) 67 | loss = F.binary_cross_entropy(torch.sigmoid(outputs), labels) 68 | elif loss_method == 'cross_entropy': 69 | loss = F.cross_entropy(outputs, labels) 70 | elif loss_method == 'focal_loss': 71 | focal_loss = FocalLoss() 72 | loss = focal_loss(outputs, labels) 73 | else: 74 | raise Exception('loss_method {binary or cross_entropy} error. ') 75 | return loss 76 | 77 | 78 | class Bert(nn.Module): 79 | def __init__(self, config): 80 | super(Bert, self).__init__() 81 | 82 | # 计算loss的方法 83 | self.loss_method = config.loss_method 84 | self.pool_method = config.pool_method 85 | 86 | self.bert = AutoModel.from_pretrained(config.pretrain_path) 87 | 88 | if config.requires_grad: 89 | for param in self.bert.parameters(): 90 | param.requires_grad = True 91 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 92 | 93 | last_layer_dim = self.bert.config.hidden_size 94 | 95 | self.task_type_embedding = nn.Embedding(2, last_layer_dim) 96 | self.w2v_linear = nn.Linear(200, last_layer_dim) 97 | 98 | self.ln = torch.nn.LayerNorm(last_layer_dim) 99 | 100 | hidden_size = [last_layer_dim] + copy.deepcopy(config.hidden_size) 101 | 102 | self.classifier = nn.Sequential() 103 | 104 | for i in range(len(hidden_size) - 1): 105 | self.classifier.add_module( 106 | 'classifier_{}'.format(i), 107 | nn.Linear(hidden_size[i], hidden_size[i + 1])) 108 | 109 | if self.loss_method in ['binary', 'focal_loss', 'ghmc']: 110 | self.classifier.add_module( 111 | 'classifier_output', 112 | nn.Linear(hidden_size[len(hidden_size) - 1], 1)) 113 | else: 114 | self.classifier.add_module( 115 | 'classifier_output', 116 | nn.Linear(hidden_size[len(hidden_size) - 1], 2)) 117 | 118 | def forward(self, 119 | task_type=None, 120 | input_ids=None, 121 | attention_mask=None, 122 | token_type_ids=None, 123 | labels=None): 124 | outputs = self.bert( 125 | input_ids, 126 | attention_mask=attention_mask, 127 | token_type_ids=token_type_ids, 128 | ) 129 | if self.pool_method == 'first': 130 | pooled_output = outputs[0][:, 0] 131 | else: 132 | pooled_output = torch.mean(outputs[0], dim=1) 133 | 134 | # bert 输出和 task_type_embedding 相加 135 | task_type_embedding = self.task_type_embedding(task_type % 2) 136 | 137 | # sentence_w2v_embedding0 = self.w2v_linear(sentence_w2v_embedding0) 138 | # sentence_w2v_embedding1 = self.w2v_linear(sentence_w2v_embedding1) 139 | 140 | pooled_output = self.ln(pooled_output + task_type_embedding) 141 | pooled_output = self.dropout(pooled_output) 142 | out = self.classifier(pooled_output) 143 | 144 | # pooled_output = self.dropout(pooled_output) 145 | # out_list = [] 146 | # for i in range(2): 147 | # out = self.classifier_list[i](pooled_output) 148 | # out_list.append(out) 149 | # out = torch.cat(out_list, 1) 150 | # output_weight = (task_type % 2).view(-1, 1) 151 | # out = out.gather(1, output_weight) 152 | 153 | loss = 0 154 | if labels is not None: 155 | loss = compute_loss(out, labels, loss_method=self.loss_method) 156 | 157 | if self.loss_method in ['binary', 'focal_loss', 'ghmc']: 158 | out = torch.sigmoid(out).flatten() 159 | 160 | return out, loss 161 | -------------------------------------------------------------------------------- /cross_validation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import pandas as pd 5 | from torch.utils.data import DataLoader 6 | from transformers import AutoTokenizer, BertTokenizer 7 | 8 | from data_processor import BuildDataSet, 
convert2features 9 | from train_valid import model_evaluate, model_save, model_train 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def hold_out(config, model, train_data, valid_data, train_enhancement=None): 15 | logger.debug('训练集维度:{},验证集维度:{}'.format(len(train_data), len(valid_data))) 16 | # 数据增强 17 | if train_enhancement: 18 | logger.debug('通过数据增强后,新增数据: %d', len(train_enhancement)) 19 | train_data.extend(train_enhancement) 20 | 21 | # 读取 Tokenizer 22 | if config.pretrain_model_name == 'albert_chinese_large': 23 | tokenizer = BertTokenizer.from_pretrained( 24 | config.pretrain_path, do_lower_case=config.do_lower_case) 25 | else: 26 | tokenizer = AutoTokenizer.from_pretrained( 27 | config.pretrain_path, do_lower_case=config.do_lower_case) 28 | 29 | # 训练集数据加载 30 | train_features = convert2features(examples=train_data, 31 | tokenizer=tokenizer, 32 | max_length=config.pad_size) 33 | train_dataset = BuildDataSet(train_features) 34 | train_loader = DataLoader(train_dataset, 35 | batch_size=config.batch_size, 36 | shuffle=True) 37 | 38 | # 验证集数据加载 39 | valid_features = convert2features(examples=valid_data, 40 | tokenizer=tokenizer, 41 | max_length=config.pad_size) 42 | valid_dataset = BuildDataSet(valid_features) 43 | valid_loader = DataLoader(valid_dataset, 44 | batch_size=config.batch_size, 45 | shuffle=True) 46 | 47 | # 模型训练保存 48 | model = model.to(config.device) 49 | best_model = model_train(config, model, train_loader, valid_loader) 50 | 51 | metrics, valid_loss, total_inputs_error = model_evaluate( 52 | config, best_model, valid_loader) 53 | 54 | valid_f1 = metrics['f1'] 55 | valid_f1_a = metrics['f1_a'] 56 | valid_f1_b = metrics['f1_b'] 57 | 58 | logger.info( 59 | 'evaluate: f1: {0:>6.2%}, f1_a: {1:>6.2%}, f1_b: {2:>6.2%}, loss: {3:>.6f}' 60 | .format(valid_f1, valid_f1_a, valid_f1_b, valid_loss)) 61 | model_save(config, best_model) 62 | 63 | # 保存 bad_case 64 | for example in total_inputs_error: 65 | tokens = tokenizer.convert_ids_to_tokens(example['sentence_ids']) 66 | if config.pretrain_model_name == 'chinese-xlnet-base': 67 | tokens = ''.join(x for x in tokens if x not in ['', '[pad]']) 68 | source, target, _ = tokens.split('') 69 | else: 70 | tokens = ''.join(x for x in tokens if x not in ['[CLS]', '[PAD]']) 71 | source, target, _ = tokens.split('[SEP]') 72 | example['source'] = source 73 | example['target'] = target 74 | 75 | bad_case = pd.DataFrame(total_inputs_error) 76 | 77 | os.makedirs('user_data/bad_case', exist_ok=True) 78 | bad_case.to_csv('user_data/bad_case/{}.csv'.format(config.model_name), 79 | index=False) 80 | -------------------------------------------------------------------------------- /data_processor.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import gc 3 | import json 4 | import logging 5 | import os 6 | import random 7 | 8 | import numpy as np 9 | import torch.utils.data as Data 10 | from tqdm import tqdm 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class InputFeatures(object): 16 | def __init__(self, 17 | task_type, 18 | input_ids, 19 | attention_mask=None, 20 | token_type_ids=None, 21 | label=None, 22 | sentence_w2v_embedding0=None, 23 | sentence_w2v_embedding1=None): 24 | self.task_type = task_type 25 | self.input_ids = input_ids 26 | self.attention_mask = attention_mask 27 | self.token_type_ids = token_type_ids 28 | self.label = label 29 | 30 | def __repr__(self): 31 | return str(self.to_json_string()) 32 | 33 | def to_dict(self): 34 | """Serializes this instance 
to a Python dictionary.""" 35 | output = copy.deepcopy(self.__dict__) 36 | return output 37 | 38 | def to_json_string(self): 39 | """Serializes this instance to a JSON string.""" 40 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 41 | 42 | 43 | def convert2features(examples, 44 | tokenizer, 45 | max_length=512, 46 | pad_token=0, 47 | pad_token_segment_id=0): 48 | features = [] 49 | for example in tqdm(examples): 50 | inputs = tokenizer.encode_plus(example[1], 51 | example[2], 52 | add_special_tokens=True, 53 | max_length=max_length, 54 | truncation=True) 55 | input_ids, token_type_ids = inputs['input_ids'], inputs[ 56 | 'token_type_ids'] 57 | attention_mask = [1] * len(input_ids) 58 | 59 | # Zero-pad up to the sequence length. 60 | padding_length = max_length - len(input_ids) 61 | 62 | input_ids = input_ids + ([pad_token] * padding_length) 63 | attention_mask = attention_mask + ([0] * padding_length) 64 | token_type_ids = token_type_ids + ([pad_token_segment_id] * 65 | padding_length) 66 | 67 | if example[3] is not None: 68 | label = example[3] 69 | else: 70 | label = 0 71 | 72 | features.append( 73 | InputFeatures(example[0], input_ids, attention_mask, 74 | token_type_ids, label)) 75 | 76 | return features 77 | 78 | 79 | class BuildDataSet(Data.Dataset): 80 | def __init__(self, features): 81 | self.features = features 82 | 83 | def __getitem__(self, index): 84 | feature = self.features[index] 85 | task_type = np.array(feature.task_type) 86 | input_ids = np.array(feature.input_ids) 87 | attention_mask = np.array(feature.attention_mask) 88 | token_type_ids = np.array(feature.token_type_ids) 89 | label = np.array(feature.label) 90 | 91 | return task_type, input_ids, attention_mask, token_type_ids, label 92 | 93 | def __len__(self): 94 | return len(self.features) 95 | 96 | 97 | class DataProcessor: 98 | def __init__(self, config): 99 | self.data_dir = config.data_dir 100 | self.seed = config.seed 101 | self.variants = config.variants 102 | self.stop_word_list = None 103 | 104 | def get_train_examples(self): 105 | train_examples = [] 106 | 107 | for i, var in enumerate(self.variants): 108 | key = 'labelA' if 'A' in var else 'labelB' 109 | fs = [ 110 | os.path.join(self.data_dir, 'sohu2021_open_data', var, 111 | 'train.txt'), 112 | os.path.join(self.data_dir, 'round2', f'{var}.txt'), 113 | os.path.join(self.data_dir, 'divided_20210419', var, 'train.txt'), 114 | ] 115 | 116 | for f in fs: 117 | with open(f) as f: 118 | # for line in list(f)[:10]: 119 | for line in f: 120 | line = json.loads(line) 121 | train_examples.append((i, line['source'], 122 | line['target'], int(line[key]))) 123 | 124 | return train_examples 125 | 126 | def get_valid_examples(self): 127 | valid_examples = [] 128 | 129 | for i, var in enumerate(self.variants): 130 | key = 'labelA' if 'A' in var else 'labelB' 131 | f = os.path.join(self.data_dir, 'sohu2021_open_data', var, 132 | 'valid.txt') 133 | 134 | with open(f) as f: 135 | for line in f: 136 | line = json.loads(line) 137 | valid_examples.append( 138 | (i, line['source'], line['target'], int(line[key]))) 139 | # break 140 | 141 | return valid_examples 142 | 143 | def get_test_examples(self): 144 | test_ids = [] 145 | test_examples = [] 146 | 147 | for i, var in enumerate(self.variants): 148 | f = os.path.join(self.data_dir, 'sohu2021_open_data', var, 149 | 'test_with_id.txt') 150 | 151 | with open(f) as f: 152 | for line in f: 153 | line = json.loads(line) 154 | test_examples.append( 155 | (i, line['source'], line['target'], -1)) 156 | 
test_ids.append(line['id']) 157 | 158 | return test_ids, test_examples 159 | 160 | def read_data_augment(self, augment_list): 161 | data_augment = [] 162 | 163 | for augment in augment_list: 164 | for type in self.data_files: 165 | examples = self._read_data( 166 | os.path.join('data/augment_data', type, 167 | '{}.pkl'.format(augment))) 168 | data_augment += examples 169 | 170 | random.seed(self.seed) 171 | random.shuffle(data_augment) 172 | return data_augment 173 | -------------------------------------------------------------------------------- /predict.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from torch.utils.data import DataLoader 9 | from transformers import AutoTokenizer 10 | 11 | from bert import Bert 12 | from data_processor import BuildDataSet, DataProcessor, convert2features 13 | from train_valid import model_evaluate, model_load 14 | from utils import combined_result, random_seed 15 | 16 | 17 | class ModelConfig(): 18 | def __init__(self): 19 | pass 20 | 21 | 22 | class TestConfig: 23 | def __init__(self): 24 | self.device = torch.device( 25 | 'cuda' if torch.cuda.is_available() else 'cpu') 26 | 27 | self.data_dir = 'raw_data'.replace('/', os.path.sep) 28 | 29 | self.model_path = 'user_data/model'.replace('/', os.path.sep) 30 | self.model_names = ['erniev8'] 31 | self.variants = [ 32 | '短短匹配A类', 33 | '短短匹配B类', 34 | '短长匹配A类', 35 | '短长匹配B类', 36 | '长长匹配A类', 37 | '长长匹配B类', 38 | ] 39 | 40 | os.makedirs('result', exist_ok=True) 41 | self.output_path = os.path.join( 42 | 'result', 43 | time.strftime('%Y-%m-%d_%H-%M-%S') + '.csv') 44 | 45 | self.prob_threshold = 0.5 46 | self.seed = 2021 47 | 48 | 49 | def model_predict(model, config, examples): 50 | # 读取 Tokenizer 51 | tokenizer = AutoTokenizer.from_pretrained( 52 | config.pretrain_path, do_lower_case=config.do_lower_case) 53 | 54 | test_features = convert2features(examples=examples, 55 | tokenizer=tokenizer, 56 | max_length=config.pad_size) 57 | test_dataset = BuildDataSet(test_features) 58 | test_loader = DataLoader(test_dataset, 59 | batch_size=config.batch_size, 60 | shuffle=False) 61 | test_prob, _ = model_evaluate(config, model, test_loader, test=True) 62 | return test_prob 63 | 64 | 65 | def predict_task(config): 66 | processor = DataProcessor(config) 67 | 68 | ids, examples = processor.get_test_examples() 69 | all_predict = [] 70 | 71 | for model_name in config.model_names: 72 | with open(os.path.join(config.model_path, model_name, 'config.json'), 73 | 'r') as f: 74 | model_conf_json = json.load(f) 75 | 76 | # 模型定义 77 | model_conf = ModelConfig() 78 | model_conf.__dict__.update(model_conf_json) 79 | model = Bert(model_conf) 80 | 81 | # 加载模型 82 | model_path = os.path.join(config.model_path, model_name) 83 | print(f'load model from {model_path}') 84 | model = model_load(model_path, model, device='cpu') 85 | model.to(config.device) 86 | 87 | # 模型预测 88 | predict_prob = model_predict(model, model_conf, examples) 89 | all_predict.append(predict_prob) 90 | 91 | final_predict = combined_result(all_predict, pattern='average') 92 | final_predict_label = np.asarray(final_predict >= config.prob_threshold, 93 | dtype=np.int) 94 | 95 | submit = pd.DataFrame() 96 | submit['id'] = ids 97 | submit['label'] = final_predict_label 98 | return submit 99 | 100 | 101 | def predict(config): 102 | start_time = time.time() 103 | 104 | submit = predict_task(config) 105 | 106 | end_time = 
time.time() 107 | time_dif = end_time - start_time 108 | print(time_dif * 1000 // (14913 + 14909)) 109 | 110 | submit.to_csv(config.output_path, index=False) 111 | 112 | 113 | if __name__ == '__main__': 114 | config = TestConfig() 115 | random_seed(config.seed) 116 | predict(config) 117 | -------------------------------------------------------------------------------- /train_ernie.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import warnings 5 | 6 | import torch 7 | 8 | from bert import Bert 9 | from cross_validation import hold_out 10 | from data_processor import DataProcessor 11 | from utils import config_to_json_string, random_seed 12 | 13 | warnings.filterwarnings('ignore') 14 | 15 | 16 | class TrainConfig: 17 | def __init__(self): 18 | # 预训练模型相关 19 | self.pretrain_model_name = 'ernie' 20 | self.pretrain_path = 'data/pretrain_models/{}'.format( 21 | self.pretrain_model_name).replace('/', os.path.sep) 22 | self.device = torch.device( 23 | 'cuda' if torch.cuda.is_available() else 'cpu') 24 | self.do_lower_case = True 25 | self.requires_grad = True 26 | 27 | # 模型相关 28 | self.pad_size = 512 # 每句话处理成的长度 29 | self.batch_size = 36 30 | self.learning_rate = 2e-5 # 学习率 31 | self.head_learning_rate = 1e-4 # 后面的分类层的学习率 32 | self.weight_decay = 0.01 # 权重衰减因子 33 | self.warmup_proportion = 0.1 # Proportion of training to perform linear learning rate warmup for. 34 | self.num_train_epochs = 3 # epoch数 35 | self.prob_threshold = 0.5 36 | self.loss_method = 'binary' # [ binary, cross_entropy] 37 | self.hidden_dropout_prob = 0.1 38 | self.hidden_size = [] 39 | self.diff_learning_rate = False 40 | self.early_stop = True 41 | self.require_improvement = 3000 42 | self.FGM = True 43 | self.pool_method = 'mean' 44 | self.multi_drop = 1 45 | 46 | self.variants = [ 47 | '短短匹配A类', 48 | '短短匹配B类', 49 | '短长匹配A类', 50 | '短长匹配B类', 51 | '长长匹配A类', 52 | '长长匹配B类', 53 | ] 54 | 55 | # 数据路径 56 | is_abstract = True 57 | if is_abstract: 58 | self.data_dir = 'abstract_data'.replace('/', os.path.sep) 59 | else: 60 | self.data_dir = 'raw_data'.replace('/', os.path.sep) 61 | self.model_name = f'{self.pretrain_model_name}v8' 62 | self.model_path = 'user_data/model/{}'.format(self.model_name).replace( 63 | '/', os.path.sep) 64 | os.makedirs(self.model_path, exist_ok=True) 65 | 66 | # logging 67 | self.logging_dir = 'user_data/logging/{}'.format( 68 | self.model_name).replace('/', os.path.sep) 69 | os.makedirs(self.logging_dir, exist_ok=True) 70 | self.seed = 2021 71 | 72 | # 数据增强 73 | self.data_augment = None 74 | 75 | 76 | def train_model(config): 77 | logging.debug('config {}'.format(config_to_json_string(config))) 78 | 79 | # 读取数据 80 | processor = DataProcessor(config) 81 | train_examples = processor.get_train_examples() 82 | valid_examples = processor.get_valid_examples() 83 | 84 | if config.data_augment: 85 | augment_examples = processor.read_data_augment(config.data_augment) 86 | else: 87 | augment_examples = None 88 | 89 | logging.info(train_examples[:1]) 90 | logging.info(valid_examples[:1]) 91 | 92 | model = Bert(config) 93 | hold_out(config=config, 94 | model=model, 95 | train_data=train_examples, 96 | valid_data=valid_examples, 97 | train_enhancement=augment_examples) 98 | 99 | 100 | if __name__ == '__main__': 101 | config = TrainConfig() 102 | 103 | random_seed(config.seed) 104 | 105 | # 定义日志 106 | file = time.strftime('%Y-%m-%d_%H-%M-%S') + '.log' 107 | logging_filename = os.path.join(config.logging_dir, file) 108 | 
logging.basicConfig(filename=logging_filename, 109 | format='%(levelname)s: %(message)s', 110 | level=logging.DEBUG) 111 | 112 | # 运行模型 113 | train_model(config) 114 | -------------------------------------------------------------------------------- /train_roberta.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import time 4 | import warnings 5 | 6 | import torch 7 | 8 | from bert import Bert 9 | from cross_validation import hold_out 10 | from data_processor import DataProcessor 11 | from utils import config_to_json_string, random_seed 12 | 13 | warnings.filterwarnings('ignore') 14 | 15 | 16 | class TrainConfig: 17 | def __init__(self): 18 | # 预训练模型相关 19 | self.pretrain_model_name = 'chinese-roberta-wwm-ext' 20 | self.pretrain_path = 'data/pretrain_models/{}'.format( 21 | self.pretrain_model_name).replace('/', os.path.sep) 22 | self.device = torch.device( 23 | 'cuda' if torch.cuda.is_available() else 'cpu') 24 | self.do_lower_case = True 25 | self.requires_grad = True 26 | 27 | # 模型相关 28 | self.pad_size = 512 # 每句话处理成的长度 29 | self.batch_size = 32 30 | self.learning_rate = 2e-5 # 学习率 31 | self.head_learning_rate = 1e-4 # 后面的分类层的学习率 32 | self.weight_decay = 0.01 # 权重衰减因子 33 | self.warmup_proportion = 0.1 # Proportion of training to perform linear learning rate warmup for. 34 | self.num_train_epochs = 3 # epoch数 35 | self.prob_threshold = 0.5 36 | self.loss_method = 'binary' # [ binary, cross_entropy] 37 | self.hidden_dropout_prob = 0.1 38 | self.hidden_size = [] 39 | self.diff_learning_rate = False 40 | self.early_stop = True 41 | self.require_improvement = 3000 42 | self.FGM = True 43 | self.pool_method = 'mean' 44 | self.multi_drop = 1 45 | 46 | self.variants = [ 47 | '短短匹配A类', 48 | '短短匹配B类', 49 | '短长匹配A类', 50 | '短长匹配B类', 51 | '长长匹配A类', 52 | '长长匹配B类', 53 | ] 54 | 55 | # 数据路径 56 | self.data_dir = 'raw_data'.replace('/', os.path.sep) 57 | self.model_name = f'{self.pretrain_model_name}v2' 58 | self.model_path = 'user_data/model/{}'.format(self.model_name).replace( 59 | '/', os.path.sep) 60 | os.makedirs(self.model_path, exist_ok=True) 61 | 62 | # logging 63 | self.logging_dir = 'user_data/logging/{}'.format( 64 | self.model_name).replace('/', os.path.sep) 65 | os.makedirs(self.logging_dir, exist_ok=True) 66 | self.seed = 2021 67 | 68 | # 数据增强 69 | self.data_augment = None 70 | 71 | 72 | def train_model(config): 73 | logging.debug('config {}'.format(config_to_json_string(config))) 74 | 75 | # 读取数据 76 | processor = DataProcessor(config) 77 | train_examples = processor.get_train_examples() 78 | valid_examples = processor.get_valid_examples() 79 | 80 | if config.data_augment: 81 | augment_examples = processor.read_data_augment(config.data_augment) 82 | else: 83 | augment_examples = None 84 | 85 | logging.info(train_examples[:1]) 86 | logging.info(valid_examples[:1]) 87 | 88 | model = Bert(config) 89 | hold_out(config=config, 90 | model=model, 91 | train_data=train_examples, 92 | valid_data=valid_examples, 93 | train_enhancement=augment_examples) 94 | 95 | 96 | if __name__ == '__main__': 97 | config = TrainConfig() 98 | 99 | random_seed(config.seed) 100 | 101 | # 定义日志 102 | file = time.strftime('%Y-%m-%d_%H-%M-%S') + '.log' 103 | logging_filename = os.path.join(config.logging_dir, file) 104 | logging.basicConfig(filename=logging_filename, 105 | format='%(levelname)s: %(message)s', 106 | level=logging.DEBUG) 107 | 108 | # 运行模型 109 | train_model(config) 110 | 
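# Note: train_ernie.py and train_roberta.py differ only in pretrain_model_name, batch_size,
# data_dir (abstract_data vs. raw_data) and the model_name suffix. The equal-weight fusion
# row in the README is presumably obtained by listing both trained models (e.g. 'erniev8'
# and 'chinese-roberta-wwm-extv2') in TestConfig.model_names in predict.py, where their
# probabilities are averaged by combined_result(pattern='average').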
-------------------------------------------------------------------------------- /train_valid.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | import copy 3 | import logging 4 | import os 5 | import time 6 | 7 | import numpy as np 8 | import torch 9 | from sklearn.metrics import f1_score 10 | from transformers import AdamW, get_linear_schedule_with_warmup 11 | 12 | from utils import config_to_json_string 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class FGM(): 18 | ''' 19 | Example 20 | # 初始化 21 | fgm = FGM(model,epsilon=1, emb_name='word_embeddings.') 22 | for batch_input, batch_label in data: 23 | # 正常训练 24 | loss = model(batch_input, batch_label) 25 | loss.backward() # 反向传播,得到正常的grad 26 | # 对抗训练 27 | fgm.attack() # 在embedding上添加对抗扰动 28 | loss_adv = model(batch_input, batch_label) 29 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 30 | fgm.restore() # 恢复embedding参数 31 | # 梯度下降,更新参数 32 | optimizer.step() 33 | model.zero_grad() 34 | ''' 35 | def __init__(self, model, emb_name, epsilon=1.0): 36 | # emb_name这个参数要换成你模型中embedding的参数名 37 | self.model = model 38 | self.epsilon = epsilon 39 | self.emb_name = emb_name 40 | self.backup = {} 41 | 42 | def attack(self): 43 | for name, param in self.model.named_parameters(): 44 | if param.requires_grad and self.emb_name in name: 45 | self.backup[name] = param.data.clone() 46 | norm = torch.norm(param.grad) 47 | if norm != 0 and not torch.isnan(norm): 48 | r_at = self.epsilon * param.grad / norm 49 | param.data.add_(r_at) 50 | 51 | def restore(self): 52 | for name, param in self.model.named_parameters(): 53 | if param.requires_grad and self.emb_name in name: 54 | assert name in self.backup 55 | param.data = self.backup[name] 56 | self.backup = {} 57 | 58 | 59 | def model_train(config, model, train_iter, valid_iter): 60 | start_time = time.time() 61 | 62 | # Prepare optimizer and schedule (linear warmup and decay) 63 | no_decay = ["bias", "LayerNorm.weight"] 64 | diff_part = ["bert.embeddings", "bert.encoder"] 65 | if not config.diff_learning_rate: 66 | optimizer_grouped_parameters = [ 67 | { 68 | "params": [ 69 | p for n, p in model.named_parameters() 70 | if not any(nd in n for nd in no_decay) 71 | ], 72 | "weight_decay": 73 | config.weight_decay, 74 | }, 75 | { 76 | "params": [ 77 | p for n, p in model.named_parameters() 78 | if any(nd in n for nd in no_decay) 79 | ], 80 | "weight_decay": 81 | 0.0 82 | }, 83 | ] 84 | optimizer = AdamW(optimizer_grouped_parameters, 85 | lr=config.learning_rate) 86 | else: 87 | logger.info("use the diff learning rate") 88 | # the formal is basic_bert part, not include the pooler 89 | optimizer_grouped_parameters = [ 90 | { 91 | # weight 衰减 92 | "params": [ 93 | p for n, p in model.named_parameters() 94 | if not any(nd in n 95 | for nd in no_decay) and any(nd in n 96 | for nd in diff_part) 97 | ], 98 | "weight_decay": 99 | config.weight_decay, 100 | "lr": 101 | config.learning_rate 102 | }, 103 | { 104 | # weight 不衰减 105 | "params": [ 106 | p for n, p in model.named_parameters() 107 | if any(nd in n 108 | for nd in no_decay) and any(nd in n 109 | for nd in diff_part) 110 | ], 111 | "weight_decay": 112 | 0.0, 113 | "lr": 114 | config.learning_rate 115 | }, 116 | { 117 | "params": [ 118 | p for n, p in model.named_parameters() 119 | if not any(nd in n for nd in no_decay) and not any( 120 | nd in n for nd in diff_part) 121 | ], 122 | "weight_decay": 123 | config.weight_decay, 124 | "lr": 125 | config.head_learning_rate 126 | }, 127 | { 128 | 
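                # remaining (non-encoder, i.e. head) parameters that match the no_decay
                # list: no weight decay, trained at head_learning_rate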
"params": [ 129 | p for n, p in model.named_parameters() 130 | if any(nd in n 131 | for nd in no_decay) and not any(nd in n 132 | for nd in diff_part) 133 | ], 134 | "weight_decay": 135 | 0.0, 136 | "lr": 137 | config.head_learning_rate 138 | }, 139 | ] 140 | optimizer = AdamW(optimizer_grouped_parameters) 141 | 142 | t_total = len(train_iter) * config.num_train_epochs 143 | scheduler = get_linear_schedule_with_warmup(optimizer, 144 | num_warmup_steps=t_total * 145 | config.warmup_proportion, 146 | num_training_steps=t_total) 147 | 148 | # Train! 149 | logger.info("***** Running training *****") 150 | logger.info(" Num Epochs = %d", config.num_train_epochs) 151 | logger.info(" Instantaneous batch size GPU/CPU = %d", config.batch_size) 152 | logger.info(" Total optimization steps = %d", t_total) 153 | logger.info(" Train device:%s", config.device) 154 | 155 | global_batch = 0 # 记录进行到多少batch 156 | valid_best_f1 = 0 157 | last_improve = 0 # 记录上次验证集loss下降的batch数 158 | flag = False # 记录是否很久没有效果提升 159 | 160 | predict_all = [] 161 | labels_all = [] 162 | best_model = copy.deepcopy(model) 163 | 164 | if config.FGM: 165 | fgm = FGM(model, epsilon=1, emb_name='word_embeddings.') 166 | 167 | for epoch in range(config.num_train_epochs): 168 | logger.info('Epoch [{}/{}]'.format(epoch + 1, config.num_train_epochs)) 169 | for _, (task_type, input_ids, attention_mask, token_type_ids, labels) in enumerate(train_iter): 170 | global_batch += 1 171 | model.train() 172 | 173 | task_type = torch.tensor(task_type).type(torch.LongTensor).to( 174 | config.device) 175 | input_ids = torch.tensor(input_ids).type(torch.LongTensor).to( 176 | config.device) 177 | attention_mask = torch.tensor(attention_mask).type( 178 | torch.LongTensor).to(config.device) 179 | token_type_ids = torch.tensor(token_type_ids).type( 180 | torch.LongTensor).to(config.device) 181 | 182 | if config.loss_method in ['binary']: 183 | labels_tensor = torch.tensor(labels).type( 184 | torch.FloatTensor).to(config.device) 185 | else: 186 | labels_tensor = torch.tensor(labels).type(torch.LongTensor).to( 187 | config.device) 188 | 189 | outputs, loss = model(task_type, input_ids, attention_mask, 190 | token_type_ids, labels_tensor) 191 | 192 | loss.backward() 193 | 194 | # 对抗训练 195 | if config.FGM: 196 | fgm.attack() # 在embedding上添加对抗扰动 197 | _, loss_adv = model(task_type, input_ids, attention_mask, token_type_ids, 198 | labels_tensor) 199 | loss_adv.backward() # 反向传播,并在正常的grad基础上,累加对抗训练的梯度 200 | fgm.restore() # 恢复embedding参数 201 | 202 | optimizer.step() 203 | scheduler.step() # Update learning rate schedule 204 | model.zero_grad() 205 | 206 | outputs = outputs.cpu().detach().numpy() 207 | predic = list( 208 | np.array(outputs >= config.prob_threshold, dtype='int')) 209 | labels_all.extend(labels) 210 | predict_all.extend(predic) 211 | 212 | if global_batch % 100 == 0: 213 | train_f1 = f1_score(labels_all, predict_all) 214 | predict_all = [] 215 | labels_all = [] 216 | 217 | metrics, valid_loss, _ = model_evaluate( 218 | config, model, valid_iter) 219 | valid_f1 = metrics['f1'] 220 | valid_f1_a = metrics['f1_a'] 221 | valid_f1_b = metrics['f1_b'] 222 | 223 | if valid_f1 > valid_best_f1: 224 | valid_best_f1 = valid_f1 225 | improve = '*' 226 | last_improve = global_batch 227 | best_model = copy.deepcopy(model) 228 | else: 229 | improve = '' 230 | 231 | time_dif = time.time() - start_time 232 | msg = 'Iter: {0:>6}, Train Loss: {1:>5.6f}, Train F1: {2:>6.2%}, Val Loss: {3:>5.6f}, Val F1: {4:>6.2%}, Val F1_a: {5:>6.2%}, Val F1_b: {6:>6.2%}, Time: {7} {8}' 
233 | logger.info( 234 | msg.format(global_batch, 235 | loss.cpu().data.item(), train_f1, 236 | valid_loss.cpu().data.item(), valid_f1, 237 | valid_f1_a, valid_f1_b, time_dif, improve)) 238 | 239 | if config.early_stop and global_batch - last_improve > config.require_improvement: 240 | logger.info( 241 | "No optimization for a long time, auto-stopping...") 242 | flag = True 243 | break 244 | if flag: 245 | break 246 | 247 | if config.early_stop: 248 | return best_model 249 | else: 250 | return model 251 | 252 | def model_evaluate(config, model, data_iter, test=False): 253 | model.eval() 254 | 255 | # loss 总和 256 | loss_total = 0 257 | # 预测的全部 label 258 | predict_label_all = [] 259 | predict_label_taskA = [] 260 | predict_label_taskB = [] 261 | 262 | # 预测的全部概率 263 | predict_prob_all = [] 264 | 265 | # 真实的全部 label 266 | true_label_all = [] 267 | true_label_taskA = [] 268 | true_label_taskB = [] 269 | 270 | # 全部的task_type 271 | task_type_all = [] 272 | 273 | total_inputs_error = [] 274 | with torch.no_grad(): 275 | for i, (task_type, input_ids, attention_mask, token_type_ids, labels) in enumerate(data_iter): 276 | 277 | task_type = torch.tensor(task_type).type(torch.LongTensor).to( 278 | config.device) 279 | input_ids = torch.tensor(input_ids).type(torch.LongTensor).to( 280 | config.device) 281 | attention_mask = torch.tensor(attention_mask).type( 282 | torch.LongTensor).to(config.device) 283 | token_type_ids = torch.tensor(token_type_ids).type( 284 | torch.LongTensor).to(config.device) 285 | 286 | if config.loss_method in ['binary']: 287 | labels = torch.tensor(labels).type(torch.FloatTensor).to( 288 | config.device) if not test else None 289 | else: 290 | labels = torch.tensor(labels).type(torch.LongTensor).to( 291 | config.device) if not test else None 292 | 293 | predict_prob, loss = model(task_type, input_ids, attention_mask, 294 | token_type_ids, labels) 295 | 296 | predict_prob = predict_prob.cpu().detach().numpy() 297 | predict_label = list( 298 | np.array(predict_prob >= config.prob_threshold, dtype='int')) 299 | 300 | predict_prob_all.extend(list(predict_prob)) 301 | predict_label_all.extend(predict_label) 302 | task_type_all.extend(list(task_type.cpu().detach().numpy())) 303 | 304 | if not test: 305 | labels = labels.data.cpu().numpy() 306 | true_label_all.extend(list(labels)) 307 | loss_total += loss 308 | 309 | input_ids = input_ids.data.cpu().detach().numpy() 310 | classify_error = get_classify_error(input_ids, predict_label, 311 | labels, predict_prob) 312 | total_inputs_error.extend(classify_error) 313 | 314 | if test: 315 | return predict_prob_all, predict_label_all 316 | 317 | for task_type, predict_label, true_label in zip(task_type_all, 318 | predict_label_all, 319 | true_label_all): 320 | if task_type % 2 == 0: 321 | predict_label_taskA.append(predict_label) 322 | true_label_taskA.append(true_label) 323 | else: 324 | predict_label_taskB.append(predict_label) 325 | true_label_taskB.append(true_label) 326 | 327 | f1_a = f1_score(true_label_taskA, predict_label_taskA) 328 | f1_b = f1_score(true_label_taskB, predict_label_taskB) 329 | f1 = (f1_a + f1_b) / 2 330 | 331 | return { 332 | 'f1': f1, 333 | 'f1_a': f1_a, 334 | 'f1_b': f1_b 335 | }, loss_total / len(data_iter), total_inputs_error 336 | 337 | 338 | def get_classify_error(input_ids, predict, labels, proba, input_ids_pair=None): 339 | error_list = [] 340 | error_idx = predict != labels 341 | error_sentences = input_ids[error_idx] 342 | total_sentences = [] 343 | if input_ids_pair is not None: 344 | error_sentences_pair = 
input_ids_pair[error_idx]
345 |         for sentence1, sentence2 in zip(error_sentences, error_sentences_pair):
346 |             total_sentences.append(
347 |                 np.array(sentence1.tolist() + [117] + sentence2.tolist(),
348 |                          dtype=int))
349 |     else:
350 |         total_sentences = error_sentences
351 | 
352 |     true_label = labels[error_idx]
353 |     pred_proba = proba[error_idx]
354 |     for sentences, label, prob in zip(total_sentences, true_label, pred_proba):
355 |         error_dict = {}
356 |         error_dict['sentence_ids'] = sentences
357 |         error_dict['true_label'] = label
358 |         error_dict['proba'] = prob
359 |         error_list.append(error_dict)
360 | 
361 |     return error_list
362 | 
363 | 
364 | def model_save(config, model, num=-1):
365 |     if num == -1:
366 |         file_name = os.path.join(config.model_path, 'model.pkl')
367 | 
368 |     with open(os.path.join(config.model_path, 'config.json'), 'w') as f:
369 |         f.write(config_to_json_string(config))
370 | 
371 |     torch.save(model.state_dict(), file_name)
372 |     logger.info('model saved, path: %s', file_name)
373 | 
374 | 
375 | def model_load(model_path, model, device='cpu'):
376 |     file_name = os.path.join(model_path, 'model.pkl')
377 |     model.load_state_dict(
378 |         torch.load(file_name,
379 |                    map_location=device if device == 'cpu' else "{}:{}".format(
380 |                        device, 0)))
381 |     return model
382 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | 
4 | import numpy as np
5 | import torch
6 | 
7 | 
8 | def random_seed(seed):
9 |     np.random.seed(seed)
10 |     torch.manual_seed(seed)
11 |     torch.cuda.manual_seed_all(seed)
12 | 
13 |     torch.backends.cudnn.deterministic = True
14 |     torch.backends.cudnn.benchmark = False
15 | 
16 | 
17 | def config_to_dict(config):
18 |     output = copy.deepcopy(config.__dict__)
19 |     output['device'] = config.device.type
20 |     return output
21 | 
22 | 
23 | def config_to_json_string(config):
24 |     return json.dumps(config_to_dict(config), indent=2, sort_keys=True)
25 | 
26 | 
27 | def combined_result(all_result, weight=None, pattern='average'):
28 |     def average_result(all_result):  # shape: [num_model, axis]
29 |         all_result = np.asarray(all_result, dtype=float)
30 |         return np.mean(all_result, axis=0)
31 | 
32 |     def weighted_result(all_result, weight):
33 |         all_result = np.asarray(all_result, dtype=float)
34 |         return np.average(all_result, axis=0, weights=weight)
35 | 
36 |     if pattern == 'weighted':
37 |         return weighted_result(all_result, weight)
38 |     elif pattern == 'average':
39 |         return average_result(all_result)
40 |     else:
41 |         raise ValueError("the combined type is incorrect")
42 | 
43 | def sentence_reverse(test_examples):
44 |     """
45 |     Swap source and target in the test examples
46 |     :param test_examples:
47 |     :return:
48 |     """
49 |     reverse_test_examples = []
50 |     for example in test_examples:
51 |         try_example = [example[1], example[0], example[2]]
52 |         reverse_test_examples.append(try_example)
53 |     return reverse_test_examples
54 | 
--------------------------------------------------------------------------------
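The equal-weight fusion reported in the README is implemented by combined_result above and applied in predict_task in predict.py. A minimal usage sketch follows, with made-up probability vectors standing in for the outputs of the two trained models (in practice these come from model_evaluate(..., test=True)):

import numpy as np

from utils import combined_result

# Hypothetical per-example probabilities from the ernie and roberta models.
ernie_prob = np.array([0.91, 0.12, 0.55])
roberta_prob = np.array([0.88, 0.30, 0.47])

# Equal-weight average, as used for the submitted ensemble.
fused = combined_result([ernie_prob, roberta_prob], pattern='average')
# fused == [0.895, 0.21, 0.51]

# Threshold at TestConfig.prob_threshold (0.5) to obtain the submitted labels.
labels = (fused >= 0.5).astype(int)
# labels == [1, 0, 1]

Passing pattern='weighted' together with a weight list would produce a non-equal blend of the two models instead.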