├── log
│   └── log.txt
├── input
│   └── input.txt
├── submit
│   └── submit.txt
├── model_save
│   └── model_save.txt
├── preprocess
│   ├── input
│   │   └── input.txt
│   └── preprocess.py
├── file
│   ├── 1.png
│   ├── 2.png
│   ├── 3.png
│   ├── 4.png
│   ├── 5.png
│   ├── 6.png
│   ├── 7.png
│   └── 【2019 CCF BDCI】-关联模型-莽就完事了-说明论文.pdf
├── requirements.txt
├── ckpt
│   ├── 如果百度云下载过慢,该文件夹内容下载地址.txt
│   ├── chinese_L-12_H-768_A-12
│   │   └── bert_config.json
│   └── chinese_wwm_ext_L-12_H-768_A-12
│       └── bert_config.json
├── train.sh
├── combine.py
├── README.md
└── train.py

/log/log.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/input/input.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/submit/submit.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/model_save/model_save.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/preprocess/input/input.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/file/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/1.png
--------------------------------------------------------------------------------
/file/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/2.png
--------------------------------------------------------------------------------
/file/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/3.png
--------------------------------------------------------------------------------
/file/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/4.png
--------------------------------------------------------------------------------
/file/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/5.png
--------------------------------------------------------------------------------
/file/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/6.png
--------------------------------------------------------------------------------
/file/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/7.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy==1.16.3
pandas==0.24.2
tensorflow-gpu==1.14.0
keras==2.3.1
keras-bert==0.80.0
tqdm==4.36.1
--------------------------------------------------------------------------------
/file/【2019 CCF BDCI】-关联模型-莽就完事了-说明论文.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/【2019 CCF BDCI】-关联模型-莽就完事了-说明论文.pdf
--------------------------------------------------------------------------------
/ckpt/如果百度云下载过慢,该文件夹内容下载地址.txt:
--------------------------------------------------------------------------------
Address: https://pan.iflytek.com/#/link/8AA4B23D9BCBCBA0187EE58234332B46  Password: thGd

https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip

--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
export CUDA_VISIBLE_DEVICES=0
for((i=0;i<2;i++));
do

python train.py \
    --counter $i \
    --name bertt128_bertwwm512 \
    --model 0 \
    --model1 1 \
    --title_len 128 \
    --content_len 512 \
    --learning_rate 5e-5 \
    --min_learning_rate 1e-5 \
    --random_seed 123 \
    --batch_size 16 \
    --epoch 8 \
    --fold 7

done

python combine.py --k 2
--------------------------------------------------------------------------------
/ckpt/chinese_L-12_H-768_A-12/bert_config.json:
--------------------------------------------------------------------------------
{
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}
--------------------------------------------------------------------------------
/ckpt/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json:
--------------------------------------------------------------------------------
{
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "type_vocab_size": 2,
  "vocab_size": 21128
}
--------------------------------------------------------------------------------
/combine.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--k", default=6, type=int, required=False)        # number of prediction files ./submit/w_0.txt ... w_{k-1}.txt to sum
parser.add_argument("--a", default=0.52, type=float, required=False)   # prediction-bias factor applied to levels 2 and 3
parser.add_argument("--op", default=0, type=int, required=False)       # index of an extra prediction file to add with weight n
parser.add_argument("--n", default=0, type=int, required=False)        # weight of that extra file (0 = disabled)
args = parser.parse_args()
k = args.k
a = args.a
op = args.op
n = args.n

test = pd.read_csv('./input/TestPrediction.csv')
oof_test = np.loadtxt('./submit/w_0.txt')
print(oof_test)
print('-----------------------')
oof_test1 = np.loadtxt('./submit/w_{}.txt'.format(op))
oof_test += oof_test1 * n
print(oof_test)
print('-----------------------')

# Sum the remaining prediction files
for i in range(1, k):
    oof_test1 = np.loadtxt('./submit/w_{}.txt'.format(i))
    oof_test += oof_test1

print(oof_test)
print('-----------------------')
# Prediction-bias handling: weight levels 2 and 3 by a, levels 1 and 4 by 1 - a
for i in range(len(oof_test)):
    oof_test[i][0] = oof_test[i][0] * (1 - a)
    oof_test[i][1] = oof_test[i][1] * a
    oof_test[i][2] = oof_test[i][2] * a
    oof_test[i][3] = oof_test[i][3] * (1 - a)
print(oof_test)
test['Level'] = np.argmax(oof_test, axis=1) + 1
test[['Guid', 'Level']].to_csv('./submit/submit.csv', index=False)
--------------------------------------------------------------------------------
/preprocess/preprocess.py:
--------------------------------------------------------------------------------
#! -*- coding:utf-8 -*-
import json
import numpy as np
import pandas as pd
import time
import random

# Label matrix for augmented pairs: given rel(A, B) = i and rel(A, C) = j,
# a generated sample gets label l[i-1][j-1]; '0' means "do not generate".
l = np.array([['1', '1', '1', '1'],
              ['1', '0', '2', '2'],
              ['1', '2', '3', '3'],
              ['1', '2', '3', '4']])
# Sampling denominators: a candidate pair is kept with probability 1 / (k[i-1][j-1] + 1).
k = np.array([[55, 55, 55, 55],
              [55, 0, 6, 6],
              [55, 6, 8, 9],
              [55, 6, 9, 2]])

random.seed(123)

train_interrelation = pd.read_csv('./input/Train_Interrelation.csv', dtype=str)
print("Train_Interrelation", len(train_interrelation))

# Level distribution of the original data
sum = [0 for i in range(5)]
for i in train_interrelation['Level']:
    sum[int(i)] += 1
for i in range(1, 5):
    print('Level', i, sum[i])

tur = set()
for i in train_interrelation['Rid']:
    tur.add(i)
print(len(tur))
train_re = train_interrelation.sort_values(['Rid'])
sum = 0
# For every requirement, look at pairs of achievements linked to it and
# generate an extra training row whose label comes from the matrix l.
for i in range(len(train_re)):
    j = 1
    flag = 0
    while i + j < len(train_re) and train_re['Rid'].values[i] == train_re['Rid'].values[i + j]:
        flag = 1
        if flag == 1 and train_re['Rid'].values[i] != train_re['Rid'].values[i + j]:
            break
        c = int(train_re['Level'].values[i]) - 1
        r = int(train_re['Level'].values[i + j]) - 1
        if l[c][r] != '0' and random.randint(0, k[c][r]) == 0:
            sum += 1
            # print(train_re['Rid'].values[i], train_re['Aid'].values[i], train_re['Aid'].values[i + j])
            train_interrelation.loc[i * 1000000 + j] = \
                [train_re['Rid'].values[i],
                 train_re['Aid'].values[i],
                 train_re['Aid'].values[i + j],
                 l[int(train_re['Level'].values[i]) - 1][int(train_re['Level'].values[i + j]) - 1]]
        j += 1

train_interrelation.to_csv('./Train_Interrelation.csv', index=False)

print("Train_Interrelation", len(train_interrelation))
print('sum', sum)

# Level distribution after augmentation
sum = [0 for i in range(5)]
for i in train_interrelation['Level']:
    sum[int(i)] += 1
for i in range(1, 5):
    print('Level', i, sum[i])
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 2019 CCF BDCI: First-Place Solution for Computing the Relatedness Between "Technical Requirement" and "Technical Achievement" Projects

## Task

See the [official website](https://www.datafountain.cn/competitions/359) for the task description.

## Data preprocessing

### Data cleaning

Screening the shorter achievement and requirement contents shows that a fair amount of them are useless, e.g. blank strings, "\n", or "未提供。" ("not provided"). Such useless contents are replaced with the corresponding titles, as sketched below.
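A minimal sketch of this replacement rule as it is applied in `train.py` (the length thresholds 14 and 10 and the column names come from that script; the rows are made-up examples):

```python
import pandas as pd

# Made-up rows standing in for the merged training table.
df = pd.DataFrame({
    'Achievements_title': ['高效污水处理工艺', '智能温室控制系统'],
    'Achievements_content': ['未提供。', '本成果针对设施农业提出了一套完整的温室控制方案。'],
    'Requirements_title': ['污水处理技术需求', '温室自动化需求'],
    'Requirements_content': ['\n', '需求方希望实现温室环境的自动调控。'],
})

# Contents shorter than the thresholds are treated as useless and replaced by the title.
for i in range(len(df)):
    if len(df['Achievements_content'][i]) < 14:
        df.loc[i, 'Achievements_content'] = df['Achievements_title'][i]
    if len(df['Requirements_content'][i]) < 10:
        df.loc[i, 'Requirements_content'] = df['Requirements_title'][i]

print(df)
```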
### Data augmentation

The problem can be further reduced to computing the relatedness between two texts.

1. The relatedness between text A and text B is also the relatedness between text B and text A, so every pair can be used in both directions. This improves the score when only the titles are used, but causes overfitting once the contents are added, so it was not adopted in the end.

2. Suppose the relatedness between text A and text B is 4 and the relatedness between text A and text C is 3; then the relatedness between text B and text C can be assumed to be 3. Following this idea, a relation matrix can be assumed (see the figures under `file/`).

Here, if the relatedness between A and B is i and the relatedness between A and C is j, the relatedness between B and C is R_(i,j). This adds 295,994 samples, from which 10,000 are randomly drawn according to the proportions of the relatedness levels in the original dataset. I think the idea has some potential, but because training takes very long and the number of submissions is limited, every parameter setting I tried caused overfitting. The final model therefore does not use data augmentation; a small illustration of the matrix follows below.
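A small illustration of the assumed relation matrix; the values mirror the `l` matrix in `preprocess/preprocess.py` (0 marks combinations for which no sample is generated):

```python
import numpy as np

# R[i-1][j-1] = assumed relatedness of B and C, given rel(A, B) = i and rel(A, C) = j.
R = np.array([[1, 1, 1, 1],
              [1, 0, 2, 2],
              [1, 2, 3, 3],
              [1, 2, 3, 4]])

i, j = 4, 3                 # rel(A, B) = 4, rel(A, C) = 3
print(R[i - 1][j - 1])      # -> 3, the assumed rel(B, C)
```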
### Models

1. A siamese BERT model over the concatenation of title and content

2. A siamese BERT model that scores the title pair and the content pair separately

3. A pseudo-siamese BERT model that scores the title pair and the content pair separately

(Model diagrams: see the figures under `file/`.)

Only the third model is used in the end, without any model fusion. It can simply be seen as two BERTs that each judge similarity, with their outputs concatenated afterwards. The BERT that computes the relatedness between the achievement title and the requirement title is Google's open-source BERT-base; the BERT that computes the relatedness between the achievement content and the requirement content is BERT-WWM, released by the HIT–iFLYTEK joint laboratory, whose whole-word-masking pretraining usually gives better results. For the title BERT, the maximum input length MaxLenT is set to 128 — the two concatenated titles never exceed 128 characters — which also reduces training time and GPU memory; for the content BERT-WWM, the maximum input length MaxLenC is set to 512 so that as much of the content as possible is read. Both BERTs are the 12-layer, 768-hidden, 12-head version. The model is trained with 7-fold cross-validation, batch size 16 and 8 epochs, keeping the best weights during training; the initial learning rate is 5e-5 and is later lowered to 1e-5. A condensed sketch of the model head is given below.
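A condensed sketch of this pseudo-siamese head, following `get_model()` in `train.py`; the two checkpoint paths are placeholders and would point to the folders under `ckpt/` in practice:

```python
from keras.layers import Input, Lambda, Concatenate, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras_bert import load_trained_model_from_checkpoint

# Placeholder paths; train.py derives them from the folder names under ckpt/.
title_bert = load_trained_model_from_checkpoint('bert_config.json', 'bert_model.ckpt')            # BERT-base for titles
content_bert = load_trained_model_from_checkpoint('bert_config_wwm.json', 'bert_model_wwm.ckpt')  # BERT-WWM for contents

t_tok, t_seg = Input(shape=(None,)), Input(shape=(None,))   # title pair tokens / segments (MaxLenT = 128)
c_tok, c_seg = Input(shape=(None,)), Input(shape=(None,))   # content pair tokens / segments (MaxLenC = 512)

t_cls = Lambda(lambda x: x[:, 0])(title_bert([t_tok, t_seg]))      # [CLS] vector of the title pair
c_cls = Lambda(lambda x: x[:, 0])(content_bert([c_tok, c_seg]))    # [CLS] vector of the content pair

h = Dense(384)(Concatenate(axis=-1)([t_cls, c_cls]))
out = Dense(4, activation='softmax')(h)                            # the four relatedness levels

model = Model([t_tok, t_seg, c_tok, c_seg], out)
model.compile(loss='categorical_crossentropy', optimizer=Adam(1e-5))
```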
## Prediction-bias post-processing

(See the figures under `file/`; the post-processing itself is implemented in `combine.py` and is sketched below.)
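After the per-fold test predictions are averaged, `combine.py` scales the probabilities of the middle levels (2 and 3) by a factor `a` (0.52 by default) and those of levels 1 and 4 by `1 - a` before taking the argmax. A minimal sketch with made-up probabilities:

```python
import numpy as np

a = 0.52                                          # same default as combine.py
probs = np.array([[0.30, 0.28, 0.22, 0.20]])      # made-up averaged probabilities for levels 1..4

weighted = probs * np.array([1 - a, a, a, 1 - a])
level = np.argmax(weighted, axis=1) + 1           # predicted Level; here the bias flips the pick from 1 to 2
print(weighted, level)
```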
## Model comparison

| No. | Model | Preliminary score | Final score |
|---|-----------------|-----------|-----------|
| 1 | BERT-base | 0.78585178 | 0.79595751 |
| 2 | RoBERTa-base | 0.78077936 | / |
| 3 | Siamese BERT-1 | 0.78604090 | 0.79607499 |
| 4 | Siamese BERT-2 | 0.78509617 | 0.79843128 |
| 5 | BERT + augmentation-1 | / | 0.80163449 |
| 6 | BERT + augmentation-2 | / | 0.77996242 |
| 7 | BERT + augmentation-3 | / | 0.79548806 |
| 8 | BERT-T128C512 | 0.79079902 | 0.79866767 |
| 9 | BERT-WWM-T128C512 | 0.79099053 | 0.80008900 |
| 10 | Final model | 0.79175758 | 0.80642748 |

1. BERT-base, RoBERTa-base and BERT + augmentation-1/2/3 take only the achievement title and the requirement title as input, with MaxLenT = 128; the remaining hyperparameters are basically the same as in the final model.

2. Siamese BERT-1 is the siamese BERT model over the concatenation of title and content, with MaxLen = 512; the remaining hyperparameters are basically the same as in the final model.

3. Siamese BERT-2 is the siamese BERT model that scores the title pair and the content pair separately, with MaxLen = 512; the remaining hyperparameters are basically the same as in the final model.

4. BERT + augmentation-1 uses the first augmentation scheme.

5. BERT + augmentation-2 uses the second augmentation scheme and keeps all augmented data.

6. BERT + augmentation-3 uses the second augmentation scheme, but samples the augmented data according to the level proportions of the original dataset.

7. BERT-T128C512 uses Google's BERT-base for both titles and contents; the remaining hyperparameters are the same as in the final model.

8. BERT-WWM-T128C512 uses the HIT–iFLYTEK BERT-WWM for both titles and contents; the remaining hyperparameters are the same as in the final model.

9. The final model uses Google's BERT-base for the titles and the HIT–iFLYTEK BERT-WWM for the contents.

## Summary

I personally think that BERT-WWM pretraining should work better than the original BERT pretraining for Chinese. A possible explanation for the results obtained here is that the two models were pretrained on different corpora: the titles are short and contain a high share of technical terms, to which BERT seems more sensitive, while BERT-WWM is more sensitive to regular text. Part of the final score is also due to the prediction-bias post-processing.

## Reproduction

For copyright reasons, the dataset is not included in this repository; it can be [downloaded](https://www.datafountain.cn/competitions/359/datasets) from the DataFountain platform.
After downloading the dataset, simply run `bash train.sh`; the expected input layout is listed below.
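The file names below are the ones read by `train.py` and `preprocess/preprocess.py`; the competition download is assumed to be unpacked into `./input/`:

- `./input/Train_Interrelation.csv`
- `./input/Train_Achievements.csv`
- `./input/Requirements.csv`
- `./input/TestPrediction.csv`
- `./input/Test_Achievements.csv`

`train.sh` then trains the model twice (`--counter 0` and `--counter 1`) and calls `python combine.py --k 2` to merge the two prediction files in `./submit/` into `submit.csv`.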
## Final remarks

### Thanks to open source

Many thanks to Chevalier from South China University of Technology for the [BaseLine](https://zhuanlan.zhihu.com/p/82737301) shared on Zhihu; this code is modified from it.

I have only recently started with deep learning and this is my first competition, so my skills are limited; criticism and corrections are welcome.
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
#! -*- coding:utf-8 -*-
import json
import numpy as np
from tqdm import tqdm
import time
import logging
from sklearn.model_selection import StratifiedKFold
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
from keras.optimizers import Adam
import keras.backend.tensorflow_backend as KTF
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
import tensorflow as tf
import os
import pandas as pd
from keras.utils.np_utils import to_categorical
from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score
import random
import argparse

# Hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("--counter", default=0, type=int, required=False)
parser.add_argument("--name", default='', type=str, required=False)
parser.add_argument("--model", default=0, type=int, required=False)
parser.add_argument("--model1", default=1, type=int, required=False)
parser.add_argument("--title_len", default=128, type=int, required=False)
parser.add_argument("--content_len", default=512, type=int, required=False)
parser.add_argument("--learning_rate", default=5e-5, type=float, required=False)
parser.add_argument("--min_learning_rate", default=1e-5, type=float, required=False)
parser.add_argument("--random_seed", default=123, type=int, required=False)
parser.add_argument("--batch_size", default=16, type=int, required=False)
parser.add_argument("--epoch", default=8, type=int, required=False)
parser.add_argument("--fold", default=7, type=int, required=False)

args = parser.parse_args()
counter = args.counter
name = args.name
model = args.model
model1 = args.model1
MAX_LENT = args.title_len
MAX_LENC = args.content_len
learning_rate = args.learning_rate
min_learning_rate = args.min_learning_rate
random_seed = args.random_seed
bs = args.batch_size
epoch = args.epoch
fold = args.fold

# Run on CPU
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Folders containing the pretrained models
bert_path = ['chinese_L-12_H-768_A-12',
             'chinese_wwm_ext_L-12_H-768_A-12',
             'chinese_roberta_wwm_ext_L-12_H-768_A-12',
             'roeberta_zh_L-24_H-1024_A-16']

# Do not grab all GPU memory; allocate on demand
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
KTF.set_session(sess)

# Load the chosen pretrained models
config_path = './ckpt/' + bert_path[model] + '/bert_config.json'
checkpoint_path = './ckpt/' + bert_path[model] + '/bert_model.ckpt'
dict_path = './ckpt/' + bert_path[model] + '/vocab.txt'
config_path1 = './ckpt/' + bert_path[model1] + '/bert_config.json'
checkpoint_path1 = './ckpt/' + bert_path[model1] + '/bert_model.ckpt'
dict_path1 = './ckpt/' + bert_path[model1] + '/vocab.txt'

# Load the vocabularies
token_dict = {}
with open(dict_path, 'r', encoding='utf-8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)
tokenizer = Tokenizer(token_dict)
token_dict1 = {}
with open(dict_path1, 'r', encoding='utf-8') as reader:
    for line in reader:
        token = line.strip()
        token_dict1[token] = len(token_dict1)
tokenizer1 = Tokenizer(token_dict1)

file_path = './log/'
# Create a logger
logger = logging.getLogger('mylogger')
logger.setLevel(logging.DEBUG)

# Create a file handler
timestamp = time.strftime("%Y.%m.%d_%H.%M.%S", time.localtime())
fh = logging.FileHandler(file_path + 'log_' + timestamp + '.txt')
fh.setLevel(logging.DEBUG)

# Create another handler that writes to the console
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

# Define the output format of the handlers
formatter = logging.Formatter('[%(asctime)s][%(levelname)s] ## %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# Attach the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)


# Data reading and preprocessing
def read_data(file_path, id, name):
    train_id = []
    train_title = []
    train_text = []
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for idx, line in enumerate(f):
            line = line.strip().split(',')
            train_id.append(line[0].replace('\'', '').replace(' ', ''))
            train_title.append(line[1])
            train_text.append(','.join(line[2:]))
    output = pd.DataFrame(dtype=str)
    output[id] = train_id
    output[name + '_title'] = train_title
    output[name + '_content'] = train_text
    return output


# Read the data files
train_interrelation = pd.read_csv('./input/Train_Interrelation.csv', dtype=str)
Train_Achievements = read_data('./input/Train_Achievements.csv', 'Aid', 'Achievements')
Requirements = read_data('./input/Requirements.csv', 'Rid', 'Requirements')
TestPrediction = pd.read_csv('./input/TestPrediction.csv', dtype=str)
Test_Achievements = read_data('./input/Test_Achievements.csv', 'Aid', 'Achievements')

# Merge the train and test tables and keep the useful columns
train = pd.merge(train_interrelation, Train_Achievements, on='Aid', how='left')
train = pd.merge(train, Requirements, on='Rid', how='left')
test = pd.merge(TestPrediction, Test_Achievements, on='Aid', how='left')
test = pd.merge(test, Requirements, on='Rid', how='left')


# Preprocess the data:
# replace contents that are blank or useless (e.g. "图片") with the corresponding title
for i in range(len(train)):
    if len(train['Achievements_content'][i]) < 14:
        train['Achievements_content'][i] = train['Achievements_title'][i]
    if len(train['Requirements_content'][i]) < 10:
        train['Requirements_content'][i] = train['Requirements_title'][i]
print("train preprocessing done")

for i in range(len(test)):
    if len(test['Achievements_content'][i]) < 14:
        test['Achievements_content'][i] = test['Achievements_title'][i]
    if len(test['Requirements_content'][i]) < 10:
        test['Requirements_content'][i] = test['Requirements_title'][i]
print("test preprocessing done")

train_achievements = train['Achievements_title'].values
train_requirements = train['Requirements_title'].values
train_achievementsc = train['Achievements_content'].values
train_requirementsc = train['Requirements_content'].values

test_achievements = test['Achievements_title'].values
test_requirements = test['Requirements_title'].values
test_achievementsc = test['Achievements_content'].values
test_requirementsc = test['Requirements_content'].values

labels = train['Level'].astype(int).values - 1
labels_cat = to_categorical(labels)
labels_cat = labels_cat.astype(np.int32)


# Batch generator
class data_generator:
    def __init__(self, data, batch_size=bs):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data[0]) // self.batch_size
        if len(self.data[0]) % self.batch_size != 0:
            self.steps += 1

    def __len__(self):
        return self.steps

    def __iter__(self):
        while True:
            X1, X2, X3, X4, y = self.data
            idxs = list(range(len(self.data[0])))
            np.random.shuffle(idxs)
            T, T_, Y = [], [], []
            X, X_, Z = [], [], []
            for c, i in enumerate(idxs):
                achievements = X1[i]
                requirements = X2[i]
                achievementsc = X3[i]
                requirementsc = X4[i]
                t, t_ = tokenizer.encode(first=achievements, second=requirements, max_len=MAX_LENT)
                x, x_ = tokenizer1.encode(first=achievementsc, second=requirementsc, max_len=MAX_LENC)
                T.append(t)
                T_.append(t_)
                X.append(x)
                X_.append(x_)
                Y.append(y[i])
                if len(T) == self.batch_size or i == idxs[-1]:
                    T = np.array(T)
                    T_ = np.array(T_)
                    X = np.array(X)
                    X_ = np.array(X_)
                    Y = np.array(Y)
                    yield [T, T_, X, X_], Y
                    T, T_, Y = [], [], []
                    X, X_, Z = [], [], []


# Build the model:
# the first BERT judges the similarity of the two titles,
# the second BERT judges the similarity of the two contents,
# the two [CLS] vectors are concatenated and fed to the classifier
def get_model():
    bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
    bert_model1 = load_trained_model_from_checkpoint(config_path1, checkpoint_path1)
    # note: only the title BERT's layers are explicitly set trainable here
    for l in bert_model.layers:
        l.trainable = True

    T1 = Input(shape=(None,))
    T2 = Input(shape=(None,))
    X1 = Input(shape=(None,))
    X2 = Input(shape=(None,))

    T = bert_model([T1, T2])
    X = bert_model1([X1, X2])

    T = Lambda(lambda x: x[:, 0])(T)
    X = Lambda(lambda x: x[:, 0])(X)

    T = Concatenate(axis=-1)([T, X])
    T = Dense(384)(T)
    # T = Dropout(0.1)(T)
    output = Dense(4, activation='softmax')(T)

    model = Model([T1, T2, X1, X2], output)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        metrics=['MAE']
    )
    model.summary()
    return model


class Evaluate(Callback):
    def __init__(self, val_data, val_index):
        self.score = []
        self.best = 0.
        self.early_stopping = 0
        self.val_data = val_data
        self.val_index = val_index
        self.predict = []
        self.lr = 0
        self.passed = 0

    # The first epoch warms the learning rate up, the second decays it to the minimum
    def on_batch_begin(self, batch, logs=None):
        if self.passed < self.params['steps']:
            self.lr = (self.passed + 1.) / self.params['steps'] * learning_rate
            K.set_value(self.model.optimizer.lr, self.lr)
            self.passed += 1
        elif self.params['steps'] <= self.passed < self.params['steps'] * 2:
            self.lr = (2 - (self.passed + 1.) / self.params['steps']) * (learning_rate - min_learning_rate)
            self.lr += min_learning_rate
            K.set_value(self.model.optimizer.lr, self.lr)
            self.passed += 1

    def on_epoch_end(self, epoch, logs=None):
        score, acc, f1 = self.evaluate()
        if score > self.best:
            self.best = score
            self.early_stopping = 0
            model.save_weights('./model_save/bert{}.w'.format(fold))
        else:
            self.early_stopping += 1
        logger.info('fold: %d, lr: %.6f, score: %.4f, acc: %.4f, f1: %.4f,best: %.4f\n' % (
            fold, self.lr, score, acc, f1, self.best))

    def evaluate(self):
        self.predict = []
        prob = []
        val_x1, val_x2, val_x3, val_x4, val_y, val_cat = self.val_data

        for i in tqdm(range(len(val_x1))):
            achievements = val_x1[i]
            requirements = val_x2[i]
            achievementsc = val_x3[i]
            requirementsc = val_x4[i]

            t1, t1_ = tokenizer.encode(first=achievements, second=requirements, max_len=MAX_LENT)
            x1, x1_ = tokenizer1.encode(first=achievementsc, second=requirementsc, max_len=MAX_LENC)

            T1, T1_ = np.array([t1]), np.array([t1_])
            X1, X1_ = np.array([x1]), np.array([x1_])

            _prob = model.predict([T1, T1_, X1, X1_])

            oof_train[self.val_index[i]] = _prob[0]
            self.predict.append(np.argmax(_prob, axis=1)[0] + 1)
            prob.append(_prob[0])

        score = 1.0 / (1 + mean_absolute_error(val_y + 1, self.predict))
        acc = accuracy_score(val_y + 1, self.predict)
        f1 = f1_score(val_y + 1, self.predict, average='macro')
        return score, acc, f1


skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=random_seed)


def predict(data):
    prob = []
    val_x1, val_x2, val_x3, val_x4 = data
    for i in tqdm(range(len(val_x1))):
        achievements = val_x1[i]
        requirements = val_x2[i]
        achievementsc = val_x3[i]
        requirementsc = val_x4[i]

        t1, t1_ = tokenizer.encode(first=achievements, second=requirements, max_len=MAX_LENT)
        x1, x1_ = tokenizer1.encode(first=achievementsc, second=requirementsc, max_len=MAX_LENC)

        T1, T1_ = np.array([t1]), np.array([t1_])
        X1, X1_ = np.array([x1]), np.array([x1_])

        _prob = model.predict([T1, T1_, X1, X1_])
        prob.append(_prob[0])
    return prob


oof_train = np.zeros((len(train), 4), dtype=np.float32)
oof_test = np.zeros((len(test), 4), dtype=np.float32)
logger.info("Loading {} and {}".format(bert_path[model], bert_path[model1]))

timestamp = time.time()

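# The loop below runs stratified K-fold training (K = --fold, 7 by default):
# for each fold a fresh dual-BERT model is built, trained with the Evaluate
# callback (which warms the learning rate up to --learning_rate during the first
# epoch, decays it to --min_learning_rate during the second, and keeps the best
# weights by validation score), the best checkpoint is reloaded, and its test-set
# probabilities are accumulated into oof_test before the session is cleared.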
for fold, (train_index, valid_index) in enumerate(skf.split(train_achievements, labels)):
    logger.info('------------ %d fold take: %.1f minute ------------' % (fold, (time.time() - timestamp) / 60))
    timestamp = time.time()
    x1 = train_achievements[train_index]
    x2 = train_requirements[train_index]
    x3 = train_achievementsc[train_index]
    x4 = train_requirementsc[train_index]
    y = labels_cat[train_index]

    val_x1 = train_achievements[valid_index]
    val_x2 = train_requirements[valid_index]
    val_x3 = train_achievementsc[valid_index]
    val_x4 = train_requirementsc[valid_index]
    val_y = labels[valid_index]
    val_cat = labels_cat[valid_index]

    train_D = data_generator([x1, x2, x3, x4, y])
    evaluator = Evaluate([val_x1, val_x2, val_x3, val_x4, val_y, val_cat], valid_index)

    model = get_model()
    model.fit_generator(train_D.__iter__(),
                        steps_per_epoch=len(train_D),
                        epochs=epoch,
                        callbacks=[evaluator]
                        )
    model.load_weights('./model_save/bert{}.w'.format(fold))
    oof_test += predict([test_achievements, test_requirements, test_achievementsc, test_requirementsc])
    K.clear_session()

oof_test /= skf.n_splits  # average the accumulated test predictions over the CV folds

cv_score = 1.0 / (1 + mean_absolute_error(labels + 1, np.argmax(oof_train, axis=1) + 1))
logger.info(cv_score)

np.savetxt('./submit/w_{}.txt'.format(counter), oof_test)
test['Level'] = np.argmax(oof_test, axis=1) + 1
test[['Guid', 'Level']].to_csv('./submit/{}.csv'.format(counter), index=False)
--------------------------------------------------------------------------------