├── log
│   └── log.txt
├── input
│   └── input.txt
├── submit
│   └── submit.txt
├── model_save
│   └── model_save.txt
├── preprocess
│   ├── input
│   │   └── input.txt
│   └── preprocess.py
├── file
│   ├── 1.png
│   ├── 2.png
│   ├── 3.png
│   ├── 4.png
│   ├── 5.png
│   ├── 6.png
│   ├── 7.png
│   └── 【2019 CCF BDCI】-关联模型-莽就完事了-说明论文.pdf
├── requirements.txt
├── ckpt
│   ├── 如果百度云下载过慢,该文件夹内容下载地址.txt
│   ├── chinese_L-12_H-768_A-12
│   │   └── bert_config.json
│   └── chinese_wwm_ext_L-12_H-768_A-12
│       └── bert_config.json
├── train.sh
├── combine.py
├── README.md
└── train.py
/log/log.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/input/input.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/submit/submit.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/model_save/model_save.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/preprocess/input/input.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/file/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/1.png
--------------------------------------------------------------------------------
/file/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/2.png
--------------------------------------------------------------------------------
/file/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/3.png
--------------------------------------------------------------------------------
/file/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/4.png
--------------------------------------------------------------------------------
/file/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/5.png
--------------------------------------------------------------------------------
/file/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/6.png
--------------------------------------------------------------------------------
/file/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/7.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.16.3
2 | pandas==0.24.2
3 | tensorflow-gpu==1.14.0
4 | keras==2.3.1
5 | keras-bert==0.80.0
6 | tqdm==4.36.1
--------------------------------------------------------------------------------
/file/【2019 CCF BDCI】-关联模型-莽就完事了-说明论文.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Makaixin/Correlation-between-requirements-and-achievements/HEAD/file/【2019 CCF BDCI】-关联模型-莽就完事了-说明论文.pdf
--------------------------------------------------------------------------------
/ckpt/如果百度云下载过慢,该文件夹内容下载地址.txt:
--------------------------------------------------------------------------------
1 | Address: https://pan.iflytek.com/#/link/8AA4B23D9BCBCBA0187EE58234332B46  Password: thGd
2 |
3 | https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
4 |
--------------------------------------------------------------------------------
/train.sh:
--------------------------------------------------------------------------------
1 | export CUDA_VISIBLE_DEVICES=0
2 | for((i=0;i<2;i++));
3 | do
4 |
5 | python train.py \
6 | --counter $i \
7 | --name bertt128_bertwwm512 \
8 | --model 0 \
9 | --model1 1 \
10 | --title_len 128 \
11 | --content_len 512 \
12 | --learning_rate 5e-5 \
13 | --min_learning_rate 1e-5 \
14 | --random_seed 123 \
15 | --batch_size 16 \
16 | --epoch 8 \
17 | --fold 7
18 |
19 | done
20 |
21 | python combine.py --k 2
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/ckpt/chinese_L-12_H-768_A-12/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention_probs_dropout_prob": 0.1,
3 | "directionality": "bidi",
4 | "hidden_act": "gelu",
5 | "hidden_dropout_prob": 0.1,
6 | "hidden_size": 768,
7 | "initializer_range": 0.02,
8 | "intermediate_size": 3072,
9 | "max_position_embeddings": 512,
10 | "num_attention_heads": 12,
11 | "num_hidden_layers": 12,
12 | "pooler_fc_size": 768,
13 | "pooler_num_attention_heads": 12,
14 | "pooler_num_fc_layers": 3,
15 | "pooler_size_per_head": 128,
16 | "pooler_type": "first_token_transform",
17 | "type_vocab_size": 2,
18 | "vocab_size": 21128
19 | }
20 |
--------------------------------------------------------------------------------
/ckpt/chinese_wwm_ext_L-12_H-768_A-12/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "attention_probs_dropout_prob": 0.1,
3 | "directionality": "bidi",
4 | "hidden_act": "gelu",
5 | "hidden_dropout_prob": 0.1,
6 | "hidden_size": 768,
7 | "initializer_range": 0.02,
8 | "intermediate_size": 3072,
9 | "max_position_embeddings": 512,
10 | "num_attention_heads": 12,
11 | "num_hidden_layers": 12,
12 | "pooler_fc_size": 768,
13 | "pooler_num_attention_heads": 12,
14 | "pooler_num_fc_layers": 3,
15 | "pooler_size_per_head": 128,
16 | "pooler_type": "first_token_transform",
17 | "type_vocab_size": 2,
18 | "vocab_size": 21128
19 | }
20 |
--------------------------------------------------------------------------------
/combine.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--k", default=6, type=int, required=False)
7 | parser.add_argument("--a", default=0.52, type=float, required=False)
8 | parser.add_argument("--op", default=0, type=int, required=False)
9 | parser.add_argument("--n", default=0, type=int, required=False)
10 | args = parser.parse_args()
11 | k = args.k
12 | a = args.a
13 | op = args.op
14 | n = args.n
15 |
16 | test = pd.read_csv('./input/TestPrediction.csv')   # test id table (provides Guid)
17 | oof_test = np.loadtxt('./submit/w_0.txt')           # test probabilities from run 0, saved by train.py
18 | print(oof_test)
19 | print('-----------------------')
20 | oof_test1 = np.loadtxt('./submit/w_{}.txt'.format(op))
21 | oof_test += oof_test1 * n   # optionally add run `op` again with weight n (default n = 0, i.e. disabled)
22 | print(oof_test)
23 | print('-----------------------')
24 |
25 | for i in range(1, k):   # sum the predictions of the remaining runs
26 | oof_test1 = np.loadtxt('./submit/w_{}.txt'.format(i))
27 | oof_test += oof_test1
28 |
29 | print(oof_test)
30 | print('-----------------------')
31 | for i in range(len(oof_test)):   # prediction-bias adjustment: scale levels 1/4 by (1 - a), levels 2/3 by a
32 | oof_test[i][0] = oof_test[i][0] * (1 - a)
33 | oof_test[i][1] = oof_test[i][1] * a
34 | oof_test[i][2] = oof_test[i][2] * a
35 | oof_test[i][3] = oof_test[i][3] * (1 - a)
36 | print(oof_test)
37 | test['Level'] = np.argmax(oof_test, axis=1) + 1
38 | test[['Guid', 'Level']].to_csv('./submit/submit.csv', index=False)
39 |
--------------------------------------------------------------------------------
/preprocess/preprocess.py:
--------------------------------------------------------------------------------
1 | #! -*- coding:utf-8 -*-
2 | import numpy as np
3 | import pandas as pd
4 | import random
5 | 
6 | # Data augmentation: for two achievements B and C linked to the same requirement
7 | # with levels i and j, add a synthetic B-C pair whose level is l[i-1][j-1];
8 | # k[i-1][j-1] controls the sampling rate (a pair is kept with probability 1/(k + 1)).
9 | 
10 |
11 | l = np.array([['1', '1', '1', '1'],
12 | ['1', '0', '2', '2'],
13 | ['1', '2', '3', '3'],
14 | ['1', '2', '3', '4']])
15 | k = np.array([[55, 55, 55, 55],
16 | [55, 0, 6, 6],
17 | [55, 6, 8, 9],
18 | [55, 6, 9, 2]])
19 |
20 | random.seed(123)
21 |
22 | train_interrelation = pd.read_csv('./input/Train_Interrelation.csv', dtype=str)
23 | print("Train_Interrelation", len(train_interrelation))
24 |
25 | sum = [0 for i in range(5)]
26 | for i in train_interrelation['Level']:
27 | sum[int(i)] += 1
28 | for i in range(1, 5):
29 |     print('Level', i, sum[i])
30 |
31 | tur = set()
32 | for i in train_interrelation['Rid']:
33 | tur.add(i)
34 | print(len(tur))
35 | train_re = train_interrelation.sort_values(['Rid'])
36 | sum = 0
37 | for i in range(len(train_re)):
38 | j = 1
39 | flag = 0
40 | while i + j < len(train_re) and train_re['Rid'].values[i] == train_re['Rid'].values[i + j]:
41 | flag = 1
42 | if flag == 1 and train_re['Rid'].values[i] != train_re['Rid'].values[i + j]:
43 | break
44 | c = int(train_re['Level'].values[i]) - 1
45 | r = int(train_re['Level'].values[i + j]) - 1
46 | if l[c][r] != '0' and random.randint(0, k[c][r]) == 0:
47 | sum += 1
48 | # print(train_re['Rid'].values[i], train_re['Aid'].values[i], train_re['Aid'].values[i + j])
49 | train_interrelation.loc[i * 1000000 + j] = \
50 | [train_re['Rid'].values[i],
51 | train_re['Aid'].values[i],
52 | train_re['Aid'].values[i + j],
53 | l[int(train_re['Level'].values[i]) - 1][int(train_re['Level'].values[i + j]) - 1]]
54 | j += 1
55 |
56 | train_interrelation.to_csv('./Train_Interrelation.csv', index=False)
57 |
58 | print("Train_Interrelation", len(train_interrelation))
59 | print('sum', sum)
60 |
61 | sum = [0 for i in range(5)]
62 | for i in train_interrelation['Level']:
63 | sum[int(i)] += 1
64 | for i in range(1, 5):
65 |     print('Level', i, sum[i])
66 |
67 |
68 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 2019 CCF BDCI "Technical Requirements" and "Technical Achievements" Relevance Computation Model: First Place Solution
2 | 
3 | ## Competition Overview
4 | 
5 | See the [official website](https://www.datafountain.cn/competitions/359) for the task description.
6 | 
7 | ## Data Preprocessing
8 | 
9 | ### Data Cleaning
10 |
11 |
12 |
13 | Inspecting the shorter content fields of the technical achievements and technical requirements revealed a fair amount of useless content such as blanks, "\n" and "未提供。" ("not provided"). These useless contents are replaced with the corresponding titles (a sketch follows below).
14 |
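A minimal sketch of this cleaning rule as it is applied in `train.py` (the column names come from its `read_data` helper; the length thresholds 14 and 10 are the ones used there; the function name is illustrative):

```python
import pandas as pd

def clean_contents(df: pd.DataFrame) -> pd.DataFrame:
    """Replace near-empty content fields (blank, newline-only, '未提供。', ...) with the title."""
    for i in range(len(df)):
        if len(df['Achievements_content'][i]) < 14:
            df.loc[i, 'Achievements_content'] = df['Achievements_title'][i]
        if len(df['Requirements_content'][i]) < 10:
            df.loc[i, 'Requirements_content'] = df['Requirements_title'][i]
    return df
```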
15 | ### Data Augmentation
16 | 
17 | The task can be further reduced to computing the relevance between two texts.
18 | 
19 | 1. The relevance between text A and text B is also the relevance between text B and text A, so each pair can be added again in reversed order. This improved the score when only titles were used, but caused overfitting once contents were added, so it was not adopted in the end.
20 | 
21 | 2. If the relevance between text A and text B is 4 and the relevance between text A and text C is 3, we can assume that the relevance between text B and text C is 3. Following this idea, a relation matrix can be assumed (reproduced in the sketch at the end of this section):
22 |
23 |
24 |
25 | where, if the relevance between text A and text B is i and the relevance between text A and text C is j, the relevance between text B and text C is R_(i,j). This method can add 295,994 samples, from which 10,000 are randomly drawn following the proportion of each relevance level in the original dataset. I believe the approach has some merit, but because training takes a long time and submissions are limited, every parameter setting I tried caused overfitting. The final model therefore does not use data augmentation.
26 |
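The relation matrix actually used in `preprocess/preprocess.py` is reproduced below; entry `l[i-1][j-1]` is the assumed level R_(i,j) for the derived pair, and `'0'` marks combinations for which no pair is generated:

```python
import numpy as np

# Relation matrix: requirement A relates to achievement B with level i (rows)
# and to achievement C with level j (columns); the derived B-C pair gets level l[i-1][j-1].
l = np.array([['1', '1', '1', '1'],
              ['1', '0', '2', '2'],
              ['1', '2', '3', '3'],
              ['1', '2', '3', '4']])
```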
27 | ### Models
28 | 
29 | 1. Siamese BERT with title and content concatenated
30 |
31 |
32 |
33 | 2. Siamese BERT that judges title relevance and content relevance separately
34 |
35 |
36 |
37 | 3. Pseudo-siamese BERT that judges title relevance and content relevance separately
38 |
39 |
40 |
41 | Only this last model is used in the end, without any ensembling. It can simply be seen as two BERTs performing similarity judgment separately, whose outputs are then concatenated.
42 | The BERT that computes the relevance between achievement titles and requirement titles is Google's open-source BERT-base; the BERT that computes the relevance between achievement contents and requirement contents is BERT-WWM, released by the HIT-iFLYTEK joint lab (HFL) and pretrained with whole word masking, which tends to give better results in most cases. For the first BERT (titles), the maximum input length MaxLenT is set to 128, since no concatenated title pair exceeds 128 characters; this also reduces training time and GPU memory usage. For the second BERT-WWM (contents), the maximum input length MaxLenC is set to 512 so that as much of the content as possible is read. Both BERTs are the 12-layer, 768-hidden-state, 12-head version. The model is trained with 7-fold cross-validation, batch size 16 and 8 epochs, saving the best weights during training; the initial learning rate is 5e-5 and the subsequent learning rate is 1e-5. A condensed sketch of the architecture follows.
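The sketch below mirrors `get_model()` in `train.py` (the function and argument names here are illustrative; the config/checkpoint paths are assumed to point at the two pretrained models under `ckpt/`):

```python
from keras_bert import load_trained_model_from_checkpoint
from keras.layers import Input, Lambda, Concatenate, Dense
from keras.models import Model
from keras.optimizers import Adam

def build_pseudo_siamese(config_t, ckpt_t, config_c, ckpt_c):
    bert_title = load_trained_model_from_checkpoint(config_t, ckpt_t)      # BERT-base for the titles
    bert_content = load_trained_model_from_checkpoint(config_c, ckpt_c)    # BERT-WWM for the contents
    for layer in bert_title.layers:    # as in train.py, the title BERT's layers are set trainable
        layer.trainable = True

    t_tok, t_seg = Input(shape=(None,)), Input(shape=(None,))
    c_tok, c_seg = Input(shape=(None,)), Input(shape=(None,))
    t_cls = Lambda(lambda x: x[:, 0])(bert_title([t_tok, t_seg]))      # [CLS] of the title pair
    c_cls = Lambda(lambda x: x[:, 0])(bert_content([c_tok, c_seg]))    # [CLS] of the content pair

    h = Dense(384)(Concatenate(axis=-1)([t_cls, c_cls]))
    out = Dense(4, activation='softmax')(h)                            # 4 relevance levels
    model = Model([t_tok, t_seg, c_tok, c_seg], out)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(1e-5), metrics=['MAE'])
    return model
```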
43 |
44 | ## Prediction Bias Handling
45 |
46 |
47 |
48 |
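The adjustment itself is implemented in `combine.py`: the summed test probabilities are reweighted so that levels 1 and 4 are scaled by (1 - a) and levels 2 and 3 by a (a = 0.52 by default) before taking the argmax. A minimal sketch, assuming a single probability matrix with one column per level:

```python
import numpy as np

def reweight(prob: np.ndarray, a: float = 0.52) -> np.ndarray:
    """Scale level-1/4 probabilities by (1 - a) and level-2/3 probabilities by a."""
    weights = np.array([1 - a, a, a, 1 - a])
    return prob * weights   # broadcasts over the rows

# levels = np.argmax(reweight(summed_prob), axis=1) + 1
```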
49 | ## Model Comparison
50 | 
51 | | No. | Model | Preliminary Round | Final Round |
52 | |---|----------------------------|------------|------------|
53 | | 1 | BERT-base | 0.78585178 | 0.79595751 |
54 | | 2 | RoBERTa-base | 0.78077936 | / |
55 | | 3 | Siamese BERT-1 | 0.78604090 | 0.79607499 |
56 | | 4 | Siamese BERT-2 | 0.78509617 | 0.79843128 |
57 | | 5 | BERT + Data Augmentation-1 | / | 0.80163449 |
58 | | 6 | BERT + Data Augmentation-2 | / | 0.77996242 |
59 | | 7 | BERT + Data Augmentation-3 | / | 0.79548806 |
60 | | 8 | BERT-T128C512 | 0.79079902 | 0.79866767 |
61 | | 9 | BERT-WWM-T128C512 | 0.79099053 | 0.80008900 |
62 | | 10 | Final model | 0.79175758 | 0.80642748 |
63 |
64 | 1. The BERT-base, RoBERTa-base and BERT + Data Augmentation-1/2/3 models take only the achievement title and the requirement title as input, with MaxLenT = 128; the remaining hyperparameters are essentially the same as in the final model.
65 | 
66 | 2. Siamese BERT-1 is the siamese BERT with title and content concatenated, with MaxLen = 512; the remaining hyperparameters are essentially the same as in the final model.
67 | 
68 | 3. Siamese BERT-2 is the siamese BERT that judges title relevance and content relevance separately, with MaxLen = 512; the remaining hyperparameters are essentially the same as in the final model.
69 | 
70 | 4. In BERT + Data Augmentation-1, the first augmentation method is used.
71 | 
72 | 5. In BERT + Data Augmentation-2, the second augmentation method is used, keeping all of the augmented data.
73 | 
74 | 6. In BERT + Data Augmentation-3, the second augmentation method is used, but samples are randomly drawn following the proportion of each relevance level in the original dataset.
75 | 
76 | 7. In BERT-T128C512, both BERTs are Google's BERT-base; the remaining hyperparameters are the same as in the final model.
77 | 
78 | 8. In BERT-WWM-T128C512, both BERTs are HFL's BERT-WWM; the remaining hyperparameters are the same as in the final model.
79 | 
80 | 9. In the final model, the titles use Google's BERT-base and the contents use HFL's BERT-WWM.
81 |
82 | ## Summary
83 | Personally I would expect BERT-WWM pretraining to work better for Chinese than plain BERT pretraining. The result obtained here may be due to the different corpora used to pretrain the two models: the title fields are short and dominated by technical terms, to which BERT seems more sensitive, while BERT-WWM seems more sensitive to ordinary text. Part of the score is of course also owed to the prediction bias handling.
84 |
85 | ## Reproduction
86 | Due to copyright restrictions, this project does not provide the dataset; it can be [downloaded](https://www.datafountain.cn/competitions/359/datasets) from the DataFountain platform.
87 | After downloading the dataset, simply run `bash train.sh`.
88 |
89 | ## Closing Remarks
90 | 
91 | ### Thanks to the Open-Source Community
92 | 
93 | Many thanks to Chevalier from South China University of Technology for the [baseline](https://zhuanlan.zhihu.com/p/82737301) shared on Zhihu during this competition; this code is modified from it.
94 | 
95 | Since I have only just started learning deep learning and this was my first competition, my experience is limited; criticism and corrections are welcome.
96 |
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #! -*- coding:utf-8 -*-
2 | import json
3 | import numpy as np
4 | from tqdm import tqdm
5 | import time
6 | import logging
7 | from sklearn.model_selection import StratifiedKFold
8 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer
9 | from keras.optimizers import Adam
10 | import keras.backend.tensorflow_backend as KTF
11 | from keras.layers import *
12 | from keras.models import Model
13 | import keras.backend as K
14 | from keras.callbacks import Callback
15 | import tensorflow as tf
16 | import os
17 | import pandas as pd
18 | from keras.utils.np_utils import to_categorical
19 | from sklearn.metrics import mean_absolute_error, accuracy_score, f1_score
20 | import random
21 | import argparse
22 |
23 | # Hyperparameters
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("--counter", default=0, type=int, required=False)
26 | parser.add_argument("--name", default='', type=str, required=False)
27 | parser.add_argument("--model", default=0, type=int, required=False)
28 | parser.add_argument("--model1", default=1, type=int, required=False)
29 | parser.add_argument("--title_len", default=128, type=int, required=False)
30 | parser.add_argument("--content_len", default=512, type=int, required=False)
31 | parser.add_argument("--learning_rate", default=5e-5, type=float, required=False)
32 | parser.add_argument("--min_learning_rate", default=1e-5, type=float, required=False)
33 | parser.add_argument("--random_seed", default=123, type=int, required=False)
34 | parser.add_argument("--batch_size", default=16, type=int, required=False)
35 | parser.add_argument("--epoch", default=8, type=int, required=False)
36 | parser.add_argument("--fold", default=7, type=int, required=False)
37 |
38 | args = parser.parse_args()
39 | counter = args.counter
40 | name = args.name
41 | model = args.model
42 | model1 = args.model1
43 | MAX_LENT = args.title_len
44 | MAX_LENC = args.content_len
45 | learning_rate = args.learning_rate
46 | min_learning_rate = args.min_learning_rate
47 | random_seed = args.random_seed
48 | bs = args.batch_size
49 | epoch = args.epoch
50 | fold = args.fold
51 |
52 | # Run on CPU (uncomment to disable the GPU)
53 | # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
54 | # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
55 |
56 | # Folders containing the pretrained models
57 | bert_path = ['chinese_L-12_H-768_A-12',
58 | 'chinese_wwm_ext_L-12_H-768_A-12',
59 | 'chinese_roberta_wwm_ext_L-12_H-768_A-12',
60 | 'roeberta_zh_L-24_H-1024_A-16']
61 |
62 | # Do not occupy all GPU memory; allocate on demand
63 | config = tf.ConfigProto()
64 | config.gpu_options.allow_growth = True
65 | sess = tf.Session(config=config)
66 | KTF.set_session(sess)
67 |
68 | # Load the selected pretrained models
69 | config_path = './ckpt/' + bert_path[model] + '/bert_config.json'
70 | checkpoint_path = './ckpt/' + bert_path[model] + '/bert_model.ckpt'
71 | dict_path = './ckpt/' + bert_path[model] + '/vocab.txt'
72 | config_path1 = './ckpt/' + bert_path[model1] + '/bert_config.json'
73 | checkpoint_path1 = './ckpt/' + bert_path[model1] + '/bert_model.ckpt'
74 | dict_path1 = './ckpt/' + bert_path[model1] + '/vocab.txt'
75 |
76 | # Load the vocabularies
77 | token_dict = {}
78 | with open(dict_path, 'r', encoding='utf-8') as reader:
79 | for line in reader:
80 | token = line.strip()
81 | token_dict[token] = len(token_dict)
82 | tokenizer = Tokenizer(token_dict)
83 | token_dict1 = {}
84 | with open(dict_path1, 'r', encoding='utf-8') as reader:
85 | for line in reader:
86 | token = line.strip()
87 | token_dict1[token] = len(token_dict1)
88 | tokenizer1 = Tokenizer(token_dict1)
89 |
90 | file_path = './log/'
91 | # Create a logger
92 | logger = logging.getLogger('mylogger')
93 | logger.setLevel(logging.DEBUG)
94 |
95 | # Create a handler that writes to a log file
96 | timestamp = time.strftime("%Y.%m.%d_%H.%M.%S", time.localtime())
97 | fh = logging.FileHandler(file_path + 'log_' + timestamp + '.txt')
98 | fh.setLevel(logging.DEBUG)
99 |
100 | # Create another handler that writes to the console
101 | ch = logging.StreamHandler()
102 | ch.setLevel(logging.DEBUG)
103 |
104 | # Define the output format of the handlers
105 | formatter = logging.Formatter('[%(asctime)s][%(levelname)s] ## %(message)s')
106 | fh.setFormatter(formatter)
107 | ch.setFormatter(formatter)
108 | # Attach the handlers to the logger
109 | logger.addHandler(fh)
110 | logger.addHandler(ch)
111 |
112 |
113 | # Data loading and preprocessing
114 | def read_data(file_path, id, name):
115 | train_id = []
116 | train_title = []
117 | train_text = []
118 | with open(file_path, 'r', encoding='utf-8-sig') as f:
119 | for idx, line in enumerate(f):
120 | line = line.strip().split(',')
121 | train_id.append(line[0].replace('\'', '').replace(' ', ''))
122 | train_title.append(line[1])
123 | train_text.append(','.join(line[2:]))
124 | output = pd.DataFrame(dtype=str)
125 | output[id] = train_id
126 | output[name + '_title'] = train_title
127 | output[name + '_content'] = train_text
128 | return output
129 |
130 |
131 | # Read the data
132 | train_interrelation = pd.read_csv('./input/Train_Interrelation.csv', dtype=str)
133 | Train_Achievements = read_data('./input/Train_Achievements.csv', 'Aid', 'Achievements')
134 | Requirements = read_data('./input/Requirements.csv', 'Rid', 'Requirements')
135 | TestPrediction = pd.read_csv('./input/TestPrediction.csv', dtype=str)
136 | Test_Achievements = read_data('./input/Test_Achievements.csv', 'Aid', 'Achievements')
137 |
138 | # Merge the train/test tables into single tables with the useful columns
139 | train = pd.merge(train_interrelation, Train_Achievements, on='Aid', how='left')
140 | train = pd.merge(train, Requirements, on='Rid', how='left')
141 | test = pd.merge(TestPrediction, Test_Achievements, on='Aid', how='left')
142 | test = pd.merge(test, Requirements, on='Rid', how='left')
143 |
144 |
145 | # Preprocess the data:
146 | # replace contents that are blank or useless (e.g. "图片", i.e. "image") with the corresponding title
147 | for i in range(len(train)):
148 |     if len(train['Achievements_content'][i]) < 14:
149 |         train.loc[i, 'Achievements_content'] = train['Achievements_title'][i]
150 |     if len(train['Requirements_content'][i]) < 10:
151 |         train.loc[i, 'Requirements_content'] = train['Requirements_title'][i]
152 | print("train preprocessing done")
153 |
154 | for i in range(len(test)):
155 |     if len(test['Achievements_content'][i]) < 14:
156 |         test.loc[i, 'Achievements_content'] = test['Achievements_title'][i]
157 |     if len(test['Requirements_content'][i]) < 10:
158 |         test.loc[i, 'Requirements_content'] = test['Requirements_title'][i]
159 | print("test preprocessing done")
160 |
161 | train_achievements = train['Achievements_title'].values
162 | train_requirements = train['Requirements_title'].values
163 | train_achievementsc = train['Achievements_content'].values
164 | train_requirementsc = train['Requirements_content'].values
165 |
166 | test_achievements = test['Achievements_title'].values
167 | test_requirements = test['Requirements_title'].values
168 | test_achievementsc = test['Achievements_content'].values
169 | test_requirementsc = test['Requirements_content'].values
170 |
171 | labels = train['Level'].astype(int).values - 1
172 | labels_cat = to_categorical(labels)
173 | labels_cat = labels_cat.astype(np.int32)
174 |
175 |
176 | # Data generator
177 | class data_generator:
178 | def __init__(self, data, batch_size=bs):
179 | self.data = data
180 | self.batch_size = batch_size
181 | self.steps = len(self.data[0]) // self.batch_size
182 | if len(self.data[0]) % self.batch_size != 0:
183 | self.steps += 1
184 |
185 | def __len__(self):
186 | return self.steps
187 |
188 | def __iter__(self):
189 | while True:
190 | X1, X2, X3, X4, y = self.data
191 | idxs = list(range(len(self.data[0])))
192 | np.random.shuffle(idxs)
193 | T, T_, Y = [], [], []
194 | X, X_, Z = [], [], []
195 | for c, i in enumerate(idxs):
196 | achievements = X1[i]
197 | requirements = X2[i]
198 | achievementsc = X3[i]
199 | requirementsc = X4[i]
200 | t, t_ = tokenizer.encode(first=achievements, second=requirements, max_len=MAX_LENT)
201 | x, x_ = tokenizer1.encode(first=achievementsc, second=requirementsc, max_len=MAX_LENC)
202 | T.append(t)
203 | T_.append(t_)
204 | X.append(x)
205 | X_.append(x_)
206 | Y.append(y[i])
207 | if len(T) == self.batch_size or i == idxs[-1]:
208 | T = np.array(T)
209 | T_ = np.array(T_)
210 | X = np.array(X)
211 | X_ = np.array(X_)
212 | Y = np.array(Y)
213 | yield [T, T_, X, X_], Y
214 | T, T_, Y = [], [], []
215 | X, X_, Z = [], [], []
216 |
217 |
218 | # Model construction
219 | # The first BERT judges the similarity of the two titles
220 | # The second BERT judges the similarity of the two contents
221 | # The [CLS] vectors are extracted, concatenated and fed to the classifier
222 | def get_model():
223 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
224 | bert_model1 = load_trained_model_from_checkpoint(config_path1, checkpoint_path1)
225 | for l in bert_model.layers:
226 | l.trainable = True
227 |
228 | T1 = Input(shape=(None,))
229 | T2 = Input(shape=(None,))
230 | X1 = Input(shape=(None,))
231 | X2 = Input(shape=(None,))
232 |
233 | T = bert_model([T1, T2])
234 | X = bert_model1([X1, X2])
235 |
236 | T = Lambda(lambda x: x[:, 0])(T)
237 | X = Lambda(lambda x: x[:, 0])(X)
238 |
239 | T = Concatenate(axis=-1)([T, X])
240 | T = Dense(384)(T)
241 | # T = Dropout(0.1)(T)
242 | output = Dense(4, activation='softmax')(T)
243 |
244 | model = Model([T1, T2, X1, X2], output)
245 | model.compile(
246 | loss='categorical_crossentropy',
247 |         optimizer=Adam(1e-5),  # use a sufficiently small learning rate
248 | metrics=['MAE']
249 | )
250 | model.summary()
251 | return model
252 |
253 |
254 | class Evaluate(Callback):
255 | def __init__(self, val_data, val_index):
256 | self.score = []
257 | self.best = 0.
258 | self.early_stopping = 0
259 | self.val_data = val_data
260 | self.val_index = val_index
261 | self.predict = []
262 | self.lr = 0
263 | self.passed = 0
264 |
265 |     # The first epoch warms up the learning rate; the second epoch decays it to the minimum
266 | def on_batch_begin(self, batch, logs=None):
267 | if self.passed < self.params['steps']:
268 | self.lr = (self.passed + 1.) / self.params['steps'] * learning_rate
269 | K.set_value(self.model.optimizer.lr, self.lr)
270 | self.passed += 1
271 | elif self.params['steps'] <= self.passed < self.params['steps'] * 2:
272 | self.lr = (2 - (self.passed + 1.) / self.params['steps']) * (learning_rate - min_learning_rate)
273 | self.lr += min_learning_rate
274 | K.set_value(self.model.optimizer.lr, self.lr)
275 | self.passed += 1
276 |
277 | def on_epoch_end(self, epoch, logs=None):
278 | score, acc, f1 = self.evaluate()
279 | if score > self.best:
280 | self.best = score
281 | self.early_stopping = 0
282 | model.save_weights('./model_save/bert{}.w'.format(fold))
283 | else:
284 | self.early_stopping += 1
285 |         logger.info('fold: %d, lr: %.6f, score: %.4f, acc: %.4f, f1: %.4f, best: %.4f\n' % (
286 | fold, self.lr, score, acc, f1, self.best))
287 |
288 | def evaluate(self):
289 | self.predict = []
290 | prob = []
291 | val_x1, val_x2, val_x3, val_x4, val_y, val_cat = self.val_data
292 |
293 | for i in tqdm(range(len(val_x1))):
294 | achievements = val_x1[i]
295 | requirements = val_x2[i]
296 | achievementsc = val_x3[i]
297 | requirementsc = val_x4[i]
298 |
299 | t1, t1_ = tokenizer.encode(first=achievements, second=requirements, max_len=MAX_LENT)
300 | x1, x1_ = tokenizer1.encode(first=achievementsc, second=requirementsc, max_len=MAX_LENC)
301 |
302 | T1, T1_ = np.array([t1]), np.array([t1_])
303 | X1, X1_ = np.array([x1]), np.array([x1_])
304 |
305 | _prob = model.predict([T1, T1_, X1, X1_])
306 |
307 | oof_train[self.val_index[i]] = _prob[0]
308 | self.predict.append(np.argmax(_prob, axis=1)[0] + 1)
309 | prob.append(_prob[0])
310 |
311 | score = 1.0 / (1 + mean_absolute_error(val_y + 1, self.predict))
312 | acc = accuracy_score(val_y + 1, self.predict)
313 | f1 = f1_score(val_y + 1, self.predict, average='macro')
314 | return score, acc, f1
315 |
316 |
317 | skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=random_seed)
318 |
319 |
320 | def predict(data):
321 | prob = []
322 | val_x1, val_x2, val_x3, val_x4 = data
323 | for i in tqdm(range(len(val_x1))):
324 | achievements = val_x1[i]
325 | requirements = val_x2[i]
326 | achievementsc = val_x3[i]
327 | requirementsc = val_x4[i]
328 |
329 | t1, t1_ = tokenizer.encode(first=achievements, second=requirements, max_len=MAX_LENT)
330 | x1, x1_ = tokenizer1.encode(first=achievementsc, second=requirementsc, max_len=MAX_LENC)
331 |
332 | T1, T1_ = np.array([t1]), np.array([t1_])
333 | X1, X1_ = np.array([x1]), np.array([x1_])
334 |
335 | _prob = model.predict([T1, T1_, X1, X1_])
336 | prob.append(_prob[0])
337 | return prob
338 |
339 |
340 | oof_train = np.zeros((len(train), 4), dtype=np.float32)
341 | oof_test = np.zeros((len(test), 4), dtype=np.float32)
342 | logger.info("Loading {} and {}".format(bert_path[model], bert_path[model1]))
343 |
344 | timestamp = time.time()
345 |
346 | for fold, (train_index, valid_index) in enumerate(skf.split(train_achievements, labels)):
347 |     logger.info('------------ fold %d, elapsed: %.1f minutes ------------' % (fold, (time.time() - timestamp) / 60))
348 | timestamp = time.time()
349 | x1 = train_achievements[train_index]
350 | x2 = train_requirements[train_index]
351 | x3 = train_achievementsc[train_index]
352 | x4 = train_requirementsc[train_index]
353 | y = labels_cat[train_index]
354 |
355 | val_x1 = train_achievements[valid_index]
356 | val_x2 = train_requirements[valid_index]
357 | val_x3 = train_achievementsc[valid_index]
358 | val_x4 = train_requirementsc[valid_index]
359 | val_y = labels[valid_index]
360 | val_cat = labels_cat[valid_index]
361 |
362 | train_D = data_generator([x1, x2, x3, x4, y])
363 | evaluator = Evaluate([val_x1, val_x2, val_x3, val_x4, val_y, val_cat], valid_index)
364 |
365 | model = get_model()
366 | model.fit_generator(train_D.__iter__(),
367 | steps_per_epoch=len(train_D),
368 | epochs=epoch,
369 | callbacks=[evaluator]
370 | )
371 | model.load_weights('./model_save/bert{}.w'.format(fold))
372 | oof_test += predict([test_achievements, test_requirements, test_achievementsc, test_requirementsc])
373 | K.clear_session()
374 |
375 | oof_test /= epoch
376 |
377 | cv_score = 1.0 / (1 + mean_absolute_error(labels + 1, np.argmax(oof_train, axis=1) + 1))
378 | logger.info(cv_score)
379 |
380 | np.savetxt('./submit/w_{}.txt'.format(counter), oof_test)
381 | test['Level'] = np.argmax(oof_test, axis=1) + 1
382 | test[['Guid', 'Level']].to_csv('./submit/{}.csv'.format(counter), index=False)
383 |
--------------------------------------------------------------------------------