├── .gitignore
├── README.md
├── pre_classify.py
├── xf_bert_v2.py
└── xunfei_classify.py

/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | archive
4 | *.pyc
5 | .idea
6 | __pycache__/
7 | dist/
8 | ~$*
9 | /data
10 | resources/
11 | .ipynb_checkpoints
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Deep Learning Baselines for the Big-Data App Classification Challenge
2 | > Two baseline models; use whichever you like.
3 | 
4 | **Competition link:**
5 | - http://challenge.xfyun.cn/2019/gamedetail?type=detail/classifyApp
6 | 
7 | 
8 | ## Keras BERT model: how to train
9 | > Extracts features with BERT and trains a Bidirectional LSTM on top; reaches 74.39565 on the leaderboard. Needs a fair amount of memory.
10 | ```
11 | pip install keras
12 | pip install keras_bert
13 | 
14 | # set the data directory
15 | data_dir = './data/'
16 | # put the pretrained BERT weights in this directory
17 | # https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip
18 | BERT_PRETRAINED_DIR = './chinese_L-12_H-768_A-12/'
19 | # run
20 | python3 xf_bert_v2.py
21 | ```
22 | 
23 | ## Keras TextCNN model: how to use
24 | - Download the data into the `/data` folder and create a `data/xfyun` folder
25 | - Preprocess the data and build the datasets: `python pre_classify.py`
26 | - Train the model: `python xunfei_classify.py -d ./data/xfyun -n xunfei -m train`
27 | - Predict: `python xunfei_classify.py -d ./data/xfyun -n xunfei -m predict`
28 | 
29 | 
30 | ## References
31 | - [tokenizer](https://www.cnblogs.com/bymo/p/9675654.html)
32 | - [model architecture](https://blog.csdn.net/asialee_bird/article/details/88813385)
33 | 
34 | @**Galen**_20190717_
--------------------------------------------------------------------------------
/pre_classify.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-#
2 | """
3 | @author:Galen
4 | @file: pre_classify.py
5 | @time: 2019/06/27
6 | @description: Clean the raw apptype_train.dat file and split it into train/val text files.
7 | """
8 | import random
9 | import os
10 | 
11 | from collections import defaultdict
12 | 
13 | 
14 | def read_data(path):
15 |     with open(path, "r", encoding="utf-8") as f:
16 |         while True:
17 |             line = f.readline().strip()
18 |             if not line:
19 |                 break
20 |             yield line
21 | 
22 | 
23 | def save_txt(filename, data_list, model="a"):  # "model" is the file-open mode ("a" = append, "w" = overwrite)
24 |     with open(filename, model, encoding='UTF-8', errors='ignore') as f:
25 |         for data in data_list:
26 |             f.write(data + '\n')
27 | 
28 | 
29 | def sort_dict(dict_words, reverse=True, site=1):
30 |     """
31 |     Sort a dict into a list of (key, value) tuples.
32 |     reverse: False for ascending, True for descending
33 |     site: 0 to sort by the first element (key), 1 by the second (value)
34 |     :param dict_words:
35 |     :return:
36 |     """
37 |     keys = dict_words.keys()
38 |     values = dict_words.values()
39 |     list_one = [(key, val) for key, val in zip(keys, values)]
40 |     list_sort = sorted(list_one, key=lambda x: x[site], reverse=reverse)
41 |     return list_sort
42 | 
43 | 
44 | def clean_data():
45 |     """
46 |     Clean apptype_train.dat, write one (category, text) line per label, and count the categories.
47 |     :return:
48 |     """
49 |     data_path = 'data/xfyun/apptype_train.dat'
50 |     save_path = 'data/xfyun/apptype_train.txt'
51 |     cate_path = 'data/xfyun/apptype_count.txt'
52 |     cate_count = defaultdict(int)
53 |     data_set = []
54 |     for line in read_data(data_path):
55 |         line_list = line.strip().split("\t")
56 |         line_list = [x.replace("\t", "").replace("\n", "").strip() for x in line_list]
57 |         if len(line_list) == 3:
58 |             if len(line_list[0]) == 32:
59 |                 data_set.append(line)
60 |             else:
61 |                 print('line_list = 3', len(line_list), line_list)
62 |                 data_set[-1] = data_set[-1] + line.replace("\t", "").replace("\n", "").strip()
63 |         else:
64 |             print('line_list != 3', len(line_list), line_list)
65 |             data_set[-1] = data_set[-1] + line.replace("\t", "").replace("\n", "").strip()
66 |     with open(save_path, 'w') as f:
67 |         for line in data_set:
68 | # 001357CD179A515D6C0B91C7462D6C32\t<种类>|<种类>\t内容 69 | line_list = line.split("\t") 70 | cate_list = line_list[1].split("|") 71 | if len(line_list[2]) <= 0: 72 | print(line_list) 73 | for c in cate_list: 74 | cate_count[c] += 1 75 | f.write("{0}\t{1}\n".format(c, line_list[2])) 76 | cate_count_list = sort_dict(cate_count) 77 | with open(cate_path, 'w')as f: 78 | for k, v in cate_count_list: 79 | # print(k, v) 80 | f.write("{0}\t{1}\n".format(k, v)) 81 | 82 | 83 | def cate_save(context_list, data_type): 84 | save_dir = "./data/xfyun" 85 | data_save = os.path.join(save_dir, "{0}.{1}.txt".format("xunfei", data_type)) 86 | print("save path:", data_save) 87 | save_txt(data_save, context_list, model='w') 88 | 89 | 90 | def generate_data(): 91 | data_path = 'data/xfyun/apptype_train.txt' 92 | with open(data_path, 'r', encoding='utf-8') as f: 93 | data_list = f.read().splitlines() 94 | print(len(data_list)) 95 | random.shuffle(data_list) 96 | val_list = data_list[0:1000] 97 | cate_save(val_list, "val") 98 | train_list = data_list[1000:] 99 | cate_save(train_list, "train") 100 | 101 | 102 | if __name__ == "__main__": 103 | # python xunfei/pre_classify.py 104 | clean_data() 105 | generate_data() 106 | -------------------------------------------------------------------------------- /xf_bert_v2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-# 2 | """ 3 | @author:Galen 4 | @file: xf_bert_v2.py 5 | @time: 2019-07-31 6 | @description:使用bert提取特征,Bidirectional 训练模型 7 | 8 | 9 | python3 xf_bert_v2.py 10 | 11 | pip install keras 12 | pip install keras_bert 13 | 14 | """ 15 | import codecs 16 | import gc 17 | import math 18 | import os 19 | import pickle 20 | from datetime import datetime 21 | 22 | import numpy as np 23 | import pandas as pd 24 | import tensorflow as tf 25 | import tqdm 26 | from keras.callbacks import EarlyStopping, ReduceLROnPlateau 27 | from keras.layers import CuDNNLSTM, Bidirectional 28 | from keras.layers import Dense, Flatten, SpatialDropout1D, Dropout 29 | from keras.models import Sequential 30 | from keras.optimizers import Adam 31 | from keras.utils import Sequence 32 | from keras_bert import Tokenizer, load_trained_model_from_checkpoint 33 | from sklearn.preprocessing import LabelBinarizer 34 | 35 | 36 | 37 | data_dir = './data/' 38 | BERT_PRETRAINED_DIR = './chinese_L-12_H-768_A-12/' 39 | 40 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 41 | with tf.Session() as sess: 42 | sess.run(init_op) 43 | 44 | 45 | def pickle_save(path, data): 46 | with open(path, 'wb')as f: 47 | # dumps序列化源数据后写入文件 48 | f.write(pickle.dumps(data, protocol=4)) 49 | 50 | 51 | def pickle_load(path): 52 | with open(path, 'rb')as f: 53 | return pickle.loads(f.read()) 54 | 55 | 56 | def load_data(): 57 | # ============================读入训练集:======================================= 58 | train = pd.read_csv(data_dir + "apptype_train.dat", header=None, encoding='utf8', delimiter=' ') 59 | # 以tab键分割,不知道为啥delimiter='\t'会报错,所以先读入再分割。 60 | train = pd.DataFrame(train[0].apply(lambda x: x.split('\t')).tolist(), columns=['id', 'label', 'comment']) 61 | print('train', train.shape) 62 | # =============================读入测试集:====================================== 63 | test = pd.read_csv(data_dir + "app_desc.dat", header=None, encoding='utf8', delimiter=' ') 64 | test = pd.DataFrame(test[0].apply(lambda x: x.split('\t')).tolist(), columns=['id', 'comment']) 65 | print('test', test.shape) 66 | print('load success ') 67 
| # ========================以|为分隔符,把标签分割:=============================== 68 | train['label1'] = train['label'].apply(lambda x: x.split('|')[0]) 69 | train['label2'] = train['label'].apply(lambda x: x.split('|')[1] if '|' in x else 0) ##第二个标签有些没有,此处补0 70 | # 去掉样本少于5个的类别 71 | train = train[~train.label1.isin(['140110', '140805', '140105'])].reset_index(drop=True) 72 | 73 | return train, test 74 | 75 | 76 | def get_binary_label(data_list): 77 | lb = LabelBinarizer() 78 | binary_label = lb.fit_transform(data_list) # transfer label to binary value 79 | cate_len = len(lb.classes_) 80 | print(binary_label.shape) 81 | # 逆过程 82 | # yesORno=lb.inverse_transform(p) 83 | return binary_label, cate_len, lb 84 | 85 | 86 | def load_bert_model(): 87 | print('load_bert_model') 88 | # TensorFlow 模型文件,包含预训练模型的权重 89 | checkpoint_file = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt') 90 | # 配置文件,记录模型的超参数 91 | config_file = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json') 92 | # 字典文件,记录词条与 id 的映射关系 93 | dict_file = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt') 94 | # 将 BERT 模型载入到 keras 95 | bert_model = load_trained_model_from_checkpoint(config_file, checkpoint_file, seq_len=64) 96 | 97 | token_dict = {} 98 | with codecs.open(dict_file, 'r', 'utf-8') as reader: 99 | for line in reader: 100 | token = line.strip() 101 | token_dict[token] = len(token_dict) 102 | tokenizer = Tokenizer(token_dict) 103 | return tokenizer, bert_model 104 | 105 | 106 | class DataGenerator(Sequence): 107 | 108 | def __init__(self, dataX, dataY, batch_size=1, shuffle=True, bert_model=None, tokenizer=None, max_sentence=64): 109 | self.batch_size = batch_size 110 | self.dataX = dataX 111 | self.dataY = dataY 112 | self.bert_model = bert_model 113 | self.tokenizer = tokenizer 114 | self.max_sentence = max_sentence 115 | # 验证dataX训练数据和标签是否数量一致 116 | assert (len(self.dataX) == len(self.dataY)) 117 | self.indexes = np.arange(len(self.dataX)) 118 | self.shuffle = shuffle 119 | # 打乱数据 120 | if self.shuffle: 121 | np.random.shuffle(self.indexes) 122 | 123 | def __len__(self): 124 | # 计算每一个epoch的迭代次数 125 | return math.ceil(len(self.dataX) / float(self.batch_size)) 126 | 127 | def __getitem__(self, index): 128 | # 生成每个batch数据 129 | # 生成batch_size个索引 130 | batch_indexs = self.indexes[index * self.batch_size:(index + 1) * self.batch_size] 131 | # 根据索引获取集合中的数据 132 | batch_X = [self.dataX[k] for k in batch_indexs] 133 | batch_Y = [self.dataY[k] for k in batch_indexs] 134 | 135 | # 生成数据 136 | X, y = self.data_generation(batch_X, batch_Y) 137 | return X, y 138 | 139 | def on_epoch_end(self): 140 | # 在每一次epoch结束是否需要进行一次随机,重新随机一下index 141 | if self.shuffle: 142 | np.random.shuffle(self.indexes) 143 | 144 | def data_generation(self, batch_X, batch_Y): 145 | indices = [] 146 | segments = [] 147 | for i, text in enumerate(batch_X): 148 | index, segment = self.tokenizer.encode(first=text, max_len=self.max_sentence) 149 | indices.append(index) 150 | segments.append(segment) 151 | word_vec = self.bert_model.predict([np.array(indices), np.array(segments)]) 152 | return word_vec, np.array(batch_Y) 153 | 154 | 155 | def dl_model(max_sentence, cate_num): 156 | model = Sequential() 157 | model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True), input_shape=(max_sentence, 768))) 158 | model.add(SpatialDropout1D(0.5)) 159 | model.add(Bidirectional(CuDNNLSTM(128, return_sequences=True))) 160 | model.add(Flatten()) 161 | model.add(Dense(256, activation='relu')) 162 | # model.add(Dropout(0.3)) 163 | # model.add(Dense(64, activation='relu')) 164 | 
model.add(Dropout(0.1)) 165 | model.add(Dense(cate_num, activation='softmax')) 166 | model.summary() 167 | return model 168 | 169 | 170 | def make_callbacks(): 171 | # checkpoint = ModelCheckpoint(weights_file, monitor='val_loss', verbose=1, save_best_only=False, mode='auto') 172 | 173 | # 不再优化时候,调整学习率 174 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=5, mode='auto', factor=0.8) 175 | 176 | # 当验证集的loss不再下降时,中断训练 patience=2 两个 Epoch 停止 177 | early_stopping = EarlyStopping(monitor='val_loss', patience=5) 178 | 179 | # all the goodies 180 | return [reduce_lr, early_stopping] 181 | 182 | 183 | def bert_data(data_list, bert_model, tokenizer, bert_pkl_path, max_sentence): 184 | print('bert_data ...') 185 | if os.path.exists(bert_pkl_path): 186 | print("loading ", bert_pkl_path) 187 | return pickle_load(bert_pkl_path) 188 | indices = [] 189 | segments = [] 190 | for text in tqdm.tqdm(data_list): 191 | index, segment = tokenizer.encode(first=text, max_len=max_sentence) 192 | indices.append(index) 193 | segments.append(segment) 194 | text_data = [np.array(indices), np.array(segments)] 195 | del indices 196 | del segments 197 | del tokenizer 198 | gc.collect() 199 | gc.collect() 200 | data_vec = bert_model.predict(text_data) 201 | del bert_model 202 | del text_data 203 | gc.collect() 204 | print("saving ", bert_pkl_path) 205 | pickle_save(bert_pkl_path, data_vec) 206 | return data_vec 207 | 208 | 209 | def train_fit_generator(x_train, y_train, total_epochs, batch_size, steps, max_sentence, cate_len, bert_model, 210 | tokenizer): 211 | data_gen = DataGenerator(x_train, y_train, batch_size, True, bert_model, tokenizer) 212 | callbacks_list = make_callbacks() 213 | model = dl_model(max_sentence, cate_len) 214 | exp_decay = lambda init, fin, steps: (init / fin) ** (1 / (steps - 1)) - 1 215 | lr_init, lr_fin = 0.001, 0.0001 216 | lr_decay = exp_decay(lr_init, lr_fin, steps) 217 | optimizer_adam = Adam(lr=0.001, decay=lr_decay) 218 | model.compile(loss='categorical_crossentropy', optimizer=optimizer_adam, metrics=['accuracy']) 219 | model.fit_generator(data_gen, 220 | epochs=total_epochs, 221 | callbacks=callbacks_list, 222 | # steps_per_epoch=STEPS_PER_EPOCH, 223 | # validation_data=val_batch, 224 | # validation_steps=VALIDATION_STEPS, 225 | verbose=1, # verbose:日志显示,0为不在标准输出流输出日志信息,1为输出进度条记录,2为每个epoch输出一行记录 226 | ) 227 | return model 228 | 229 | 230 | def train_fit(x_train, y_train, total_epochs, batch_size, steps, max_sentence, cate_len, bert_model, tokenizer): 231 | data_vec = bert_data(x_train, bert_model, tokenizer, 232 | data_dir + 'train_bert_pickle_{0}.plk'.format(str(max_sentence)), max_sentence) 233 | callbacks_list = make_callbacks() 234 | model = dl_model(max_sentence, cate_len) 235 | exp_decay = lambda init, fin, steps: (init / fin) ** (1 / (steps - 1)) - 1 236 | lr_init, lr_fin = 0.001, 0.0001 237 | lr_decay = exp_decay(lr_init, lr_fin, steps) 238 | optimizer_adam = Adam(lr=0.001, decay=lr_decay) 239 | model.compile(loss='categorical_crossentropy', optimizer=optimizer_adam, metrics=['accuracy']) 240 | model.fit(data_vec, y_train, 241 | epochs=total_epochs, 242 | batch_size=batch_size, 243 | callbacks=callbacks_list, 244 | validation_split=0.1, 245 | verbose=1, # verbose:日志显示,0为不在标准输出流输出日志信息,1为输出进度条记录,2为每个epoch输出一行记录 246 | ) 247 | return model 248 | 249 | 250 | def get_top2(data_np): 251 | results = pd.DataFrame(data_np) 252 | first_np = np.zeros(data_np.shape, dtype=int) 253 | second_np = np.zeros(data_np.shape, dtype=int) 254 | for j, row in results.iterrows(): 255 | zz = 
list(np.argsort(row)) 256 | # 第一个标签 257 | first_np[j, row.index[zz[-1]]] = 1 258 | # 第二个标签 259 | second_np[j, row.index[zz[-2]]] = 1 260 | return first_np, second_np 261 | 262 | 263 | def predict(model, test_df, batch_size, bert_model, tokenizer, lb, max_sentence): 264 | # 预测数据 265 | test_vec = bert_data(test_df['comment'], bert_model, tokenizer, 266 | data_dir + 'test_bert_pickle_{0}.plk'.format(str(max_sentence)), max_sentence) 267 | print('predictions ...') 268 | predictions = model.predict_proba(test_vec, batch_size=batch_size) 269 | del test_vec 270 | gc.collect() 271 | print('inverse_transform cate ...') 272 | predictions_1, predictions_2 = get_top2(predictions) 273 | label1 = lb.inverse_transform(predictions_1) 274 | label2 = lb.inverse_transform(predictions_2) 275 | data_save = 'pre_test_{}.csv'.format(datetime.now().strftime('%m%d_%H%M%S')) 276 | print('saving ...', data_save) 277 | submission = pd.DataFrame.from_dict({ 278 | 'id': test_df['id'], 279 | 'label1': label1, 280 | 'label2': label2, 281 | }) 282 | submission.to_csv(data_save, sep=',', index=False) 283 | 284 | 285 | def main(): 286 | epochs = 15 287 | batch_size = 32 288 | max_sentence = 64 289 | train, test = load_data() 290 | x_train = train['comment'] 291 | y_train, cate_len, lb = get_binary_label(train['label1']) 292 | tokenizer, bert_model = load_bert_model() 293 | steps = int(len(x_train) / batch_size) * epochs 294 | # model = train_fit_generator(x_train, y_train, total_epochs, batch_size,max_sentence, cate_len, bert_model, tokenizer) 295 | model = train_fit(x_train, y_train, epochs, batch_size, steps, max_sentence, cate_len, bert_model, tokenizer) 296 | del x_train 297 | gc.collect() 298 | predict(model, test, batch_size, bert_model, tokenizer, lb, max_sentence) 299 | 300 | 301 | main() 302 | -------------------------------------------------------------------------------- /xunfei_classify.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-# 2 | """ 3 | @author:Galen 4 | @file: xunfei_classify.py 5 | @time: 2019/06/18 6 | @description: 7 | 中文文本分类 xunfei_classify 最优 8 | 9 | 10 | 步骤: 11 | 1. 生成 Tokenizer 12 | 13 | 参考资料 14 | https://blog.csdn.net/asialee_bird/article/details/88813385 15 | tokenizer: 16 | https://www.cnblogs.com/bymo/p/9675654.html 17 | 模型结构 18 | https://blog.csdn.net/asialee_bird/article/details/88813385 19 | https://www.cnblogs.com/bymo/p/9675654.html 20 | """ 21 | import argparse 22 | import os 23 | import pickle 24 | import re 25 | from datetime import datetime 26 | 27 | import numpy as np 28 | import pandas as pd 29 | import tqdm 30 | from keras import Input 31 | from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau 32 | from keras.layers import Conv1D, MaxPool1D, Dense, Flatten, concatenate, Embedding, Dropout 33 | from keras.models import Model 34 | from keras.models import load_model 35 | from keras.preprocessing.sequence import pad_sequences 36 | from keras.preprocessing.text import Tokenizer 37 | from keras.utils import plot_model, to_categorical 38 | 39 | parse = argparse.ArgumentParser() 40 | parse.add_argument('--data', '-d', type=str, default='data/xfyun') 41 | parse.add_argument('--name', '-n', type=str, default='xunfei') # 42 | parse.add_argument('--model', '-m', type=str, default='train') # train', 'test' 43 | parse.add_argument('--sentence', '-s', type=int, default=1000) # 句子长度. 
1000 > 1500 44 | parse.add_argument('--vocab', '-v', type=int, default=6000) # 词长度。 45 | parse.add_argument('--embedding', '-e', type=int, default=64) # 词向量维度。 64 > 128 46 | args = parse.parse_args() 47 | 48 | data_path = args.data 49 | WEIGHTS_IDR = './weight/xunfei_classify' 50 | 51 | # 创建权重文件夹 52 | if not os.path.exists(WEIGHTS_IDR): 53 | os.makedirs(WEIGHTS_IDR) 54 | print('[Info] 文件夹 "%s" 不存在, 创建文件夹.' % WEIGHTS_IDR) 55 | 56 | model_name = "{0}.xunfei_{1}_{2}_{3}".format(str(args.name), str(args.sentence), str(args.vocab), str(args.embedding)) 57 | print(model_name) 58 | WEIGHTS_FILE = os.path.join(WEIGHTS_IDR, "{0}.hdf5".format(model_name)) 59 | MODEL_FILE = os.path.join(WEIGHTS_IDR, "{0}.h5".format(model_name)) 60 | 61 | SENTENCE_LEN = args.sentence # 句子长度 6000 1500 62 | VOCAB_LEN = args.vocab # 词长度 63 | EMBEDDING_DIM = args.embedding # 词向量维度 64 | 65 | 66 | def load_cate(): 67 | cate_path = "data/xfyun/apptype_count.txt" 68 | with open(cate_path, 'r')as f: 69 | data_list = f.readlines() 70 | return [x.split("\t")[0] for x in data_list] 71 | 72 | 73 | BATCH_SIZE = 64 74 | CATEGORIES = load_cate() 75 | NUM_CATEGORY = len(CATEGORIES) 76 | tokenizer_path = "tokenizer_{0}_{1}.pickle".format(args.name, args.vocab) 77 | train_file = os.path.join(data_path, '{0}.train.txt'.format(args.name)) 78 | val_file = os.path.join(data_path, '{0}.val.txt'.format(args.name)) # 3000 79 | test_file = os.path.join(data_path, 'app_desc.dat') 80 | 81 | TOTAL_EPOCHS = 20 82 | STEPS_PER_EPOCH = int(28134 // BATCH_SIZE) 83 | VALIDATION_STEPS = int(300 // BATCH_SIZE) 84 | 85 | 86 | def clean_multiple_spaces(content): 87 | """ 88 | 清理非多个空格 89 | :param content: 90 | :return: 91 | """ 92 | 93 | # content = re.sub(r'[^\u4e00-\u9fa5]+', ' ', content) # Eliminate Chinese characters 94 | return re.sub('[\r\n\t ]+', ' ', content).replace('\xa0', '').strip() 95 | 96 | 97 | def read_data_file(path): 98 | lines = open(path, 'r', encoding='utf-8').readlines() 99 | x_list = [] 100 | y_list = [] 101 | for line in tqdm.tqdm(lines): 102 | rows = line.split('\t') 103 | if len(rows) >= 2: 104 | y_list.append(rows[0].strip()) 105 | x_list.append(list(clean_multiple_spaces(' '.join(rows[1:])))) 106 | else: 107 | pass 108 | # print(rows) 109 | return x_list, y_list 110 | 111 | 112 | def tokenizer_data(): 113 | """ 114 | 将语料进行tokenrizer 115 | 取语料的 90 % 长度作为最大长度值。 116 | :return: 117 | """ 118 | if os.path.exists(tokenizer_path): 119 | # loading 120 | print("loading tokenizer data...") 121 | with open(tokenizer_path, 'rb') as handle: 122 | tokenizer = pickle.load(handle) 123 | else: 124 | print("calculate tokenizer data...") 125 | test_x, test_y = read_data_file(test_file) 126 | train_x, train_y = read_data_file(train_file) 127 | val_x, val_y = read_data_file(val_file) 128 | data_all = test_x + train_x + val_x 129 | # 或的0.90 左右文本长度 130 | sequence_length = sorted([len(x) for x in data_all])[int(0.95 * len(data_all))] 131 | print("sequence_length 90 is : {0}".format(str(sequence_length))) 132 | tokenizer = Tokenizer(VOCAB_LEN) 133 | tokenizer.fit_on_texts(data_all) 134 | # loading 135 | with open(tokenizer_path, 'wb') as handle: 136 | pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL) 137 | return tokenizer 138 | 139 | 140 | def make_callbacks(weights_file): 141 | checkpoint = ModelCheckpoint(weights_file, monitor='val_loss', verbose=1, save_best_only=False, mode='auto') 142 | 143 | # 不再优化时候,调整学习率 144 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=1, mode='auto', factor=0.8) 145 | 146 | # 当验证集的loss不再下降时,中断训练 
patience=2 两个 Epoch 停止 147 | early_stopping = EarlyStopping(monitor='val_loss', patience=10) 148 | 149 | # all the goodies 150 | return [reduce_lr, checkpoint, early_stopping] 151 | 152 | 153 | def xunfei_classify_model(sentence_length=SENTENCE_LEN, vocab_len=VOCAB_LEN, embedding_dim=EMBEDDING_DIM, 154 | model_img_path=None, 155 | embedding_matrix=None): 156 | """ 157 | TextCNN: 158 | 1. embedding layers, 159 | 2.convolution layer, 160 | 3.max-pooling, 161 | 4.softmax layer. 162 | :param sentence_length:句子大小 163 | :param vocab_len: 文本中词汇表大小 164 | :param embedding_dim: 词向量空间大小 165 | :param model_img_path: 166 | :param embedding_matrix: 167 | :return: 168 | """ 169 | x_input = Input(shape=(sentence_length,)) 170 | 171 | if embedding_matrix is None: 172 | x_emb = Embedding(input_dim=vocab_len + 1, output_dim=embedding_dim, input_length=sentence_length)(x_input) 173 | else: 174 | x_emb = Embedding(input_dim=vocab_len + 1, output_dim=embedding_dim, input_length=sentence_length, 175 | weights=[embedding_matrix], trainable=True)(x_input) 176 | 177 | # 多层卷积核 178 | pool_output = [] 179 | kernel_sizes = [3, 4, 5] 180 | for kernel_size in kernel_sizes: 181 | c = Conv1D(filters=256, kernel_size=kernel_size, strides=1)(x_emb) 182 | p = MaxPool1D(pool_size=int(c.shape[1]))(c) 183 | pool_output.append(p) 184 | # 合并三个模型的输出向量 185 | pool_output = concatenate([p for p in pool_output]) 186 | 187 | x_flatten = Flatten()(pool_output) 188 | drop = Dropout(0.5)(x_flatten) 189 | y = Dense(NUM_CATEGORY, activation='softmax')(drop) 190 | model = Model([x_input], outputs=[y]) 191 | if model_img_path: 192 | plot_model(model, to_file=model_img_path, show_shapes=True, show_layer_names=False) 193 | # model.summary() 194 | return model 195 | 196 | 197 | def process_file(filename, tokenizer, cate_id, max_length=SENTENCE_LEN): 198 | """将文件转换为id表示""" 199 | contents, labels = read_data_file(filename) 200 | label_id = [cate_id[x] for x in labels] 201 | # 将每个样本中的每个词转换为数字列表,使用每个词的编号进行编号 202 | data_id = tokenizer.texts_to_sequences(contents) 203 | # 使用keras提供的pad_sequences来将文本pad为固定长度 句子不够长则补0 204 | x_pad = pad_sequences(data_id, max_length) 205 | # label进行one-hot处理 label进行labelEncoder之后变为0-9个数才能进行one-hot,且由于10个类别,则每个label的维度大小为10。 206 | y_pad = to_categorical(label_id, num_classes=len(cate_id)) # 将标签转换为one-hot表示 207 | return x_pad, y_pad 208 | 209 | 210 | def predict_file(filename, tokenizer, max_length=SENTENCE_LEN): 211 | """将文件转换为id表示""" 212 | contents, labels = read_data_file(filename) 213 | # 将每个样本中的每个词转换为数字列表,使用每个词的编号进行编号 214 | data_id = tokenizer.texts_to_sequences(contents) 215 | # 使用keras提供的pad_sequences来将文本pad为固定长度 句子不够长则补0 216 | x_pad = pad_sequences(data_id, max_length) 217 | # label进行one-hot处理 label进行labelEncoder之后变为0-9个数才能进行one-hot,且由于10个类别,则每个label的维度大小为10。 218 | return x_pad, labels 219 | 220 | 221 | def batch_iter(x, y, batch_size=BATCH_SIZE): 222 | """ 223 | 生成批次数据 224 | :param x: 225 | :param y: 226 | :param batch_size: 227 | :return: 228 | """ 229 | while 1: 230 | data_len = len(x) 231 | num_batch = int((data_len - 1) / batch_size) + 1 232 | # shuffle与permutation都是对原来的数组进行重新洗牌 permutation不直接在原来的数组上进行操作,而是返回一个新的打乱顺序的数组,并不改变原来的数组。 233 | indices = np.random.permutation(np.arange(data_len)) 234 | x_shuffle = x[indices] 235 | y_shuffle = y[indices] 236 | for i in range(num_batch): 237 | start_id = i * batch_size 238 | end_id = min((i + 1) * batch_size, data_len) 239 | yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id] 240 | 241 | 242 | def train(): 243 | # 加载数据 244 | print("Loading training and 
validation data...")
245 |     x_train, y_train = process_file(train_file, tokenize, cate_to_id, SENTENCE_LEN)
246 |     x_val, y_val = process_file(val_file, tokenize, cate_to_id, SENTENCE_LEN)
247 |     train_batch = batch_iter(x_train, y_train)
248 |     val_batch = batch_iter(x_val, y_val)
249 |     model = xunfei_classify_model()
250 |     if os.path.exists(WEIGHTS_FILE):
251 |         print("loading ", WEIGHTS_FILE)
252 |         model.load_weights(WEIGHTS_FILE, by_name=True)
253 |     callbacks_list = make_callbacks(WEIGHTS_FILE)
254 |     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
255 |     model.fit_generator(train_batch,
256 |                         epochs=TOTAL_EPOCHS,
257 |                         callbacks=callbacks_list,
258 |                         steps_per_epoch=STEPS_PER_EPOCH,
259 |                         validation_data=val_batch,
260 |                         validation_steps=VALIDATION_STEPS,
261 |                         verbose=1,  # verbose: 0 = silent, 1 = progress bar, 2 = one line per epoch
262 |                         )
263 |     # Save it for later
264 |     print('Saving Model')
265 |     # The Keras model and its weights are stored in a single HDF5 file
266 |     model.save(MODEL_FILE, include_optimizer=True)
267 | 
268 | 
269 | def sort_dict(dict_words, reverse=True, site=1):
270 |     """
271 |     Sort a dict into a list of (key, value) tuples.
272 |     reverse: False for ascending, True for descending
273 |     site: 0 to sort by the first element (key), 1 by the second (value)
274 |     :param dict_words:
275 |     :return:
276 |     """
277 |     keys = dict_words.keys()
278 |     values = dict_words.values()
279 |     list_one = [(key, val) for key, val in zip(keys, values)]
280 |     list_sort = sorted(list_one, key=lambda x: x[site], reverse=reverse)
281 |     return list_sort
282 | 
283 | 
284 | def get_top2(data_list):
285 |     result_label_1 = []
286 |     result_label_2 = []
287 |     for data in data_list.tolist():
288 |         cate_score_list = zip(range(len(CATEGORIES)), data)
289 |         list_sort = sorted(cate_score_list, key=lambda x: x[1], reverse=True)
290 |         # print('list_sort', list_sort[0:10])
291 |         result_label_1.append(list_sort[0][0])
292 |         result_label_2.append(list_sort[1][0])  # index of the second-best class (was list_sort[0][1], i.e. the top-1 score)
293 |     return result_label_1, result_label_2
294 | 
295 | 
296 | def predict():
297 |     print("Loading desc data...")
298 |     x_test, y_name = predict_file(test_file, tokenize, SENTENCE_LEN)
299 |     model = load_model(MODEL_FILE)
300 |     result = model.predict(x_test)  # predicted probability of each class for every sample
301 |     # print("result", result)
302 |     label_1, label_2 = get_top2(result)
303 |     label_1_predict_name = [id_to_cate.get(x, "140901") for x in label_1]
304 |     label_2_predict_name = [id_to_cate.get(x, "140206") for x in label_2]
305 |     df_sub = pd.concat([pd.Series(y_name), pd.Series(label_1_predict_name), pd.Series(label_2_predict_name)], axis=1)
306 |     df_sub.columns = ['id', 'label1', 'label2']
307 |     df_sub.to_csv('pre_test_{}.csv'.format(datetime.now().strftime('%m%d_%H%M%S')), sep=',', index=False)
308 | 
309 | 
310 | if __name__ == '__main__':
311 |     if args.model not in ['train', 'predict']:
312 |         raise ValueError("""usage: python xunfei_classify.py -m [train / predict]""")
313 |     cate_to_id = dict(zip(CATEGORIES, range(len(CATEGORIES))))
314 |     id_to_cate = dict(zip(range(len(CATEGORIES)), CATEGORIES))
315 |     # load data
316 | 
317 |     tokenize = tokenizer_data()
318 |     if args.model == 'train':
319 |         train()
320 |     else:
321 |         predict()
322 | 
--------------------------------------------------------------------------------
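A quick way to sanity-check a trained TextCNN model is to classify a single app description by hand instead of running the full `predict` pass over `app_desc.dat`. The snippet below is a minimal sketch, not part of the repo: it assumes the artifacts written by `python xunfei_classify.py -d ./data/xfyun -n xunfei -m train` with the default `-s 1000 -v 6000 -e 64` arguments (so the tokenizer pickle and `.h5` paths follow the naming scheme in `xunfei_classify.py`), and the sample text is made up.

```
# Minimal single-sample inference sketch for the TextCNN model (illustrative only).
# Paths assume the default CLI arguments; adjust if you trained with other -n/-s/-v/-e values.
import pickle

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

SENTENCE_LEN = 1000                              # must match the -s value used at training time
TOKENIZER_PATH = 'tokenizer_xunfei_6000.pickle'  # written by tokenizer_data()
MODEL_PATH = './weight/xunfei_classify/xunfei.xunfei_1000_6000_64.h5'  # written by train()

# Category list in the same order used to build cate_to_id / id_to_cate.
with open('data/xfyun/apptype_count.txt', 'r') as f:
    categories = [line.split('\t')[0] for line in f]

with open(TOKENIZER_PATH, 'rb') as handle:
    tokenizer = pickle.load(handle)

model = load_model(MODEL_PATH)

text = '一款帮助用户记录每日支出和预算的记账应用'  # made-up app description
# The training pipeline tokenizes character by character, so feed a list of characters.
seq = tokenizer.texts_to_sequences([list(text)])
x = pad_sequences(seq, SENTENCE_LEN)

probs = model.predict(x)[0]
top2 = probs.argsort()[-2:][::-1]  # indices of the two highest-probability classes
print('label1:', categories[top2[0]], 'label2:', categories[top2[1]])
```

This mirrors what `predict()` does internally (`predict_file`, `get_top2`, `id_to_cate`), just without writing the submission CSV.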