├── zhihu-text-classification-master ├── data_process │ ├── .idea │ │ ├── .name │ │ ├── encodings.xml │ │ ├── modules.xml │ │ ├── deployment.xml │ │ ├── data_process.iml │ │ └── misc.xml │ ├── test.py │ ├── run_all_data_process.sh │ ├── question_and_topic_2id.py │ ├── README.md │ ├── embed2ndarray.py │ ├── word2id.py │ ├── char2id.py │ ├── creat_batch_seg.py │ └── creat_batch_data.py └── models │ ├── wd_4_han │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_2_hcnn │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_3_bigru │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_6_rcnn │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_1_1_cnn_concat │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_1_2_cnn_max │ ├── __init__.py │ ├── predict.py │ ├── network.py │ └── train.py │ └── wd_5_bigru_cnn │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py └── ReadMe.md /zhihu-text-classification-master/data_process/.idea/.name: -------------------------------------------------------------------------------- 1 | data_process -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # 竞赛列表 2 | + [2017 知乎看山杯机器学习挑战赛](https://www.biendata.com/competition/zhihu/) 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_4_han/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_2_hcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_3_bigru/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_6_rcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_1_cnn_concat/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_2_cnn_max/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_5_bigru_cnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
-------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | 4 | from multiprocessing import Pool 5 | import numpy as np 6 | 7 | def func(a, b): 8 | return a+b 9 | 10 | p = Pool() 11 | a = [1,2,3] 12 | b = [4,5,6] 13 | para = zip(a,b) 14 | result = p.map(func, para) 15 | p.close() 16 | p.join() 17 | print result -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/data_process.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/run_all_data_process.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo -e "\033[44;37;5m RUNNING embed2ndarray.py\033[0m "; 3 | python embed2ndarray.py; 4 | echo -e "\033[44;37;5m RUNNING question_and_topic_2id.py\033[0m "; 5 | python question_and_topic_2id.py; 6 | echo -e "\033[44;37;5m RUNNING char2id.py\033[0m "; 7 | python char2id.py; 8 | echo -e "\033[44;37;5m RUNNING word2id.py\033[0m "; 9 | python word2id.py; 10 | echo -e "\033[44;37;5m RUNNING creat_batch_data.py\033[0m "; 11 | python creat_batch_data.py; 12 | echo -e "\033[44;37;5m RUNNING creat_batch_seg.py\033[0m "; 13 | python creat_batch_seg.py; -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/question_and_topic_2id.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import pandas as pd 4 | import pickle 5 | from itertools import chain 6 | 7 | 8 | def question_and_topic_2id(): 9 | """把question和topic转成id形式并保存至 ../data/目录下。""" 10 | print('Changing the quetion and topic to id and save in sr_question2.pkl and sr_topic2id.pkl in ../data/') 11 | df_question_topic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\t', names=['question', 'topics'], 12 | dtype={'question': object, 'topics': object}) 13 | df_question_topic.topics = df_question_topic.topics.apply(lambda tps: tps.split(',')) 14 | save_path = '../data/' 15 | print('questino number = %d ' % len(df_question_topic)) 16 | # 问题 id 按照给出的问题顺序编号 17 | questions = df_question_topic.question.values 18 | sr_question2id = pd.Series(range(len(questions)), index=questions) 19 | sr_id2question = pd.Series(questions, index=range(len(questions))) 20 | # 
topic 按照数量从大到小进行编号 21 | topics = df_question_topic.topics.values 22 | topics = list(chain(*topics)) 23 | sr_topics = pd.Series(topics) 24 | topics_count = sr_topics.value_counts() 25 | topics = topics_count.index 26 | sr_topic2id = pd.Series(range(len(topics)),index=topics) 27 | sr_id2topic = pd.Series(topics, index=range(len(topics))) 28 | 29 | with open(save_path + 'sr_question2id.pkl', 'wb') as outp: 30 | pickle.dump(sr_question2id, outp) 31 | pickle.dump(sr_id2question, outp) 32 | with open(save_path + 'sr_topic2id.pkl', 'wb') as outp: 33 | pickle.dump(sr_topic2id, outp) 34 | pickle.dump(sr_id2topic, outp) 35 | print('Finished changing.') 36 | 37 | 38 | if __name__ == '__main__': 39 | question_and_topic_2id() 40 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/README.md: -------------------------------------------------------------------------------- 1 | ## 数据处理 2 | 3 | 1.把比赛提供的所有数据解压到 raw_data/ 目录下。
4 | 2.按照顺序依次执行各个 .py,不带任何参数。
5 | 或者在当前目录下输入下面命令运行所有文件:
6 | dos2unix run_all_data_process.sh  # 使用 cygwin 工具 dos2unix 将 script 改为 unix 格式
7 | sh run_all_data_process.sh
8 | 3.环境依赖(下面是我使用的版本)
9 | - numpy 1.12.1 10 | - pandas 0.19.2 11 | - word2vec 0.9.1 12 | - tqdm 4.11.2 13 | 14 | 15 | ### embed2ndarray.py 16 | 赛方提供了txt格式的词向量和字向量,这里把embedding矩阵转成 np.ndarray 形式,分别保存为 data/word_embedding.npy 和 data/char_embedding.npy。在赛方提供的词向量基础上,添加 '\' 和 '\' 两个特殊符号。其中 '\' 用于将序列补全到固定长度, '\' 用于替换低频词(字)。 17 | 用 pd.Series 保存词(字)对应 embedding 中的行号(id),存储在 data/sr_word2id.pkl 和 data/sr_char2id.pkl 中。 18 | 19 | ### question_and_topic_2id.py 20 | 把问题和话题转为id形式,保存在 data/sr_question2id.pkl 和 data/sr_id2question.pkl 中。 21 | 22 | ### char2id.py 23 | 利用上面得到的 sr_char2id,把所有问题的字转为对应的id, 存储为 24 | data/ch_train_title.npy 25 | data/ch_train_content.npy 26 | data/ch_eval_title.npy 27 | data/ch_eval_content.npy 28 | 29 | ### word2id.py 30 | 同 char2id.py 31 | 32 | ### creat_batch_data.py 33 | 把所有的数据按照 batch_size(128) 进行打包,固定seed,随机取 10 万样本作为验证集。每个batch存储为一个 npz 文件,包括 X, y 两部分。 34 | 这里所有的序列都进行了截断,长度不足的用0进行padding到固定长度。 35 | 保存位置: 36 | wd_train_path = '../data/wd-data/data_train/' 37 | wd_valid_path = '../data/wd-data/data_valid/' 38 | wd_test_path = '../data/wd-data/data_test/' 39 | ch_train_path = '../data/ch-data/data_train/' 40 | ch_valid_path = '../data/ch-data/data_valid/' 41 | ch_test_path = '../data/ch-data/data_test/' 42 | 43 | 44 | ### creat_batch_seg.py 45 | 和 creat_batch_data.py 相同,只是对 content 部分进行句子划分。用于分层模型。 46 | 划分句子长度: 47 | wd_title_len = 30, wd_sent_len = 30, wd_doc_len = 10.(即content划分为10个句子,每个句子长度为30个词) 48 | ch_title_len = 52, ch_sent_len = 52, ch_doc_len = 10. 49 | 不划分句子: 50 | wd_title_len = 30, wd_content_len = 150. 51 | ch_title_len = 52, ch_content_len = 300. 52 | 53 | 54 | ### To do 55 | - 在数据读取中使用 tfrecord 文件进行数据读取。这样能够随时改变 batch_size, 而且 shuffle 会比使用 numpy 更加均匀。 56 | - 添加序列长度信息。在这里所有的序列都截断或者padding为固定长度,在误差计算中没有处理padding部分,可能会使准确率下降。在使用 dynamic_rnn 的时候加上 sequence_length 信息,在计算的时候忽略 padding 部分。同时结合 tf.train.SequenceExample() 和 tf.train.batch() 自动 padding,也可以减少数据量。 -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/embed2ndarray.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import word2vec 9 | import pickle 10 | import os 11 | 12 | SPECIAL_SYMBOL = ['', ''] # add these special symbols to word(char) embeddings. 
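# Descriptive note: the two placeholders above are the special symbols referred
# to in data_process/README.md -- a padding symbol and a low-frequency (unknown)
# replacement symbol. They are stacked as rows 0 and 1 of the embedding matrix
# below, so id 0 is later used to pad sequences to a fixed length and id 1 is
# what get_id() in word2id.py / char2id.py returns for out-of-vocabulary tokens.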
13 | 14 | 15 | def get_word_embedding(): 16 | """提取词向量,并保存至 ../data/word_embedding.npy""" 17 | print('getting the word_embedding.npy') 18 | wv = word2vec.load('../raw_data/word_embedding.txt') 19 | word_embedding = wv.vectors 20 | words = wv.vocab 21 | n_special_sym = len(SPECIAL_SYMBOL) 22 | sr_id2word = pd.Series(words, index=range(n_special_sym, n_special_sym + len(words))) 23 | sr_word2id = pd.Series(range(n_special_sym, n_special_sym + len(words)), index=words) 24 | # 添加特殊符号::0, :1 25 | embedding_size = 256 26 | vec_special_sym = np.random.randn(n_special_sym, embedding_size) 27 | for i in range(n_special_sym): 28 | sr_id2word[i] = SPECIAL_SYMBOL[i] 29 | sr_word2id[SPECIAL_SYMBOL[i]] = i 30 | word_embedding = np.vstack([vec_special_sym, word_embedding]) 31 | # 保存词向量 32 | save_path = '../data/' 33 | if not os.path.exists(save_path): 34 | os.makedirs(save_path) 35 | np.save(save_path + 'word_embedding.npy', word_embedding) 36 | # 保存词与id的对应关系 37 | with open(save_path + 'sr_word2id.pkl', 'wb') as outp: 38 | pickle.dump(sr_id2word, outp) 39 | pickle.dump(sr_word2id, outp) 40 | print('Saving the word_embedding.npy to ../data/word_embedding.npy') 41 | 42 | 43 | def get_char_embedding(): 44 | """提取字向量,并保存至 ../data/char_embedding.npy""" 45 | print('getting the char_embedding.npy') 46 | wv = word2vec.load('../raw_data/char_embedding.txt') 47 | char_embedding = wv.vectors 48 | chars = wv.vocab 49 | n_special_sym = len(SPECIAL_SYMBOL) 50 | sr_id2char = pd.Series(chars, index=range(n_special_sym, n_special_sym + len(chars))) 51 | sr_char2id = pd.Series(range(n_special_sym, n_special_sym + len(chars)), index=chars) 52 | 53 | # 添加特殊符号::0, :1 54 | embedding_size = 256 55 | 56 | vec_special_sym = np.random.randn(n_special_sym, embedding_size) 57 | for i in range(n_special_sym): 58 | sr_id2char[i] = SPECIAL_SYMBOL[i] 59 | sr_char2id[SPECIAL_SYMBOL[i]] = i 60 | char_embedding = np.vstack([vec_special_sym, char_embedding]) 61 | # 保存字向量 62 | save_path = '../data/' 63 | if not os.path.exists(save_path): 64 | os.makedirs(save_path) 65 | np.save(save_path + 'char_embedding.npy', char_embedding) 66 | # 保存字与id的对应关系 67 | with open(save_path + 'sr_char2id.pkl', 'wb') as outp: 68 | pickle.dump(sr_id2char, outp) 69 | pickle.dump(sr_char2id, outp) 70 | print('Saving the char_embedding.npy to ../data/char_embedding.npy') 71 | 72 | 73 | if __name__ == '__main__': 74 | get_word_embedding() 75 | get_char_embedding() 76 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/word2id.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | from multiprocessing import Pool 10 | from tqdm import tqdm 11 | import time 12 | 13 | save_path = '../data/' 14 | with open(save_path + 'sr_word2id.pkl', 'rb') as inp: 15 | sr_id2word = pickle.load(inp) 16 | sr_word2id = pickle.load(inp) 17 | dict_word2id = dict() 18 | for i in range(len(sr_word2id)): 19 | dict_word2id[sr_word2id.index[i]] = sr_word2id.values[i] 20 | 21 | 22 | def get_id(word): 23 | """获取 word 所对应的 id. 
24 | 如果该词不在词典中,用 (对应的 ID 为 1 )进行替换。 25 | """ 26 | if word not in dict_word2id: 27 | return 1 28 | else: 29 | return dict_word2id[word] 30 | 31 | 32 | def get_id4words(words): 33 | """把 words 转为 对应的 id""" 34 | words = words.strip().split(',') # 先分开词 35 | ids = list(map(get_id, words)) # 获取id 36 | return ids 37 | 38 | 39 | def test_word2id(): 40 | """把测试集的所有词转成对应的id。""" 41 | time0 = time.time() 42 | print('Processing eval data.') 43 | df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4], 44 | names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object}) 45 | print('test question number %d' % len(df_eval)) 46 | # 没有 title 的问题用 content 来替换 47 | na_title_indexs = list() 48 | for i in range(len(df_eval)): 49 | word_title = df_eval.word_title.values[i] 50 | if type(word_title) is float: 51 | na_title_indexs.append(i) 52 | print('There are %d test questions without title.' % len(na_title_indexs)) 53 | for na_index in na_title_indexs: 54 | df_eval.at[na_index, 'word_title'] = df_eval.at[na_index, 'word_content'] 55 | # 没有 content 的问题用 title 来替换 56 | na_content_indexs = list() 57 | for i in tqdm(range(len(df_eval))): 58 | word_content = df_eval.word_content.values[i] 59 | if type(word_content) is float: 60 | na_content_indexs.append(i) 61 | print('There are %d test questions without content.' % len(na_content_indexs)) 62 | for na_index in tqdm(na_content_indexs): 63 | df_eval.at[na_index, 'word_content'] = df_eval.at[na_index, 'word_title'] 64 | # 转为 id 形式 65 | p = Pool() 66 | eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values)) 67 | np.save('../data/wd_eval_title.npy', eval_title) 68 | eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values)) 69 | np.save('../data/wd_eval_content.npy', eval_content) 70 | p.close() 71 | p.join() 72 | print('Finished changing the eval words to ids. Costed time %g s' % (time.time() - time0)) 73 | 74 | 75 | def train_word2id(): 76 | """把训练集的所有词转成对应的id。""" 77 | time0 = time.time() 78 | print('Processing train data.') 79 | df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 2, 4], 80 | names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object}) 81 | print('training question number %d ' % len(df_train)) 82 | # 没有 content 的问题用 title 来替换 83 | na_content_indexs = list() 84 | for i in tqdm(range(len(df_train))): 85 | word_content = df_train.word_content.values[i] 86 | if type(word_content) is float: 87 | na_content_indexs.append(i) 88 | print('There are %d train questions without content.' % len(na_content_indexs)) 89 | for na_index in tqdm(na_content_indexs): 90 | df_train.at[na_index, 'word_content'] = df_train.at[na_index, 'word_title'] 91 | # 没有 title 的问题, 丢弃 92 | na_title_indexs = list() 93 | for i in range(len(df_train)): 94 | word_title = df_train.word_title.values[i] 95 | if type(word_title) is float: 96 | na_title_indexs.append(i) 97 | print('There are %d train questions without title.' 
% len(na_title_indexs)) 98 | df_train = df_train.drop(na_title_indexs) 99 | print('After dropping, training question number(should be 2999952) = %d' % len(df_train)) 100 | # 转为 id 形式 101 | p = Pool() 102 | train_title = np.asarray(p.map(get_id4words, df_train.word_title.values)) 103 | np.save('../data/wd_train_title.npy', train_title) 104 | train_content = np.asarray(p.map(get_id4words, df_train.word_content.values)) 105 | np.save('../data/wd_train_content.npy', train_content) 106 | p.close() 107 | p.join() 108 | print('Finished changing the training words to ids. Costed time %g s' % (time.time() - time0)) 109 | 110 | 111 | if __name__ == '__main__': 112 | test_word2id() 113 | train_word2id() 114 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_4_han/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/seg_valid/' 30 | data_test_path = '../../data/wd-data/seg_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | 
print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.HAN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_2_hcnn/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/seg_valid/' 30 | data_test_path = '../../data/wd-data/seg_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | 
"""get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.HCNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_3_bigru/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 
| sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(range(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(range(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - 
time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.BiGRU(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_6_rcnn/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid 
p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.RCNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_1_cnn_concat/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a 
batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = map(lambda label: label.argsort()[-1:-6:-1], predict_labels) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.TextCNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_2_cnn_max/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | 
sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - 
time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.TextCNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_5_bigru_cnn/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local 
valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.BiGRU_CNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/char2id.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | from multiprocessing import Pool 10 | from tqdm import tqdm 11 | import time 12 | 13 | 14 | save_path = '../data/' 15 | with open(save_path + 'sr_char2id.pkl', 'rb') as inp: 16 | sr_id2char = pickle.load(inp) 17 | sr_char2id = pickle.load(inp) 18 | dict_char2id = dict() 19 | for i in range(len(sr_char2id)): 20 | dict_char2id[sr_char2id.index[i]] = sr_char2id.values[i] 21 | 22 | 23 | def get_id(char): 24 | """获取 char 所对应的 id. 
25 | 如果该字不在字典中,用1进行替换。 26 | """ 27 | if char not in dict_char2id: 28 | return 1 29 | else: 30 | return dict_char2id[char] 31 | 32 | 33 | def get_id4chars(chars): 34 | """把 chars 转为 对应的 id""" 35 | chars = chars.strip().split(',') # 先分开字 36 | ids = list(map(get_id, chars)) # 获取id 37 | return ids 38 | 39 | 40 | def test_char2id(): 41 | """把测试集的所有字转成对应的id。""" 42 | time0 = time.time() 43 | print('Processing eval data.') 44 | df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 1, 3], 45 | names=['question_id', 'char_title', 'char_content'], dtype={'question_id': object}) 46 | print('test question number %d' % len(df_eval)) 47 | # 没有 title 的问题用 content 来替换 48 | na_title_indexs = list() 49 | for i in range(len(df_eval)): 50 | char_title = df_eval.char_title.values[i] 51 | if type(char_title) is float: 52 | na_title_indexs.append(i) 53 | print('There are %d test questions without title.' % len(na_title_indexs)) 54 | for na_index in na_title_indexs: 55 | df_eval.at[na_index, 'char_title'] = df_eval.at[na_index, 'char_content'] 56 | # 没有 content 的问题用 title 来替换 57 | na_content_indexs = list() 58 | for i in tqdm(range(len(df_eval))): 59 | char_content = df_eval.char_content.values[i] 60 | if type(char_content) is float: 61 | na_content_indexs.append(i) 62 | print('There are %d test questions without content.' % len(na_content_indexs)) 63 | for na_index in tqdm(na_content_indexs): 64 | df_eval.at[na_index, 'char_content'] = df_eval.at[na_index, 'char_title'] 65 | # 转为 id 形式 66 | p = Pool() 67 | eval_title = np.asarray(p.map(get_id4chars, df_eval.char_title.values)) 68 | np.save('../data/ch_eval_title.npy', eval_title) 69 | eval_content = np.asarray(p.map(get_id4chars, df_eval.char_content.values)) 70 | np.save('../data/ch_eval_content.npy', eval_content) 71 | p.close() 72 | p.join() 73 | print('Finished changing the eval chars to ids. Costed time %g s' % (time.time()-time0)) 74 | 75 | 76 | def train_char2id(): 77 | """把训练集的所有字转成对应的id。""" 78 | time0 = time.time() 79 | print('Processing train data.') 80 | df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 1, 3], 81 | names=['question_id', 'char_title', 'char_content'], dtype={'question_id': object}) 82 | print('training question number %d ' % len(df_train)) 83 | # 没有 content 的问题用 title 来替换 84 | na_content_indexs = list() 85 | for i in tqdm(range(len(df_train))): 86 | char_content = df_train.char_content.values[i] 87 | if type(char_content) is float: 88 | na_content_indexs.append(i) 89 | print('There are %d train questions without content.' % len(na_content_indexs)) 90 | for na_index in tqdm(na_content_indexs): 91 | df_train.at[na_index, 'char_content'] = df_train.at[na_index, 'char_title'] 92 | # 没有 title 的问题, 与词一样丢弃下面样本 93 | na_title_indexs = [328877, 422123, 633584, 768738, 818616, 876828, 1273673, 1527297, 94 | 1636237, 1682969, 2052477, 2628516, 2657464, 2904162, 2993517] 95 | for i in range(len(df_train)): 96 | char_title = df_train.char_title.values[i] 97 | if type(char_title) is float: 98 | na_title_indexs.append(i) 99 | print('There are %d train questions without title.' 
% len(na_title_indexs)) 100 | df_train = df_train.drop(na_title_indexs) 101 | print('After dropping, training question number(should be 2999952) = %d' % len(df_train)) 102 | # 转为 id 形式 103 | p = Pool() 104 | train_title = np.asarray(list(p.map(get_id4chars, df_train.char_title.values))) 105 | np.save('../data/ch_train_title.npy', train_title) 106 | train_content = np.asarray(p.map(get_id4chars, df_train.char_content.values)) 107 | np.save('../data/ch_train_content.npy', train_content) 108 | p.close() 109 | p.join() 110 | print('Finished changing the training chars to ids. Costed time %g s' % (time.time() - time0)) 111 | 112 | 113 | if __name__ == '__main__': 114 | test_char2id() 115 | train_char2id() 116 | 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/creat_batch_seg.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | from multiprocessing import Pool 8 | import sys 9 | import os 10 | 11 | sys.path.append('../') 12 | from data_helpers import pad_X30 13 | from data_helpers import pad_X52 14 | from data_helpers import wd_pad_cut_docs 15 | from data_helpers import ch_pad_cut_docs 16 | from data_helpers import train_batch 17 | from data_helpers import eval_batch 18 | 19 | 20 | wd_train_path = '../data/wd-data/seg_train/' 21 | wd_valid_path = '../data/wd-data/seg_valid/' 22 | wd_test_path = '../data/wd-data/seg_test/' 23 | ch_train_path = '../data/ch-data/seg_train/' 24 | ch_valid_path = '../data/ch-data/seg_valid/' 25 | ch_test_path = '../data/ch-data/seg_test/' 26 | paths = [wd_train_path, wd_valid_path, wd_test_path, 27 | ch_train_path, ch_valid_path, ch_test_path] 28 | for each in paths: 29 | if not os.path.exists(each): 30 | os.makedirs(each) 31 | 32 | 33 | # word 数据打包 34 | def wd_train_get_batch(title_len=30, batch_size=128): 35 | print('loading word train_title and train_content, this should cost minutes, please wait.') 36 | train_title = np.load('../data/wd_train_title.npy') 37 | train_content = np.load('../data/wd_train_content.npy') 38 | p = Pool(6) 39 | X_title = np.asarray(p.map(pad_X30, train_title)) 40 | X_content = np.asarray(p.map(wd_pad_cut_docs, train_content)) 41 | p.close() 42 | p.join() 43 | X_content.shape = [-1, 30*10] 44 | X = np.hstack([X_title, X_content]) 45 | y = np.load('../data/y_tr.npy') 46 | # 划分验证集 47 | sample_num = X.shape[0] 48 | np.random.seed(13) 49 | valid_num = 100000 50 | new_index = np.random.permutation(sample_num) 51 | X = X[new_index] 52 | y = y[new_index] 53 | X_valid = X[:valid_num] 54 | y_valid = y[:valid_num] 55 | X_train = X[valid_num:] 56 | y_train = y[valid_num:] 57 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape) 58 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape) 59 | # 验证集打 batch 60 | print('creating batch data.') 61 | sample_num = len(X_valid) 62 | print('valid_sample_num=%d' % sample_num) 63 | train_batch(X_valid, y_valid, wd_valid_path, batch_size) 64 | # 训练集打 batch 65 | sample_num = len(X_train) 66 | print('train_sample_num=%d' % sample_num) 67 | train_batch(X_train, y_train, wd_train_path, batch_size) 68 | 69 | 70 | def wd_test_get_batch(title_len=30, batch_size=128): 71 | print('loading word eval_title and eval_content.') 72 | eval_title = np.load('../data/wd_eval_title.npy') 73 | eval_content = 
np.load('../data/wd_eval_content.npy') 74 | p = Pool(6) 75 | X_title = np.asarray(p.map(pad_X30, eval_title)) 76 | X_content = np.asarray(p.map(wd_pad_cut_docs, eval_content)) 77 | p.close() 78 | p.join() 79 | X_content.shape = [-1, 30*10] 80 | X = np.hstack([X_title, X_content]) 81 | sample_num = len(X) 82 | print('eval_sample_num=%d' % sample_num) 83 | eval_batch(X, wd_test_path, batch_size) 84 | 85 | 86 | # char 数据打包 87 | def ch_train_get_batch(title_len=52, batch_size=128): 88 | print('loading char train_title and train_content, this should cost minutes, please wait.') 89 | train_title = np.load('../data/ch_train_title.npy') 90 | train_content = np.load('../data/ch_train_content.npy') 91 | p = Pool(8) 92 | X_title = np.asarray(p.map(pad_X52, train_title)) 93 | X_content = np.asarray(p.map(ch_pad_cut_docs, train_content)) 94 | p.close() 95 | p.join() 96 | X_content.shape = [-1, 52*10] 97 | X = np.hstack([X_title, X_content]) 98 | y = np.load('../data/y_tr.npy') 99 | # 划分验证集 100 | sample_num = X.shape[0] 101 | np.random.seed(13) 102 | valid_num = 100000 103 | new_index = np.random.permutation(sample_num) 104 | X = X[new_index] 105 | y = y[new_index] 106 | X_valid = X[:valid_num] 107 | y_valid = y[:valid_num] 108 | X_train = X[valid_num:] 109 | y_train = y[valid_num:] 110 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape) 111 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape) 112 | # 验证集打batch 113 | print('creating batch data.') 114 | sample_num = len(X_valid) 115 | print('valid_sample_num=%d' % sample_num) 116 | train_batch(X_valid, y_valid, ch_valid_path, batch_size) 117 | # 训练集打batch 118 | sample_num = len(X_train) 119 | print('train_sample_num=%d' % sample_num) 120 | train_batch(X_train, y_train, ch_train_path, batch_size) 121 | 122 | 123 | def ch_test_get_batch(title_len=52, batch_size=128): 124 | print('loading char eval_title and eval_content.') 125 | eval_title = np.load('../data/ch_eval_title.npy') 126 | eval_content = np.load('../data/ch_eval_content.npy') 127 | p = Pool() 128 | X_title = np.asarray(p.map(pad_X52, eval_title)) 129 | X_content = np.asarray(p.map(ch_pad_cut_docs, eval_content)) 130 | p.close() 131 | p.join() 132 | X_content.shape = [-1, 52*10] 133 | X = np.hstack([X_title, X_content]) 134 | sample_num = len(X) 135 | print('eval_sample_num=%d' % sample_num) 136 | eval_batch(X, ch_test_path, batch_size) 137 | 138 | 139 | if __name__ == '__main__': 140 | wd_train_get_batch() 141 | wd_test_get_batch() 142 | ch_train_get_batch() 143 | ch_test_get_batch() 144 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/creat_batch_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | from multiprocessing import Pool 10 | import sys 11 | import os 12 | 13 | sys.path.append('../') 14 | from data_helpers import pad_X30 15 | from data_helpers import pad_X150 16 | from data_helpers import pad_X52 17 | from data_helpers import pad_X300 18 | from data_helpers import train_batch 19 | from data_helpers import eval_batch 20 | 21 | """ 把所有的数据按照 batch_size(128) 进行打包。取 10万 样本作为验证集。 22 | word_title_len = 30. 23 | word_content_len = 150. 24 | char_title_len = 52. 25 | char_content_len = 300. 
26 | """ 27 | 28 | 29 | wd_train_path = '../data/wd-data/data_train/' 30 | wd_valid_path = '../data/wd-data/data_valid/' 31 | wd_test_path = '../data/wd-data/data_test/' 32 | ch_train_path = '../data/ch-data/data_train/' 33 | ch_valid_path = '../data/ch-data/data_valid/' 34 | ch_test_path = '../data/ch-data/data_test/' 35 | paths = [wd_train_path, wd_valid_path, wd_test_path, 36 | ch_train_path, ch_valid_path, ch_test_path] 37 | for each in paths: 38 | if not os.path.exists(each): 39 | os.makedirs(each) 40 | 41 | with open('../data/sr_topic2id.pkl', 'rb') as inp: 42 | sr_topic2id = pickle.load(inp) 43 | 44 | dict_topic2id = dict() 45 | for i in range(len(sr_topic2id)): 46 | dict_topic2id[sr_topic2id.index[i]] = sr_topic2id.values[i] 47 | 48 | 49 | def topics2ids(topics): 50 | """把 chars 转为 对应的 id""" 51 | topics = topics.split(',') 52 | ids = list(map(lambda topic: dict_topic2id[topic], topics)) # 获取id 53 | return ids 54 | 55 | 56 | def get_lables(): 57 | """获取训练集所有样本的标签。注意之前在处理数据时丢弃了部分没有 title 的样本。""" 58 | df_question_topic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\t', 59 | names=['questions', 'topics'], dtype={'questions': object, 'topics': object}) 60 | na_title_indexs = [328877, 422123, 633584, 768738, 818616, 876828, 1273673, 1527297, 61 | 1636237, 1682969, 2052477, 2628516, 2657464, 2904162, 2993517] 62 | df_question_topic = df_question_topic.drop(na_title_indexs) 63 | p = Pool() 64 | y = p.map(topics2ids, df_question_topic.topics.values) 65 | p.close() 66 | p.join() 67 | return np.asarray(y) 68 | 69 | 70 | # word 数据打包 71 | def wd_train_get_batch(title_len=30, content_len=150, batch_size=128): 72 | print('loading word train_title and train_content.') 73 | train_title = np.load('../data/wd_train_title.npy') 74 | train_content = np.load('../data/wd_train_content.npy') 75 | p = Pool() 76 | X_title = np.asarray(p.map(pad_X30, train_title)) 77 | X_content = np.asarray(p.map(pad_X150, train_content)) 78 | p.close() 79 | p.join() 80 | X = np.hstack([X_title, X_content]) 81 | print('getting labels, this should cost minutes, please wait.') 82 | y = get_lables() 83 | print('y.shape=', y.shape) 84 | np.save('../data/y_tr.npy', y) 85 | # 划分验证集 86 | sample_num = X.shape[0] 87 | np.random.seed(13) 88 | valid_num = 100000 89 | new_index = np.random.permutation(sample_num) 90 | X = X[new_index] 91 | y = y[new_index] 92 | X_valid = X[:valid_num] 93 | y_valid = y[:valid_num] 94 | X_train = X[valid_num:] 95 | y_train = y[valid_num:] 96 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape) 97 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape) 98 | print('creating batch data.') 99 | # 验证集打batch 100 | sample_num = len(X_valid) 101 | print('valid_sample_num=%d' % sample_num) 102 | train_batch(X_valid, y_valid, wd_valid_path, batch_size) 103 | # 训练集打batch 104 | sample_num = len(X_train) 105 | print('train_sample_num=%d' % sample_num) 106 | train_batch(X_train, y_train, wd_train_path, batch_size) 107 | 108 | 109 | def wd_test_get_batch(title_len=30, content_len=150, batch_size=128): 110 | eval_title = np.load('../data/wd_eval_title.npy') 111 | eval_content = np.load('../data/wd_eval_content.npy') 112 | p = Pool() 113 | X_title = np.asarray(p.map(pad_X30, eval_title)) 114 | X_content = np.asarray(p.map(pad_X150, eval_content)) 115 | p.close() 116 | p.join() 117 | X = np.hstack([X_title, X_content]) 118 | sample_num = len(X) 119 | print('eval_sample_num=%d' % sample_num) 120 | eval_batch(X, wd_test_path, batch_size) 121 | 122 | 123 | # char 数据打包 
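# Note: pad_X30 / pad_X52 / pad_X150 / pad_X300 used above and below are imported from
# data_helpers.py, which is not included in this dump. Judging from how they are used
# (each variable-length id sequence must become one fixed-length row before np.hstack),
# such a helper is assumed to behave roughly like the minimal sketch below; the pad id 0,
# the helper name and the truncation policy are assumptions, not facts from the original code.
def _pad_to_fixed_len(word_ids, fixed_len=30, pad_id=0):
    """Minimal sketch: truncate or right-pad `word_ids` to exactly `fixed_len` entries."""
    word_ids = list(word_ids)[:fixed_len]                   # keep at most fixed_len ids
    word_ids += [pad_id] * (fixed_len - len(word_ids))      # right-pad the remainder with pad_id
    return word_ids
# e.g. _pad_to_fixed_len([5, 8, 2], fixed_len=5) -> [5, 8, 2, 0, 0]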
124 | def ch_train_get_batch(title_len=52, content_len=300, batch_size=128): 125 | print('loading char train_title and train_content.') 126 | train_title = np.load('../data/ch_train_title.npy') 127 | train_content = np.load('../data/ch_train_content.npy') 128 | p = Pool() 129 | X_title = np.asarray(p.map(pad_X52, train_title)) 130 | X_content = np.asarray(p.map(pad_X300, train_content)) 131 | p.close() 132 | p.join() 133 | X = np.hstack([X_title, X_content]) 134 | y = np.load('../data/y_tr.npy') 135 | # 划分验证集 136 | sample_num = X.shape[0] 137 | np.random.seed(13) 138 | valid_num = 100000 139 | new_index = np.random.permutation(sample_num) 140 | X = X[new_index] 141 | y = y[new_index] 142 | X_valid = X[:valid_num] 143 | y_valid = y[:valid_num] 144 | X_train = X[valid_num:] 145 | y_train = y[valid_num:] 146 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape) 147 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape) 148 | # 验证集打batch 149 | print('creating batch data.') 150 | sample_num = len(X_valid) 151 | print('valid_sample_num=%d' % sample_num) 152 | train_batch(X_valid, y_valid, ch_valid_path, batch_size) 153 | # 训练集打batch 154 | sample_num = len(X_train) 155 | print('train_sample_num=%d' % sample_num) 156 | train_batch(X_train, y_train, ch_train_path, batch_size) 157 | 158 | 159 | def ch_test_get_batch(title_len=52, content_len=300, batch_size=128): 160 | eval_title = np.load('../data/ch_eval_title.npy') 161 | eval_content = np.load('../data/ch_eval_content.npy') 162 | p = Pool() 163 | X_title = np.asarray(p.map(pad_X52, eval_title)) 164 | X_content = np.asarray(p.map(pad_X300, eval_content)) 165 | p.close() 166 | p.join() 167 | X = np.hstack([X_title, X_content]) 168 | sample_num = len(X) 169 | print('eval_sample_num=%d' % sample_num) 170 | eval_batch(X, ch_test_path, batch_size) 171 | 172 | 173 | if __name__ == '__main__': 174 | wd_train_get_batch() 175 | wd_test_get_batch() 176 | ch_train_get_batch() 177 | ch_test_get_batch() 178 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_1_cnn_concat/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3') 23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.40, 'if valid_f1 > last_f1, save new model. 
default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/data_train/' 46 | data_valid_path = '../../data/wd-data/data_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, 
model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == (global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.TextCNN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 
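            # What the two scopes below do: Optimizer1 minimises the loss over the full
            # trainable-variable list, so the word-embedding matrix is fine-tuned as well;
            # Optimizer2 filters out every variable whose name contains 'embedding', which
            # keeps the pre-trained word vectors frozen. main() trains with Optimizer2 first
            # and switches to Optimizer1 once `max_epoch` epochs have passed, which is what
            # the max_epoch flag ("update the embedding after max_epoch") refers to.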
157 |             with tf.variable_scope('Optimizer1'):
158 |                 tvars1 = tf.trainable_variables()
159 |                 train_op1 = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(model.loss, global_step=model.global_step, var_list=tvars1)
160 | 
161 |             with tf.variable_scope('Optimizer2'):
162 |                 tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
163 |                 train_op2 = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(model.loss, global_step=model.global_step, var_list=tvars2)
164 | 
165 |             update_op = tf.group(*model.update_emas)
166 |             merged = tf.summary.merge_all()  # summary
167 |             train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
168 |             test_writer = tf.summary.FileWriter(summary_path + 'test')
169 |             training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
170 | 
171 |         # If a model has been saved before, restore the latest checkpoint
172 |         if os.path.exists(ckpt_path + "checkpoint"):
173 |             print("Restoring Variables from Checkpoint...")
174 |             model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
175 |             last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
176 |             print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
177 |             sess.run(tf.variables_initializer(training_ops))
178 |         else:
179 |             print('Initializing Variables...')
180 |             sess.run(tf.global_variables_initializer())
181 | 
182 |         print('3.Begin training...')
183 |         print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
184 |         for epoch in range(FLAGS.max_max_epoch):
185 |             global_step = sess.run(model.global_step)
186 |             print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
187 |             if epoch >= FLAGS.max_epoch:  # fine-tune the embedding from this epoch on
188 |                 train_op = train_op1
189 |             else:
190 |                 train_op = train_op2
191 | 
192 |             train_fetches = [merged, model.loss, train_op, update_op]
193 |             valid_fetches = [merged, model.loss]
194 |             train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
195 |         # Run one final validation pass
196 |         valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
197 |         print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
198 |             sess.run(model.global_step), valid_cost, precision, recall, f1))
199 |         if f1 > last_f1:  # save the better model
200 |             saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
201 |             print('saved new model to %s ' % saving_path)
202 | 
203 | 
204 | if __name__ == '__main__':
205 |     tf.app.run()
206 | 
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_1_1_cnn_concat/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | import tensorflow as tf
4 | 
5 | """wd_1_1_cnn_concat
6 | The title part uses a TextCNN; the content part uses a TextCNN; the two outputs are concatenated directly.
7 | """
8 | 
9 | 
10 | class Settings(object):
11 |     def __init__(self):
12 |         self.model_name = 'wd_1_1_cnn_concat'
13 |         self.title_len = 30
14 |         self.content_len = 150
15 |         self.filter_sizes = [2, 3, 4, 5, 7]
16 |         self.n_filter = 256
17 |         self.fc_hidden_size = 1024
18 |         self.n_class = 1999
19 |         self.summary_path = '../../summary/' + self.model_name + '/'
20 |         self.ckpt_path = '../../ckpt/' + self.model_name + '/'
21 | 
22 | 
23 | class TextCNN(object):
24 |     """
25 |     title: inputs->textcnn->output_title
26 |     content: inputs->textcnn->output_content
27 |     concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
28 | """ 29 | 30 | def __init__(self, W_embedding, settings): 31 | self.model_name = settings.model_name 32 | self.title_len = settings.title_len 33 | self.content_len = settings.content_len 34 | self.filter_sizes = settings.filter_sizes 35 | self.n_filter = settings.n_filter 36 | self.n_filter_total = self.n_filter * len(self.filter_sizes) 37 | self.n_class = settings.n_class 38 | self.fc_hidden_size = settings.fc_hidden_size 39 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 40 | self.update_emas = list() 41 | # placeholders 42 | self._tst = tf.placeholder(tf.bool) 43 | self._keep_prob = tf.placeholder(tf.float32, []) 44 | self._batch_size = tf.placeholder(tf.int32, []) 45 | 46 | with tf.name_scope('Inputs'): 47 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 48 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 49 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 50 | 51 | with tf.variable_scope('embedding'): 52 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 53 | initializer=tf.constant_initializer(W_embedding), trainable=True) 54 | self.embedding_size = W_embedding.shape[1] 55 | 56 | with tf.variable_scope('cnn_text'): 57 | output_title = self.cnn_inference(self._X1_inputs, self.title_len) 58 | 59 | with tf.variable_scope('hcnn_content'): 60 | output_content = self.cnn_inference(self._X2_inputs, self.content_len) 61 | 62 | with tf.variable_scope('fc-bn-layer'): 63 | output = tf.concat([output_title, output_content], axis=1) 64 | W_fc = self.weight_variable([self.n_filter_total * 2, self.fc_hidden_size], name='Weight_fc') 65 | tf.summary.histogram('W_fc', W_fc) 66 | h_fc = tf.matmul(output, W_fc, name='h_fc') 67 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 68 | tf.summary.histogram('beta_fc', beta_fc) 69 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 70 | self.update_emas.append(update_ema_fc) 71 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 72 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 73 | 74 | with tf.variable_scope('out_layer'): 75 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 76 | tf.summary.histogram('Weight_out', W_out) 77 | b_out = self.bias_variable([self.n_class], name='bias_out') 78 | tf.summary.histogram('bias_out', b_out) 79 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 80 | 81 | with tf.name_scope('loss'): 82 | self._loss = tf.reduce_mean( 83 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 84 | tf.summary.scalar('loss', self._loss) 85 | 86 | self.saver = tf.train.Saver(max_to_keep=2) 87 | 88 | @property 89 | def tst(self): 90 | return self._tst 91 | 92 | @property 93 | def keep_prob(self): 94 | return self._keep_prob 95 | 96 | @property 97 | def batch_size(self): 98 | return self._batch_size 99 | 100 | @property 101 | def global_step(self): 102 | return self._global_step 103 | 104 | @property 105 | def X1_inputs(self): 106 | return self._X1_inputs 107 | 108 | @property 109 | def X2_inputs(self): 110 | return self._X2_inputs 111 | 112 | @property 113 | def y_inputs(self): 114 | return self._y_inputs 115 | 116 | @property 117 | def y_pred(self): 118 | return self._y_pred 119 | 120 | @property 121 | def loss(self): 122 | return self._loss 123 | 124 | def weight_variable(self, shape, 
name): 125 | """Create a weight variable with appropriate initialization.""" 126 | initial = tf.truncated_normal(shape, stddev=0.1) 127 | return tf.Variable(initial, name=name) 128 | 129 | def bias_variable(self, shape, name): 130 | """Create a bias variable with appropriate initialization.""" 131 | initial = tf.constant(0.1, shape=shape) 132 | return tf.Variable(initial, name=name) 133 | 134 | def batchnorm(self, Ylogits, offset, convolutional=False): 135 | """batchnormalization. 136 | Args: 137 | Ylogits: 1D向量或者是3D的卷积结果。 138 | num_updates: 迭代的global_step 139 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 140 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 141 | m: 表示batch均值;v:表示batch方差。 142 | bnepsilon:一个很小的浮点数,防止除以 0. 143 | Returns: 144 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 145 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 146 | """ 147 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, 148 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations 149 | bnepsilon = 1e-5 150 | if convolutional: 151 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 152 | else: 153 | mean, variance = tf.nn.moments(Ylogits, [0]) 154 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 155 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 156 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 157 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 158 | return Ybn, update_moving_everages 159 | 160 | def cnn_inference(self, X_inputs, n_step): 161 | """TextCNN 模型。 162 | Args: 163 | X_inputs: tensor.shape=(batch_size, n_step) 164 | Returns: 165 | title_outputs: tensor.shape=(batch_size, self.n_filter_total) 166 | """ 167 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 168 | inputs = tf.expand_dims(inputs, -1) 169 | pooled_outputs = list() 170 | for i, filter_size in enumerate(self.filter_sizes): 171 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 172 | # Convolution Layer 173 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter] 174 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter') 175 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter') 176 | tf.summary.histogram('beta', beta) 177 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 178 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) # 在激活层前面加 BN 179 | # Apply nonlinearity, batch norm scaling is not useful with relus 180 | # batch norm offsets are used instead of biases,使用 BN 层的 offset,不要 biases 181 | h = tf.nn.relu(conv_bn, name="relu") 182 | # Maxpooling over the outputs 183 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 184 | strides=[1, 1, 1, 1], padding='VALID', name="pool") 185 | pooled_outputs.append(pooled) 186 | self.update_emas.append(update_ema) 187 | h_pool = tf.concat(pooled_outputs, 3) 188 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total]) 189 | return h_pool_flat # shape = [batch_size, self.n_filter_total] 190 | 191 | 192 | # test the model 193 | # def test(): 194 | # import numpy as np 195 | # print('Begin testing...') 196 | # settings = Settings() 197 | # W_embedding = np.random.randn(50, 10) 198 | # config = tf.ConfigProto() 199 | # config.gpu_options.allow_growth = True 200 | # batch_size = 128 201 | # with tf.Session(config=config) as sess: 202 | # model = TextCNN(W_embedding, settings) 203 | 
# optimizer = tf.train.AdamOptimizer(0.001) 204 | # train_op = optimizer.minimize(model.loss) 205 | # update_op = tf.group(*model.update_emas) 206 | # sess.run(tf.global_variables_initializer()) 207 | # fetch = [model.loss, model.y_pred, train_op, update_op] 208 | # loss_list = list() 209 | # for i in xrange(100): 210 | # X1_batch = np.zeros((batch_size, 30), dtype=float) 211 | # X2_batch = np.zeros((batch_size, 150), dtype=float) 212 | # y_batch = np.zeros((batch_size, 1999), dtype=int) 213 | # _batch_size = len(y_batch) 214 | # feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 215 | # model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 216 | # loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 217 | # loss_list.append(loss) 218 | # print(i, loss) 219 | # 220 | # if __name__ == '__main__': 221 | # test() 222 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_2_cnn_max/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | """wd_1_2_cnn_max 6 | title 部分使用 TextCNN;content 部分使用 TextCNN; 两部分输出按位取 max。 7 | """ 8 | 9 | 10 | class Settings(object): 11 | def __init__(self): 12 | self.model_name = 'wd_1_2_cnn_max' 13 | self.title_len = 30 14 | self.content_len = 150 15 | self.filter_sizes = [2, 3, 4, 5, 7] 16 | self.n_filter = 256 17 | self.fc_hidden_size = 1024 18 | self.n_class = 1999 19 | self.summary_path = '../../summary/' + self.model_name + '/' 20 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 21 | 22 | 23 | class TextCNN(object): 24 | """ 25 | title: inputs->textcnn->output_title 26 | content: inputs->textcnn->output_content 27 | max[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 
28 | """ 29 | 30 | def __init__(self, W_embedding, settings): 31 | self.model_name = settings.model_name 32 | self.title_len = settings.title_len 33 | self.content_len = settings.content_len 34 | self.filter_sizes = settings.filter_sizes 35 | self.n_filter = settings.n_filter 36 | self.n_filter_total = self.n_filter * len(self.filter_sizes) 37 | self.n_class = settings.n_class 38 | self.fc_hidden_size = settings.fc_hidden_size 39 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 40 | self.update_emas = list() 41 | # placeholders 42 | self._tst = tf.placeholder(tf.bool) 43 | self._keep_prob = tf.placeholder(tf.float32, []) 44 | self._batch_size = tf.placeholder(tf.int32, []) 45 | 46 | with tf.name_scope('Inputs'): 47 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 48 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 49 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 50 | 51 | with tf.variable_scope('embedding'): 52 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 53 | initializer=tf.constant_initializer(W_embedding), trainable=True) 54 | self.embedding_size = W_embedding.shape[1] 55 | 56 | with tf.variable_scope('cnn_text'): 57 | output_title = self.cnn_inference(self._X1_inputs, self.title_len) 58 | output_title = tf.expand_dims(output_title, 0) 59 | 60 | with tf.variable_scope('hcnn_content'): 61 | output_content = self.cnn_inference(self._X2_inputs, self.content_len) 62 | output_content = tf.expand_dims(output_content, 0) 63 | 64 | with tf.variable_scope('fc-bn-layer'): 65 | output = tf.concat([output_title, output_content], axis=0) 66 | output = tf.reduce_max(output, axis=0) 67 | W_fc = self.weight_variable([self.n_filter_total, self.fc_hidden_size], name='Weight_fc') 68 | tf.summary.histogram('W_fc', W_fc) 69 | h_fc = tf.matmul(output, W_fc, name='h_fc') 70 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 71 | tf.summary.histogram('beta_fc', beta_fc) 72 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 73 | self.update_emas.append(update_ema_fc) 74 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 75 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 76 | 77 | with tf.variable_scope('out_layer'): 78 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 79 | tf.summary.histogram('Weight_out', W_out) 80 | b_out = self.bias_variable([self.n_class], name='bias_out') 81 | tf.summary.histogram('bias_out', b_out) 82 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 83 | 84 | with tf.name_scope('loss'): 85 | self._loss = tf.reduce_mean( 86 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 87 | tf.summary.scalar('loss', self._loss) 88 | 89 | self.saver = tf.train.Saver(max_to_keep=2) 90 | 91 | @property 92 | def tst(self): 93 | return self._tst 94 | 95 | @property 96 | def keep_prob(self): 97 | return self._keep_prob 98 | 99 | @property 100 | def batch_size(self): 101 | return self._batch_size 102 | 103 | @property 104 | def global_step(self): 105 | return self._global_step 106 | 107 | @property 108 | def X1_inputs(self): 109 | return self._X1_inputs 110 | 111 | @property 112 | def X2_inputs(self): 113 | return self._X2_inputs 114 | 115 | @property 116 | def y_inputs(self): 117 | return self._y_inputs 118 | 119 | @property 120 | def 
y_pred(self): 121 | return self._y_pred 122 | 123 | @property 124 | def loss(self): 125 | return self._loss 126 | 127 | def weight_variable(self, shape, name): 128 | """Create a weight variable with appropriate initialization.""" 129 | initial = tf.truncated_normal(shape, stddev=0.1) 130 | return tf.Variable(initial, name=name) 131 | 132 | def bias_variable(self, shape, name): 133 | """Create a bias variable with appropriate initialization.""" 134 | initial = tf.constant(0.1, shape=shape) 135 | return tf.Variable(initial, name=name) 136 | 137 | def batchnorm(self, Ylogits, offset, convolutional=False): 138 | """batchnormalization. 139 | Args: 140 | Ylogits: 1D向量或者是3D的卷积结果。 141 | num_updates: 迭代的global_step 142 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 143 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 144 | m: 表示batch均值;v:表示batch方差。 145 | bnepsilon:一个很小的浮点数,防止除以 0. 146 | Returns: 147 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 148 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 149 | """ 150 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, 151 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations 152 | bnepsilon = 1e-5 153 | if convolutional: 154 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 155 | else: 156 | mean, variance = tf.nn.moments(Ylogits, [0]) 157 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 158 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 159 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 160 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 161 | return Ybn, update_moving_everages 162 | 163 | def cnn_inference(self, X_inputs, n_step): 164 | """TextCNN 模型。 165 | Args: 166 | X_inputs: tensor.shape=(batch_size, n_step) 167 | Returns: 168 | title_outputs: tensor.shape=(batch_size, self.n_filter_total) 169 | """ 170 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 171 | inputs = tf.expand_dims(inputs, -1) 172 | pooled_outputs = list() 173 | for i, filter_size in enumerate(self.filter_sizes): 174 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 175 | # Convolution Layer 176 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter] 177 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter') 178 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter') 179 | tf.summary.histogram('beta', beta) 180 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 181 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) # 在激活层前面加 BN 182 | # Apply nonlinearity, batch norm scaling is not useful with relus 183 | # batch norm offsets are used instead of biases,使用 BN 层的 offset,不要 biases 184 | h = tf.nn.relu(conv_bn, name="relu") 185 | # Maxpooling over the outputs 186 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 187 | strides=[1, 1, 1, 1], padding='VALID', name="pool") 188 | pooled_outputs.append(pooled) 189 | self.update_emas.append(update_ema) 190 | h_pool = tf.concat(pooled_outputs, 3) 191 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total]) 192 | return h_pool_flat # shape = [batch_size, self.n_filter_total] 193 | 194 | 195 | # test the model 196 | # def test(): 197 | # import numpy as np 198 | # print('Begin testing...') 199 | # settings = Settings() 200 | # W_embedding = np.random.randn(50, 10) 201 | # config = tf.ConfigProto() 202 | # 
config.gpu_options.allow_growth = True
203 | #     batch_size = 128
204 | #     with tf.Session(config=config) as sess:
205 | #         model = TextCNN(W_embedding, settings)
206 | #         optimizer = tf.train.AdamOptimizer(0.001)
207 | #         train_op = optimizer.minimize(model.loss)
208 | #         update_op = tf.group(*model.update_emas)
209 | #         sess.run(tf.global_variables_initializer())
210 | #         fetch = [model.loss, model.y_pred, train_op, update_op]
211 | #         loss_list = list()
212 | #         for i in xrange(100):
213 | #             X1_batch = np.zeros((batch_size, 30), dtype=float)
214 | #             X2_batch = np.zeros((batch_size, 150), dtype=float)
215 | #             y_batch = np.zeros((batch_size, 1999), dtype=int)
216 | #             _batch_size = len(y_batch)
217 | #             feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
218 | #                          model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
219 | #             loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
220 | #             loss_list.append(loss)
221 | #             print(i, loss)
222 | #
223 | # if __name__ == '__main__':
224 | #     test()
225 | 
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_4_han/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 | 
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 | 
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 2, 'update the embedding after max_epoch, default: 2')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epochs, default: 6')
22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
23 | flags.DEFINE_float('decay_rate', 0.85, 'decay rate, default: 0.85')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # real-run settings
26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')
29 | 
30 | # quick-test settings
31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/seg_train/' 46 | data_valid_path = '../../data/wd-data/seg_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = (map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.HAN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 189 | train_op = train_op2 190 | for epoch in range(FLAGS.max_max_epoch): 191 | global_step = sess.run(model.global_step) 192 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 193 | if epoch == FLAGS.max_epoch: # update the embedding 194 | train_op = train_op1 195 | train_fetches = [merged, model.loss, train_op, update_op] 196 | valid_fetches = [merged, model.loss] 197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 198 | # 最后再做一次验证 199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 201 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 202 | if f1 > last_f1: # save the better model 203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 204 | print('saved new model to %s ' % saving_path) 205 | 206 | 207 | if __name__ == '__main__': 208 | tf.app.run() 209 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_2_hcnn/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3') 23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/seg_train/' 46 | data_valid_path = '../../data/wd-data/seg_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.HCNN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 189 | train_op = train_op2 190 | for epoch in range(FLAGS.max_max_epoch): 191 | global_step = sess.run(model.global_step) 192 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 193 | if epoch == FLAGS.max_epoch: # update the embedding 194 | train_op = train_op1 195 | train_fetches = [merged, model.loss, train_op, update_op] 196 | valid_fetches = [merged, model.loss] 197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 198 | # 最后再做一次验证 199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 201 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 202 | if f1 > last_f1: # save the better model 203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 204 | print('saved new model to %s ' % saving_path) 205 | 206 | 207 | if __name__ == '__main__': 208 | tf.app.run() 209 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_6_rcnn/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4') 23 | flags.DEFINE_float('decay_rate', 0.75, 'decay rate, default: 0.75') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/data_train/' 46 | data_valid_path = '../../data/wd-data/data_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.RCNN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 189 | train_op = train_op2 190 | for epoch in range(FLAGS.max_max_epoch): 191 | global_step = sess.run(model.global_step) 192 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 193 | if epoch == FLAGS.max_epoch: # update the embedding 194 | train_op = train_op1 195 | train_fetches = [merged, model.loss, train_op, update_op] 196 | valid_fetches = [merged, model.loss] 197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 198 | # 最后再做一次验证 199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 201 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 202 | if f1 > last_f1: # save the better model 203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 204 | print('saved new model to %s ' % saving_path) 205 | 206 | 207 | if __name__ == '__main__': 208 | tf.app.run() 209 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_3_bigru/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4') 23 | flags.DEFINE_float('decay_rate', 0.85, 'decay rate, default: 0.85') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.40, 'if valid_f1 > last_f1, save new model. default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/data_train/' 46 | data_valid_path = '../../data/wd-data/data_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.BiGRU(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | 189 | train_op = train_op2 190 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 191 | for epoch in range(FLAGS.max_max_epoch): 192 | global_step = sess.run(model.global_step) 193 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 194 | if epoch == FLAGS.max_epoch: # update the embedding 195 | train_op = train_op1 196 | train_fetches = [merged, model.loss, train_op, update_op] 197 | valid_fetches = [merged, model.loss] 198 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 199 | # 最后再做一次验证 200 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 201 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 202 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 203 | if f1 > last_f1: # save the better model 204 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 205 | print('saved new model to %s ' % saving_path) 206 | 207 | 208 | if __name__ == '__main__': 209 | tf.app.run() 210 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_5_bigru_cnn/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4') 23 | flags.DEFINE_float('decay_rate', 0.75, 'decay rate, default: 0.75') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/data_train/' 46 | data_valid_path = '../../data/wd-data/data_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = (map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.BiGRU_CNN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 189 | train_op = train_op1 190 | for epoch in range(FLAGS.max_max_epoch): 191 | global_step = sess.run(model.global_step) 192 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 193 | if epoch == FLAGS.max_epoch: # update the embedding 194 | train_op = train_op1 195 | train_fetches = [merged, model.loss, train_op, update_op] 196 | valid_fetches = [merged, model.loss] 197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 198 | # 最后再做一次验证 199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 201 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 202 | if f1 > last_f1: # save the better model 203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 204 | print('saved new model to %s ' % saving_path) 205 | 206 | 207 | if __name__ == '__main__': 208 | tf.app.run() 209 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_2_cnn_max/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3') 23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | 27 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 28 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 29 | flags.DEFINE_float('last_f1', 0.35, 'if valid_f1 > last_f1, save new model. default: 0.40') 30 | 31 | # 测试 32 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 33 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 34 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 35 | FLAGS = flags.FLAGS 36 | 37 | lr = FLAGS.lr 38 | last_f1 = FLAGS.last_f1 39 | settings = network.Settings() 40 | title_len = settings.title_len 41 | summary_path = settings.summary_path 42 | ckpt_path = settings.ckpt_path 43 | model_path = ckpt_path + 'model.ckpt' 44 | 45 | embedding_path = '../../data/word_embedding.npy' 46 | data_train_path = '../../data/wd-data/data_train/' 47 | data_valid_path = '../../data/wd-data/data_valid/' 48 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 49 | va_batches = os.listdir(data_valid_path) 50 | n_tr_batches = len(tr_batches) 51 | n_va_batches = len(va_batches) 52 | 53 | # 测试 54 | # n_tr_batches = 1000 55 | # n_va_batches = 50 56 | 57 | 58 | def get_batch(data_path, batch_id): 59 | """get a batch from data_path""" 60 | new_batch = np.load(data_path + str(batch_id) + '.npz') 61 | X_batch = new_batch['X'] 62 | y_batch = new_batch['y'] 63 | X1_batch = X_batch[:, :title_len] 64 | X2_batch = X_batch[:, title_len:] 65 | return [X1_batch, X2_batch, y_batch] 66 | 67 | 68 | def valid_epoch(data_path, sess, model): 69 | """Test on the valid data.""" 70 | va_batches = os.listdir(data_path) 71 | n_va_batches = len(va_batches) 72 | _costs = 0.0 73 | predict_labels_list = list() # 所有的预测结果 74 | marked_labels_list = list() 75 | for i in range(n_va_batches): 76 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 77 | marked_labels_list.extend(y_batch) 78 | y_batch = to_categorical(y_batch) 79 | _batch_size = len(y_batch) 80 | fetches = [model.loss, model.y_pred] 81 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 82 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 83 | _cost, predict_labels = sess.run(fetches, feed_dict) 84 | _costs += _cost 85 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 86 | predict_labels_list.extend(predict_labels) 87 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 88 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 89 | mean_cost = _costs / n_va_batches 90 | return mean_cost, precision, recall, f1 91 | 92 | 93 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 94 | global last_f1 95 | global lr 96 | time0 = time.time() 97 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 98 | for batch in tqdm(range(n_tr_batches)): 99 | global_step = sess.run(model.global_step) 100 | if 0 == (global_step + 1) % FLAGS.valid_step: 101 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 102 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 103 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 104 | time0 = time.time() 105 | if f1 > last_f1: 106 | last_f1 = f1 107 | saving_path = model.saver.save(sess, model_path, global_step+1) 108 | print('saved new model to %s ' % saving_path) 109 | # training 110 | batch_id = batch_indexs[batch] 111 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 112 | y_batch = to_categorical(y_batch) 113 | _batch_size = len(y_batch) 114 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 115 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 116 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 117 | # valid per 500 steps 118 | if 0 
== (global_step + 1) % 500: 119 | train_writer.add_summary(summary, global_step) 120 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 121 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 122 | y_batch = to_categorical(y_batch) 123 | _batch_size = len(y_batch) 124 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 125 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 126 | summary, _cost = sess.run(valid_fetches, feed_dict) 127 | test_writer.add_summary(summary, global_step) 128 | 129 | 130 | def main(_): 131 | global ckpt_path 132 | global last_f1 133 | if not os.path.exists(ckpt_path): 134 | os.makedirs(ckpt_path) 135 | if not os.path.exists(summary_path): 136 | os.makedirs(summary_path) 137 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 138 | shutil.rmtree(summary_path) 139 | os.makedirs(summary_path) 140 | if not os.path.exists(summary_path): 141 | os.makedirs(summary_path) 142 | 143 | print('1.Loading data...') 144 | W_embedding = np.load(embedding_path) 145 | print('training sample_num = %d' % n_tr_batches) 146 | print('valid sample_num = %d' % n_va_batches) 147 | 148 | # Initial or restore the model 149 | print('2.Building model...') 150 | config = tf.ConfigProto() 151 | config.gpu_options.allow_growth = True 152 | with tf.Session(config=config) as sess: 153 | model = network.TextCNN(W_embedding, settings) 154 | with tf.variable_scope('training_ops') as vs: 155 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 156 | FLAGS.decay_rate, staircase=True) 157 | # two optimizer: op1, update embedding; op2, do not update embedding. 158 | with tf.variable_scope('Optimizer1'): 159 | tvars1 = tf.trainable_variables() 160 | grads1 = tf.gradients(model.loss, tvars1) 161 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 162 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 163 | global_step=model.global_step) 164 | with tf.variable_scope('Optimizer2'): 165 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 166 | grads2 = tf.gradients(model.loss, tvars2) 167 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 168 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 169 | global_step=model.global_step) 170 | update_op = tf.group(*model.update_emas) 171 | merged = tf.summary.merge_all() # summary 172 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 173 | test_writer = tf.summary.FileWriter(summary_path + 'test') 174 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 175 | 176 | # 如果已经保存过模型,导入上次的模型 177 | if os.path.exists(ckpt_path + "checkpoint"): 178 | print("Restoring Variables from Checkpoint...") 179 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 180 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 181 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 182 | sess.run(tf.variables_initializer(training_ops)) 183 | train_op2 = train_op1 184 | else: 185 | print('Initializing Variables...') 186 | sess.run(tf.global_variables_initializer()) 187 | 188 | print('3.Begin training...') 189 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 190 | train_op = train_op2 191 | for epoch in range(FLAGS.max_max_epoch): 192 | global_step = sess.run(model.global_step) 193 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 194 | if epoch == FLAGS.max_epoch: # update the embedding 195 | train_op = train_op1 196 | train_fetches = [merged, model.loss, train_op, update_op] 197 | valid_fetches = [merged, model.loss] 198 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 199 | # 最后再做一次验证 200 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 201 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 202 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 203 | if f1 > last_f1: # save the better model 204 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 205 | print('saved new model to %s ' % saving_path) 206 | 207 | 208 | if __name__ == '__main__': 209 | tf.app.run() 210 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_3_bigru/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import tensorflow.contrib.layers as layers 6 | 7 | """wd_3_bigru 8 | title 部分使用 bigru+attention;content 部分使用 bigru+attention; 两部分输出直接 concat。 9 | """ 10 | 11 | 12 | class Settings(object): 13 | def __init__(self): 14 | self.model_name = 'wd_3_bigru' 15 | self.title_len = 30 16 | self.content_len = 150 17 | self.hidden_size = 256 18 | self.n_layer = 1 19 | self.fc_hidden_size = 1024 20 | self.n_class = 1999 21 | self.summary_path = '../../summary/' + self.model_name + '/' 22 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 23 | 24 | 25 | class BiGRU(object): 26 | """ 27 | title: inputs->bigru+attention->output_title 28 | content: inputs->bigru+attention->output_content 29 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 
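    Shape note (derived from the Settings above, hidden_size=256, fc_hidden_size=1024):
    each bigru+attention branch yields [batch_size, hidden_size*2] = [batch_size, 512],
    so the concat is [batch_size, hidden_size*4] = [batch_size, 1024], which matches the
    W_fc shape [hidden_size*4, fc_hidden_size] in the fc-bn-layer below.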
30 | """ 31 | 32 | def __init__(self, W_embedding, settings): 33 | self.model_name = settings.model_name 34 | self.title_len = settings.title_len 35 | self.content_len = settings.content_len 36 | self.hidden_size = settings.hidden_size 37 | self.n_layer = settings.n_layer 38 | self.n_class = settings.n_class 39 | self.fc_hidden_size = settings.fc_hidden_size 40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 41 | self.update_emas = list() 42 | # placeholders 43 | self._tst = tf.placeholder(tf.bool) 44 | self._keep_prob = tf.placeholder(tf.float32, []) 45 | self._batch_size = tf.placeholder(tf.int32, []) 46 | 47 | with tf.name_scope('Inputs'): 48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 51 | 52 | with tf.variable_scope('embedding'): 53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 54 | initializer=tf.constant_initializer(W_embedding), trainable=True) 55 | self.embedding_size = W_embedding.shape[1] 56 | 57 | with tf.variable_scope('bigru_text'): 58 | output_title = self.bigru_inference(self._X1_inputs) 59 | 60 | with tf.variable_scope('bigru_content'): 61 | output_content = self.bigru_inference(self._X2_inputs) 62 | 63 | with tf.variable_scope('fc-bn-layer'): 64 | output = tf.concat([output_title, output_content], axis=1) 65 | W_fc = self.weight_variable([self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc') 66 | tf.summary.histogram('W_fc', W_fc) 67 | h_fc = tf.matmul(output, W_fc, name='h_fc') 68 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 69 | tf.summary.histogram('beta_fc', beta_fc) 70 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 71 | self.update_emas.append(update_ema_fc) 72 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 73 | 74 | with tf.variable_scope('out_layer'): 75 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 76 | tf.summary.histogram('Weight_out', W_out) 77 | b_out = self.bias_variable([self.n_class], name='bias_out') 78 | tf.summary.histogram('bias_out', b_out) 79 | self._y_pred = tf.nn.xw_plus_b(self.fc_bn_relu, W_out, b_out, name='y_pred') # 每个类别的分数 scores 80 | 81 | with tf.name_scope('loss'): 82 | self._loss = tf.reduce_mean( 83 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 84 | tf.summary.scalar('loss', self._loss) 85 | 86 | self.saver = tf.train.Saver(max_to_keep=1) 87 | 88 | @property 89 | def tst(self): 90 | return self._tst 91 | 92 | @property 93 | def keep_prob(self): 94 | return self._keep_prob 95 | 96 | @property 97 | def batch_size(self): 98 | return self._batch_size 99 | 100 | @property 101 | def global_step(self): 102 | return self._global_step 103 | 104 | @property 105 | def X1_inputs(self): 106 | return self._X1_inputs 107 | 108 | @property 109 | def X2_inputs(self): 110 | return self._X2_inputs 111 | 112 | @property 113 | def y_inputs(self): 114 | return self._y_inputs 115 | 116 | @property 117 | def y_pred(self): 118 | return self._y_pred 119 | 120 | @property 121 | def loss(self): 122 | return self._loss 123 | 124 | def weight_variable(self, shape, name): 125 | """Create a weight variable with appropriate initialization.""" 126 | initial = tf.truncated_normal(shape, stddev=0.1) 127 | return 
tf.Variable(initial, name=name) 128 | 129 | def bias_variable(self, shape, name): 130 | """Create a bias variable with appropriate initialization.""" 131 | initial = tf.constant(0.1, shape=shape) 132 | return tf.Variable(initial, name=name) 133 | 134 | def batchnorm(self, Ylogits, offset, convolutional=False): 135 | """batchnormalization. 136 | Args: 137 | Ylogits: 1D向量或者是3D的卷积结果。 138 | num_updates: 迭代的global_step 139 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 140 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 141 | m: 表示batch均值;v:表示batch方差。 142 | bnepsilon:一个很小的浮点数,防止除以 0. 143 | Returns: 144 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 145 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 146 | """ 147 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations 148 | bnepsilon = 1e-5 149 | if convolutional: 150 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 151 | else: 152 | mean, variance = tf.nn.moments(Ylogits, [0]) 153 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 154 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 155 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 156 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 157 | return Ybn, update_moving_everages 158 | 159 | def gru_cell(self): 160 | with tf.name_scope('gru_cell'): 161 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse) 162 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 163 | 164 | def bi_gru(self, inputs): 165 | """build the bi-GRU network. 返回个所有层的隐含状态。""" 166 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)] 167 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)] 168 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw] 169 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw] 170 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 171 | initial_states_fw=initial_states_fw, 172 | initial_states_bw=initial_states_bw, dtype=tf.float32) 173 | return outputs 174 | 175 | def task_specific_attention(self, inputs, output_size, 176 | initializer=layers.xavier_initializer(), 177 | activation_fn=tf.tanh, scope=None): 178 | """ 179 | Performs task-specific attention reduction, using learned 180 | attention context vector (constant within task of interest). 181 | Args: 182 | inputs: Tensor of shape [batch_size, units, input_size] 183 | `input_size` must be static (known) 184 | `units` axis will be attended over (reduced from output) 185 | `batch_size` will be preserved 186 | output_size: Size of output's inner (feature) dimension 187 | Returns: 188 | outputs: Tensor of shape [batch_size, output_dim]. 
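        Example (shapes only, for the content branch of this model, where
        hidden_size = 256 and content_len = 150, so input_size = output_size = 512):
            inputs [batch_size, 150, 512] -> outputs [batch_size, 512]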
189 | """ 190 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None 191 | with tf.variable_scope(scope or 'attention') as scope: 192 | # u_w, attention 向量 193 | attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size], 194 | initializer=initializer, dtype=tf.float32) 195 | # 全连接层,把 h_i 转为 u_i , shape= [batch_size, units, input_size] -> [batch_size, units, output_size] 196 | input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope) 197 | # 输出 [batch_size, units] 198 | vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True) 199 | attention_weights = tf.nn.softmax(vector_attn, dim=1) 200 | tf.summary.histogram('attention_weigths', attention_weights) 201 | weighted_projection = tf.multiply(inputs, attention_weights) 202 | outputs = tf.reduce_sum(weighted_projection, axis=1) 203 | return outputs # 输出 [batch_size, hidden_size*2] 204 | 205 | def bigru_inference(self, X_inputs): 206 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 207 | output_bigru = self.bi_gru(inputs) 208 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2) 209 | return output_att 210 | 211 | 212 | # test the model 213 | def test(): 214 | import numpy as np 215 | print('Begin testing...') 216 | settings = Settings() 217 | W_embedding = np.random.randn(50, 10) 218 | config = tf.ConfigProto() 219 | config.gpu_options.allow_growth = True 220 | batch_size = 128 221 | with tf.Session(config=config) as sess: 222 | model = BiGRU(W_embedding, settings) 223 | optimizer = tf.train.AdamOptimizer(0.001) 224 | train_op = optimizer.minimize(model.loss) 225 | update_op = tf.group(*model.update_emas) 226 | sess.run(tf.global_variables_initializer()) 227 | fetch = [model.loss, model.y_pred, train_op, update_op] 228 | loss_list = list() 229 | for i in xrange(100): 230 | X1_batch = np.zeros((batch_size, 30), dtype=float) 231 | X2_batch = np.zeros((batch_size, 150), dtype=float) 232 | y_batch = np.zeros((batch_size, 1999), dtype=int) 233 | _batch_size = len(y_batch) 234 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 235 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 236 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 237 | loss_list.append(loss) 238 | print(i, loss) 239 | 240 | if __name__ == '__main__': 241 | test() 242 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_6_rcnn/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import tensorflow.contrib.layers as layers 6 | 7 | """wd_6_rcnn 8 | 在论文 Recurrent Convolutional Neural Networks for Text Classification 中。 9 | 使用 BiRNN 处理,将每个时刻的隐藏状态和原输入拼起来,在进行 max_pooling 操作。 10 | 这里有些不同,首先也是使用 bigru 得到每个时刻的,将每个时刻的隐藏状态和原输入拼起来; 11 | 然后使用输入到 TextCNN 网络中。 12 | """ 13 | 14 | 15 | class Settings(object): 16 | def __init__(self): 17 | self.model_name = "wd_6_rcnn" 18 | self.title_len = 30 19 | self.content_len = 150 20 | self.hidden_size = 256 21 | self.n_layer = 1 22 | self.filter_sizes = [2, 3, 4, 5, 7] 23 | self.n_filter = 256 24 | self.fc_hidden_size = 1024 25 | self.n_class = 1999 26 | self.summary_path = '../../summary/' + self.model_name + '/' 27 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 28 
| 29 | 30 | class RCNN(object): 31 | def __init__(self, W_embedding, settings): 32 | self.model_name = settings.model_name 33 | self.title_len = settings.title_len 34 | self.content_len = settings.content_len 35 | self.hidden_size = settings.hidden_size 36 | self.n_layer = settings.n_layer 37 | self.filter_sizes = settings.filter_sizes 38 | self.n_filter = settings.n_filter 39 | self.n_filter_total = self.n_filter * len(self.filter_sizes) 40 | self.n_class = settings.n_class 41 | self.fc_hidden_size = settings.fc_hidden_size 42 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 43 | self.update_emas = list() 44 | # placeholders 45 | self._tst = tf.placeholder(tf.bool) 46 | self._keep_prob = tf.placeholder(tf.float32, []) 47 | self._batch_size = tf.placeholder(tf.int32, []) 48 | 49 | with tf.name_scope('Inputs'): 50 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 51 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 52 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 53 | 54 | with tf.variable_scope('embedding'): 55 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 56 | initializer=tf.constant_initializer(W_embedding), trainable=True) 57 | self.embedding_size = W_embedding.shape[1] 58 | 59 | with tf.variable_scope('rcnn_text'): 60 | output_title = self.rcnn_inference(self._X1_inputs, self.title_len) 61 | 62 | with tf.variable_scope('rcnn_content'): 63 | output_content = self.rcnn_inference(self._X2_inputs, self.content_len) 64 | 65 | with tf.variable_scope('fc-bn-layer'): 66 | output = tf.concat([output_title, output_content], axis=1) 67 | W_fc = self.weight_variable([self.n_filter_total*2, self.fc_hidden_size], 68 | name='Weight_fc') 69 | tf.summary.histogram('W_fc', W_fc) 70 | h_fc = tf.matmul(output, W_fc, name='h_fc') 71 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 72 | tf.summary.histogram('beta_fc', beta_fc) 73 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 74 | self.update_emas.append(update_ema_fc) 75 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 76 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 77 | 78 | with tf.variable_scope('out_layer'): 79 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 80 | tf.summary.histogram('Weight_out', W_out) 81 | b_out = self.bias_variable([self.n_class], name='bias_out') 82 | tf.summary.histogram('bias_out', b_out) 83 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 84 | 85 | with tf.name_scope('loss'): 86 | self._loss = tf.reduce_mean( 87 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 88 | tf.summary.scalar('loss', self._loss) 89 | 90 | self.saver = tf.train.Saver(max_to_keep=1) 91 | 92 | @property 93 | def tst(self): 94 | return self._tst 95 | 96 | @property 97 | def keep_prob(self): 98 | return self._keep_prob 99 | 100 | @property 101 | def batch_size(self): 102 | return self._batch_size 103 | 104 | @property 105 | def global_step(self): 106 | return self._global_step 107 | 108 | @property 109 | def X1_inputs(self): 110 | return self._X1_inputs 111 | 112 | @property 113 | def X2_inputs(self): 114 | return self._X2_inputs 115 | 116 | @property 117 | def y_inputs(self): 118 | return self._y_inputs 119 | 120 | @property 121 | def y_pred(self): 122 | return self._y_pred 123 
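    # Note: _y_pred holds the raw, unnormalised scores for all 1999 topics. The loss
    # defined in __init__ treats the task as 1999 independent binary labels (sigmoid
    # cross entropy), so no softmax is applied; the train scripts pick the five
    # highest-scoring topics per sample with label.argsort()[-1:-6:-1], i.e. argsort
    # ascending, then read the last five entries in reverse (descending) order.
    # For example, np.array([0.1, 0.9, 0.3, 0.8]).argsort()[-1:-6:-1] -> [1, 3, 2, 0].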
| 124 | @property 125 | def loss(self): 126 | return self._loss 127 | 128 | def weight_variable(self, shape, name): 129 | """Create a weight variable with appropriate initialization.""" 130 | initial = tf.truncated_normal(shape, stddev=0.1) 131 | return tf.Variable(initial, name=name) 132 | 133 | def bias_variable(self, shape, name): 134 | """Create a bias variable with appropriate initialization.""" 135 | initial = tf.constant(0.1, shape=shape) 136 | return tf.Variable(initial, name=name) 137 | 138 | def batchnorm(self, Ylogits, offset, convolutional=False): 139 | """batchnormalization. 140 | Args: 141 | Ylogits: 1D向量或者是3D的卷积结果。 142 | num_updates: 迭代的global_step 143 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 144 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 145 | m: 表示batch均值;v:表示batch方差。 146 | bnepsilon:一个很小的浮点数,防止除以 0. 147 | Returns: 148 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 149 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 150 | """ 151 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, 152 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations 153 | bnepsilon = 1e-5 154 | if convolutional: 155 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 156 | else: 157 | mean, variance = tf.nn.moments(Ylogits, [0]) 158 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 159 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 160 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 161 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 162 | return Ybn, update_moving_everages 163 | 164 | def gru_cell(self): 165 | with tf.name_scope('gru_cell'): 166 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse) 167 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 168 | 169 | def bi_gru(self, X_inputs): 170 | """build the bi-GRU network. Return the encoder represented vector. 171 | X_inputs: [batch_size, n_step] 172 | n_step: 句子的词数量;或者文档的句子数。 173 | outputs: [fw_state, embeddings, bw_state], shape=[batch_size, hidden_size+embedding_size+hidden_size] 174 | """ 175 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) # [batch_size, n_step, embedding_size] 176 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)] 177 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)] 178 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw] 179 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw] 180 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 181 | initial_states_fw = initial_states_fw, initial_states_bw = initial_states_bw, dtype=tf.float32) 182 | hidden_outputs = tf.concat([outputs, inputs], axis=2) 183 | return hidden_outputs # shape =[seg_num, n_steps, hidden_size*2+embedding_size] 184 | 185 | def textcnn(self, cnn_inputs, n_step): 186 | """build the TextCNN network. 
Return the h_drop""" 187 | # cnn_inputs.shape = [batchsize, n_step, hidden_size*2+embedding_size] 188 | inputs = tf.expand_dims(cnn_inputs, -1) 189 | pooled_outputs = list() 190 | for i, filter_size in enumerate(self.filter_sizes): 191 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 192 | # Convolution Layer 193 | filter_shape = [filter_size, self.hidden_size*2+self.embedding_size, 1, self.n_filter] 194 | W_filter = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_filter") 195 | beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.n_filter], name="beta")) 196 | tf.summary.histogram('beta', beta) 197 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 198 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) # 在激活层前面加 BN 199 | # Apply nonlinearity, batch norm scaling is not useful with relus 200 | h = tf.nn.relu(conv_bn, name="relu") 201 | # Maxpooling over the outputs 202 | pooled = tf.nn.max_pool(h,ksize=[1, n_step - filter_size + 1, 1, 1], 203 | strides=[1, 1, 1, 1],padding='VALID',name="pool") 204 | pooled_outputs.append(pooled) 205 | self.update_emas.append(update_ema) 206 | h_pool = tf.concat(pooled_outputs, 3) 207 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total]) 208 | return h_pool_flat # shape = [batch_size, n_filter_total] 209 | 210 | def rcnn_inference(self, X_inputs, n_step): 211 | output_bigru = self.bi_gru(X_inputs) 212 | output_cnn = self.textcnn(output_bigru, n_step) 213 | return output_cnn # shape = [batch_size, n_filter_total] 214 | 215 | 216 | # test the model 217 | def test(): 218 | import numpy as np 219 | print('Begin testing...') 220 | settings = Settings() 221 | W_embedding = np.random.randn(50, 10) 222 | config = tf.ConfigProto() 223 | config.gpu_options.allow_growth = True 224 | batch_size = 128 225 | with tf.Session(config=config) as sess: 226 | model = RCNN(W_embedding, settings) 227 | optimizer = tf.train.AdamOptimizer(0.001) 228 | train_op = optimizer.minimize(model.loss) 229 | update_op = tf.group(*model.update_emas) 230 | sess.run(tf.global_variables_initializer()) 231 | fetch = [model.loss, model.y_pred, train_op, update_op] 232 | loss_list = list() 233 | for i in xrange(100): 234 | X1_batch = np.zeros((batch_size, 30), dtype=float) 235 | X2_batch = np.zeros((batch_size, 150), dtype=float) 236 | y_batch = np.zeros((batch_size, 1999), dtype=int) 237 | _batch_size = len(y_batch) 238 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 239 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 240 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 241 | loss_list.append(loss) 242 | print(i, loss) 243 | 244 | 245 | if __name__ == '__main__': 246 | test() 247 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_2_hcnn/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | """wd_2_hcnn 6 | title 部分使用 TextCNN;content 部分使用分层的 TextCNN。 7 | """ 8 | 9 | 10 | class Settings(object): 11 | def __init__(self): 12 | self.model_name = 'wd_2_hcnn' 13 | self.title_len = self.sent_len = 30 14 | self.doc_len = 10 15 | self.sent_filter_sizes = [2, 3, 4, 5] 16 | self.doc_filter_sizes = [2, 3, 4] 17 | self.n_filter = 256 18 | self.fc_hidden_size = 1024 19 | self.n_class = 1999 20 | self.summary_path = '../../summary/' + self.model_name 
+ '/' 21 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 22 | 23 | 24 | class HCNN(object): 25 | """ 26 | title: inputs->textcnn->output_title 27 | content: inputs->hcnn->output_content 28 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 29 | """ 30 | 31 | def __init__(self, W_embedding, settings): 32 | self.model_name = settings.model_name 33 | self.sent_len = settings.sent_len 34 | self.doc_len = settings.doc_len 35 | self.sent_filter_sizes = settings.sent_filter_sizes 36 | self.doc_filter_sizes = settings.doc_filter_sizes 37 | self.n_filter = settings.n_filter 38 | self.n_class = settings.n_class 39 | self.fc_hidden_size = settings.fc_hidden_size 40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 41 | self.update_emas = list() 42 | # placeholders 43 | self._tst = tf.placeholder(tf.bool) 44 | self._keep_prob = tf.placeholder(tf.float32, []) 45 | self._batch_size = tf.placeholder(tf.int32, []) 46 | 47 | with tf.name_scope('Inputs'): 48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.sent_len], name='X1_inputs') 49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs') 50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 51 | 52 | with tf.variable_scope('embedding'): 53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 54 | initializer=tf.constant_initializer(W_embedding), trainable=True) 55 | self.embedding_size = W_embedding.shape[1] 56 | 57 | with tf.variable_scope('cnn_text'): 58 | output_title = self.cnn_inference(self._X1_inputs) 59 | 60 | with tf.variable_scope('hcnn_content'): 61 | output_content = self.hcnn_inference(self._X2_inputs) 62 | 63 | with tf.variable_scope('fc-bn-layer'): 64 | output = tf.concat([output_title, output_content], axis=1) 65 | output_size = self.n_filter * (len(self.sent_filter_sizes) + len(self.doc_filter_sizes)) 66 | W_fc = self.weight_variable([output_size, self.fc_hidden_size], name='Weight_fc') 67 | tf.summary.histogram('W_fc', W_fc) 68 | h_fc = tf.matmul(output, W_fc, name='h_fc') 69 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 70 | tf.summary.histogram('beta_fc', beta_fc) 71 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 72 | self.update_emas.append(update_ema_fc) 73 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 74 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 75 | 76 | with tf.variable_scope('out_layer'): 77 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 78 | tf.summary.histogram('Weight_out', W_out) 79 | b_out = self.bias_variable([self.n_class], name='bias_out') 80 | tf.summary.histogram('bias_out', b_out) 81 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 82 | 83 | with tf.name_scope('loss'): 84 | self._loss = tf.reduce_mean( 85 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 86 | tf.summary.scalar('loss', self._loss) 87 | 88 | self.saver = tf.train.Saver(max_to_keep=2) 89 | 90 | @property 91 | def tst(self): 92 | return self._tst 93 | 94 | @property 95 | def keep_prob(self): 96 | return self._keep_prob 97 | 98 | @property 99 | def batch_size(self): 100 | return self._batch_size 101 | 102 | @property 103 | def global_step(self): 104 | return self._global_step 105 | 106 | @property 107 | def X1_inputs(self): 108 | return self._X1_inputs 109 | 110 | 
@property 111 | def X2_inputs(self): 112 | return self._X2_inputs 113 | 114 | @property 115 | def y_inputs(self): 116 | return self._y_inputs 117 | 118 | @property 119 | def y_pred(self): 120 | return self._y_pred 121 | 122 | @property 123 | def loss(self): 124 | return self._loss 125 | 126 | def weight_variable(self, shape, name): 127 | """Create a weight variable with appropriate initialization.""" 128 | initial = tf.truncated_normal(shape, stddev=0.1) 129 | return tf.Variable(initial, name=name) 130 | 131 | def bias_variable(self, shape, name): 132 | """Create a bias variable with appropriate initialization.""" 133 | initial = tf.constant(0.1, shape=shape) 134 | return tf.Variable(initial, name=name) 135 | 136 | def batchnorm(self, Ylogits, offset, convolutional=False): 137 | """batchnormalization. 138 | Args: 139 | Ylogits: 1D向量或者是3D的卷积结果。 140 | num_updates: 迭代的global_step 141 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 142 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 143 | m: 表示batch均值;v:表示batch方差。 144 | bnepsilon:一个很小的浮点数,防止除以 0. 145 | Returns: 146 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 147 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 148 | """ 149 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, 150 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations 151 | bnepsilon = 1e-5 152 | if convolutional: 153 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 154 | else: 155 | mean, variance = tf.nn.moments(Ylogits, [0]) 156 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 157 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 158 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 159 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 160 | return Ybn, update_moving_everages 161 | 162 | def textcnn(self, X_inputs, n_step, filter_sizes, embed_size): 163 | """build the TextCNN network. 
164 | n_step: the sentence len.""" 165 | inputs = tf.expand_dims(X_inputs, -1) 166 | pooled_outputs = list() 167 | for i, filter_size in enumerate(filter_sizes): 168 | with tf.name_scope("conv-maxpool-%s" % filter_size): 169 | # Convolution Layer 170 | filter_shape = [filter_size, embed_size, 1, self.n_filter] 171 | W_filter = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_filter") 172 | beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.n_filter], name="beta")) 173 | tf.summary.histogram('beta', beta) 174 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 175 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) # 在激活层前面加 BN 176 | # Apply nonlinearity, batch norm scaling is not useful with relus 177 | # batch norm offsets are used instead of biases,使用 BN 层的 offset,不要 biases 178 | h = tf.nn.relu(conv_bn, name="relu") 179 | # Maxpooling over the outputs 180 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 181 | strides=[1, 1, 1, 1], padding='VALID', name="pool") 182 | pooled_outputs.append(pooled) 183 | self.update_emas.append(update_ema) 184 | h_pool = tf.concat(pooled_outputs, 3) 185 | n_filter_total = self.n_filter * len(filter_sizes) 186 | h_pool_flat = tf.reshape(h_pool, [-1, n_filter_total]) 187 | return h_pool_flat # shape = [-1, n_filter_total] 188 | 189 | def cnn_inference(self, X_inputs): 190 | """TextCNN 模型。title部分。 191 | Args: 192 | X_inputs: tensor.shape=(batch_size, title_len) 193 | Returns: 194 | title_outputs: tensor.shape=(batch_size, n_filter*filter_num_sent) 195 | """ 196 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 197 | with tf.variable_scope('title_encoder'): # 生成 title 的向量表示 198 | title_outputs = self.textcnn(inputs, self.sent_len, self.sent_filter_sizes, embed_size=self.embedding_size) 199 | return title_outputs # shape = [batch_size, n_filter*filter_num_sent] 200 | 201 | def hcnn_inference(self, X_inputs): 202 | """分层 TextCNN 模型。content部分。 203 | Args: 204 | X_inputs: tensor.shape=(batch_size, doc_len*sent_len) 205 | Returns: 206 | doc_attn_outputs: tensor.shape=(batch_size, n_filter*filter_num_doc) 207 | """ 208 | inputs = tf.nn.embedding_lookup(self.embedding, 209 | X_inputs) # inputs.shape=[batch_size, doc_len*sent_len, embedding_size] 210 | sent_inputs = tf.reshape(inputs, [self.batch_size * self.doc_len, self.sent_len, 211 | self.embedding_size]) # [batch_size*doc_len, sent_len, embedding_size] 212 | with tf.variable_scope('sentence_encoder'): # 生成句向量 213 | sent_outputs = self.textcnn(sent_inputs, self.sent_len, self.sent_filter_sizes, self.embedding_size) 214 | with tf.variable_scope('doc_encoder'): # 生成文档向量 215 | doc_inputs = tf.reshape(sent_outputs, [self.batch_size, self.doc_len, self.n_filter * len( 216 | self.sent_filter_sizes)]) # [batch_size, doc_len, n_filter*len(filter_sizes_sent)] 217 | doc_outputs = self.textcnn(doc_inputs, self.doc_len, self.doc_filter_sizes, self.n_filter * len( 218 | self.sent_filter_sizes)) # [batch_size, doc_len, n_filter*filter_num_doc] 219 | return doc_outputs # [batch_size, n_filter*len(doc_filter_sizes)] 220 | 221 | # test the model 222 | # def test(): 223 | # import numpy as np 224 | # print('Begin testing...') 225 | # settings = Settings() 226 | # W_embedding = np.random.randn(50, 10) 227 | # config = tf.ConfigProto() 228 | # config.gpu_options.allow_growth = True 229 | # batch_size = 128 230 | # with tf.Session(config=config) as sess: 231 | # model = HCNN(W_embedding, settings) 232 | # optimizer = 
tf.train.AdamOptimizer(0.001) 233 | # train_op = optimizer.minimize(model.loss) 234 | # update_op = tf.group(*model.update_emas) 235 | # sess.run(tf.global_variables_initializer()) 236 | # fetch = [model.loss, model.y_pred, train_op, update_op] 237 | # loss_list = list() 238 | # for i in xrange(100): 239 | # X1_batch = np.zeros((batch_size, 30), dtype=float) 240 | # X2_batch = np.zeros((batch_size, 10 * 30), dtype=float) 241 | # y_batch = np.zeros((batch_size, 1999), dtype=int) 242 | # _batch_size = len(y_batch) 243 | # feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 244 | # model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 245 | # loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 246 | # loss_list.append(loss) 247 | # print(i, loss) 248 | 249 | # test() 250 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_4_han/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import tensorflow.contrib.layers as layers 6 | 7 | """wd_4_han 8 | title 部分使用 bigru+attention;content 部分使用 han; 两部分输出直接 concat。 9 | """ 10 | 11 | 12 | class Settings(object): 13 | def __init__(self): 14 | self.model_name = 'wd_4_han' 15 | self.title_len = self.sent_len = 30 16 | self.doc_len = 10 17 | self.hidden_size = 256 18 | self.n_layer = 1 19 | self.fc_hidden_size = 1024 20 | self.n_class = 1999 21 | self.summary_path = '../../summary/' + self.model_name + '/' 22 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 23 | 24 | 25 | class HAN(object): 26 | """ 27 | title: inputs->bigru+attention->output_title 28 | content: inputs->sent_encoder(bigru+attention)->doc_encoder(bigru+attention)->output_content 29 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 
30 | """ 31 | 32 | def __init__(self, W_embedding, settings): 33 | self.model_name = settings.model_name 34 | self.title_len = self.sent_len = settings.sent_len 35 | self.doc_len = settings.doc_len 36 | self.hidden_size = settings.hidden_size 37 | self.n_layer = settings.n_layer 38 | self.n_class = settings.n_class 39 | self.fc_hidden_size = settings.fc_hidden_size 40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 41 | self.update_emas = list() 42 | # placeholders 43 | self._tst = tf.placeholder(tf.bool) 44 | self._keep_prob = tf.placeholder(tf.float32, []) 45 | self._batch_size = tf.placeholder(tf.int32, []) 46 | 47 | with tf.name_scope('Inputs'): 48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs') 50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 51 | 52 | with tf.variable_scope('embedding'): 53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 54 | initializer=tf.constant_initializer(W_embedding), trainable=True) 55 | self.embedding_size = W_embedding.shape[1] 56 | 57 | with tf.variable_scope('bigru_text'): 58 | output_title = self.bigru_inference(self._X1_inputs) 59 | 60 | with tf.variable_scope('han_content'): 61 | output_content = self.han_inference(self._X2_inputs) 62 | 63 | with tf.variable_scope('fc-bn-layer'): 64 | output = tf.concat([output_title, output_content], axis=1) 65 | W_fc = self.weight_variable([self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc') 66 | tf.summary.histogram('W_fc', W_fc) 67 | h_fc = tf.matmul(output, W_fc, name='h_fc') 68 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 69 | tf.summary.histogram('beta_fc', beta_fc) 70 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 71 | self.update_emas.append(update_ema_fc) 72 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 73 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 74 | 75 | with tf.variable_scope('out_layer'): 76 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 77 | tf.summary.histogram('Weight_out', W_out) 78 | b_out = self.bias_variable([self.n_class], name='bias_out') 79 | tf.summary.histogram('bias_out', b_out) 80 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 81 | 82 | with tf.name_scope('loss'): 83 | self._loss = tf.reduce_mean( 84 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 85 | tf.summary.scalar('loss', self._loss) 86 | 87 | self.saver = tf.train.Saver(max_to_keep=1) 88 | 89 | @property 90 | def tst(self): 91 | return self._tst 92 | 93 | @property 94 | def keep_prob(self): 95 | return self._keep_prob 96 | 97 | @property 98 | def batch_size(self): 99 | return self._batch_size 100 | 101 | @property 102 | def global_step(self): 103 | return self._global_step 104 | 105 | @property 106 | def X1_inputs(self): 107 | return self._X1_inputs 108 | 109 | @property 110 | def X2_inputs(self): 111 | return self._X2_inputs 112 | 113 | @property 114 | def y_inputs(self): 115 | return self._y_inputs 116 | 117 | @property 118 | def y_pred(self): 119 | return self._y_pred 120 | 121 | @property 122 | def loss(self): 123 | return self._loss 124 | 125 | def weight_variable(self, shape, name): 126 | """Create a weight variable with appropriate initialization.""" 127 | 
initial = tf.truncated_normal(shape, stddev=0.1) 128 | return tf.Variable(initial, name=name) 129 | 130 | def bias_variable(self, shape, name): 131 | """Create a bias variable with appropriate initialization.""" 132 | initial = tf.constant(0.1, shape=shape) 133 | return tf.Variable(initial, name=name) 134 | 135 | def batchnorm(self, Ylogits, offset, convolutional=False): 136 | """batchnormalization. 137 | Args: 138 | Ylogits: 1D向量或者是3D的卷积结果。 139 | num_updates: 迭代的global_step 140 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 141 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 142 | m: 表示batch均值;v:表示batch方差。 143 | bnepsilon:一个很小的浮点数,防止除以 0. 144 | Returns: 145 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 146 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 147 | """ 148 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations 149 | bnepsilon = 1e-5 150 | if convolutional: 151 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 152 | else: 153 | mean, variance = tf.nn.moments(Ylogits, [0]) 154 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 155 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 156 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 157 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 158 | return Ybn, update_moving_everages 159 | 160 | def gru_cell(self): 161 | with tf.name_scope('gru_cell'): 162 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse) 163 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 164 | 165 | def bi_gru(self, inputs, seg_num): 166 | """build the bi-GRU network. Return the encoder represented vector. 167 | n_step: 句子的词数量;或者文档的句子数。 168 | seg_num: 序列的数量,原本应该为 batch_size, 但是这里将 batch_size 个 doc展开成很多个句子。 169 | """ 170 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)] 171 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)] 172 | initial_states_fw = [cell_fw.zero_state(seg_num, tf.float32) for cell_fw in cells_fw] 173 | initial_states_bw = [cell_bw.zero_state(seg_num, tf.float32) for cell_bw in cells_bw] 174 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 175 | initial_states_fw = initial_states_fw, initial_states_bw = initial_states_bw, dtype=tf.float32) 176 | # outputs: Output Tensor shaped: seg_num, max_time, layers_output],其中layers_output=hidden_size * 2 在这里。 177 | return outputs 178 | 179 | def task_specific_attention(self, inputs, output_size, 180 | initializer=layers.xavier_initializer(), 181 | activation_fn=tf.tanh, scope=None): 182 | """ 183 | Performs task-specific attention reduction, using learned 184 | attention context vector (constant within task of interest). 185 | Args: 186 | inputs: Tensor of shape [batch_size, units, input_size] 187 | `input_size` must be static (known) 188 | `units` axis will be attended over (reduced from output) 189 | `batch_size` will be preserved 190 | output_size: Size of output's inner (feature) dimension 191 | Returns: 192 | outputs: Tensor of shape [batch_size, output_dim]. 
193 | """ 194 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None 195 | with tf.variable_scope(scope or 'attention') as scope: 196 | # u_w, attention 向量 197 | attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size], 198 | initializer=initializer, dtype=tf.float32) 199 | # 全连接层,把 h_i 转为 u_i , shape= [batch_size, units, input_size] -> [batch_size, units, output_size] 200 | input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope) 201 | # 输出 [batch_size, units] 202 | vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True) 203 | attention_weights = tf.nn.softmax(vector_attn, dim=1) 204 | tf.summary.histogram('attention_weigths', attention_weights) 205 | weighted_projection = tf.multiply(inputs, attention_weights) 206 | outputs = tf.reduce_sum(weighted_projection, axis=1) 207 | return outputs # 输出 [batch_size, hidden_size*2] 208 | 209 | def bigru_inference(self, X_inputs): 210 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 211 | output_bigru = self.bi_gru(inputs, self.batch_size) 212 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2) 213 | return output_att # 输出 [batch_size, hidden_size*2] 214 | 215 | def han_inference(self, X_inputs): 216 | """分层 attention 模型。content部分。 217 | Args: 218 | X_inputs: tensor.shape=(batch_size, doc_len*sent_len) 219 | Returns: 220 | doc_attn_outputs: tensor.shape=(batch_size, hidden_size(*2 for bigru)) 221 | """ 222 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) # inputs.shape=[batch_size, doc_len*sent_len, embedding_size] 223 | sent_inputs = tf.reshape(inputs,[self.batch_size*self.doc_len, self.sent_len, self.embedding_size]) # shape=(?, 40, 256) 224 | with tf.variable_scope('sentence_encoder'): # 生成句向量 225 | sent_outputs = self.bi_gru(sent_inputs, seg_num=self.batch_size*self.doc_len) 226 | sent_attn_outputs = self.task_specific_attention(sent_outputs, self.hidden_size*2) # [batch_size*doc_len, hidden_size*2] 227 | with tf.variable_scope('dropout'): 228 | sent_attn_outputs = tf.nn.dropout(sent_attn_outputs, self.keep_prob) 229 | with tf.variable_scope('doc_encoder'): # 生成文档向量 230 | doc_inputs = tf.reshape(sent_attn_outputs, [self.batch_size, self.doc_len, self.hidden_size*2]) 231 | doc_outputs = self.bi_gru(doc_inputs, self.batch_size) # [batch_size, doc_len, hidden_size*2] 232 | doc_attn_outputs = self.task_specific_attention(doc_outputs, self.hidden_size*2) # [batch_size, hidden_size*2] 233 | return doc_attn_outputs # [batch_size, hidden_size*2] 234 | 235 | 236 | 237 | # test the model 238 | def test(): 239 | import numpy as np 240 | print('Begin testing...') 241 | settings = Settings() 242 | W_embedding = np.random.randn(50, 10) 243 | config = tf.ConfigProto() 244 | config.gpu_options.allow_growth = True 245 | batch_size = 128 246 | with tf.Session(config=config) as sess: 247 | model = HAN(W_embedding, settings) 248 | optimizer = tf.train.AdamOptimizer(0.001) 249 | train_op = optimizer.minimize(model.loss) 250 | update_op = tf.group(*model.update_emas) 251 | sess.run(tf.global_variables_initializer()) 252 | fetch = [model.loss, model.y_pred, train_op, update_op] 253 | loss_list = list() 254 | for i in xrange(100): 255 | X1_batch = np.zeros((batch_size, 30), dtype=float) 256 | X2_batch = np.zeros((batch_size, 10 * 30), dtype=float) 257 | y_batch = np.zeros((batch_size, 1999), dtype=int) 258 | _batch_size = len(y_batch) 259 | feed_dict = 
{model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 260 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 261 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 262 | loss_list.append(loss) 263 | print(i, loss) 264 | 265 | if __name__ == '__main__': 266 | test() 267 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_5_bigru_cnn/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import tensorflow.contrib.layers as layers 6 | 7 | """wd_5_bigru_cnn 8 | 两部分使用不同的 embedding, 因为RNN与CNN结构完全不同,共用embedding会降低性能。 9 | title 部分使用 bigru+attention;content 部分使用 textcnn; 两部分输出直接 concat。 10 | """ 11 | 12 | 13 | class Settings(object): 14 | def __init__(self): 15 | self.model_name = 'wd_5_bigru_cnn' 16 | self.title_len = 30 17 | self.content_len = 150 18 | self.hidden_size = 256 19 | self.n_layer = 1 20 | self.filter_sizes = [2, 3, 4, 5, 7] 21 | self.n_filter = 256 22 | self.fc_hidden_size = 1024 23 | self.n_class = 1999 24 | self.summary_path = '../../summary/' + self.model_name + '/' 25 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 26 | 27 | 28 | class BiGRU_CNN(object): 29 | """ 30 | title: inputs->bigru+attention->output_title 31 | content: inputs->textcnn->output_content 32 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 33 | """ 34 | 35 | def __init__(self, W_embedding, settings): 36 | self.model_name = settings.model_name 37 | self.title_len = settings.title_len 38 | self.content_len = settings.content_len 39 | self.hidden_size = settings.hidden_size 40 | self.n_layer = settings.n_layer 41 | self.filter_sizes = settings.filter_sizes 42 | self.n_filter = settings.n_filter 43 | self.n_filter_total = self.n_filter * len(self.filter_sizes) 44 | self.n_class = settings.n_class 45 | self.fc_hidden_size = settings.fc_hidden_size 46 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 47 | self.update_emas = list() 48 | # placeholders 49 | self._tst = tf.placeholder(tf.bool) 50 | self._keep_prob = tf.placeholder(tf.float32, []) 51 | self._batch_size = tf.placeholder(tf.int32, []) 52 | 53 | with tf.name_scope('Inputs'): 54 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 55 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 56 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 57 | 58 | with tf.variable_scope('embedding'): 59 | self.title_embedding = tf.get_variable(name='title_embedding', shape=W_embedding.shape, 60 | initializer=tf.constant_initializer(W_embedding), trainable=True) 61 | self.content_embedding = tf.get_variable(name='content_embedding', shape=W_embedding.shape, 62 | initializer=tf.constant_initializer(W_embedding), trainable=True) 63 | self.embedding_size = W_embedding.shape[1] 64 | 65 | with tf.variable_scope('bigru_text'): 66 | output_title = self.bigru_inference(self._X1_inputs) 67 | 68 | with tf.variable_scope('cnn_content'): 69 | output_content = self.cnn_inference(self._X2_inputs, self.content_len) 70 | 71 | with tf.variable_scope('fc-bn-layer'): 72 | output = tf.concat([output_title, output_content], axis=1) 73 | W_fc = self.weight_variable([self.hidden_size*2 + self.n_filter_total, self.fc_hidden_size], name='Weight_fc') 74 | 
tf.summary.histogram('W_fc', W_fc) 75 | h_fc = tf.matmul(output, W_fc, name='h_fc') 76 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 77 | tf.summary.histogram('beta_fc', beta_fc) 78 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 79 | self.update_emas.append(update_ema_fc) 80 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 81 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 82 | 83 | with tf.variable_scope('out_layer'): 84 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 85 | tf.summary.histogram('Weight_out', W_out) 86 | b_out = self.bias_variable([self.n_class], name='bias_out') 87 | tf.summary.histogram('bias_out', b_out) 88 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 89 | 90 | with tf.name_scope('loss'): 91 | self._loss = tf.reduce_mean( 92 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 93 | tf.summary.scalar('loss', self._loss) 94 | 95 | self.saver = tf.train.Saver(max_to_keep=1) 96 | 97 | @property 98 | def tst(self): 99 | return self._tst 100 | 101 | @property 102 | def keep_prob(self): 103 | return self._keep_prob 104 | 105 | @property 106 | def batch_size(self): 107 | return self._batch_size 108 | 109 | @property 110 | def global_step(self): 111 | return self._global_step 112 | 113 | @property 114 | def X1_inputs(self): 115 | return self._X1_inputs 116 | 117 | @property 118 | def X2_inputs(self): 119 | return self._X2_inputs 120 | 121 | @property 122 | def y_inputs(self): 123 | return self._y_inputs 124 | 125 | @property 126 | def y_pred(self): 127 | return self._y_pred 128 | 129 | @property 130 | def loss(self): 131 | return self._loss 132 | 133 | def weight_variable(self, shape, name): 134 | """Create a weight variable with appropriate initialization.""" 135 | initial = tf.truncated_normal(shape, stddev=0.1) 136 | return tf.Variable(initial, name=name) 137 | 138 | def bias_variable(self, shape, name): 139 | """Create a bias variable with appropriate initialization.""" 140 | initial = tf.constant(0.1, shape=shape) 141 | return tf.Variable(initial, name=name) 142 | 143 | def batchnorm(self, Ylogits, offset, convolutional=False): 144 | """batchnormalization. 145 | Args: 146 | Ylogits: 1D向量或者是3D的卷积结果。 147 | num_updates: 迭代的global_step 148 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 149 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 150 | m: 表示batch均值;v:表示batch方差。 151 | bnepsilon:一个很小的浮点数,防止除以 0. 
152 | Returns: 153 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 154 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 155 | """ 156 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations 157 | bnepsilon = 1e-5 158 | if convolutional: 159 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 160 | else: 161 | mean, variance = tf.nn.moments(Ylogits, [0]) 162 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 163 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 164 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 165 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 166 | return Ybn, update_moving_everages 167 | 168 | def gru_cell(self): 169 | with tf.name_scope('gru_cell'): 170 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse) 171 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 172 | 173 | def bi_gru(self, inputs): 174 | """build the bi-GRU network. 返回个所有层的隐含状态。""" 175 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)] 176 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)] 177 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw] 178 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw] 179 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 180 | initial_states_fw=initial_states_fw, 181 | initial_states_bw=initial_states_bw, dtype=tf.float32) 182 | return outputs 183 | 184 | def task_specific_attention(self, inputs, output_size, 185 | initializer=layers.xavier_initializer(), 186 | activation_fn=tf.tanh, scope=None): 187 | """ 188 | Performs task-specific attention reduction, using learned 189 | attention context vector (constant within task of interest). 190 | Args: 191 | inputs: Tensor of shape [batch_size, units, input_size] 192 | `input_size` must be static (known) 193 | `units` axis will be attended over (reduced from output) 194 | `batch_size` will be preserved 195 | output_size: Size of output's inner (feature) dimension 196 | Returns: 197 | outputs: Tensor of shape [batch_size, output_dim]. 
198 | """ 199 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None 200 | with tf.variable_scope(scope or 'attention') as scope: 201 | # u_w, attention 向量 202 | attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size], 203 | initializer=initializer, dtype=tf.float32) 204 | # 全连接层,把 h_i 转为 u_i , shape= [batch_size, units, input_size] -> [batch_size, units, output_size] 205 | input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope) 206 | # 输出 [batch_size, units] 207 | vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True) 208 | attention_weights = tf.nn.softmax(vector_attn, dim=1) 209 | tf.summary.histogram('attention_weigths', attention_weights) 210 | weighted_projection = tf.multiply(inputs, attention_weights) 211 | outputs = tf.reduce_sum(weighted_projection, axis=1) 212 | return outputs # 输出 [batch_size, hidden_size*2] 213 | 214 | def bigru_inference(self, X_inputs): 215 | inputs = tf.nn.embedding_lookup(self.title_embedding, X_inputs) 216 | output_bigru = self.bi_gru(inputs) 217 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2) 218 | return output_att 219 | 220 | def cnn_inference(self, X_inputs, n_step): 221 | """TextCNN 模型。 222 | Args: 223 | X_inputs: tensor.shape=(batch_size, n_step) 224 | Returns: 225 | title_outputs: tensor.shape=(batch_size, self.n_filter_total) 226 | """ 227 | inputs = tf.nn.embedding_lookup(self.content_embedding, X_inputs) 228 | inputs = tf.expand_dims(inputs, -1) 229 | pooled_outputs = list() 230 | for i, filter_size in enumerate(self.filter_sizes): 231 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 232 | # Convolution Layer 233 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter] 234 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter') 235 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter') 236 | tf.summary.histogram('beta', beta) 237 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 238 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) 239 | # Apply nonlinearity, batch norm scaling is not useful with relus 240 | h = tf.nn.relu(conv_bn, name="relu") 241 | # Maxpooling over the outputs 242 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 243 | strides=[1, 1, 1, 1], padding='VALID', name="pool") 244 | pooled_outputs.append(pooled) 245 | self.update_emas.append(update_ema) 246 | h_pool = tf.concat(pooled_outputs, 3) 247 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total]) 248 | return h_pool_flat # shape = [batch_size, self.n_filter_total] 249 | 250 | 251 | # test the model 252 | def test(): 253 | import numpy as np 254 | print('Begin testing...') 255 | settings = Settings() 256 | W_embedding = np.random.randn(50, 10) 257 | config = tf.ConfigProto() 258 | config.gpu_options.allow_growth = True 259 | batch_size = 128 260 | with tf.Session(config=config) as sess: 261 | model = BiGRU_CNN(W_embedding, settings) 262 | optimizer = tf.train.AdamOptimizer(0.001) 263 | train_op = optimizer.minimize(model.loss) 264 | update_op = tf.group(*model.update_emas) 265 | sess.run(tf.global_variables_initializer()) 266 | fetch = [model.loss, model.y_pred, train_op, update_op] 267 | loss_list = list() 268 | for i in xrange(100): 269 | X1_batch = np.zeros((batch_size, 30), dtype=float) 270 | X2_batch = np.zeros((batch_size, 150), 
dtype=float)
271 |             y_batch = np.zeros((batch_size, 1999), dtype=int)
272 |             _batch_size = len(y_batch)
273 |             feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
274 |                          model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
275 |             loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
276 |             loss_list.append(loss)
277 |             print(i, loss)
278 | 
279 | if __name__ == '__main__':
280 |     test()
281 | 
--------------------------------------------------------------------------------
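The attention reduction shared by wd_4_han and wd_5_bigru_cnn (task_specific_attention) collapses bi-GRU outputs of shape [batch_size, n_steps, hidden_size*2] into a single [batch_size, hidden_size*2] vector: each timestep is projected through a fully connected tanh layer, scored against a learned context vector, the scores are softmax-normalized over the timestep axis, and the original outputs are summed with those weights. Below is a minimal NumPy sketch of that computation only; the shapes, random weights, and the small softmax helper are illustrative stand-ins, not code from the repository.

# -*- coding:utf-8 -*-
# Standalone NumPy illustration of the attention reduction performed by
# task_specific_attention(); shapes and weights here are illustrative only.
import numpy as np


def softmax(x, axis):
    # numerically stable softmax along the given axis (helper for this sketch)
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)


batch_size, n_steps, hidden = 4, 30, 512                    # hidden = hidden_size*2 for a bi-GRU
rng = np.random.RandomState(0)
inputs = rng.randn(batch_size, n_steps, hidden)             # stand-in for the bi-GRU outputs h_i

W_proj = 0.1 * rng.randn(hidden, hidden)                    # fully connected projection h_i -> u_i
context = 0.1 * rng.randn(hidden)                           # learned attention context vector u_w

u = np.tanh(np.matmul(inputs, W_proj))                      # [batch_size, n_steps, hidden]
scores = (u * context).sum(axis=2, keepdims=True)           # [batch_size, n_steps, 1]
alpha = softmax(scores, axis=1)                             # attention weights over timesteps
output = (inputs * alpha).sum(axis=1)                       # [batch_size, hidden]
print(output.shape)                                         # (4, 512)

In han_inference this reduction is applied twice: once over the sent_len word steps of every sentence (with seg_num = batch_size * doc_len sequences) and once over the doc_len sentence vectors, which is how a [batch_size, doc_len*sent_len] block of word ids becomes one document vector.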