└── zhihu-text-classification-master
    ├── data_process
    │   ├── .idea
    │   │   ├── .name
    │   │   ├── encodings.xml
    │   │   ├── modules.xml
    │   │   ├── deployment.xml
    │   │   ├── data_process.iml
    │   │   └── misc.xml
    │   ├── test.py
    │   ├── run_all_data_process.sh
    │   ├── question_and_topic_2id.py
    │   ├── README.md
    │   ├── embed2ndarray.py
    │   ├── word2id.py
    │   ├── char2id.py
    │   ├── creat_batch_seg.py
    │   └── creat_batch_data.py
    ├── models
    │   ├── wd_4_han
    │   │   ├── __init__.py
    │   │   ├── predict.py
    │   │   ├── train.py
    │   │   └── network.py
    │   ├── wd_2_hcnn
    │   │   ├── __init__.py
    │   │   ├── predict.py
    │   │   ├── train.py
    │   │   └── network.py
    │   ├── wd_3_bigru
    │   │   ├── __init__.py
    │   │   ├── predict.py
    │   │   ├── train.py
    │   │   └── network.py
    │   ├── wd_6_rcnn
    │   │   ├── __init__.py
    │   │   ├── predict.py
    │   │   ├── train.py
    │   │   └── network.py
    │   ├── wd_1_1_cnn_concat
    │   │   ├── __init__.py
    │   │   ├── predict.py
    │   │   ├── train.py
    │   │   └── network.py
    │   ├── wd_1_2_cnn_max
    │   │   ├── __init__.py
    │   │   ├── predict.py
    │   │   ├── network.py
    │   │   └── train.py
    │   └── wd_5_bigru_cnn
    │       ├── __init__.py
    │       ├── predict.py
    │       ├── train.py
    │       └── network.py
    └── ReadMe.md
/zhihu-text-classification-master/data_process/.idea/.name:
--------------------------------------------------------------------------------
1 | data_process
--------------------------------------------------------------------------------
/ReadMe.md:
--------------------------------------------------------------------------------
1 | # Competition List
2 | + [2017 Zhihu Kanshan Cup Machine Learning Challenge](https://www.biendata.com/competition/zhihu/)
3 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_4_han/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_2_hcnn/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_3_bigru/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_6_rcnn/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_1_1_cnn_concat/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_1_2_cnn_max/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_5_bigru_cnn/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/data_process/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 |
4 | from multiprocessing import Pool
5 |
6 |
7 | def func(pair):
8 |     # Pool.map passes each zipped (a, b) pair as a single argument, so unpack it here
9 |     a, b = pair
10 |     return a + b
11 |
12 |
13 | if __name__ == '__main__':
14 |     p = Pool()
15 |     a = [1, 2, 3]
16 |     b = [4, 5, 6]
17 |     para = zip(a, b)
18 |     result = p.map(func, para)  # expected: [5, 7, 9]
19 |     p.close()
20 |     p.join()
21 |     print result
--------------------------------------------------------------------------------
/zhihu-text-classification-master/data_process/run_all_data_process.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | echo -e "\033[44;37;5m RUNNING embed2ndarray.py\033[0m ";
3 | python embed2ndarray.py;
4 | echo -e "\033[44;37;5m RUNNING question_and_topic_2id.py\033[0m ";
5 | python question_and_topic_2id.py;
6 | echo -e "\033[44;37;5m RUNNING char2id.py\033[0m ";
7 | python char2id.py;
8 | echo -e "\033[44;37;5m RUNNING word2id.py\033[0m ";
9 | python word2id.py;
10 | echo -e "\033[44;37;5m RUNNING creat_batch_data.py\033[0m ";
11 | python creat_batch_data.py;
12 | echo -e "\033[44;37;5m RUNNING creat_batch_seg.py\033[0m ";
13 | python creat_batch_seg.py;
--------------------------------------------------------------------------------
/zhihu-text-classification-master/data_process/question_and_topic_2id.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import pandas as pd
4 | import pickle
5 | from itertools import chain
6 |
7 |
8 | def question_and_topic_2id():
9 | """Convert questions and topics to id form and save them under ../data/."""
10 | print('Converting questions and topics to ids; saving sr_question2id.pkl and sr_topic2id.pkl in ../data/')
11 | df_question_topic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\t', names=['question', 'topics'],
12 | dtype={'question': object, 'topics': object})
13 | df_question_topic.topics = df_question_topic.topics.apply(lambda tps: tps.split(','))
14 | save_path = '../data/'
15 | print('question number = %d' % len(df_question_topic))
16 | # question ids are assigned in the order the questions appear
17 | questions = df_question_topic.question.values
18 | sr_question2id = pd.Series(range(len(questions)), index=questions)
19 | sr_id2question = pd.Series(questions, index=range(len(questions)))
20 | # topics are numbered by frequency, from most to least frequent
21 | topics = df_question_topic.topics.values
22 | topics = list(chain(*topics))
23 | sr_topics = pd.Series(topics)
24 | topics_count = sr_topics.value_counts()
25 | topics = topics_count.index
26 | sr_topic2id = pd.Series(range(len(topics)), index=topics)
27 | sr_id2topic = pd.Series(topics, index=range(len(topics)))
28 |
29 | with open(save_path + 'sr_question2id.pkl', 'wb') as outp:
30 | pickle.dump(sr_question2id, outp)
31 | pickle.dump(sr_id2question, outp)
32 | with open(save_path + 'sr_topic2id.pkl', 'wb') as outp:
33 | pickle.dump(sr_topic2id, outp)
34 | pickle.dump(sr_id2topic, outp)
35 | print('Finished changing.')
36 |
37 |
38 | if __name__ == '__main__':
39 | question_and_topic_2id()
40 |
--------------------------------------------------------------------------------
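Both pickle files written above contain two pd.Series objects dumped back to back, so they have to be read back with two consecutive pickle.load calls in the same order. Below is only a minimal sketch of consuming sr_topic2id.pkl, for example to turn predicted label ids back into topic strings; the id list is made up for illustration:

```python
# -*- coding:utf-8 -*-
import pickle

with open('../data/sr_topic2id.pkl', 'rb') as inp:
    sr_topic2id = pickle.load(inp)  # topic string -> id (0 is the most frequent topic)
    sr_id2topic = pickle.load(inp)  # id -> topic string

predicted_ids = [3, 1, 5, 4, 0]  # hypothetical top-5 label ids from a model
predicted_topics = [sr_id2topic[i] for i in predicted_ids]
print(predicted_topics)
```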
/zhihu-text-classification-master/data_process/README.md:
--------------------------------------------------------------------------------
1 | ## Data Processing
2 |
3 | 1. Unpack all of the data provided by the competition into the raw_data/ directory.
4 | 2. Run each .py script in order, without any arguments.
5 | Or run them all from the current directory with the commands below:
6 | dos2unix run_all_data_process.sh # use the cygwin tool dos2unix to convert the script to unix format
7 | sh run_all_data_process.sh
8 | 3. Dependencies (the versions I used are listed below):
9 | - numpy 1.12.1
10 | - pandas 0.19.2
11 | - word2vec 0.9.1
12 | - tqdm 4.11.2
13 |
14 |
15 | ### embed2ndarray.py
16 | The organizers provide the word and character vectors in txt format. This script converts the embedding matrices into np.ndarray form and saves them as data/word_embedding.npy and data/char_embedding.npy. On top of the provided vectors, two special symbols, '<PAD>' and '<UNK>', are added: '<PAD>' is used to pad sequences to a fixed length, and '<UNK>' replaces low-frequency words (characters).
17 | A pd.Series stores the row number (id) of each word (character) in the embedding matrix, saved in data/sr_word2id.pkl and data/sr_char2id.pkl.
18 |
19 | ### question_and_topic_2id.py
20 | Converts the questions and topics to id form, saved in data/sr_question2id.pkl and data/sr_topic2id.pkl.
21 |
22 | ### char2id.py
23 | Uses the sr_char2id obtained above to convert the characters of every question into their ids, stored as
24 | data/ch_train_title.npy
25 | data/ch_train_content.npy
26 | data/ch_eval_title.npy
27 | data/ch_eval_content.npy
28 |
29 | ### word2id.py
30 | Same as char2id.py, but for words.
31 |
32 | ### creat_batch_data.py
33 | Packs all of the data into batches of batch_size (128); with a fixed seed, 100,000 samples are drawn at random as the validation set. Each batch is stored as one npz file containing the two parts X and y. (A short sketch of this packing scheme follows this file.)
34 | All sequences are truncated here, and sequences that are too short are padded with 0 to the fixed length.
35 | Output locations:
36 | wd_train_path = '../data/wd-data/data_train/'
37 | wd_valid_path = '../data/wd-data/data_valid/'
38 | wd_test_path = '../data/wd-data/data_test/'
39 | ch_train_path = '../data/ch-data/data_train/'
40 | ch_valid_path = '../data/ch-data/data_valid/'
41 | ch_test_path = '../data/ch-data/data_test/'
42 |
43 |
44 | ### creat_batch_seg.py
45 | Same as creat_batch_data.py, except that the content part is split into sentences. Used for the hierarchical models.
46 | With sentence splitting:
47 | wd_title_len = 30, wd_sent_len = 30, wd_doc_len = 10. (i.e. the content is split into 10 sentences, each 30 words long)
48 | ch_title_len = 52, ch_sent_len = 52, ch_doc_len = 10.
49 | Without sentence splitting:
50 | wd_title_len = 30, wd_content_len = 150.
51 | ch_title_len = 52, ch_content_len = 300.
52 |
53 |
54 | ### To do
55 | - Read the data from tfrecord files. This would make it possible to change batch_size at any time, and shuffling would be more uniform than with numpy.
56 | - Add sequence-length information. Here every sequence is truncated or padded to a fixed length, and the padded part is not handled when computing the loss, which may hurt accuracy. Pass sequence_length to dynamic_rnn so that the padded part is ignored during computation. Combining tf.train.SequenceExample() with tf.train.batch() for automatic padding would also reduce the data volume.
--------------------------------------------------------------------------------
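creat_batch_data.py itself is not included in this section, so the following is only a minimal sketch of the packing scheme the README describes (fixed-length padding/truncation, a fixed-seed shuffle with a 100,000-sample validation split, one .npz file per batch). The helper names and the right-side padding are assumptions, not the repository's actual data_helpers code:

```python
# -*- coding:utf-8 -*-
# Sketch only: the real pipeline lives in creat_batch_data.py / data_helpers.py (not shown here).
import numpy as np

def pad_cut(ids, fixed_len):
    """Truncate to fixed_len, or pad with 0 (the <PAD> id) up to fixed_len; padding side is assumed."""
    ids = list(ids)[:fixed_len]
    return ids + [0] * (fixed_len - len(ids))

def save_batches(X, y, save_path, batch_size=128):
    """Store each batch as a single .npz file with the two arrays X and y."""
    for batch_id in range(len(X) // batch_size):
        begin = batch_id * batch_size
        np.savez(save_path + str(batch_id) + '.npz',
                 X=X[begin:begin + batch_size], y=y[begin:begin + batch_size])

train_title = np.load('../data/wd_train_title.npy')      # variable-length id lists from word2id.py
train_content = np.load('../data/wd_train_content.npy')
X = np.asarray([pad_cut(t, 30) + pad_cut(c, 150)          # wd_title_len = 30, wd_content_len = 150
                for t, c in zip(train_title, train_content)])
y = np.load('../data/y_tr.npy')

np.random.seed(13)                                        # fixed seed; 100,000 samples go to validation
new_index = np.random.permutation(len(X))
X, y = X[new_index], y[new_index]
save_batches(X[:100000], y[:100000], '../data/wd-data/data_valid/')
save_batches(X[100000:], y[100000:], '../data/wd-data/data_train/')
```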
/zhihu-text-classification-master/data_process/embed2ndarray.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import division
4 | from __future__ import print_function
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import word2vec
9 | import pickle
10 | import os
11 |
12 | SPECIAL_SYMBOL = ['<PAD>', '<UNK>']  # add these special symbols to the word(char) embeddings.
13 |
14 |
15 | def get_word_embedding():
16 | """Extract the word vectors and save them to ../data/word_embedding.npy."""
17 | print('getting the word_embedding.npy')
18 | wv = word2vec.load('../raw_data/word_embedding.txt')
19 | word_embedding = wv.vectors
20 | words = wv.vocab
21 | n_special_sym = len(SPECIAL_SYMBOL)
22 | sr_id2word = pd.Series(words, index=range(n_special_sym, n_special_sym + len(words)))
23 | sr_word2id = pd.Series(range(n_special_sym, n_special_sym + len(words)), index=words)
24 | # add the special symbols: <PAD> -> 0, <UNK> -> 1
25 | embedding_size = 256
26 | vec_special_sym = np.random.randn(n_special_sym, embedding_size)
27 | for i in range(n_special_sym):
28 | sr_id2word[i] = SPECIAL_SYMBOL[i]
29 | sr_word2id[SPECIAL_SYMBOL[i]] = i
30 | word_embedding = np.vstack([vec_special_sym, word_embedding])
31 | # save the word embedding matrix
32 | save_path = '../data/'
33 | if not os.path.exists(save_path):
34 | os.makedirs(save_path)
35 | np.save(save_path + 'word_embedding.npy', word_embedding)
36 | # save the word <-> id mapping
37 | with open(save_path + 'sr_word2id.pkl', 'wb') as outp:
38 | pickle.dump(sr_id2word, outp)
39 | pickle.dump(sr_word2id, outp)
40 | print('Saved word_embedding.npy to ../data/word_embedding.npy')
41 |
42 |
43 | def get_char_embedding():
44 | """Extract the character vectors and save them to ../data/char_embedding.npy."""
45 | print('getting the char_embedding.npy')
46 | wv = word2vec.load('../raw_data/char_embedding.txt')
47 | char_embedding = wv.vectors
48 | chars = wv.vocab
49 | n_special_sym = len(SPECIAL_SYMBOL)
50 | sr_id2char = pd.Series(chars, index=range(n_special_sym, n_special_sym + len(chars)))
51 | sr_char2id = pd.Series(range(n_special_sym, n_special_sym + len(chars)), index=chars)
52 |
53 | # add the special symbols: <PAD> -> 0, <UNK> -> 1
54 | embedding_size = 256
55 |
56 | vec_special_sym = np.random.randn(n_special_sym, embedding_size)
57 | for i in range(n_special_sym):
58 | sr_id2char[i] = SPECIAL_SYMBOL[i]
59 | sr_char2id[SPECIAL_SYMBOL[i]] = i
60 | char_embedding = np.vstack([vec_special_sym, char_embedding])
61 | # save the char embedding matrix
62 | save_path = '../data/'
63 | if not os.path.exists(save_path):
64 | os.makedirs(save_path)
65 | np.save(save_path + 'char_embedding.npy', char_embedding)
66 | # save the char <-> id mapping
67 | with open(save_path + 'sr_char2id.pkl', 'wb') as outp:
68 | pickle.dump(sr_id2char, outp)
69 | pickle.dump(sr_char2id, outp)
70 | print('Saved char_embedding.npy to ../data/char_embedding.npy')
71 |
72 |
73 | if __name__ == '__main__':
74 | get_word_embedding()
75 | get_char_embedding()
76 |
--------------------------------------------------------------------------------
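The network.py files that consume these artifacts are not part of this section, so here is only a minimal sketch, assuming TensorFlow 1.x (the tf.Session / tf.app.run usage in the predict.py scripts below points the same way), of how the saved .npy matrix might seed an embedding layer:

```python
# -*- coding:utf-8 -*-
import numpy as np
import tensorflow as tf  # TF 1.x style, to match the rest of the repository

W_embedding = np.load('../data/word_embedding.npy')        # shape: (vocab_size + 2, 256)
embedding = tf.get_variable('embedding', shape=W_embedding.shape,
                            initializer=tf.constant_initializer(W_embedding))
X_inputs = tf.placeholder(tf.int64, [None, 30], name='X_inputs')  # e.g. a batch of padded titles
inputs = tf.nn.embedding_lookup(embedding, X_inputs)               # (batch_size, 30, 256)
```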
/zhihu-text-classification-master/data_process/word2id.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import division
4 | from __future__ import print_function
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import pickle
9 | from multiprocessing import Pool
10 | from tqdm import tqdm
11 | import time
12 |
13 | save_path = '../data/'
14 | with open(save_path + 'sr_word2id.pkl', 'rb') as inp:
15 | sr_id2word = pickle.load(inp)
16 | sr_word2id = pickle.load(inp)
17 | dict_word2id = dict()
18 | for i in range(len(sr_word2id)):
19 | dict_word2id[sr_word2id.index[i]] = sr_word2id.values[i]
20 |
21 |
22 | def get_id(word):
23 | """Get the id corresponding to `word`.
24 | Words that are not in the vocabulary are replaced with <UNK> (id 1).
25 | """
26 | if word not in dict_word2id:
27 | return 1
28 | else:
29 | return dict_word2id[word]
30 |
31 |
32 | def get_id4words(words):
33 | """Convert a comma-separated word string to the corresponding ids."""
34 | words = words.strip().split(',')  # split into words first
35 | ids = list(map(get_id, words))  # look up the ids
36 | return ids
37 |
38 |
39 | def test_word2id():
40 | """Convert all words in the eval set to their ids."""
41 | time0 = time.time()
42 | print('Processing eval data.')
43 | df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4],
44 | names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
45 | print('test question number %d' % len(df_eval))
46 | # for questions without a title, use the content instead
47 | na_title_indexs = list()
48 | for i in range(len(df_eval)):
49 | word_title = df_eval.word_title.values[i]
50 | if type(word_title) is float:
51 | na_title_indexs.append(i)
52 | print('There are %d test questions without title.' % len(na_title_indexs))
53 | for na_index in na_title_indexs:
54 | df_eval.at[na_index, 'word_title'] = df_eval.at[na_index, 'word_content']
55 | # for questions without content, use the title instead
56 | na_content_indexs = list()
57 | for i in tqdm(range(len(df_eval))):
58 | word_content = df_eval.word_content.values[i]
59 | if type(word_content) is float:
60 | na_content_indexs.append(i)
61 | print('There are %d test questions without content.' % len(na_content_indexs))
62 | for na_index in tqdm(na_content_indexs):
63 | df_eval.at[na_index, 'word_content'] = df_eval.at[na_index, 'word_title']
64 | # convert to id form
65 | p = Pool()
66 | eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values))
67 | np.save('../data/wd_eval_title.npy', eval_title)
68 | eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values))
69 | np.save('../data/wd_eval_content.npy', eval_content)
70 | p.close()
71 | p.join()
72 | print('Finished converting the eval words to ids. Took %g s' % (time.time() - time0))
73 |
74 |
75 | def train_word2id():
76 | """Convert all words in the training set to their ids."""
77 | time0 = time.time()
78 | print('Processing train data.')
79 | df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 2, 4],
80 | names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object})
81 | print('training question number %d ' % len(df_train))
82 | # for questions without content, use the title instead
83 | na_content_indexs = list()
84 | for i in tqdm(range(len(df_train))):
85 | word_content = df_train.word_content.values[i]
86 | if type(word_content) is float:
87 | na_content_indexs.append(i)
88 | print('There are %d train questions without content.' % len(na_content_indexs))
89 | for na_index in tqdm(na_content_indexs):
90 | df_train.at[na_index, 'word_content'] = df_train.at[na_index, 'word_title']
91 | # drop questions without a title
92 | na_title_indexs = list()
93 | for i in range(len(df_train)):
94 | word_title = df_train.word_title.values[i]
95 | if type(word_title) is float:
96 | na_title_indexs.append(i)
97 | print('There are %d train questions without title.' % len(na_title_indexs))
98 | df_train = df_train.drop(na_title_indexs)
99 | print('After dropping, training question number(should be 2999952) = %d' % len(df_train))
100 | # convert to id form
101 | p = Pool()
102 | train_title = np.asarray(p.map(get_id4words, df_train.word_title.values))
103 | np.save('../data/wd_train_title.npy', train_title)
104 | train_content = np.asarray(p.map(get_id4words, df_train.word_content.values))
105 | np.save('../data/wd_train_content.npy', train_content)
106 | p.close()
107 | p.join()
108 | print('Finished converting the training words to ids. Took %g s' % (time.time() - time0))
109 |
110 |
111 | if __name__ == '__main__':
112 | test_word2id()
113 | train_word2id()
114 |
--------------------------------------------------------------------------------
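A side note on the missing-value checks above: titles and contents that are absent in the raw tsv load as NaN, which is why `type(word_title) is float` works as an emptiness test. A more explicit way to collect those rows is pandas' own null check; this is only a sketch, not the repository's code:

```python
import numpy as np
import pandas as pd

df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4],
                      names=['question_id', 'word_title', 'word_content'],
                      dtype={'question_id': object})
# indices of rows whose title is missing (NaN); equivalent to the type-is-float loop above
na_title_indexs = np.where(pd.isnull(df_eval.word_title.values))[0].tolist()
```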
/zhihu-text-classification-master/models/wd_4_han/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import time
11 | import network
12 |
13 | sys.path.append('../..')
14 | from evaluator import score_eval
15 |
16 | settings = network.Settings()
17 | title_len = settings.title_len
18 | model_name = settings.model_name
19 | ckpt_path = settings.ckpt_path
20 |
21 | local_scores_path = '../../local_scores/'
22 | scores_path = '../../scores/'
23 | if not os.path.exists(local_scores_path):
24 | os.makedirs(local_scores_path)
25 | if not os.path.exists(scores_path):
26 | os.makedirs(scores_path)
27 |
28 | embedding_path = '../../data/word_embedding.npy'
29 | data_valid_path = '../../data/wd-data/seg_valid/'
30 | data_test_path = '../../data/wd-data/seg_test/'
31 | va_batches = os.listdir(data_valid_path)
32 | te_batches = os.listdir(data_test_path)  # list of batch file names
33 | n_va_batches = len(va_batches)
34 | n_te_batches = len(te_batches)
35 |
36 |
37 | def get_batch(batch_id):
38 | """get a batch from valid data"""
39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz')
40 | X_batch = new_batch['X']
41 | y_batch = new_batch['y']
42 | X1_batch = X_batch[:, :title_len]
43 | X2_batch = X_batch[:, title_len:]
44 | return [X1_batch, X2_batch, y_batch]
45 |
46 |
47 | def get_test_batch(batch_id):
48 | """get a batch from test data"""
49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy')
50 | X1_batch = X_batch[:, :title_len]
51 | X2_batch = X_batch[:, title_len:]
52 | return [X1_batch, X2_batch]
53 |
54 |
55 | def local_predict(sess, model):
56 | """Test on the valid data."""
57 | time0 = time.time()
58 | predict_labels_list = list()  # all predicted labels
59 | marked_labels_list = list()
60 | predict_scores = list()
61 | for i in tqdm(xrange(n_va_batches)):
62 | [X1_batch, X2_batch, y_batch] = get_batch(i)
63 | marked_labels_list.extend(y_batch)
64 | _batch_size = len(X1_batch)
65 | fetches = [model.y_pred]
66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
68 | predict_labels = sess.run(fetches, feed_dict)[0]
69 | predict_scores.append(predict_labels)
70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 largest scores
71 | predict_labels_list.extend(predict_labels)
72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
75 | predict_scores = np.vstack(np.asarray(predict_scores))
76 | local_scores_name = local_scores_path + model_name + '.npy'
77 | np.save(local_scores_name, predict_scores)
78 | print('local_scores.shape=', predict_scores.shape)
79 | print('Wrote the scores to %s, time %g s' % (local_scores_name, time.time() - time0))
80 |
81 |
82 | def predict(sess, model):
83 | """Test on the test data."""
84 | time0 = time.time()
85 | predict_scores = list()
86 | for i in tqdm(xrange(n_te_batches)):
87 | [X1_batch, X2_batch] = get_test_batch(i)
88 | _batch_size = len(X1_batch)
89 | fetches = [model.y_pred]
90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
92 | predict_labels = sess.run(fetches, feed_dict)[0]
93 | predict_scores.append(predict_labels)
94 | predict_scores = np.vstack(np.asarray(predict_scores))
95 | scores_name = scores_path + model_name + '.npy'
96 | np.save(scores_name, predict_scores)
97 | print('scores.shape=', predict_scores.shape)
98 | print('Wrote the scores to %s, time %g s' % (scores_name, time.time() - time0))
99 |
100 |
101 | def main(_):
102 | if not os.path.exists(ckpt_path + 'checkpoint'):
103 | print('There is no saved model, please check the ckpt path')
104 | exit()
105 | print('Loading model...')
106 | W_embedding = np.load(embedding_path)
107 | config = tf.ConfigProto()
108 | config.gpu_options.allow_growth = True
109 | with tf.Session(config=config) as sess:
110 | model = network.HAN(W_embedding, settings)
111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
112 | print('Local predicting...')
113 | local_predict(sess, model)
114 | print('Test predicting...')
115 | predict(sess, model)
116 |
117 |
118 | if __name__ == '__main__':
119 | tf.app.run()
120 |
--------------------------------------------------------------------------------
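The `argsort()[-1:-6:-1]` slice used in local_predict (and repeated in the other predict.py scripts below) is easy to misread; a small standalone illustration of what it returns for one made-up row of per-label scores:

```python
import numpy as np

scores = np.array([0.10, 0.70, 0.05, 0.90, 0.30, 0.60])  # hypothetical scores for one question
top5 = scores.argsort()[-1:-6:-1]  # argsort is ascending; the slice takes the last 5 indices, reversed
print(top5)                        # [3 1 5 4 0] -> label ids from highest to lowest score
```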
/zhihu-text-classification-master/models/wd_2_hcnn/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import time
11 | import network
12 |
13 | sys.path.append('../..')
14 | from evaluator import score_eval
15 |
16 | settings = network.Settings()
17 | title_len = settings.title_len
18 | model_name = settings.model_name
19 | ckpt_path = settings.ckpt_path
20 |
21 | local_scores_path = '../../local_scores/'
22 | scores_path = '../../scores/'
23 | if not os.path.exists(local_scores_path):
24 | os.makedirs(local_scores_path)
25 | if not os.path.exists(scores_path):
26 | os.makedirs(scores_path)
27 |
28 | embedding_path = '../../data/word_embedding.npy'
29 | data_valid_path = '../../data/wd-data/seg_valid/'
30 | data_test_path = '../../data/wd-data/seg_test/'
31 | va_batches = os.listdir(data_valid_path)
32 | te_batches = os.listdir(data_test_path)  # list of batch file names
33 | n_va_batches = len(va_batches)
34 | n_te_batches = len(te_batches)
35 |
36 |
37 | def get_batch(batch_id):
38 | """get a batch from valid data"""
39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz')
40 | X_batch = new_batch['X']
41 | y_batch = new_batch['y']
42 | X1_batch = X_batch[:, :title_len]
43 | X2_batch = X_batch[:, title_len:]
44 | return [X1_batch, X2_batch, y_batch]
45 |
46 |
47 | def get_test_batch(batch_id):
48 | """get a batch from test data"""
49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy')
50 | X1_batch = X_batch[:, :title_len]
51 | X2_batch = X_batch[:, title_len:]
52 | return [X1_batch, X2_batch]
53 |
54 |
55 | def local_predict(sess, model):
56 | """Test on the valid data."""
57 | time0 = time.time()
58 | predict_labels_list = list()  # all predicted labels
59 | marked_labels_list = list()
60 | predict_scores = list()
61 | for i in tqdm(xrange(n_va_batches)):
62 | [X1_batch, X2_batch, y_batch] = get_batch(i)
63 | marked_labels_list.extend(y_batch)
64 | _batch_size = len(X1_batch)
65 | fetches = [model.y_pred]
66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
68 | predict_labels = sess.run(fetches, feed_dict)[0]
69 | predict_scores.append(predict_labels)
70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 largest scores
71 | predict_labels_list.extend(predict_labels)
72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
75 | predict_scores = np.vstack(np.asarray(predict_scores))
76 | local_scores_name = local_scores_path + model_name + '.npy'
77 | np.save(local_scores_name, predict_scores)
78 | print('local_scores.shape=', predict_scores.shape)
79 | print('Wrote the scores to %s, time %g s' % (local_scores_name, time.time() - time0))
80 |
81 |
82 | def predict(sess, model):
83 | """Test on the test data."""
84 | time0 = time.time()
85 | predict_scores = list()
86 | for i in tqdm(xrange(n_te_batches)):
87 | [X1_batch, X2_batch] = get_test_batch(i)
88 | _batch_size = len(X1_batch)
89 | fetches = [model.y_pred]
90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
92 | predict_labels = sess.run(fetches, feed_dict)[0]
93 | predict_scores.append(predict_labels)
94 | predict_scores = np.vstack(np.asarray(predict_scores))
95 | scores_name = scores_path + model_name + '.npy'
96 | np.save(scores_name, predict_scores)
97 | print('scores.shape=', predict_scores.shape)
98 | print('Wrote the scores to %s, time %g s' % (scores_name, time.time() - time0))
99 |
100 |
101 | def main(_):
102 | if not os.path.exists(ckpt_path + 'checkpoint'):
103 | print('There is no saved model, please check the ckpt path')
104 | exit()
105 | print('Loading model...')
106 | W_embedding = np.load(embedding_path)
107 | config = tf.ConfigProto()
108 | config.gpu_options.allow_growth = True
109 | with tf.Session(config=config) as sess:
110 | model = network.HCNN(W_embedding, settings)
111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
112 | print('Local predicting...')
113 | local_predict(sess, model)
114 | print('Test predicting...')
115 | predict(sess, model)
116 |
117 |
118 | if __name__ == '__main__':
119 | tf.app.run()
120 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_3_bigru/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import time
11 | import network
12 |
13 | sys.path.append('../..')
14 | from evaluator import score_eval
15 |
16 | settings = network.Settings()
17 | title_len = settings.title_len
18 | model_name = settings.model_name
19 | ckpt_path = settings.ckpt_path
20 |
21 | local_scores_path = '../../local_scores/'
22 | scores_path = '../../scores/'
23 | if not os.path.exists(local_scores_path):
24 | os.makedirs(local_scores_path)
25 | if not os.path.exists(scores_path):
26 | os.makedirs(scores_path)
27 |
28 | embedding_path = '../../data/word_embedding.npy'
29 | data_valid_path = '../../data/wd-data/data_valid/'
30 | data_test_path = '../../data/wd-data/data_test/'
31 | va_batches = os.listdir(data_valid_path)
32 | te_batches = os.listdir(data_test_path)  # list of batch file names
33 | n_va_batches = len(va_batches)
34 | n_te_batches = len(te_batches)
35 |
36 |
37 | def get_batch(batch_id):
38 | """get a batch from valid data"""
39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz')
40 | X_batch = new_batch['X']
41 | y_batch = new_batch['y']
42 | X1_batch = X_batch[:, :title_len]
43 | X2_batch = X_batch[:, title_len:]
44 | return [X1_batch, X2_batch, y_batch]
45 |
46 |
47 | def get_test_batch(batch_id):
48 | """get a batch from test data"""
49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy')
50 | X1_batch = X_batch[:, :title_len]
51 | X2_batch = X_batch[:, title_len:]
52 | return [X1_batch, X2_batch]
53 |
54 |
55 | def local_predict(sess, model):
56 | """Test on the valid data."""
57 | time0 = time.time()
58 | predict_labels_list = list()  # all predicted labels
59 | marked_labels_list = list()
60 | predict_scores = list()
61 | for i in tqdm(range(n_va_batches)):
62 | [X1_batch, X2_batch, y_batch] = get_batch(i)
63 | marked_labels_list.extend(y_batch)
64 | _batch_size = len(X1_batch)
65 | fetches = [model.y_pred]
66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
68 | predict_labels = sess.run(fetches, feed_dict)[0]
69 | predict_scores.append(predict_labels)
70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 largest scores
71 | predict_labels_list.extend(predict_labels)
72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
75 | predict_scores = np.vstack(np.asarray(predict_scores))
76 | local_scores_name = local_scores_path + model_name + '.npy'
77 | np.save(local_scores_name, predict_scores)
78 | print('local_scores.shape=', predict_scores.shape)
79 | print('Wrote the scores to %s, time %g s' % (local_scores_name, time.time() - time0))
80 |
81 |
82 | def predict(sess, model):
83 | """Test on the test data."""
84 | time0 = time.time()
85 | predict_scores = list()
86 | for i in tqdm(range(n_te_batches)):
87 | [X1_batch, X2_batch] = get_test_batch(i)
88 | _batch_size = len(X1_batch)
89 | fetches = [model.y_pred]
90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
92 | predict_labels = sess.run(fetches, feed_dict)[0]
93 | predict_scores.append(predict_labels)
94 | predict_scores = np.vstack(np.asarray(predict_scores))
95 | scores_name = scores_path + model_name + '.npy'
96 | np.save(scores_name, predict_scores)
97 | print('scores.shape=', predict_scores.shape)
98 | print('Wrote the scores to %s, time %g s' % (scores_name, time.time() - time0))
99 |
100 |
101 | def main(_):
102 | if not os.path.exists(ckpt_path + 'checkpoint'):
103 | print('There is no saved model, please check the ckpt path')
104 | exit()
105 | print('Loading model...')
106 | W_embedding = np.load(embedding_path)
107 | config = tf.ConfigProto()
108 | config.gpu_options.allow_growth = True
109 | with tf.Session(config=config) as sess:
110 | model = network.BiGRU(W_embedding, settings)
111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
112 | print('Local predicting...')
113 | local_predict(sess, model)
114 | print('Test predicting...')
115 | predict(sess, model)
116 |
117 |
118 | if __name__ == '__main__':
119 | tf.app.run()
120 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_6_rcnn/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import time
11 | import network
12 |
13 | sys.path.append('../..')
14 | from evaluator import score_eval
15 |
16 | settings = network.Settings()
17 | title_len = settings.title_len
18 | model_name = settings.model_name
19 | ckpt_path = settings.ckpt_path
20 |
21 | local_scores_path = '../../local_scores/'
22 | scores_path = '../../scores/'
23 | if not os.path.exists(local_scores_path):
24 | os.makedirs(local_scores_path)
25 | if not os.path.exists(scores_path):
26 | os.makedirs(scores_path)
27 |
28 | embedding_path = '../../data/word_embedding.npy'
29 | data_valid_path = '../../data/wd-data/data_valid/'
30 | data_test_path = '../../data/wd-data/data_test/'
31 | va_batches = os.listdir(data_valid_path)
32 | te_batches = os.listdir(data_test_path)  # list of batch file names
33 | n_va_batches = len(va_batches)
34 | n_te_batches = len(te_batches)
35 |
36 |
37 | def get_batch(batch_id):
38 | """get a batch from valid data"""
39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz')
40 | X_batch = new_batch['X']
41 | y_batch = new_batch['y']
42 | X1_batch = X_batch[:, :title_len]
43 | X2_batch = X_batch[:, title_len:]
44 | return [X1_batch, X2_batch, y_batch]
45 |
46 |
47 | def get_test_batch(batch_id):
48 | """get a batch from test data"""
49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy')
50 | X1_batch = X_batch[:, :title_len]
51 | X2_batch = X_batch[:, title_len:]
52 | return [X1_batch, X2_batch]
53 |
54 |
55 | def local_predict(sess, model):
56 | """Test on the valid data."""
57 | time0 = time.time()
58 | predict_labels_list = list()  # all predicted labels
59 | marked_labels_list = list()
60 | predict_scores = list()
61 | for i in tqdm(xrange(n_va_batches)):
62 | [X1_batch, X2_batch, y_batch] = get_batch(i)
63 | marked_labels_list.extend(y_batch)
64 | _batch_size = len(X1_batch)
65 | fetches = [model.y_pred]
66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
68 | predict_labels = sess.run(fetches, feed_dict)[0]
69 | predict_scores.append(predict_labels)
70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 largest scores
71 | predict_labels_list.extend(predict_labels)
72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
75 | predict_scores = np.vstack(np.asarray(predict_scores))
76 | local_scores_name = local_scores_path + model_name + '.npy'
77 | np.save(local_scores_name, predict_scores)
78 | print('local_scores.shape=', predict_scores.shape)
79 | print('Wrote the scores to %s, time %g s' % (local_scores_name, time.time() - time0))
80 |
81 |
82 | def predict(sess, model):
83 | """Test on the test data."""
84 | time0 = time.time()
85 | predict_scores = list()
86 | for i in tqdm(xrange(n_te_batches)):
87 | [X1_batch, X2_batch] = get_test_batch(i)
88 | _batch_size = len(X1_batch)
89 | fetches = [model.y_pred]
90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
92 | predict_labels = sess.run(fetches, feed_dict)[0]
93 | predict_scores.append(predict_labels)
94 | predict_scores = np.vstack(np.asarray(predict_scores))
95 | scores_name = scores_path + model_name + '.npy'
96 | np.save(scores_name, predict_scores)
97 | print('scores.shape=', predict_scores.shape)
98 | print('Wrote the scores to %s, time %g s' % (scores_name, time.time() - time0))
99 |
100 |
101 | def main(_):
102 | if not os.path.exists(ckpt_path + 'checkpoint'):
103 | print('There is no saved model, please check the ckpt path')
104 | exit()
105 | print('Loading model...')
106 | W_embedding = np.load(embedding_path)
107 | config = tf.ConfigProto()
108 | config.gpu_options.allow_growth = True
109 | with tf.Session(config=config) as sess:
110 | model = network.RCNN(W_embedding, settings)
111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
112 | print('Local predicting...')
113 | local_predict(sess, model)
114 | print('Test predicting...')
115 | predict(sess, model)
116 |
117 |
118 | if __name__ == '__main__':
119 | tf.app.run()
120 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_1_1_cnn_concat/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import time
11 | import network
12 |
13 | sys.path.append('../..')
14 | from evaluator import score_eval
15 |
16 | settings = network.Settings()
17 | title_len = settings.title_len
18 | model_name = settings.model_name
19 | ckpt_path = settings.ckpt_path
20 |
21 | local_scores_path = '../../local_scores/'
22 | scores_path = '../../scores/'
23 | if not os.path.exists(local_scores_path):
24 | os.makedirs(local_scores_path)
25 | if not os.path.exists(scores_path):
26 | os.makedirs(scores_path)
27 |
28 | embedding_path = '../../data/word_embedding.npy'
29 | data_valid_path = '../../data/wd-data/data_valid/'
30 | data_test_path = '../../data/wd-data/data_test/'
31 | va_batches = os.listdir(data_valid_path)
32 | te_batches = os.listdir(data_test_path)  # list of batch file names
33 | n_va_batches = len(va_batches)
34 | n_te_batches = len(te_batches)
35 |
36 |
37 | def get_batch(batch_id):
38 | """get a batch from valid data"""
39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz')
40 | X_batch = new_batch['X']
41 | y_batch = new_batch['y']
42 | X1_batch = X_batch[:, :title_len]
43 | X2_batch = X_batch[:, title_len:]
44 | return [X1_batch, X2_batch, y_batch]
45 |
46 |
47 | def get_test_batch(batch_id):
48 | """get a batch from test data"""
49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy')
50 | X1_batch = X_batch[:, :title_len]
51 | X2_batch = X_batch[:, title_len:]
52 | return [X1_batch, X2_batch]
53 |
54 |
55 | def local_predict(sess, model):
56 | """Test on the valid data."""
57 | time0 = time.time()
58 | predict_labels_list = list()  # all predicted labels
59 | marked_labels_list = list()
60 | predict_scores = list()
61 | for i in tqdm(xrange(n_va_batches)):
62 | [X1_batch, X2_batch, y_batch] = get_batch(i)
63 | marked_labels_list.extend(y_batch)
64 | _batch_size = len(X1_batch)
65 | fetches = [model.y_pred]
66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
68 | predict_labels = sess.run(fetches, feed_dict)[0]
69 | predict_scores.append(predict_labels)
70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 largest scores
71 | predict_labels_list.extend(predict_labels)
72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
75 | predict_scores = np.vstack(np.asarray(predict_scores))
76 | local_scores_name = local_scores_path + model_name + '.npy'
77 | np.save(local_scores_name, predict_scores)
78 | print('local_scores.shape=', predict_scores.shape)
79 | print('Wrote the scores to %s, time %g s' % (local_scores_name, time.time() - time0))
80 |
81 |
82 | def predict(sess, model):
83 | """Test on the test data."""
84 | time0 = time.time()
85 | predict_scores = list()
86 | for i in tqdm(xrange(n_te_batches)):
87 | [X1_batch, X2_batch] = get_test_batch(i)
88 | _batch_size = len(X1_batch)
89 | fetches = [model.y_pred]
90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
92 | predict_labels = sess.run(fetches, feed_dict)[0]
93 | predict_scores.append(predict_labels)
94 | predict_scores = np.vstack(np.asarray(predict_scores))
95 | scores_name = scores_path + model_name + '.npy'
96 | np.save(scores_name, predict_scores)
97 | print('scores.shape=', predict_scores.shape)
98 | print('Wrote the scores to %s, time %g s' % (scores_name, time.time() - time0))
99 |
100 |
101 | def main(_):
102 | if not os.path.exists(ckpt_path + 'checkpoint'):
103 | print('There is no saved model, please check the ckpt path')
104 | exit()
105 | print('Loading model...')
106 | W_embedding = np.load(embedding_path)
107 | config = tf.ConfigProto()
108 | config.gpu_options.allow_growth = True
109 | with tf.Session(config=config) as sess:
110 | model = network.TextCNN(W_embedding, settings)
111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
112 | print('Local predicting...')
113 | local_predict(sess, model)
114 | print('Test predicting...')
115 | predict(sess, model)
116 |
117 |
118 | if __name__ == '__main__':
119 | tf.app.run()
120 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_1_2_cnn_max/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import time
11 | import network
12 |
13 | sys.path.append('../..')
14 | from evaluator import score_eval
15 |
16 | settings = network.Settings()
17 | title_len = settings.title_len
18 | model_name = settings.model_name
19 | ckpt_path = settings.ckpt_path
20 |
21 | local_scores_path = '../../local_scores/'
22 | scores_path = '../../scores/'
23 | if not os.path.exists(local_scores_path):
24 | os.makedirs(local_scores_path)
25 | if not os.path.exists(scores_path):
26 | os.makedirs(scores_path)
27 |
28 | embedding_path = '../../data/word_embedding.npy'
29 | data_valid_path = '../../data/wd-data/data_valid/'
30 | data_test_path = '../../data/wd-data/data_test/'
31 | va_batches = os.listdir(data_valid_path)
32 | te_batches = os.listdir(data_test_path)  # list of batch file names
33 | n_va_batches = len(va_batches)
34 | n_te_batches = len(te_batches)
35 |
36 |
37 | def get_batch(batch_id):
38 | """get a batch from valid data"""
39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz')
40 | X_batch = new_batch['X']
41 | y_batch = new_batch['y']
42 | X1_batch = X_batch[:, :title_len]
43 | X2_batch = X_batch[:, title_len:]
44 | return [X1_batch, X2_batch, y_batch]
45 |
46 |
47 | def get_test_batch(batch_id):
48 | """get a batch from test data"""
49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy')
50 | X1_batch = X_batch[:, :title_len]
51 | X2_batch = X_batch[:, title_len:]
52 | return [X1_batch, X2_batch]
53 |
54 |
55 | def local_predict(sess, model):
56 | """Test on the valid data."""
57 | time0 = time.time()
58 | predict_labels_list = list()  # all predicted labels
59 | marked_labels_list = list()
60 | predict_scores = list()
61 | for i in tqdm(xrange(n_va_batches)):
62 | [X1_batch, X2_batch, y_batch] = get_batch(i)
63 | marked_labels_list.extend(y_batch)
64 | _batch_size = len(X1_batch)
65 | fetches = [model.y_pred]
66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
68 | predict_labels = sess.run(fetches, feed_dict)[0]
69 | predict_scores.append(predict_labels)
70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 largest scores
71 | predict_labels_list.extend(predict_labels)
72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
75 | predict_scores = np.vstack(np.asarray(predict_scores))
76 | local_scores_name = local_scores_path + model_name + '.npy'
77 | np.save(local_scores_name, predict_scores)
78 | print('local_scores.shape=', predict_scores.shape)
79 | print('Wrote the scores to %s, time %g s' % (local_scores_name, time.time() - time0))
80 |
81 |
82 | def predict(sess, model):
83 | """Test on the test data."""
84 | time0 = time.time()
85 | predict_scores = list()
86 | for i in tqdm(xrange(n_te_batches)):
87 | [X1_batch, X2_batch] = get_test_batch(i)
88 | _batch_size = len(X1_batch)
89 | fetches = [model.y_pred]
90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
92 | predict_labels = sess.run(fetches, feed_dict)[0]
93 | predict_scores.append(predict_labels)
94 | predict_scores = np.vstack(np.asarray(predict_scores))
95 | scores_name = scores_path + model_name + '.npy'
96 | np.save(scores_name, predict_scores)
97 | print('scores.shape=', predict_scores.shape)
98 | print('Wrote the scores to %s, time %g s' % (scores_name, time.time() - time0))
99 |
100 |
101 | def main(_):
102 | if not os.path.exists(ckpt_path + 'checkpoint'):
103 | print('There is no saved model, please check the ckpt path')
104 | exit()
105 | print('Loading model...')
106 | W_embedding = np.load(embedding_path)
107 | config = tf.ConfigProto()
108 | config.gpu_options.allow_growth = True
109 | with tf.Session(config=config) as sess:
110 | model = network.TextCNN(W_embedding, settings)
111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
112 | print('Local predicting...')
113 | local_predict(sess, model)
114 | print('Test predicting...')
115 | predict(sess, model)
116 |
117 |
118 | if __name__ == '__main__':
119 | tf.app.run()
120 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_5_bigru_cnn/predict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import time
11 | import network
12 |
13 | sys.path.append('../..')
14 | from evaluator import score_eval
15 |
16 | settings = network.Settings()
17 | title_len = settings.title_len
18 | model_name = settings.model_name
19 | ckpt_path = settings.ckpt_path
20 |
21 | local_scores_path = '../../local_scores/'
22 | scores_path = '../../scores/'
23 | if not os.path.exists(local_scores_path):
24 | os.makedirs(local_scores_path)
25 | if not os.path.exists(scores_path):
26 | os.makedirs(scores_path)
27 |
28 | embedding_path = '../../data/word_embedding.npy'
29 | data_valid_path = '../../data/wd-data/data_valid/'
30 | data_test_path = '../../data/wd-data/data_test/'
31 | va_batches = os.listdir(data_valid_path)
32 | te_batches = os.listdir(data_test_path)  # list of batch file names
33 | n_va_batches = len(va_batches)
34 | n_te_batches = len(te_batches)
35 |
36 |
37 | def get_batch(batch_id):
38 | """get a batch from valid data"""
39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz')
40 | X_batch = new_batch['X']
41 | y_batch = new_batch['y']
42 | X1_batch = X_batch[:, :title_len]
43 | X2_batch = X_batch[:, title_len:]
44 | return [X1_batch, X2_batch, y_batch]
45 |
46 |
47 | def get_test_batch(batch_id):
48 | """get a batch from test data"""
49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy')
50 | X1_batch = X_batch[:, :title_len]
51 | X2_batch = X_batch[:, title_len:]
52 | return [X1_batch, X2_batch]
53 |
54 |
55 | def local_predict(sess, model):
56 | """Test on the valid data."""
57 | time0 = time.time()
58 | predict_labels_list = list()  # all predicted labels
59 | marked_labels_list = list()
60 | predict_scores = list()
61 | for i in tqdm(xrange(n_va_batches)):
62 | [X1_batch, X2_batch, y_batch] = get_batch(i)
63 | marked_labels_list.extend(y_batch)
64 | _batch_size = len(X1_batch)
65 | fetches = [model.y_pred]
66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
68 | predict_labels = sess.run(fetches, feed_dict)[0]
69 | predict_scores.append(predict_labels)
70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 largest scores
71 | predict_labels_list.extend(predict_labels)
72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1))
75 | predict_scores = np.vstack(np.asarray(predict_scores))
76 | local_scores_name = local_scores_path + model_name + '.npy'
77 | np.save(local_scores_name, predict_scores)
78 | print('local_scores.shape=', predict_scores.shape)
79 | print('Wrote the scores to %s, time %g s' % (local_scores_name, time.time() - time0))
80 |
81 |
82 | def predict(sess, model):
83 | """Test on the test data."""
84 | time0 = time.time()
85 | predict_scores = list()
86 | for i in tqdm(xrange(n_te_batches)):
87 | [X1_batch, X2_batch] = get_test_batch(i)
88 | _batch_size = len(X1_batch)
89 | fetches = [model.y_pred]
90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch,
91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
92 | predict_labels = sess.run(fetches, feed_dict)[0]
93 | predict_scores.append(predict_labels)
94 | predict_scores = np.vstack(np.asarray(predict_scores))
95 | scores_name = scores_path + model_name + '.npy'
96 | np.save(scores_name, predict_scores)
97 | print('scores.shape=', predict_scores.shape)
98 | print('Wrote the scores to %s, time %g s' % (scores_name, time.time() - time0))
99 |
100 |
101 | def main(_):
102 | if not os.path.exists(ckpt_path + 'checkpoint'):
103 | print('There is no saved model, please check the ckpt path')
104 | exit()
105 | print('Loading model...')
106 | W_embedding = np.load(embedding_path)
107 | config = tf.ConfigProto()
108 | config.gpu_options.allow_growth = True
109 | with tf.Session(config=config) as sess:
110 | model = network.BiGRU_CNN(W_embedding, settings)
111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
112 | print('Local predicting...')
113 | local_predict(sess, model)
114 | print('Test predicting...')
115 | predict(sess, model)
116 |
117 |
118 | if __name__ == '__main__':
119 | tf.app.run()
120 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/data_process/char2id.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import division
4 | from __future__ import print_function
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import pickle
9 | from multiprocessing import Pool
10 | from tqdm import tqdm
11 | import time
12 |
13 |
14 | save_path = '../data/'
15 | with open(save_path + 'sr_char2id.pkl', 'rb') as inp:
16 | sr_id2char = pickle.load(inp)
17 | sr_char2id = pickle.load(inp)
18 | dict_char2id = dict()
19 | for i in range(len(sr_char2id)):
20 | dict_char2id[sr_char2id.index[i]] = sr_char2id.values[i]
21 |
22 |
23 | def get_id(char):
24 | """Get the id corresponding to `char`.
25 | Characters that are not in the vocabulary are replaced with id 1.
26 | """
27 | if char not in dict_char2id:
28 | return 1
29 | else:
30 | return dict_char2id[char]
31 |
32 |
33 | def get_id4chars(chars):
34 | """Convert a comma-separated character string to the corresponding ids."""
35 | chars = chars.strip().split(',')  # split into characters first
36 | ids = list(map(get_id, chars))  # look up the ids
37 | return ids
38 |
39 |
40 | def test_char2id():
41 | """Convert all characters in the eval set to their ids."""
42 | time0 = time.time()
43 | print('Processing eval data.')
44 | df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 1, 3],
45 | names=['question_id', 'char_title', 'char_content'], dtype={'question_id': object})
46 | print('test question number %d' % len(df_eval))
47 | # for questions without a title, use the content instead
48 | na_title_indexs = list()
49 | for i in range(len(df_eval)):
50 | char_title = df_eval.char_title.values[i]
51 | if type(char_title) is float:
52 | na_title_indexs.append(i)
53 | print('There are %d test questions without title.' % len(na_title_indexs))
54 | for na_index in na_title_indexs:
55 | df_eval.at[na_index, 'char_title'] = df_eval.at[na_index, 'char_content']
56 | # for questions without content, use the title instead
57 | na_content_indexs = list()
58 | for i in tqdm(range(len(df_eval))):
59 | char_content = df_eval.char_content.values[i]
60 | if type(char_content) is float:
61 | na_content_indexs.append(i)
62 | print('There are %d test questions without content.' % len(na_content_indexs))
63 | for na_index in tqdm(na_content_indexs):
64 | df_eval.at[na_index, 'char_content'] = df_eval.at[na_index, 'char_title']
65 | # convert to id form
66 | p = Pool()
67 | eval_title = np.asarray(p.map(get_id4chars, df_eval.char_title.values))
68 | np.save('../data/ch_eval_title.npy', eval_title)
69 | eval_content = np.asarray(p.map(get_id4chars, df_eval.char_content.values))
70 | np.save('../data/ch_eval_content.npy', eval_content)
71 | p.close()
72 | p.join()
73 | print('Finished converting the eval chars to ids. Took %g s' % (time.time() - time0))
74 |
75 |
76 | def train_char2id():
77 | """Convert all characters in the training set to their ids."""
78 | time0 = time.time()
79 | print('Processing train data.')
80 | df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 1, 3],
81 | names=['question_id', 'char_title', 'char_content'], dtype={'question_id': object})
82 | print('training question number %d ' % len(df_train))
83 | # for questions without content, use the title instead
84 | na_content_indexs = list()
85 | for i in tqdm(range(len(df_train))):
86 | char_content = df_train.char_content.values[i]
87 | if type(char_content) is float:
88 | na_content_indexs.append(i)
89 | print('There are %d train questions without content.' % len(na_content_indexs))
90 | for na_index in tqdm(na_content_indexs):
91 | df_train.at[na_index, 'char_content'] = df_train.at[na_index, 'char_title']
92 | # questions without a title: drop the samples below, the same ones dropped in the word version
93 | na_title_indexs = [328877, 422123, 633584, 768738, 818616, 876828, 1273673, 1527297,
94 | 1636237, 1682969, 2052477, 2628516, 2657464, 2904162, 2993517]
95 | for i in range(len(df_train)):
96 | char_title = df_train.char_title.values[i]
97 | if type(char_title) is float:
98 | na_title_indexs.append(i)
99 | print('There are %d train questions without title.' % len(na_title_indexs))
100 | df_train = df_train.drop(na_title_indexs)
101 | print('After dropping, training question number(should be 2999952) = %d' % len(df_train))
102 | # convert to id form
103 | p = Pool()
104 | train_title = np.asarray(list(p.map(get_id4chars, df_train.char_title.values)))
105 | np.save('../data/ch_train_title.npy', train_title)
106 | train_content = np.asarray(p.map(get_id4chars, df_train.char_content.values))
107 | np.save('../data/ch_train_content.npy', train_content)
108 | p.close()
109 | p.join()
110 | print('Finished converting the training chars to ids. Took %g s' % (time.time() - time0))
111 |
112 |
113 | if __name__ == '__main__':
114 | test_char2id()
115 | train_char2id()
116 |
117 |
118 |
119 |
120 |
121 |
122 |
--------------------------------------------------------------------------------
/zhihu-text-classification-master/data_process/creat_batch_seg.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import division
4 | from __future__ import print_function
5 |
6 | import numpy as np
7 | from multiprocessing import Pool
8 | import sys
9 | import os
10 |
11 | sys.path.append('../')
12 | from data_helpers import pad_X30
13 | from data_helpers import pad_X52
14 | from data_helpers import wd_pad_cut_docs
15 | from data_helpers import ch_pad_cut_docs
16 | from data_helpers import train_batch
17 | from data_helpers import eval_batch
18 |
19 |
20 | wd_train_path = '../data/wd-data/seg_train/'
21 | wd_valid_path = '../data/wd-data/seg_valid/'
22 | wd_test_path = '../data/wd-data/seg_test/'
23 | ch_train_path = '../data/ch-data/seg_train/'
24 | ch_valid_path = '../data/ch-data/seg_valid/'
25 | ch_test_path = '../data/ch-data/seg_test/'
26 | paths = [wd_train_path, wd_valid_path, wd_test_path,
27 | ch_train_path, ch_valid_path, ch_test_path]
28 | for each in paths:
29 | if not os.path.exists(each):
30 | os.makedirs(each)
31 |
32 |
33 | # pack the word-level data into batches
34 | def wd_train_get_batch(title_len=30, batch_size=128):
35 | print('loading word train_title and train_content, this may take a few minutes, please wait.')
36 | train_title = np.load('../data/wd_train_title.npy')
37 | train_content = np.load('../data/wd_train_content.npy')
38 | p = Pool(6)
39 | X_title = np.asarray(p.map(pad_X30, train_title))
40 | X_content = np.asarray(p.map(wd_pad_cut_docs, train_content))
41 | p.close()
42 | p.join()
43 | X_content.shape = [-1, 30*10]
44 | X = np.hstack([X_title, X_content])
45 | y = np.load('../data/y_tr.npy')
46 | # Split out the validation set
47 | sample_num = X.shape[0]
48 | np.random.seed(13)
49 | valid_num = 100000
50 | new_index = np.random.permutation(sample_num)
51 | X = X[new_index]
52 | y = y[new_index]
53 | X_valid = X[:valid_num]
54 | y_valid = y[:valid_num]
55 | X_train = X[valid_num:]
56 | y_train = y[valid_num:]
57 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
58 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
59 | # Batch the validation set
60 | print('creating batch data.')
61 | sample_num = len(X_valid)
62 | print('valid_sample_num=%d' % sample_num)
63 | train_batch(X_valid, y_valid, wd_valid_path, batch_size)
64 | # Batch the training set
65 | sample_num = len(X_train)
66 | print('train_sample_num=%d' % sample_num)
67 | train_batch(X_train, y_train, wd_train_path, batch_size)
68 |
69 |
70 | def wd_test_get_batch(title_len=30, batch_size=128):
71 | print('loading word eval_title and eval_content.')
72 | eval_title = np.load('../data/wd_eval_title.npy')
73 | eval_content = np.load('../data/wd_eval_content.npy')
74 | p = Pool(6)
75 | X_title = np.asarray(p.map(pad_X30, eval_title))
76 | X_content = np.asarray(p.map(wd_pad_cut_docs, eval_content))
77 | p.close()
78 | p.join()
79 | X_content.shape = [-1, 30*10]
80 | X = np.hstack([X_title, X_content])
81 | sample_num = len(X)
82 | print('eval_sample_num=%d' % sample_num)
83 | eval_batch(X, wd_test_path, batch_size)
84 |
85 |
86 | # Pack the char-level data into batches
87 | def ch_train_get_batch(title_len=52, batch_size=128):
88 | print('loading char train_title and train_content, this may take a few minutes, please wait.')
89 | train_title = np.load('../data/ch_train_title.npy')
90 | train_content = np.load('../data/ch_train_content.npy')
91 | p = Pool(8)
92 | X_title = np.asarray(p.map(pad_X52, train_title))
93 | X_content = np.asarray(p.map(ch_pad_cut_docs, train_content))
94 | p.close()
95 | p.join()
96 | X_content.shape = [-1, 52*10]
97 | X = np.hstack([X_title, X_content])
98 | y = np.load('../data/y_tr.npy')
99 | # Split out the validation set
100 | sample_num = X.shape[0]
101 | np.random.seed(13)
102 | valid_num = 100000
103 | new_index = np.random.permutation(sample_num)
104 | X = X[new_index]
105 | y = y[new_index]
106 | X_valid = X[:valid_num]
107 | y_valid = y[:valid_num]
108 | X_train = X[valid_num:]
109 | y_train = y[valid_num:]
110 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
111 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
112 | # Batch the validation set
113 | print('creating batch data.')
114 | sample_num = len(X_valid)
115 | print('valid_sample_num=%d' % sample_num)
116 | train_batch(X_valid, y_valid, ch_valid_path, batch_size)
117 | # Batch the training set
118 | sample_num = len(X_train)
119 | print('train_sample_num=%d' % sample_num)
120 | train_batch(X_train, y_train, ch_train_path, batch_size)
121 |
122 |
123 | def ch_test_get_batch(title_len=52, batch_size=128):
124 | print('loading char eval_title and eval_content.')
125 | eval_title = np.load('../data/ch_eval_title.npy')
126 | eval_content = np.load('../data/ch_eval_content.npy')
127 | p = Pool()
128 | X_title = np.asarray(p.map(pad_X52, eval_title))
129 | X_content = np.asarray(p.map(ch_pad_cut_docs, eval_content))
130 | p.close()
131 | p.join()
132 | X_content.shape = [-1, 52*10]
133 | X = np.hstack([X_title, X_content])
134 | sample_num = len(X)
135 | print('eval_sample_num=%d' % sample_num)
136 | eval_batch(X, ch_test_path, batch_size)
137 |
138 |
139 | if __name__ == '__main__':
140 | wd_train_get_batch()
141 | wd_test_get_batch()
142 | ch_train_get_batch()
143 | ch_test_get_batch()
144 |
--------------------------------------------------------------------------------
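train_batch and eval_batch are imported from data_helpers.py, which is not part of this dump. Judging from how the model train.py scripts read the results back (np.load of '<batch_id>.npz' with keys 'X' and 'y'), a plausible sketch of the packing step is given below; the function bodies and the handling of the final partial batch are assumptions, not the project's actual implementation.

import numpy as np

def train_batch_sketch(X, y, save_path, batch_size=128):
    # Assumed behaviour: write consecutive batches as 0.npz, 1.npz, ... with keys 'X' and 'y'.
    n_batches = int(np.ceil(len(X) / float(batch_size)))
    for i in range(n_batches):
        np.savez(save_path + str(i) + '.npz',
                 X=X[i * batch_size:(i + 1) * batch_size],
                 y=y[i * batch_size:(i + 1) * batch_size])

def eval_batch_sketch(X, save_path, batch_size=128):
    # Assumed behaviour: same layout, but without labels.
    n_batches = int(np.ceil(len(X) / float(batch_size)))
    for i in range(n_batches):
        np.savez(save_path + str(i) + '.npz', X=X[i * batch_size:(i + 1) * batch_size])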
/zhihu-text-classification-master/data_process/creat_batch_data.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import division
4 | from __future__ import print_function
5 |
6 | import numpy as np
7 | import pandas as pd
8 | import pickle
9 | from multiprocessing import Pool
10 | import sys
11 | import os
12 |
13 | sys.path.append('../')
14 | from data_helpers import pad_X30
15 | from data_helpers import pad_X150
16 | from data_helpers import pad_X52
17 | from data_helpers import pad_X300
18 | from data_helpers import train_batch
19 | from data_helpers import eval_batch
20 |
21 | """ 把所有的数据按照 batch_size(128) 进行打包。取 10万 样本作为验证集。
22 | word_title_len = 30.
23 | word_content_len = 150.
24 | char_title_len = 52.
25 | char_content_len = 300.
26 | """
27 |
28 |
29 | wd_train_path = '../data/wd-data/data_train/'
30 | wd_valid_path = '../data/wd-data/data_valid/'
31 | wd_test_path = '../data/wd-data/data_test/'
32 | ch_train_path = '../data/ch-data/data_train/'
33 | ch_valid_path = '../data/ch-data/data_valid/'
34 | ch_test_path = '../data/ch-data/data_test/'
35 | paths = [wd_train_path, wd_valid_path, wd_test_path,
36 | ch_train_path, ch_valid_path, ch_test_path]
37 | for each in paths:
38 | if not os.path.exists(each):
39 | os.makedirs(each)
40 |
41 | with open('../data/sr_topic2id.pkl', 'rb') as inp:
42 | sr_topic2id = pickle.load(inp)
43 |
44 | dict_topic2id = dict()
45 | for i in range(len(sr_topic2id)):
46 | dict_topic2id[sr_topic2id.index[i]] = sr_topic2id.values[i]
47 |
48 |
49 | def topics2ids(topics):
50 | """把 chars 转为 对应的 id"""
51 | topics = topics.split(',')
52 | ids = list(map(lambda topic: dict_topic2id[topic], topics))  # look up the ids
53 | return ids
54 |
55 |
56 | def get_labels():
57 | """Get the labels of all training samples. Note that some samples without a title were dropped earlier during data processing."""
58 | df_question_topic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\t',
59 | names=['questions', 'topics'], dtype={'questions': object, 'topics': object})
60 | na_title_indexs = [328877, 422123, 633584, 768738, 818616, 876828, 1273673, 1527297,
61 | 1636237, 1682969, 2052477, 2628516, 2657464, 2904162, 2993517]
62 | df_question_topic = df_question_topic.drop(na_title_indexs)
63 | p = Pool()
64 | y = p.map(topics2ids, df_question_topic.topics.values)
65 | p.close()
66 | p.join()
67 | return np.asarray(y)
68 |
69 |
70 | # Pack the word-level data into batches
71 | def wd_train_get_batch(title_len=30, content_len=150, batch_size=128):
72 | print('loading word train_title and train_content.')
73 | train_title = np.load('../data/wd_train_title.npy')
74 | train_content = np.load('../data/wd_train_content.npy')
75 | p = Pool()
76 | X_title = np.asarray(p.map(pad_X30, train_title))
77 | X_content = np.asarray(p.map(pad_X150, train_content))
78 | p.close()
79 | p.join()
80 | X = np.hstack([X_title, X_content])
81 | print('getting labels, this may take a few minutes, please wait.')
82 | y = get_labels()
83 | print('y.shape=', y.shape)
84 | np.save('../data/y_tr.npy', y)
85 | # Split out the validation set
86 | sample_num = X.shape[0]
87 | np.random.seed(13)
88 | valid_num = 100000
89 | new_index = np.random.permutation(sample_num)
90 | X = X[new_index]
91 | y = y[new_index]
92 | X_valid = X[:valid_num]
93 | y_valid = y[:valid_num]
94 | X_train = X[valid_num:]
95 | y_train = y[valid_num:]
96 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
97 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
98 | print('creating batch data.')
99 | # Batch the validation set
100 | sample_num = len(X_valid)
101 | print('valid_sample_num=%d' % sample_num)
102 | train_batch(X_valid, y_valid, wd_valid_path, batch_size)
103 | # Batch the training set
104 | sample_num = len(X_train)
105 | print('train_sample_num=%d' % sample_num)
106 | train_batch(X_train, y_train, wd_train_path, batch_size)
107 |
108 |
109 | def wd_test_get_batch(title_len=30, content_len=150, batch_size=128):
110 | eval_title = np.load('../data/wd_eval_title.npy')
111 | eval_content = np.load('../data/wd_eval_content.npy')
112 | p = Pool()
113 | X_title = np.asarray(p.map(pad_X30, eval_title))
114 | X_content = np.asarray(p.map(pad_X150, eval_content))
115 | p.close()
116 | p.join()
117 | X = np.hstack([X_title, X_content])
118 | sample_num = len(X)
119 | print('eval_sample_num=%d' % sample_num)
120 | eval_batch(X, wd_test_path, batch_size)
121 |
122 |
123 | # Pack the char-level data into batches
124 | def ch_train_get_batch(title_len=52, content_len=300, batch_size=128):
125 | print('loading char train_title and train_content.')
126 | train_title = np.load('../data/ch_train_title.npy')
127 | train_content = np.load('../data/ch_train_content.npy')
128 | p = Pool()
129 | X_title = np.asarray(p.map(pad_X52, train_title))
130 | X_content = np.asarray(p.map(pad_X300, train_content))
131 | p.close()
132 | p.join()
133 | X = np.hstack([X_title, X_content])
134 | y = np.load('../data/y_tr.npy')
135 | # Split out the validation set
136 | sample_num = X.shape[0]
137 | np.random.seed(13)
138 | valid_num = 100000
139 | new_index = np.random.permutation(sample_num)
140 | X = X[new_index]
141 | y = y[new_index]
142 | X_valid = X[:valid_num]
143 | y_valid = y[:valid_num]
144 | X_train = X[valid_num:]
145 | y_train = y[valid_num:]
146 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape)
147 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape)
148 | # Batch the validation set
149 | print('creating batch data.')
150 | sample_num = len(X_valid)
151 | print('valid_sample_num=%d' % sample_num)
152 | train_batch(X_valid, y_valid, ch_valid_path, batch_size)
153 | # Batch the training set
154 | sample_num = len(X_train)
155 | print('train_sample_num=%d' % sample_num)
156 | train_batch(X_train, y_train, ch_train_path, batch_size)
157 |
158 |
159 | def ch_test_get_batch(title_len=52, content_len=300, batch_size=128):
160 | eval_title = np.load('../data/ch_eval_title.npy')
161 | eval_content = np.load('../data/ch_eval_content.npy')
162 | p = Pool()
163 | X_title = np.asarray(p.map(pad_X52, eval_title))
164 | X_content = np.asarray(p.map(pad_X300, eval_content))
165 | p.close()
166 | p.join()
167 | X = np.hstack([X_title, X_content])
168 | sample_num = len(X)
169 | print('eval_sample_num=%d' % sample_num)
170 | eval_batch(X, ch_test_path, batch_size)
171 |
172 |
173 | if __name__ == '__main__':
174 | wd_train_get_batch()
175 | wd_test_get_batch()
176 | ch_train_get_batch()
177 | ch_test_get_batch()
178 |
--------------------------------------------------------------------------------
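sr_topic2id is a pandas Series mapping each topic string to an integer id (it is produced by question_and_topic_2id.py, which is not shown in this section). The loop above merely converts the Series into a plain dict so that topics2ids can split the comma-separated topic field of each question and look every topic up. A toy example of the same mapping, with made-up topic names:

import pandas as pd

# Hypothetical miniature of sr_topic2id: topic string -> integer id.
sr_topic2id = pd.Series([0, 1, 2], index=['t_a', 't_b', 't_c'])
dict_topic2id = dict(zip(sr_topic2id.index, sr_topic2id.values))

def topics2ids(topics):
    return [dict_topic2id[t] for t in topics.split(',')]

print(topics2ids('t_a,t_c'))  # [0, 2]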
/zhihu-text-classification-master/models/wd_1_1_cnn_concat/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 |
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 |
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3')
23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # Full training settings
26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
28 | flags.DEFINE_float('last_f1', 0.40, 'if valid_f1 > last_f1, save new model. default: 0.40')
29 |
30 | # Quick-test settings
31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
34 | FLAGS = flags.FLAGS
35 |
36 | lr = FLAGS.lr
37 | last_f1 = FLAGS.last_f1
38 | settings = network.Settings()
39 | title_len = settings.title_len
40 | summary_path = settings.summary_path
41 | ckpt_path = settings.ckpt_path
42 | model_path = ckpt_path + 'model.ckpt'
43 |
44 | embedding_path = '../../data/word_embedding.npy'
45 | data_train_path = '../../data/wd-data/data_train/'
46 | data_valid_path = '../../data/wd-data/data_valid/'
47 | tr_batches = os.listdir(data_train_path)  # list of batch file names
48 | va_batches = os.listdir(data_valid_path)
49 | n_tr_batches = len(tr_batches)
50 | n_va_batches = len(va_batches)
51 |
52 | # Quick-test settings
53 | # n_tr_batches = 1000
54 | # n_va_batches = 50
55 |
56 |
57 | def get_batch(data_path, batch_id):
58 | """get a batch from data_path"""
59 | new_batch = np.load(data_path + str(batch_id) + '.npz')
60 | X_batch = new_batch['X']
61 | y_batch = new_batch['y']
62 | X1_batch = X_batch[:, :title_len]
63 | X2_batch = X_batch[:, title_len:]
64 | return [X1_batch, X2_batch, y_batch]
65 |
66 |
67 | def valid_epoch(data_path, sess, model):
68 | """Test on the valid data."""
69 | va_batches = os.listdir(data_path)
70 | n_va_batches = len(va_batches)
71 | _costs = 0.0
72 | predict_labels_list = list()  # all predicted labels
73 | marked_labels_list = list()
74 | for i in range(n_va_batches):
75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
76 | marked_labels_list.extend(y_batch)
77 | y_batch = to_categorical(y_batch)
78 | _batch_size = len(y_batch)
79 | fetches = [model.loss, model.y_pred]
80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
82 | _cost, predict_labels = sess.run(fetches, feed_dict)
83 | _costs += _cost
84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 highest scores
85 | predict_labels_list.extend(predict_labels)
86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
88 | mean_cost = _costs / n_va_batches
89 | return mean_cost, precision, recall, f1
90 |
91 |
92 | def train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
93 | global last_f1
94 | global lr
95 | time0 = time.time()
96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data
97 | for batch in tqdm(range(n_tr_batches)):
98 | global_step = sess.run(model.global_step)
99 | if 0 == (global_step + 1) % FLAGS.valid_step:
100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
102 | global_step, valid_cost, precision, recall, f1, time.time() - time0))
103 | time0 = time.time()
104 | if f1 > last_f1:
105 | last_f1 = f1
106 | saving_path = model.saver.save(sess, model_path, global_step+1)
107 | print('saved new model to %s ' % saving_path)
108 | # training
109 | batch_id = batch_indexs[batch]
110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
111 | y_batch = to_categorical(y_batch)
112 | _batch_size = len(y_batch)
113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch
116 | # valid per 500 steps
117 | if 0 == (global_step + 1) % 500:
118 | train_writer.add_summary(summary, global_step)
119 | batch_id = np.random.randint(0, n_va_batches)  # randomly pick a validation batch
120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
121 | y_batch = to_categorical(y_batch)
122 | _batch_size = len(y_batch)
123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
125 | summary, _cost = sess.run(valid_fetches, feed_dict)
126 | test_writer.add_summary(summary, global_step)
127 |
128 |
129 | def main(_):
130 | global ckpt_path
131 | global last_f1
132 | if not os.path.exists(ckpt_path):
133 | os.makedirs(ckpt_path)
134 | if not os.path.exists(summary_path):
135 | os.makedirs(summary_path)
136 | elif not FLAGS.is_retrain:  # retraining the model from scratch, delete the old summaries
137 | shutil.rmtree(summary_path)
138 | os.makedirs(summary_path)
139 | if not os.path.exists(summary_path):
140 | os.makedirs(summary_path)
141 |
142 | print('1.Loading data...')
143 | W_embedding = np.load(embedding_path)
144 | print('training batch_num = %d' % n_tr_batches)
145 | print('valid batch_num = %d' % n_va_batches)
146 |
147 | # Initial or restore the model
148 | print('2.Building model...')
149 | config = tf.ConfigProto()
150 | config.gpu_options.allow_growth = True
151 | with tf.Session(config=config) as sess:
152 | model = network.TextCNN(W_embedding, settings)
153 | with tf.variable_scope('training_ops') as vs:
154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
155 | FLAGS.decay_rate, staircase=True)
156 | # two optimizer: op1, update embedding; op2, do not update embedding.
157 | with tf.variable_scope('Optimizer1'):
158 | tvars1 = tf.trainable_variables()
159 | train_op1 = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(model.loss, global_step=model.global_step, var_list=tvars1)
160 |
161 | with tf.variable_scope('Optimizer2'):
162 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
163 | train_op2 = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(model.loss, global_step=model.global_step, var_list=tvars2)
164 |
165 | update_op = tf.group(*model.update_emas)
166 | merged = tf.summary.merge_all() # summary
167 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
168 | test_writer = tf.summary.FileWriter(summary_path + 'test')
169 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
170 |
171 | # If a model has been saved before, restore the latest checkpoint
172 | if os.path.exists(ckpt_path + "checkpoint"):
173 | print("Restoring Variables from Checkpoint...")
174 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
175 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
176 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
177 | sess.run(tf.variables_initializer(training_ops))
178 | else:
179 | print('Initializing Variables...')
180 | sess.run(tf.global_variables_initializer())
181 |
182 | print('3.Begin training...')
183 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
184 | for epoch in range(FLAGS.max_max_epoch):
185 | global_step = sess.run(model.global_step)
186 | print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
187 | if epoch == FLAGS.max_epoch: # update the embedding
188 | train_op = train_op1
189 | else:
190 | train_op = train_op2
191 |
192 | train_fetches = [merged, model.loss, train_op, update_op]
193 | valid_fetches = [merged, model.loss]
194 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
195 | # Run one final validation
196 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
197 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
198 | sess.run(model.global_step), valid_cost, precision, recall, f1))
199 | if f1 > last_f1: # save the better model
200 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
201 | print('saved new model to %s ' % saving_path)
202 |
203 |
204 | if __name__ == '__main__':
205 | tf.app.run()
206 |
--------------------------------------------------------------------------------
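The only slightly cryptic line in valid_epoch is label.argsort()[-1:-6:-1]: argsort returns indices in ascending order of score, and the reversed slice of its last five entries therefore yields the indices of the five highest-scoring classes, which is exactly what score_eval compares with the marked labels. A quick numpy check of that slicing:

import numpy as np

scores = np.array([0.1, 0.9, 0.3, 0.7, 0.2, 0.8])
top5 = scores.argsort()[-1:-6:-1]  # indices sorted by descending score
print(top5)  # [1 5 3 2 4]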
/zhihu-text-classification-master/models/wd_1_1_cnn_concat/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import tensorflow as tf
4 |
5 | """wd_1_1_cnn_concat
6 | title 部分使用 TextCNN;content 部分使用 TextCNN; 两部分输出直接 concat。
7 | """
8 |
9 |
10 | class Settings(object):
11 | def __init__(self):
12 | self.model_name = 'wd_1_1_cnn_concat'
13 | self.title_len = 30
14 | self.content_len = 150
15 | self.filter_sizes = [2, 3, 4, 5, 7]
16 | self.n_filter = 256
17 | self.fc_hidden_size = 1024
18 | self.n_class = 1999
19 | self.summary_path = '../../summary/' + self.model_name + '/'
20 | self.ckpt_path = '../../ckpt/' + self.model_name + '/'
21 |
22 |
23 | class TextCNN(object):
24 | """
25 | title: inputs->textcnn->output_title
26 | content: inputs->textcnn->output_content
27 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
28 | """
29 |
30 | def __init__(self, W_embedding, settings):
31 | self.model_name = settings.model_name
32 | self.title_len = settings.title_len
33 | self.content_len = settings.content_len
34 | self.filter_sizes = settings.filter_sizes
35 | self.n_filter = settings.n_filter
36 | self.n_filter_total = self.n_filter * len(self.filter_sizes)
37 | self.n_class = settings.n_class
38 | self.fc_hidden_size = settings.fc_hidden_size
39 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
40 | self.update_emas = list()
41 | # placeholders
42 | self._tst = tf.placeholder(tf.bool)
43 | self._keep_prob = tf.placeholder(tf.float32, [])
44 | self._batch_size = tf.placeholder(tf.int32, [])
45 |
46 | with tf.name_scope('Inputs'):
47 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
48 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
49 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')
50 |
51 | with tf.variable_scope('embedding'):
52 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
53 | initializer=tf.constant_initializer(W_embedding), trainable=True)
54 | self.embedding_size = W_embedding.shape[1]
55 |
56 | with tf.variable_scope('cnn_text'):
57 | output_title = self.cnn_inference(self._X1_inputs, self.title_len)
58 |
59 | with tf.variable_scope('hcnn_content'):
60 | output_content = self.cnn_inference(self._X2_inputs, self.content_len)
61 |
62 | with tf.variable_scope('fc-bn-layer'):
63 | output = tf.concat([output_title, output_content], axis=1)
64 | W_fc = self.weight_variable([self.n_filter_total * 2, self.fc_hidden_size], name='Weight_fc')
65 | tf.summary.histogram('W_fc', W_fc)
66 | h_fc = tf.matmul(output, W_fc, name='h_fc')
67 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
68 | tf.summary.histogram('beta_fc', beta_fc)
69 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
70 | self.update_emas.append(update_ema_fc)
71 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
72 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)
73 |
74 | with tf.variable_scope('out_layer'):
75 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
76 | tf.summary.histogram('Weight_out', W_out)
77 | b_out = self.bias_variable([self.n_class], name='bias_out')
78 | tf.summary.histogram('bias_out', b_out)
79 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # scores for each class
80 |
81 | with tf.name_scope('loss'):
82 | self._loss = tf.reduce_mean(
83 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
84 | tf.summary.scalar('loss', self._loss)
85 |
86 | self.saver = tf.train.Saver(max_to_keep=2)
87 |
88 | @property
89 | def tst(self):
90 | return self._tst
91 |
92 | @property
93 | def keep_prob(self):
94 | return self._keep_prob
95 |
96 | @property
97 | def batch_size(self):
98 | return self._batch_size
99 |
100 | @property
101 | def global_step(self):
102 | return self._global_step
103 |
104 | @property
105 | def X1_inputs(self):
106 | return self._X1_inputs
107 |
108 | @property
109 | def X2_inputs(self):
110 | return self._X2_inputs
111 |
112 | @property
113 | def y_inputs(self):
114 | return self._y_inputs
115 |
116 | @property
117 | def y_pred(self):
118 | return self._y_pred
119 |
120 | @property
121 | def loss(self):
122 | return self._loss
123 |
124 | def weight_variable(self, shape, name):
125 | """Create a weight variable with appropriate initialization."""
126 | initial = tf.truncated_normal(shape, stddev=0.1)
127 | return tf.Variable(initial, name=name)
128 |
129 | def bias_variable(self, shape, name):
130 | """Create a bias variable with appropriate initialization."""
131 | initial = tf.constant(0.1, shape=shape)
132 | return tf.Variable(initial, name=name)
133 |
134 | def batchnorm(self, Ylogits, offset, convolutional=False):
135 | """batchnormalization.
136 | Args:
137 | Ylogits: 1D向量或者是3D的卷积结果。
138 | num_updates: 迭代的global_step
139 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。
140 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。
141 | m: 表示batch均值;v:表示batch方差。
142 | bnepsilon:一个很小的浮点数,防止除以 0.
143 | Returns:
144 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。
145 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。
146 | """
147 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999,
148 | self._global_step)  # passing the step count prevents averaging over iterations that have not happened yet
149 | bnepsilon = 1e-5
150 | if convolutional:
151 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
152 | else:
153 | mean, variance = tf.nn.moments(Ylogits, [0])
154 | update_moving_averages = exp_moving_avg.apply([mean, variance])
155 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
156 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
157 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
158 | return Ybn, update_moving_averages
159 |
160 | def cnn_inference(self, X_inputs, n_step):
161 | """TextCNN 模型。
162 | Args:
163 | X_inputs: tensor.shape=(batch_size, n_step)
164 | Returns:
165 | title_outputs: tensor.shape=(batch_size, self.n_filter_total)
166 | """
167 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
168 | inputs = tf.expand_dims(inputs, -1)
169 | pooled_outputs = list()
170 | for i, filter_size in enumerate(self.filter_sizes):
171 | with tf.variable_scope("conv-maxpool-%s" % filter_size):
172 | # Convolution Layer
173 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter]
174 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter')
175 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter')
176 | tf.summary.histogram('beta', beta)
177 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
178 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)  # apply BN before the activation
179 | # Apply nonlinearity; batch norm scaling is not useful with ReLUs
180 | # and the batch norm offset is used instead of a bias
181 | h = tf.nn.relu(conv_bn, name="relu")
182 | # Maxpooling over the outputs
183 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
184 | strides=[1, 1, 1, 1], padding='VALID', name="pool")
185 | pooled_outputs.append(pooled)
186 | self.update_emas.append(update_ema)
187 | h_pool = tf.concat(pooled_outputs, 3)
188 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total])
189 | return h_pool_flat # shape = [batch_size, self.n_filter_total]
190 |
191 |
192 | # test the model
193 | # def test():
194 | # import numpy as np
195 | # print('Begin testing...')
196 | # settings = Settings()
197 | # W_embedding = np.random.randn(50, 10)
198 | # config = tf.ConfigProto()
199 | # config.gpu_options.allow_growth = True
200 | # batch_size = 128
201 | # with tf.Session(config=config) as sess:
202 | # model = TextCNN(W_embedding, settings)
203 | # optimizer = tf.train.AdamOptimizer(0.001)
204 | # train_op = optimizer.minimize(model.loss)
205 | # update_op = tf.group(*model.update_emas)
206 | # sess.run(tf.global_variables_initializer())
207 | # fetch = [model.loss, model.y_pred, train_op, update_op]
208 | # loss_list = list()
209 | # for i in xrange(100):
210 | # X1_batch = np.zeros((batch_size, 30), dtype=float)
211 | # X2_batch = np.zeros((batch_size, 150), dtype=float)
212 | # y_batch = np.zeros((batch_size, 1999), dtype=int)
213 | # _batch_size = len(y_batch)
214 | # feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
215 | # model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
216 | # loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
217 | # loss_list.append(loss)
218 | # print(i, loss)
219 | #
220 | # if __name__ == '__main__':
221 | # test()
222 |
--------------------------------------------------------------------------------
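A quick check of the tensor sizes implied by Settings: each TextCNN branch concatenates one max-pooled feature map per filter, so it outputs n_filter_total = 256 x 5 = 1280 features per example, and concatenating the title and content branches gives the 2560 inputs expected by the [n_filter_total * 2, fc_hidden_size] fully-connected weight. Plain arithmetic, no TensorFlow required:

filter_sizes = [2, 3, 4, 5, 7]
n_filter = 256
n_filter_total = n_filter * len(filter_sizes)  # 1280 features per branch
concat_dim = n_filter_total * 2                # 2560 after concatenating title + content
fc_hidden_size, n_class = 1024, 1999
print('branch: %d, concat: %d, fc: %d -> %d -> %d'
      % (n_filter_total, concat_dim, concat_dim, fc_hidden_size, n_class))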
/zhihu-text-classification-master/models/wd_1_2_cnn_max/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import tensorflow as tf
4 |
5 | """wd_1_2_cnn_max
6 | title 部分使用 TextCNN;content 部分使用 TextCNN; 两部分输出按位取 max。
7 | """
8 |
9 |
10 | class Settings(object):
11 | def __init__(self):
12 | self.model_name = 'wd_1_2_cnn_max'
13 | self.title_len = 30
14 | self.content_len = 150
15 | self.filter_sizes = [2, 3, 4, 5, 7]
16 | self.n_filter = 256
17 | self.fc_hidden_size = 1024
18 | self.n_class = 1999
19 | self.summary_path = '../../summary/' + self.model_name + '/'
20 | self.ckpt_path = '../../ckpt/' + self.model_name + '/'
21 |
22 |
23 | class TextCNN(object):
24 | """
25 | title: inputs->textcnn->output_title
26 | content: inputs->textcnn->output_content
27 | max[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
28 | """
29 |
30 | def __init__(self, W_embedding, settings):
31 | self.model_name = settings.model_name
32 | self.title_len = settings.title_len
33 | self.content_len = settings.content_len
34 | self.filter_sizes = settings.filter_sizes
35 | self.n_filter = settings.n_filter
36 | self.n_filter_total = self.n_filter * len(self.filter_sizes)
37 | self.n_class = settings.n_class
38 | self.fc_hidden_size = settings.fc_hidden_size
39 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
40 | self.update_emas = list()
41 | # placeholders
42 | self._tst = tf.placeholder(tf.bool)
43 | self._keep_prob = tf.placeholder(tf.float32, [])
44 | self._batch_size = tf.placeholder(tf.int32, [])
45 |
46 | with tf.name_scope('Inputs'):
47 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
48 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
49 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')
50 |
51 | with tf.variable_scope('embedding'):
52 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
53 | initializer=tf.constant_initializer(W_embedding), trainable=True)
54 | self.embedding_size = W_embedding.shape[1]
55 |
56 | with tf.variable_scope('cnn_text'):
57 | output_title = self.cnn_inference(self._X1_inputs, self.title_len)
58 | output_title = tf.expand_dims(output_title, 0)
59 |
60 | with tf.variable_scope('hcnn_content'):
61 | output_content = self.cnn_inference(self._X2_inputs, self.content_len)
62 | output_content = tf.expand_dims(output_content, 0)
63 |
64 | with tf.variable_scope('fc-bn-layer'):
65 | output = tf.concat([output_title, output_content], axis=0)
66 | output = tf.reduce_max(output, axis=0)
67 | W_fc = self.weight_variable([self.n_filter_total, self.fc_hidden_size], name='Weight_fc')
68 | tf.summary.histogram('W_fc', W_fc)
69 | h_fc = tf.matmul(output, W_fc, name='h_fc')
70 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
71 | tf.summary.histogram('beta_fc', beta_fc)
72 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
73 | self.update_emas.append(update_ema_fc)
74 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
75 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)
76 |
77 | with tf.variable_scope('out_layer'):
78 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
79 | tf.summary.histogram('Weight_out', W_out)
80 | b_out = self.bias_variable([self.n_class], name='bias_out')
81 | tf.summary.histogram('bias_out', b_out)
82 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # scores for each class
83 |
84 | with tf.name_scope('loss'):
85 | self._loss = tf.reduce_mean(
86 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
87 | tf.summary.scalar('loss', self._loss)
88 |
89 | self.saver = tf.train.Saver(max_to_keep=2)
90 |
91 | @property
92 | def tst(self):
93 | return self._tst
94 |
95 | @property
96 | def keep_prob(self):
97 | return self._keep_prob
98 |
99 | @property
100 | def batch_size(self):
101 | return self._batch_size
102 |
103 | @property
104 | def global_step(self):
105 | return self._global_step
106 |
107 | @property
108 | def X1_inputs(self):
109 | return self._X1_inputs
110 |
111 | @property
112 | def X2_inputs(self):
113 | return self._X2_inputs
114 |
115 | @property
116 | def y_inputs(self):
117 | return self._y_inputs
118 |
119 | @property
120 | def y_pred(self):
121 | return self._y_pred
122 |
123 | @property
124 | def loss(self):
125 | return self._loss
126 |
127 | def weight_variable(self, shape, name):
128 | """Create a weight variable with appropriate initialization."""
129 | initial = tf.truncated_normal(shape, stddev=0.1)
130 | return tf.Variable(initial, name=name)
131 |
132 | def bias_variable(self, shape, name):
133 | """Create a bias variable with appropriate initialization."""
134 | initial = tf.constant(0.1, shape=shape)
135 | return tf.Variable(initial, name=name)
136 |
137 | def batchnorm(self, Ylogits, offset, convolutional=False):
138 | """batchnormalization.
139 | Args:
140 | Ylogits: 1D向量或者是3D的卷积结果。
141 | num_updates: 迭代的global_step
142 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。
143 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。
144 | m: 表示batch均值;v:表示batch方差。
145 | bnepsilon:一个很小的浮点数,防止除以 0.
146 | Returns:
147 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。
148 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。
149 | """
150 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999,
151 | self._global_step)  # passing the step count prevents averaging over iterations that have not happened yet
152 | bnepsilon = 1e-5
153 | if convolutional:
154 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
155 | else:
156 | mean, variance = tf.nn.moments(Ylogits, [0])
157 | update_moving_averages = exp_moving_avg.apply([mean, variance])
158 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
159 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
160 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
161 | return Ybn, update_moving_averages
162 |
163 | def cnn_inference(self, X_inputs, n_step):
164 | """TextCNN 模型。
165 | Args:
166 | X_inputs: tensor.shape=(batch_size, n_step)
167 | Returns:
168 | title_outputs: tensor.shape=(batch_size, self.n_filter_total)
169 | """
170 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
171 | inputs = tf.expand_dims(inputs, -1)
172 | pooled_outputs = list()
173 | for i, filter_size in enumerate(self.filter_sizes):
174 | with tf.variable_scope("conv-maxpool-%s" % filter_size):
175 | # Convolution Layer
176 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter]
177 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter')
178 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter')
179 | tf.summary.histogram('beta', beta)
180 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
181 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)  # apply BN before the activation
182 | # Apply nonlinearity; batch norm scaling is not useful with ReLUs
183 | # and the batch norm offset is used instead of a bias
184 | h = tf.nn.relu(conv_bn, name="relu")
185 | # Maxpooling over the outputs
186 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
187 | strides=[1, 1, 1, 1], padding='VALID', name="pool")
188 | pooled_outputs.append(pooled)
189 | self.update_emas.append(update_ema)
190 | h_pool = tf.concat(pooled_outputs, 3)
191 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total])
192 | return h_pool_flat # shape = [batch_size, self.n_filter_total]
193 |
194 |
195 | # test the model
196 | # def test():
197 | # import numpy as np
198 | # print('Begin testing...')
199 | # settings = Settings()
200 | # W_embedding = np.random.randn(50, 10)
201 | # config = tf.ConfigProto()
202 | # config.gpu_options.allow_growth = True
203 | # batch_size = 128
204 | # with tf.Session(config=config) as sess:
205 | # model = TextCNN(W_embedding, settings)
206 | # optimizer = tf.train.AdamOptimizer(0.001)
207 | # train_op = optimizer.minimize(model.loss)
208 | # update_op = tf.group(*model.update_emas)
209 | # sess.run(tf.global_variables_initializer())
210 | # fetch = [model.loss, model.y_pred, train_op, update_op]
211 | # loss_list = list()
212 | # for i in xrange(100):
213 | # X1_batch = np.zeros((batch_size, 30), dtype=float)
214 | # X2_batch = np.zeros((batch_size, 150), dtype=float)
215 | # y_batch = np.zeros((batch_size, 1999), dtype=int)
216 | # _batch_size = len(y_batch)
217 | # feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
218 | # model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
219 | # loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
220 | # loss_list.append(loss)
221 | # print(i, loss)
222 | #
223 | # if __name__ == '__main__':
224 | # test()
225 |
--------------------------------------------------------------------------------
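The only structural difference from wd_1_1_cnn_concat is the merge step: instead of concatenating the two 1280-dimensional branch outputs, this model stacks them along a new leading axis and takes an element-wise maximum, so the fully-connected layer sees n_filter_total features rather than twice that (hence W_fc has shape [n_filter_total, fc_hidden_size]). The expand_dims/concat/reduce_max combination is equivalent to an element-wise maximum, as this small numpy sketch shows:

import numpy as np

output_title = np.array([[0.2, 0.9, 0.1]])    # (batch, n_filter_total)
output_content = np.array([[0.5, 0.3, 0.4]])

stacked = np.stack([output_title, output_content], axis=0)  # (2, batch, n_filter_total)
merged = stacked.max(axis=0)                                # reduce over the two branches
assert np.allclose(merged, np.maximum(output_title, output_content))
print(merged)  # [[0.5 0.9 0.4]]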
/zhihu-text-classification-master/models/wd_4_han/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 |
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 |
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 2, 'update the embedding after max_epoch, default: 2')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
23 | flags.DEFINE_float('decay_rate', 0.85, 'decay rate, default: 0.85')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # Full training settings
26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')
29 |
30 | # Quick-test settings
31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
34 | FLAGS = flags.FLAGS
35 |
36 | lr = FLAGS.lr
37 | last_f1 = FLAGS.last_f1
38 | settings = network.Settings()
39 | title_len = settings.title_len
40 | summary_path = settings.summary_path
41 | ckpt_path = settings.ckpt_path
42 | model_path = ckpt_path + 'model.ckpt'
43 |
44 | embedding_path = '../../data/word_embedding.npy'
45 | data_train_path = '../../data/wd-data/seg_train/'
46 | data_valid_path = '../../data/wd-data/seg_valid/'
47 | tr_batches = os.listdir(data_train_path)  # list of batch file names
48 | va_batches = os.listdir(data_valid_path)
49 | n_tr_batches = len(tr_batches)
50 | n_va_batches = len(va_batches)
51 |
52 | # Quick-test settings
53 | # n_tr_batches = 1000
54 | # n_va_batches = 50
55 |
56 |
57 | def get_batch(data_path, batch_id):
58 | """get a batch from data_path"""
59 | new_batch = np.load(data_path + str(batch_id) + '.npz')
60 | X_batch = new_batch['X']
61 | y_batch = new_batch['y']
62 | X1_batch = X_batch[:, :title_len]
63 | X2_batch = X_batch[:, title_len:]
64 | return [X1_batch, X2_batch, y_batch]
65 |
66 |
67 | def valid_epoch(data_path, sess, model):
68 | """Test on the valid data."""
69 | va_batches = os.listdir(data_path)
70 | n_va_batches = len(va_batches)
71 | _costs = 0.0
72 | predict_labels_list = list()  # all predicted labels
73 | marked_labels_list = list()
74 | for i in range(n_va_batches):
75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
76 | marked_labels_list.extend(y_batch)
77 | y_batch = to_categorical(y_batch)
78 | _batch_size = len(y_batch)
79 | fetches = [model.loss, model.y_pred]
80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
82 | _cost, predict_labels = sess.run(fetches, feed_dict)
83 | _costs += _cost
84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 highest scores
85 | predict_labels_list.extend(predict_labels)
86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
88 | mean_cost = _costs / n_va_batches
89 | return mean_cost, precision, recall, f1
90 |
91 |
92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
93 | global last_f1
94 | global lr
95 | time0 = time.time()
96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data
97 | for batch in tqdm(range(n_tr_batches)):
98 | global_step = sess.run(model.global_step)
99 | if 0 == (global_step + 1) % FLAGS.valid_step:
100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
102 | global_step, valid_cost, precision, recall, f1, time.time() - time0))
103 | time0 = time.time()
104 | if f1 > last_f1:
105 | last_f1 = f1
106 | saving_path = model.saver.save(sess, model_path, global_step+1)
107 | print('saved new model to %s ' % saving_path)
108 | # training
109 | batch_id = batch_indexs[batch]
110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
111 | y_batch = to_categorical(y_batch)
112 | _batch_size = len(y_batch)
113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch
116 | # valid per 500 steps
117 | if 0 == (global_step + 1) % 500:
118 | train_writer.add_summary(summary, global_step)
119 | batch_id = np.random.randint(0, n_va_batches)  # randomly pick a validation batch
120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
121 | y_batch = to_categorical(y_batch)
122 | _batch_size = len(y_batch)
123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
125 | summary, _cost = sess.run(valid_fetches, feed_dict)
126 | test_writer.add_summary(summary, global_step)
127 |
128 |
129 | def main(_):
130 | global ckpt_path
131 | global last_f1
132 | if not os.path.exists(ckpt_path):
133 | os.makedirs(ckpt_path)
134 | if not os.path.exists(summary_path):
135 | os.makedirs(summary_path)
136 | elif not FLAGS.is_retrain:  # retraining the model from scratch, delete the old summaries
137 | shutil.rmtree(summary_path)
138 | os.makedirs(summary_path)
139 | if not os.path.exists(summary_path):
140 | os.makedirs(summary_path)
141 |
142 | print('1.Loading data...')
143 | W_embedding = np.load(embedding_path)
144 | print('training batch_num = %d' % n_tr_batches)
145 | print('valid batch_num = %d' % n_va_batches)
146 |
147 | # Initial or restore the model
148 | print('2.Building model...')
149 | config = tf.ConfigProto()
150 | config.gpu_options.allow_growth = True
151 | with tf.Session(config=config) as sess:
152 | model = network.HAN(W_embedding, settings)
153 | with tf.variable_scope('training_ops') as vs:
154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
155 | FLAGS.decay_rate, staircase=True)
156 | # two optimizer: op1, update embedding; op2, do not update embedding.
157 | with tf.variable_scope('Optimizer1'):
158 | tvars1 = tf.trainable_variables()
159 | grads1 = tf.gradients(model.loss, tvars1)
160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
162 | global_step=model.global_step)
163 | with tf.variable_scope('Optimizer2'):
164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
165 | grads2 = tf.gradients(model.loss, tvars2)
166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
168 | global_step=model.global_step)
169 | update_op = tf.group(*model.update_emas)
170 | merged = tf.summary.merge_all() # summary
171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
172 | test_writer = tf.summary.FileWriter(summary_path + 'test')
173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
174 |
175 | # If a model has been saved before, restore the latest checkpoint
176 | if os.path.exists(ckpt_path + "checkpoint"):
177 | print("Restoring Variables from Checkpoint...")
178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
181 | sess.run(tf.variables_initializer(training_ops))
182 | train_op2 = train_op1
183 | else:
184 | print('Initializing Variables...')
185 | sess.run(tf.global_variables_initializer())
186 |
187 | print('3.Begin training...')
188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
189 | train_op = train_op2
190 | for epoch in range(FLAGS.max_max_epoch):
191 | global_step = sess.run(model.global_step)
192 | print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
193 | if epoch == FLAGS.max_epoch: # update the embedding
194 | train_op = train_op1
195 | train_fetches = [merged, model.loss, train_op, update_op]
196 | valid_fetches = [merged, model.loss]
197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
198 | # Run one final validation
199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
201 | sess.run(model.global_step), valid_cost, precision, recall, f1))
202 | if f1 > last_f1: # save the better model
203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
204 | print('saved new model to %s ' % saving_path)
205 |
206 |
207 | if __name__ == '__main__':
208 | tf.app.run()
209 |
--------------------------------------------------------------------------------
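to_categorical is imported from data_helpers.py, which is not included here. Given how it is used — y_batch is a list of variable-length topic-id lists, and the networks feed y_inputs (a float [batch, 1999] tensor) into sigmoid cross-entropy — it presumably builds a multi-hot label matrix. A hedged sketch of that conversion (the real helper may differ in details):

import numpy as np

def to_categorical_sketch(y_batch, n_class=1999):
    # Assumed behaviour: turn lists of topic ids into a multi-hot float matrix.
    out = np.zeros((len(y_batch), n_class), dtype=np.float32)
    for row, labels in enumerate(y_batch):
        out[row, list(labels)] = 1.0
    return out

print(to_categorical_sketch([[3, 17], [1998]]).sum(axis=1))  # [2. 1.]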
/zhihu-text-classification-master/models/wd_2_hcnn/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 |
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 |
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6')
22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3')
23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # Full training settings
26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')
29 |
30 | # Quick-test settings
31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
34 | FLAGS = flags.FLAGS
35 |
36 | lr = FLAGS.lr
37 | last_f1 = FLAGS.last_f1
38 | settings = network.Settings()
39 | title_len = settings.title_len
40 | summary_path = settings.summary_path
41 | ckpt_path = settings.ckpt_path
42 | model_path = ckpt_path + 'model.ckpt'
43 |
44 | embedding_path = '../../data/word_embedding.npy'
45 | data_train_path = '../../data/wd-data/seg_train/'
46 | data_valid_path = '../../data/wd-data/seg_valid/'
47 | tr_batches = os.listdir(data_train_path)  # list of batch file names
48 | va_batches = os.listdir(data_valid_path)
49 | n_tr_batches = len(tr_batches)
50 | n_va_batches = len(va_batches)
51 |
52 | # Quick-test settings
53 | # n_tr_batches = 1000
54 | # n_va_batches = 50
55 |
56 |
57 | def get_batch(data_path, batch_id):
58 | """get a batch from data_path"""
59 | new_batch = np.load(data_path + str(batch_id) + '.npz')
60 | X_batch = new_batch['X']
61 | y_batch = new_batch['y']
62 | X1_batch = X_batch[:, :title_len]
63 | X2_batch = X_batch[:, title_len:]
64 | return [X1_batch, X2_batch, y_batch]
65 |
66 |
67 | def valid_epoch(data_path, sess, model):
68 | """Test on the valid data."""
69 | va_batches = os.listdir(data_path)
70 | n_va_batches = len(va_batches)
71 | _costs = 0.0
72 | predict_labels_list = list()  # all predicted labels
73 | marked_labels_list = list()
74 | for i in range(n_va_batches):
75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
76 | marked_labels_list.extend(y_batch)
77 | y_batch = to_categorical(y_batch)
78 | _batch_size = len(y_batch)
79 | fetches = [model.loss, model.y_pred]
80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
82 | _cost, predict_labels = sess.run(fetches, feed_dict)
83 | _costs += _cost
84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels))  # indices of the 5 highest scores
85 | predict_labels_list.extend(predict_labels)
86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
88 | mean_cost = _costs / n_va_batches
89 | return mean_cost, precision, recall, f1
90 |
91 |
92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
93 | global last_f1
94 | global lr
95 | time0 = time.time()
96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data
97 | for batch in tqdm(range(n_tr_batches)):
98 | global_step = sess.run(model.global_step)
99 | if 0 == (global_step + 1) % FLAGS.valid_step:
100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
102 | global_step, valid_cost, precision, recall, f1, time.time() - time0))
103 | time0 = time.time()
104 | if f1 > last_f1:
105 | last_f1 = f1
106 | saving_path = model.saver.save(sess, model_path, global_step+1)
107 | print('saved new model to %s ' % saving_path)
108 | # training
109 | batch_id = batch_indexs[batch]
110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
111 | y_batch = to_categorical(y_batch)
112 | _batch_size = len(y_batch)
113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch
116 | # valid per 500 steps
117 | if 0 == (global_step + 1) % 500:
118 | train_writer.add_summary(summary, global_step)
119 | batch_id = np.random.randint(0, n_va_batches)  # randomly pick a validation batch
120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
121 | y_batch = to_categorical(y_batch)
122 | _batch_size = len(y_batch)
123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
125 | summary, _cost = sess.run(valid_fetches, feed_dict)
126 | test_writer.add_summary(summary, global_step)
127 |
128 |
129 | def main(_):
130 | global ckpt_path
131 | global last_f1
132 | if not os.path.exists(ckpt_path):
133 | os.makedirs(ckpt_path)
134 | if not os.path.exists(summary_path):
135 | os.makedirs(summary_path)
136 | elif not FLAGS.is_retrain:  # retraining the model from scratch, delete the old summaries
137 | shutil.rmtree(summary_path)
138 | os.makedirs(summary_path)
139 | if not os.path.exists(summary_path):
140 | os.makedirs(summary_path)
141 |
142 | print('1.Loading data...')
143 | W_embedding = np.load(embedding_path)
144 | print('training batch_num = %d' % n_tr_batches)
145 | print('valid batch_num = %d' % n_va_batches)
146 |
147 | # Initial or restore the model
148 | print('2.Building model...')
149 | config = tf.ConfigProto()
150 | config.gpu_options.allow_growth = True
151 | with tf.Session(config=config) as sess:
152 | model = network.HCNN(W_embedding, settings)
153 | with tf.variable_scope('training_ops') as vs:
154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
155 | FLAGS.decay_rate, staircase=True)
156 | # two optimizer: op1, update embedding; op2, do not update embedding.
157 | with tf.variable_scope('Optimizer1'):
158 | tvars1 = tf.trainable_variables()
159 | grads1 = tf.gradients(model.loss, tvars1)
160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
162 | global_step=model.global_step)
163 | with tf.variable_scope('Optimizer2'):
164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
165 | grads2 = tf.gradients(model.loss, tvars2)
166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
168 | global_step=model.global_step)
169 | update_op = tf.group(*model.update_emas)
170 | merged = tf.summary.merge_all() # summary
171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
172 | test_writer = tf.summary.FileWriter(summary_path + 'test')
173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
174 |
175 | # If a model has been saved before, restore the latest checkpoint
176 | if os.path.exists(ckpt_path + "checkpoint"):
177 | print("Restoring Variables from Checkpoint...")
178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
181 | sess.run(tf.variables_initializer(training_ops))
182 | train_op2 = train_op1
183 | else:
184 | print('Initializing Variables...')
185 | sess.run(tf.global_variables_initializer())
186 |
187 | print('3.Begin training...')
188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
189 | train_op = train_op2
190 | for epoch in range(FLAGS.max_max_epoch):
191 | global_step = sess.run(model.global_step)
192 | print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
193 | if epoch == FLAGS.max_epoch: # update the embedding
194 | train_op = train_op1
195 | train_fetches = [merged, model.loss, train_op, update_op]
196 | valid_fetches = [merged, model.loss]
197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
198 | # Run one final validation
199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
201 | sess.run(model.global_step), valid_cost, precision, recall, f1))
202 | if f1 > last_f1: # save the better model
203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
204 | print('saved new model to %s ' % saving_path)
205 |
206 |
207 | if __name__ == '__main__':
208 | tf.app.run()
209 |
--------------------------------------------------------------------------------
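A note on the top-5 extraction used in valid_epoch in wd_2_hcnn/train.py above: label.argsort()[-1:-6:-1] returns the indices of the five largest scores in descending order, which is the format the competition scorer expects. A minimal numpy sketch (the score values are made up for illustration):

    import numpy as np

    scores = np.array([0.1, 0.9, 0.3, 0.7, 0.2, 0.8, 0.05])  # per-class scores for one sample
    top5 = scores.argsort()[-1:-6:-1]                         # argsort is ascending, so slice backwards
    print(top5)                                               # [1 5 3 2 4]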
/zhihu-text-classification-master/models/wd_6_rcnn/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 |
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 |
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'total training epochs, default: 6')
22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
23 | flags.DEFINE_float('decay_rate', 0.75, 'decay rate, default: 0.75')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # full-run settings
26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')
29 |
30 | # quick-test settings
31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
34 | FLAGS = flags.FLAGS
35 |
36 | lr = FLAGS.lr
37 | last_f1 = FLAGS.last_f1
38 | settings = network.Settings()
39 | title_len = settings.title_len
40 | summary_path = settings.summary_path
41 | ckpt_path = settings.ckpt_path
42 | model_path = ckpt_path + 'model.ckpt'
43 |
44 | embedding_path = '../../data/word_embedding.npy'
45 | data_train_path = '../../data/wd-data/data_train/'
46 | data_valid_path = '../../data/wd-data/data_valid/'
47 | tr_batches = os.listdir(data_train_path) # list of batch file names
48 | va_batches = os.listdir(data_valid_path)
49 | n_tr_batches = len(tr_batches)
50 | n_va_batches = len(va_batches)
51 |
52 | # quick test: cap the number of batches
53 | # n_tr_batches = 1000
54 | # n_va_batches = 50
55 |
56 |
57 | def get_batch(data_path, batch_id):
58 | """get a batch from data_path"""
59 | new_batch = np.load(data_path + str(batch_id) + '.npz')
60 | X_batch = new_batch['X']
61 | y_batch = new_batch['y']
62 | X1_batch = X_batch[:, :title_len]
63 | X2_batch = X_batch[:, title_len:]
64 | return [X1_batch, X2_batch, y_batch]
65 |
66 |
67 | def valid_epoch(data_path, sess, model):
68 | """Test on the valid data."""
69 | va_batches = os.listdir(data_path)
70 | n_va_batches = len(va_batches)
71 | _costs = 0.0
72 | predict_labels_list = list() # all predicted labels
73 | marked_labels_list = list()
74 | for i in range(n_va_batches):
75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
76 | marked_labels_list.extend(y_batch)
77 | y_batch = to_categorical(y_batch)
78 | _batch_size = len(y_batch)
79 | fetches = [model.loss, model.y_pred]
80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
82 | _cost, predict_labels = sess.run(fetches, feed_dict)
83 | _costs += _cost
84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # indices of the 5 highest-scoring classes
85 | predict_labels_list.extend(predict_labels)
86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
88 | mean_cost = _costs / n_va_batches
89 | return mean_cost, precision, recall, f1
90 |
91 |
92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
93 | global last_f1
94 | global lr
95 | time0 = time.time()
96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data
97 | for batch in tqdm(range(n_tr_batches)):
98 | global_step = sess.run(model.global_step)
99 | if 0 == (global_step + 1) % FLAGS.valid_step:
100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
102 | global_step, valid_cost, precision, recall, f1, time.time() - time0))
103 | time0 = time.time()
104 | if f1 > last_f1:
105 | last_f1 = f1
106 | saving_path = model.saver.save(sess, model_path, global_step+1)
107 | print('saved new model to %s ' % saving_path)
108 | # training
109 | batch_id = batch_indexs[batch]
110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
111 | y_batch = to_categorical(y_batch)
112 | _batch_size = len(y_batch)
113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch
116 | # valid per 500 steps
117 | if 0 == (global_step + 1) % 500:
118 | train_writer.add_summary(summary, global_step)
119 | batch_id = np.random.randint(0, n_va_batches) # pick one validation batch at random
120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
121 | y_batch = to_categorical(y_batch)
122 | _batch_size = len(y_batch)
123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
125 | summary, _cost = sess.run(valid_fetches, feed_dict)
126 | test_writer.add_summary(summary, global_step)
127 |
128 |
129 | def main(_):
130 | global ckpt_path
131 | global last_f1
132 | if not os.path.exists(ckpt_path):
133 | os.makedirs(ckpt_path)
134 | if not os.path.exists(summary_path):
135 | os.makedirs(summary_path)
136 | elif not FLAGS.is_retrain: # retraining this model from scratch, so delete the old summaries
137 | shutil.rmtree(summary_path)
138 | os.makedirs(summary_path)
139 | if not os.path.exists(summary_path):
140 | os.makedirs(summary_path)
141 |
142 | print('1.Loading data...')
143 | W_embedding = np.load(embedding_path)
144 | print('training batch_num = %d' % n_tr_batches)
145 | print('valid batch_num = %d' % n_va_batches)
146 |
147 | # Initialize the model or restore it from a checkpoint
148 | print('2.Building model...')
149 | config = tf.ConfigProto()
150 | config.gpu_options.allow_growth = True
151 | with tf.Session(config=config) as sess:
152 | model = network.RCNN(W_embedding, settings)
153 | with tf.variable_scope('training_ops') as vs:
154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
155 | FLAGS.decay_rate, staircase=True)
156 | # two optimizers: op1 updates the embedding, op2 keeps it frozen.
157 | with tf.variable_scope('Optimizer1'):
158 | tvars1 = tf.trainable_variables()
159 | grads1 = tf.gradients(model.loss, tvars1)
160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
162 | global_step=model.global_step)
163 | with tf.variable_scope('Optimizer2'):
164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
165 | grads2 = tf.gradients(model.loss, tvars2)
166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
168 | global_step=model.global_step)
169 | update_op = tf.group(*model.update_emas)
170 | merged = tf.summary.merge_all() # summary
171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
172 | test_writer = tf.summary.FileWriter(summary_path + 'test')
173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
174 |
175 | # if a checkpoint exists, restore the previously saved model
176 | if os.path.exists(ckpt_path + "checkpoint"):
177 | print("Restoring Variables from Checkpoint...")
178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
181 | sess.run(tf.variables_initializer(training_ops))
182 | train_op2 = train_op1
183 | else:
184 | print('Initializing Variables...')
185 | sess.run(tf.global_variables_initializer())
186 |
187 | print('3.Begin training...')
188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
189 | train_op = train_op2
190 | for epoch in range(FLAGS.max_max_epoch):
191 | global_step = sess.run(model.global_step)
192 | print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
193 | if epoch == FLAGS.max_epoch: # update the embedding
194 | train_op = train_op1
195 | train_fetches = [merged, model.loss, train_op, update_op]
196 | valid_fetches = [merged, model.loss]
197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
198 | # run one final validation pass
199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
201 | sess.run(model.global_step), valid_cost, precision, recall, f1))
202 | if f1 > last_f1: # save the better model
203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
204 | print('saved new model to %s ' % saving_path)
205 |
206 |
207 | if __name__ == '__main__':
208 | tf.app.run()
209 |
--------------------------------------------------------------------------------
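The learning-rate schedule built with tf.train.exponential_decay(..., staircase=True) in the train.py above drops the rate in discrete steps rather than continuously. A plain-Python sketch of what it computes, using the defaults above (lr=8e-4, decay_step=15000, decay_rate=0.75):

    def staircase_lr(initial_lr, global_step, decay_step, decay_rate):
        # staircase=True means integer division, so the rate only drops every decay_step steps
        return initial_lr * decay_rate ** (global_step // decay_step)

    for step in (0, 14999, 15000, 45000):
        print(step, staircase_lr(8e-4, step, 15000, 0.75))
    # 0 0.0008, 14999 0.0008, 15000 0.0006, 45000 0.0003375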
/zhihu-text-classification-master/models/wd_3_bigru/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 |
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 |
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'total training epochs, default: 6')
22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
23 | flags.DEFINE_float('decay_rate', 0.85, 'decay rate, default: 0.85')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # full-run settings
26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
28 | flags.DEFINE_float('last_f1', 0.40, 'if valid_f1 > last_f1, save new model. default: 0.40')
29 |
30 | # quick-test settings
31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
34 | FLAGS = flags.FLAGS
35 |
36 | lr = FLAGS.lr
37 | last_f1 = FLAGS.last_f1
38 | settings = network.Settings()
39 | title_len = settings.title_len
40 | summary_path = settings.summary_path
41 | ckpt_path = settings.ckpt_path
42 | model_path = ckpt_path + 'model.ckpt'
43 |
44 | embedding_path = '../../data/word_embedding.npy'
45 | data_train_path = '../../data/wd-data/data_train/'
46 | data_valid_path = '../../data/wd-data/data_valid/'
47 | tr_batches = os.listdir(data_train_path) # list of batch file names
48 | va_batches = os.listdir(data_valid_path)
49 | n_tr_batches = len(tr_batches)
50 | n_va_batches = len(va_batches)
51 |
52 | # quick test: cap the number of batches
53 | # n_tr_batches = 1000
54 | # n_va_batches = 50
55 |
56 |
57 | def get_batch(data_path, batch_id):
58 | """get a batch from data_path"""
59 | new_batch = np.load(data_path + str(batch_id) + '.npz')
60 | X_batch = new_batch['X']
61 | y_batch = new_batch['y']
62 | X1_batch = X_batch[:, :title_len]
63 | X2_batch = X_batch[:, title_len:]
64 | return [X1_batch, X2_batch, y_batch]
65 |
66 |
67 | def valid_epoch(data_path, sess, model):
68 | """Test on the valid data."""
69 | va_batches = os.listdir(data_path)
70 | n_va_batches = len(va_batches)
71 | _costs = 0.0
72 | predict_labels_list = list() # all predicted labels
73 | marked_labels_list = list()
74 | for i in range(n_va_batches):
75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
76 | marked_labels_list.extend(y_batch)
77 | y_batch = to_categorical(y_batch)
78 | _batch_size = len(y_batch)
79 | fetches = [model.loss, model.y_pred]
80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
82 | _cost, predict_labels = sess.run(fetches, feed_dict)
83 | _costs += _cost
84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # indices of the 5 highest-scoring classes
85 | predict_labels_list.extend(predict_labels)
86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
88 | mean_cost = _costs / n_va_batches
89 | return mean_cost, precision, recall, f1
90 |
91 |
92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
93 | global last_f1
94 | global lr
95 | time0 = time.time()
96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data
97 | for batch in tqdm(range(n_tr_batches)):
98 | global_step = sess.run(model.global_step)
99 | if 0 == (global_step + 1) % FLAGS.valid_step:
100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
102 | global_step, valid_cost, precision, recall, f1, time.time() - time0))
103 | time0 = time.time()
104 | if f1 > last_f1:
105 | last_f1 = f1
106 | saving_path = model.saver.save(sess, model_path, global_step+1)
107 | print('saved new model to %s ' % saving_path)
108 | # training
109 | batch_id = batch_indexs[batch]
110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
111 | y_batch = to_categorical(y_batch)
112 | _batch_size = len(y_batch)
113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch
116 | # valid per 500 steps
117 | if 0 == (global_step + 1) % 500:
118 | train_writer.add_summary(summary, global_step)
119 | batch_id = np.random.randint(0, n_va_batches) # pick one validation batch at random
120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
121 | y_batch = to_categorical(y_batch)
122 | _batch_size = len(y_batch)
123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
125 | summary, _cost = sess.run(valid_fetches, feed_dict)
126 | test_writer.add_summary(summary, global_step)
127 |
128 |
129 | def main(_):
130 | global ckpt_path
131 | global last_f1
132 | if not os.path.exists(ckpt_path):
133 | os.makedirs(ckpt_path)
134 | if not os.path.exists(summary_path):
135 | os.makedirs(summary_path)
136 | elif not FLAGS.is_retrain: # retraining this model from scratch, so delete the old summaries
137 | shutil.rmtree(summary_path)
138 | os.makedirs(summary_path)
139 | if not os.path.exists(summary_path):
140 | os.makedirs(summary_path)
141 |
142 | print('1.Loading data...')
143 | W_embedding = np.load(embedding_path)
144 | print('training batch_num = %d' % n_tr_batches)
145 | print('valid batch_num = %d' % n_va_batches)
146 |
147 | # Initialize the model or restore it from a checkpoint
148 | print('2.Building model...')
149 | config = tf.ConfigProto()
150 | config.gpu_options.allow_growth = True
151 | with tf.Session(config=config) as sess:
152 | model = network.BiGRU(W_embedding, settings)
153 | with tf.variable_scope('training_ops') as vs:
154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
155 | FLAGS.decay_rate, staircase=True)
156 | # two optimizers: op1 updates the embedding, op2 keeps it frozen.
157 | with tf.variable_scope('Optimizer1'):
158 | tvars1 = tf.trainable_variables()
159 | grads1 = tf.gradients(model.loss, tvars1)
160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
162 | global_step=model.global_step)
163 | with tf.variable_scope('Optimizer2'):
164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
165 | grads2 = tf.gradients(model.loss, tvars2)
166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
168 | global_step=model.global_step)
169 | update_op = tf.group(*model.update_emas)
170 | merged = tf.summary.merge_all() # summary
171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
172 | test_writer = tf.summary.FileWriter(summary_path + 'test')
173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
174 |
175 | # if a checkpoint exists, restore the previously saved model
176 | if os.path.exists(ckpt_path + "checkpoint"):
177 | print("Restoring Variables from Checkpoint...")
178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
181 | sess.run(tf.variables_initializer(training_ops))
182 | train_op2 = train_op1
183 | else:
184 | print('Initializing Variables...')
185 | sess.run(tf.global_variables_initializer())
186 |
187 | print('3.Begin training...')
188 |
189 | train_op = train_op2
190 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
191 | for epoch in range(FLAGS.max_max_epoch):
192 | global_step = sess.run(model.global_step)
193 | print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
194 | if epoch == FLAGS.max_epoch: # update the embedding
195 | train_op = train_op1
196 | train_fetches = [merged, model.loss, train_op, update_op]
197 | valid_fetches = [merged, model.loss]
198 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
199 | # run one final validation pass
200 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
201 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
202 | sess.run(model.global_step), valid_cost, precision, recall, f1))
203 | if f1 > last_f1: # save the better model
204 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
205 | print('saved new model to %s ' % saving_path)
206 |
207 |
208 | if __name__ == '__main__':
209 | tf.app.run()
210 |
--------------------------------------------------------------------------------
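to_categorical is imported from data_helpers and is not included in this dump; judging from how it is used above, the y_inputs placeholder expects a [batch_size, 1999] multi-hot matrix built from each sample's list of topic ids. A hypothetical sketch of such a conversion, assuming that behavior (the real helper may differ in detail):

    import numpy as np

    def to_multi_hot(label_lists, n_class=1999):
        # one row per sample; a 1.0 at every topic id the sample is tagged with
        y = np.zeros((len(label_lists), n_class), dtype=np.float32)
        for row, labels in enumerate(label_lists):
            y[row, list(labels)] = 1.0
        return y

    print(to_multi_hot([[3, 17], [0]], n_class=20).shape)  # (2, 20)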
/zhihu-text-classification-master/models/wd_5_bigru_cnn/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 |
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 |
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'total training epochs, default: 6')
22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
23 | flags.DEFINE_float('decay_rate', 0.75, 'decay rate, default: 0.75')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # full-run settings
26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')
29 |
30 | # quick-test settings
31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
34 | FLAGS = flags.FLAGS
35 |
36 | lr = FLAGS.lr
37 | last_f1 = FLAGS.last_f1
38 | settings = network.Settings()
39 | title_len = settings.title_len
40 | summary_path = settings.summary_path
41 | ckpt_path = settings.ckpt_path
42 | model_path = ckpt_path + 'model.ckpt'
43 |
44 | embedding_path = '../../data/word_embedding.npy'
45 | data_train_path = '../../data/wd-data/data_train/'
46 | data_valid_path = '../../data/wd-data/data_valid/'
47 | tr_batches = os.listdir(data_train_path) # list of batch file names
48 | va_batches = os.listdir(data_valid_path)
49 | n_tr_batches = len(tr_batches)
50 | n_va_batches = len(va_batches)
51 |
52 | # quick test: cap the number of batches
53 | # n_tr_batches = 1000
54 | # n_va_batches = 50
55 |
56 |
57 | def get_batch(data_path, batch_id):
58 | """get a batch from data_path"""
59 | new_batch = np.load(data_path + str(batch_id) + '.npz')
60 | X_batch = new_batch['X']
61 | y_batch = new_batch['y']
62 | X1_batch = X_batch[:, :title_len]
63 | X2_batch = X_batch[:, title_len:]
64 | return [X1_batch, X2_batch, y_batch]
65 |
66 |
67 | def valid_epoch(data_path, sess, model):
68 | """Test on the valid data."""
69 | va_batches = os.listdir(data_path)
70 | n_va_batches = len(va_batches)
71 | _costs = 0.0
72 | predict_labels_list = list() # all predicted labels
73 | marked_labels_list = list()
74 | for i in range(n_va_batches):
75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
76 | marked_labels_list.extend(y_batch)
77 | y_batch = to_categorical(y_batch)
78 | _batch_size = len(y_batch)
79 | fetches = [model.loss, model.y_pred]
80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
82 | _cost, predict_labels = sess.run(fetches, feed_dict)
83 | _costs += _cost
84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # indices of the 5 highest-scoring classes
85 | predict_labels_list.extend(predict_labels)
86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
88 | mean_cost = _costs / n_va_batches
89 | return mean_cost, precision, recall, f1
90 |
91 |
92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
93 | global last_f1
94 | global lr
95 | time0 = time.time()
96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data
97 | for batch in tqdm(range(n_tr_batches)):
98 | global_step = sess.run(model.global_step)
99 | if 0 == (global_step + 1) % FLAGS.valid_step:
100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
102 | global_step, valid_cost, precision, recall, f1, time.time() - time0))
103 | time0 = time.time()
104 | if f1 > last_f1:
105 | last_f1 = f1
106 | saving_path = model.saver.save(sess, model_path, global_step+1)
107 | print('saved new model to %s ' % saving_path)
108 | # training
109 | batch_id = batch_indexs[batch]
110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
111 | y_batch = to_categorical(y_batch)
112 | _batch_size = len(y_batch)
113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch
116 | # valid per 500 steps
117 | if 0 == (global_step + 1) % 500:
118 | train_writer.add_summary(summary, global_step)
119 | batch_id = np.random.randint(0, n_va_batches) # pick one validation batch at random
120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
121 | y_batch = to_categorical(y_batch)
122 | _batch_size = len(y_batch)
123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
125 | summary, _cost = sess.run(valid_fetches, feed_dict)
126 | test_writer.add_summary(summary, global_step)
127 |
128 |
129 | def main(_):
130 | global ckpt_path
131 | global last_f1
132 | if not os.path.exists(ckpt_path):
133 | os.makedirs(ckpt_path)
134 | if not os.path.exists(summary_path):
135 | os.makedirs(summary_path)
136 | elif not FLAGS.is_retrain: # retraining this model from scratch, so delete the old summaries
137 | shutil.rmtree(summary_path)
138 | os.makedirs(summary_path)
139 | if not os.path.exists(summary_path):
140 | os.makedirs(summary_path)
141 |
142 | print('1.Loading data...')
143 | W_embedding = np.load(embedding_path)
144 | print('training batch_num = %d' % n_tr_batches)
145 | print('valid batch_num = %d' % n_va_batches)
146 |
147 | # Initialize the model or restore it from a checkpoint
148 | print('2.Building model...')
149 | config = tf.ConfigProto()
150 | config.gpu_options.allow_growth = True
151 | with tf.Session(config=config) as sess:
152 | model = network.BiGRU_CNN(W_embedding, settings)
153 | with tf.variable_scope('training_ops') as vs:
154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
155 | FLAGS.decay_rate, staircase=True)
156 | # two optimizers: op1 updates the embedding, op2 keeps it frozen.
157 | with tf.variable_scope('Optimizer1'):
158 | tvars1 = tf.trainable_variables()
159 | grads1 = tf.gradients(model.loss, tvars1)
160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
162 | global_step=model.global_step)
163 | with tf.variable_scope('Optimizer2'):
164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
165 | grads2 = tf.gradients(model.loss, tvars2)
166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
168 | global_step=model.global_step)
169 | update_op = tf.group(*model.update_emas)
170 | merged = tf.summary.merge_all() # summary
171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
172 | test_writer = tf.summary.FileWriter(summary_path + 'test')
173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
174 |
175 | # if a checkpoint exists, restore the previously saved model
176 | if os.path.exists(ckpt_path + "checkpoint"):
177 | print("Restoring Variables from Checkpoint...")
178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
181 | sess.run(tf.variables_initializer(training_ops))
182 | train_op2 = train_op1
183 | else:
184 | print('Initializing Variables...')
185 | sess.run(tf.global_variables_initializer())
186 |
187 | print('3.Begin training...')
188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
189 | train_op = train_op2 # start with the embedding frozen; switched to train_op1 once epoch reaches max_epoch, as in the other models
190 | for epoch in range(FLAGS.max_max_epoch):
191 | global_step = sess.run(model.global_step)
192 | print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
193 | if epoch == FLAGS.max_epoch: # update the embedding
194 | train_op = train_op1
195 | train_fetches = [merged, model.loss, train_op, update_op]
196 | valid_fetches = [merged, model.loss]
197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
198 | # run one final validation pass
199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
201 | sess.run(model.global_step), valid_cost, precision, recall, f1))
202 | if f1 > last_f1: # save the better model
203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
204 | print('saved new model to %s ' % saving_path)
205 |
206 |
207 | if __name__ == '__main__':
208 | tf.app.run()
209 |
--------------------------------------------------------------------------------
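get_batch above relies on one .npz file per batch, whose 'X' matrix holds the title word ids in the first title_len columns and the content word ids in the rest, alongside a 'y' array of label ids. A small sketch of writing and reading such a file (the file name and sizes are made up; real batches store a variable number of labels per sample):

    import numpy as np

    title_len, content_len, batch_size = 30, 150, 4
    X = np.random.randint(0, 1000, size=(batch_size, title_len + content_len))
    y = np.array([[1, 5], [7, 2], [3, 4], [0, 9]])          # label ids per sample (fixed-length here for simplicity)
    np.savez('0.npz', X=X, y=y)

    batch = np.load('0.npz')
    X1, X2 = batch['X'][:, :title_len], batch['X'][:, title_len:]   # title ids / content ids
    print(X1.shape, X2.shape, batch['y'].shape)             # (4, 30) (4, 150) (4, 2)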
/zhihu-text-classification-master/models/wd_1_2_cnn_max/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 |
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 |
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'total training epochs, default: 6')
22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3')
23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # full-run settings
26 |
27 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
28 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
29 | flags.DEFINE_float('last_f1', 0.35, 'if valid_f1 > last_f1, save new model. default: 0.35')
30 |
31 | # quick-test settings
32 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
33 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
34 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10')
35 | FLAGS = flags.FLAGS
36 |
37 | lr = FLAGS.lr
38 | last_f1 = FLAGS.last_f1
39 | settings = network.Settings()
40 | title_len = settings.title_len
41 | summary_path = settings.summary_path
42 | ckpt_path = settings.ckpt_path
43 | model_path = ckpt_path + 'model.ckpt'
44 |
45 | embedding_path = '../../data/word_embedding.npy'
46 | data_train_path = '../../data/wd-data/data_train/'
47 | data_valid_path = '../../data/wd-data/data_valid/'
48 | tr_batches = os.listdir(data_train_path) # list of batch file names
49 | va_batches = os.listdir(data_valid_path)
50 | n_tr_batches = len(tr_batches)
51 | n_va_batches = len(va_batches)
52 |
53 | # quick test: cap the number of batches
54 | # n_tr_batches = 1000
55 | # n_va_batches = 50
56 |
57 |
58 | def get_batch(data_path, batch_id):
59 | """get a batch from data_path"""
60 | new_batch = np.load(data_path + str(batch_id) + '.npz')
61 | X_batch = new_batch['X']
62 | y_batch = new_batch['y']
63 | X1_batch = X_batch[:, :title_len]
64 | X2_batch = X_batch[:, title_len:]
65 | return [X1_batch, X2_batch, y_batch]
66 |
67 |
68 | def valid_epoch(data_path, sess, model):
69 | """Test on the valid data."""
70 | va_batches = os.listdir(data_path)
71 | n_va_batches = len(va_batches)
72 | _costs = 0.0
73 | predict_labels_list = list() # all predicted labels
74 | marked_labels_list = list()
75 | for i in range(n_va_batches):
76 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i)
77 | marked_labels_list.extend(y_batch)
78 | y_batch = to_categorical(y_batch)
79 | _batch_size = len(y_batch)
80 | fetches = [model.loss, model.y_pred]
81 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
82 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
83 | _cost, predict_labels = sess.run(fetches, feed_dict)
84 | _costs += _cost
85 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # indices of the 5 highest-scoring classes
86 | predict_labels_list.extend(predict_labels)
87 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list)
88 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list)
89 | mean_cost = _costs / n_va_batches
90 | return mean_cost, precision, recall, f1
91 |
92 |
93 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer):
94 | global last_f1
95 | global lr
96 | time0 = time.time()
97 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data
98 | for batch in tqdm(range(n_tr_batches)):
99 | global_step = sess.run(model.global_step)
100 | if 0 == (global_step + 1) % FLAGS.valid_step:
101 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
102 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % (
103 | global_step, valid_cost, precision, recall, f1, time.time() - time0))
104 | time0 = time.time()
105 | if f1 > last_f1:
106 | last_f1 = f1
107 | saving_path = model.saver.save(sess, model_path, global_step+1)
108 | print('saved new model to %s ' % saving_path)
109 | # training
110 | batch_id = batch_indexs[batch]
111 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id)
112 | y_batch = to_categorical(y_batch)
113 | _batch_size = len(y_batch)
114 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
115 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob}
116 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch
117 | # valid per 500 steps
118 | if 0 == (global_step + 1) % 500:
119 | train_writer.add_summary(summary, global_step)
120 | batch_id = np.random.randint(0, n_va_batches) # pick one validation batch at random
121 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id)
122 | y_batch = to_categorical(y_batch)
123 | _batch_size = len(y_batch)
124 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
125 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0}
126 | summary, _cost = sess.run(valid_fetches, feed_dict)
127 | test_writer.add_summary(summary, global_step)
128 |
129 |
130 | def main(_):
131 | global ckpt_path
132 | global last_f1
133 | if not os.path.exists(ckpt_path):
134 | os.makedirs(ckpt_path)
135 | if not os.path.exists(summary_path):
136 | os.makedirs(summary_path)
137 | elif not FLAGS.is_retrain: # retraining this model from scratch, so delete the old summaries
138 | shutil.rmtree(summary_path)
139 | os.makedirs(summary_path)
140 | if not os.path.exists(summary_path):
141 | os.makedirs(summary_path)
142 |
143 | print('1.Loading data...')
144 | W_embedding = np.load(embedding_path)
145 | print('training batch_num = %d' % n_tr_batches)
146 | print('valid batch_num = %d' % n_va_batches)
147 |
148 | # Initialize the model or restore it from a checkpoint
149 | print('2.Building model...')
150 | config = tf.ConfigProto()
151 | config.gpu_options.allow_growth = True
152 | with tf.Session(config=config) as sess:
153 | model = network.TextCNN(W_embedding, settings)
154 | with tf.variable_scope('training_ops') as vs:
155 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step,
156 | FLAGS.decay_rate, staircase=True)
157 | # two optimizers: op1 updates the embedding, op2 keeps it frozen.
158 | with tf.variable_scope('Optimizer1'):
159 | tvars1 = tf.trainable_variables()
160 | grads1 = tf.gradients(model.loss, tvars1)
161 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate)
162 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1),
163 | global_step=model.global_step)
164 | with tf.variable_scope('Optimizer2'):
165 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
166 | grads2 = tf.gradients(model.loss, tvars2)
167 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate)
168 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2),
169 | global_step=model.global_step)
170 | update_op = tf.group(*model.update_emas)
171 | merged = tf.summary.merge_all() # summary
172 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
173 | test_writer = tf.summary.FileWriter(summary_path + 'test')
174 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
175 |
176 | # if a checkpoint exists, restore the previously saved model
177 | if os.path.exists(ckpt_path + "checkpoint"):
178 | print("Restoring Variables from Checkpoint...")
179 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
180 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
181 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
182 | sess.run(tf.variables_initializer(training_ops))
183 | train_op2 = train_op1
184 | else:
185 | print('Initializing Variables...')
186 | sess.run(tf.global_variables_initializer())
187 |
188 | print('3.Begin training...')
189 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
190 | train_op = train_op2
191 | for epoch in range(FLAGS.max_max_epoch):
192 | global_step = sess.run(model.global_step)
193 | print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
194 | if epoch == FLAGS.max_epoch: # update the embedding
195 | train_op = train_op1
196 | train_fetches = [merged, model.loss, train_op, update_op]
197 | valid_fetches = [merged, model.loss]
198 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
199 | # run one final validation pass
200 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
201 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
202 | sess.run(model.global_step), valid_cost, precision, recall, f1))
203 | if f1 > last_f1: # save the better model
204 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
205 | print('saved new model to %s ' % saving_path)
206 |
207 |
208 | if __name__ == '__main__':
209 | tf.app.run()
210 |
--------------------------------------------------------------------------------
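All of the train.py files above build two optimizers over the same loss: Optimizer2 leaves the pretrained embedding matrix out of its variable list so the embedding stays frozen for the first max_epoch epochs, after which Optimizer1 (which sees every trainable variable) takes over. A stripped-down TF1-style sketch of the idea with a toy model (variable names and sizes below are illustrative only, not the repo's API):

    import numpy as np
    import tensorflow as tf

    tf.reset_default_graph()
    with tf.variable_scope('embedding'):
        embedding = tf.get_variable('embedding', initializer=np.random.randn(50, 10).astype(np.float32))
    w = tf.get_variable('w', shape=[10, 1])
    x = tf.placeholder(tf.int64, [None])
    y = tf.placeholder(tf.float32, [None, 1])
    logits = tf.matmul(tf.nn.embedding_lookup(embedding, x), w)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))

    tvars_all = tf.trainable_variables()                        # includes the embedding matrix
    tvars_frozen = [v for v in tvars_all if 'embedding' not in v.name]
    train_op_all = tf.train.AdamOptimizer(1e-3).apply_gradients(
        list(zip(tf.gradients(loss, tvars_all), tvars_all)))
    train_op_frozen = tf.train.AdamOptimizer(1e-3).apply_gradients(
        list(zip(tf.gradients(loss, tvars_frozen), tvars_frozen)))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        feed = {x: np.array([1, 2, 3]), y: np.zeros((3, 1), np.float32)}
        sess.run(train_op_frozen, feed)   # embedding left untouched
        sess.run(train_op_all, feed)      # embedding updated as well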
/zhihu-text-classification-master/models/wd_3_bigru/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import tensorflow as tf
4 | from tensorflow.contrib import rnn
5 | import tensorflow.contrib.layers as layers
6 |
7 | """wd_3_bigru
8 | The title goes through BiGRU + attention and the content goes through BiGRU + attention; the two outputs are concatenated directly.
9 | """
10 |
11 |
12 | class Settings(object):
13 | def __init__(self):
14 | self.model_name = 'wd_3_bigru'
15 | self.title_len = 30
16 | self.content_len = 150
17 | self.hidden_size = 256
18 | self.n_layer = 1
19 | self.fc_hidden_size = 1024
20 | self.n_class = 1999
21 | self.summary_path = '../../summary/' + self.model_name + '/'
22 | self.ckpt_path = '../../ckpt/' + self.model_name + '/'
23 |
24 |
25 | class BiGRU(object):
26 | """
27 | title: inputs->bigru+attention->output_title
28 | content: inputs->bigru+attention->output_content
29 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
30 | """
31 |
32 | def __init__(self, W_embedding, settings):
33 | self.model_name = settings.model_name
34 | self.title_len = settings.title_len
35 | self.content_len = settings.content_len
36 | self.hidden_size = settings.hidden_size
37 | self.n_layer = settings.n_layer
38 | self.n_class = settings.n_class
39 | self.fc_hidden_size = settings.fc_hidden_size
40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
41 | self.update_emas = list()
42 | # placeholders
43 | self._tst = tf.placeholder(tf.bool)
44 | self._keep_prob = tf.placeholder(tf.float32, [])
45 | self._batch_size = tf.placeholder(tf.int32, [])
46 |
47 | with tf.name_scope('Inputs'):
48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')
51 |
52 | with tf.variable_scope('embedding'):
53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
54 | initializer=tf.constant_initializer(W_embedding), trainable=True)
55 | self.embedding_size = W_embedding.shape[1]
56 |
57 | with tf.variable_scope('bigru_text'):
58 | output_title = self.bigru_inference(self._X1_inputs)
59 |
60 | with tf.variable_scope('bigru_content'):
61 | output_content = self.bigru_inference(self._X2_inputs)
62 |
63 | with tf.variable_scope('fc-bn-layer'):
64 | output = tf.concat([output_title, output_content], axis=1)
65 | W_fc = self.weight_variable([self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc')
66 | tf.summary.histogram('W_fc', W_fc)
67 | h_fc = tf.matmul(output, W_fc, name='h_fc')
68 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
69 | tf.summary.histogram('beta_fc', beta_fc)
70 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
71 | self.update_emas.append(update_ema_fc)
72 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
73 |
74 | with tf.variable_scope('out_layer'):
75 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
76 | tf.summary.histogram('Weight_out', W_out)
77 | b_out = self.bias_variable([self.n_class], name='bias_out')
78 | tf.summary.histogram('bias_out', b_out)
79 | self._y_pred = tf.nn.xw_plus_b(self.fc_bn_relu, W_out, b_out, name='y_pred') # scores for each class
80 |
81 | with tf.name_scope('loss'):
82 | self._loss = tf.reduce_mean(
83 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
84 | tf.summary.scalar('loss', self._loss)
85 |
86 | self.saver = tf.train.Saver(max_to_keep=1)
87 |
88 | @property
89 | def tst(self):
90 | return self._tst
91 |
92 | @property
93 | def keep_prob(self):
94 | return self._keep_prob
95 |
96 | @property
97 | def batch_size(self):
98 | return self._batch_size
99 |
100 | @property
101 | def global_step(self):
102 | return self._global_step
103 |
104 | @property
105 | def X1_inputs(self):
106 | return self._X1_inputs
107 |
108 | @property
109 | def X2_inputs(self):
110 | return self._X2_inputs
111 |
112 | @property
113 | def y_inputs(self):
114 | return self._y_inputs
115 |
116 | @property
117 | def y_pred(self):
118 | return self._y_pred
119 |
120 | @property
121 | def loss(self):
122 | return self._loss
123 |
124 | def weight_variable(self, shape, name):
125 | """Create a weight variable with appropriate initialization."""
126 | initial = tf.truncated_normal(shape, stddev=0.1)
127 | return tf.Variable(initial, name=name)
128 |
129 | def bias_variable(self, shape, name):
130 | """Create a bias variable with appropriate initialization."""
131 | initial = tf.constant(0.1, shape=shape)
132 | return tf.Variable(initial, name=name)
133 |
134 | def batchnorm(self, Ylogits, offset, convolutional=False):
135 | """batchnormalization.
136 | Args:
137 | Ylogits: 1D向量或者是3D的卷积结果。
138 | num_updates: 迭代的global_step
139 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。
140 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。
141 | m: 表示batch均值;v:表示batch方差。
142 | bnepsilon:一个很小的浮点数,防止除以 0.
143 | Returns:
144 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。
145 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。
146 | """
147 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations
148 | bnepsilon = 1e-5
149 | if convolutional:
150 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
151 | else:
152 | mean, variance = tf.nn.moments(Ylogits, [0])
153 | update_moving_everages = exp_moving_avg.apply([mean, variance])
154 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
155 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
156 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
157 | return Ybn, update_moving_everages
158 |
159 | def gru_cell(self):
160 | with tf.name_scope('gru_cell'):
161 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse)
162 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
163 |
164 | def bi_gru(self, inputs):
165 | """build the bi-GRU network. 返回个所有层的隐含状态。"""
166 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)]
167 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)]
168 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw]
169 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw]
170 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
171 | initial_states_fw=initial_states_fw,
172 | initial_states_bw=initial_states_bw, dtype=tf.float32)
173 | return outputs
174 |
175 | def task_specific_attention(self, inputs, output_size,
176 | initializer=layers.xavier_initializer(),
177 | activation_fn=tf.tanh, scope=None):
178 | """
179 | Performs task-specific attention reduction, using learned
180 | attention context vector (constant within task of interest).
181 | Args:
182 | inputs: Tensor of shape [batch_size, units, input_size]
183 | `input_size` must be static (known)
184 | `units` axis will be attended over (reduced from output)
185 | `batch_size` will be preserved
186 | output_size: Size of output's inner (feature) dimension
187 | Returns:
188 | outputs: Tensor of shape [batch_size, output_dim].
189 | """
190 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
191 | with tf.variable_scope(scope or 'attention') as scope:
192 | # u_w, the attention context vector
193 | attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size],
194 | initializer=initializer, dtype=tf.float32)
195 | # fully-connected layer mapping h_i to u_i, shape [batch_size, units, input_size] -> [batch_size, units, output_size]
196 | input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope)
197 | # attention scores, shape [batch_size, units, 1]
198 | vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True)
199 | attention_weights = tf.nn.softmax(vector_attn, dim=1)
200 | tf.summary.histogram('attention_weights', attention_weights)
201 | weighted_projection = tf.multiply(inputs, attention_weights)
202 | outputs = tf.reduce_sum(weighted_projection, axis=1)
203 | return outputs # shape [batch_size, hidden_size*2]
204 |
205 | def bigru_inference(self, X_inputs):
206 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
207 | output_bigru = self.bi_gru(inputs)
208 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2)
209 | return output_att
210 |
211 |
212 | # test the model
213 | def test():
214 | import numpy as np
215 | print('Begin testing...')
216 | settings = Settings()
217 | W_embedding = np.random.randn(50, 10)
218 | config = tf.ConfigProto()
219 | config.gpu_options.allow_growth = True
220 | batch_size = 128
221 | with tf.Session(config=config) as sess:
222 | model = BiGRU(W_embedding, settings)
223 | optimizer = tf.train.AdamOptimizer(0.001)
224 | train_op = optimizer.minimize(model.loss)
225 | update_op = tf.group(*model.update_emas)
226 | sess.run(tf.global_variables_initializer())
227 | fetch = [model.loss, model.y_pred, train_op, update_op]
228 | loss_list = list()
229 | for i in range(100):
230 | X1_batch = np.zeros((batch_size, 30), dtype=float)
231 | X2_batch = np.zeros((batch_size, 150), dtype=float)
232 | y_batch = np.zeros((batch_size, 1999), dtype=int)
233 | _batch_size = len(y_batch)
234 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
235 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
236 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
237 | loss_list.append(loss)
238 | print(i, loss)
239 |
240 | if __name__ == '__main__':
241 | test()
242 |
--------------------------------------------------------------------------------
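The task_specific_attention above projects each BiGRU output, scores it against a learned context vector, softmaxes those scores over the time axis, and returns the weighted sum of the raw outputs. A numpy sketch of the same computation with random stand-in weights (shapes follow the comments in the code):

    import numpy as np

    batch, steps, dim = 2, 5, 8                      # dim plays the role of hidden_size * 2
    inputs = np.random.randn(batch, steps, dim)      # BiGRU outputs
    W, b = np.random.randn(dim, dim), np.zeros(dim)  # stand-in for layers.fully_connected
    context = np.random.randn(dim)                   # the attention_context_vector

    proj = np.tanh(inputs @ W + b)                                        # [batch, steps, dim]
    scores = (proj * context).sum(axis=2, keepdims=True)                  # [batch, steps, 1]
    weights = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # softmax over steps
    output = (inputs * weights).sum(axis=1)                               # [batch, dim]
    print(output.shape)                                                   # (2, 8)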
/zhihu-text-classification-master/models/wd_6_rcnn/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import tensorflow as tf
4 | from tensorflow.contrib import rnn
5 | import tensorflow.contrib.layers as layers
6 |
7 | """wd_6_rcnn
8 | In the paper Recurrent Convolutional Neural Networks for Text Classification, a BiRNN encodes the sequence,
9 | each step's hidden state is concatenated with the original input, and a max-pooling operation follows.
10 | This implementation differs slightly: a BiGRU first produces the per-step hidden states, which are concatenated
11 | with the original word embeddings, and the result is then fed into a TextCNN network.
12 | """
13 |
14 |
15 | class Settings(object):
16 | def __init__(self):
17 | self.model_name = "wd_6_rcnn"
18 | self.title_len = 30
19 | self.content_len = 150
20 | self.hidden_size = 256
21 | self.n_layer = 1
22 | self.filter_sizes = [2, 3, 4, 5, 7]
23 | self.n_filter = 256
24 | self.fc_hidden_size = 1024
25 | self.n_class = 1999
26 | self.summary_path = '../../summary/' + self.model_name + '/'
27 | self.ckpt_path = '../../ckpt/' + self.model_name + '/'
28 |
29 |
30 | class RCNN(object):
31 | def __init__(self, W_embedding, settings):
32 | self.model_name = settings.model_name
33 | self.title_len = settings.title_len
34 | self.content_len = settings.content_len
35 | self.hidden_size = settings.hidden_size
36 | self.n_layer = settings.n_layer
37 | self.filter_sizes = settings.filter_sizes
38 | self.n_filter = settings.n_filter
39 | self.n_filter_total = self.n_filter * len(self.filter_sizes)
40 | self.n_class = settings.n_class
41 | self.fc_hidden_size = settings.fc_hidden_size
42 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
43 | self.update_emas = list()
44 | # placeholders
45 | self._tst = tf.placeholder(tf.bool)
46 | self._keep_prob = tf.placeholder(tf.float32, [])
47 | self._batch_size = tf.placeholder(tf.int32, [])
48 |
49 | with tf.name_scope('Inputs'):
50 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
51 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
52 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')
53 |
54 | with tf.variable_scope('embedding'):
55 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
56 | initializer=tf.constant_initializer(W_embedding), trainable=True)
57 | self.embedding_size = W_embedding.shape[1]
58 |
59 | with tf.variable_scope('rcnn_text'):
60 | output_title = self.rcnn_inference(self._X1_inputs, self.title_len)
61 |
62 | with tf.variable_scope('rcnn_content'):
63 | output_content = self.rcnn_inference(self._X2_inputs, self.content_len)
64 |
65 | with tf.variable_scope('fc-bn-layer'):
66 | output = tf.concat([output_title, output_content], axis=1)
67 | W_fc = self.weight_variable([self.n_filter_total*2, self.fc_hidden_size],
68 | name='Weight_fc')
69 | tf.summary.histogram('W_fc', W_fc)
70 | h_fc = tf.matmul(output, W_fc, name='h_fc')
71 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc"))
72 | tf.summary.histogram('beta_fc', beta_fc)
73 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
74 | self.update_emas.append(update_ema_fc)
75 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
76 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)
77 |
78 | with tf.variable_scope('out_layer'):
79 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
80 | tf.summary.histogram('Weight_out', W_out)
81 | b_out = self.bias_variable([self.n_class], name='bias_out')
82 | tf.summary.histogram('bias_out', b_out)
83 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # scores for each class
84 |
85 | with tf.name_scope('loss'):
86 | self._loss = tf.reduce_mean(
87 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
88 | tf.summary.scalar('loss', self._loss)
89 |
90 | self.saver = tf.train.Saver(max_to_keep=1)
91 |
92 | @property
93 | def tst(self):
94 | return self._tst
95 |
96 | @property
97 | def keep_prob(self):
98 | return self._keep_prob
99 |
100 | @property
101 | def batch_size(self):
102 | return self._batch_size
103 |
104 | @property
105 | def global_step(self):
106 | return self._global_step
107 |
108 | @property
109 | def X1_inputs(self):
110 | return self._X1_inputs
111 |
112 | @property
113 | def X2_inputs(self):
114 | return self._X2_inputs
115 |
116 | @property
117 | def y_inputs(self):
118 | return self._y_inputs
119 |
120 | @property
121 | def y_pred(self):
122 | return self._y_pred
123 |
124 | @property
125 | def loss(self):
126 | return self._loss
127 |
128 | def weight_variable(self, shape, name):
129 | """Create a weight variable with appropriate initialization."""
130 | initial = tf.truncated_normal(shape, stddev=0.1)
131 | return tf.Variable(initial, name=name)
132 |
133 | def bias_variable(self, shape, name):
134 | """Create a bias variable with appropriate initialization."""
135 | initial = tf.constant(0.1, shape=shape)
136 | return tf.Variable(initial, name=name)
137 |
138 | def batchnorm(self, Ylogits, offset, convolutional=False):
139 | """batchnormalization.
140 | Args:
141 | Ylogits: 1D向量或者是3D的卷积结果。
142 | num_updates: 迭代的global_step
143 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。
144 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。
145 | m: 表示batch均值;v:表示batch方差。
146 | bnepsilon:一个很小的浮点数,防止除以 0.
147 | Returns:
148 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。
149 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。
150 | """
151 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999,
152 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations
153 | bnepsilon = 1e-5
154 | if convolutional:
155 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
156 | else:
157 | mean, variance = tf.nn.moments(Ylogits, [0])
158 | update_moving_everages = exp_moving_avg.apply([mean, variance])
159 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
160 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
161 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
162 | return Ybn, update_moving_everages
163 |
164 | def gru_cell(self):
165 | with tf.name_scope('gru_cell'):
166 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse)
167 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
168 |
169 | def bi_gru(self, X_inputs):
170 | """build the bi-GRU network. Return the encoder represented vector.
171 | X_inputs: [batch_size, n_step]
172 | n_step: 句子的词数量;或者文档的句子数。
173 | outputs: [fw_state, embeddings, bw_state], shape=[batch_size, hidden_size+embedding_size+hidden_size]
174 | """
175 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) # [batch_size, n_step, embedding_size]
176 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)]
177 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)]
178 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw]
179 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw]
180 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
181 | initial_states_fw = initial_states_fw, initial_states_bw = initial_states_bw, dtype=tf.float32)
182 | hidden_outputs = tf.concat([outputs, inputs], axis=2)
183 | return hidden_outputs # shape = [batch_size, n_step, hidden_size*2+embedding_size]
184 |
185 | def textcnn(self, cnn_inputs, n_step):
186 | """build the TextCNN network. Return the h_drop"""
187 | # cnn_inputs.shape = [batch_size, n_step, hidden_size*2+embedding_size]
188 | inputs = tf.expand_dims(cnn_inputs, -1)
189 | pooled_outputs = list()
190 | for i, filter_size in enumerate(self.filter_sizes):
191 | with tf.variable_scope("conv-maxpool-%s" % filter_size):
192 | # Convolution Layer
193 | filter_shape = [filter_size, self.hidden_size*2+self.embedding_size, 1, self.n_filter]
194 | W_filter = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_filter")
195 |                 beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.n_filter]), name="beta")
196 |                 tf.summary.histogram('beta', beta)
197 |                 conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
198 |                 conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)  # BN before the activation
199 | # Apply nonlinearity, batch norm scaling is not useful with relus
200 | h = tf.nn.relu(conv_bn, name="relu")
201 | # Maxpooling over the outputs
202 | pooled = tf.nn.max_pool(h,ksize=[1, n_step - filter_size + 1, 1, 1],
203 | strides=[1, 1, 1, 1],padding='VALID',name="pool")
204 | pooled_outputs.append(pooled)
205 | self.update_emas.append(update_ema)
206 | h_pool = tf.concat(pooled_outputs, 3)
207 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total])
208 | return h_pool_flat # shape = [batch_size, n_filter_total]
209 |
210 | def rcnn_inference(self, X_inputs, n_step):
211 | output_bigru = self.bi_gru(X_inputs)
212 | output_cnn = self.textcnn(output_bigru, n_step)
213 | return output_cnn # shape = [batch_size, n_filter_total]
214 |
215 |
216 | # test the model
217 | def test():
218 | import numpy as np
219 | print('Begin testing...')
220 | settings = Settings()
221 | W_embedding = np.random.randn(50, 10)
222 | config = tf.ConfigProto()
223 | config.gpu_options.allow_growth = True
224 | batch_size = 128
225 | with tf.Session(config=config) as sess:
226 | model = RCNN(W_embedding, settings)
227 | optimizer = tf.train.AdamOptimizer(0.001)
228 | train_op = optimizer.minimize(model.loss)
229 | update_op = tf.group(*model.update_emas)
230 | sess.run(tf.global_variables_initializer())
231 | fetch = [model.loss, model.y_pred, train_op, update_op]
232 | loss_list = list()
233 |         for i in range(100):
234 | X1_batch = np.zeros((batch_size, 30), dtype=float)
235 | X2_batch = np.zeros((batch_size, 150), dtype=float)
236 | y_batch = np.zeros((batch_size, 1999), dtype=int)
237 | _batch_size = len(y_batch)
238 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
239 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
240 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
241 | loss_list.append(loss)
242 | print(i, loss)
243 |
244 |
245 | if __name__ == '__main__':
246 | test()
247 |
--------------------------------------------------------------------------------
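
The RCNN above concatenates the bi-GRU states with the raw word embeddings and then runs a TextCNN over time. Below is a minimal NumPy sketch of just that shape flow; the random arrays stand in for the learned tensors, the dimensions are illustrative, and nothing here is part of the repository:

    import numpy as np

    batch_size, n_step, hidden_size, embedding_size = 4, 30, 256, 10
    n_filter, filter_sizes = 8, [2, 3, 4, 5, 7]

    embeddings = np.random.randn(batch_size, n_step, embedding_size)      # embedding_lookup result
    bigru_out = np.random.randn(batch_size, n_step, 2 * hidden_size)      # stand-in for the bi-GRU outputs
    cnn_inputs = np.concatenate([bigru_out, embeddings], axis=2)          # [B, T, 2H + E], as built in bi_gru()

    pooled = []
    for k in filter_sizes:
        W = np.random.randn(k, cnn_inputs.shape[2], n_filter) * 0.1       # one filter bank per width k
        # "VALID" convolution over time, then max-pool over the T - k + 1 positions
        conv = np.stack([np.einsum('bkd,kdf->bf', cnn_inputs[:, t:t + k, :], W)
                         for t in range(n_step - k + 1)], axis=1)          # [B, T - k + 1, F]
        pooled.append(conv.max(axis=1))                                    # [B, F]

    features = np.concatenate(pooled, axis=1)
    print(features.shape)  # (4, 40) == (batch_size, n_filter * len(filter_sizes)) == n_filter_total

The concatenation of pooled features is what the fully-connected layer of the real model consumes, which is why its input width is hidden_size*2-independent and equals n_filter_total.
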
/zhihu-text-classification-master/models/wd_2_hcnn/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import tensorflow as tf
4 |
5 | """wd_2_hcnn
6 | The title part uses TextCNN; the content part uses a hierarchical TextCNN.
7 | """
8 |
9 |
10 | class Settings(object):
11 | def __init__(self):
12 | self.model_name = 'wd_2_hcnn'
13 | self.title_len = self.sent_len = 30
14 | self.doc_len = 10
15 | self.sent_filter_sizes = [2, 3, 4, 5]
16 | self.doc_filter_sizes = [2, 3, 4]
17 | self.n_filter = 256
18 | self.fc_hidden_size = 1024
19 | self.n_class = 1999
20 | self.summary_path = '../../summary/' + self.model_name + '/'
21 | self.ckpt_path = '../../ckpt/' + self.model_name + '/'
22 |
23 |
24 | class HCNN(object):
25 | """
26 | title: inputs->textcnn->output_title
27 | content: inputs->hcnn->output_content
28 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
29 | """
30 |
31 | def __init__(self, W_embedding, settings):
32 | self.model_name = settings.model_name
33 | self.sent_len = settings.sent_len
34 | self.doc_len = settings.doc_len
35 | self.sent_filter_sizes = settings.sent_filter_sizes
36 | self.doc_filter_sizes = settings.doc_filter_sizes
37 | self.n_filter = settings.n_filter
38 | self.n_class = settings.n_class
39 | self.fc_hidden_size = settings.fc_hidden_size
40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
41 | self.update_emas = list()
42 | # placeholders
43 | self._tst = tf.placeholder(tf.bool)
44 | self._keep_prob = tf.placeholder(tf.float32, [])
45 | self._batch_size = tf.placeholder(tf.int32, [])
46 |
47 | with tf.name_scope('Inputs'):
48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.sent_len], name='X1_inputs')
49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs')
50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')
51 |
52 | with tf.variable_scope('embedding'):
53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
54 | initializer=tf.constant_initializer(W_embedding), trainable=True)
55 | self.embedding_size = W_embedding.shape[1]
56 |
57 | with tf.variable_scope('cnn_text'):
58 | output_title = self.cnn_inference(self._X1_inputs)
59 |
60 | with tf.variable_scope('hcnn_content'):
61 | output_content = self.hcnn_inference(self._X2_inputs)
62 |
63 | with tf.variable_scope('fc-bn-layer'):
64 | output = tf.concat([output_title, output_content], axis=1)
65 | output_size = self.n_filter * (len(self.sent_filter_sizes) + len(self.doc_filter_sizes))
66 | W_fc = self.weight_variable([output_size, self.fc_hidden_size], name='Weight_fc')
67 | tf.summary.histogram('W_fc', W_fc)
68 | h_fc = tf.matmul(output, W_fc, name='h_fc')
69 |             beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size]), name="beta_fc")
70 | tf.summary.histogram('beta_fc', beta_fc)
71 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
72 | self.update_emas.append(update_ema_fc)
73 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
74 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)
75 |
76 | with tf.variable_scope('out_layer'):
77 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
78 | tf.summary.histogram('Weight_out', W_out)
79 | b_out = self.bias_variable([self.n_class], name='bias_out')
80 | tf.summary.histogram('bias_out', b_out)
81 |             self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # per-class scores
82 |
83 | with tf.name_scope('loss'):
84 | self._loss = tf.reduce_mean(
85 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
86 | tf.summary.scalar('loss', self._loss)
87 |
88 | self.saver = tf.train.Saver(max_to_keep=2)
89 |
90 | @property
91 | def tst(self):
92 | return self._tst
93 |
94 | @property
95 | def keep_prob(self):
96 | return self._keep_prob
97 |
98 | @property
99 | def batch_size(self):
100 | return self._batch_size
101 |
102 | @property
103 | def global_step(self):
104 | return self._global_step
105 |
106 | @property
107 | def X1_inputs(self):
108 | return self._X1_inputs
109 |
110 | @property
111 | def X2_inputs(self):
112 | return self._X2_inputs
113 |
114 | @property
115 | def y_inputs(self):
116 | return self._y_inputs
117 |
118 | @property
119 | def y_pred(self):
120 | return self._y_pred
121 |
122 | @property
123 | def loss(self):
124 | return self._loss
125 |
126 | def weight_variable(self, shape, name):
127 | """Create a weight variable with appropriate initialization."""
128 | initial = tf.truncated_normal(shape, stddev=0.1)
129 | return tf.Variable(initial, name=name)
130 |
131 | def bias_variable(self, shape, name):
132 | """Create a bias variable with appropriate initialization."""
133 | initial = tf.constant(0.1, shape=shape)
134 | return tf.Variable(initial, name=name)
135 |
136 | def batchnorm(self, Ylogits, offset, convolutional=False):
137 |         """Batch normalization (offset/beta only; no scale/gamma).
138 |         Args:
139 |             Ylogits: a 2-D fully-connected output or a 4-D convolution output.
140 |             offset: the beta offset; usually initialized to 0.1 when followed by ReLU.
141 |             convolutional: if True, moments are taken over the batch and both spatial dimensions.
142 |         Notes:
143 |             scale (gamma) is passed as None: it matters for sigmoid activations but adds little for ReLU.
144 |             At test time (self.tst=True) the batch mean/variance are replaced by their moving averages.
145 |         Returns:
146 |             Ybn: the batch-normalized tensor, same shape as Ylogits.
147 |             update op: updates the moving mean/variance; run it during training so test time can use them.
148 |         """
149 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999,
150 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations
151 | bnepsilon = 1e-5
152 | if convolutional:
153 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
154 | else:
155 | mean, variance = tf.nn.moments(Ylogits, [0])
156 | update_moving_everages = exp_moving_avg.apply([mean, variance])
157 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
158 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
159 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
160 | return Ybn, update_moving_everages
161 |
162 | def textcnn(self, X_inputs, n_step, filter_sizes, embed_size):
163 |         """Build the TextCNN network. Return the flattened max-pooled features.
164 |         n_step: the sequence length (sentence length at the sentence level, doc_len at the document level)."""
165 | inputs = tf.expand_dims(X_inputs, -1)
166 | pooled_outputs = list()
167 | for i, filter_size in enumerate(filter_sizes):
168 | with tf.name_scope("conv-maxpool-%s" % filter_size):
169 | # Convolution Layer
170 | filter_shape = [filter_size, embed_size, 1, self.n_filter]
171 | W_filter = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_filter")
172 |                 beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.n_filter]), name="beta")
173 |                 tf.summary.histogram('beta', beta)
174 |                 conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
175 |                 conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)  # BN before the activation
176 |                 # Apply nonlinearity; batch norm scaling is not useful with ReLU
177 |                 # batch norm offsets are used instead of biases
178 | h = tf.nn.relu(conv_bn, name="relu")
179 | # Maxpooling over the outputs
180 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
181 | strides=[1, 1, 1, 1], padding='VALID', name="pool")
182 | pooled_outputs.append(pooled)
183 | self.update_emas.append(update_ema)
184 | h_pool = tf.concat(pooled_outputs, 3)
185 | n_filter_total = self.n_filter * len(filter_sizes)
186 | h_pool_flat = tf.reshape(h_pool, [-1, n_filter_total])
187 | return h_pool_flat # shape = [-1, n_filter_total]
188 |
189 | def cnn_inference(self, X_inputs):
190 |         """TextCNN model for the title part.
191 |         Args:
192 |             X_inputs: tensor.shape=(batch_size, title_len)
193 |         Returns:
194 |             title_outputs: tensor.shape=(batch_size, n_filter*len(sent_filter_sizes))
195 |         """
196 |         inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
197 |         with tf.variable_scope('title_encoder'):  # build the title representation
198 |             title_outputs = self.textcnn(inputs, self.sent_len, self.sent_filter_sizes, embed_size=self.embedding_size)
199 |         return title_outputs  # shape = [batch_size, n_filter*len(sent_filter_sizes)]
200 |
201 | def hcnn_inference(self, X_inputs):
202 |         """Hierarchical TextCNN model for the content part.
203 |         Args:
204 |             X_inputs: tensor.shape=(batch_size, doc_len*sent_len)
205 |         Returns:
206 |             doc_outputs: tensor.shape=(batch_size, n_filter*len(doc_filter_sizes))
207 |         """
208 | inputs = tf.nn.embedding_lookup(self.embedding,
209 | X_inputs) # inputs.shape=[batch_size, doc_len*sent_len, embedding_size]
210 | sent_inputs = tf.reshape(inputs, [self.batch_size * self.doc_len, self.sent_len,
211 | self.embedding_size]) # [batch_size*doc_len, sent_len, embedding_size]
212 |         with tf.variable_scope('sentence_encoder'):  # build sentence vectors
213 | sent_outputs = self.textcnn(sent_inputs, self.sent_len, self.sent_filter_sizes, self.embedding_size)
214 |         with tf.variable_scope('doc_encoder'):  # build the document vector
215 | doc_inputs = tf.reshape(sent_outputs, [self.batch_size, self.doc_len, self.n_filter * len(
216 | self.sent_filter_sizes)]) # [batch_size, doc_len, n_filter*len(filter_sizes_sent)]
217 | doc_outputs = self.textcnn(doc_inputs, self.doc_len, self.doc_filter_sizes, self.n_filter * len(
218 |                 self.sent_filter_sizes))  # doc_outputs: [batch_size, n_filter*len(doc_filter_sizes)]
219 | return doc_outputs # [batch_size, n_filter*len(doc_filter_sizes)]
220 |
221 | # test the model
222 | # def test():
223 | # import numpy as np
224 | # print('Begin testing...')
225 | # settings = Settings()
226 | # W_embedding = np.random.randn(50, 10)
227 | # config = tf.ConfigProto()
228 | # config.gpu_options.allow_growth = True
229 | # batch_size = 128
230 | # with tf.Session(config=config) as sess:
231 | # model = HCNN(W_embedding, settings)
232 | # optimizer = tf.train.AdamOptimizer(0.001)
233 | # train_op = optimizer.minimize(model.loss)
234 | # update_op = tf.group(*model.update_emas)
235 | # sess.run(tf.global_variables_initializer())
236 | # fetch = [model.loss, model.y_pred, train_op, update_op]
237 | # loss_list = list()
238 | # for i in xrange(100):
239 | # X1_batch = np.zeros((batch_size, 30), dtype=float)
240 | # X2_batch = np.zeros((batch_size, 10 * 30), dtype=float)
241 | # y_batch = np.zeros((batch_size, 1999), dtype=int)
242 | # _batch_size = len(y_batch)
243 | # feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
244 | # model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
245 | # loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
246 | # loss_list.append(loss)
247 | # print(i, loss)
248 |
249 | # test()
250 |
--------------------------------------------------------------------------------
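
hcnn_inference() works entirely through reshapes: every sentence of every document is first encoded independently, and the pooled sentence vectors are then treated as the "words" of a document-level TextCNN. A small NumPy sketch of just that reshape logic (the CNN outputs are faked with random arrays and the sizes are illustrative, not the training configuration):

    import numpy as np

    batch_size, doc_len, sent_len, embedding_size = 2, 10, 30, 16
    n_filter, sent_filter_sizes, doc_filter_sizes = 8, [2, 3, 4, 5], [2, 3, 4]

    inputs = np.random.randn(batch_size, doc_len * sent_len, embedding_size)   # after embedding_lookup

    # The sentence encoder sees batch_size*doc_len independent sentences.
    sent_inputs = inputs.reshape(batch_size * doc_len, sent_len, embedding_size)

    # Pretend the sentence-level TextCNN already produced its pooled features.
    sent_feat = n_filter * len(sent_filter_sizes)
    sent_outputs = np.random.randn(batch_size * doc_len, sent_feat)

    # The document encoder treats each sentence vector as one step of a doc_len-long sequence.
    doc_inputs = sent_outputs.reshape(batch_size, doc_len, sent_feat)

    print(sent_inputs.shape, doc_inputs.shape, n_filter * len(doc_filter_sizes))
    # (20, 30, 16) (2, 10, 32) 24  -> the doc-level TextCNN then yields [batch_size, 24]

The same trick of folding doc_len into the batch dimension is reused by the HAN model below, only with bi-GRU + attention encoders instead of TextCNN.
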
/zhihu-text-classification-master/models/wd_4_han/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import tensorflow as tf
4 | from tensorflow.contrib import rnn
5 | import tensorflow.contrib.layers as layers
6 |
7 | """wd_4_han
8 | The title part uses bi-GRU + attention; the content part uses a HAN; the two outputs are concatenated directly.
9 | """
10 |
11 |
12 | class Settings(object):
13 | def __init__(self):
14 | self.model_name = 'wd_4_han'
15 | self.title_len = self.sent_len = 30
16 | self.doc_len = 10
17 | self.hidden_size = 256
18 | self.n_layer = 1
19 | self.fc_hidden_size = 1024
20 | self.n_class = 1999
21 | self.summary_path = '../../summary/' + self.model_name + '/'
22 | self.ckpt_path = '../../ckpt/' + self.model_name + '/'
23 |
24 |
25 | class HAN(object):
26 | """
27 | title: inputs->bigru+attention->output_title
28 | content: inputs->sent_encoder(bigru+attention)->doc_encoder(bigru+attention)->output_content
29 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
30 | """
31 |
32 | def __init__(self, W_embedding, settings):
33 | self.model_name = settings.model_name
34 | self.title_len = self.sent_len = settings.sent_len
35 | self.doc_len = settings.doc_len
36 | self.hidden_size = settings.hidden_size
37 | self.n_layer = settings.n_layer
38 | self.n_class = settings.n_class
39 | self.fc_hidden_size = settings.fc_hidden_size
40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
41 | self.update_emas = list()
42 | # placeholders
43 | self._tst = tf.placeholder(tf.bool)
44 | self._keep_prob = tf.placeholder(tf.float32, [])
45 | self._batch_size = tf.placeholder(tf.int32, [])
46 |
47 | with tf.name_scope('Inputs'):
48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs')
50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')
51 |
52 | with tf.variable_scope('embedding'):
53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape,
54 | initializer=tf.constant_initializer(W_embedding), trainable=True)
55 | self.embedding_size = W_embedding.shape[1]
56 |
57 | with tf.variable_scope('bigru_text'):
58 | output_title = self.bigru_inference(self._X1_inputs)
59 |
60 | with tf.variable_scope('han_content'):
61 | output_content = self.han_inference(self._X2_inputs)
62 |
63 | with tf.variable_scope('fc-bn-layer'):
64 | output = tf.concat([output_title, output_content], axis=1)
65 | W_fc = self.weight_variable([self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc')
66 | tf.summary.histogram('W_fc', W_fc)
67 | h_fc = tf.matmul(output, W_fc, name='h_fc')
68 |             beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size]), name="beta_fc")
69 | tf.summary.histogram('beta_fc', beta_fc)
70 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
71 | self.update_emas.append(update_ema_fc)
72 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
73 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)
74 |
75 | with tf.variable_scope('out_layer'):
76 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
77 | tf.summary.histogram('Weight_out', W_out)
78 | b_out = self.bias_variable([self.n_class], name='bias_out')
79 | tf.summary.histogram('bias_out', b_out)
80 |             self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # per-class scores
81 |
82 | with tf.name_scope('loss'):
83 | self._loss = tf.reduce_mean(
84 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
85 | tf.summary.scalar('loss', self._loss)
86 |
87 | self.saver = tf.train.Saver(max_to_keep=1)
88 |
89 | @property
90 | def tst(self):
91 | return self._tst
92 |
93 | @property
94 | def keep_prob(self):
95 | return self._keep_prob
96 |
97 | @property
98 | def batch_size(self):
99 | return self._batch_size
100 |
101 | @property
102 | def global_step(self):
103 | return self._global_step
104 |
105 | @property
106 | def X1_inputs(self):
107 | return self._X1_inputs
108 |
109 | @property
110 | def X2_inputs(self):
111 | return self._X2_inputs
112 |
113 | @property
114 | def y_inputs(self):
115 | return self._y_inputs
116 |
117 | @property
118 | def y_pred(self):
119 | return self._y_pred
120 |
121 | @property
122 | def loss(self):
123 | return self._loss
124 |
125 | def weight_variable(self, shape, name):
126 | """Create a weight variable with appropriate initialization."""
127 | initial = tf.truncated_normal(shape, stddev=0.1)
128 | return tf.Variable(initial, name=name)
129 |
130 | def bias_variable(self, shape, name):
131 | """Create a bias variable with appropriate initialization."""
132 | initial = tf.constant(0.1, shape=shape)
133 | return tf.Variable(initial, name=name)
134 |
135 | def batchnorm(self, Ylogits, offset, convolutional=False):
136 |         """Batch normalization (offset/beta only; no scale/gamma).
137 |         Args:
138 |             Ylogits: a 2-D fully-connected output or a 4-D convolution output.
139 |             offset: the beta offset; usually initialized to 0.1 when followed by ReLU.
140 |             convolutional: if True, moments are taken over the batch and both spatial dimensions.
141 |         Notes:
142 |             scale (gamma) is passed as None: it matters for sigmoid activations but adds little for ReLU.
143 |             At test time (self.tst=True) the batch mean/variance are replaced by their moving averages.
144 |         Returns:
145 |             Ybn: the batch-normalized tensor, same shape as Ylogits.
146 |             update op: updates the moving mean/variance; run it during training so test time can use them.
147 |         """
148 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations
149 | bnepsilon = 1e-5
150 | if convolutional:
151 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
152 | else:
153 | mean, variance = tf.nn.moments(Ylogits, [0])
154 | update_moving_everages = exp_moving_avg.apply([mean, variance])
155 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
156 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
157 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
158 | return Ybn, update_moving_everages
159 |
160 | def gru_cell(self):
161 | with tf.name_scope('gru_cell'):
162 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse)
163 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
164 |
165 | def bi_gru(self, inputs, seg_num):
166 |         """Build the bi-GRU encoder and return its per-step outputs.
167 |         n_step: the number of words in a sentence, or the number of sentences in a document.
168 |         seg_num: the number of sequences; normally batch_size, but here each doc is unrolled into batch_size*doc_len sentences.
169 |         """
170 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)]
171 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)]
172 | initial_states_fw = [cell_fw.zero_state(seg_num, tf.float32) for cell_fw in cells_fw]
173 | initial_states_bw = [cell_bw.zero_state(seg_num, tf.float32) for cell_bw in cells_bw]
174 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
175 | initial_states_fw = initial_states_fw, initial_states_bw = initial_states_bw, dtype=tf.float32)
176 |         # outputs: tensor of shape [seg_num, max_time, layers_output], where layers_output = hidden_size * 2 here
177 | return outputs
178 |
179 | def task_specific_attention(self, inputs, output_size,
180 | initializer=layers.xavier_initializer(),
181 | activation_fn=tf.tanh, scope=None):
182 | """
183 | Performs task-specific attention reduction, using learned
184 | attention context vector (constant within task of interest).
185 | Args:
186 | inputs: Tensor of shape [batch_size, units, input_size]
187 | `input_size` must be static (known)
188 | `units` axis will be attended over (reduced from output)
189 | `batch_size` will be preserved
190 | output_size: Size of output's inner (feature) dimension
191 | Returns:
192 | outputs: Tensor of shape [batch_size, output_dim].
193 | """
194 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
195 | with tf.variable_scope(scope or 'attention') as scope:
196 |             # u_w: the learned attention context vector
197 |             attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size],
198 |                                                        initializer=initializer, dtype=tf.float32)
199 |             # fully-connected layer mapping h_i to u_i: [batch_size, units, input_size] -> [batch_size, units, output_size]
200 |             input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope)
201 |             # attention scores <u_i, u_w>, shape [batch_size, units, 1]
202 |             vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True)
203 |             attention_weights = tf.nn.softmax(vector_attn, dim=1)
204 |             tf.summary.histogram('attention_weights', attention_weights)
205 |             weighted_projection = tf.multiply(inputs, attention_weights)
206 |             outputs = tf.reduce_sum(weighted_projection, axis=1)
207 |             return outputs  # [batch_size, input_size] (hidden_size*2 here)
208 |
209 | def bigru_inference(self, X_inputs):
210 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs)
211 | output_bigru = self.bi_gru(inputs, self.batch_size)
212 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2)
213 |         return output_att  # [batch_size, hidden_size*2]
214 |
215 | def han_inference(self, X_inputs):
216 |         """Hierarchical attention (HAN) model for the content part.
217 |         Args:
218 |             X_inputs: tensor.shape=(batch_size, doc_len*sent_len)
219 |         Returns:
220 |             doc_attn_outputs: tensor.shape=(batch_size, hidden_size*2)
221 |         """
222 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) # inputs.shape=[batch_size, doc_len*sent_len, embedding_size]
223 |         sent_inputs = tf.reshape(inputs,[self.batch_size*self.doc_len, self.sent_len, self.embedding_size])  # [batch_size*doc_len, sent_len, embedding_size]
224 |         with tf.variable_scope('sentence_encoder'):  # build sentence vectors
225 | sent_outputs = self.bi_gru(sent_inputs, seg_num=self.batch_size*self.doc_len)
226 | sent_attn_outputs = self.task_specific_attention(sent_outputs, self.hidden_size*2) # [batch_size*doc_len, hidden_size*2]
227 | with tf.variable_scope('dropout'):
228 | sent_attn_outputs = tf.nn.dropout(sent_attn_outputs, self.keep_prob)
229 |         with tf.variable_scope('doc_encoder'):  # build the document vector
230 | doc_inputs = tf.reshape(sent_attn_outputs, [self.batch_size, self.doc_len, self.hidden_size*2])
231 | doc_outputs = self.bi_gru(doc_inputs, self.batch_size) # [batch_size, doc_len, hidden_size*2]
232 | doc_attn_outputs = self.task_specific_attention(doc_outputs, self.hidden_size*2) # [batch_size, hidden_size*2]
233 | return doc_attn_outputs # [batch_size, hidden_size*2]
234 |
235 |
236 |
237 | # test the model
238 | def test():
239 | import numpy as np
240 | print('Begin testing...')
241 | settings = Settings()
242 | W_embedding = np.random.randn(50, 10)
243 | config = tf.ConfigProto()
244 | config.gpu_options.allow_growth = True
245 | batch_size = 128
246 | with tf.Session(config=config) as sess:
247 | model = HAN(W_embedding, settings)
248 | optimizer = tf.train.AdamOptimizer(0.001)
249 | train_op = optimizer.minimize(model.loss)
250 | update_op = tf.group(*model.update_emas)
251 | sess.run(tf.global_variables_initializer())
252 | fetch = [model.loss, model.y_pred, train_op, update_op]
253 | loss_list = list()
254 |         for i in range(100):
255 | X1_batch = np.zeros((batch_size, 30), dtype=float)
256 | X2_batch = np.zeros((batch_size, 10 * 30), dtype=float)
257 | y_batch = np.zeros((batch_size, 1999), dtype=int)
258 | _batch_size = len(y_batch)
259 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
260 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
261 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
262 | loss_list.append(loss)
263 | print(i, loss)
264 |
265 | if __name__ == '__main__':
266 | test()
267 |
--------------------------------------------------------------------------------
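
task_specific_attention() scores each time step against a learned context vector u_w and returns the softmax-weighted sum of the original inputs. A NumPy sketch of the same computation, assuming the projection weights and u_w are already given; W_proj and u_w are only illustrative names and the projection bias of layers.fully_connected is omitted:

    import numpy as np

    def softmax(x, axis):
        e = np.exp(x - x.max(axis=axis, keepdims=True))
        return e / e.sum(axis=axis, keepdims=True)

    batch_size, units, input_size, output_size = 3, 30, 512, 512
    inputs = np.random.randn(batch_size, units, input_size)    # bi-GRU outputs h_i
    W_proj = np.random.randn(input_size, output_size) * 0.05   # stands in for layers.fully_connected
    u_w = np.random.randn(output_size) * 0.05                  # attention_context_vector

    u = np.tanh(inputs @ W_proj)                               # u_i, [B, units, output_size]
    scores = (u * u_w).sum(axis=2, keepdims=True)              # <u_i, u_w>, [B, units, 1]
    alpha = softmax(scores, axis=1)                            # attention weights over the units axis
    outputs = (inputs * alpha).sum(axis=1)                     # weighted sum of the raw inputs
    print(outputs.shape)                                       # (3, 512) == [batch_size, hidden_size*2]

Note that the weighted sum is taken over the raw inputs rather than the projection, exactly as the TF code does.
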
/zhihu-text-classification-master/models/wd_5_bigru_cnn/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | import tensorflow as tf
4 | from tensorflow.contrib import rnn
5 | import tensorflow.contrib.layers as layers
6 |
7 | """wd_5_bigru_cnn
8 | The two parts use separate embeddings: the RNN and CNN structures are very different, so sharing one embedding hurts performance.
9 | The title part uses bi-GRU + attention; the content part uses TextCNN; the two outputs are concatenated directly.
10 | """
11 |
12 |
13 | class Settings(object):
14 | def __init__(self):
15 | self.model_name = 'wd_5_bigru_cnn'
16 | self.title_len = 30
17 | self.content_len = 150
18 | self.hidden_size = 256
19 | self.n_layer = 1
20 | self.filter_sizes = [2, 3, 4, 5, 7]
21 | self.n_filter = 256
22 | self.fc_hidden_size = 1024
23 | self.n_class = 1999
24 | self.summary_path = '../../summary/' + self.model_name + '/'
25 | self.ckpt_path = '../../ckpt/' + self.model_name + '/'
26 |
27 |
28 | class BiGRU_CNN(object):
29 | """
30 | title: inputs->bigru+attention->output_title
31 | content: inputs->textcnn->output_content
32 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
33 | """
34 |
35 | def __init__(self, W_embedding, settings):
36 | self.model_name = settings.model_name
37 | self.title_len = settings.title_len
38 | self.content_len = settings.content_len
39 | self.hidden_size = settings.hidden_size
40 | self.n_layer = settings.n_layer
41 | self.filter_sizes = settings.filter_sizes
42 | self.n_filter = settings.n_filter
43 | self.n_filter_total = self.n_filter * len(self.filter_sizes)
44 | self.n_class = settings.n_class
45 | self.fc_hidden_size = settings.fc_hidden_size
46 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step')
47 | self.update_emas = list()
48 | # placeholders
49 | self._tst = tf.placeholder(tf.bool)
50 | self._keep_prob = tf.placeholder(tf.float32, [])
51 | self._batch_size = tf.placeholder(tf.int32, [])
52 |
53 | with tf.name_scope('Inputs'):
54 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs')
55 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs')
56 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input')
57 |
58 | with tf.variable_scope('embedding'):
59 | self.title_embedding = tf.get_variable(name='title_embedding', shape=W_embedding.shape,
60 | initializer=tf.constant_initializer(W_embedding), trainable=True)
61 | self.content_embedding = tf.get_variable(name='content_embedding', shape=W_embedding.shape,
62 | initializer=tf.constant_initializer(W_embedding), trainable=True)
63 | self.embedding_size = W_embedding.shape[1]
64 |
65 | with tf.variable_scope('bigru_text'):
66 | output_title = self.bigru_inference(self._X1_inputs)
67 |
68 | with tf.variable_scope('cnn_content'):
69 | output_content = self.cnn_inference(self._X2_inputs, self.content_len)
70 |
71 | with tf.variable_scope('fc-bn-layer'):
72 | output = tf.concat([output_title, output_content], axis=1)
73 | W_fc = self.weight_variable([self.hidden_size*2 + self.n_filter_total, self.fc_hidden_size], name='Weight_fc')
74 | tf.summary.histogram('W_fc', W_fc)
75 | h_fc = tf.matmul(output, W_fc, name='h_fc')
76 |             beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size]), name="beta_fc")
77 | tf.summary.histogram('beta_fc', beta_fc)
78 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False)
79 | self.update_emas.append(update_ema_fc)
80 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu")
81 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob)
82 |
83 | with tf.variable_scope('out_layer'):
84 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out')
85 | tf.summary.histogram('Weight_out', W_out)
86 | b_out = self.bias_variable([self.n_class], name='bias_out')
87 | tf.summary.histogram('bias_out', b_out)
88 |             self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred')  # per-class scores
89 |
90 | with tf.name_scope('loss'):
91 | self._loss = tf.reduce_mean(
92 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs))
93 | tf.summary.scalar('loss', self._loss)
94 |
95 | self.saver = tf.train.Saver(max_to_keep=1)
96 |
97 | @property
98 | def tst(self):
99 | return self._tst
100 |
101 | @property
102 | def keep_prob(self):
103 | return self._keep_prob
104 |
105 | @property
106 | def batch_size(self):
107 | return self._batch_size
108 |
109 | @property
110 | def global_step(self):
111 | return self._global_step
112 |
113 | @property
114 | def X1_inputs(self):
115 | return self._X1_inputs
116 |
117 | @property
118 | def X2_inputs(self):
119 | return self._X2_inputs
120 |
121 | @property
122 | def y_inputs(self):
123 | return self._y_inputs
124 |
125 | @property
126 | def y_pred(self):
127 | return self._y_pred
128 |
129 | @property
130 | def loss(self):
131 | return self._loss
132 |
133 | def weight_variable(self, shape, name):
134 | """Create a weight variable with appropriate initialization."""
135 | initial = tf.truncated_normal(shape, stddev=0.1)
136 | return tf.Variable(initial, name=name)
137 |
138 | def bias_variable(self, shape, name):
139 | """Create a bias variable with appropriate initialization."""
140 | initial = tf.constant(0.1, shape=shape)
141 | return tf.Variable(initial, name=name)
142 |
143 | def batchnorm(self, Ylogits, offset, convolutional=False):
144 |         """Batch normalization (offset/beta only; no scale/gamma).
145 |         Args:
146 |             Ylogits: a 2-D fully-connected output or a 4-D convolution output.
147 |             offset: the beta offset; usually initialized to 0.1 when followed by ReLU.
148 |             convolutional: if True, moments are taken over the batch and both spatial dimensions.
149 |         Notes:
150 |             scale (gamma) is passed as None: it matters for sigmoid activations but adds little for ReLU.
151 |             At test time (self.tst=True) the batch mean/variance are replaced by their moving averages.
152 |         Returns:
153 |             Ybn: the batch-normalized tensor, same shape as Ylogits.
154 |             update op: updates the moving mean/variance; run it during training so test time can use them.
155 |         """
156 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations
157 | bnepsilon = 1e-5
158 | if convolutional:
159 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
160 | else:
161 | mean, variance = tf.nn.moments(Ylogits, [0])
162 | update_moving_everages = exp_moving_avg.apply([mean, variance])
163 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean)
164 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance)
165 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
166 | return Ybn, update_moving_everages
167 |
168 | def gru_cell(self):
169 | with tf.name_scope('gru_cell'):
170 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse)
171 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)
172 |
173 | def bi_gru(self, inputs):
174 |         """Build the bi-GRU network. Return the hidden states at every time step (forward and backward concatenated)."""
175 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)]
176 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)]
177 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw]
178 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw]
179 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs,
180 | initial_states_fw=initial_states_fw,
181 | initial_states_bw=initial_states_bw, dtype=tf.float32)
182 | return outputs
183 |
184 | def task_specific_attention(self, inputs, output_size,
185 | initializer=layers.xavier_initializer(),
186 | activation_fn=tf.tanh, scope=None):
187 | """
188 | Performs task-specific attention reduction, using learned
189 | attention context vector (constant within task of interest).
190 | Args:
191 | inputs: Tensor of shape [batch_size, units, input_size]
192 | `input_size` must be static (known)
193 | `units` axis will be attended over (reduced from output)
194 | `batch_size` will be preserved
195 | output_size: Size of output's inner (feature) dimension
196 | Returns:
197 | outputs: Tensor of shape [batch_size, output_dim].
198 | """
199 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
200 | with tf.variable_scope(scope or 'attention') as scope:
201 |             # u_w: the learned attention context vector
202 |             attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size],
203 |                                                        initializer=initializer, dtype=tf.float32)
204 |             # fully-connected layer mapping h_i to u_i: [batch_size, units, input_size] -> [batch_size, units, output_size]
205 |             input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope)
206 |             # attention scores <u_i, u_w>, shape [batch_size, units, 1]
207 |             vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True)
208 |             attention_weights = tf.nn.softmax(vector_attn, dim=1)
209 |             tf.summary.histogram('attention_weights', attention_weights)
210 |             weighted_projection = tf.multiply(inputs, attention_weights)
211 |             outputs = tf.reduce_sum(weighted_projection, axis=1)
212 |             return outputs  # [batch_size, input_size] (hidden_size*2 here)
213 |
214 | def bigru_inference(self, X_inputs):
215 | inputs = tf.nn.embedding_lookup(self.title_embedding, X_inputs)
216 | output_bigru = self.bi_gru(inputs)
217 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2)
218 | return output_att
219 |
220 | def cnn_inference(self, X_inputs, n_step):
221 |         """TextCNN model for the content part.
222 |         Args:
223 |             X_inputs: tensor.shape=(batch_size, n_step)
224 |         Returns:
225 |             h_pool_flat: tensor.shape=(batch_size, self.n_filter_total)
226 | """
227 | inputs = tf.nn.embedding_lookup(self.content_embedding, X_inputs)
228 | inputs = tf.expand_dims(inputs, -1)
229 | pooled_outputs = list()
230 | for i, filter_size in enumerate(self.filter_sizes):
231 | with tf.variable_scope("conv-maxpool-%s" % filter_size):
232 | # Convolution Layer
233 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter]
234 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter')
235 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter')
236 | tf.summary.histogram('beta', beta)
237 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv")
238 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True)
239 | # Apply nonlinearity, batch norm scaling is not useful with relus
240 | h = tf.nn.relu(conv_bn, name="relu")
241 | # Maxpooling over the outputs
242 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1],
243 | strides=[1, 1, 1, 1], padding='VALID', name="pool")
244 | pooled_outputs.append(pooled)
245 | self.update_emas.append(update_ema)
246 | h_pool = tf.concat(pooled_outputs, 3)
247 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total])
248 | return h_pool_flat # shape = [batch_size, self.n_filter_total]
249 |
250 |
251 | # test the model
252 | def test():
253 | import numpy as np
254 | print('Begin testing...')
255 | settings = Settings()
256 | W_embedding = np.random.randn(50, 10)
257 | config = tf.ConfigProto()
258 | config.gpu_options.allow_growth = True
259 | batch_size = 128
260 | with tf.Session(config=config) as sess:
261 | model = BiGRU_CNN(W_embedding, settings)
262 | optimizer = tf.train.AdamOptimizer(0.001)
263 | train_op = optimizer.minimize(model.loss)
264 | update_op = tf.group(*model.update_emas)
265 | sess.run(tf.global_variables_initializer())
266 | fetch = [model.loss, model.y_pred, train_op, update_op]
267 | loss_list = list()
268 |         for i in range(100):
269 | X1_batch = np.zeros((batch_size, 30), dtype=float)
270 | X2_batch = np.zeros((batch_size, 150), dtype=float)
271 | y_batch = np.zeros((batch_size, 1999), dtype=int)
272 | _batch_size = len(y_batch)
273 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
274 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
275 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
276 | loss_list.append(loss)
277 | print(i, loss)
278 |
279 | if __name__ == '__main__':
280 | test()
281 |
--------------------------------------------------------------------------------
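
All four networks share the same batchnorm() pattern: batch statistics during training, exponential moving averages of those statistics at test time (self.tst=True), a decay of 0.999, epsilon 1e-5, and no scale term. A plain-NumPy sketch of that behaviour; the function and variable names here are illustrative and not part of the repository:

    import numpy as np

    DECAY, EPS = 0.999, 1e-5
    moving = {'mean': 0.0, 'var': 1.0}           # stands in for the ExponentialMovingAverage shadow variables

    def batchnorm_np(x, offset, tst=False):
        """x: [batch, features]; offset plays the role of beta; no scale (gamma), as in the TF code."""
        mean, var = x.mean(axis=0), x.var(axis=0)
        if not tst:                              # training: use batch statistics and update the averages
            moving['mean'] = DECAY * moving['mean'] + (1 - DECAY) * mean
            moving['var'] = DECAY * moving['var'] + (1 - DECAY) * var
            m, v = mean, var
        else:                                    # test: use the accumulated moving averages
            m, v = moving['mean'], moving['var']
        return (x - m) / np.sqrt(v + EPS) + offset

    x = np.random.randn(128, 1024)
    print(batchnorm_np(x, offset=0.1).shape)     # (128, 1024), same shape as the input

One difference from this sketch: tf.train.ExponentialMovingAverage(0.999, global_step) lowers the effective decay early in training to min(decay, (1 + step) / (10 + step)), which is why the models pass the global step into it.
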