├── zhihu-text-classification-master ├── data_process │ ├── .idea │ │ ├── .name │ │ ├── encodings.xml │ │ ├── modules.xml │ │ ├── deployment.xml │ │ ├── data_process.iml │ │ └── misc.xml │ ├── test.py │ ├── run_all_data_process.sh │ ├── question_and_topic_2id.py │ ├── README.md │ ├── embed2ndarray.py │ ├── word2id.py │ ├── char2id.py │ ├── creat_batch_seg.py │ └── creat_batch_data.py └── models │ ├── wd_4_han │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_2_hcnn │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_3_bigru │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_6_rcnn │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_1_1_cnn_concat │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py │ ├── wd_1_2_cnn_max │ ├── __init__.py │ ├── predict.py │ ├── network.py │ └── train.py │ └── wd_5_bigru_cnn │ ├── __init__.py │ ├── predict.py │ ├── train.py │ └── network.py └── ReadMe.md /zhihu-text-classification-master/data_process/.idea/.name: -------------------------------------------------------------------------------- 1 | data_process -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # 竞赛列表 2 | + [2017 知乎看山杯机器学习挑战赛](https://www.biendata.com/competition/zhihu/) 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_4_han/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_2_hcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_3_bigru/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_6_rcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_1_cnn_concat/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_2_cnn_max/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_5_bigru_cnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
-------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | 4 | from multiprocessing import Pool 5 | import numpy as np 6 | 7 | def func(a, b): 8 | return a+b 9 | 10 | p = Pool() 11 | a = [1,2,3] 12 | b = [4,5,6] 13 | para = zip(a,b) 14 | result = p.map(func, para) 15 | p.close() 16 | p.join() 17 | print result -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/data_process.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/run_all_data_process.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | echo -e "\033[44;37;5m RUNNING embed2ndarray.py\033[0m "; 3 | python embed2ndarray.py; 4 | echo -e "\033[44;37;5m RUNNING question_and_topic_2id.py\033[0m "; 5 | python question_and_topic_2id.py; 6 | echo -e "\033[44;37;5m RUNNING char2id.py\033[0m "; 7 | python char2id.py; 8 | echo -e "\033[44;37;5m RUNNING word2id.py\033[0m "; 9 | python word2id.py; 10 | echo -e "\033[44;37;5m RUNNING creat_batch_data.py\033[0m "; 11 | python creat_batch_data.py; 12 | echo -e "\033[44;37;5m RUNNING creat_batch_seg.py\033[0m "; 13 | python creat_batch_seg.py; -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/question_and_topic_2id.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import pandas as pd 4 | import pickle 5 | from itertools import chain 6 | 7 | 8 | def question_and_topic_2id(): 9 | """把question和topic转成id形式并保存至 ../data/目录下。""" 10 | print('Changing the quetion and topic to id and save in sr_question2.pkl and sr_topic2id.pkl in ../data/') 11 | df_question_topic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\t', names=['question', 'topics'], 12 | dtype={'question': object, 'topics': object}) 13 | df_question_topic.topics = df_question_topic.topics.apply(lambda tps: tps.split(',')) 14 | save_path = '../data/' 15 | print('questino number = %d ' % len(df_question_topic)) 16 | # 问题 id 按照给出的问题顺序编号 17 | questions = df_question_topic.question.values 18 | sr_question2id = pd.Series(range(len(questions)), index=questions) 19 | sr_id2question = pd.Series(questions, index=range(len(questions))) 20 | # 
topic 按照数量从大到小进行编号 21 | topics = df_question_topic.topics.values 22 | topics = list(chain(*topics)) 23 | sr_topics = pd.Series(topics) 24 | topics_count = sr_topics.value_counts() 25 | topics = topics_count.index 26 | sr_topic2id = pd.Series(range(len(topics)),index=topics) 27 | sr_id2topic = pd.Series(topics, index=range(len(topics))) 28 | 29 | with open(save_path + 'sr_question2id.pkl', 'wb') as outp: 30 | pickle.dump(sr_question2id, outp) 31 | pickle.dump(sr_id2question, outp) 32 | with open(save_path + 'sr_topic2id.pkl', 'wb') as outp: 33 | pickle.dump(sr_topic2id, outp) 34 | pickle.dump(sr_id2topic, outp) 35 | print('Finished changing.') 36 | 37 | 38 | if __name__ == '__main__': 39 | question_and_topic_2id() 40 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/README.md: -------------------------------------------------------------------------------- 1 | ## 数据处理 2 | 3 | 1.把比赛提供的所有数据解压到 raw_data/ 目录下。
4 | 2.按照顺序依次执行各个 .py,不带任何参数。
5 | 或者在当前目录下输入下面命令运行所有文件:
6 | dos2unix run_all_data_process.sh  # 使用 cygwin 工具 dos2unix 将 script 改为 unix 格式
7 | sh run_all_data_process.sh
8 | 3.环境依赖(下面是我使用的版本)
9 | - numpy 1.12.1 10 | - pandas 0.19.2 11 | - word2vec 0.9.1 12 | - tqdm 4.11.2 13 | 14 | 15 | ### embed2ndarray.py 16 | 赛方提供了txt格式的词向量和字向量,这里把embedding矩阵转成 np.ndarray 形式,分别保存为 data/word_embedding.npy 和 data/char_embedding.npy。在赛方提供的词向量基础上,添加 '\' 和 '\' 两个特殊符号。其中 '\' 用于将序列补全到固定长度, '\' 用于替换低频词(字)。 17 | 用 pd.Series 保存词(字)对应 embedding 中的行号(id),存储在 data/sr_word2id.pkl 和 data/sr_char2id.pkl 中。 18 | 19 | ### question_and_topic_2id.py 20 | 把问题和话题转为id形式,保存在 data/sr_question2id.pkl 和 data/sr_id2question.pkl 中。 21 | 22 | ### char2id.py 23 | 利用上面得到的 sr_char2id,把所有问题的字转为对应的id, 存储为 24 | data/ch_train_title.npy 25 | data/ch_train_content.npy 26 | data/ch_eval_title.npy 27 | data/ch_eval_content.npy 28 | 29 | ### word2id.py 30 | 同 char2id.py 31 | 32 | ### creat_batch_data.py 33 | 把所有的数据按照 batch_size(128) 进行打包,固定seed,随机取 10 万样本作为验证集。每个batch存储为一个 npz 文件,包括 X, y 两部分。 34 | 这里所有的序列都进行了截断,长度不足的用0进行padding到固定长度。 35 | 保存位置: 36 | wd_train_path = '../data/wd-data/data_train/' 37 | wd_valid_path = '../data/wd-data/data_valid/' 38 | wd_test_path = '../data/wd-data/data_test/' 39 | ch_train_path = '../data/ch-data/data_train/' 40 | ch_valid_path = '../data/ch-data/data_valid/' 41 | ch_test_path = '../data/ch-data/data_test/' 42 | 43 | 44 | ### creat_batch_seg.py 45 | 和 creat_batch_data.py 相同,只是对 content 部分进行句子划分。用于分层模型。 46 | 划分句子长度: 47 | wd_title_len = 30, wd_sent_len = 30, wd_doc_len = 10.(即content划分为10个句子,每个句子长度为30个词) 48 | ch_title_len = 52, ch_sent_len = 52, ch_doc_len = 10. 49 | 不划分句子: 50 | wd_title_len = 30, wd_content_len = 150. 51 | ch_title_len = 52, ch_content_len = 300. 52 | 53 | 54 | ### To do 55 | - 在数据读取中使用 tfrecord 文件进行数据读取。这样能够随时改变 batch_size, 而且 shuffle 会比使用 numpy 更加均匀。 56 | - 添加序列长度信息。在这里所有的序列都截断或者padding为固定长度,在误差计算中没有处理padding部分,可能会使准确率下降。在使用 dynamic_rnn 的时候加上 sequence_length 信息,在计算的时候忽略 padding 部分。同时结合 tf.train.SequenceExample() 和 tf.train.batch() 自动 padding,也可以减少数据量。 -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/embed2ndarray.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import word2vec 9 | import pickle 10 | import os 11 | 12 | SPECIAL_SYMBOL = ['', ''] # add these special symbols to word(char) embeddings. 
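# Descriptive note: the two placeholders above are the special symbols referred
# to in data_process/README.md -- a padding symbol and a low-frequency (unknown)
# replacement symbol. They are stacked as rows 0 and 1 of the embedding matrix
# below, so id 0 is later used to pad sequences to a fixed length and id 1 is
# what get_id() in word2id.py / char2id.py returns for out-of-vocabulary tokens.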
13 | 14 | 15 | def get_word_embedding(): 16 | """提取词向量,并保存至 ../data/word_embedding.npy""" 17 | print('getting the word_embedding.npy') 18 | wv = word2vec.load('../raw_data/word_embedding.txt') 19 | word_embedding = wv.vectors 20 | words = wv.vocab 21 | n_special_sym = len(SPECIAL_SYMBOL) 22 | sr_id2word = pd.Series(words, index=range(n_special_sym, n_special_sym + len(words))) 23 | sr_word2id = pd.Series(range(n_special_sym, n_special_sym + len(words)), index=words) 24 | # 添加特殊符号::0, :1 25 | embedding_size = 256 26 | vec_special_sym = np.random.randn(n_special_sym, embedding_size) 27 | for i in range(n_special_sym): 28 | sr_id2word[i] = SPECIAL_SYMBOL[i] 29 | sr_word2id[SPECIAL_SYMBOL[i]] = i 30 | word_embedding = np.vstack([vec_special_sym, word_embedding]) 31 | # 保存词向量 32 | save_path = '../data/' 33 | if not os.path.exists(save_path): 34 | os.makedirs(save_path) 35 | np.save(save_path + 'word_embedding.npy', word_embedding) 36 | # 保存词与id的对应关系 37 | with open(save_path + 'sr_word2id.pkl', 'wb') as outp: 38 | pickle.dump(sr_id2word, outp) 39 | pickle.dump(sr_word2id, outp) 40 | print('Saving the word_embedding.npy to ../data/word_embedding.npy') 41 | 42 | 43 | def get_char_embedding(): 44 | """提取字向量,并保存至 ../data/char_embedding.npy""" 45 | print('getting the char_embedding.npy') 46 | wv = word2vec.load('../raw_data/char_embedding.txt') 47 | char_embedding = wv.vectors 48 | chars = wv.vocab 49 | n_special_sym = len(SPECIAL_SYMBOL) 50 | sr_id2char = pd.Series(chars, index=range(n_special_sym, n_special_sym + len(chars))) 51 | sr_char2id = pd.Series(range(n_special_sym, n_special_sym + len(chars)), index=chars) 52 | 53 | # 添加特殊符号::0, :1 54 | embedding_size = 256 55 | 56 | vec_special_sym = np.random.randn(n_special_sym, embedding_size) 57 | for i in range(n_special_sym): 58 | sr_id2char[i] = SPECIAL_SYMBOL[i] 59 | sr_char2id[SPECIAL_SYMBOL[i]] = i 60 | char_embedding = np.vstack([vec_special_sym, char_embedding]) 61 | # 保存字向量 62 | save_path = '../data/' 63 | if not os.path.exists(save_path): 64 | os.makedirs(save_path) 65 | np.save(save_path + 'char_embedding.npy', char_embedding) 66 | # 保存字与id的对应关系 67 | with open(save_path + 'sr_char2id.pkl', 'wb') as outp: 68 | pickle.dump(sr_id2char, outp) 69 | pickle.dump(sr_char2id, outp) 70 | print('Saving the char_embedding.npy to ../data/char_embedding.npy') 71 | 72 | 73 | if __name__ == '__main__': 74 | get_word_embedding() 75 | get_char_embedding() 76 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/word2id.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | from multiprocessing import Pool 10 | from tqdm import tqdm 11 | import time 12 | 13 | save_path = '../data/' 14 | with open(save_path + 'sr_word2id.pkl', 'rb') as inp: 15 | sr_id2word = pickle.load(inp) 16 | sr_word2id = pickle.load(inp) 17 | dict_word2id = dict() 18 | for i in range(len(sr_word2id)): 19 | dict_word2id[sr_word2id.index[i]] = sr_word2id.values[i] 20 | 21 | 22 | def get_id(word): 23 | """获取 word 所对应的 id. 
24 | 如果该词不在词典中,用 (对应的 ID 为 1 )进行替换。 25 | """ 26 | if word not in dict_word2id: 27 | return 1 28 | else: 29 | return dict_word2id[word] 30 | 31 | 32 | def get_id4words(words): 33 | """把 words 转为 对应的 id""" 34 | words = words.strip().split(',') # 先分开词 35 | ids = list(map(get_id, words)) # 获取id 36 | return ids 37 | 38 | 39 | def test_word2id(): 40 | """把测试集的所有词转成对应的id。""" 41 | time0 = time.time() 42 | print('Processing eval data.') 43 | df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 2, 4], 44 | names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object}) 45 | print('test question number %d' % len(df_eval)) 46 | # 没有 title 的问题用 content 来替换 47 | na_title_indexs = list() 48 | for i in range(len(df_eval)): 49 | word_title = df_eval.word_title.values[i] 50 | if type(word_title) is float: 51 | na_title_indexs.append(i) 52 | print('There are %d test questions without title.' % len(na_title_indexs)) 53 | for na_index in na_title_indexs: 54 | df_eval.at[na_index, 'word_title'] = df_eval.at[na_index, 'word_content'] 55 | # 没有 content 的问题用 title 来替换 56 | na_content_indexs = list() 57 | for i in tqdm(range(len(df_eval))): 58 | word_content = df_eval.word_content.values[i] 59 | if type(word_content) is float: 60 | na_content_indexs.append(i) 61 | print('There are %d test questions without content.' % len(na_content_indexs)) 62 | for na_index in tqdm(na_content_indexs): 63 | df_eval.at[na_index, 'word_content'] = df_eval.at[na_index, 'word_title'] 64 | # 转为 id 形式 65 | p = Pool() 66 | eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values)) 67 | np.save('../data/wd_eval_title.npy', eval_title) 68 | eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values)) 69 | np.save('../data/wd_eval_content.npy', eval_content) 70 | p.close() 71 | p.join() 72 | print('Finished changing the eval words to ids. Costed time %g s' % (time.time() - time0)) 73 | 74 | 75 | def train_word2id(): 76 | """把训练集的所有词转成对应的id。""" 77 | time0 = time.time() 78 | print('Processing train data.') 79 | df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 2, 4], 80 | names=['question_id', 'word_title', 'word_content'], dtype={'question_id': object}) 81 | print('training question number %d ' % len(df_train)) 82 | # 没有 content 的问题用 title 来替换 83 | na_content_indexs = list() 84 | for i in tqdm(range(len(df_train))): 85 | word_content = df_train.word_content.values[i] 86 | if type(word_content) is float: 87 | na_content_indexs.append(i) 88 | print('There are %d train questions without content.' % len(na_content_indexs)) 89 | for na_index in tqdm(na_content_indexs): 90 | df_train.at[na_index, 'word_content'] = df_train.at[na_index, 'word_title'] 91 | # 没有 title 的问题, 丢弃 92 | na_title_indexs = list() 93 | for i in range(len(df_train)): 94 | word_title = df_train.word_title.values[i] 95 | if type(word_title) is float: 96 | na_title_indexs.append(i) 97 | print('There are %d train questions without title.' 
% len(na_title_indexs)) 98 | df_train = df_train.drop(na_title_indexs) 99 | print('After dropping, training question number(should be 2999952) = %d' % len(df_train)) 100 | # 转为 id 形式 101 | p = Pool() 102 | train_title = np.asarray(p.map(get_id4words, df_train.word_title.values)) 103 | np.save('../data/wd_train_title.npy', train_title) 104 | train_content = np.asarray(p.map(get_id4words, df_train.word_content.values)) 105 | np.save('../data/wd_train_content.npy', train_content) 106 | p.close() 107 | p.join() 108 | print('Finished changing the training words to ids. Costed time %g s' % (time.time() - time0)) 109 | 110 | 111 | if __name__ == '__main__': 112 | test_word2id() 113 | train_word2id() 114 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_4_han/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/seg_valid/' 30 | data_test_path = '../../data/wd-data/seg_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | 
print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.HAN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_2_hcnn/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/seg_valid/' 30 | data_test_path = '../../data/wd-data/seg_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | 
"""get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.HCNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_3_bigru/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 
| sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(range(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(range(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - 
time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.BiGRU(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_6_rcnn/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid 
p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.RCNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_1_cnn_concat/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a 
batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = map(lambda label: label.argsort()[-1:-6:-1], predict_labels) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.TextCNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_2_cnn_max/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | 
sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - 
time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.TextCNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_5_bigru_cnn/predict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import time 11 | import network 12 | 13 | sys.path.append('../..') 14 | from evaluator import score_eval 15 | 16 | settings = network.Settings() 17 | title_len = settings.title_len 18 | model_name = settings.model_name 19 | ckpt_path = settings.ckpt_path 20 | 21 | local_scores_path = '../../local_scores/' 22 | scores_path = '../../scores/' 23 | if not os.path.exists(local_scores_path): 24 | os.makedirs(local_scores_path) 25 | if not os.path.exists(scores_path): 26 | os.makedirs(scores_path) 27 | 28 | embedding_path = '../../data/word_embedding.npy' 29 | data_valid_path = '../../data/wd-data/data_valid/' 30 | data_test_path = '../../data/wd-data/data_test/' 31 | va_batches = os.listdir(data_valid_path) 32 | te_batches = os.listdir(data_test_path) # batch 文件名列表 33 | n_va_batches = len(va_batches) 34 | n_te_batches = len(te_batches) 35 | 36 | 37 | def get_batch(batch_id): 38 | """get a batch from valid data""" 39 | new_batch = np.load(data_valid_path + str(batch_id) + '.npz') 40 | X_batch = new_batch['X'] 41 | y_batch = new_batch['y'] 42 | X1_batch = X_batch[:, :title_len] 43 | X2_batch = X_batch[:, title_len:] 44 | return [X1_batch, X2_batch, y_batch] 45 | 46 | 47 | def get_test_batch(batch_id): 48 | """get a batch from test data""" 49 | X_batch = np.load(data_test_path + str(batch_id) + '.npy') 50 | X1_batch = X_batch[:, :title_len] 51 | X2_batch = X_batch[:, title_len:] 52 | return [X1_batch, X2_batch] 53 | 54 | 55 | def local_predict(sess, model): 56 | """Test on the valid data.""" 57 | time0 = time.time() 58 | predict_labels_list = list() # 所有的预测结果 59 | marked_labels_list = list() 60 | predict_scores = list() 61 | for i in tqdm(xrange(n_va_batches)): 62 | [X1_batch, X2_batch, y_batch] = get_batch(i) 63 | marked_labels_list.extend(y_batch) 64 | _batch_size = len(X1_batch) 65 | fetches = [model.y_pred] 66 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 67 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 68 | predict_labels = sess.run(fetches, feed_dict)[0] 69 | predict_scores.append(predict_labels) 70 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 71 | predict_labels_list.extend(predict_labels) 72 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 73 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 74 | print('Local 
valid p=%g, r=%g, f1=%g' % (precision, recall, f1)) 75 | predict_scores = np.vstack(np.asarray(predict_scores)) 76 | local_scores_name = local_scores_path + model_name + '.npy' 77 | np.save(local_scores_name, predict_scores) 78 | print('local_scores.shape=', predict_scores.shape) 79 | print('Writed the scores into %s, time %g s' % (local_scores_name, time.time() - time0)) 80 | 81 | 82 | def predict(sess, model): 83 | """Test on the test data.""" 84 | time0 = time.time() 85 | predict_scores = list() 86 | for i in tqdm(xrange(n_te_batches)): 87 | [X1_batch, X2_batch] = get_test_batch(i) 88 | _batch_size = len(X1_batch) 89 | fetches = [model.y_pred] 90 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, 91 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 92 | predict_labels = sess.run(fetches, feed_dict)[0] 93 | predict_scores.append(predict_labels) 94 | predict_scores = np.vstack(np.asarray(predict_scores)) 95 | scores_name = scores_path + model_name + '.npy' 96 | np.save(scores_name, predict_scores) 97 | print('scores.shape=', predict_scores.shape) 98 | print('Writed the scores into %s, time %g s' % (scores_name, time.time() - time0)) 99 | 100 | 101 | def main(_): 102 | if not os.path.exists(ckpt_path + 'checkpoint'): 103 | print('there is not saved model, please check the ckpt path') 104 | exit() 105 | print('Loading model...') 106 | W_embedding = np.load(embedding_path) 107 | config = tf.ConfigProto() 108 | config.gpu_options.allow_growth = True 109 | with tf.Session(config=config) as sess: 110 | model = network.BiGRU_CNN(W_embedding, settings) 111 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 112 | print('Local predicting...') 113 | local_predict(sess, model) 114 | print('Test predicting...') 115 | predict(sess, model) 116 | 117 | 118 | if __name__ == '__main__': 119 | tf.app.run() 120 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/char2id.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | from multiprocessing import Pool 10 | from tqdm import tqdm 11 | import time 12 | 13 | 14 | save_path = '../data/' 15 | with open(save_path + 'sr_char2id.pkl', 'rb') as inp: 16 | sr_id2char = pickle.load(inp) 17 | sr_char2id = pickle.load(inp) 18 | dict_char2id = dict() 19 | for i in range(len(sr_char2id)): 20 | dict_char2id[sr_char2id.index[i]] = sr_char2id.values[i] 21 | 22 | 23 | def get_id(char): 24 | """获取 char 所对应的 id. 
25 | 如果该字不在字典中,用1进行替换。 26 | """ 27 | if char not in dict_char2id: 28 | return 1 29 | else: 30 | return dict_char2id[char] 31 | 32 | 33 | def get_id4chars(chars): 34 | """把 chars 转为 对应的 id""" 35 | chars = chars.strip().split(',') # 先分开字 36 | ids = list(map(get_id, chars)) # 获取id 37 | return ids 38 | 39 | 40 | def test_char2id(): 41 | """把测试集的所有字转成对应的id。""" 42 | time0 = time.time() 43 | print('Processing eval data.') 44 | df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t', usecols=[0, 1, 3], 45 | names=['question_id', 'char_title', 'char_content'], dtype={'question_id': object}) 46 | print('test question number %d' % len(df_eval)) 47 | # 没有 title 的问题用 content 来替换 48 | na_title_indexs = list() 49 | for i in range(len(df_eval)): 50 | char_title = df_eval.char_title.values[i] 51 | if type(char_title) is float: 52 | na_title_indexs.append(i) 53 | print('There are %d test questions without title.' % len(na_title_indexs)) 54 | for na_index in na_title_indexs: 55 | df_eval.at[na_index, 'char_title'] = df_eval.at[na_index, 'char_content'] 56 | # 没有 content 的问题用 title 来替换 57 | na_content_indexs = list() 58 | for i in tqdm(range(len(df_eval))): 59 | char_content = df_eval.char_content.values[i] 60 | if type(char_content) is float: 61 | na_content_indexs.append(i) 62 | print('There are %d test questions without content.' % len(na_content_indexs)) 63 | for na_index in tqdm(na_content_indexs): 64 | df_eval.at[na_index, 'char_content'] = df_eval.at[na_index, 'char_title'] 65 | # 转为 id 形式 66 | p = Pool() 67 | eval_title = np.asarray(p.map(get_id4chars, df_eval.char_title.values)) 68 | np.save('../data/ch_eval_title.npy', eval_title) 69 | eval_content = np.asarray(p.map(get_id4chars, df_eval.char_content.values)) 70 | np.save('../data/ch_eval_content.npy', eval_content) 71 | p.close() 72 | p.join() 73 | print('Finished changing the eval chars to ids. Costed time %g s' % (time.time()-time0)) 74 | 75 | 76 | def train_char2id(): 77 | """把训练集的所有字转成对应的id。""" 78 | time0 = time.time() 79 | print('Processing train data.') 80 | df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t', usecols=[0, 1, 3], 81 | names=['question_id', 'char_title', 'char_content'], dtype={'question_id': object}) 82 | print('training question number %d ' % len(df_train)) 83 | # 没有 content 的问题用 title 来替换 84 | na_content_indexs = list() 85 | for i in tqdm(range(len(df_train))): 86 | char_content = df_train.char_content.values[i] 87 | if type(char_content) is float: 88 | na_content_indexs.append(i) 89 | print('There are %d train questions without content.' % len(na_content_indexs)) 90 | for na_index in tqdm(na_content_indexs): 91 | df_train.at[na_index, 'char_content'] = df_train.at[na_index, 'char_title'] 92 | # 没有 title 的问题, 与词一样丢弃下面样本 93 | na_title_indexs = [328877, 422123, 633584, 768738, 818616, 876828, 1273673, 1527297, 94 | 1636237, 1682969, 2052477, 2628516, 2657464, 2904162, 2993517] 95 | for i in range(len(df_train)): 96 | char_title = df_train.char_title.values[i] 97 | if type(char_title) is float: 98 | na_title_indexs.append(i) 99 | print('There are %d train questions without title.' 
% len(na_title_indexs)) 100 | df_train = df_train.drop(na_title_indexs) 101 | print('After dropping, training question number(should be 2999952) = %d' % len(df_train)) 102 | # 转为 id 形式 103 | p = Pool() 104 | train_title = np.asarray(list(p.map(get_id4chars, df_train.char_title.values))) 105 | np.save('../data/ch_train_title.npy', train_title) 106 | train_content = np.asarray(p.map(get_id4chars, df_train.char_content.values)) 107 | np.save('../data/ch_train_content.npy', train_content) 108 | p.close() 109 | p.join() 110 | print('Finished changing the training chars to ids. Costed time %g s' % (time.time() - time0)) 111 | 112 | 113 | if __name__ == '__main__': 114 | test_char2id() 115 | train_char2id() 116 | 117 | 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/creat_batch_seg.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | from multiprocessing import Pool 8 | import sys 9 | import os 10 | 11 | sys.path.append('../') 12 | from data_helpers import pad_X30 13 | from data_helpers import pad_X52 14 | from data_helpers import wd_pad_cut_docs 15 | from data_helpers import ch_pad_cut_docs 16 | from data_helpers import train_batch 17 | from data_helpers import eval_batch 18 | 19 | 20 | wd_train_path = '../data/wd-data/seg_train/' 21 | wd_valid_path = '../data/wd-data/seg_valid/' 22 | wd_test_path = '../data/wd-data/seg_test/' 23 | ch_train_path = '../data/ch-data/seg_train/' 24 | ch_valid_path = '../data/ch-data/seg_valid/' 25 | ch_test_path = '../data/ch-data/seg_test/' 26 | paths = [wd_train_path, wd_valid_path, wd_test_path, 27 | ch_train_path, ch_valid_path, ch_test_path] 28 | for each in paths: 29 | if not os.path.exists(each): 30 | os.makedirs(each) 31 | 32 | 33 | # word 数据打包 34 | def wd_train_get_batch(title_len=30, batch_size=128): 35 | print('loading word train_title and train_content, this should cost minutes, please wait.') 36 | train_title = np.load('../data/wd_train_title.npy') 37 | train_content = np.load('../data/wd_train_content.npy') 38 | p = Pool(6) 39 | X_title = np.asarray(p.map(pad_X30, train_title)) 40 | X_content = np.asarray(p.map(wd_pad_cut_docs, train_content)) 41 | p.close() 42 | p.join() 43 | X_content.shape = [-1, 30*10] 44 | X = np.hstack([X_title, X_content]) 45 | y = np.load('../data/y_tr.npy') 46 | # 划分验证集 47 | sample_num = X.shape[0] 48 | np.random.seed(13) 49 | valid_num = 100000 50 | new_index = np.random.permutation(sample_num) 51 | X = X[new_index] 52 | y = y[new_index] 53 | X_valid = X[:valid_num] 54 | y_valid = y[:valid_num] 55 | X_train = X[valid_num:] 56 | y_train = y[valid_num:] 57 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape) 58 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape) 59 | # 验证集打 batch 60 | print('creating batch data.') 61 | sample_num = len(X_valid) 62 | print('valid_sample_num=%d' % sample_num) 63 | train_batch(X_valid, y_valid, wd_valid_path, batch_size) 64 | # 训练集打 batch 65 | sample_num = len(X_train) 66 | print('train_sample_num=%d' % sample_num) 67 | train_batch(X_train, y_train, wd_train_path, batch_size) 68 | 69 | 70 | def wd_test_get_batch(title_len=30, batch_size=128): 71 | print('loading word eval_title and eval_content.') 72 | eval_title = np.load('../data/wd_eval_title.npy') 73 | eval_content = 
np.load('../data/wd_eval_content.npy') 74 | p = Pool(6) 75 | X_title = np.asarray(p.map(pad_X30, eval_title)) 76 | X_content = np.asarray(p.map(wd_pad_cut_docs, eval_content)) 77 | p.close() 78 | p.join() 79 | X_content.shape = [-1, 30*10] 80 | X = np.hstack([X_title, X_content]) 81 | sample_num = len(X) 82 | print('eval_sample_num=%d' % sample_num) 83 | eval_batch(X, wd_test_path, batch_size) 84 | 85 | 86 | # char 数据打包 87 | def ch_train_get_batch(title_len=52, batch_size=128): 88 | print('loading char train_title and train_content, this should cost minutes, please wait.') 89 | train_title = np.load('../data/ch_train_title.npy') 90 | train_content = np.load('../data/ch_train_content.npy') 91 | p = Pool(8) 92 | X_title = np.asarray(p.map(pad_X52, train_title)) 93 | X_content = np.asarray(p.map(ch_pad_cut_docs, train_content)) 94 | p.close() 95 | p.join() 96 | X_content.shape = [-1, 52*10] 97 | X = np.hstack([X_title, X_content]) 98 | y = np.load('../data/y_tr.npy') 99 | # 划分验证集 100 | sample_num = X.shape[0] 101 | np.random.seed(13) 102 | valid_num = 100000 103 | new_index = np.random.permutation(sample_num) 104 | X = X[new_index] 105 | y = y[new_index] 106 | X_valid = X[:valid_num] 107 | y_valid = y[:valid_num] 108 | X_train = X[valid_num:] 109 | y_train = y[valid_num:] 110 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape) 111 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape) 112 | # 验证集打batch 113 | print('creating batch data.') 114 | sample_num = len(X_valid) 115 | print('valid_sample_num=%d' % sample_num) 116 | train_batch(X_valid, y_valid, ch_valid_path, batch_size) 117 | # 训练集打batch 118 | sample_num = len(X_train) 119 | print('train_sample_num=%d' % sample_num) 120 | train_batch(X_train, y_train, ch_train_path, batch_size) 121 | 122 | 123 | def ch_test_get_batch(title_len=52, batch_size=128): 124 | print('loading char eval_title and eval_content.') 125 | eval_title = np.load('../data/ch_eval_title.npy') 126 | eval_content = np.load('../data/ch_eval_content.npy') 127 | p = Pool() 128 | X_title = np.asarray(p.map(pad_X52, eval_title)) 129 | X_content = np.asarray(p.map(ch_pad_cut_docs, eval_content)) 130 | p.close() 131 | p.join() 132 | X_content.shape = [-1, 52*10] 133 | X = np.hstack([X_title, X_content]) 134 | sample_num = len(X) 135 | print('eval_sample_num=%d' % sample_num) 136 | eval_batch(X, ch_test_path, batch_size) 137 | 138 | 139 | if __name__ == '__main__': 140 | wd_train_get_batch() 141 | wd_test_get_batch() 142 | ch_train_get_batch() 143 | ch_test_get_batch() 144 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/data_process/creat_batch_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import pickle 9 | from multiprocessing import Pool 10 | import sys 11 | import os 12 | 13 | sys.path.append('../') 14 | from data_helpers import pad_X30 15 | from data_helpers import pad_X150 16 | from data_helpers import pad_X52 17 | from data_helpers import pad_X300 18 | from data_helpers import train_batch 19 | from data_helpers import eval_batch 20 | 21 | """ 把所有的数据按照 batch_size(128) 进行打包。取 10万 样本作为验证集。 22 | word_title_len = 30. 23 | word_content_len = 150. 24 | char_title_len = 52. 25 | char_content_len = 300. 
26 | """ 27 | 28 | 29 | wd_train_path = '../data/wd-data/data_train/' 30 | wd_valid_path = '../data/wd-data/data_valid/' 31 | wd_test_path = '../data/wd-data/data_test/' 32 | ch_train_path = '../data/ch-data/data_train/' 33 | ch_valid_path = '../data/ch-data/data_valid/' 34 | ch_test_path = '../data/ch-data/data_test/' 35 | paths = [wd_train_path, wd_valid_path, wd_test_path, 36 | ch_train_path, ch_valid_path, ch_test_path] 37 | for each in paths: 38 | if not os.path.exists(each): 39 | os.makedirs(each) 40 | 41 | with open('../data/sr_topic2id.pkl', 'rb') as inp: 42 | sr_topic2id = pickle.load(inp) 43 | 44 | dict_topic2id = dict() 45 | for i in range(len(sr_topic2id)): 46 | dict_topic2id[sr_topic2id.index[i]] = sr_topic2id.values[i] 47 | 48 | 49 | def topics2ids(topics): 50 | """把 chars 转为 对应的 id""" 51 | topics = topics.split(',') 52 | ids = list(map(lambda topic: dict_topic2id[topic], topics)) # 获取id 53 | return ids 54 | 55 | 56 | def get_lables(): 57 | """获取训练集所有样本的标签。注意之前在处理数据时丢弃了部分没有 title 的样本。""" 58 | df_question_topic = pd.read_csv('../raw_data/question_topic_train_set.txt', sep='\t', 59 | names=['questions', 'topics'], dtype={'questions': object, 'topics': object}) 60 | na_title_indexs = [328877, 422123, 633584, 768738, 818616, 876828, 1273673, 1527297, 61 | 1636237, 1682969, 2052477, 2628516, 2657464, 2904162, 2993517] 62 | df_question_topic = df_question_topic.drop(na_title_indexs) 63 | p = Pool() 64 | y = p.map(topics2ids, df_question_topic.topics.values) 65 | p.close() 66 | p.join() 67 | return np.asarray(y) 68 | 69 | 70 | # word 数据打包 71 | def wd_train_get_batch(title_len=30, content_len=150, batch_size=128): 72 | print('loading word train_title and train_content.') 73 | train_title = np.load('../data/wd_train_title.npy') 74 | train_content = np.load('../data/wd_train_content.npy') 75 | p = Pool() 76 | X_title = np.asarray(p.map(pad_X30, train_title)) 77 | X_content = np.asarray(p.map(pad_X150, train_content)) 78 | p.close() 79 | p.join() 80 | X = np.hstack([X_title, X_content]) 81 | print('getting labels, this should cost minutes, please wait.') 82 | y = get_lables() 83 | print('y.shape=', y.shape) 84 | np.save('../data/y_tr.npy', y) 85 | # 划分验证集 86 | sample_num = X.shape[0] 87 | np.random.seed(13) 88 | valid_num = 100000 89 | new_index = np.random.permutation(sample_num) 90 | X = X[new_index] 91 | y = y[new_index] 92 | X_valid = X[:valid_num] 93 | y_valid = y[:valid_num] 94 | X_train = X[valid_num:] 95 | y_train = y[valid_num:] 96 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape) 97 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape) 98 | print('creating batch data.') 99 | # 验证集打batch 100 | sample_num = len(X_valid) 101 | print('valid_sample_num=%d' % sample_num) 102 | train_batch(X_valid, y_valid, wd_valid_path, batch_size) 103 | # 训练集打batch 104 | sample_num = len(X_train) 105 | print('train_sample_num=%d' % sample_num) 106 | train_batch(X_train, y_train, wd_train_path, batch_size) 107 | 108 | 109 | def wd_test_get_batch(title_len=30, content_len=150, batch_size=128): 110 | eval_title = np.load('../data/wd_eval_title.npy') 111 | eval_content = np.load('../data/wd_eval_content.npy') 112 | p = Pool() 113 | X_title = np.asarray(p.map(pad_X30, eval_title)) 114 | X_content = np.asarray(p.map(pad_X150, eval_content)) 115 | p.close() 116 | p.join() 117 | X = np.hstack([X_title, X_content]) 118 | sample_num = len(X) 119 | print('eval_sample_num=%d' % sample_num) 120 | eval_batch(X, wd_test_path, batch_size) 121 | 122 | 123 | # char 数据打包 
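# Note: pad_X30 / pad_X52 / pad_X150 / pad_X300 used above and below are imported from
# data_helpers.py, which is not included in this dump. Judging from how they are used
# (each variable-length id sequence must become one fixed-length row before np.hstack),
# such a helper is assumed to behave roughly like the minimal sketch below; the pad id 0,
# the helper name and the truncation policy are assumptions, not facts from the original code.
def _pad_to_fixed_len(word_ids, fixed_len=30, pad_id=0):
    """Minimal sketch: truncate or right-pad `word_ids` to exactly `fixed_len` entries."""
    word_ids = list(word_ids)[:fixed_len]                   # keep at most fixed_len ids
    word_ids += [pad_id] * (fixed_len - len(word_ids))      # right-pad the remainder with pad_id
    return word_ids
# e.g. _pad_to_fixed_len([5, 8, 2], fixed_len=5) -> [5, 8, 2, 0, 0]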
124 | def ch_train_get_batch(title_len=52, content_len=300, batch_size=128): 125 | print('loading char train_title and train_content.') 126 | train_title = np.load('../data/ch_train_title.npy') 127 | train_content = np.load('../data/ch_train_content.npy') 128 | p = Pool() 129 | X_title = np.asarray(p.map(pad_X52, train_title)) 130 | X_content = np.asarray(p.map(pad_X300, train_content)) 131 | p.close() 132 | p.join() 133 | X = np.hstack([X_title, X_content]) 134 | y = np.load('../data/y_tr.npy') 135 | # 划分验证集 136 | sample_num = X.shape[0] 137 | np.random.seed(13) 138 | valid_num = 100000 139 | new_index = np.random.permutation(sample_num) 140 | X = X[new_index] 141 | y = y[new_index] 142 | X_valid = X[:valid_num] 143 | y_valid = y[:valid_num] 144 | X_train = X[valid_num:] 145 | y_train = y[valid_num:] 146 | print('X_train.shape=', X_train.shape, 'y_train.shape=', y_train.shape) 147 | print('X_valid.shape=', X_valid.shape, 'y_valid.shape=', y_valid.shape) 148 | # 验证集打batch 149 | print('creating batch data.') 150 | sample_num = len(X_valid) 151 | print('valid_sample_num=%d' % sample_num) 152 | train_batch(X_valid, y_valid, ch_valid_path, batch_size) 153 | # 训练集打batch 154 | sample_num = len(X_train) 155 | print('train_sample_num=%d' % sample_num) 156 | train_batch(X_train, y_train, ch_train_path, batch_size) 157 | 158 | 159 | def ch_test_get_batch(title_len=52, content_len=300, batch_size=128): 160 | eval_title = np.load('../data/ch_eval_title.npy') 161 | eval_content = np.load('../data/ch_eval_content.npy') 162 | p = Pool() 163 | X_title = np.asarray(p.map(pad_X52, eval_title)) 164 | X_content = np.asarray(p.map(pad_X300, eval_content)) 165 | p.close() 166 | p.join() 167 | X = np.hstack([X_title, X_content]) 168 | sample_num = len(X) 169 | print('eval_sample_num=%d' % sample_num) 170 | eval_batch(X, ch_test_path, batch_size) 171 | 172 | 173 | if __name__ == '__main__': 174 | wd_train_get_batch() 175 | wd_test_get_batch() 176 | ch_train_get_batch() 177 | ch_test_get_batch() 178 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_1_cnn_concat/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3') 23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.40, 'if valid_f1 > last_f1, save new model. 
default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/data_train/' 46 | data_valid_path = '../../data/wd-data/data_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, 
model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == (global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.TextCNN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 
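            # What the two scopes below do: Optimizer1 minimises the loss over the full
            # trainable-variable list, so the word-embedding matrix is fine-tuned as well;
            # Optimizer2 filters out every variable whose name contains 'embedding', which
            # keeps the pre-trained word vectors frozen. main() trains with Optimizer2 first
            # and switches to Optimizer1 once `max_epoch` epochs have passed, which is what
            # the max_epoch flag ("update the embedding after max_epoch") refers to.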
157 |             with tf.variable_scope('Optimizer1'):
158 |                 tvars1 = tf.trainable_variables()
159 |                 train_op1 = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(model.loss, global_step=model.global_step, var_list=tvars1)
160 | 
161 |             with tf.variable_scope('Optimizer2'):
162 |                 tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name]
163 |                 train_op2 = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(model.loss, global_step=model.global_step, var_list=tvars2)
164 | 
165 |             update_op = tf.group(*model.update_emas)
166 |             merged = tf.summary.merge_all()  # summary
167 |             train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph)
168 |             test_writer = tf.summary.FileWriter(summary_path + 'test')
169 |             training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')]
170 | 
171 |         # If a model has been saved before, restore the latest checkpoint
172 |         if os.path.exists(ckpt_path + "checkpoint"):
173 |             print("Restoring Variables from Checkpoint...")
174 |             model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
175 |             last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model)
176 |             print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1))
177 |             sess.run(tf.variables_initializer(training_ops))
178 |         else:
179 |             print('Initializing Variables...')
180 |             sess.run(tf.global_variables_initializer())
181 | 
182 |         print('3.Begin training...')
183 |         print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch))
184 |         for epoch in range(FLAGS.max_max_epoch):
185 |             global_step = sess.run(model.global_step)
186 |             print('Global step %d, lr=%g' % (global_step, sess.run(learning_rate)))
187 |             if epoch >= FLAGS.max_epoch:  # fine-tune the embedding from this epoch on
188 |                 train_op = train_op1
189 |             else:
190 |                 train_op = train_op2
191 | 
192 |             train_fetches = [merged, model.loss, train_op, update_op]
193 |             valid_fetches = [merged, model.loss]
194 |             train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer)
195 |         # Run one final validation pass
196 |         valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model)
197 |         print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % (
198 |             sess.run(model.global_step), valid_cost, precision, recall, f1))
199 |         if f1 > last_f1:  # save the better model
200 |             saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1)
201 |             print('saved new model to %s ' % saving_path)
202 | 
203 | 
204 | if __name__ == '__main__':
205 |     tf.app.run()
206 | 
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_1_1_cnn_concat/network.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | import tensorflow as tf
4 | 
5 | """wd_1_1_cnn_concat
6 | The title part uses a TextCNN; the content part uses a TextCNN; the two outputs are concatenated directly.
7 | """
8 | 
9 | 
10 | class Settings(object):
11 |     def __init__(self):
12 |         self.model_name = 'wd_1_1_cnn_concat'
13 |         self.title_len = 30
14 |         self.content_len = 150
15 |         self.filter_sizes = [2, 3, 4, 5, 7]
16 |         self.n_filter = 256
17 |         self.fc_hidden_size = 1024
18 |         self.n_class = 1999
19 |         self.summary_path = '../../summary/' + self.model_name + '/'
20 |         self.ckpt_path = '../../ckpt/' + self.model_name + '/'
21 | 
22 | 
23 | class TextCNN(object):
24 |     """
25 |     title: inputs->textcnn->output_title
26 |     content: inputs->textcnn->output_content
27 |     concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy.
28 | """ 29 | 30 | def __init__(self, W_embedding, settings): 31 | self.model_name = settings.model_name 32 | self.title_len = settings.title_len 33 | self.content_len = settings.content_len 34 | self.filter_sizes = settings.filter_sizes 35 | self.n_filter = settings.n_filter 36 | self.n_filter_total = self.n_filter * len(self.filter_sizes) 37 | self.n_class = settings.n_class 38 | self.fc_hidden_size = settings.fc_hidden_size 39 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 40 | self.update_emas = list() 41 | # placeholders 42 | self._tst = tf.placeholder(tf.bool) 43 | self._keep_prob = tf.placeholder(tf.float32, []) 44 | self._batch_size = tf.placeholder(tf.int32, []) 45 | 46 | with tf.name_scope('Inputs'): 47 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 48 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 49 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 50 | 51 | with tf.variable_scope('embedding'): 52 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 53 | initializer=tf.constant_initializer(W_embedding), trainable=True) 54 | self.embedding_size = W_embedding.shape[1] 55 | 56 | with tf.variable_scope('cnn_text'): 57 | output_title = self.cnn_inference(self._X1_inputs, self.title_len) 58 | 59 | with tf.variable_scope('hcnn_content'): 60 | output_content = self.cnn_inference(self._X2_inputs, self.content_len) 61 | 62 | with tf.variable_scope('fc-bn-layer'): 63 | output = tf.concat([output_title, output_content], axis=1) 64 | W_fc = self.weight_variable([self.n_filter_total * 2, self.fc_hidden_size], name='Weight_fc') 65 | tf.summary.histogram('W_fc', W_fc) 66 | h_fc = tf.matmul(output, W_fc, name='h_fc') 67 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 68 | tf.summary.histogram('beta_fc', beta_fc) 69 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 70 | self.update_emas.append(update_ema_fc) 71 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 72 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 73 | 74 | with tf.variable_scope('out_layer'): 75 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 76 | tf.summary.histogram('Weight_out', W_out) 77 | b_out = self.bias_variable([self.n_class], name='bias_out') 78 | tf.summary.histogram('bias_out', b_out) 79 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 80 | 81 | with tf.name_scope('loss'): 82 | self._loss = tf.reduce_mean( 83 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 84 | tf.summary.scalar('loss', self._loss) 85 | 86 | self.saver = tf.train.Saver(max_to_keep=2) 87 | 88 | @property 89 | def tst(self): 90 | return self._tst 91 | 92 | @property 93 | def keep_prob(self): 94 | return self._keep_prob 95 | 96 | @property 97 | def batch_size(self): 98 | return self._batch_size 99 | 100 | @property 101 | def global_step(self): 102 | return self._global_step 103 | 104 | @property 105 | def X1_inputs(self): 106 | return self._X1_inputs 107 | 108 | @property 109 | def X2_inputs(self): 110 | return self._X2_inputs 111 | 112 | @property 113 | def y_inputs(self): 114 | return self._y_inputs 115 | 116 | @property 117 | def y_pred(self): 118 | return self._y_pred 119 | 120 | @property 121 | def loss(self): 122 | return self._loss 123 | 124 | def weight_variable(self, shape, 
name): 125 | """Create a weight variable with appropriate initialization.""" 126 | initial = tf.truncated_normal(shape, stddev=0.1) 127 | return tf.Variable(initial, name=name) 128 | 129 | def bias_variable(self, shape, name): 130 | """Create a bias variable with appropriate initialization.""" 131 | initial = tf.constant(0.1, shape=shape) 132 | return tf.Variable(initial, name=name) 133 | 134 | def batchnorm(self, Ylogits, offset, convolutional=False): 135 | """batchnormalization. 136 | Args: 137 | Ylogits: 1D向量或者是3D的卷积结果。 138 | num_updates: 迭代的global_step 139 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 140 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 141 | m: 表示batch均值;v:表示batch方差。 142 | bnepsilon:一个很小的浮点数,防止除以 0. 143 | Returns: 144 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 145 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 146 | """ 147 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, 148 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations 149 | bnepsilon = 1e-5 150 | if convolutional: 151 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 152 | else: 153 | mean, variance = tf.nn.moments(Ylogits, [0]) 154 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 155 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 156 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 157 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 158 | return Ybn, update_moving_everages 159 | 160 | def cnn_inference(self, X_inputs, n_step): 161 | """TextCNN 模型。 162 | Args: 163 | X_inputs: tensor.shape=(batch_size, n_step) 164 | Returns: 165 | title_outputs: tensor.shape=(batch_size, self.n_filter_total) 166 | """ 167 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 168 | inputs = tf.expand_dims(inputs, -1) 169 | pooled_outputs = list() 170 | for i, filter_size in enumerate(self.filter_sizes): 171 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 172 | # Convolution Layer 173 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter] 174 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter') 175 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter') 176 | tf.summary.histogram('beta', beta) 177 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 178 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) # 在激活层前面加 BN 179 | # Apply nonlinearity, batch norm scaling is not useful with relus 180 | # batch norm offsets are used instead of biases,使用 BN 层的 offset,不要 biases 181 | h = tf.nn.relu(conv_bn, name="relu") 182 | # Maxpooling over the outputs 183 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 184 | strides=[1, 1, 1, 1], padding='VALID', name="pool") 185 | pooled_outputs.append(pooled) 186 | self.update_emas.append(update_ema) 187 | h_pool = tf.concat(pooled_outputs, 3) 188 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total]) 189 | return h_pool_flat # shape = [batch_size, self.n_filter_total] 190 | 191 | 192 | # test the model 193 | # def test(): 194 | # import numpy as np 195 | # print('Begin testing...') 196 | # settings = Settings() 197 | # W_embedding = np.random.randn(50, 10) 198 | # config = tf.ConfigProto() 199 | # config.gpu_options.allow_growth = True 200 | # batch_size = 128 201 | # with tf.Session(config=config) as sess: 202 | # model = TextCNN(W_embedding, settings) 203 | 
# optimizer = tf.train.AdamOptimizer(0.001) 204 | # train_op = optimizer.minimize(model.loss) 205 | # update_op = tf.group(*model.update_emas) 206 | # sess.run(tf.global_variables_initializer()) 207 | # fetch = [model.loss, model.y_pred, train_op, update_op] 208 | # loss_list = list() 209 | # for i in xrange(100): 210 | # X1_batch = np.zeros((batch_size, 30), dtype=float) 211 | # X2_batch = np.zeros((batch_size, 150), dtype=float) 212 | # y_batch = np.zeros((batch_size, 1999), dtype=int) 213 | # _batch_size = len(y_batch) 214 | # feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 215 | # model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 216 | # loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 217 | # loss_list.append(loss) 218 | # print(i, loss) 219 | # 220 | # if __name__ == '__main__': 221 | # test() 222 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_2_cnn_max/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | """wd_1_2_cnn_max 6 | title 部分使用 TextCNN;content 部分使用 TextCNN; 两部分输出按位取 max。 7 | """ 8 | 9 | 10 | class Settings(object): 11 | def __init__(self): 12 | self.model_name = 'wd_1_2_cnn_max' 13 | self.title_len = 30 14 | self.content_len = 150 15 | self.filter_sizes = [2, 3, 4, 5, 7] 16 | self.n_filter = 256 17 | self.fc_hidden_size = 1024 18 | self.n_class = 1999 19 | self.summary_path = '../../summary/' + self.model_name + '/' 20 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 21 | 22 | 23 | class TextCNN(object): 24 | """ 25 | title: inputs->textcnn->output_title 26 | content: inputs->textcnn->output_content 27 | max[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 
28 | """ 29 | 30 | def __init__(self, W_embedding, settings): 31 | self.model_name = settings.model_name 32 | self.title_len = settings.title_len 33 | self.content_len = settings.content_len 34 | self.filter_sizes = settings.filter_sizes 35 | self.n_filter = settings.n_filter 36 | self.n_filter_total = self.n_filter * len(self.filter_sizes) 37 | self.n_class = settings.n_class 38 | self.fc_hidden_size = settings.fc_hidden_size 39 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 40 | self.update_emas = list() 41 | # placeholders 42 | self._tst = tf.placeholder(tf.bool) 43 | self._keep_prob = tf.placeholder(tf.float32, []) 44 | self._batch_size = tf.placeholder(tf.int32, []) 45 | 46 | with tf.name_scope('Inputs'): 47 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 48 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 49 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 50 | 51 | with tf.variable_scope('embedding'): 52 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 53 | initializer=tf.constant_initializer(W_embedding), trainable=True) 54 | self.embedding_size = W_embedding.shape[1] 55 | 56 | with tf.variable_scope('cnn_text'): 57 | output_title = self.cnn_inference(self._X1_inputs, self.title_len) 58 | output_title = tf.expand_dims(output_title, 0) 59 | 60 | with tf.variable_scope('hcnn_content'): 61 | output_content = self.cnn_inference(self._X2_inputs, self.content_len) 62 | output_content = tf.expand_dims(output_content, 0) 63 | 64 | with tf.variable_scope('fc-bn-layer'): 65 | output = tf.concat([output_title, output_content], axis=0) 66 | output = tf.reduce_max(output, axis=0) 67 | W_fc = self.weight_variable([self.n_filter_total, self.fc_hidden_size], name='Weight_fc') 68 | tf.summary.histogram('W_fc', W_fc) 69 | h_fc = tf.matmul(output, W_fc, name='h_fc') 70 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 71 | tf.summary.histogram('beta_fc', beta_fc) 72 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 73 | self.update_emas.append(update_ema_fc) 74 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 75 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 76 | 77 | with tf.variable_scope('out_layer'): 78 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 79 | tf.summary.histogram('Weight_out', W_out) 80 | b_out = self.bias_variable([self.n_class], name='bias_out') 81 | tf.summary.histogram('bias_out', b_out) 82 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 83 | 84 | with tf.name_scope('loss'): 85 | self._loss = tf.reduce_mean( 86 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 87 | tf.summary.scalar('loss', self._loss) 88 | 89 | self.saver = tf.train.Saver(max_to_keep=2) 90 | 91 | @property 92 | def tst(self): 93 | return self._tst 94 | 95 | @property 96 | def keep_prob(self): 97 | return self._keep_prob 98 | 99 | @property 100 | def batch_size(self): 101 | return self._batch_size 102 | 103 | @property 104 | def global_step(self): 105 | return self._global_step 106 | 107 | @property 108 | def X1_inputs(self): 109 | return self._X1_inputs 110 | 111 | @property 112 | def X2_inputs(self): 113 | return self._X2_inputs 114 | 115 | @property 116 | def y_inputs(self): 117 | return self._y_inputs 118 | 119 | @property 120 | def 
y_pred(self): 121 | return self._y_pred 122 | 123 | @property 124 | def loss(self): 125 | return self._loss 126 | 127 | def weight_variable(self, shape, name): 128 | """Create a weight variable with appropriate initialization.""" 129 | initial = tf.truncated_normal(shape, stddev=0.1) 130 | return tf.Variable(initial, name=name) 131 | 132 | def bias_variable(self, shape, name): 133 | """Create a bias variable with appropriate initialization.""" 134 | initial = tf.constant(0.1, shape=shape) 135 | return tf.Variable(initial, name=name) 136 | 137 | def batchnorm(self, Ylogits, offset, convolutional=False): 138 | """batchnormalization. 139 | Args: 140 | Ylogits: 1D向量或者是3D的卷积结果。 141 | num_updates: 迭代的global_step 142 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 143 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 144 | m: 表示batch均值;v:表示batch方差。 145 | bnepsilon:一个很小的浮点数,防止除以 0. 146 | Returns: 147 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 148 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 149 | """ 150 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, 151 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations 152 | bnepsilon = 1e-5 153 | if convolutional: 154 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 155 | else: 156 | mean, variance = tf.nn.moments(Ylogits, [0]) 157 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 158 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 159 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 160 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 161 | return Ybn, update_moving_everages 162 | 163 | def cnn_inference(self, X_inputs, n_step): 164 | """TextCNN 模型。 165 | Args: 166 | X_inputs: tensor.shape=(batch_size, n_step) 167 | Returns: 168 | title_outputs: tensor.shape=(batch_size, self.n_filter_total) 169 | """ 170 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 171 | inputs = tf.expand_dims(inputs, -1) 172 | pooled_outputs = list() 173 | for i, filter_size in enumerate(self.filter_sizes): 174 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 175 | # Convolution Layer 176 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter] 177 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter') 178 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter') 179 | tf.summary.histogram('beta', beta) 180 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 181 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) # 在激活层前面加 BN 182 | # Apply nonlinearity, batch norm scaling is not useful with relus 183 | # batch norm offsets are used instead of biases,使用 BN 层的 offset,不要 biases 184 | h = tf.nn.relu(conv_bn, name="relu") 185 | # Maxpooling over the outputs 186 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 187 | strides=[1, 1, 1, 1], padding='VALID', name="pool") 188 | pooled_outputs.append(pooled) 189 | self.update_emas.append(update_ema) 190 | h_pool = tf.concat(pooled_outputs, 3) 191 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total]) 192 | return h_pool_flat # shape = [batch_size, self.n_filter_total] 193 | 194 | 195 | # test the model 196 | # def test(): 197 | # import numpy as np 198 | # print('Begin testing...') 199 | # settings = Settings() 200 | # W_embedding = np.random.randn(50, 10) 201 | # config = tf.ConfigProto() 202 | # 
config.gpu_options.allow_growth = True
203 | #     batch_size = 128
204 | #     with tf.Session(config=config) as sess:
205 | #         model = TextCNN(W_embedding, settings)
206 | #         optimizer = tf.train.AdamOptimizer(0.001)
207 | #         train_op = optimizer.minimize(model.loss)
208 | #         update_op = tf.group(*model.update_emas)
209 | #         sess.run(tf.global_variables_initializer())
210 | #         fetch = [model.loss, model.y_pred, train_op, update_op]
211 | #         loss_list = list()
212 | #         for i in xrange(100):
213 | #             X1_batch = np.zeros((batch_size, 30), dtype=float)
214 | #             X2_batch = np.zeros((batch_size, 150), dtype=float)
215 | #             y_batch = np.zeros((batch_size, 1999), dtype=int)
216 | #             _batch_size = len(y_batch)
217 | #             feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
218 | #                          model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
219 | #             loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
220 | #             loss_list.append(loss)
221 | #             print(i, loss)
222 | #
223 | # if __name__ == '__main__':
224 | #     test()
225 | 
--------------------------------------------------------------------------------
/zhihu-text-classification-master/models/wd_4_han/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | from __future__ import print_function
4 | from __future__ import division
5 | import tensorflow as tf
6 | import numpy as np
7 | from tqdm import tqdm
8 | import os
9 | import sys
10 | import shutil
11 | import time
12 | import network
13 | 
14 | sys.path.append('../..')
15 | from data_helpers import to_categorical
16 | from evaluator import score_eval
17 | 
18 | flags = tf.flags
19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, do not rebuild the summary')
20 | flags.DEFINE_integer('max_epoch', 2, 'update the embedding after max_epoch, default: 2')
21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epochs, default: 6')
22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4')
23 | flags.DEFINE_float('decay_rate', 0.85, 'decay rate, default: 0.85')
24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5')
25 | # real-run settings
26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000')
27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000')
28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.38')
29 | 
30 | # quick-test settings
31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000')
32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500')
33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/seg_train/' 46 | data_valid_path = '../../data/wd-data/seg_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = (map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.HAN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 189 | train_op = train_op2 190 | for epoch in range(FLAGS.max_max_epoch): 191 | global_step = sess.run(model.global_step) 192 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 193 | if epoch == FLAGS.max_epoch: # update the embedding 194 | train_op = train_op1 195 | train_fetches = [merged, model.loss, train_op, update_op] 196 | valid_fetches = [merged, model.loss] 197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 198 | # 最后再做一次验证 199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 201 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 202 | if f1 > last_f1: # save the better model 203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 204 | print('saved new model to %s ' % saving_path) 205 | 206 | 207 | if __name__ == '__main__': 208 | tf.app.run() 209 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_2_hcnn/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3') 23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/seg_train/' 46 | data_valid_path = '../../data/wd-data/seg_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.HCNN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 189 | train_op = train_op2 190 | for epoch in range(FLAGS.max_max_epoch): 191 | global_step = sess.run(model.global_step) 192 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 193 | if epoch == FLAGS.max_epoch: # update the embedding 194 | train_op = train_op1 195 | train_fetches = [merged, model.loss, train_op, update_op] 196 | valid_fetches = [merged, model.loss] 197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 198 | # 最后再做一次验证 199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 201 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 202 | if f1 > last_f1: # save the better model 203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 204 | print('saved new model to %s ' % saving_path) 205 | 206 | 207 | if __name__ == '__main__': 208 | tf.app.run() 209 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_6_rcnn/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4') 23 | flags.DEFINE_float('decay_rate', 0.75, 'decay rate, default: 0.75') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/data_train/' 46 | data_valid_path = '../../data/wd-data/data_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.RCNN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 189 | train_op = train_op2 190 | for epoch in range(FLAGS.max_max_epoch): 191 | global_step = sess.run(model.global_step) 192 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 193 | if epoch == FLAGS.max_epoch: # update the embedding 194 | train_op = train_op1 195 | train_fetches = [merged, model.loss, train_op, update_op] 196 | valid_fetches = [merged, model.loss] 197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 198 | # 最后再做一次验证 199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 201 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 202 | if f1 > last_f1: # save the better model 203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 204 | print('saved new model to %s ' % saving_path) 205 | 206 | 207 | if __name__ == '__main__': 208 | tf.app.run() 209 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_3_bigru/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4') 23 | flags.DEFINE_float('decay_rate', 0.85, 'decay rate, default: 0.85') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.40, 'if valid_f1 > last_f1, save new model. default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/data_train/' 46 | data_valid_path = '../../data/wd-data/data_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.BiGRU(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | 189 | train_op = train_op2 190 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 191 | for epoch in range(FLAGS.max_max_epoch): 192 | global_step = sess.run(model.global_step) 193 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 194 | if epoch == FLAGS.max_epoch: # update the embedding 195 | train_op = train_op1 196 | train_fetches = [merged, model.loss, train_op, update_op] 197 | valid_fetches = [merged, model.loss] 198 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 199 | # 最后再做一次验证 200 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 201 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 202 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 203 | if f1 > last_f1: # save the better model 204 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 205 | print('saved new model to %s ' % saving_path) 206 | 207 | 208 | if __name__ == '__main__': 209 | tf.app.run() 210 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_5_bigru_cnn/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 8e-4, 'initial learning rate, default: 8e-4') 23 | flags.DEFINE_float('decay_rate', 0.75, 'decay rate, default: 0.75') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 27 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 28 | flags.DEFINE_float('last_f1', 0.38, 'if valid_f1 > last_f1, save new model. default: 0.40') 29 | 30 | # 测试 31 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 32 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 33 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 34 | FLAGS = flags.FLAGS 35 | 36 | lr = FLAGS.lr 37 | last_f1 = FLAGS.last_f1 38 | settings = network.Settings() 39 | title_len = settings.title_len 40 | summary_path = settings.summary_path 41 | ckpt_path = settings.ckpt_path 42 | model_path = ckpt_path + 'model.ckpt' 43 | 44 | embedding_path = '../../data/word_embedding.npy' 45 | data_train_path = '../../data/wd-data/data_train/' 46 | data_valid_path = '../../data/wd-data/data_valid/' 47 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 48 | va_batches = os.listdir(data_valid_path) 49 | n_tr_batches = len(tr_batches) 50 | n_va_batches = len(va_batches) 51 | 52 | # 测试 53 | # n_tr_batches = 1000 54 | # n_va_batches = 50 55 | 56 | 57 | def get_batch(data_path, batch_id): 58 | """get a batch from data_path""" 59 | new_batch = np.load(data_path + str(batch_id) + '.npz') 60 | X_batch = new_batch['X'] 61 | y_batch = new_batch['y'] 62 | X1_batch = X_batch[:, :title_len] 63 | X2_batch = X_batch[:, title_len:] 64 | return [X1_batch, X2_batch, y_batch] 65 | 66 | 67 | def valid_epoch(data_path, sess, model): 68 | """Test on the valid data.""" 69 | va_batches = os.listdir(data_path) 70 | n_va_batches = len(va_batches) 71 | _costs = 0.0 72 | predict_labels_list = list() # 所有的预测结果 73 | marked_labels_list = list() 74 | for i in range(n_va_batches): 75 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 76 | marked_labels_list.extend(y_batch) 77 | y_batch = to_categorical(y_batch) 78 | _batch_size = len(y_batch) 79 | fetches = [model.loss, model.y_pred] 80 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 81 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 82 | _cost, predict_labels = sess.run(fetches, feed_dict) 83 | _costs += _cost 84 | predict_labels = (map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 85 | predict_labels_list.extend(predict_labels) 86 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 87 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 88 | mean_cost = _costs / n_va_batches 89 | return mean_cost, precision, recall, f1 90 | 91 | 92 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 93 | global last_f1 94 | global lr 95 | time0 = time.time() 96 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 97 | for batch in tqdm(range(n_tr_batches)): 98 | global_step = sess.run(model.global_step) 99 | if 0 == (global_step + 1) % FLAGS.valid_step: 100 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 101 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 102 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 103 | time0 = time.time() 104 | if f1 > last_f1: 105 | last_f1 = f1 106 | saving_path = model.saver.save(sess, model_path, global_step+1) 107 | print('saved new model to %s ' % saving_path) 108 | # training 109 | batch_id = batch_indexs[batch] 110 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 111 | y_batch = to_categorical(y_batch) 112 | _batch_size = len(y_batch) 113 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 114 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 115 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 116 | # valid per 500 steps 117 | if 0 == 
(global_step + 1) % 500: 118 | train_writer.add_summary(summary, global_step) 119 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 120 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 121 | y_batch = to_categorical(y_batch) 122 | _batch_size = len(y_batch) 123 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 124 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 125 | summary, _cost = sess.run(valid_fetches, feed_dict) 126 | test_writer.add_summary(summary, global_step) 127 | 128 | 129 | def main(_): 130 | global ckpt_path 131 | global last_f1 132 | if not os.path.exists(ckpt_path): 133 | os.makedirs(ckpt_path) 134 | if not os.path.exists(summary_path): 135 | os.makedirs(summary_path) 136 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 137 | shutil.rmtree(summary_path) 138 | os.makedirs(summary_path) 139 | if not os.path.exists(summary_path): 140 | os.makedirs(summary_path) 141 | 142 | print('1.Loading data...') 143 | W_embedding = np.load(embedding_path) 144 | print('training sample_num = %d' % n_tr_batches) 145 | print('valid sample_num = %d' % n_va_batches) 146 | 147 | # Initial or restore the model 148 | print('2.Building model...') 149 | config = tf.ConfigProto() 150 | config.gpu_options.allow_growth = True 151 | with tf.Session(config=config) as sess: 152 | model = network.BiGRU_CNN(W_embedding, settings) 153 | with tf.variable_scope('training_ops') as vs: 154 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 155 | FLAGS.decay_rate, staircase=True) 156 | # two optimizer: op1, update embedding; op2, do not update embedding. 157 | with tf.variable_scope('Optimizer1'): 158 | tvars1 = tf.trainable_variables() 159 | grads1 = tf.gradients(model.loss, tvars1) 160 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 161 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 162 | global_step=model.global_step) 163 | with tf.variable_scope('Optimizer2'): 164 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 165 | grads2 = tf.gradients(model.loss, tvars2) 166 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 167 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 168 | global_step=model.global_step) 169 | update_op = tf.group(*model.update_emas) 170 | merged = tf.summary.merge_all() # summary 171 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 172 | test_writer = tf.summary.FileWriter(summary_path + 'test') 173 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 174 | 175 | # 如果已经保存过模型,导入上次的模型 176 | if os.path.exists(ckpt_path + "checkpoint"): 177 | print("Restoring Variables from Checkpoint...") 178 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 179 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 180 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 181 | sess.run(tf.variables_initializer(training_ops)) 182 | train_op2 = train_op1 183 | else: 184 | print('Initializing Variables...') 185 | sess.run(tf.global_variables_initializer()) 186 | 187 | print('3.Begin training...') 188 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 189 | train_op = train_op1 190 | for epoch in range(FLAGS.max_max_epoch): 191 | global_step = sess.run(model.global_step) 192 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 193 | if epoch == FLAGS.max_epoch: # update the embedding 194 | train_op = train_op1 195 | train_fetches = [merged, model.loss, train_op, update_op] 196 | valid_fetches = [merged, model.loss] 197 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 198 | # 最后再做一次验证 199 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 200 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 201 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 202 | if f1 > last_f1: # save the better model 203 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 204 | print('saved new model to %s ' % saving_path) 205 | 206 | 207 | if __name__ == '__main__': 208 | tf.app.run() 209 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_1_2_cnn_max/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from __future__ import print_function 4 | from __future__ import division 5 | import tensorflow as tf 6 | import numpy as np 7 | from tqdm import tqdm 8 | import os 9 | import sys 10 | import shutil 11 | import time 12 | import network 13 | 14 | sys.path.append('../..') 15 | from data_helpers import to_categorical 16 | from evaluator import score_eval 17 | 18 | flags = tf.flags 19 | flags.DEFINE_bool('is_retrain', False, 'if is_retrain is true, not rebuild the summary') 20 | flags.DEFINE_integer('max_epoch', 1, 'update the embedding after max_epoch, default: 1') 21 | flags.DEFINE_integer('max_max_epoch', 6, 'all training epoches, default: 6') 22 | flags.DEFINE_float('lr', 1e-3, 'initial learning rate, default: 1e-3') 23 | flags.DEFINE_float('decay_rate', 0.65, 'decay rate, default: 0.65') 24 | flags.DEFINE_float('keep_prob', 0.5, 'keep_prob for training, default: 0.5') 25 | # 正式 26 | 27 | flags.DEFINE_integer('decay_step', 15000, 'decay_step, default: 15000') 28 | flags.DEFINE_integer('valid_step', 10000, 'valid_step, default: 10000') 29 | flags.DEFINE_float('last_f1', 0.35, 'if valid_f1 > last_f1, save new model. default: 0.40') 30 | 31 | # 测试 32 | # flags.DEFINE_integer('decay_step', 1000, 'decay_step, default: 1000') 33 | # flags.DEFINE_integer('valid_step', 500, 'valid_step, default: 500') 34 | # flags.DEFINE_float('last_f1', 0.10, 'if valid_f1 > last_f1, save new model. 
default: 0.10') 35 | FLAGS = flags.FLAGS 36 | 37 | lr = FLAGS.lr 38 | last_f1 = FLAGS.last_f1 39 | settings = network.Settings() 40 | title_len = settings.title_len 41 | summary_path = settings.summary_path 42 | ckpt_path = settings.ckpt_path 43 | model_path = ckpt_path + 'model.ckpt' 44 | 45 | embedding_path = '../../data/word_embedding.npy' 46 | data_train_path = '../../data/wd-data/data_train/' 47 | data_valid_path = '../../data/wd-data/data_valid/' 48 | tr_batches = os.listdir(data_train_path) # batch 文件名列表 49 | va_batches = os.listdir(data_valid_path) 50 | n_tr_batches = len(tr_batches) 51 | n_va_batches = len(va_batches) 52 | 53 | # 测试 54 | # n_tr_batches = 1000 55 | # n_va_batches = 50 56 | 57 | 58 | def get_batch(data_path, batch_id): 59 | """get a batch from data_path""" 60 | new_batch = np.load(data_path + str(batch_id) + '.npz') 61 | X_batch = new_batch['X'] 62 | y_batch = new_batch['y'] 63 | X1_batch = X_batch[:, :title_len] 64 | X2_batch = X_batch[:, title_len:] 65 | return [X1_batch, X2_batch, y_batch] 66 | 67 | 68 | def valid_epoch(data_path, sess, model): 69 | """Test on the valid data.""" 70 | va_batches = os.listdir(data_path) 71 | n_va_batches = len(va_batches) 72 | _costs = 0.0 73 | predict_labels_list = list() # 所有的预测结果 74 | marked_labels_list = list() 75 | for i in range(n_va_batches): 76 | [X1_batch, X2_batch, y_batch] = get_batch(data_path, i) 77 | marked_labels_list.extend(y_batch) 78 | y_batch = to_categorical(y_batch) 79 | _batch_size = len(y_batch) 80 | fetches = [model.loss, model.y_pred] 81 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 82 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 83 | _cost, predict_labels = sess.run(fetches, feed_dict) 84 | _costs += _cost 85 | predict_labels = list(map(lambda label: label.argsort()[-1:-6:-1], predict_labels)) # 取最大的5个下标 86 | predict_labels_list.extend(predict_labels) 87 | predict_label_and_marked_label_list = zip(predict_labels_list, marked_labels_list) 88 | precision, recall, f1 = score_eval(predict_label_and_marked_label_list) 89 | mean_cost = _costs / n_va_batches 90 | return mean_cost, precision, recall, f1 91 | 92 | 93 | def train_epoch(data_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer): 94 | global last_f1 95 | global lr 96 | time0 = time.time() 97 | batch_indexs = np.random.permutation(n_tr_batches) # shuffle the training data 98 | for batch in tqdm(range(n_tr_batches)): 99 | global_step = sess.run(model.global_step) 100 | if 0 == (global_step + 1) % FLAGS.valid_step: 101 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 102 | print('Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g, time=%g s' % ( 103 | global_step, valid_cost, precision, recall, f1, time.time() - time0)) 104 | time0 = time.time() 105 | if f1 > last_f1: 106 | last_f1 = f1 107 | saving_path = model.saver.save(sess, model_path, global_step+1) 108 | print('saved new model to %s ' % saving_path) 109 | # training 110 | batch_id = batch_indexs[batch] 111 | [X1_batch, X2_batch, y_batch] = get_batch(data_train_path, batch_id) 112 | y_batch = to_categorical(y_batch) 113 | _batch_size = len(y_batch) 114 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 115 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: FLAGS.keep_prob} 116 | summary, _cost, _, _ = sess.run(train_fetches, feed_dict) # the cost is the mean cost of one batch 117 | # valid per 500 steps 118 | if 0 
== (global_step + 1) % 500: 119 | train_writer.add_summary(summary, global_step) 120 | batch_id = np.random.randint(0, n_va_batches) # 随机选一个验证batch 121 | [X1_batch, X2_batch, y_batch] = get_batch(data_valid_path, batch_id) 122 | y_batch = to_categorical(y_batch) 123 | _batch_size = len(y_batch) 124 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 125 | model.batch_size: _batch_size, model.tst: True, model.keep_prob: 1.0} 126 | summary, _cost = sess.run(valid_fetches, feed_dict) 127 | test_writer.add_summary(summary, global_step) 128 | 129 | 130 | def main(_): 131 | global ckpt_path 132 | global last_f1 133 | if not os.path.exists(ckpt_path): 134 | os.makedirs(ckpt_path) 135 | if not os.path.exists(summary_path): 136 | os.makedirs(summary_path) 137 | elif not FLAGS.is_retrain: # 重新训练本模型,删除以前的 summary 138 | shutil.rmtree(summary_path) 139 | os.makedirs(summary_path) 140 | if not os.path.exists(summary_path): 141 | os.makedirs(summary_path) 142 | 143 | print('1.Loading data...') 144 | W_embedding = np.load(embedding_path) 145 | print('training sample_num = %d' % n_tr_batches) 146 | print('valid sample_num = %d' % n_va_batches) 147 | 148 | # Initial or restore the model 149 | print('2.Building model...') 150 | config = tf.ConfigProto() 151 | config.gpu_options.allow_growth = True 152 | with tf.Session(config=config) as sess: 153 | model = network.TextCNN(W_embedding, settings) 154 | with tf.variable_scope('training_ops') as vs: 155 | learning_rate = tf.train.exponential_decay(FLAGS.lr, model.global_step, FLAGS.decay_step, 156 | FLAGS.decay_rate, staircase=True) 157 | # two optimizer: op1, update embedding; op2, do not update embedding. 158 | with tf.variable_scope('Optimizer1'): 159 | tvars1 = tf.trainable_variables() 160 | grads1 = tf.gradients(model.loss, tvars1) 161 | optimizer1 = tf.train.AdamOptimizer(learning_rate=learning_rate) 162 | train_op1 = optimizer1.apply_gradients(zip(grads1, tvars1), 163 | global_step=model.global_step) 164 | with tf.variable_scope('Optimizer2'): 165 | tvars2 = [tvar for tvar in tvars1 if 'embedding' not in tvar.name] 166 | grads2 = tf.gradients(model.loss, tvars2) 167 | optimizer2 = tf.train.AdamOptimizer(learning_rate=learning_rate) 168 | train_op2 = optimizer2.apply_gradients(zip(grads2, tvars2), 169 | global_step=model.global_step) 170 | update_op = tf.group(*model.update_emas) 171 | merged = tf.summary.merge_all() # summary 172 | train_writer = tf.summary.FileWriter(summary_path + 'train', sess.graph) 173 | test_writer = tf.summary.FileWriter(summary_path + 'test') 174 | training_ops = [v for v in tf.global_variables() if v.name.startswith(vs.name+'/')] 175 | 176 | # 如果已经保存过模型,导入上次的模型 177 | if os.path.exists(ckpt_path + "checkpoint"): 178 | print("Restoring Variables from Checkpoint...") 179 | model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path)) 180 | last_valid_cost, precision, recall, last_f1 = valid_epoch(data_valid_path, sess, model) 181 | print(' valid cost=%g; p=%g, r=%g, f1=%g' % (last_valid_cost, precision, recall, last_f1)) 182 | sess.run(tf.variables_initializer(training_ops)) 183 | train_op2 = train_op1 184 | else: 185 | print('Initializing Variables...') 186 | sess.run(tf.global_variables_initializer()) 187 | 188 | print('3.Begin training...') 189 | print('max_epoch=%d, max_max_epoch=%d' % (FLAGS.max_epoch, FLAGS.max_max_epoch)) 190 | train_op = train_op2 191 | for epoch in range(FLAGS.max_max_epoch): 192 | global_step = sess.run(model.global_step) 193 | print('Global step %d, lr=%g' % 
(global_step, sess.run(learning_rate))) 194 | if epoch == FLAGS.max_epoch: # update the embedding 195 | train_op = train_op1 196 | train_fetches = [merged, model.loss, train_op, update_op] 197 | valid_fetches = [merged, model.loss] 198 | train_epoch(data_train_path, sess, model, train_fetches, valid_fetches, train_writer, test_writer) 199 | # 最后再做一次验证 200 | valid_cost, precision, recall, f1 = valid_epoch(data_valid_path, sess, model) 201 | print('END.Global_step=%d: valid cost=%g; p=%g, r=%g, f1=%g' % ( 202 | sess.run(model.global_step), valid_cost, precision, recall, f1)) 203 | if f1 > last_f1: # save the better model 204 | saving_path = model.saver.save(sess, model_path, sess.run(model.global_step)+1) 205 | print('saved new model to %s ' % saving_path) 206 | 207 | 208 | if __name__ == '__main__': 209 | tf.app.run() 210 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_3_bigru/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import tensorflow.contrib.layers as layers 6 | 7 | """wd_3_bigru 8 | title 部分使用 bigru+attention;content 部分使用 bigru+attention; 两部分输出直接 concat。 9 | """ 10 | 11 | 12 | class Settings(object): 13 | def __init__(self): 14 | self.model_name = 'wd_3_bigru' 15 | self.title_len = 30 16 | self.content_len = 150 17 | self.hidden_size = 256 18 | self.n_layer = 1 19 | self.fc_hidden_size = 1024 20 | self.n_class = 1999 21 | self.summary_path = '../../summary/' + self.model_name + '/' 22 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 23 | 24 | 25 | class BiGRU(object): 26 | """ 27 | title: inputs->bigru+attention->output_title 28 | content: inputs->bigru+attention->output_content 29 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 
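    Shape note (derived from the Settings above, hidden_size=256, fc_hidden_size=1024):
    each bigru+attention branch yields [batch_size, hidden_size*2] = [batch_size, 512],
    so the concat is [batch_size, hidden_size*4] = [batch_size, 1024], which matches the
    W_fc shape [hidden_size*4, fc_hidden_size] in the fc-bn-layer below.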
30 | """ 31 | 32 | def __init__(self, W_embedding, settings): 33 | self.model_name = settings.model_name 34 | self.title_len = settings.title_len 35 | self.content_len = settings.content_len 36 | self.hidden_size = settings.hidden_size 37 | self.n_layer = settings.n_layer 38 | self.n_class = settings.n_class 39 | self.fc_hidden_size = settings.fc_hidden_size 40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 41 | self.update_emas = list() 42 | # placeholders 43 | self._tst = tf.placeholder(tf.bool) 44 | self._keep_prob = tf.placeholder(tf.float32, []) 45 | self._batch_size = tf.placeholder(tf.int32, []) 46 | 47 | with tf.name_scope('Inputs'): 48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 51 | 52 | with tf.variable_scope('embedding'): 53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 54 | initializer=tf.constant_initializer(W_embedding), trainable=True) 55 | self.embedding_size = W_embedding.shape[1] 56 | 57 | with tf.variable_scope('bigru_text'): 58 | output_title = self.bigru_inference(self._X1_inputs) 59 | 60 | with tf.variable_scope('bigru_content'): 61 | output_content = self.bigru_inference(self._X2_inputs) 62 | 63 | with tf.variable_scope('fc-bn-layer'): 64 | output = tf.concat([output_title, output_content], axis=1) 65 | W_fc = self.weight_variable([self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc') 66 | tf.summary.histogram('W_fc', W_fc) 67 | h_fc = tf.matmul(output, W_fc, name='h_fc') 68 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 69 | tf.summary.histogram('beta_fc', beta_fc) 70 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 71 | self.update_emas.append(update_ema_fc) 72 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 73 | 74 | with tf.variable_scope('out_layer'): 75 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 76 | tf.summary.histogram('Weight_out', W_out) 77 | b_out = self.bias_variable([self.n_class], name='bias_out') 78 | tf.summary.histogram('bias_out', b_out) 79 | self._y_pred = tf.nn.xw_plus_b(self.fc_bn_relu, W_out, b_out, name='y_pred') # 每个类别的分数 scores 80 | 81 | with tf.name_scope('loss'): 82 | self._loss = tf.reduce_mean( 83 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 84 | tf.summary.scalar('loss', self._loss) 85 | 86 | self.saver = tf.train.Saver(max_to_keep=1) 87 | 88 | @property 89 | def tst(self): 90 | return self._tst 91 | 92 | @property 93 | def keep_prob(self): 94 | return self._keep_prob 95 | 96 | @property 97 | def batch_size(self): 98 | return self._batch_size 99 | 100 | @property 101 | def global_step(self): 102 | return self._global_step 103 | 104 | @property 105 | def X1_inputs(self): 106 | return self._X1_inputs 107 | 108 | @property 109 | def X2_inputs(self): 110 | return self._X2_inputs 111 | 112 | @property 113 | def y_inputs(self): 114 | return self._y_inputs 115 | 116 | @property 117 | def y_pred(self): 118 | return self._y_pred 119 | 120 | @property 121 | def loss(self): 122 | return self._loss 123 | 124 | def weight_variable(self, shape, name): 125 | """Create a weight variable with appropriate initialization.""" 126 | initial = tf.truncated_normal(shape, stddev=0.1) 127 | return 
tf.Variable(initial, name=name) 128 | 129 | def bias_variable(self, shape, name): 130 | """Create a bias variable with appropriate initialization.""" 131 | initial = tf.constant(0.1, shape=shape) 132 | return tf.Variable(initial, name=name) 133 | 134 | def batchnorm(self, Ylogits, offset, convolutional=False): 135 | """batchnormalization. 136 | Args: 137 | Ylogits: 1D向量或者是3D的卷积结果。 138 | num_updates: 迭代的global_step 139 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 140 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 141 | m: 表示batch均值;v:表示batch方差。 142 | bnepsilon:一个很小的浮点数,防止除以 0. 143 | Returns: 144 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 145 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 146 | """ 147 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations 148 | bnepsilon = 1e-5 149 | if convolutional: 150 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 151 | else: 152 | mean, variance = tf.nn.moments(Ylogits, [0]) 153 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 154 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 155 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 156 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 157 | return Ybn, update_moving_everages 158 | 159 | def gru_cell(self): 160 | with tf.name_scope('gru_cell'): 161 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse) 162 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 163 | 164 | def bi_gru(self, inputs): 165 | """build the bi-GRU network. 返回个所有层的隐含状态。""" 166 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)] 167 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)] 168 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw] 169 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw] 170 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 171 | initial_states_fw=initial_states_fw, 172 | initial_states_bw=initial_states_bw, dtype=tf.float32) 173 | return outputs 174 | 175 | def task_specific_attention(self, inputs, output_size, 176 | initializer=layers.xavier_initializer(), 177 | activation_fn=tf.tanh, scope=None): 178 | """ 179 | Performs task-specific attention reduction, using learned 180 | attention context vector (constant within task of interest). 181 | Args: 182 | inputs: Tensor of shape [batch_size, units, input_size] 183 | `input_size` must be static (known) 184 | `units` axis will be attended over (reduced from output) 185 | `batch_size` will be preserved 186 | output_size: Size of output's inner (feature) dimension 187 | Returns: 188 | outputs: Tensor of shape [batch_size, output_dim]. 
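        Example (shapes only, for the content branch of this model, where
        hidden_size = 256 and content_len = 150, so input_size = output_size = 512):
            inputs [batch_size, 150, 512] -> outputs [batch_size, 512]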
189 | """ 190 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None 191 | with tf.variable_scope(scope or 'attention') as scope: 192 | # u_w, attention 向量 193 | attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size], 194 | initializer=initializer, dtype=tf.float32) 195 | # 全连接层,把 h_i 转为 u_i , shape= [batch_size, units, input_size] -> [batch_size, units, output_size] 196 | input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope) 197 | # 输出 [batch_size, units] 198 | vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True) 199 | attention_weights = tf.nn.softmax(vector_attn, dim=1) 200 | tf.summary.histogram('attention_weigths', attention_weights) 201 | weighted_projection = tf.multiply(inputs, attention_weights) 202 | outputs = tf.reduce_sum(weighted_projection, axis=1) 203 | return outputs # 输出 [batch_size, hidden_size*2] 204 | 205 | def bigru_inference(self, X_inputs): 206 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 207 | output_bigru = self.bi_gru(inputs) 208 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2) 209 | return output_att 210 | 211 | 212 | # test the model 213 | def test(): 214 | import numpy as np 215 | print('Begin testing...') 216 | settings = Settings() 217 | W_embedding = np.random.randn(50, 10) 218 | config = tf.ConfigProto() 219 | config.gpu_options.allow_growth = True 220 | batch_size = 128 221 | with tf.Session(config=config) as sess: 222 | model = BiGRU(W_embedding, settings) 223 | optimizer = tf.train.AdamOptimizer(0.001) 224 | train_op = optimizer.minimize(model.loss) 225 | update_op = tf.group(*model.update_emas) 226 | sess.run(tf.global_variables_initializer()) 227 | fetch = [model.loss, model.y_pred, train_op, update_op] 228 | loss_list = list() 229 | for i in xrange(100): 230 | X1_batch = np.zeros((batch_size, 30), dtype=float) 231 | X2_batch = np.zeros((batch_size, 150), dtype=float) 232 | y_batch = np.zeros((batch_size, 1999), dtype=int) 233 | _batch_size = len(y_batch) 234 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 235 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 236 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 237 | loss_list.append(loss) 238 | print(i, loss) 239 | 240 | if __name__ == '__main__': 241 | test() 242 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_6_rcnn/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import tensorflow.contrib.layers as layers 6 | 7 | """wd_6_rcnn 8 | 在论文 Recurrent Convolutional Neural Networks for Text Classification 中。 9 | 使用 BiRNN 处理,将每个时刻的隐藏状态和原输入拼起来,在进行 max_pooling 操作。 10 | 这里有些不同,首先也是使用 bigru 得到每个时刻的,将每个时刻的隐藏状态和原输入拼起来; 11 | 然后使用输入到 TextCNN 网络中。 12 | """ 13 | 14 | 15 | class Settings(object): 16 | def __init__(self): 17 | self.model_name = "wd_6_rcnn" 18 | self.title_len = 30 19 | self.content_len = 150 20 | self.hidden_size = 256 21 | self.n_layer = 1 22 | self.filter_sizes = [2, 3, 4, 5, 7] 23 | self.n_filter = 256 24 | self.fc_hidden_size = 1024 25 | self.n_class = 1999 26 | self.summary_path = '../../summary/' + self.model_name + '/' 27 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 28 
| 29 | 30 | class RCNN(object): 31 | def __init__(self, W_embedding, settings): 32 | self.model_name = settings.model_name 33 | self.title_len = settings.title_len 34 | self.content_len = settings.content_len 35 | self.hidden_size = settings.hidden_size 36 | self.n_layer = settings.n_layer 37 | self.filter_sizes = settings.filter_sizes 38 | self.n_filter = settings.n_filter 39 | self.n_filter_total = self.n_filter * len(self.filter_sizes) 40 | self.n_class = settings.n_class 41 | self.fc_hidden_size = settings.fc_hidden_size 42 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 43 | self.update_emas = list() 44 | # placeholders 45 | self._tst = tf.placeholder(tf.bool) 46 | self._keep_prob = tf.placeholder(tf.float32, []) 47 | self._batch_size = tf.placeholder(tf.int32, []) 48 | 49 | with tf.name_scope('Inputs'): 50 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 51 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 52 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 53 | 54 | with tf.variable_scope('embedding'): 55 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 56 | initializer=tf.constant_initializer(W_embedding), trainable=True) 57 | self.embedding_size = W_embedding.shape[1] 58 | 59 | with tf.variable_scope('rcnn_text'): 60 | output_title = self.rcnn_inference(self._X1_inputs, self.title_len) 61 | 62 | with tf.variable_scope('rcnn_content'): 63 | output_content = self.rcnn_inference(self._X2_inputs, self.content_len) 64 | 65 | with tf.variable_scope('fc-bn-layer'): 66 | output = tf.concat([output_title, output_content], axis=1) 67 | W_fc = self.weight_variable([self.n_filter_total*2, self.fc_hidden_size], 68 | name='Weight_fc') 69 | tf.summary.histogram('W_fc', W_fc) 70 | h_fc = tf.matmul(output, W_fc, name='h_fc') 71 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 72 | tf.summary.histogram('beta_fc', beta_fc) 73 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 74 | self.update_emas.append(update_ema_fc) 75 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 76 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 77 | 78 | with tf.variable_scope('out_layer'): 79 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 80 | tf.summary.histogram('Weight_out', W_out) 81 | b_out = self.bias_variable([self.n_class], name='bias_out') 82 | tf.summary.histogram('bias_out', b_out) 83 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 84 | 85 | with tf.name_scope('loss'): 86 | self._loss = tf.reduce_mean( 87 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 88 | tf.summary.scalar('loss', self._loss) 89 | 90 | self.saver = tf.train.Saver(max_to_keep=1) 91 | 92 | @property 93 | def tst(self): 94 | return self._tst 95 | 96 | @property 97 | def keep_prob(self): 98 | return self._keep_prob 99 | 100 | @property 101 | def batch_size(self): 102 | return self._batch_size 103 | 104 | @property 105 | def global_step(self): 106 | return self._global_step 107 | 108 | @property 109 | def X1_inputs(self): 110 | return self._X1_inputs 111 | 112 | @property 113 | def X2_inputs(self): 114 | return self._X2_inputs 115 | 116 | @property 117 | def y_inputs(self): 118 | return self._y_inputs 119 | 120 | @property 121 | def y_pred(self): 122 | return self._y_pred 123 
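    # Note: _y_pred holds the raw, unnormalised scores for all 1999 topics. The loss
    # defined in __init__ treats the task as 1999 independent binary labels (sigmoid
    # cross entropy), so no softmax is applied; the train scripts pick the five
    # highest-scoring topics per sample with label.argsort()[-1:-6:-1], i.e. argsort
    # ascending, then read the last five entries in reverse (descending) order.
    # For example, np.array([0.1, 0.9, 0.3, 0.8]).argsort()[-1:-6:-1] -> [1, 3, 2, 0].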
| 124 | @property 125 | def loss(self): 126 | return self._loss 127 | 128 | def weight_variable(self, shape, name): 129 | """Create a weight variable with appropriate initialization.""" 130 | initial = tf.truncated_normal(shape, stddev=0.1) 131 | return tf.Variable(initial, name=name) 132 | 133 | def bias_variable(self, shape, name): 134 | """Create a bias variable with appropriate initialization.""" 135 | initial = tf.constant(0.1, shape=shape) 136 | return tf.Variable(initial, name=name) 137 | 138 | def batchnorm(self, Ylogits, offset, convolutional=False): 139 | """batchnormalization. 140 | Args: 141 | Ylogits: 1D向量或者是3D的卷积结果。 142 | num_updates: 迭代的global_step 143 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 144 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 145 | m: 表示batch均值;v:表示batch方差。 146 | bnepsilon:一个很小的浮点数,防止除以 0. 147 | Returns: 148 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 149 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 150 | """ 151 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, 152 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations 153 | bnepsilon = 1e-5 154 | if convolutional: 155 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 156 | else: 157 | mean, variance = tf.nn.moments(Ylogits, [0]) 158 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 159 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 160 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 161 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 162 | return Ybn, update_moving_everages 163 | 164 | def gru_cell(self): 165 | with tf.name_scope('gru_cell'): 166 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse) 167 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 168 | 169 | def bi_gru(self, X_inputs): 170 | """build the bi-GRU network. Return the encoder represented vector. 171 | X_inputs: [batch_size, n_step] 172 | n_step: 句子的词数量;或者文档的句子数。 173 | outputs: [fw_state, embeddings, bw_state], shape=[batch_size, hidden_size+embedding_size+hidden_size] 174 | """ 175 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) # [batch_size, n_step, embedding_size] 176 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)] 177 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)] 178 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw] 179 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw] 180 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 181 | initial_states_fw = initial_states_fw, initial_states_bw = initial_states_bw, dtype=tf.float32) 182 | hidden_outputs = tf.concat([outputs, inputs], axis=2) 183 | return hidden_outputs # shape =[seg_num, n_steps, hidden_size*2+embedding_size] 184 | 185 | def textcnn(self, cnn_inputs, n_step): 186 | """build the TextCNN network. 
Return the h_drop""" 187 | # cnn_inputs.shape = [batchsize, n_step, hidden_size*2+embedding_size] 188 | inputs = tf.expand_dims(cnn_inputs, -1) 189 | pooled_outputs = list() 190 | for i, filter_size in enumerate(self.filter_sizes): 191 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 192 | # Convolution Layer 193 | filter_shape = [filter_size, self.hidden_size*2+self.embedding_size, 1, self.n_filter] 194 | W_filter = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_filter") 195 | beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.n_filter], name="beta")) 196 | tf.summary.histogram('beta', beta) 197 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 198 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) # 在激活层前面加 BN 199 | # Apply nonlinearity, batch norm scaling is not useful with relus 200 | h = tf.nn.relu(conv_bn, name="relu") 201 | # Maxpooling over the outputs 202 | pooled = tf.nn.max_pool(h,ksize=[1, n_step - filter_size + 1, 1, 1], 203 | strides=[1, 1, 1, 1],padding='VALID',name="pool") 204 | pooled_outputs.append(pooled) 205 | self.update_emas.append(update_ema) 206 | h_pool = tf.concat(pooled_outputs, 3) 207 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total]) 208 | return h_pool_flat # shape = [batch_size, n_filter_total] 209 | 210 | def rcnn_inference(self, X_inputs, n_step): 211 | output_bigru = self.bi_gru(X_inputs) 212 | output_cnn = self.textcnn(output_bigru, n_step) 213 | return output_cnn # shape = [batch_size, n_filter_total] 214 | 215 | 216 | # test the model 217 | def test(): 218 | import numpy as np 219 | print('Begin testing...') 220 | settings = Settings() 221 | W_embedding = np.random.randn(50, 10) 222 | config = tf.ConfigProto() 223 | config.gpu_options.allow_growth = True 224 | batch_size = 128 225 | with tf.Session(config=config) as sess: 226 | model = RCNN(W_embedding, settings) 227 | optimizer = tf.train.AdamOptimizer(0.001) 228 | train_op = optimizer.minimize(model.loss) 229 | update_op = tf.group(*model.update_emas) 230 | sess.run(tf.global_variables_initializer()) 231 | fetch = [model.loss, model.y_pred, train_op, update_op] 232 | loss_list = list() 233 | for i in xrange(100): 234 | X1_batch = np.zeros((batch_size, 30), dtype=float) 235 | X2_batch = np.zeros((batch_size, 150), dtype=float) 236 | y_batch = np.zeros((batch_size, 1999), dtype=int) 237 | _batch_size = len(y_batch) 238 | feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 239 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 240 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 241 | loss_list.append(loss) 242 | print(i, loss) 243 | 244 | 245 | if __name__ == '__main__': 246 | test() 247 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_2_hcnn/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | 5 | """wd_2_hcnn 6 | title 部分使用 TextCNN;content 部分使用分层的 TextCNN。 7 | """ 8 | 9 | 10 | class Settings(object): 11 | def __init__(self): 12 | self.model_name = 'wd_2_hcnn' 13 | self.title_len = self.sent_len = 30 14 | self.doc_len = 10 15 | self.sent_filter_sizes = [2, 3, 4, 5] 16 | self.doc_filter_sizes = [2, 3, 4] 17 | self.n_filter = 256 18 | self.fc_hidden_size = 1024 19 | self.n_class = 1999 20 | self.summary_path = '../../summary/' + self.model_name 
+ '/' 21 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 22 | 23 | 24 | class HCNN(object): 25 | """ 26 | title: inputs->textcnn->output_title 27 | content: inputs->hcnn->output_content 28 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 29 | """ 30 | 31 | def __init__(self, W_embedding, settings): 32 | self.model_name = settings.model_name 33 | self.sent_len = settings.sent_len 34 | self.doc_len = settings.doc_len 35 | self.sent_filter_sizes = settings.sent_filter_sizes 36 | self.doc_filter_sizes = settings.doc_filter_sizes 37 | self.n_filter = settings.n_filter 38 | self.n_class = settings.n_class 39 | self.fc_hidden_size = settings.fc_hidden_size 40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 41 | self.update_emas = list() 42 | # placeholders 43 | self._tst = tf.placeholder(tf.bool) 44 | self._keep_prob = tf.placeholder(tf.float32, []) 45 | self._batch_size = tf.placeholder(tf.int32, []) 46 | 47 | with tf.name_scope('Inputs'): 48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.sent_len], name='X1_inputs') 49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs') 50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 51 | 52 | with tf.variable_scope('embedding'): 53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 54 | initializer=tf.constant_initializer(W_embedding), trainable=True) 55 | self.embedding_size = W_embedding.shape[1] 56 | 57 | with tf.variable_scope('cnn_text'): 58 | output_title = self.cnn_inference(self._X1_inputs) 59 | 60 | with tf.variable_scope('hcnn_content'): 61 | output_content = self.hcnn_inference(self._X2_inputs) 62 | 63 | with tf.variable_scope('fc-bn-layer'): 64 | output = tf.concat([output_title, output_content], axis=1) 65 | output_size = self.n_filter * (len(self.sent_filter_sizes) + len(self.doc_filter_sizes)) 66 | W_fc = self.weight_variable([output_size, self.fc_hidden_size], name='Weight_fc') 67 | tf.summary.histogram('W_fc', W_fc) 68 | h_fc = tf.matmul(output, W_fc, name='h_fc') 69 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 70 | tf.summary.histogram('beta_fc', beta_fc) 71 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 72 | self.update_emas.append(update_ema_fc) 73 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 74 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 75 | 76 | with tf.variable_scope('out_layer'): 77 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 78 | tf.summary.histogram('Weight_out', W_out) 79 | b_out = self.bias_variable([self.n_class], name='bias_out') 80 | tf.summary.histogram('bias_out', b_out) 81 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 82 | 83 | with tf.name_scope('loss'): 84 | self._loss = tf.reduce_mean( 85 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 86 | tf.summary.scalar('loss', self._loss) 87 | 88 | self.saver = tf.train.Saver(max_to_keep=2) 89 | 90 | @property 91 | def tst(self): 92 | return self._tst 93 | 94 | @property 95 | def keep_prob(self): 96 | return self._keep_prob 97 | 98 | @property 99 | def batch_size(self): 100 | return self._batch_size 101 | 102 | @property 103 | def global_step(self): 104 | return self._global_step 105 | 106 | @property 107 | def X1_inputs(self): 108 | return self._X1_inputs 109 | 110 | 
@property 111 | def X2_inputs(self): 112 | return self._X2_inputs 113 | 114 | @property 115 | def y_inputs(self): 116 | return self._y_inputs 117 | 118 | @property 119 | def y_pred(self): 120 | return self._y_pred 121 | 122 | @property 123 | def loss(self): 124 | return self._loss 125 | 126 | def weight_variable(self, shape, name): 127 | """Create a weight variable with appropriate initialization.""" 128 | initial = tf.truncated_normal(shape, stddev=0.1) 129 | return tf.Variable(initial, name=name) 130 | 131 | def bias_variable(self, shape, name): 132 | """Create a bias variable with appropriate initialization.""" 133 | initial = tf.constant(0.1, shape=shape) 134 | return tf.Variable(initial, name=name) 135 | 136 | def batchnorm(self, Ylogits, offset, convolutional=False): 137 | """batchnormalization. 138 | Args: 139 | Ylogits: 1D向量或者是3D的卷积结果。 140 | num_updates: 迭代的global_step 141 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 142 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 143 | m: 表示batch均值;v:表示batch方差。 144 | bnepsilon:一个很小的浮点数,防止除以 0. 145 | Returns: 146 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 147 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 148 | """ 149 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, 150 | self._global_step) # adding the iteration prevents from averaging across non-existing iterations 151 | bnepsilon = 1e-5 152 | if convolutional: 153 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 154 | else: 155 | mean, variance = tf.nn.moments(Ylogits, [0]) 156 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 157 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 158 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 159 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 160 | return Ybn, update_moving_everages 161 | 162 | def textcnn(self, X_inputs, n_step, filter_sizes, embed_size): 163 | """build the TextCNN network. 
164 | n_step: the sentence len.""" 165 | inputs = tf.expand_dims(X_inputs, -1) 166 | pooled_outputs = list() 167 | for i, filter_size in enumerate(filter_sizes): 168 | with tf.name_scope("conv-maxpool-%s" % filter_size): 169 | # Convolution Layer 170 | filter_shape = [filter_size, embed_size, 1, self.n_filter] 171 | W_filter = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W_filter") 172 | beta = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.n_filter], name="beta")) 173 | tf.summary.histogram('beta', beta) 174 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 175 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) # 在激活层前面加 BN 176 | # Apply nonlinearity, batch norm scaling is not useful with relus 177 | # batch norm offsets are used instead of biases,使用 BN 层的 offset,不要 biases 178 | h = tf.nn.relu(conv_bn, name="relu") 179 | # Maxpooling over the outputs 180 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 181 | strides=[1, 1, 1, 1], padding='VALID', name="pool") 182 | pooled_outputs.append(pooled) 183 | self.update_emas.append(update_ema) 184 | h_pool = tf.concat(pooled_outputs, 3) 185 | n_filter_total = self.n_filter * len(filter_sizes) 186 | h_pool_flat = tf.reshape(h_pool, [-1, n_filter_total]) 187 | return h_pool_flat # shape = [-1, n_filter_total] 188 | 189 | def cnn_inference(self, X_inputs): 190 | """TextCNN 模型。title部分。 191 | Args: 192 | X_inputs: tensor.shape=(batch_size, title_len) 193 | Returns: 194 | title_outputs: tensor.shape=(batch_size, n_filter*filter_num_sent) 195 | """ 196 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 197 | with tf.variable_scope('title_encoder'): # 生成 title 的向量表示 198 | title_outputs = self.textcnn(inputs, self.sent_len, self.sent_filter_sizes, embed_size=self.embedding_size) 199 | return title_outputs # shape = [batch_size, n_filter*filter_num_sent] 200 | 201 | def hcnn_inference(self, X_inputs): 202 | """分层 TextCNN 模型。content部分。 203 | Args: 204 | X_inputs: tensor.shape=(batch_size, doc_len*sent_len) 205 | Returns: 206 | doc_attn_outputs: tensor.shape=(batch_size, n_filter*filter_num_doc) 207 | """ 208 | inputs = tf.nn.embedding_lookup(self.embedding, 209 | X_inputs) # inputs.shape=[batch_size, doc_len*sent_len, embedding_size] 210 | sent_inputs = tf.reshape(inputs, [self.batch_size * self.doc_len, self.sent_len, 211 | self.embedding_size]) # [batch_size*doc_len, sent_len, embedding_size] 212 | with tf.variable_scope('sentence_encoder'): # 生成句向量 213 | sent_outputs = self.textcnn(sent_inputs, self.sent_len, self.sent_filter_sizes, self.embedding_size) 214 | with tf.variable_scope('doc_encoder'): # 生成文档向量 215 | doc_inputs = tf.reshape(sent_outputs, [self.batch_size, self.doc_len, self.n_filter * len( 216 | self.sent_filter_sizes)]) # [batch_size, doc_len, n_filter*len(filter_sizes_sent)] 217 | doc_outputs = self.textcnn(doc_inputs, self.doc_len, self.doc_filter_sizes, self.n_filter * len( 218 | self.sent_filter_sizes)) # [batch_size, doc_len, n_filter*filter_num_doc] 219 | return doc_outputs # [batch_size, n_filter*len(doc_filter_sizes)] 220 | 221 | # test the model 222 | # def test(): 223 | # import numpy as np 224 | # print('Begin testing...') 225 | # settings = Settings() 226 | # W_embedding = np.random.randn(50, 10) 227 | # config = tf.ConfigProto() 228 | # config.gpu_options.allow_growth = True 229 | # batch_size = 128 230 | # with tf.Session(config=config) as sess: 231 | # model = HCNN(W_embedding, settings) 232 | # optimizer = 
tf.train.AdamOptimizer(0.001) 233 | # train_op = optimizer.minimize(model.loss) 234 | # update_op = tf.group(*model.update_emas) 235 | # sess.run(tf.global_variables_initializer()) 236 | # fetch = [model.loss, model.y_pred, train_op, update_op] 237 | # loss_list = list() 238 | # for i in xrange(100): 239 | # X1_batch = np.zeros((batch_size, 30), dtype=float) 240 | # X2_batch = np.zeros((batch_size, 10 * 30), dtype=float) 241 | # y_batch = np.zeros((batch_size, 1999), dtype=int) 242 | # _batch_size = len(y_batch) 243 | # feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 244 | # model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 245 | # loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 246 | # loss_list.append(loss) 247 | # print(i, loss) 248 | 249 | # test() 250 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_4_han/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import tensorflow.contrib.layers as layers 6 | 7 | """wd_4_han 8 | title 部分使用 bigru+attention;content 部分使用 han; 两部分输出直接 concat。 9 | """ 10 | 11 | 12 | class Settings(object): 13 | def __init__(self): 14 | self.model_name = 'wd_4_han' 15 | self.title_len = self.sent_len = 30 16 | self.doc_len = 10 17 | self.hidden_size = 256 18 | self.n_layer = 1 19 | self.fc_hidden_size = 1024 20 | self.n_class = 1999 21 | self.summary_path = '../../summary/' + self.model_name + '/' 22 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 23 | 24 | 25 | class HAN(object): 26 | """ 27 | title: inputs->bigru+attention->output_title 28 | content: inputs->sent_encoder(bigru+attention)->doc_encoder(bigru+attention)->output_content 29 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 
30 | """ 31 | 32 | def __init__(self, W_embedding, settings): 33 | self.model_name = settings.model_name 34 | self.title_len = self.sent_len = settings.sent_len 35 | self.doc_len = settings.doc_len 36 | self.hidden_size = settings.hidden_size 37 | self.n_layer = settings.n_layer 38 | self.n_class = settings.n_class 39 | self.fc_hidden_size = settings.fc_hidden_size 40 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 41 | self.update_emas = list() 42 | # placeholders 43 | self._tst = tf.placeholder(tf.bool) 44 | self._keep_prob = tf.placeholder(tf.float32, []) 45 | self._batch_size = tf.placeholder(tf.int32, []) 46 | 47 | with tf.name_scope('Inputs'): 48 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 49 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.doc_len * self.sent_len], name='X2_inputs') 50 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 51 | 52 | with tf.variable_scope('embedding'): 53 | self.embedding = tf.get_variable(name='embedding', shape=W_embedding.shape, 54 | initializer=tf.constant_initializer(W_embedding), trainable=True) 55 | self.embedding_size = W_embedding.shape[1] 56 | 57 | with tf.variable_scope('bigru_text'): 58 | output_title = self.bigru_inference(self._X1_inputs) 59 | 60 | with tf.variable_scope('han_content'): 61 | output_content = self.han_inference(self._X2_inputs) 62 | 63 | with tf.variable_scope('fc-bn-layer'): 64 | output = tf.concat([output_title, output_content], axis=1) 65 | W_fc = self.weight_variable([self.hidden_size * 4, self.fc_hidden_size], name='Weight_fc') 66 | tf.summary.histogram('W_fc', W_fc) 67 | h_fc = tf.matmul(output, W_fc, name='h_fc') 68 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 69 | tf.summary.histogram('beta_fc', beta_fc) 70 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 71 | self.update_emas.append(update_ema_fc) 72 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 73 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 74 | 75 | with tf.variable_scope('out_layer'): 76 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 77 | tf.summary.histogram('Weight_out', W_out) 78 | b_out = self.bias_variable([self.n_class], name='bias_out') 79 | tf.summary.histogram('bias_out', b_out) 80 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 81 | 82 | with tf.name_scope('loss'): 83 | self._loss = tf.reduce_mean( 84 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 85 | tf.summary.scalar('loss', self._loss) 86 | 87 | self.saver = tf.train.Saver(max_to_keep=1) 88 | 89 | @property 90 | def tst(self): 91 | return self._tst 92 | 93 | @property 94 | def keep_prob(self): 95 | return self._keep_prob 96 | 97 | @property 98 | def batch_size(self): 99 | return self._batch_size 100 | 101 | @property 102 | def global_step(self): 103 | return self._global_step 104 | 105 | @property 106 | def X1_inputs(self): 107 | return self._X1_inputs 108 | 109 | @property 110 | def X2_inputs(self): 111 | return self._X2_inputs 112 | 113 | @property 114 | def y_inputs(self): 115 | return self._y_inputs 116 | 117 | @property 118 | def y_pred(self): 119 | return self._y_pred 120 | 121 | @property 122 | def loss(self): 123 | return self._loss 124 | 125 | def weight_variable(self, shape, name): 126 | """Create a weight variable with appropriate initialization.""" 127 | 
initial = tf.truncated_normal(shape, stddev=0.1) 128 | return tf.Variable(initial, name=name) 129 | 130 | def bias_variable(self, shape, name): 131 | """Create a bias variable with appropriate initialization.""" 132 | initial = tf.constant(0.1, shape=shape) 133 | return tf.Variable(initial, name=name) 134 | 135 | def batchnorm(self, Ylogits, offset, convolutional=False): 136 | """batchnormalization. 137 | Args: 138 | Ylogits: 1D向量或者是3D的卷积结果。 139 | num_updates: 迭代的global_step 140 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 141 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 142 | m: 表示batch均值;v:表示batch方差。 143 | bnepsilon:一个很小的浮点数,防止除以 0. 144 | Returns: 145 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 146 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 147 | """ 148 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations 149 | bnepsilon = 1e-5 150 | if convolutional: 151 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 152 | else: 153 | mean, variance = tf.nn.moments(Ylogits, [0]) 154 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 155 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 156 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 157 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 158 | return Ybn, update_moving_everages 159 | 160 | def gru_cell(self): 161 | with tf.name_scope('gru_cell'): 162 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse) 163 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 164 | 165 | def bi_gru(self, inputs, seg_num): 166 | """build the bi-GRU network. Return the encoder represented vector. 167 | n_step: 句子的词数量;或者文档的句子数。 168 | seg_num: 序列的数量,原本应该为 batch_size, 但是这里将 batch_size 个 doc展开成很多个句子。 169 | """ 170 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)] 171 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)] 172 | initial_states_fw = [cell_fw.zero_state(seg_num, tf.float32) for cell_fw in cells_fw] 173 | initial_states_bw = [cell_bw.zero_state(seg_num, tf.float32) for cell_bw in cells_bw] 174 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 175 | initial_states_fw = initial_states_fw, initial_states_bw = initial_states_bw, dtype=tf.float32) 176 | # outputs: Output Tensor shaped: seg_num, max_time, layers_output],其中layers_output=hidden_size * 2 在这里。 177 | return outputs 178 | 179 | def task_specific_attention(self, inputs, output_size, 180 | initializer=layers.xavier_initializer(), 181 | activation_fn=tf.tanh, scope=None): 182 | """ 183 | Performs task-specific attention reduction, using learned 184 | attention context vector (constant within task of interest). 185 | Args: 186 | inputs: Tensor of shape [batch_size, units, input_size] 187 | `input_size` must be static (known) 188 | `units` axis will be attended over (reduced from output) 189 | `batch_size` will be preserved 190 | output_size: Size of output's inner (feature) dimension 191 | Returns: 192 | outputs: Tensor of shape [batch_size, output_dim]. 
193 | """ 194 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None 195 | with tf.variable_scope(scope or 'attention') as scope: 196 | # u_w, attention 向量 197 | attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size], 198 | initializer=initializer, dtype=tf.float32) 199 | # 全连接层,把 h_i 转为 u_i , shape= [batch_size, units, input_size] -> [batch_size, units, output_size] 200 | input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope) 201 | # 输出 [batch_size, units] 202 | vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True) 203 | attention_weights = tf.nn.softmax(vector_attn, dim=1) 204 | tf.summary.histogram('attention_weigths', attention_weights) 205 | weighted_projection = tf.multiply(inputs, attention_weights) 206 | outputs = tf.reduce_sum(weighted_projection, axis=1) 207 | return outputs # 输出 [batch_size, hidden_size*2] 208 | 209 | def bigru_inference(self, X_inputs): 210 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) 211 | output_bigru = self.bi_gru(inputs, self.batch_size) 212 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2) 213 | return output_att # 输出 [batch_size, hidden_size*2] 214 | 215 | def han_inference(self, X_inputs): 216 | """分层 attention 模型。content部分。 217 | Args: 218 | X_inputs: tensor.shape=(batch_size, doc_len*sent_len) 219 | Returns: 220 | doc_attn_outputs: tensor.shape=(batch_size, hidden_size(*2 for bigru)) 221 | """ 222 | inputs = tf.nn.embedding_lookup(self.embedding, X_inputs) # inputs.shape=[batch_size, doc_len*sent_len, embedding_size] 223 | sent_inputs = tf.reshape(inputs,[self.batch_size*self.doc_len, self.sent_len, self.embedding_size]) # shape=(?, 40, 256) 224 | with tf.variable_scope('sentence_encoder'): # 生成句向量 225 | sent_outputs = self.bi_gru(sent_inputs, seg_num=self.batch_size*self.doc_len) 226 | sent_attn_outputs = self.task_specific_attention(sent_outputs, self.hidden_size*2) # [batch_size*doc_len, hidden_size*2] 227 | with tf.variable_scope('dropout'): 228 | sent_attn_outputs = tf.nn.dropout(sent_attn_outputs, self.keep_prob) 229 | with tf.variable_scope('doc_encoder'): # 生成文档向量 230 | doc_inputs = tf.reshape(sent_attn_outputs, [self.batch_size, self.doc_len, self.hidden_size*2]) 231 | doc_outputs = self.bi_gru(doc_inputs, self.batch_size) # [batch_size, doc_len, hidden_size*2] 232 | doc_attn_outputs = self.task_specific_attention(doc_outputs, self.hidden_size*2) # [batch_size, hidden_size*2] 233 | return doc_attn_outputs # [batch_size, hidden_size*2] 234 | 235 | 236 | 237 | # test the model 238 | def test(): 239 | import numpy as np 240 | print('Begin testing...') 241 | settings = Settings() 242 | W_embedding = np.random.randn(50, 10) 243 | config = tf.ConfigProto() 244 | config.gpu_options.allow_growth = True 245 | batch_size = 128 246 | with tf.Session(config=config) as sess: 247 | model = HAN(W_embedding, settings) 248 | optimizer = tf.train.AdamOptimizer(0.001) 249 | train_op = optimizer.minimize(model.loss) 250 | update_op = tf.group(*model.update_emas) 251 | sess.run(tf.global_variables_initializer()) 252 | fetch = [model.loss, model.y_pred, train_op, update_op] 253 | loss_list = list() 254 | for i in xrange(100): 255 | X1_batch = np.zeros((batch_size, 30), dtype=float) 256 | X2_batch = np.zeros((batch_size, 10 * 30), dtype=float) 257 | y_batch = np.zeros((batch_size, 1999), dtype=int) 258 | _batch_size = len(y_batch) 259 | feed_dict = 
{model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch, 260 | model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5} 261 | loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict) 262 | loss_list.append(loss) 263 | print(i, loss) 264 | 265 | if __name__ == '__main__': 266 | test() 267 | -------------------------------------------------------------------------------- /zhihu-text-classification-master/models/wd_5_bigru_cnn/network.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | import tensorflow as tf 4 | from tensorflow.contrib import rnn 5 | import tensorflow.contrib.layers as layers 6 | 7 | """wd_5_bigru_cnn 8 | 两部分使用不同的 embedding, 因为RNN与CNN结构完全不同,共用embedding会降低性能。 9 | title 部分使用 bigru+attention;content 部分使用 textcnn; 两部分输出直接 concat。 10 | """ 11 | 12 | 13 | class Settings(object): 14 | def __init__(self): 15 | self.model_name = 'wd_5_bigru_cnn' 16 | self.title_len = 30 17 | self.content_len = 150 18 | self.hidden_size = 256 19 | self.n_layer = 1 20 | self.filter_sizes = [2, 3, 4, 5, 7] 21 | self.n_filter = 256 22 | self.fc_hidden_size = 1024 23 | self.n_class = 1999 24 | self.summary_path = '../../summary/' + self.model_name + '/' 25 | self.ckpt_path = '../../ckpt/' + self.model_name + '/' 26 | 27 | 28 | class BiGRU_CNN(object): 29 | """ 30 | title: inputs->bigru+attention->output_title 31 | content: inputs->textcnn->output_content 32 | concat[output_title, output_content] -> fc+bn+relu -> sigmoid_entropy. 33 | """ 34 | 35 | def __init__(self, W_embedding, settings): 36 | self.model_name = settings.model_name 37 | self.title_len = settings.title_len 38 | self.content_len = settings.content_len 39 | self.hidden_size = settings.hidden_size 40 | self.n_layer = settings.n_layer 41 | self.filter_sizes = settings.filter_sizes 42 | self.n_filter = settings.n_filter 43 | self.n_filter_total = self.n_filter * len(self.filter_sizes) 44 | self.n_class = settings.n_class 45 | self.fc_hidden_size = settings.fc_hidden_size 46 | self._global_step = tf.Variable(0, trainable=False, name='Global_Step') 47 | self.update_emas = list() 48 | # placeholders 49 | self._tst = tf.placeholder(tf.bool) 50 | self._keep_prob = tf.placeholder(tf.float32, []) 51 | self._batch_size = tf.placeholder(tf.int32, []) 52 | 53 | with tf.name_scope('Inputs'): 54 | self._X1_inputs = tf.placeholder(tf.int64, [None, self.title_len], name='X1_inputs') 55 | self._X2_inputs = tf.placeholder(tf.int64, [None, self.content_len], name='X2_inputs') 56 | self._y_inputs = tf.placeholder(tf.float32, [None, self.n_class], name='y_input') 57 | 58 | with tf.variable_scope('embedding'): 59 | self.title_embedding = tf.get_variable(name='title_embedding', shape=W_embedding.shape, 60 | initializer=tf.constant_initializer(W_embedding), trainable=True) 61 | self.content_embedding = tf.get_variable(name='content_embedding', shape=W_embedding.shape, 62 | initializer=tf.constant_initializer(W_embedding), trainable=True) 63 | self.embedding_size = W_embedding.shape[1] 64 | 65 | with tf.variable_scope('bigru_text'): 66 | output_title = self.bigru_inference(self._X1_inputs) 67 | 68 | with tf.variable_scope('cnn_content'): 69 | output_content = self.cnn_inference(self._X2_inputs, self.content_len) 70 | 71 | with tf.variable_scope('fc-bn-layer'): 72 | output = tf.concat([output_title, output_content], axis=1) 73 | W_fc = self.weight_variable([self.hidden_size*2 + self.n_filter_total, self.fc_hidden_size], name='Weight_fc') 74 | 
tf.summary.histogram('W_fc', W_fc) 75 | h_fc = tf.matmul(output, W_fc, name='h_fc') 76 | beta_fc = tf.Variable(tf.constant(0.1, tf.float32, shape=[self.fc_hidden_size], name="beta_fc")) 77 | tf.summary.histogram('beta_fc', beta_fc) 78 | fc_bn, update_ema_fc = self.batchnorm(h_fc, beta_fc, convolutional=False) 79 | self.update_emas.append(update_ema_fc) 80 | self.fc_bn_relu = tf.nn.relu(fc_bn, name="relu") 81 | fc_bn_drop = tf.nn.dropout(self.fc_bn_relu, self.keep_prob) 82 | 83 | with tf.variable_scope('out_layer'): 84 | W_out = self.weight_variable([self.fc_hidden_size, self.n_class], name='Weight_out') 85 | tf.summary.histogram('Weight_out', W_out) 86 | b_out = self.bias_variable([self.n_class], name='bias_out') 87 | tf.summary.histogram('bias_out', b_out) 88 | self._y_pred = tf.nn.xw_plus_b(fc_bn_drop, W_out, b_out, name='y_pred') # 每个类别的分数 scores 89 | 90 | with tf.name_scope('loss'): 91 | self._loss = tf.reduce_mean( 92 | tf.nn.sigmoid_cross_entropy_with_logits(logits=self._y_pred, labels=self._y_inputs)) 93 | tf.summary.scalar('loss', self._loss) 94 | 95 | self.saver = tf.train.Saver(max_to_keep=1) 96 | 97 | @property 98 | def tst(self): 99 | return self._tst 100 | 101 | @property 102 | def keep_prob(self): 103 | return self._keep_prob 104 | 105 | @property 106 | def batch_size(self): 107 | return self._batch_size 108 | 109 | @property 110 | def global_step(self): 111 | return self._global_step 112 | 113 | @property 114 | def X1_inputs(self): 115 | return self._X1_inputs 116 | 117 | @property 118 | def X2_inputs(self): 119 | return self._X2_inputs 120 | 121 | @property 122 | def y_inputs(self): 123 | return self._y_inputs 124 | 125 | @property 126 | def y_pred(self): 127 | return self._y_pred 128 | 129 | @property 130 | def loss(self): 131 | return self._loss 132 | 133 | def weight_variable(self, shape, name): 134 | """Create a weight variable with appropriate initialization.""" 135 | initial = tf.truncated_normal(shape, stddev=0.1) 136 | return tf.Variable(initial, name=name) 137 | 138 | def bias_variable(self, shape, name): 139 | """Create a bias variable with appropriate initialization.""" 140 | initial = tf.constant(0.1, shape=shape) 141 | return tf.Variable(initial, name=name) 142 | 143 | def batchnorm(self, Ylogits, offset, convolutional=False): 144 | """batchnormalization. 145 | Args: 146 | Ylogits: 1D向量或者是3D的卷积结果。 147 | num_updates: 迭代的global_step 148 | offset:表示beta,全局均值;在 RELU 激活中一般初始化为 0.1。 149 | scale:表示lambda,全局方差;在 sigmoid 激活中需要,这 RELU 激活中作用不大。 150 | m: 表示batch均值;v:表示batch方差。 151 | bnepsilon:一个很小的浮点数,防止除以 0. 
152 | Returns: 153 | Ybn: 和 Ylogits 的维度一样,就是经过 Batch Normalization 处理的结果。 154 | update_moving_everages:更新mean和variance,主要是给最后的 test 使用。 155 | """ 156 | exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, self._global_step) # adding the iteration prevents from averaging across non-existing iterations 157 | bnepsilon = 1e-5 158 | if convolutional: 159 | mean, variance = tf.nn.moments(Ylogits, [0, 1, 2]) 160 | else: 161 | mean, variance = tf.nn.moments(Ylogits, [0]) 162 | update_moving_everages = exp_moving_avg.apply([mean, variance]) 163 | m = tf.cond(self.tst, lambda: exp_moving_avg.average(mean), lambda: mean) 164 | v = tf.cond(self.tst, lambda: exp_moving_avg.average(variance), lambda: variance) 165 | Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon) 166 | return Ybn, update_moving_everages 167 | 168 | def gru_cell(self): 169 | with tf.name_scope('gru_cell'): 170 | cell = rnn.GRUCell(self.hidden_size, reuse=tf.get_variable_scope().reuse) 171 | return rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob) 172 | 173 | def bi_gru(self, inputs): 174 | """build the bi-GRU network. 返回个所有层的隐含状态。""" 175 | cells_fw = [self.gru_cell() for _ in range(self.n_layer)] 176 | cells_bw = [self.gru_cell() for _ in range(self.n_layer)] 177 | initial_states_fw = [cell_fw.zero_state(self.batch_size, tf.float32) for cell_fw in cells_fw] 178 | initial_states_bw = [cell_bw.zero_state(self.batch_size, tf.float32) for cell_bw in cells_bw] 179 | outputs, _, _ = rnn.stack_bidirectional_dynamic_rnn(cells_fw, cells_bw, inputs, 180 | initial_states_fw=initial_states_fw, 181 | initial_states_bw=initial_states_bw, dtype=tf.float32) 182 | return outputs 183 | 184 | def task_specific_attention(self, inputs, output_size, 185 | initializer=layers.xavier_initializer(), 186 | activation_fn=tf.tanh, scope=None): 187 | """ 188 | Performs task-specific attention reduction, using learned 189 | attention context vector (constant within task of interest). 190 | Args: 191 | inputs: Tensor of shape [batch_size, units, input_size] 192 | `input_size` must be static (known) 193 | `units` axis will be attended over (reduced from output) 194 | `batch_size` will be preserved 195 | output_size: Size of output's inner (feature) dimension 196 | Returns: 197 | outputs: Tensor of shape [batch_size, output_dim]. 
198 | """ 199 | assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None 200 | with tf.variable_scope(scope or 'attention') as scope: 201 | # u_w, attention 向量 202 | attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size], 203 | initializer=initializer, dtype=tf.float32) 204 | # 全连接层,把 h_i 转为 u_i , shape= [batch_size, units, input_size] -> [batch_size, units, output_size] 205 | input_projection = layers.fully_connected(inputs, output_size, activation_fn=activation_fn, scope=scope) 206 | # 输出 [batch_size, units] 207 | vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keep_dims=True) 208 | attention_weights = tf.nn.softmax(vector_attn, dim=1) 209 | tf.summary.histogram('attention_weigths', attention_weights) 210 | weighted_projection = tf.multiply(inputs, attention_weights) 211 | outputs = tf.reduce_sum(weighted_projection, axis=1) 212 | return outputs # 输出 [batch_size, hidden_size*2] 213 | 214 | def bigru_inference(self, X_inputs): 215 | inputs = tf.nn.embedding_lookup(self.title_embedding, X_inputs) 216 | output_bigru = self.bi_gru(inputs) 217 | output_att = self.task_specific_attention(output_bigru, self.hidden_size*2) 218 | return output_att 219 | 220 | def cnn_inference(self, X_inputs, n_step): 221 | """TextCNN 模型。 222 | Args: 223 | X_inputs: tensor.shape=(batch_size, n_step) 224 | Returns: 225 | title_outputs: tensor.shape=(batch_size, self.n_filter_total) 226 | """ 227 | inputs = tf.nn.embedding_lookup(self.content_embedding, X_inputs) 228 | inputs = tf.expand_dims(inputs, -1) 229 | pooled_outputs = list() 230 | for i, filter_size in enumerate(self.filter_sizes): 231 | with tf.variable_scope("conv-maxpool-%s" % filter_size): 232 | # Convolution Layer 233 | filter_shape = [filter_size, self.embedding_size, 1, self.n_filter] 234 | W_filter = self.weight_variable(shape=filter_shape, name='W_filter') 235 | beta = self.bias_variable(shape=[self.n_filter], name='beta_filter') 236 | tf.summary.histogram('beta', beta) 237 | conv = tf.nn.conv2d(inputs, W_filter, strides=[1, 1, 1, 1], padding="VALID", name="conv") 238 | conv_bn, update_ema = self.batchnorm(conv, beta, convolutional=True) 239 | # Apply nonlinearity, batch norm scaling is not useful with relus 240 | h = tf.nn.relu(conv_bn, name="relu") 241 | # Maxpooling over the outputs 242 | pooled = tf.nn.max_pool(h, ksize=[1, n_step - filter_size + 1, 1, 1], 243 | strides=[1, 1, 1, 1], padding='VALID', name="pool") 244 | pooled_outputs.append(pooled) 245 | self.update_emas.append(update_ema) 246 | h_pool = tf.concat(pooled_outputs, 3) 247 | h_pool_flat = tf.reshape(h_pool, [-1, self.n_filter_total]) 248 | return h_pool_flat # shape = [batch_size, self.n_filter_total] 249 | 250 | 251 | # test the model 252 | def test(): 253 | import numpy as np 254 | print('Begin testing...') 255 | settings = Settings() 256 | W_embedding = np.random.randn(50, 10) 257 | config = tf.ConfigProto() 258 | config.gpu_options.allow_growth = True 259 | batch_size = 128 260 | with tf.Session(config=config) as sess: 261 | model = BiGRU_CNN(W_embedding, settings) 262 | optimizer = tf.train.AdamOptimizer(0.001) 263 | train_op = optimizer.minimize(model.loss) 264 | update_op = tf.group(*model.update_emas) 265 | sess.run(tf.global_variables_initializer()) 266 | fetch = [model.loss, model.y_pred, train_op, update_op] 267 | loss_list = list() 268 | for i in xrange(100): 269 | X1_batch = np.zeros((batch_size, 30), dtype=float) 270 | X2_batch = np.zeros((batch_size, 150), 
dtype=float)
271 |             y_batch = np.zeros((batch_size, 1999), dtype=int)
272 |             _batch_size = len(y_batch)
273 |             feed_dict = {model.X1_inputs: X1_batch, model.X2_inputs: X2_batch, model.y_inputs: y_batch,
274 |                          model.batch_size: _batch_size, model.tst: False, model.keep_prob: 0.5}
275 |             loss, y_pred, _, _ = sess.run(fetch, feed_dict=feed_dict)
276 |             loss_list.append(loss)
277 |             print(i, loss)
278 | 
279 | if __name__ == '__main__':
280 |     test()
281 | 
--------------------------------------------------------------------------------
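The attention reduction shared by wd_4_han and wd_5_bigru_cnn (task_specific_attention) collapses bi-GRU outputs of shape [batch_size, n_steps, hidden_size*2] into a single [batch_size, hidden_size*2] vector: each timestep is projected through a fully connected tanh layer, scored against a learned context vector, the scores are softmax-normalized over the timestep axis, and the original outputs are summed with those weights. Below is a minimal NumPy sketch of that computation only; the shapes, random weights, and the small softmax helper are illustrative stand-ins, not code from the repository.

# -*- coding:utf-8 -*-
# Standalone NumPy illustration of the attention reduction performed by
# task_specific_attention(); shapes and weights here are illustrative only.
import numpy as np


def softmax(x, axis):
    # numerically stable softmax along the given axis (helper for this sketch)
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)


batch_size, n_steps, hidden = 4, 30, 512                    # hidden = hidden_size*2 for a bi-GRU
rng = np.random.RandomState(0)
inputs = rng.randn(batch_size, n_steps, hidden)             # stand-in for the bi-GRU outputs h_i

W_proj = 0.1 * rng.randn(hidden, hidden)                    # fully connected projection h_i -> u_i
context = 0.1 * rng.randn(hidden)                           # learned attention context vector u_w

u = np.tanh(np.matmul(inputs, W_proj))                      # [batch_size, n_steps, hidden]
scores = (u * context).sum(axis=2, keepdims=True)           # [batch_size, n_steps, 1]
alpha = softmax(scores, axis=1)                             # attention weights over timesteps
output = (inputs * alpha).sum(axis=1)                       # [batch_size, hidden]
print(output.shape)                                         # (4, 512)

In han_inference this reduction is applied twice: once over the sent_len word steps of every sentence (with seg_num = batch_size * doc_len sequences) and once over the doc_len sentence vectors, which is how a [batch_size, doc_len*sent_len] block of word ids becomes one document vector.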