├── data
│   ├── .DS_Store
│   └── cnn_pred_200.npy
├── image
│   └── result.png
├── model
│   ├── .DS_Store
│   └── checkpoint
├── origin_data
│   └── .DS_Store
├── README.md
├── test_pred.py
├── preprocess.py
├── init.py
├── cnnmodel.py
└── rnn_cnn_model.py

/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/data/.DS_Store
--------------------------------------------------------------------------------

/image/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/image/result.png
--------------------------------------------------------------------------------

/model/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/model/.DS_Store
--------------------------------------------------------------------------------

/data/cnn_pred_200.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/data/cnn_pred_200.npy
--------------------------------------------------------------------------------

/origin_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/origin_data/.DS_Store
--------------------------------------------------------------------------------

/model/checkpoint:
--------------------------------------------------------------------------------
model_checkpoint_path: "cnnmodel_total_450.ckpt"
all_model_checkpoint_paths: "cnnmodel_total_450.ckpt"
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
## smp2018 User Profiling Technical Evaluation: A Baseline Solution

The [User Profiling Technical Evaluation](https://biendata.com/competition/smpeupt2018/) provides a dataset of media-content documents. Each participating team must apply a suitable algorithm to identify the author type of every document, deciding whether it was produced by human writing, machine writing, machine translation, or machine auto-summarization. In short, this is a text-classification task.

The original training set, used for model learning, contains 146,421 documents: 31,034 automatic summaries, 36,206 machine translations, 48,018 human-authored documents, and 31,163 machine-authored documents. From each class, 5,000 documents are held out as a test set and the remainder is used for training (a short sketch of this split is given at the end of this README).

Word-level CNN and RNN+CNN models are used for the classification.

## Results
On the self-constructed test set, the RNN+CNN model reaches 98% accuracy. The result of the RNN+CNN model trained on the full data in the first evaluation round is shown below:

![](./image/result.png)

## Environment
- Python (>=3.6.1)
- TensorFlow (=1.6.0)
- jieba

## Directory structure

- data: intermediate data generated by the scripts
- model: trained models
- origin_data: the original txt files
- result: model prediction results

## Code structure
- preprocess.py: data preprocessing; parses the original JSON-formatted files
- init.py: data initialization; splits the data into training and test sets
- cnnmodel.py: the CNN model
- rnn_cnn_model.py: the RNN+CNN model
- test_pred.py: computes the accuracy of the model predictions

## How to run
python preprocess.py      # data preprocessing
python init.py            # data initialization
python cnnmodel.py        # train the CNN model and view its results
python rnn_cnn_model.py   # train the RNN+CNN model and view its results
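
For reference, here is a minimal sketch of the per-class split described above (it is not part of the original code): after deduplication, each class keeps 5,000 randomly sampled documents for the test set and uses the rest for training, which is what `preprocess.py` implements. The function name and the `docs_by_class` argument are illustrative.

```python
import random

def split_by_class(docs_by_class, n_test=5000):
    """docs_by_class: {label: [document, ...]} -> (train, test) lists of (doc, label) pairs."""
    train, test = [], []
    for label, docs in docs_by_class.items():
        docs = list(set(docs))                      # deduplicate, as preprocess.py does
        test_docs = random.sample(docs, n_test)     # 5,000 held-out documents per class
        train_docs = set(docs) - set(test_docs)
        test += [(doc, label) for doc in test_docs]
        train += [(doc, label) for doc in train_docs]
    return train, test
```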
--------------------------------------------------------------------------------

/test_pred.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import random
import tensorflow.contrib.layers as layers
from tqdm import tqdm
import time
import pickle


def test(save_path):
    pred = np.load(save_path)
    with open('data/ytest.pkl','rb') as input:
        ytest = pickle.load(input)

    pred = np.array(pred)
    ytest = np.array(ytest)
    result = np.equal(pred,ytest)
    #print (result)
    result = list(result)
    positive = result.count(True)
    print (positive)
    print (positive/len(ytest))

def validation():
    pred = np.load('data/validation_labels_epoch3.npy')
    with open('data/id_validation.pkl','rb') as input:
        id_validationn = pickle.load(input)

    assert (len(pred) == len(id_validationn))
    length = len(pred)
    with open('result/origin_rnn_cnn_total_c300_r300_3.csv','w',encoding='utf-8') as output:
        for i in range(length):
            tmp_line = str(id_validationn[i]) + ',' + pred[i] + '\n'
            output.write(tmp_line)

#test('data/pred.npy')
#validation()
--------------------------------------------------------------------------------

/preprocess.py:
--------------------------------------------------------------------------------
import json
import random
import pickle


def init_train():

    with open('origin_data/training.txt','r',encoding='utf-8') as input:
        lines = input.readlines()

    dict_nums = {}
    dict_content = {}  # {index: [content]}
    print (len(lines))
    num = 0

    for line in lines:
        try:
            line = line.strip()
            items = json.loads(line)
            index = items['标签']
            content = items['内容']

            if index not in dict_nums.keys():
                dict_nums[index] = 0
                dict_nums[index] += 1
            else:
                dict_nums[index] += 1

            if index not in dict_content.keys():
                dict_content[index] = []
                dict_content[index].append(content)
            else:
                dict_content[index].append(content)

        except:
            print (line)

        num += 1
        if num % 1000 == 0:
            print(num)

    print (dict_nums,len(dict_content))

    #{'自动摘要': 31034, '机器翻译': 36206, '人类作者': 48018, '机器作者': 31163}
    '''
    with open('origin_data/human','r',encoding='utf-8') as input:
        human = input.readlines()

    with open('origin_data/machine','r',encoding='utf-8') as input:
        machine = input.readlines()

    with open('origin_data/summary','r',encoding='utf-8') as input:
        summary = input.readlines()

    with open('origin_data/translate','r',encoding='utf-8') as input:
        translate = input.readlines()

    print (len(summary),len(translate),len(human),len(machine))
    '''
    dict_index = {'自动摘要': 0, '机器翻译': 1, '人类作者': 2, '机器作者': 3}

    train_total = []
    test_total = []
    for key in dict_content.keys():
        contents = dict_content[key]
        contents = list(set(contents))

        tmp_len = len(contents)
        len_test = 5000

        contents_test = random.sample(contents,len_test)
        contents_train = list(set(contents) - set(contents_test))
        print (len(contents))
        print (len(contents_test))
        print (len(contents_train))

        train_total += [(x, dict_index[key]) for x in contents_train]
        test_total += [(x, dict_index[key]) for x in contents_test]

    print ('\n')
    print (len(train_total))
    print (len(test_total))

    '''
    for i in train_total:
        if train_total.count(i) > 1:
            print (i)

    for i in test_total:
        if test_total.count(i) > 1:
            print (i)
    '''

    train_total = list(set(train_total))
    test_total = list(set(test_total))

    print ('\n')
    print (len(train_total))
    print (len(test_total))

    xtrain = [x[0] for x in train_total]
    ytrain = [x[1] for x in train_total]
    xtest = [x[0] for x in test_total]
    ytest = [x[1] for x in
test_total] 106 | 107 | print (len(xtrain),len(ytrain),len(xtest),len(ytest)) 108 | 109 | 110 | with open('data/xtrain.pkl','wb') as output: 111 | pickle.dump(xtrain,output) 112 | 113 | with open('data/ytrain.pkl','wb') as output: 114 | pickle.dump(ytrain,output) 115 | 116 | with open('data/xtest.pkl','wb') as output: 117 | pickle.dump(xtest,output) 118 | 119 | with open('data/ytest.pkl','wb') as output: 120 | pickle.dump(ytest,output) 121 | 122 | 123 | def init_validation(): 124 | with open('origin_data/validation.txt','r',encoding='utf-8') as input: 125 | lines = input.readlines() 126 | 127 | x_validation = [] 128 | id_validation = [] 129 | 130 | for line in lines: 131 | try: 132 | line = line.strip() 133 | items = json.loads(line) 134 | id = items['id'] 135 | content = items['内容'] 136 | x_validation.append(content) 137 | id_validation.append(id) 138 | except: 139 | print (line) 140 | 141 | 142 | with open('data/x_validation.pkl','wb') as output: 143 | pickle.dump(x_validation,output) 144 | 145 | with open('data/id_validation.pkl','wb') as output: 146 | pickle.dump(id_validation,output) 147 | 148 | init_train() 149 | #init_validation() 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /init.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import jieba 3 | import re 4 | from tqdm import tqdm 5 | import numpy as np 6 | 7 | 8 | 9 | def init_train(): 10 | with open('data/xtrain.pkl','rb') as input: 11 | xtrain = pickle.load(input) 12 | 13 | with open('data/ytrain.pkl','rb') as input: 14 | ytrain = pickle.load(input) 15 | 16 | with open('data/xtest.pkl','rb') as input: 17 | xtest = pickle.load(input) 18 | 19 | with open('data/ytest.pkl','rb') as input: 20 | ytest = pickle.load(input) 21 | 22 | print (123) 23 | 24 | print (len(xtrain),len(ytrain),len(xtest),len(ytest)) 25 | 26 | set_len = set() 27 | list_len = [] 28 | fixlen = 350 29 | 30 | ''' 31 | for x in xtrain: 32 | tmp_sentence = re.split('。|?|!',x) 33 | set_len.add(len(tmp_sentence)) 34 | list_len.append(len(tmp_sentence)) 35 | for sentence in tmp_sentence: 36 | words = list (jieba.cut(sentence)) 37 | print (words) 38 | avg_len = sum(list_len) / len(list_len) 39 | print (123) 40 | ''' 41 | 42 | ''' 43 | for x in tqdm(xtrain): 44 | words = list(jieba.cut(x)) 45 | list_len.append(len(words)) 46 | 47 | avg_len = sum(list_len) / len(list_len) 48 | print (avg_len) 49 | print (123) 50 | ''' 51 | 52 | xtrain_words = [] 53 | xtest_words = [] 54 | 55 | for x in tqdm(xtrain): 56 | xtrain_words.append(list(jieba.cut(x))) 57 | 58 | for x in tqdm(xtest): 59 | xtest_words.append(list(jieba.cut(x))) 60 | 61 | 62 | dict_word2id = {} 63 | 64 | for x in tqdm(xtrain_words): 65 | #words = list (jieba.cut(x)) 66 | words = x[:fixlen] 67 | for word in words: 68 | if word not in dict_word2id.keys(): 69 | dict_word2id[word] = len(dict_word2id) 70 | #break 71 | #break 72 | #print (words) 73 | print (len(dict_word2id)) 74 | 75 | for x in tqdm (xtest_words): 76 | #words = list (jieba.cut(x)) 77 | words = x[:fixlen] 78 | for word in words: 79 | if word not in dict_word2id.keys(): 80 | dict_word2id[word] = len(dict_word2id) 81 | #break 82 | #break 83 | #print (words) 84 | print (len(dict_word2id)) 85 | 86 | with open('data/dict_word2id.pkl','wb') as output: 87 | pickle.dump(dict_word2id,output) 88 | 89 | 90 | with open('data/dict_word2id.pkl','rb') as input: 91 | dict_word2id = pickle.load(input) 92 | 93 | dict_word2id['UNK'] = len(dict_word2id) 94 | 
dict_word2id['BLANK'] = len(dict_word2id) 95 | print (len(dict_word2id)) 96 | 97 | list_xtrain = []#[[wordid*fixlen],[wordid*fixlen] ... ] 98 | list_xtest = []#[[wordid*fixlen],[wordid*fixlen] ... ] 99 | for x in tqdm(xtrain_words): 100 | #words = list (jieba.cut(x)) 101 | words = x[:fixlen] 102 | 103 | tmp_train = [] 104 | for i in range(fixlen): 105 | wordid = dict_word2id['BLANK'] 106 | tmp_train.append(wordid) 107 | 108 | for index in range(len(words)): 109 | if words[index] not in dict_word2id.keys(): 110 | wordid = dict_word2id['UNK'] 111 | else: 112 | wordid = dict_word2id[words[index]] 113 | tmp_train[index] = wordid 114 | 115 | list_xtrain.append(tmp_train) 116 | #break 117 | #break 118 | #print (words) 119 | 120 | for x in tqdm(xtest_words): 121 | #words = list (jieba.cut(x)) 122 | words = x[:fixlen] 123 | 124 | tmp_test = [] 125 | for i in range(fixlen): 126 | wordid = dict_word2id['BLANK'] 127 | tmp_test.append(wordid) 128 | 129 | for index in range(len(words)): 130 | if words[index] not in dict_word2id.keys(): 131 | wordid = dict_word2id['UNK'] 132 | else: 133 | wordid = dict_word2id[words[index]] 134 | tmp_test[index] = wordid 135 | 136 | list_xtest.append(tmp_test) 137 | 138 | list_xtest = np.array(list_xtest) 139 | list_xtrain = np.array(list_xtrain) 140 | 141 | np.save('data/list_xtrain.npy',list_xtrain) 142 | np.save('data/list_xtest.npy',list_xtest) 143 | 144 | list_ytrain = [] #[[4*one hot],[4*one hot],[4*one hot]] 145 | list_ytest = [] #[[4*one hot],[4*one hot],[4*one hot]] 146 | 147 | for y in ytrain: 148 | tmp_label = [0 for _ in range(4)] 149 | tmp_label[y] = 1 150 | list_ytrain.append(tmp_label) 151 | #break 152 | 153 | for y in ytest: 154 | tmp_label = [0 for _ in range(4)] 155 | tmp_label[y] = 1 156 | list_ytest.append(tmp_label) 157 | #break 158 | 159 | list_ytrain = np.array(list_ytrain) 160 | list_ytest = np.array(list_ytest) 161 | np.save('data/list_ytrain.npy',list_ytrain) 162 | np.save('data/list_ytest.npy',list_ytest) 163 | 164 | print (123) 165 | 166 | 167 | 168 | 169 | def init_validation(): 170 | 171 | with open('data/x_validation.pkl','rb') as input: 172 | x_validation = pickle.load(input) 173 | 174 | with open('data/dict_word2id.pkl', 'rb') as input: 175 | dict_word2id = pickle.load(input) 176 | 177 | dict_word2id['UNK'] = len(dict_word2id) 178 | dict_word2id['BLANK'] = len(dict_word2id) 179 | print('dict_word2id:',len(dict_word2id)) 180 | 181 | fixlen = 350 182 | 183 | x_validation_words = [] 184 | 185 | for x in tqdm(x_validation): 186 | x_validation_words.append(list(jieba.cut(x))) 187 | 188 | list_x_validation = [] #[[wordid*fixlen],[wordid*fixlen] ... 
] 189 | for x in tqdm(x_validation_words): 190 | # words = list (jieba.cut(x)) 191 | words = x[:fixlen] 192 | 193 | tmp_train = [] 194 | for i in range(fixlen): 195 | wordid = dict_word2id['BLANK'] 196 | tmp_train.append(wordid) 197 | 198 | for index in range(len(words)): 199 | if words[index] not in dict_word2id.keys(): 200 | wordid = dict_word2id['UNK'] 201 | else: 202 | wordid = dict_word2id[words[index]] 203 | tmp_train[index] = wordid 204 | 205 | list_x_validation.append(tmp_train) 206 | # break 207 | # break 208 | # print (words) 209 | list_x_validation = np.array(list_x_validation) 210 | np.save('data/list_x_validation.npy', list_x_validation) 211 | print ('list_x_validation:',len(list_x_validation)) 212 | 213 | def init_lengths(): 214 | list_xtrain = np.load('data/list_xtrain.npy') 215 | list_xtest = np.load('data/list_xtest.npy') 216 | list_x_validation = np.load('data/list_x_validation.npy') 217 | 218 | list_xtrain_len = [] 219 | list_xtest_len = [] 220 | list_x_validation_len = [] 221 | 222 | 223 | for i in list_xtrain: 224 | i = list(i) 225 | 226 | if 435614 in i: 227 | tmp_len = i.index(435614) + 1 228 | else: 229 | tmp_len=350 230 | 231 | list_xtrain_len.append(tmp_len) 232 | 233 | for i in list_xtest: 234 | i = list(i) 235 | 236 | if 435614 in i: 237 | tmp_len = i.index(435614) + 1 238 | else: 239 | tmp_len=350 240 | 241 | list_xtest_len.append(tmp_len) 242 | 243 | for i in list_x_validation: 244 | i = list(i) 245 | 246 | if 435614 in i: 247 | tmp_len = i.index(435614) + 1 248 | else: 249 | tmp_len=350 250 | 251 | list_x_validation_len.append(tmp_len) 252 | 253 | list_xtrain_len = np.array(list_xtrain_len) 254 | list_xtest_len = np.array(list_xtest_len) 255 | list_x_validation_len = np.array(list_x_validation_len) 256 | 257 | np.save('data/list_xtrain_len.npy',list_xtrain_len) 258 | np.save('data/list_xtest_len.npy', list_xtest_len) 259 | np.save('data/list_x_validation_len.npy', list_x_validation_len) 260 | 261 | 262 | #init_validation() 263 | 264 | init_lengths() -------------------------------------------------------------------------------- /cnnmodel.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import random 4 | import tensorflow.contrib.layers as layers 5 | from tqdm import tqdm 6 | import time 7 | import pickle 8 | import test_pred 9 | 10 | 11 | class Settings(object): 12 | def __init__(self): 13 | self.vocab_size = 435615 14 | self.len_sentence = 350 15 | self.num_epochs = 3 16 | self.num_classes = 4 17 | self.cnn_size = 300 18 | self.num_layers = 1 19 | self.word_embedding = 50 20 | self.keep_prob = 0.5 21 | self.batch_size = 300 22 | self.num_steps = 10000 23 | self.lr= 0.001 24 | 25 | 26 | class CNN(): 27 | def __init__(self, setting): 28 | self.vocab_size = setting.vocab_size 29 | self.len_sentence = len_sentence = setting.len_sentence 30 | self.num_epochs = setting.num_epochs 31 | self.num_classes = num_classes = setting.num_classes 32 | self.cnn_size = setting.cnn_size 33 | self.num_layers = setting.num_layers 34 | self.word_embedding = setting.word_embedding 35 | self.lr = setting.lr 36 | 37 | 38 | self.input_word = tf.placeholder(dtype=tf.int32, shape=[None, len_sentence], name='input_word') 39 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y') 40 | self.keep_prob = tf.placeholder(tf.float32) 41 | 42 | word_embedding = tf.get_variable('word_embedding',[self.vocab_size, self.word_embedding]) 43 | self.inputs = 
tf.nn.embedding_lookup(word_embedding, self.input_word) 44 | self.inputs = tf.reshape(self.inputs, [-1,self.len_sentence,self.word_embedding,1] ) 45 | 46 | 47 | #卷积层 48 | conv = layers.conv2d(inputs =self.inputs ,num_outputs = self.cnn_size ,kernel_size = [3,self.word_embedding],stride=[1,self.word_embedding],padding='SAME') 49 | 50 | # pooling层 51 | max_pool = layers.max_pool2d(conv, kernel_size=[self.len_sentence, 1], stride=[1, 1]) 52 | self.sentence = tf.reshape(max_pool, [-1, self.cnn_size]) 53 | 54 | # dropout层 55 | tanh = tf.nn.tanh(self.sentence) 56 | drop = layers.dropout(tanh, keep_prob=self.keep_prob) 57 | 58 | # 全连接层 59 | self.outputs = layers.fully_connected(inputs=drop, num_outputs=self.num_classes, activation_fn=tf.nn.softmax) 60 | 61 | # loss 62 | self.cross_loss = -tf.reduce_mean( tf.log(tf.reduce_sum( self.input_y * self.outputs ,axis=1))) 63 | self.l2_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(0.0001), 64 | weights_list=tf.trainable_variables()) 65 | self.final_loss = self.cross_loss + self.l2_loss 66 | 67 | # accuracy 68 | self.pred = tf.argmax(self.outputs, axis=1) 69 | self.pred_prob = tf.reduce_max(self.outputs, axis=1) 70 | 71 | self.y_label = tf.argmax(self.input_y, axis=1) 72 | self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.pred, self.y_label), 'float')) 73 | 74 | # minimize loss 75 | optimizer = tf.train.AdamOptimizer(self.lr) 76 | self.train_op = optimizer.minimize(self.final_loss) 77 | 78 | 79 | 80 | def train (save_path,cnn_size): 81 | 82 | print('reading training data') 83 | 84 | list_xtrain = np.load('data/list_xtrain.npy') 85 | list_xtest = np.load('data/list_xtest.npy') 86 | list_ytrain = np.load('data/list_ytrain.npy') 87 | list_ytest = np.load('data/list_ytest.npy') 88 | 89 | 90 | 91 | assert(len(list_ytrain) == len(list_xtrain) and len(list_ytest) == len(list_xtest)) 92 | 93 | 94 | list_xtrain = list(list_xtrain) 95 | list_xtest = list(list_xtest) 96 | list_ytrain = list(list_ytrain) 97 | list_ytest = list(list_ytest) 98 | list_xtrain += list_xtest 99 | list_ytrain += list_ytest 100 | assert (len(list_ytrain) == len(list_xtrain)) 101 | 102 | 103 | print (len(list_xtrain),len(list_ytrain)) 104 | 105 | settings = Settings() 106 | settings.num_classes = len(list_ytrain[0]) 107 | settings.num_steps = (len(list_xtrain) // settings.batch_size) +1 108 | settings.cnn_size = cnn_size 109 | 110 | with tf.Graph().as_default(): 111 | sess = tf.Session() 112 | with sess.as_default(): 113 | initializer = tf.contrib.layers.xavier_initializer() 114 | with tf.variable_scope("model", reuse=None, initializer=initializer): 115 | model = CNN(setting=settings) 116 | 117 | sess.run(tf.global_variables_initializer()) 118 | saver = tf.train.Saver() 119 | #saver.restore(sess, save_path=save_path) 120 | 121 | for epoch in range(1, settings.num_epochs + 1): 122 | 123 | bar = tqdm(range(settings.num_steps), desc='epoch {}, loss=0.000000, accuracy=0.000000'.format(epoch)) 124 | for _ in bar: 125 | 126 | sample_list = random.sample(range(len(list_ytrain)), settings.batch_size) 127 | batch_train_word = [list_xtrain[x] for x in sample_list] 128 | batch_train_y = [list_ytrain[x] for x in sample_list] 129 | 130 | feed_dict = {} 131 | feed_dict[model.input_word] = batch_train_word 132 | feed_dict[model.input_y] = batch_train_y 133 | feed_dict[model.keep_prob] = settings.keep_prob 134 | _,loss,accuracy=sess.run([model.train_op, model.final_loss, model.accuracy],feed_dict=feed_dict) 135 | bar.set_description('epoch {} loss={:.6f} 
accuracy={:.6f}'.format(epoch, loss, accuracy)) 136 | saver.save(sess, save_path=save_path) 137 | 138 | 139 | def test (save_path,cnn_size): 140 | 141 | result = []#[labels] 142 | 143 | list_xtest = np.load('data/list_xtest.npy') 144 | list_ytest = np.load('data/list_ytest.npy') 145 | assert (len(list_ytest) == len(list_xtest)) 146 | 147 | settings = Settings() 148 | settings.num_classes = len(list_ytest[0]) 149 | settings.num_steps = (len(list_xtest) // settings.batch_size) + 1 150 | settings.cnn_size = cnn_size 151 | 152 | with tf.Graph().as_default(): 153 | sess = tf.Session() 154 | with sess.as_default(): 155 | with tf.variable_scope("model"): 156 | model = CNN(setting=settings) 157 | 158 | saver = tf.train.Saver() 159 | saver.restore(sess, save_path=save_path) 160 | 161 | for i in tqdm(range(settings.num_steps + 1)): 162 | 163 | batch_test_word = list_xtest[settings.batch_size * i: settings.batch_size * (i + 1)] 164 | batch_test_y = list_ytest[settings.batch_size * i: settings.batch_size * (i + 1)] 165 | 166 | feed_dict = {} 167 | feed_dict[model.input_word] = batch_test_word 168 | feed_dict[model.input_y] = batch_test_y 169 | feed_dict[model.keep_prob] = 1 170 | pred = sess.run([model.pred],feed_dict=feed_dict) 171 | pred = list(pred[0]) 172 | result += pred 173 | return result 174 | 175 | 176 | 177 | 178 | def validation(save_path,cnn_size): 179 | 180 | result = []#[labels] 181 | 182 | list_xtest = np.load('data/list_x_validation.npy') 183 | dict_index = {0:'自动摘要', 1:'机器翻译',2:'人类作者',3:'机器作者'} 184 | #list_ytest = np.load('data/list_ytest.npy') 185 | #assert (len(list_ytest) == len(list_xtest)) 186 | 187 | settings = Settings() 188 | settings.num_classes = 4 189 | settings.num_steps = (len(list_xtest) // settings.batch_size) + 1 190 | settings.cnn_size = cnn_size 191 | 192 | with tf.Graph().as_default(): 193 | sess = tf.Session() 194 | with sess.as_default(): 195 | with tf.variable_scope("model"): 196 | model = CNN(setting=settings) 197 | 198 | saver = tf.train.Saver() 199 | saver.restore(sess, save_path=save_path) 200 | 201 | for i in tqdm(range(settings.num_steps + 1)): 202 | 203 | batch_test_word = list_xtest[settings.batch_size * i: settings.batch_size * (i + 1)] 204 | #batch_test_y = [0 for _ in range(settings.batch_size)] 205 | 206 | feed_dict = {} 207 | feed_dict[model.input_word] = batch_test_word 208 | #feed_dict[model.input_y] = batch_test_y 209 | feed_dict[model.keep_prob] = 1 210 | pred = sess.run([model.pred],feed_dict=feed_dict) 211 | pred = list(pred[0]) 212 | result += pred 213 | 214 | assert (len(result) == len(list_xtest)) 215 | validation_labels = [dict_index[x] for x in result ] 216 | return validation_labels 217 | 218 | 219 | 220 | 221 | cnn_size = 300 222 | print ('cnn_size:',cnn_size) 223 | print ('\n') 224 | save_path = 'model/cnnmodel_total_'+str(cnn_size)+'.ckpt' 225 | train(save_path,cnn_size) 226 | pred = test (save_path,cnn_size) 227 | 228 | np.save('data/cnn_total_pred_'+ str(cnn_size) +'.npy',pred) 229 | test_pred.test(save_path ='data/cnn_total_pred_'+ str(cnn_size) +'.npy') 230 | print ('\n') 231 | 232 | 233 | #validation_labels = validation('model/cnnmodel_total_450.ckpt',cnn_size) 234 | #np.save('data/validation_labels.npy',validation_labels) 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /rnn_cnn_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import 
random 4 | import tensorflow.contrib.layers as layers 5 | from tqdm import tqdm 6 | import time 7 | import pickle 8 | import test_pred 9 | 10 | 11 | class Settings(object): 12 | def __init__(self): 13 | self.vocab_size = 435615 14 | self.len_sentence = 350 15 | self.num_epochs = 1 16 | self.num_classes = 4 17 | self.cnn_size = 300 18 | self.num_layers = 1 19 | self.word_embedding = 50 20 | self.keep_prob = 0.5 21 | self.batch_size = 300 22 | self.num_steps = 10000 23 | self.lr= 0.001 24 | self.gru_size = 300 25 | 26 | 27 | class RNN_CNN(): 28 | def __init__(self, setting): 29 | self.vocab_size = setting.vocab_size 30 | self.len_sentence = len_sentence = setting.len_sentence 31 | self.num_epochs = setting.num_epochs 32 | self.num_classes = num_classes = setting.num_classes 33 | self.cnn_size = setting.cnn_size 34 | self.num_layers = setting.num_layers 35 | self.word_embedding = setting.word_embedding 36 | self.lr = setting.lr 37 | self.gru_size = setting.gru_size 38 | 39 | self.input_word = tf.placeholder(dtype=tf.int32, shape=[None, len_sentence], name='input_word') 40 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y') 41 | self.keep_prob = tf.placeholder(tf.float32) 42 | self.input_length = tf.placeholder(tf.int32, [None]) 43 | 44 | 45 | word_embedding = tf.get_variable('word_embedding',[self.vocab_size, self.word_embedding]) 46 | 47 | self.input_data = tf.nn.embedding_lookup(word_embedding, self.input_word) 48 | 49 | 50 | 51 | #双向GRU层 52 | gru_fw_cell = tf.nn.rnn_cell.GRUCell(self.gru_size) 53 | gru_bw_cell = tf.nn.rnn_cell.GRUCell(self.gru_size) 54 | 55 | (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell, 56 | cell_bw=gru_bw_cell, 57 | inputs=self.input_data, 58 | sequence_length=self.input_length, 59 | dtype=tf.float32, 60 | scope="BiGRU") 61 | 62 | self.rnn_outputs = tf.concat([output_fw, output_bw], axis=2) 63 | 64 | 65 | cnn_input = tf.reshape(self.rnn_outputs, [-1,self.len_sentence,self.gru_size*2,1] ) 66 | 67 | # 卷积层 68 | conv = layers.conv2d(inputs=cnn_input, num_outputs=self.cnn_size, kernel_size=[3, self.gru_size*2], 69 | stride=[1, self.gru_size*2], padding='SAME') 70 | 71 | # pooling层 72 | max_pool = layers.max_pool2d(conv, kernel_size=[self.len_sentence, 1], stride=[1, 1]) 73 | self.sentence = tf.reshape(max_pool, [-1, self.cnn_size]) 74 | 75 | # dropout层 76 | tanh = tf.nn.tanh(self.sentence) 77 | drop = layers.dropout(tanh, keep_prob=self.keep_prob) 78 | 79 | # 全连接层 80 | self.outputs = layers.fully_connected(inputs=drop, num_outputs=self.num_classes, activation_fn=tf.nn.softmax) 81 | 82 | # loss 83 | self.cross_loss = -tf.reduce_mean(tf.log(tf.reduce_sum(self.input_y * self.outputs, axis=1))) 84 | self.l2_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(0.0001), 85 | weights_list=tf.trainable_variables()) 86 | self.final_loss = self.cross_loss + self.l2_loss 87 | 88 | # accuracy 89 | self.pred = tf.argmax(self.outputs, axis=1) 90 | self.pred_prob = tf.reduce_max(self.outputs, axis=1) 91 | 92 | self.y_label = tf.argmax(self.input_y, axis=1) 93 | self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.pred, self.y_label), 'float')) 94 | 95 | # minimize loss 96 | optimizer = tf.train.AdamOptimizer(self.lr) 97 | self.train_op = optimizer.minimize(self.final_loss) 98 | 99 | def train (save_path,cnn_num,rnn_num,i): 100 | 101 | print('reading training data') 102 | 103 | list_xtrain = np.load('data/list_xtrain.npy') 104 | list_xtest = np.load('data/list_xtest.npy') 
105 | list_ytrain = np.load('data/list_ytrain.npy') 106 | list_ytest = np.load('data/list_ytest.npy') 107 | list_xtrain_len = np.load('data/list_xtrain_len.npy') 108 | list_xtest_len = np.load('data/list_xtest_len.npy') 109 | 110 | assert(len(list_ytrain) == len(list_xtrain) and len(list_xtrain) == len(list_xtrain_len) ) 111 | 112 | 113 | list_xtrain = list(list_xtrain) 114 | list_xtest = list(list_xtest) 115 | list_ytrain = list(list_ytrain) 116 | list_ytest = list(list_ytest) 117 | list_xtrain += list_xtest 118 | list_ytrain += list_ytest 119 | assert (len(list_ytrain) == len(list_xtrain)) 120 | 121 | 122 | print (len(list_xtrain),len(list_ytrain)) 123 | 124 | settings = Settings() 125 | settings.num_classes = len(list_ytrain[0]) 126 | settings.num_steps = (len(list_xtrain) // settings.batch_size) +1 127 | settings.cnn_size = cnn_num 128 | settings.gru_size = rnn_num 129 | 130 | with tf.Graph().as_default(): 131 | sess = tf.Session() 132 | with sess.as_default(): 133 | initializer = tf.contrib.layers.xavier_initializer() 134 | with tf.variable_scope("model", reuse=None, initializer=initializer): 135 | model = RNN_CNN(setting=settings) 136 | 137 | sess.run(tf.global_variables_initializer()) 138 | saver = tf.train.Saver() 139 | if i !=0: 140 | saver.restore(sess, save_path=save_path) 141 | 142 | for epoch in range(1, settings.num_epochs + 1): 143 | 144 | bar = tqdm(range(settings.num_steps), desc='epoch {}, loss=0.000000, accuracy=0.000000'.format(epoch)) 145 | for _ in bar: 146 | #for _ in range(settings.num_steps): 147 | 148 | sample_list = random.sample(range(len(list_ytrain)), settings.batch_size) 149 | batch_train_word = [list_xtrain[x] for x in sample_list] 150 | batch_train_y = [list_ytrain[x] for x in sample_list] 151 | batch_train_len = [list_xtrain_len[x] for x in sample_list] 152 | 153 | feed_dict = {} 154 | feed_dict[model.input_word] = batch_train_word 155 | feed_dict[model.input_y] = batch_train_y 156 | feed_dict[model.keep_prob] = settings.keep_prob 157 | feed_dict[model.input_length] = batch_train_len 158 | ''' 159 | output = sess.run(model.rnn_outputs,feed_dict=feed_dict) 160 | print (output.shape) 161 | ''' 162 | _,loss,accuracy=sess.run([model.train_op, model.final_loss, model.accuracy],feed_dict=feed_dict) 163 | bar.set_description('epoch {} loss={:.6f} accuracy={:.6f}'.format(epoch, loss, accuracy)) 164 | #break 165 | saver.save(sess, save_path=save_path) 166 | #break 167 | 168 | 169 | 170 | 171 | def test (save_path,cnn_num,rnn_num): 172 | 173 | result = []#[labels] 174 | list_xtest = np.load('data/list_xtest.npy') 175 | list_ytest = np.load('data/list_ytest.npy') 176 | list_xtest_len = np.load('data/list_xtest_len.npy') 177 | 178 | assert (len(list_ytest) == len(list_xtest)) 179 | 180 | settings = Settings() 181 | settings.num_classes = len(list_ytest[0]) 182 | settings.num_steps = (len(list_xtest) // settings.batch_size) + 1 183 | settings.cnn_size = cnn_num 184 | settings.gru_size = rnn_num 185 | 186 | with tf.Graph().as_default(): 187 | sess = tf.Session() 188 | with sess.as_default(): 189 | with tf.variable_scope("model"): 190 | model = RNN_CNN(setting=settings) 191 | 192 | saver = tf.train.Saver() 193 | 194 | saver.restore(sess, save_path=save_path) 195 | 196 | #for i in range(settings.num_steps + 1): 197 | for i in tqdm(range(settings.num_steps + 1)): 198 | 199 | batch_test_word = list_xtest[settings.batch_size * i: settings.batch_size * (i + 1)] 200 | batch_test_y = list_ytest[settings.batch_size * i: settings.batch_size * (i + 1)] 201 | batch_train_len = 
list_xtest_len[settings.batch_size * i: settings.batch_size * (i + 1)] 202 | 203 | feed_dict = {} 204 | feed_dict[model.input_word] = batch_test_word 205 | feed_dict[model.input_y] = batch_test_y 206 | feed_dict[model.keep_prob] = 1 207 | feed_dict[model.input_length] = batch_train_len 208 | 209 | pred = sess.run([model.pred],feed_dict=feed_dict) 210 | pred = list(pred[0]) 211 | result += pred 212 | return result 213 | 214 | 215 | def validation(save_path): 216 | 217 | result = []#[labels] 218 | 219 | list_xtest = np.load('data/list_x_validation.npy') 220 | list_xtest_len = np.load('data/list_x_validation_len.npy') 221 | dict_index = {0:'自动摘要', 1:'机器翻译',2:'人类作者',3:'机器作者'} 222 | #list_ytest = np.load('data/list_ytest.npy') 223 | assert (len(list_xtest) == len(list_xtest_len)) 224 | 225 | settings = Settings() 226 | settings.num_classes = 4 227 | settings.num_steps = (len(list_xtest) // settings.batch_size) + 1 228 | 229 | with tf.Graph().as_default(): 230 | sess = tf.Session() 231 | with sess.as_default(): 232 | with tf.variable_scope("model"): 233 | model = RNN_CNN(setting=settings) 234 | 235 | saver = tf.train.Saver() 236 | saver.restore(sess, save_path=save_path) 237 | 238 | for i in tqdm(range(settings.num_steps + 1)): 239 | 240 | batch_test_word = list_xtest[settings.batch_size * i: settings.batch_size * (i + 1)] 241 | batch_train_len = list_xtest_len[settings.batch_size * i: settings.batch_size * (i + 1)] 242 | #batch_test_y = [0 for _ in range(settings.batch_size)] 243 | 244 | feed_dict = {} 245 | feed_dict[model.input_word] = batch_test_word 246 | feed_dict[model.input_length] = batch_train_len 247 | #feed_dict[model.input_y] = batch_test_y 248 | feed_dict[model.keep_prob] = 1 249 | pred = sess.run([model.pred],feed_dict=feed_dict) 250 | pred = list(pred[0]) 251 | result += pred 252 | 253 | assert (len(result) == len(list_xtest)) 254 | validation_labels = [dict_index[x] for x in result ] 255 | return validation_labels 256 | 257 | rnn_num = 200 258 | cnn_num = 300 259 | 260 | for i in range (3): 261 | print('rnn_num,cnn_num,i:', rnn_num, cnn_num, i) 262 | save_path = 'model/origin_rnncnn_model_total_c'+str(cnn_num)+'_r'+str(rnn_num)+'.ckpt' 263 | train(save_path,cnn_num,rnn_num,i) 264 | pred = test (save_path,cnn_num,rnn_num) 265 | np.save('data/origin_rnncnn_pred.npy',pred) 266 | test_pred.test(save_path ='data/origin_rnncnn_pred.npy') 267 | print ('\n') 268 | 269 | 270 | #validation_labels = validation(save_path) 271 | #np.save('data/validation_labels.npy',validation_labels) --------------------------------------------------------------------------------
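
As a closing note, here is a hedged sketch of how a single new document could be classified with the trained RNN+CNN checkpoint. It is illustrative only and is not part of the repository: it assumes that `data/dict_word2id.pkl` and a checkpoint saved by `rnn_cnn_model.py` (e.g. `model/origin_rnncnn_model_total_c300_r200.ckpt`, with cnn_size=300 and gru_size=200 as in the training loop above) already exist, and that the module-level training code in `rnn_cnn_model.py` has been placed behind an `if __name__ == '__main__':` guard so that `RNN_CNN` and `Settings` can be imported without starting a training run.

predict_one.py (illustrative sketch, not part of the repository):
--------------------------------------------------------------------------------
# Illustrative only. Assumes rnn_cnn_model.py is importable without side effects
# (its training loop guarded by `if __name__ == '__main__':`) and that the
# artifacts referenced below were produced by init.py and rnn_cnn_model.py.
import pickle

import jieba
import numpy as np
import tensorflow as tf

from rnn_cnn_model import RNN_CNN, Settings

FIXLEN = 350  # fixed document length used by init.py and both models
CLASS_NAMES = {0: '自动摘要', 1: '机器翻译', 2: '人类作者', 3: '机器作者'}


def encode(text, word2id, fixlen=FIXLEN):
    # Same scheme as init.py: jieba segmentation, truncate to fixlen,
    # pad with BLANK, map out-of-vocabulary words to UNK.
    words = list(jieba.cut(text))[:fixlen]
    ids = [word2id['BLANK']] * fixlen
    for i, word in enumerate(words):
        ids[i] = word2id.get(word, word2id['UNK'])
    # init_lengths() counts up to and including the first BLANK position.
    length = min(len(words) + 1, fixlen)
    return ids, length


def predict(text, ckpt_path='model/origin_rnncnn_model_total_c300_r200.ckpt'):
    with open('data/dict_word2id.pkl', 'rb') as f:
        word2id = pickle.load(f)
    # The pickled dictionary is saved before UNK/BLANK are added, so add them
    # here in the same order as init.py.
    word2id.setdefault('UNK', len(word2id))
    word2id.setdefault('BLANK', len(word2id))

    ids, length = encode(text, word2id)

    settings = Settings()
    settings.cnn_size = 300   # must match the sizes used when training
    settings.gru_size = 200

    with tf.Graph().as_default():
        with tf.Session() as sess:
            with tf.variable_scope("model"):
                model = RNN_CNN(setting=settings)
            tf.train.Saver().restore(sess, ckpt_path)
            pred = sess.run(model.pred, feed_dict={
                model.input_word: np.array([ids]),
                model.input_length: np.array([length]),
                model.keep_prob: 1.0,
            })
    return CLASS_NAMES[int(pred[0])]


if __name__ == '__main__':
    print(predict('这里是一段待分类的新闻文本。'))
--------------------------------------------------------------------------------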