├── data
│   ├── .DS_Store
│   └── cnn_pred_200.npy
├── image
│   └── result.png
├── model
│   ├── .DS_Store
│   └── checkpoint
├── origin_data
│   └── .DS_Store
├── README.md
├── test_pred.py
├── preprocess.py
├── init.py
├── cnnmodel.py
└── rnn_cnn_model.py

/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/data/.DS_Store
--------------------------------------------------------------------------------

/image/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/image/result.png
--------------------------------------------------------------------------------

/model/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/model/.DS_Store
--------------------------------------------------------------------------------

/data/cnn_pred_200.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/data/cnn_pred_200.npy
--------------------------------------------------------------------------------

/origin_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xuyanfu/smp2018/HEAD/origin_data/.DS_Store
--------------------------------------------------------------------------------

/model/checkpoint:
--------------------------------------------------------------------------------
model_checkpoint_path: "cnnmodel_total_450.ckpt"
all_model_checkpoint_paths: "cnnmodel_total_450.ckpt"
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
## smp2018 User Profiling Technical Evaluation: A Baseline Solution

The [User Profiling Technical Evaluation](https://biendata.com/competition/smpeupt2018/) provides a dataset of media-content documents. Each participating team must apply a suitable algorithm to identify the author type of every document, deciding whether it was produced by human writing, machine writing, machine translation, or machine auto-summarization. In short, this is a text-classification task.

The original training set, used for model learning, contains 146,421 documents: 31,034 automatic summaries, 36,206 machine translations, 48,018 human-authored documents, and 31,163 machine-authored documents. From each class, 5,000 documents are held out as a test set and the remainder is used for training (a short sketch of this split is given at the end of this README).

Word-level CNN and RNN+CNN models are used for the classification.

## Results
On the self-constructed test set, the RNN+CNN model reaches 98% accuracy. The result of the RNN+CNN model trained on the full data in the first evaluation round is shown below:

![](./image/result.png)

## Environment
- Python (>=3.6.1)
- TensorFlow (=1.6.0)
- jieba

## Directory structure

- data: intermediate data generated by the scripts
- model: trained models
- origin_data: the original txt files
- result: model prediction results

## Code structure
- preprocess.py: data preprocessing; parses the original JSON-formatted files
- init.py: data initialization; splits the data into training and test sets
- cnnmodel.py: the CNN model
- rnn_cnn_model.py: the RNN+CNN model
- test_pred.py: computes the accuracy of the model predictions

## How to run
python preprocess.py      # data preprocessing
python init.py            # data initialization
python cnnmodel.py        # train the CNN model and view its results
python rnn_cnn_model.py   # train the RNN+CNN model and view its results
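
For reference, here is a minimal sketch of the per-class split described above (it is not part of the original code): after deduplication, each class keeps 5,000 randomly sampled documents for the test set and uses the rest for training, which is what `preprocess.py` implements. The function name and the `docs_by_class` argument are illustrative.

```python
import random

def split_by_class(docs_by_class, n_test=5000):
    """docs_by_class: {label: [document, ...]} -> (train, test) lists of (doc, label) pairs."""
    train, test = [], []
    for label, docs in docs_by_class.items():
        docs = list(set(docs))                      # deduplicate, as preprocess.py does
        test_docs = random.sample(docs, n_test)     # 5,000 held-out documents per class
        train_docs = set(docs) - set(test_docs)
        test += [(doc, label) for doc in test_docs]
        train += [(doc, label) for doc in train_docs]
    return train, test
```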
--------------------------------------------------------------------------------

/test_pred.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import random
import tensorflow.contrib.layers as layers
from tqdm import tqdm
import time
import pickle


def test(save_path):
    pred = np.load(save_path)
    with open('data/ytest.pkl','rb') as input:
        ytest = pickle.load(input)

    pred = np.array(pred)
    ytest = np.array(ytest)
    result = np.equal(pred,ytest)
    #print (result)
    result = list(result)
    positive = result.count(True)
    print (positive)
    print (positive/len(ytest))

def validation():
    pred = np.load('data/validation_labels_epoch3.npy')
    with open('data/id_validation.pkl','rb') as input:
        id_validationn = pickle.load(input)

    assert (len(pred) == len(id_validationn))
    length = len(pred)
    with open('result/origin_rnn_cnn_total_c300_r300_3.csv','w',encoding='utf-8') as output:
        for i in range(length):
            tmp_line = str(id_validationn[i]) + ',' + pred[i] + '\n'
            output.write(tmp_line)

#test('data/pred.npy')
#validation()
--------------------------------------------------------------------------------

/preprocess.py:
--------------------------------------------------------------------------------
import json
import random
import pickle


def init_train():

    with open('origin_data/training.txt','r',encoding='utf-8') as input:
        lines = input.readlines()

    dict_nums = {}
    dict_content = {}  # {index: [content]}
    print (len(lines))
    num = 0

    for line in lines:
        try:
            line = line.strip()
            items = json.loads(line)
            index = items['标签']
            content = items['内容']

            if index not in dict_nums.keys():
                dict_nums[index] = 0
                dict_nums[index] += 1
            else:
                dict_nums[index] += 1

            if index not in dict_content.keys():
                dict_content[index] = []
                dict_content[index].append(content)
            else:
                dict_content[index].append(content)

        except:
            print (line)

        num += 1
        if num % 1000 == 0:
            print(num)

    print (dict_nums,len(dict_content))

    #{'自动摘要': 31034, '机器翻译': 36206, '人类作者': 48018, '机器作者': 31163}
    '''
    with open('origin_data/human','r',encoding='utf-8') as input:
        human = input.readlines()

    with open('origin_data/machine','r',encoding='utf-8') as input:
        machine = input.readlines()

    with open('origin_data/summary','r',encoding='utf-8') as input:
        summary = input.readlines()

    with open('origin_data/translate','r',encoding='utf-8') as input:
        translate = input.readlines()

    print (len(summary),len(translate),len(human),len(machine))
    '''
    dict_index = {'自动摘要': 0, '机器翻译': 1, '人类作者': 2, '机器作者': 3}

    train_total = []
    test_total = []
    for key in dict_content.keys():
        contents = dict_content[key]
        contents = list(set(contents))

        tmp_len = len(contents)
        len_test = 5000

        contents_test = random.sample(contents,len_test)
        contents_train = list(set(contents) - set(contents_test))
        print (len(contents))
        print (len(contents_test))
        print (len(contents_train))

        train_total += [(x, dict_index[key]) for x in contents_train]
        test_total += [(x, dict_index[key]) for x in contents_test]

    print ('\n')
    print (len(train_total))
    print (len(test_total))

    '''
    for i in train_total:
        if train_total.count(i) > 1:
            print (i)

    for i in test_total:
        if test_total.count(i) > 1:
            print (i)
    '''

    train_total = list(set(train_total))
    test_total = list(set(test_total))

    print ('\n')
    print (len(train_total))
    print (len(test_total))

    xtrain = [x[0] for x in train_total]
    ytrain = [x[1] for x in train_total]
    xtest = [x[0] for x in test_total]
    ytest = [x[1] for x in
test_total] 106 | 107 | print (len(xtrain),len(ytrain),len(xtest),len(ytest)) 108 | 109 | 110 | with open('data/xtrain.pkl','wb') as output: 111 | pickle.dump(xtrain,output) 112 | 113 | with open('data/ytrain.pkl','wb') as output: 114 | pickle.dump(ytrain,output) 115 | 116 | with open('data/xtest.pkl','wb') as output: 117 | pickle.dump(xtest,output) 118 | 119 | with open('data/ytest.pkl','wb') as output: 120 | pickle.dump(ytest,output) 121 | 122 | 123 | def init_validation(): 124 | with open('origin_data/validation.txt','r',encoding='utf-8') as input: 125 | lines = input.readlines() 126 | 127 | x_validation = [] 128 | id_validation = [] 129 | 130 | for line in lines: 131 | try: 132 | line = line.strip() 133 | items = json.loads(line) 134 | id = items['id'] 135 | content = items['内容'] 136 | x_validation.append(content) 137 | id_validation.append(id) 138 | except: 139 | print (line) 140 | 141 | 142 | with open('data/x_validation.pkl','wb') as output: 143 | pickle.dump(x_validation,output) 144 | 145 | with open('data/id_validation.pkl','wb') as output: 146 | pickle.dump(id_validation,output) 147 | 148 | init_train() 149 | #init_validation() 150 | 151 | 152 | 153 | -------------------------------------------------------------------------------- /init.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import jieba 3 | import re 4 | from tqdm import tqdm 5 | import numpy as np 6 | 7 | 8 | 9 | def init_train(): 10 | with open('data/xtrain.pkl','rb') as input: 11 | xtrain = pickle.load(input) 12 | 13 | with open('data/ytrain.pkl','rb') as input: 14 | ytrain = pickle.load(input) 15 | 16 | with open('data/xtest.pkl','rb') as input: 17 | xtest = pickle.load(input) 18 | 19 | with open('data/ytest.pkl','rb') as input: 20 | ytest = pickle.load(input) 21 | 22 | print (123) 23 | 24 | print (len(xtrain),len(ytrain),len(xtest),len(ytest)) 25 | 26 | set_len = set() 27 | list_len = [] 28 | fixlen = 350 29 | 30 | ''' 31 | for x in xtrain: 32 | tmp_sentence = re.split('。|?|!',x) 33 | set_len.add(len(tmp_sentence)) 34 | list_len.append(len(tmp_sentence)) 35 | for sentence in tmp_sentence: 36 | words = list (jieba.cut(sentence)) 37 | print (words) 38 | avg_len = sum(list_len) / len(list_len) 39 | print (123) 40 | ''' 41 | 42 | ''' 43 | for x in tqdm(xtrain): 44 | words = list(jieba.cut(x)) 45 | list_len.append(len(words)) 46 | 47 | avg_len = sum(list_len) / len(list_len) 48 | print (avg_len) 49 | print (123) 50 | ''' 51 | 52 | xtrain_words = [] 53 | xtest_words = [] 54 | 55 | for x in tqdm(xtrain): 56 | xtrain_words.append(list(jieba.cut(x))) 57 | 58 | for x in tqdm(xtest): 59 | xtest_words.append(list(jieba.cut(x))) 60 | 61 | 62 | dict_word2id = {} 63 | 64 | for x in tqdm(xtrain_words): 65 | #words = list (jieba.cut(x)) 66 | words = x[:fixlen] 67 | for word in words: 68 | if word not in dict_word2id.keys(): 69 | dict_word2id[word] = len(dict_word2id) 70 | #break 71 | #break 72 | #print (words) 73 | print (len(dict_word2id)) 74 | 75 | for x in tqdm (xtest_words): 76 | #words = list (jieba.cut(x)) 77 | words = x[:fixlen] 78 | for word in words: 79 | if word not in dict_word2id.keys(): 80 | dict_word2id[word] = len(dict_word2id) 81 | #break 82 | #break 83 | #print (words) 84 | print (len(dict_word2id)) 85 | 86 | with open('data/dict_word2id.pkl','wb') as output: 87 | pickle.dump(dict_word2id,output) 88 | 89 | 90 | with open('data/dict_word2id.pkl','rb') as input: 91 | dict_word2id = pickle.load(input) 92 | 93 | dict_word2id['UNK'] = len(dict_word2id) 94 | 
dict_word2id['BLANK'] = len(dict_word2id) 95 | print (len(dict_word2id)) 96 | 97 | list_xtrain = []#[[wordid*fixlen],[wordid*fixlen] ... ] 98 | list_xtest = []#[[wordid*fixlen],[wordid*fixlen] ... ] 99 | for x in tqdm(xtrain_words): 100 | #words = list (jieba.cut(x)) 101 | words = x[:fixlen] 102 | 103 | tmp_train = [] 104 | for i in range(fixlen): 105 | wordid = dict_word2id['BLANK'] 106 | tmp_train.append(wordid) 107 | 108 | for index in range(len(words)): 109 | if words[index] not in dict_word2id.keys(): 110 | wordid = dict_word2id['UNK'] 111 | else: 112 | wordid = dict_word2id[words[index]] 113 | tmp_train[index] = wordid 114 | 115 | list_xtrain.append(tmp_train) 116 | #break 117 | #break 118 | #print (words) 119 | 120 | for x in tqdm(xtest_words): 121 | #words = list (jieba.cut(x)) 122 | words = x[:fixlen] 123 | 124 | tmp_test = [] 125 | for i in range(fixlen): 126 | wordid = dict_word2id['BLANK'] 127 | tmp_test.append(wordid) 128 | 129 | for index in range(len(words)): 130 | if words[index] not in dict_word2id.keys(): 131 | wordid = dict_word2id['UNK'] 132 | else: 133 | wordid = dict_word2id[words[index]] 134 | tmp_test[index] = wordid 135 | 136 | list_xtest.append(tmp_test) 137 | 138 | list_xtest = np.array(list_xtest) 139 | list_xtrain = np.array(list_xtrain) 140 | 141 | np.save('data/list_xtrain.npy',list_xtrain) 142 | np.save('data/list_xtest.npy',list_xtest) 143 | 144 | list_ytrain = [] #[[4*one hot],[4*one hot],[4*one hot]] 145 | list_ytest = [] #[[4*one hot],[4*one hot],[4*one hot]] 146 | 147 | for y in ytrain: 148 | tmp_label = [0 for _ in range(4)] 149 | tmp_label[y] = 1 150 | list_ytrain.append(tmp_label) 151 | #break 152 | 153 | for y in ytest: 154 | tmp_label = [0 for _ in range(4)] 155 | tmp_label[y] = 1 156 | list_ytest.append(tmp_label) 157 | #break 158 | 159 | list_ytrain = np.array(list_ytrain) 160 | list_ytest = np.array(list_ytest) 161 | np.save('data/list_ytrain.npy',list_ytrain) 162 | np.save('data/list_ytest.npy',list_ytest) 163 | 164 | print (123) 165 | 166 | 167 | 168 | 169 | def init_validation(): 170 | 171 | with open('data/x_validation.pkl','rb') as input: 172 | x_validation = pickle.load(input) 173 | 174 | with open('data/dict_word2id.pkl', 'rb') as input: 175 | dict_word2id = pickle.load(input) 176 | 177 | dict_word2id['UNK'] = len(dict_word2id) 178 | dict_word2id['BLANK'] = len(dict_word2id) 179 | print('dict_word2id:',len(dict_word2id)) 180 | 181 | fixlen = 350 182 | 183 | x_validation_words = [] 184 | 185 | for x in tqdm(x_validation): 186 | x_validation_words.append(list(jieba.cut(x))) 187 | 188 | list_x_validation = [] #[[wordid*fixlen],[wordid*fixlen] ... 
] 189 | for x in tqdm(x_validation_words): 190 | # words = list (jieba.cut(x)) 191 | words = x[:fixlen] 192 | 193 | tmp_train = [] 194 | for i in range(fixlen): 195 | wordid = dict_word2id['BLANK'] 196 | tmp_train.append(wordid) 197 | 198 | for index in range(len(words)): 199 | if words[index] not in dict_word2id.keys(): 200 | wordid = dict_word2id['UNK'] 201 | else: 202 | wordid = dict_word2id[words[index]] 203 | tmp_train[index] = wordid 204 | 205 | list_x_validation.append(tmp_train) 206 | # break 207 | # break 208 | # print (words) 209 | list_x_validation = np.array(list_x_validation) 210 | np.save('data/list_x_validation.npy', list_x_validation) 211 | print ('list_x_validation:',len(list_x_validation)) 212 | 213 | def init_lengths(): 214 | list_xtrain = np.load('data/list_xtrain.npy') 215 | list_xtest = np.load('data/list_xtest.npy') 216 | list_x_validation = np.load('data/list_x_validation.npy') 217 | 218 | list_xtrain_len = [] 219 | list_xtest_len = [] 220 | list_x_validation_len = [] 221 | 222 | 223 | for i in list_xtrain: 224 | i = list(i) 225 | 226 | if 435614 in i: 227 | tmp_len = i.index(435614) + 1 228 | else: 229 | tmp_len=350 230 | 231 | list_xtrain_len.append(tmp_len) 232 | 233 | for i in list_xtest: 234 | i = list(i) 235 | 236 | if 435614 in i: 237 | tmp_len = i.index(435614) + 1 238 | else: 239 | tmp_len=350 240 | 241 | list_xtest_len.append(tmp_len) 242 | 243 | for i in list_x_validation: 244 | i = list(i) 245 | 246 | if 435614 in i: 247 | tmp_len = i.index(435614) + 1 248 | else: 249 | tmp_len=350 250 | 251 | list_x_validation_len.append(tmp_len) 252 | 253 | list_xtrain_len = np.array(list_xtrain_len) 254 | list_xtest_len = np.array(list_xtest_len) 255 | list_x_validation_len = np.array(list_x_validation_len) 256 | 257 | np.save('data/list_xtrain_len.npy',list_xtrain_len) 258 | np.save('data/list_xtest_len.npy', list_xtest_len) 259 | np.save('data/list_x_validation_len.npy', list_x_validation_len) 260 | 261 | 262 | #init_validation() 263 | 264 | init_lengths() -------------------------------------------------------------------------------- /cnnmodel.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import random 4 | import tensorflow.contrib.layers as layers 5 | from tqdm import tqdm 6 | import time 7 | import pickle 8 | import test_pred 9 | 10 | 11 | class Settings(object): 12 | def __init__(self): 13 | self.vocab_size = 435615 14 | self.len_sentence = 350 15 | self.num_epochs = 3 16 | self.num_classes = 4 17 | self.cnn_size = 300 18 | self.num_layers = 1 19 | self.word_embedding = 50 20 | self.keep_prob = 0.5 21 | self.batch_size = 300 22 | self.num_steps = 10000 23 | self.lr= 0.001 24 | 25 | 26 | class CNN(): 27 | def __init__(self, setting): 28 | self.vocab_size = setting.vocab_size 29 | self.len_sentence = len_sentence = setting.len_sentence 30 | self.num_epochs = setting.num_epochs 31 | self.num_classes = num_classes = setting.num_classes 32 | self.cnn_size = setting.cnn_size 33 | self.num_layers = setting.num_layers 34 | self.word_embedding = setting.word_embedding 35 | self.lr = setting.lr 36 | 37 | 38 | self.input_word = tf.placeholder(dtype=tf.int32, shape=[None, len_sentence], name='input_word') 39 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y') 40 | self.keep_prob = tf.placeholder(tf.float32) 41 | 42 | word_embedding = tf.get_variable('word_embedding',[self.vocab_size, self.word_embedding]) 43 | self.inputs = 
tf.nn.embedding_lookup(word_embedding, self.input_word) 44 | self.inputs = tf.reshape(self.inputs, [-1,self.len_sentence,self.word_embedding,1] ) 45 | 46 | 47 | #卷积层 48 | conv = layers.conv2d(inputs =self.inputs ,num_outputs = self.cnn_size ,kernel_size = [3,self.word_embedding],stride=[1,self.word_embedding],padding='SAME') 49 | 50 | # pooling层 51 | max_pool = layers.max_pool2d(conv, kernel_size=[self.len_sentence, 1], stride=[1, 1]) 52 | self.sentence = tf.reshape(max_pool, [-1, self.cnn_size]) 53 | 54 | # dropout层 55 | tanh = tf.nn.tanh(self.sentence) 56 | drop = layers.dropout(tanh, keep_prob=self.keep_prob) 57 | 58 | # 全连接层 59 | self.outputs = layers.fully_connected(inputs=drop, num_outputs=self.num_classes, activation_fn=tf.nn.softmax) 60 | 61 | # loss 62 | self.cross_loss = -tf.reduce_mean( tf.log(tf.reduce_sum( self.input_y * self.outputs ,axis=1))) 63 | self.l2_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(0.0001), 64 | weights_list=tf.trainable_variables()) 65 | self.final_loss = self.cross_loss + self.l2_loss 66 | 67 | # accuracy 68 | self.pred = tf.argmax(self.outputs, axis=1) 69 | self.pred_prob = tf.reduce_max(self.outputs, axis=1) 70 | 71 | self.y_label = tf.argmax(self.input_y, axis=1) 72 | self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.pred, self.y_label), 'float')) 73 | 74 | # minimize loss 75 | optimizer = tf.train.AdamOptimizer(self.lr) 76 | self.train_op = optimizer.minimize(self.final_loss) 77 | 78 | 79 | 80 | def train (save_path,cnn_size): 81 | 82 | print('reading training data') 83 | 84 | list_xtrain = np.load('data/list_xtrain.npy') 85 | list_xtest = np.load('data/list_xtest.npy') 86 | list_ytrain = np.load('data/list_ytrain.npy') 87 | list_ytest = np.load('data/list_ytest.npy') 88 | 89 | 90 | 91 | assert(len(list_ytrain) == len(list_xtrain) and len(list_ytest) == len(list_xtest)) 92 | 93 | 94 | list_xtrain = list(list_xtrain) 95 | list_xtest = list(list_xtest) 96 | list_ytrain = list(list_ytrain) 97 | list_ytest = list(list_ytest) 98 | list_xtrain += list_xtest 99 | list_ytrain += list_ytest 100 | assert (len(list_ytrain) == len(list_xtrain)) 101 | 102 | 103 | print (len(list_xtrain),len(list_ytrain)) 104 | 105 | settings = Settings() 106 | settings.num_classes = len(list_ytrain[0]) 107 | settings.num_steps = (len(list_xtrain) // settings.batch_size) +1 108 | settings.cnn_size = cnn_size 109 | 110 | with tf.Graph().as_default(): 111 | sess = tf.Session() 112 | with sess.as_default(): 113 | initializer = tf.contrib.layers.xavier_initializer() 114 | with tf.variable_scope("model", reuse=None, initializer=initializer): 115 | model = CNN(setting=settings) 116 | 117 | sess.run(tf.global_variables_initializer()) 118 | saver = tf.train.Saver() 119 | #saver.restore(sess, save_path=save_path) 120 | 121 | for epoch in range(1, settings.num_epochs + 1): 122 | 123 | bar = tqdm(range(settings.num_steps), desc='epoch {}, loss=0.000000, accuracy=0.000000'.format(epoch)) 124 | for _ in bar: 125 | 126 | sample_list = random.sample(range(len(list_ytrain)), settings.batch_size) 127 | batch_train_word = [list_xtrain[x] for x in sample_list] 128 | batch_train_y = [list_ytrain[x] for x in sample_list] 129 | 130 | feed_dict = {} 131 | feed_dict[model.input_word] = batch_train_word 132 | feed_dict[model.input_y] = batch_train_y 133 | feed_dict[model.keep_prob] = settings.keep_prob 134 | _,loss,accuracy=sess.run([model.train_op, model.final_loss, model.accuracy],feed_dict=feed_dict) 135 | bar.set_description('epoch {} loss={:.6f} 
accuracy={:.6f}'.format(epoch, loss, accuracy)) 136 | saver.save(sess, save_path=save_path) 137 | 138 | 139 | def test (save_path,cnn_size): 140 | 141 | result = []#[labels] 142 | 143 | list_xtest = np.load('data/list_xtest.npy') 144 | list_ytest = np.load('data/list_ytest.npy') 145 | assert (len(list_ytest) == len(list_xtest)) 146 | 147 | settings = Settings() 148 | settings.num_classes = len(list_ytest[0]) 149 | settings.num_steps = (len(list_xtest) // settings.batch_size) + 1 150 | settings.cnn_size = cnn_size 151 | 152 | with tf.Graph().as_default(): 153 | sess = tf.Session() 154 | with sess.as_default(): 155 | with tf.variable_scope("model"): 156 | model = CNN(setting=settings) 157 | 158 | saver = tf.train.Saver() 159 | saver.restore(sess, save_path=save_path) 160 | 161 | for i in tqdm(range(settings.num_steps + 1)): 162 | 163 | batch_test_word = list_xtest[settings.batch_size * i: settings.batch_size * (i + 1)] 164 | batch_test_y = list_ytest[settings.batch_size * i: settings.batch_size * (i + 1)] 165 | 166 | feed_dict = {} 167 | feed_dict[model.input_word] = batch_test_word 168 | feed_dict[model.input_y] = batch_test_y 169 | feed_dict[model.keep_prob] = 1 170 | pred = sess.run([model.pred],feed_dict=feed_dict) 171 | pred = list(pred[0]) 172 | result += pred 173 | return result 174 | 175 | 176 | 177 | 178 | def validation(save_path,cnn_size): 179 | 180 | result = []#[labels] 181 | 182 | list_xtest = np.load('data/list_x_validation.npy') 183 | dict_index = {0:'自动摘要', 1:'机器翻译',2:'人类作者',3:'机器作者'} 184 | #list_ytest = np.load('data/list_ytest.npy') 185 | #assert (len(list_ytest) == len(list_xtest)) 186 | 187 | settings = Settings() 188 | settings.num_classes = 4 189 | settings.num_steps = (len(list_xtest) // settings.batch_size) + 1 190 | settings.cnn_size = cnn_size 191 | 192 | with tf.Graph().as_default(): 193 | sess = tf.Session() 194 | with sess.as_default(): 195 | with tf.variable_scope("model"): 196 | model = CNN(setting=settings) 197 | 198 | saver = tf.train.Saver() 199 | saver.restore(sess, save_path=save_path) 200 | 201 | for i in tqdm(range(settings.num_steps + 1)): 202 | 203 | batch_test_word = list_xtest[settings.batch_size * i: settings.batch_size * (i + 1)] 204 | #batch_test_y = [0 for _ in range(settings.batch_size)] 205 | 206 | feed_dict = {} 207 | feed_dict[model.input_word] = batch_test_word 208 | #feed_dict[model.input_y] = batch_test_y 209 | feed_dict[model.keep_prob] = 1 210 | pred = sess.run([model.pred],feed_dict=feed_dict) 211 | pred = list(pred[0]) 212 | result += pred 213 | 214 | assert (len(result) == len(list_xtest)) 215 | validation_labels = [dict_index[x] for x in result ] 216 | return validation_labels 217 | 218 | 219 | 220 | 221 | cnn_size = 300 222 | print ('cnn_size:',cnn_size) 223 | print ('\n') 224 | save_path = 'model/cnnmodel_total_'+str(cnn_size)+'.ckpt' 225 | train(save_path,cnn_size) 226 | pred = test (save_path,cnn_size) 227 | 228 | np.save('data/cnn_total_pred_'+ str(cnn_size) +'.npy',pred) 229 | test_pred.test(save_path ='data/cnn_total_pred_'+ str(cnn_size) +'.npy') 230 | print ('\n') 231 | 232 | 233 | #validation_labels = validation('model/cnnmodel_total_450.ckpt',cnn_size) 234 | #np.save('data/validation_labels.npy',validation_labels) 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | -------------------------------------------------------------------------------- /rnn_cnn_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import 
random 4 | import tensorflow.contrib.layers as layers 5 | from tqdm import tqdm 6 | import time 7 | import pickle 8 | import test_pred 9 | 10 | 11 | class Settings(object): 12 | def __init__(self): 13 | self.vocab_size = 435615 14 | self.len_sentence = 350 15 | self.num_epochs = 1 16 | self.num_classes = 4 17 | self.cnn_size = 300 18 | self.num_layers = 1 19 | self.word_embedding = 50 20 | self.keep_prob = 0.5 21 | self.batch_size = 300 22 | self.num_steps = 10000 23 | self.lr= 0.001 24 | self.gru_size = 300 25 | 26 | 27 | class RNN_CNN(): 28 | def __init__(self, setting): 29 | self.vocab_size = setting.vocab_size 30 | self.len_sentence = len_sentence = setting.len_sentence 31 | self.num_epochs = setting.num_epochs 32 | self.num_classes = num_classes = setting.num_classes 33 | self.cnn_size = setting.cnn_size 34 | self.num_layers = setting.num_layers 35 | self.word_embedding = setting.word_embedding 36 | self.lr = setting.lr 37 | self.gru_size = setting.gru_size 38 | 39 | self.input_word = tf.placeholder(dtype=tf.int32, shape=[None, len_sentence], name='input_word') 40 | self.input_y = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='input_y') 41 | self.keep_prob = tf.placeholder(tf.float32) 42 | self.input_length = tf.placeholder(tf.int32, [None]) 43 | 44 | 45 | word_embedding = tf.get_variable('word_embedding',[self.vocab_size, self.word_embedding]) 46 | 47 | self.input_data = tf.nn.embedding_lookup(word_embedding, self.input_word) 48 | 49 | 50 | 51 | #双向GRU层 52 | gru_fw_cell = tf.nn.rnn_cell.GRUCell(self.gru_size) 53 | gru_bw_cell = tf.nn.rnn_cell.GRUCell(self.gru_size) 54 | 55 | (output_fw, output_bw), states = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell, 56 | cell_bw=gru_bw_cell, 57 | inputs=self.input_data, 58 | sequence_length=self.input_length, 59 | dtype=tf.float32, 60 | scope="BiGRU") 61 | 62 | self.rnn_outputs = tf.concat([output_fw, output_bw], axis=2) 63 | 64 | 65 | cnn_input = tf.reshape(self.rnn_outputs, [-1,self.len_sentence,self.gru_size*2,1] ) 66 | 67 | # 卷积层 68 | conv = layers.conv2d(inputs=cnn_input, num_outputs=self.cnn_size, kernel_size=[3, self.gru_size*2], 69 | stride=[1, self.gru_size*2], padding='SAME') 70 | 71 | # pooling层 72 | max_pool = layers.max_pool2d(conv, kernel_size=[self.len_sentence, 1], stride=[1, 1]) 73 | self.sentence = tf.reshape(max_pool, [-1, self.cnn_size]) 74 | 75 | # dropout层 76 | tanh = tf.nn.tanh(self.sentence) 77 | drop = layers.dropout(tanh, keep_prob=self.keep_prob) 78 | 79 | # 全连接层 80 | self.outputs = layers.fully_connected(inputs=drop, num_outputs=self.num_classes, activation_fn=tf.nn.softmax) 81 | 82 | # loss 83 | self.cross_loss = -tf.reduce_mean(tf.log(tf.reduce_sum(self.input_y * self.outputs, axis=1))) 84 | self.l2_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(0.0001), 85 | weights_list=tf.trainable_variables()) 86 | self.final_loss = self.cross_loss + self.l2_loss 87 | 88 | # accuracy 89 | self.pred = tf.argmax(self.outputs, axis=1) 90 | self.pred_prob = tf.reduce_max(self.outputs, axis=1) 91 | 92 | self.y_label = tf.argmax(self.input_y, axis=1) 93 | self.accuracy = tf.reduce_mean(tf.cast(tf.equal(self.pred, self.y_label), 'float')) 94 | 95 | # minimize loss 96 | optimizer = tf.train.AdamOptimizer(self.lr) 97 | self.train_op = optimizer.minimize(self.final_loss) 98 | 99 | def train (save_path,cnn_num,rnn_num,i): 100 | 101 | print('reading training data') 102 | 103 | list_xtrain = np.load('data/list_xtrain.npy') 104 | list_xtest = np.load('data/list_xtest.npy') 
105 | list_ytrain = np.load('data/list_ytrain.npy') 106 | list_ytest = np.load('data/list_ytest.npy') 107 | list_xtrain_len = np.load('data/list_xtrain_len.npy') 108 | list_xtest_len = np.load('data/list_xtest_len.npy') 109 | 110 | assert(len(list_ytrain) == len(list_xtrain) and len(list_xtrain) == len(list_xtrain_len) ) 111 | 112 | 113 | list_xtrain = list(list_xtrain) 114 | list_xtest = list(list_xtest) 115 | list_ytrain = list(list_ytrain) 116 | list_ytest = list(list_ytest) 117 | list_xtrain += list_xtest 118 | list_ytrain += list_ytest 119 | assert (len(list_ytrain) == len(list_xtrain)) 120 | 121 | 122 | print (len(list_xtrain),len(list_ytrain)) 123 | 124 | settings = Settings() 125 | settings.num_classes = len(list_ytrain[0]) 126 | settings.num_steps = (len(list_xtrain) // settings.batch_size) +1 127 | settings.cnn_size = cnn_num 128 | settings.gru_size = rnn_num 129 | 130 | with tf.Graph().as_default(): 131 | sess = tf.Session() 132 | with sess.as_default(): 133 | initializer = tf.contrib.layers.xavier_initializer() 134 | with tf.variable_scope("model", reuse=None, initializer=initializer): 135 | model = RNN_CNN(setting=settings) 136 | 137 | sess.run(tf.global_variables_initializer()) 138 | saver = tf.train.Saver() 139 | if i !=0: 140 | saver.restore(sess, save_path=save_path) 141 | 142 | for epoch in range(1, settings.num_epochs + 1): 143 | 144 | bar = tqdm(range(settings.num_steps), desc='epoch {}, loss=0.000000, accuracy=0.000000'.format(epoch)) 145 | for _ in bar: 146 | #for _ in range(settings.num_steps): 147 | 148 | sample_list = random.sample(range(len(list_ytrain)), settings.batch_size) 149 | batch_train_word = [list_xtrain[x] for x in sample_list] 150 | batch_train_y = [list_ytrain[x] for x in sample_list] 151 | batch_train_len = [list_xtrain_len[x] for x in sample_list] 152 | 153 | feed_dict = {} 154 | feed_dict[model.input_word] = batch_train_word 155 | feed_dict[model.input_y] = batch_train_y 156 | feed_dict[model.keep_prob] = settings.keep_prob 157 | feed_dict[model.input_length] = batch_train_len 158 | ''' 159 | output = sess.run(model.rnn_outputs,feed_dict=feed_dict) 160 | print (output.shape) 161 | ''' 162 | _,loss,accuracy=sess.run([model.train_op, model.final_loss, model.accuracy],feed_dict=feed_dict) 163 | bar.set_description('epoch {} loss={:.6f} accuracy={:.6f}'.format(epoch, loss, accuracy)) 164 | #break 165 | saver.save(sess, save_path=save_path) 166 | #break 167 | 168 | 169 | 170 | 171 | def test (save_path,cnn_num,rnn_num): 172 | 173 | result = []#[labels] 174 | list_xtest = np.load('data/list_xtest.npy') 175 | list_ytest = np.load('data/list_ytest.npy') 176 | list_xtest_len = np.load('data/list_xtest_len.npy') 177 | 178 | assert (len(list_ytest) == len(list_xtest)) 179 | 180 | settings = Settings() 181 | settings.num_classes = len(list_ytest[0]) 182 | settings.num_steps = (len(list_xtest) // settings.batch_size) + 1 183 | settings.cnn_size = cnn_num 184 | settings.gru_size = rnn_num 185 | 186 | with tf.Graph().as_default(): 187 | sess = tf.Session() 188 | with sess.as_default(): 189 | with tf.variable_scope("model"): 190 | model = RNN_CNN(setting=settings) 191 | 192 | saver = tf.train.Saver() 193 | 194 | saver.restore(sess, save_path=save_path) 195 | 196 | #for i in range(settings.num_steps + 1): 197 | for i in tqdm(range(settings.num_steps + 1)): 198 | 199 | batch_test_word = list_xtest[settings.batch_size * i: settings.batch_size * (i + 1)] 200 | batch_test_y = list_ytest[settings.batch_size * i: settings.batch_size * (i + 1)] 201 | batch_train_len = 
list_xtest_len[settings.batch_size * i: settings.batch_size * (i + 1)] 202 | 203 | feed_dict = {} 204 | feed_dict[model.input_word] = batch_test_word 205 | feed_dict[model.input_y] = batch_test_y 206 | feed_dict[model.keep_prob] = 1 207 | feed_dict[model.input_length] = batch_train_len 208 | 209 | pred = sess.run([model.pred],feed_dict=feed_dict) 210 | pred = list(pred[0]) 211 | result += pred 212 | return result 213 | 214 | 215 | def validation(save_path): 216 | 217 | result = []#[labels] 218 | 219 | list_xtest = np.load('data/list_x_validation.npy') 220 | list_xtest_len = np.load('data/list_x_validation_len.npy') 221 | dict_index = {0:'自动摘要', 1:'机器翻译',2:'人类作者',3:'机器作者'} 222 | #list_ytest = np.load('data/list_ytest.npy') 223 | assert (len(list_xtest) == len(list_xtest_len)) 224 | 225 | settings = Settings() 226 | settings.num_classes = 4 227 | settings.num_steps = (len(list_xtest) // settings.batch_size) + 1 228 | 229 | with tf.Graph().as_default(): 230 | sess = tf.Session() 231 | with sess.as_default(): 232 | with tf.variable_scope("model"): 233 | model = RNN_CNN(setting=settings) 234 | 235 | saver = tf.train.Saver() 236 | saver.restore(sess, save_path=save_path) 237 | 238 | for i in tqdm(range(settings.num_steps + 1)): 239 | 240 | batch_test_word = list_xtest[settings.batch_size * i: settings.batch_size * (i + 1)] 241 | batch_train_len = list_xtest_len[settings.batch_size * i: settings.batch_size * (i + 1)] 242 | #batch_test_y = [0 for _ in range(settings.batch_size)] 243 | 244 | feed_dict = {} 245 | feed_dict[model.input_word] = batch_test_word 246 | feed_dict[model.input_length] = batch_train_len 247 | #feed_dict[model.input_y] = batch_test_y 248 | feed_dict[model.keep_prob] = 1 249 | pred = sess.run([model.pred],feed_dict=feed_dict) 250 | pred = list(pred[0]) 251 | result += pred 252 | 253 | assert (len(result) == len(list_xtest)) 254 | validation_labels = [dict_index[x] for x in result ] 255 | return validation_labels 256 | 257 | rnn_num = 200 258 | cnn_num = 300 259 | 260 | for i in range (3): 261 | print('rnn_num,cnn_num,i:', rnn_num, cnn_num, i) 262 | save_path = 'model/origin_rnncnn_model_total_c'+str(cnn_num)+'_r'+str(rnn_num)+'.ckpt' 263 | train(save_path,cnn_num,rnn_num,i) 264 | pred = test (save_path,cnn_num,rnn_num) 265 | np.save('data/origin_rnncnn_pred.npy',pred) 266 | test_pred.test(save_path ='data/origin_rnncnn_pred.npy') 267 | print ('\n') 268 | 269 | 270 | #validation_labels = validation(save_path) 271 | #np.save('data/validation_labels.npy',validation_labels) --------------------------------------------------------------------------------
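
As a closing note, here is a hedged sketch of how a single new document could be classified with the trained RNN+CNN checkpoint. It is illustrative only and is not part of the repository: it assumes that `data/dict_word2id.pkl` and a checkpoint saved by `rnn_cnn_model.py` (e.g. `model/origin_rnncnn_model_total_c300_r200.ckpt`, with cnn_size=300 and gru_size=200 as in the training loop above) already exist, and that the module-level training code in `rnn_cnn_model.py` has been placed behind an `if __name__ == '__main__':` guard so that `RNN_CNN` and `Settings` can be imported without starting a training run.

predict_one.py (illustrative sketch, not part of the repository):
--------------------------------------------------------------------------------
# Illustrative only. Assumes rnn_cnn_model.py is importable without side effects
# (its training loop guarded by `if __name__ == '__main__':`) and that the
# artifacts referenced below were produced by init.py and rnn_cnn_model.py.
import pickle

import jieba
import numpy as np
import tensorflow as tf

from rnn_cnn_model import RNN_CNN, Settings

FIXLEN = 350  # fixed document length used by init.py and both models
CLASS_NAMES = {0: '自动摘要', 1: '机器翻译', 2: '人类作者', 3: '机器作者'}


def encode(text, word2id, fixlen=FIXLEN):
    # Same scheme as init.py: jieba segmentation, truncate to fixlen,
    # pad with BLANK, map out-of-vocabulary words to UNK.
    words = list(jieba.cut(text))[:fixlen]
    ids = [word2id['BLANK']] * fixlen
    for i, word in enumerate(words):
        ids[i] = word2id.get(word, word2id['UNK'])
    # init_lengths() counts up to and including the first BLANK position.
    length = min(len(words) + 1, fixlen)
    return ids, length


def predict(text, ckpt_path='model/origin_rnncnn_model_total_c300_r200.ckpt'):
    with open('data/dict_word2id.pkl', 'rb') as f:
        word2id = pickle.load(f)
    # The pickled dictionary is saved before UNK/BLANK are added, so add them
    # here in the same order as init.py.
    word2id.setdefault('UNK', len(word2id))
    word2id.setdefault('BLANK', len(word2id))

    ids, length = encode(text, word2id)

    settings = Settings()
    settings.cnn_size = 300   # must match the sizes used when training
    settings.gru_size = 200

    with tf.Graph().as_default():
        with tf.Session() as sess:
            with tf.variable_scope("model"):
                model = RNN_CNN(setting=settings)
            tf.train.Saver().restore(sess, ckpt_path)
            pred = sess.run(model.pred, feed_dict={
                model.input_word: np.array([ids]),
                model.input_length: np.array([length]),
                model.keep_prob: 1.0,
            })
    return CLASS_NAMES[int(pred[0])]


if __name__ == '__main__':
    print(predict('这里是一段待分类的新闻文本。'))
--------------------------------------------------------------------------------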