├── README.md
├── sw_kgtrain.txt.github
├── tf_rnn_char.py
└── tf_cnn_char.py

/README.md:
--------------------------------------------------------------------------------
Char-level text classification (CNN and RNN)
--------------------------------------------------------------------------------
/sw_kgtrain.txt.github:
--------------------------------------------------------------------------------
tongyong1	什么时间出库	买_下_什_么_时_候_可_以_发_货
tongyong1	订单状态解释	帮_我_查_询_这_个_订_单_NUMBER
tongyong1	other	那_还_写_如_需_定_制_请_联_系_客_服
--------------------------------------------------------------------------------
/tf_rnn_char.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import random, datetime

def build_vocab():
    code, vocab = int(0), {}
    vocab['UNKNOWN'] = code
    code += 1
    vocab[''] = code    # '' is the padding token
    code += 1
    for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
        items = line.strip().split('\t')
        if len(items) != 3:
            continue
        for word in items[2].split('_'):
            if word not in vocab:
                vocab[word] = code
                code += 1
    return vocab

def load_index():
    imap, c = {}, int(0)
    for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
        items = line.strip().split('\t')
        if items[1] not in imap:
            imap[items[1]] = c
            c += 1
    return imap

def encode_index(c, imap):
    index = imap[c]
    y = [int(0)] * len(imap)
    y[index] = int(1)
    return y

def encode_sent(vocab, sent, size):
    x = []
    words = sent.split('_')
    for i in range(0, size):
        if i < len(words):
            if words[i] in vocab:
                x.append(vocab[words[i]])
            else:
                x.append(vocab['UNKNOWN'])
        else:
            x.append(vocab[''])
    return x

def encode_mask(sent, size):
    mask = []
    words = sent.split('_')
    for i in range(0, size):
        if i < len(words):
            mask.append(1)
        else:
            mask.append(0)
    return mask

def load_data_val(testList, vocab, index, batch_size, sent_len, imap):
    xlist, ylist, mask_x, origxlist = [], [], [], []
    for i in range(0, batch_size):
        true_index = index + i
        if true_index >= len(testList):
            true_index = len(testList) - 1
        c, s = testList[true_index]
        xlist.append(encode_sent(vocab, s, sent_len))
        ylist.append(encode_index(c, imap))
        origxlist.append(s)
        mask_x.append(encode_mask(s, sent_len))
    return np.array(xlist, dtype='float32'), np.array(ylist, dtype='float32'), np.transpose(np.array(mask_x, dtype='float32')), origxlist

def load_train_list():
    tmap, tlist = {}, []
    for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
        items = line.strip().split('\t')
        if (len(items) == 2):
            items.append('')
        if items[1] not in tmap:
            tmap[items[1]] = []
        tmap[items[1]].append(items[2])
        tlist.append((items[1], items[2]))
    return tmap, tlist

def load_test_list():
    testList = []
    for line in open('/export/jw/kg/data/sw_kgval.txt'):
        items = line.strip().split('\t')
        if (len(items) == 2):
            items.append('')
        testList.append((items[1], items[2]))
    return testList

def load_train_data(train_list, vocab, batch_size, sent_len, imap):
    xlist, ylist, mask_x = [], [], []
    for i in range(0, batch_size):
        c, sent = train_list[random.randint(0, len(train_list) - 1)]
        xlist.append(encode_sent(vocab, sent, sent_len))
        ylist.append(encode_index(c, imap))
        mask_x.append(encode_mask(sent, sent_len))
    return np.array(xlist, dtype='float32'), np.array(ylist, dtype='float32'), np.transpose(np.array(mask_x, dtype='float32'))
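# Model overview (added note): a bidirectional GRU classifier over characters.
#   input_data [batch_size, num_step] char ids
#     -> embedding lookup            [batch_size, num_step, embed_dim]
#     -> static bidirectional GRU over num_step time steps
#     -> mask padded positions, mean-pool over time
#     -> softmax layer over num_classes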
class RNN_Model(object):
    def __init__(self, config, is_training=True):
        self.keep_prob = config.keep_prob
        self.batch_size = config.batch_size
        num_step = config.num_step

        self.input_data = tf.placeholder(tf.int32, [self.batch_size, num_step])
        self.target = tf.placeholder(tf.int64, [self.batch_size, config.num_classes])
        # the static RNN output is [n_step, batch_size, hidden_layer_size]; the mask uses the same layout
        self.mask_x = tf.placeholder(tf.float32, [num_step, self.batch_size])

        num_classes = config.num_classes
        hidden_neural_size = config.hidden_neural_size
        vocabulary_size = config.vocabulary_size
        embed_dim = config.embed_dim
        hidden_layer_num = config.hidden_layer_num

        #fw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=0.0,state_is_tuple=True)
        # forward GRU cell
        fw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
        if self.keep_prob < 1:
            fw_cell = tf.contrib.rnn.DropoutWrapper(
                fw_cell, output_keep_prob=self.keep_prob
            )
        self._initial_state = fw_cell.zero_state(self.batch_size, dtype=tf.float32)
        #bw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=0.0,state_is_tuple=True)
        # backward GRU cell
        bw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
        if self.keep_prob < 1:
            bw_cell = tf.contrib.rnn.DropoutWrapper(
                bw_cell, output_keep_prob=self.keep_prob
            )
        # initial state is all zeros
        self._initial_state = bw_cell.zero_state(self.batch_size, dtype=tf.float32)

        # embedding layer
        with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
            embedding = tf.get_variable("embedding", [vocabulary_size, embed_dim], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # apply dropout to the embedding output
        if self.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, self.keep_prob)

        """
        out_put=[]
        state=self._initial_state
        print state
        with tf.variable_scope("LSTM_layer"):
            for time_step in range(num_step):
                if time_step>0: tf.get_variable_scope().reuse_variables()
                (cell_output,state)=cell(inputs[:,time_step,:],state)
                out_put.append(cell_output)
        out_put = out_put * self.mask_x[:,:,None]
        """
        # initial state
        state = self._initial_state
        # [batch_size, n_steps, embedding_size] -> [n_steps, batch_size, embedding_size]
        inputs = tf.transpose(inputs, [1, 0, 2])
        # [n_steps, batch_size, embedding_size] -> [n_steps * batch_size, embedding_size]
        inputs = tf.reshape(inputs, [-1, embed_dim])
        # split into a list of n_steps tensors, each [batch_size, embedding_size]
        inputs = tf.split(inputs, num_step)
        out_put, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, inputs, initial_state_fw=state, initial_state_bw=state)
        # zero out the hidden vectors produced for the padded positions
        out_put = out_put * self.mask_x[:, :, None]

        # average the hidden vectors of each sentence (padded positions excluded via the mask)
        with tf.name_scope("mean_pooling_layer"):
            out_put = tf.reduce_sum(out_put, 0) / (tf.reduce_sum(self.mask_x, 0)[:, None])
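        # static_bidirectional_rnn concatenates the forward and backward hidden states,
        # so the pooled sentence vector has 2 * hidden_neural_size dimensions; the softmax
        # weights below are sized accordingly.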
        with tf.name_scope("Softmax_layer_and_output"):
            softmax_w = tf.get_variable("softmax_w", [hidden_neural_size * 2, num_classes], dtype=tf.float32)
            softmax_b = tf.get_variable("softmax_b", [num_classes], dtype=tf.float32)
            self.logits = tf.matmul(out_put, softmax_w) + softmax_b

        with tf.name_scope("loss"):
            #self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits+1e-10, labels=self.target)
            self.loss = tf.losses.softmax_cross_entropy(self.target, self.logits)
            self.cost = tf.reduce_mean(self.loss)

        with tf.name_scope("accuracy"):
            correct = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.target, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")

        with tf.name_scope("output"):
            self.orig_y = tf.argmax(self.target, 1)
            self.pred_y = tf.argmax(self.logits, 1)

tf.flags.DEFINE_integer('evaluate_every', 1000, 'evaluate every')
tf.flags.DEFINE_integer('batch_size', 128, 'the batch_size of the training procedure')
tf.flags.DEFINE_float('lr', 0.1, 'the learning rate')
tf.flags.DEFINE_float('lr_decay', 0.6, 'the learning rate decay')
tf.flags.DEFINE_integer('embedding_dim', 100, 'embedding dim')
tf.flags.DEFINE_integer('hidden_neural_size', 100, 'LSTM hidden neural size')
tf.flags.DEFINE_integer('hidden_layer_num', 1, 'LSTM hidden layer num')
tf.flags.DEFINE_integer('max_len', 100, 'max_len of training sentence')
tf.flags.DEFINE_float('init_scale', 0.1, 'init scale')
tf.flags.DEFINE_float('keep_prob', 0.5, 'dropout keep probability')
tf.flags.DEFINE_integer('num_epoch', 100000, 'num epoch')
tf.flags.DEFINE_integer('max_grad_norm', 5, 'max_grad_norm')
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

vocab = build_vocab()
train_map, train_list = load_train_list()
test_list = load_test_list()
imap = load_index()
x, y, mask_x = load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len, imap)

class Config(object):
    hidden_neural_size = FLAGS.hidden_neural_size
    vocabulary_size = len(vocab)
    embed_dim = FLAGS.embedding_dim
    hidden_layer_num = FLAGS.hidden_layer_num
    keep_prob = FLAGS.keep_prob
    lr = FLAGS.lr
    lr_decay = FLAGS.lr_decay
    batch_size = FLAGS.batch_size
    num_step = FLAGS.max_len
    max_grad_norm = FLAGS.max_grad_norm
    num_epoch = FLAGS.num_epoch
    num_classes = len(imap)

config = Config()
eval_config = Config()
eval_config.keep_prob = 1.0

with tf.Graph().as_default():
    with tf.device('/gpu:0'):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            initializer = tf.random_uniform_initializer(-1 * FLAGS.init_scale, 1 * FLAGS.init_scale)
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                model = RNN_Model(config=config, is_training=True)
            with tf.variable_scope("model", reuse=True, initializer=initializer):
                dev_model = RNN_Model(config=eval_config, is_training=False)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.RMSPropOptimizer(0.005)
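            # compute_gradients + apply_gradients below is equivalent to optimizer.minimize(model.loss),
            # but keeps the per-variable gradients exposed (e.g. if clipping with max_grad_norm were added).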
            grads_and_vars = optimizer.compute_gradients(model.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            def train_step(model, x, y, mask_x):
                fetches = [model.cost, model.accuracy, global_step, train_op]
                feed_dict = {
                    model.input_data: x,
                    model.target: y,
                    model.mask_x: mask_x
                }
                #state = sess.run(model._initial_state)
                #print state
                #print model._initial_state
                #for i, (c, h) in enumerate(model._initial_state):
                #    feed_dict[c] = state.c
                #    feed_dict[h] = state.h
                cost, accuracy, step, _ = sess.run(fetches, feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, cost, accuracy))

            def dev_step(model, testList, vocab, batch_size, sent_len, imap):
                index, output_list, origy_list, origx_list = int(0), [], [], []
                while True:
                    x, y, mask_x, origx = load_data_val(testList, vocab, index, batch_size, sent_len, imap)
                    feed_dict = {model.input_data: x, model.target: y, model.mask_x: mask_x}
                    origy, output = sess.run([model.orig_y, model.pred_y], feed_dict)
                    for c in output:
                        output_list.append(c)
                    for c in origy:
                        origy_list.append(c)
                    for c in origx:
                        origx_list.append(c)
                    index += batch_size
                    if index >= len(testList):
                        break
                fp = open('/export/jw/kg/cnn.output', 'w+')
                i2nmap = {}
                for name, index in imap.items():
                    i2nmap[index] = name
                for i in range(0, len(output_list)):
                    fp.write(i2nmap[int(output_list[i])] + '\t' + i2nmap[origy_list[i]] + '\t' + origx_list[i] + '\n')
                fp.close()

            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            for i in range(config.num_epoch):
                x, y, mask_x = load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len, imap)
                train_step(model, x, y, mask_x)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    dev_step(dev_model, test_list, vocab, FLAGS.batch_size, FLAGS.max_len, imap)

--------------------------------------------------------------------------------
/tf_cnn_char.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-
import tensorflow as tf
import numpy as np
import random, time, os, datetime

#########################################################################
# Single-layer CNN text classification model
#########################################################################

# The input is a fixed-length sequence: longer sentences are truncated, shorter ones are padded.
# Build the character vocabulary.
def build_vocab():
    code, vocab = int(0), {}
    vocab['UNKNOWN'] = code
    code += 1
    vocab[''] = code    # '' is the padding token
    code += 1
    for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
        items = line.strip().split('\t')
        if len(items) != 3:
            continue
        for word in items[2].split('_'):
            if word not in vocab:
                vocab[word] = code
                code += 1
    return vocab

# Map class names to ids.
def load_index():
    imap, c = {}, int(0)
    for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
        items = line.strip().split('\t')
        if items[1] not in imap:
            imap[items[1]] = c
            c += 1
    return imap

# Convert the class label into one-hot form.
def encode_index(c, imap):
    index = imap[c]
    y = [int(0)] * len(imap)
    y[index] = int(1)
    return y

# Note: pay attention to the initialization of UNKNOWN.
# Encode a sentence as a sequence of character ids.
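# For illustration (actual ids depend on the training data): if vocab maps '什' -> 2 and '么' -> 3,
# then encode_sent(vocab, '什_么', 6) returns [2, 3, 1, 1, 1, 1], where 1 is the id of the ''
# padding token and characters not seen in training map to vocab['UNKNOWN'] (id 0).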
def encode_sent(vocab, sent, size):
    x = []
    words = sent.split('_')
    for i in range(0, size):
        if i < len(words):
            if words[i] in vocab:
                x.append(vocab[words[i]])
            else:
                x.append(vocab['UNKNOWN'])
        else:
            x.append(vocab[''])
    return x

# Load validation data; the validation file has the same format as the training file.
def load_data_val(testList, vocab, index, batch_size, sent_len, imap):
    xlist, ylist, origxlist = [], [], []
    for i in range(0, batch_size):
        true_index = index + i
        if true_index >= len(testList):
            true_index = len(testList) - 1
        c, s = testList[true_index]
        xlist.append(encode_sent(vocab, s, sent_len))
        ylist.append(encode_index(c, imap))
        origxlist.append(s)
    return np.array(xlist, dtype='float32'), np.array(ylist, dtype='float32'), origxlist

def load_train_list():
    tmap, tlist = {}, []
    for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
        items = line.strip().split('\t')
        if (len(items) == 2):
            items.append('')
        if items[1] not in tmap:
            tmap[items[1]] = []
        tmap[items[1]].append(items[2])
        tlist.append((items[1], items[2]))
    return tmap, tlist

def load_data(train_list, vocab, batch_size, sent_len, imap):
    xlist, ylist = [], []
    for i in range(0, batch_size):
        c, sent = train_list[random.randint(0, len(train_list) - 1)]
        xlist.append(encode_sent(vocab, sent, sent_len))
        ylist.append(encode_index(c, imap))
    return np.array(xlist, dtype='float32'), np.array(ylist, dtype='float32')

class CNN(object):
    def __init__(
        self, sequence_length, batch_size,
        vocab_size, embedding_size,
        filter_sizes, num_filters, num_classes, l2_reg_lambda=0.0):

        # User question; character vectors come from embedding_lookup.
        self.x_batch = tf.placeholder(tf.int32, [batch_size, sequence_length], name="x_batch")
        self.y_batch = tf.placeholder(tf.int32, [batch_size, num_classes], name='y_batch')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        print("xlist", self.x_batch)

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            We = tf.Variable(
                tf.truncated_normal([vocab_size, embedding_size], stddev=0.1),
                name="W")
            chars = tf.nn.embedding_lookup(We, self.x_batch)
            self.embedded_chars = chars
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.01), name="W-%s" % filter_size)
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b-%s" % filter_size)
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="conv"
                )
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool"
                )
                pooled_outputs.append(pooled)
        num_filters_total = num_filters * len(filter_sizes)
        pooled_reshape = tf.reshape(tf.concat(pooled_outputs, 3), [-1, num_filters_total])
        # dropout
        h_drop = tf.nn.dropout(pooled_reshape, self.dropout_keep_prob)
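        # Each filter size contributes num_filters max-pooled features, so the concatenated
        # feature vector has num_filters * len(filter_sizes) dimensions (640 with the default
        # flags below: 128 filters x 5 filter sizes); the fully connected layer below maps it
        # to num_classes scores.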
        Wfc = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), name='Wfc')
        bfc = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='bfc')
        h_output = tf.nn.xw_plus_b(h_drop, Wfc, bfc, name='scores')
        print('h_output', h_output)

        with tf.name_scope("output"):
            self.orig_y = tf.argmax(self.y_batch, 1)
            self.pred_y = tf.argmax(h_output, 1)

        with tf.name_scope("loss"):
            #cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=h_output, labels=self.y_batch)
            #print('batch_loss', cross_entropy)
            #self.loss = tf.reduce_mean(cross_entropy)
            #print('loss ', self.loss)
            self.loss = tf.losses.softmax_cross_entropy(self.y_batch, h_output)

        # Accuracy
        with tf.name_scope("accuracy"):
            correct = tf.equal(tf.argmax(h_output, 1), tf.argmax(self.y_batch, 1))
            print('correct', correct)
            self.accuracy = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")

# Parameters
# ==================================================
# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "1,2,3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 512, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 500, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 3000, "Save model after this many steps (default: 100)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
sent_len = int(100)

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================
# Load data
print("Loading data...")

def train_step(x_batch, y_batch):
    feed_dict = {
        cnn.x_batch: x_batch,
        cnn.y_batch: y_batch,
        cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
    }
    _, step, summaries, loss, accuracy = sess.run(
        [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], feed_dict)
    time_str = datetime.datetime.now().isoformat()
    print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
    train_summary_writer.add_summary(summaries, step)
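# Evaluation helper: streams the validation set in fixed-size batches (the last batch is
# padded by repeating the final example), collects predicted and true class ids, and writes
# "predicted<TAB>true<TAB>sentence" lines to /export/jw/kg/cnn.output.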
def dev_step(testList, vocab, batch_size, sent_len, imap):
    index, output_list, origy_list, origx_list = int(0), [], [], []
    while True:
        x_batch, y_batch, origx = load_data_val(testList, vocab, index, batch_size, sent_len, imap)
        feed_dict = {cnn.x_batch: x_batch, cnn.y_batch: y_batch, cnn.dropout_keep_prob: 1.0}
        origy, output = sess.run([cnn.orig_y, cnn.pred_y], feed_dict)
        for c in output:
            output_list.append(c)
        for c in origy:
            origy_list.append(c)
        for c in origx:
            origx_list.append(c)
        index += batch_size
        if index >= len(testList):
            break
    fp = open('/export/jw/kg/cnn.output', 'w+')
    i2nmap = {}
    for name, index in imap.items():
        i2nmap[index] = name
    for i in range(0, len(output_list)):
        fp.write(i2nmap[int(output_list[i])] + '\t' + i2nmap[origy_list[i]] + '\t' + origx_list[i] + '\n')
    fp.close()
    print('write done ......')

def load_test_list():
    testList = []
    for line in open('/export/jw/kg/data/sw_kgval.txt'):
        items = line.strip().split('\t')
        if (len(items) == 2):
            items.append('')
        testList.append((items[1], items[2]))
    return testList

vocab = build_vocab()
train_map, train_list = load_train_list()
test_list = load_test_list()
imap = load_index()
xlist, ylist = load_data(train_list, vocab, FLAGS.batch_size, sent_len, imap)
num_classes = ylist.shape[1]
print("Load done...")

# Training
# ==================================================

with tf.Graph().as_default():
    with tf.device("/gpu:0"):
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = CNN(
                sequence_length=sent_len,
                batch_size=FLAGS.batch_size,
                vocab_size=len(vocab),
                embedding_size=FLAGS.embedding_dim,
                filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                num_filters=FLAGS.num_filters,
                num_classes=num_classes,
                l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.RMSPropOptimizer(0.0005)
            #optimizer = tf.train.AdamOptimizer(0.0001)
            #optimizer = tf.train.GradientDescentOptimizer(1e-2)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            #train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def)
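            # Note: dev_summary_op below is built but dev_step() never runs it; evaluation
            # results are written to /export/jw/kg/cnn.output instead.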
            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def)

            # Checkpoint directory. TensorFlow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables())

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Generate batches
            # Training loop. For each batch...
            for i in range(FLAGS.num_epochs):
                try:
                    x_batch, y_batch = load_data(train_list, vocab, FLAGS.batch_size, sent_len, imap)
                    train_step(x_batch, y_batch)
                    current_step = tf.train.global_step(sess, global_step)
                    if current_step % FLAGS.evaluate_every == 0:
                        print("\nEvaluation:")
                        dev_step(test_list, vocab, FLAGS.batch_size, sent_len, imap)
                        print("")
                    if current_step % FLAGS.checkpoint_every == 0:
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
                except Exception as e:
                    print(e)

--------------------------------------------------------------------------------