├── README.md
├── sw_kgtrain.txt.github
├── tf_rnn_char.py
└── tf_cnn_char.py
/README.md:
--------------------------------------------------------------------------------
1 | Char-level text classification (CNN and RNN)
2 |
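3 | Two TensorFlow 1.x scripts trained on the same character-level data:
4 | `tf_rnn_char.py` (bidirectional GRU with masked mean pooling) and
5 | `tf_cnn_char.py` (single-layer CNN with several filter widths and max pooling).
6 | Both read tab-separated lines of the form `corpus_tag<TAB>label<TAB>characters joined by '_'`
7 | from hard-coded paths (`/export/jw/kg/data/sw_kgtrain.txt` for training,
8 | `/export/jw/kg/data/sw_kgval.txt` for validation); `sw_kgtrain.txt.github` is a small sample.
9 | 
10 | The sketch below (hypothetical names, mirroring `build_vocab`/`encode_sent` in the scripts)
11 | shows how one such line becomes a fixed-length id sequence; it is an illustration only,
12 | not part of the training code:
13 | 
14 | ```python
15 | line = "tongyong1\t什么时间出库\t买_下_什_么_时_候_可_以_发_货"
16 | corpus, label, sent = line.strip().split("\t")
17 | 
18 | vocab = {"UNKNOWN": 0, "": 1}          # the two special entries used by build_vocab()
19 | for ch in sent.split("_"):
20 |     vocab.setdefault(ch, len(vocab))   # assign the next free id to each unseen character
21 | 
22 | SENT_LEN = 100                         # max_len / sent_len in the scripts
23 | chars = sent.split("_")
24 | ids = [vocab.get(chars[i], vocab["UNKNOWN"]) if i < len(chars) else vocab[""]
25 |        for i in range(SENT_LEN)]       # truncate or pad to the fixed length
26 | print(label, ids[:12])
27 | ```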
--------------------------------------------------------------------------------
/sw_kgtrain.txt.github:
--------------------------------------------------------------------------------
1 | tongyong1 什么时间出库 买_下_什_么_时_候_可_以_发_货
2 | tongyong1 订单状态解释 帮_我_查_询_这_个_订_单_NUMBER
3 | tongyong1 other 那_还_写_如_需_定_制_请_联_系_客_服
4 |
--------------------------------------------------------------------------------
/tf_rnn_char.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import random, datetime
4 |
5 | def build_vocab():
6 | code, vocab = int(0), {}
7 | vocab['UNKNOWN'] = code
8 | code += 1
9 | vocab[''] = code
10 | code += 1
11 | for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
12 | items = line.strip().split('\t')
13 | if len(items) != 3:
14 | continue
15 | for word in items[2].split('_'):
16 | if word not in vocab:
17 | vocab[word] = code
18 | code += 1
19 | return vocab
20 |
21 | def load_index():
22 | imap, c = {}, int(0)
23 | for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
24 | items = line.strip().split('\t')
25 | if items[1] not in imap:
26 | imap[items[1]] = c
27 | c += 1
28 | return imap
29 |
30 | def encode_index(c, imap):
31 | index = imap[c]
32 | y = [int(0)] * len(imap)
33 | y[index] = int(1)
34 | return y
35 |
36 | def encode_sent(vocab, sent, size):
37 | x = []
38 | words = sent.split('_')
39 | for i in range(0, size):
40 | if i < len(words):
41 | if words[i] in vocab:
42 | x.append(vocab[words[i]])
43 | else:
44 | x.append(vocab['UNKNOWN'])
45 | else:
46 | x.append(vocab[''])
47 | return x
48 |
49 | def encode_mask(sent, size):
50 | mask = []
51 | words = sent.split('_')
52 | for i in range(0, size):
53 | if i < len(words):
54 | mask.append(1)
55 | else:
56 | mask.append(0)
57 | return mask
58 |
59 | def load_data_val(testList, vocab, index, batch_size, sent_len, imap):
60 | xlist, ylist, mask_x, origxlist = [], [], [], []
61 | for i in range(0, batch_size):
62 | true_index = index + i
63 | if true_index >= len(testList):
64 | true_index = len(testList) - 1
65 | c, s = testList[true_index]
66 | xlist.append(encode_sent(vocab, s, sent_len))
67 | ylist.append(encode_index(c, imap))
68 | origxlist.append(s)
69 | mask_x.append(encode_mask(s, sent_len))
70 | return np.array(xlist, dtype='float32'), np.array(ylist, dtype='float32'), np.transpose(np.array(mask_x, dtype='float32')), origxlist
71 |
72 | def load_train_list():
73 | tmap, tlist = {}, []
74 | for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
75 | items = line.strip().split('\t')
76 | if (len(items) == 2):
77 | items.append('')
78 | if items[1] not in tmap:
79 | tmap[items[1]] = []
80 | tmap[items[1]].append(items[2])
81 | tlist.append((items[1], items[2]))
82 | return tmap, tlist
83 |
84 | def load_test_list():
85 | testList = []
86 | for line in open('/export/jw/kg/data/sw_kgval.txt'):
87 | items = line.strip().split('\t')
88 | if (len(items) == 2):
89 | items.append('')
90 | testList.append((items[1], items[2]))
91 | return testList
92 |
93 | def load_train_data(train_list, vocab, batch_size, sent_len, imap):
94 | xlist, ylist, mask_x = [], [], []
95 | for i in range(0, batch_size):
96 | c, sent = train_list[random.randint(0, len(train_list) - 1)]
97 | xlist.append(encode_sent(vocab, sent, sent_len))
98 | ylist.append(encode_index(c, imap))
99 | mask_x.append(encode_mask(sent, sent_len))
100 | return np.array(xlist, dtype='float32'), np.array(ylist, dtype='float32'), np.transpose(np.array(mask_x, dtype='float32'))
101 |
102 | class RNN_Model(object):
103 | def __init__(self,config,is_training=True):
104 | self.keep_prob=config.keep_prob
105 | self.batch_size=config.batch_size
106 | num_step=config.num_step
107 |
108 | self.input_data = tf.placeholder(tf.int32, [self.batch_size, num_step])
109 | self.target = tf.placeholder(tf.int64, [self.batch_size, config.num_classes])
110 | # The RNN output is [num_step, batch_size, hidden_size]; the mask uses the same layout
111 | self.mask_x = tf.placeholder(tf.float32, [num_step, self.batch_size])
112 |
113 | num_classes=config.num_classes
114 | hidden_neural_size=config.hidden_neural_size
115 | vocabulary_size=config.vocabulary_size
116 | embed_dim=config.embed_dim
117 | hidden_layer_num=config.hidden_layer_num
118 |
119 | #fw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=0.0,state_is_tuple=True)
120 | # Forward GRU cell
121 | fw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
122 | if self.keep_prob<1:
123 | fw_cell = tf.contrib.rnn.DropoutWrapper(
124 | fw_cell,output_keep_prob=self.keep_prob
125 | )
126 | self._initial_state = fw_cell.zero_state(self.batch_size,dtype=tf.float32)
127 | #bw_cell = tf.contrib.rnn.BasicLSTMCell(hidden_neural_size,forget_bias=0.0,state_is_tuple=True)
128 | # Backward GRU cell
129 | bw_cell = tf.contrib.rnn.GRUCell(num_units=hidden_neural_size, activation=tf.nn.relu)
130 | if self.keep_prob<1:
131 | bw_cell = tf.contrib.rnn.DropoutWrapper(
132 | bw_cell,output_keep_prob=self.keep_prob
133 | )
134 | # Zero initial state (the same zero state is passed to both directions below)
135 | self._initial_state = bw_cell.zero_state(self.batch_size,dtype=tf.float32)
136 |
137 | #embedding layer
138 | with tf.device("/cpu:0"),tf.name_scope("embedding_layer"):
139 | embedding = tf.get_variable("embedding",[vocabulary_size,embed_dim],dtype=tf.float32)
140 | inputs=tf.nn.embedding_lookup(embedding,self.input_data)
141 |
142 | # Apply dropout to the embedding output
143 | if self.keep_prob<1:
144 | inputs = tf.nn.dropout(inputs,self.keep_prob)
145 |
146 | """
147 | out_put=[]
148 | state=self._initial_state
149 | print state
150 | with tf.variable_scope("LSTM_layer"):
151 | for time_step in range(num_step):
152 | if time_step>0: tf.get_variable_scope().reuse_variables()
153 | (cell_output,state)=cell(inputs[:,time_step,:],state)
154 | out_put.append(cell_output)
155 | out_put = out_put * self.mask_x[:,:,None]
156 | """
157 | # Initial state
158 | state = self._initial_state
159 | #[batch_size, n_steps, embedding_size] -> [n_steps, batch_size, embedding_size]
160 | inputs = tf.transpose(inputs, [1, 0, 2])
161 | #[n_steps, batch_size, embedding_size] -> [n_steps * batch_size, embedding_size]
162 | inputs = tf.reshape(inputs, [-1, embed_dim])
163 | # Split into a list of num_step tensors, each [batch_size, embedding_size]
164 | inputs = tf.split(inputs, num_step)
165 | out_put, _, _ = tf.contrib.rnn.static_bidirectional_rnn(fw_cell, bw_cell, inputs, initial_state_fw=state, initial_state_bw=state)
166 | # Zero out the hidden vectors at padded positions
167 | out_put = out_put * self.mask_x[:, :, None]
168 |
169 | # Mean of the hidden vectors over the valid (unmasked) steps of each sentence
170 | with tf.name_scope("mean_pooling_layer"):
171 | out_put = tf.reduce_sum(out_put,0) / (tf.reduce_sum(self.mask_x,0)[:,None])
172 |
173 | with tf.name_scope("Softmax_layer_and_output"):
174 | softmax_w = tf.get_variable("softmax_w",[hidden_neural_size * 2, num_classes],dtype=tf.float32)
175 | softmax_b = tf.get_variable("softmax_b",[num_classes],dtype=tf.float32)
176 | self.logits = tf.matmul(out_put, softmax_w) + softmax_b
177 |
178 | with tf.name_scope("loss"):
179 | #self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits+1e-10, labels=self.target)
180 | self.loss = tf.losses.softmax_cross_entropy(self.target, self.logits)
181 | self.cost = tf.reduce_mean(self.loss)
182 |
183 | with tf.name_scope("accuracy"):
184 | correct = tf.equal(tf.argmax(self.logits, 1), tf.argmax(self.target, 1))
185 | self.accuracy = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")
186 |
187 | with tf.name_scope("output"):
188 | self.orig_y = tf.argmax(self.target, 1)
189 | self.pred_y = tf.argmax(self.logits, 1)
190 |
191 | tf.flags.DEFINE_integer('evaluate_every',1000,'evaluate every')
192 | tf.flags.DEFINE_integer('batch_size',128,'the batch_size of the training procedure')
193 | tf.flags.DEFINE_float('lr',0.1,'the learning rate')
194 | tf.flags.DEFINE_float('lr_decay',0.6,'the learning rate decay')
195 | tf.flags.DEFINE_integer('embedding_dim',100,'embedding dim')
196 | tf.flags.DEFINE_integer('hidden_neural_size',100,'LSTM hidden neural size')
197 | tf.flags.DEFINE_integer('hidden_layer_num',1,'LSTM hidden layer num')
198 | tf.flags.DEFINE_integer('max_len',100,'max_len of training sentence')
199 | tf.flags.DEFINE_float('init_scale',0.1,'init scale')
200 | tf.flags.DEFINE_float('keep_prob',0.5,'dropout rate')
201 | tf.flags.DEFINE_integer('num_epoch',100000,'num epoch')
202 | tf.flags.DEFINE_integer('max_grad_norm',5,'max_grad_norm')
203 | # Misc Parameters
204 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
205 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
206 | FLAGS = tf.flags.FLAGS
207 | FLAGS._parse_flags()
208 |
209 | vocab = build_vocab()
210 | train_map, train_list = load_train_list()
211 | test_list = load_test_list()
212 | imap = load_index()
213 | x, y, mask_x = load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len, imap)
214 |
215 | class Config(object):
216 | hidden_neural_size=FLAGS.hidden_neural_size
217 | vocabulary_size=len(vocab)
218 | embed_dim=FLAGS.embedding_dim
219 | hidden_layer_num=FLAGS.hidden_layer_num
220 | keep_prob=FLAGS.keep_prob
221 | lr = FLAGS.lr
222 | lr_decay = FLAGS.lr_decay
223 | batch_size = FLAGS.batch_size
224 | num_step = FLAGS.max_len
225 | max_grad_norm=FLAGS.max_grad_norm
226 | num_epoch = FLAGS.num_epoch
227 | num_classes = len(imap)
228 |
229 | config = Config()
230 | eval_config=Config()
231 | eval_config.keep_prob=1.0
232 |
233 | with tf.Graph().as_default():
234 | with tf.device('/gpu:0'):
235 | session_conf = tf.ConfigProto(
236 | allow_soft_placement=FLAGS.allow_soft_placement,
237 | log_device_placement=FLAGS.log_device_placement)
238 | sess = tf.Session(config=session_conf)
239 | with sess.as_default():
240 | initializer = tf.random_uniform_initializer(-1*FLAGS.init_scale,1*FLAGS.init_scale)
241 | with tf.variable_scope("model",reuse=None,initializer=initializer):
242 | model = RNN_Model(config=config,is_training=True)
243 | with tf.variable_scope("model",reuse=True,initializer=initializer):
244 | dev_model = RNN_Model(config=eval_config,is_training=False)
245 |
246 | # Define Training procedure
247 | global_step = tf.Variable(0, name="global_step", trainable=False)
248 | optimizer = tf.train.RMSPropOptimizer(0.005)
249 | grads_and_vars = optimizer.compute_gradients(model.loss)
250 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
251 |
252 | def train_step(model, x, y, mask_x):
253 | fetches = [model.cost, model.accuracy, global_step, train_op]
254 | feed_dict = {
255 | model.input_data : x,
256 | model.target : y,
257 | model.mask_x : mask_x
258 | }
259 | #state = sess.run(model._initial_state)
260 | #print state
261 | #print model._initial_state
262 | #for i , (c,h) in enumerate(model._initial_state):
263 | #feed_dict[c]=state.c
264 | #feed_dict[h]=state.h
265 | cost, accuracy, step, _ = sess.run(fetches, feed_dict)
266 | time_str = datetime.datetime.now().isoformat()
267 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, cost, accuracy))
268 |
269 | def dev_step(model, testList, vocab, batch_size, sent_len, imap):
270 | index, output_list, origy_list, origx_list = int(0), [], [], []
271 | while True:
272 | x, y, mask_x, origx = load_data_val(testList, vocab, index, batch_size, sent_len, imap)
273 | feed_dict = {model.input_data : x, model.target : y, model.mask_x: mask_x}
274 | origy, output = sess.run([model.orig_y, model.pred_y], feed_dict)
275 | for c in output:
276 | output_list.append(c)
277 | for c in origy:
278 | origy_list.append(c)
279 | for c in origx:
280 | origx_list.append(c)
281 | index += batch_size
282 | if index >= len(testList):
283 | break
284 | fp = open('/export/jw/kg/cnn.output', 'w+')
285 | i2nmap = {}
286 | for name, index in imap.items():
287 | i2nmap[index] = name
288 | for i in range(0, len(output_list)):
289 | fp.write(i2nmap[int(output_list[i])] + '\t' + i2nmap[origy_list[i]] + '\t' + origx_list[i] + '\n')
290 | fp.close()
291 |
292 | # Initialize all variables
293 | sess.run(tf.global_variables_initializer())
294 | for i in range(config.num_epoch):
295 | x, y, mask_x = load_train_data(train_list, vocab, FLAGS.batch_size, FLAGS.max_len, imap)
296 | train_step(model, x, y, mask_x)
297 | current_step = tf.train.global_step(sess, global_step)
298 | if current_step % FLAGS.evaluate_every == 0:
299 | dev_step(dev_model, test_list, vocab, FLAGS.batch_size, FLAGS.max_len, imap)
300 |
301 |
--------------------------------------------------------------------------------
/tf_cnn_char.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import tensorflow as tf
3 | import numpy as np
4 | import random, time, os, datetime
5 |
6 | #########################################################################
7 | # Single-layer CNN model for text classification
8 | #########################################################################
9 |
10 | # Inputs are fixed-length sequences: longer ones are truncated, shorter ones are padded
11 | # Build the character vocabulary
12 | def build_vocab():
13 | code, vocab = int(0), {}
14 | vocab['UNKNOWN'] = code
15 | code += 1
16 | vocab[''] = code
17 | code += 1
18 | for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
19 | items = line.strip().split('\t')
20 | if len(items) != 3:
21 | continue
22 | for word in items[2].split('_'):
23 | if word not in vocab:
24 | vocab[word] = code
25 | code += 1
26 | return vocab
27 |
28 | # Map class names to ids
29 | def load_index():
30 | imap, c = {}, int(0)
31 | for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
32 | items = line.strip().split('\t')
33 | if items[1] not in imap:
34 | imap[items[1]] = c
35 | c += 1
36 | return imap
37 |
38 | # Convert a class label to one-hot form
39 | def encode_index(c, imap):
40 | index = imap[c]
41 | y = [int(0)] * len(imap)
42 | y[index] = int(1)
43 | return y
44 |
45 | # Note: pay attention to how UNKNOWN is initialized
46 | # Encode a sentence as a fixed-length sequence of character ids
47 | def encode_sent(vocab, sent, size):
48 | x = []
49 | words = sent.split('_')
50 | for i in range(0, size):
51 | if i < len(words):
52 | if words[i] in vocab:
53 | x.append(vocab[words[i]])
54 | else:
55 | x.append(vocab['UNKNOWN'])
56 | else:
57 | x.append(vocab[''])
58 | return x
59 |
60 | # Load validation data; its format matches the training data
61 | def load_data_val(testList, vocab, index, batch_size, sent_len, imap):
62 | xlist, ylist, origxlist = [], [], []
63 | for i in range(0, batch_size):
64 | true_index = index + i
65 | if true_index >= len(testList):
66 | true_index = len(testList) - 1
67 | c, s = testList[true_index]
68 | xlist.append(encode_sent(vocab, s, sent_len))
69 | ylist.append(encode_index(c, imap))
70 | origxlist.append(s)
71 | return np.array(xlist, dtype='float32'), np.array(ylist, dtype='float32'), origxlist
72 |
73 | def load_train_list():
74 | tmap, tlist = {}, []
75 | for line in open('/export/jw/kg/data/sw_kgtrain.txt'):
76 | items = line.strip().split('\t')
77 | if (len(items) == 2):
78 | items.append('')
79 | if items[1] not in tmap:
80 | tmap[items[1]] = []
81 | tmap[items[1]].append(items[2])
82 | tlist.append((items[1], items[2]))
83 | return tmap, tlist
84 |
85 | def load_data(train_list, vocab, batch_size, sent_len, imap):
86 | xlist, ylist = [], []
87 | for i in range(0, batch_size):
88 | c, sent = train_list[random.randint(0, len(train_list) - 1)]
89 | xlist.append(encode_sent(vocab, sent, sent_len))
90 | ylist.append(encode_index(c, imap))
91 | return np.array(xlist, dtype='float32'), np.array(ylist, dtype='float32')
92 |
93 | class CNN(object):
94 | def __init__(
95 | self, sequence_length, batch_size,
96 | vocab_size, embedding_size,
97 | filter_sizes, num_filters, num_classes, l2_reg_lambda=0.0):
98 |
99 | # User query; character embeddings are looked up with embedding_lookup
100 | self.x_batch = tf.placeholder(tf.int32, [batch_size, sequence_length], name="x_batch")
101 | self.y_batch = tf.placeholder(tf.int32, [batch_size, num_classes], name='y_batch')
102 | self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
103 | print("xlist", self.x_batch)
104 |
105 | # Embedding layer
106 | with tf.device('/cpu:0'), tf.name_scope("embedding"):
107 | We = tf.Variable(
108 | tf.truncated_normal([vocab_size, embedding_size], stddev=0.1),
109 | name="W")
110 | chars = tf.nn.embedding_lookup(We, self.x_batch)
111 | self.embedded_chars = chars
112 | self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
113 |
114 | pooled_outputs = []
115 | for i, filter_size in enumerate(filter_sizes):
116 | with tf.name_scope("conv-maxpool-%s" % filter_size):
117 | filter_shape = [filter_size, embedding_size, 1, num_filters]
118 | W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.01), name="W-%s" % filter_size)
119 | b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b-%s" % filter_size)
120 | conv = tf.nn.conv2d(
121 | self.embedded_chars_expanded,
122 | W,
123 | strides=[1, 1, 1, 1],
124 | padding='VALID',
125 | name="conv"
126 | )
127 | h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
128 | pooled = tf.nn.max_pool(
129 | h,
130 | ksize=[1, sequence_length - filter_size + 1, 1, 1],
131 | strides=[1, 1, 1, 1],
132 | padding='VALID',
133 | name="pool"
134 | )
135 | pooled_outputs.append(pooled)
136 | num_filters_total = num_filters * len(filter_sizes)
137 | pooled_reshape = tf.reshape(tf.concat(pooled_outputs, 3), [-1, num_filters_total])
138 | #dropout
139 | h_drop = tf.nn.dropout(pooled_reshape, self.dropout_keep_prob)
140 |
141 | Wfc = tf.Variable(tf.truncated_normal([num_filters_total, num_classes], stddev=0.1), name='Wfc')
142 | bfc = tf.Variable(tf.constant(0.1, shape=[num_classes]), name='bfc')
143 | h_output = tf.nn.xw_plus_b(h_drop, Wfc, bfc, name='scores')
144 | print('h_output', h_output)
145 |
146 | with tf.name_scope("output"):
147 | self.orig_y = tf.argmax(self.y_batch, 1)
148 | self.pred_y = tf.argmax(h_output, 1)
149 |
150 | with tf.name_scope("loss"):
151 | #cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=h_output, labels=self.y_batch)
152 | #print('batch_loss', cross_entropy)
153 | #self.loss = tf.reduce_mean(cross_entropy)
154 | #print('loss ', self.loss)
155 | self.loss = tf.losses.softmax_cross_entropy(self.y_batch, h_output)
156 |
157 | # Accuracy
158 | with tf.name_scope("accuracy"):
159 | correct = tf.equal(tf.argmax(h_output, 1), tf.argmax(self.y_batch, 1))
160 | print('correct', correct)
161 | self.accuracy = tf.reduce_mean(tf.cast(correct, "float"), name="accuracy")
162 |
163 | # Parameters
164 | # ==================================================
165 | # Model Hyperparameters
166 | tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 128)")
167 | tf.flags.DEFINE_string("filter_sizes", "1,2,3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
168 | tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size (default: 128)")
169 | tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
170 | tf.flags.DEFINE_float("l2_reg_lambda", 0, "L2 regularizaion lambda (default: 0.0)")
171 |
172 | # Training parameters
173 | tf.flags.DEFINE_integer("batch_size", 512, "Batch Size (default: 64)")
174 | tf.flags.DEFINE_integer("num_epochs", 5000000, "Number of training epochs (default: 200)")
175 | tf.flags.DEFINE_integer("evaluate_every", 500, "Evaluate model on dev set after this many steps (default: 100)")
176 | tf.flags.DEFINE_integer("checkpoint_every", 3000, "Save model after this many steps (default: 100)")
177 | # Misc Parameters
178 | tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
179 | tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
180 | sent_len = int(100)
181 |
182 | FLAGS = tf.flags.FLAGS
183 | FLAGS._parse_flags()
184 | print("\nParameters:")
185 | for attr, value in sorted(FLAGS.__flags.items()):
186 | print("{}={}".format(attr.upper(), value))
187 | print("")
188 |
189 | # Data Preparation
190 | # ==================================================
191 | # Load data
192 | print("Loading data...")
193 |
194 | def train_step(x_batch, y_batch):
195 | feed_dict = {
196 | cnn.x_batch: x_batch,
197 | cnn.y_batch: y_batch,
198 | cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
199 | }
200 | _, step, summaries, loss, accuracy = sess.run(
201 | [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy], feed_dict)
202 | time_str = datetime.datetime.now().isoformat()
203 | print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
204 | train_summary_writer.add_summary(summaries, step)
205 |
206 | def dev_step(testList, vocab, batch_size, sent_len, imap):
207 | index, output_list, origy_list, origx_list = int(0), [], [], []
208 | while True:
209 | x_batch, y_batch, origx = load_data_val(testList, vocab, index, batch_size, sent_len, imap)
210 | feed_dict = {cnn.x_batch: x_batch, cnn.y_batch: y_batch, cnn.dropout_keep_prob: 1.0}
211 | origy, output = sess.run([cnn.orig_y, cnn.pred_y], feed_dict)
212 | for c in output:
213 | output_list.append(c)
214 | for c in origy:
215 | origy_list.append(c)
216 | for c in origx:
217 | origx_list.append(c)
218 | index += batch_size
219 | if index >= len(testList):
220 | break
221 | fp = open('/export/jw/kg/cnn.output', 'w+')
222 | i2nmap = {}
223 | for name, index in imap.items():
224 | i2nmap[index] = name
225 | for i in range(0, len(output_list)):
226 | fp.write(i2nmap[int(output_list[i])] + '\t' + i2nmap[origy_list[i]] + '\t' + origx_list[i] + '\n')
227 | fp.close()
228 | print('write done ......')
229 |
230 | def load_test_list():
231 | testList = []
232 | for line in open('/export/jw/kg/data/sw_kgval.txt'):
233 | items = line.strip().split('\t')
234 | if (len(items) == 2):
235 | items.append('')
236 | testList.append((items[1], items[2]))
237 | return testList
238 |
239 | vocab = build_vocab()
240 | train_map, train_list = load_train_list()
241 | test_list = load_test_list()
242 | imap = load_index()
243 | xlist, ylist = load_data(train_list, vocab, FLAGS.batch_size, sent_len, imap)
244 | num_classes = ylist.shape[1]
245 | print("Load done...")
246 |
247 | # Training
248 | # ==================================================
249 |
250 | with tf.Graph().as_default():
251 | with tf.device("/gpu:0"):
252 | session_conf = tf.ConfigProto(
253 | allow_soft_placement=FLAGS.allow_soft_placement,
254 | log_device_placement=FLAGS.log_device_placement)
255 | sess = tf.Session(config=session_conf)
256 | with sess.as_default():
257 | cnn = CNN(
258 | sequence_length=sent_len,
259 | batch_size=FLAGS.batch_size,
260 | vocab_size=len(vocab),
261 | embedding_size=FLAGS.embedding_dim,
262 | filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
263 | num_filters=FLAGS.num_filters,
264 | num_classes=num_classes,
265 | l2_reg_lambda=FLAGS.l2_reg_lambda)
266 |
267 | # Define Training procedure
268 | global_step = tf.Variable(0, name="global_step", trainable=False)
269 | optimizer = tf.train.RMSPropOptimizer(0.0005)
270 | #optimizer = tf.train.AdamOptimizer(0.0001)
271 | #optimizer = tf.train.GradientDescentOptimizer(1e-2)
272 | grads_and_vars = optimizer.compute_gradients(cnn.loss)
273 | train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
274 |
275 | # Keep track of gradient values and sparsity (optional)
276 | grad_summaries = []
277 | for g, v in grads_and_vars:
278 | if g is not None:
279 | grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
280 | sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
281 | grad_summaries.append(grad_hist_summary)
282 | grad_summaries.append(sparsity_summary)
283 | grad_summaries_merged = tf.summary.merge(grad_summaries)
284 |
285 | # Output directory for models and summaries
286 | timestamp = str(int(time.time()))
287 | out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
288 | print("Writing to {}\n".format(out_dir))
289 |
290 | # Summaries for loss and accuracy
291 | loss_summary = tf.summary.scalar("loss", cnn.loss)
292 | acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
293 |
294 | # Train Summaries
295 | #train_summary_op = tf.merge_summary([loss_summary, acc_summary, grad_summaries_merged])
296 | train_summary_op = tf.summary.merge([loss_summary, acc_summary])
297 | train_summary_dir = os.path.join(out_dir, "summaries", "train")
298 | train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph_def)
299 |
300 | # Dev summaries
301 | dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
302 | dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
303 | dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph_def)
304 |
305 | # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
306 | checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
307 | checkpoint_prefix = os.path.join(checkpoint_dir, "model")
308 | if not os.path.exists(checkpoint_dir):
309 | os.makedirs(checkpoint_dir)
310 | saver = tf.train.Saver(tf.global_variables())
311 |
312 | # Initialize all variables
313 | sess.run(tf.global_variables_initializer())
314 |
315 | # Generate batches
316 | # Training loop. For each batch...
317 | for i in range(FLAGS.num_epochs):
318 | try:
319 | x_batch, y_batch = load_data(train_list, vocab, FLAGS.batch_size, sent_len, imap)
320 | train_step(x_batch, y_batch)
321 | current_step = tf.train.global_step(sess, global_step)
322 | if current_step % FLAGS.evaluate_every == 0:
323 | print("\nEvaluation:")
324 | dev_step(test_list, vocab, FLAGS.batch_size, sent_len, imap)
325 | print("")
326 | if current_step % FLAGS.checkpoint_every == 0:
327 | path = saver.save(sess, checkpoint_prefix, global_step=current_step)
328 | print("Saved model checkpoint to {}\n".format(path))
329 | except Exception as e:
330 | print(e)
331 |
332 |
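333 | # Sketch (not part of the original script): to reuse a saved checkpoint for inference,
334 | # restore the variables into the same graph and run the prediction op, roughly:
335 | #   saver.restore(sess, tf.train.latest_checkpoint(checkpoint_dir))
336 | #   preds = sess.run(cnn.pred_y, {cnn.x_batch: x_batch, cnn.dropout_keep_prob: 1.0})
337 | # where x_batch is an encoded batch as produced by load_data / load_data_val.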
--------------------------------------------------------------------------------