├── README.md
├── code
    ├── attention.py
    ├── attention_N.py
    ├── attention_N_parent.py
    ├── attention_parent.py
    ├── myModel_commented.py
    ├── pointer.py
    ├── pointer_parent.py
    ├── reader_pointer.py
    ├── reader_pointer_original.py
    └── vanillaLSTM.py
└── preprocess_code
    ├── freq_dict.py
    ├── get_non_terminal.py
    ├── get_terminal_dict.py
    ├── get_terminal_whole.py
    ├── get_total_length.py
    ├── output.txt
    └── utils.py


/README.md:
--------------------------------------------------------------------------------
 1 | # Code completion
 2 | This is a repo holding codes for the paper: Code Completion with Neural Attention and Pointer Networks
 3 | 
 4 | ## Descriptions for the directories
 5 | ### code
 6 | * attention.py: standard attention model for predicting terminals
 7 | * attention_N.py: standard attention model for predicting non-terminals
 8 | * attention_N_parent.py: parent attention model for predicting non-terminals
 9 | * attention_parent.py: parent attention model for predicting terminals
10 | * myModel_commented.py: a good commented example for our model
11 | * pointer.py: our poirnter mixture network without parent attention
12 | * pointer_parent.py: our poirnter mixture network with parent attention
13 | * reader_pointer.py: reader for reading dataset (with parent)
14 | * reader_pointer_original.py: reader for reading dataset (original without parent)
15 | * vanillaLSTM.py: vanilla LSTM
16 | 
17 | ### preprocess_code
18 | * freq_dict.py: generate the frequency dictionary for terminals
19 | * get_non_terminal.py: process the non-terminals (utilize AST information)
20 | * get_terminal_dict.py: get the terminal dictionary according to the vocabulary size
21 | * get_terminal_whole.py: the final step to process the terminals (recording location and parent information)
22 | * get_total_length.py: calculate the total length of the file 
23 | * output.txt: some statistics for the terminals
24 | * utils.py: some utils to process the data
25 | 
26 | ## Download the dataset
27 | This is the link for you to download the raw dataset: [JS & PY data](http://plml.ethz.ch/)
28 | If you do not want to get your hands dirty with data preprocess, you can download the pickle data after preprocessed here: [pickle data](https://drive.google.com/open?id=1EZZuL8Rl3tatvxpIClvO_a8JD_Oid_oY)
29 | 
30 | ## How to run the code
31 | 1. Download the dataset
32 | 2. Preprocess the data into pickle files and store them in a proper directory
33 | 3. Simply adjust the parameter setting inside the code file and run using python3, e.g. python3 attention.py.
34 | 


--------------------------------------------------------------------------------
/code/attention.py:
--------------------------------------------------------------------------------
  1 | # attentional LSTM, count all unk as wrong, default predict termianl
  2 | # what we exactly use
  3 | # use reader_pointer_original, without parent
  4 | 
  5 | from __future__ import absolute_import
  6 | from __future__ import division
  7 | from __future__ import print_function
  8 | 
  9 | import inspect
 10 | import time
 11 | 
 12 | import numpy as np
 13 | import tensorflow as tf
 14 | 
 15 | import reader_pointer_original as reader
 16 | import os
 17 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 18 | 
 19 | os.environ['CUDA_VISIBLE_DEVICES']='0'
 20 | outfile = 'output_attention.txt'
 21 | 
 22 | N_filename = '../pickle_data/JS_non_terminal.pickle'
 23 | T_filename = '../pickle_data/JS_terminal_50k_whole.pickle'
 24 | 
 25 | flags = tf.flags
 26 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A'
 27 |                     "Model output directory.")
 28 | 
 29 | flags.DEFINE_string(
 30 |     "model", "small",
 31 |     "A type of model. Possible options are: small, medium, best.")
 32 | # flags.DEFINE_string("data_path", '../data/dataJS',
 33 | #                     "Where the training/test data is stored.")
 34 | flags.DEFINE_bool("use_fp16", False,
 35 |                   "Train using 16-bit floats instead of 32bit floats")
 36 | 
 37 | FLAGS = flags.FLAGS
 38 | logging = tf.logging
 39 | 
 40 | if FLAGS.model == "test":
 41 |   outfile = 'TESToutput.txt'
 42 | def data_type():
 43 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 44 | 
 45 | class SmallConfig(object):
 46 |   """Small config.  get best result as 0.733 """
 47 |   init_scale = 0.05
 48 |   learning_rate = 0.001
 49 |   max_grad_norm = 5
 50 |   num_layers = 1#1
 51 |   num_steps = 50
 52 |   attn_size = 50
 53 |   hidden_sizeN = 300
 54 |   hidden_sizeT = 500
 55 |   sizeH = 800
 56 |   max_epoch = 1#8
 57 |   max_max_epoch = 8#79
 58 |   keep_prob = 1.0#1.0
 59 |   lr_decay = 0.6#0.95
 60 |   batch_size = 128#80
 61 | 
 62 | class TestConfig(object):
 63 |   """Tiny config, for testing."""
 64 |   init_scale = 0.05
 65 |   learning_rate = 0.001
 66 |   max_grad_norm = 5
 67 |   num_layers = 1
 68 |   num_steps = 50
 69 |   attn_size = 50
 70 |   hidden_sizeN = 50
 71 |   hidden_sizeT = 50
 72 |   sizeH = 100
 73 |   max_epoch = 1
 74 |   max_max_epoch = 1
 75 |   keep_prob = 1.0
 76 |   lr_decay = 0.6
 77 |   batch_size = 128
 78 | 
 79 | 
 80 | def get_config():
 81 |   if FLAGS.model == "small":
 82 |     return SmallConfig()
 83 |   elif FLAGS.model == "medium":
 84 |     return MediumConfig()
 85 |   elif FLAGS.model == "best":
 86 |     return BestConfig()
 87 |   elif FLAGS.model == "test":
 88 |     return TestConfig()
 89 |   else:
 90 |     raise ValueError("Invalid model: %s", FLAGS.model)
 91 | 
 92 | 
 93 | class PTBInput(object):
 94 |   """The input data."""
 95 | 
 96 |   def __init__(self, config, data, name=None):
 97 |     self.batch_size = batch_size = config.batch_size
 98 |     self.attn_size = attn_size = config.attn_size
 99 |     self.num_steps = num_steps = config.num_steps
100 |     self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator = \
101 |             reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name)
102 |     if FLAGS.model == "test":
103 |       self.epoch_size = 16   #small epoch size for test
104 | 
105 | 
106 | class PTBModel(object):
107 | 
108 |   def __init__(self, is_training, config, input_):
109 |     self._input = input_
110 |     self.attn_size = attn_size = config.attn_size
111 |     batch_size = input_.batch_size
112 |     num_steps = input_.num_steps
113 |     self.sizeN = sizeN = config.hidden_sizeN
114 |     self.sizeT = sizeT = config.hidden_sizeT
115 |     self.size = size = config.sizeH
116 |     vocab_sizeN, vocab_sizeT = config.vocab_size
117 | 
118 |     # Slightly better results can be obtained with forget gate biases
119 |     # initialized to 1 but the hyperparameters of the model would need to be
120 |     # different than reported in the paper.
121 |     def lstm_cell():
122 |       if 'reuse' in inspect.getargspec(
123 |           tf.contrib.rnn.BasicLSTMCell.__init__).args:
124 |         return tf.contrib.rnn.BasicLSTMCell(
125 |             size, forget_bias=1.0, state_is_tuple=True,
126 |             reuse=tf.get_variable_scope().reuse)
127 |       else:
128 |         return tf.contrib.rnn.BasicLSTMCell(
129 |             size, forget_bias=1.0, state_is_tuple=True)
130 |     attn_cell = lstm_cell
131 |     if is_training and config.keep_prob < 1:
132 |       def attn_cell():
133 |         return tf.contrib.rnn.DropoutWrapper(
134 |             lstm_cell(), output_keep_prob=config.keep_prob)
135 |     cell = tf.contrib.rnn.MultiRNNCell(
136 |         [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
137 | 
138 |     state_variables = []
139 |     with tf.variable_scope("myCH0"):
140 |       for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())):
141 |         if i > 0: tf.get_variable_scope().reuse_variables()
142 |         myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer())
143 |         myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer())
144 |         myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)])
145 |         myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)])
146 |         state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor))
147 | 
148 |     self._initial_state = state_variables
149 | 
150 |     self.eof_indicator = input_.eof_indicator
151 | 
152 |     with tf.device("/cpu:0"):
153 |       embeddingN = tf.get_variable(
154 |           "embeddingN", [vocab_sizeN, sizeN], dtype=data_type())
155 |       inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN)
156 | 
157 |     with tf.device("/cpu:0"):
158 |       embeddingT = tf.get_variable(
159 |           "embeddingT", [vocab_sizeT, sizeT], dtype=data_type())
160 |       inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT)
161 | 
162 |     inputs = tf.concat([inputsN, inputsT], 2)
163 |     #inputs = tf.one_hot(input_.input_data, vocab_size) 
164 |     if is_training and config.keep_prob < 1:
165 |       inputs = tf.nn.dropout(inputs, config.keep_prob)
166 | 
167 |     outputs = []
168 |     attentions = []
169 |     parents = []
170 |     state = self._initial_state
171 |     self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory")
172 |     valid_memory = self.memory[:,-attn_size:,:]
173 |     # print ("test test test,, state shape", np.array(state).shape)
174 |     with tf.variable_scope("RNN"):
175 |       for time_step in range(num_steps):
176 |         if time_step > 0: tf.get_variable_scope().reuse_variables()
177 |         (cell_output, state) = cell(inputs[:, time_step, :], state)
178 |         outputs.append(cell_output)
179 | 
180 |         # parent_index = input_.input_dataP[:, time_step]
181 |         # cell_parent = [valid_memory[i,-parent_index[i],:] for i in range(batch_size)]
182 |         # parents.append(tf.convert_to_tensor(cell_parent))
183 | 
184 |         wm = tf.get_variable("wm", [size, size], dtype=data_type())
185 |         wh = tf.get_variable("wh", [size, size], dtype=data_type())
186 |         wt = tf.get_variable("wt", [size, 1], dtype=data_type())
187 |         gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size]))
188 |         alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size]))
189 |         ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1])))
190 |         attentions.append(ct)
191 |         valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1)
192 | 
193 |     output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
194 |     attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size])
195 |     # parent = tf.reshape(tf.stack(axis=1, values=parents), [-1, size])
196 |     self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch
197 |     wa = tf.get_variable("wa", [size*2, size], dtype=data_type())
198 |     nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa))
199 | 
200 |     softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type())
201 |     softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type())
202 |     logits = tf.matmul(nt, softmax_w) + softmax_b
203 |     labels = tf.reshape(input_.targetsT, [-1])
204 |     weights = tf.ones([batch_size * num_steps], dtype=data_type())
205 |     
206 |     #counting unk as wrong
207 |     unk_id = vocab_sizeT - 2
208 |     unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape)
209 |     zero_weights = tf.zeros_like(labels, dtype=data_type())
210 |     wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape)
211 |     condition_tf = tf.equal(labels, unk_tf)
212 |     new_weights = tf.where(condition_tf, zero_weights, weights)
213 |     new_labels = tf.where(condition_tf, wrong_label, labels)
214 | 
215 |     
216 |     loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [new_weights])
217 |     probs = tf.nn.softmax(logits)
218 |     correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels)
219 |     self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
220 | 
221 |     self._cost = cost = tf.reduce_sum(loss) / batch_size
222 |     self._final_state = state
223 | 
224 |     if not is_training:
225 |       return
226 | 
227 |     self._lr = tf.Variable(0.0, trainable=False)
228 |     tvars = tf.trainable_variables()
229 |     print ('tvars', len(tvars))
230 |     grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
231 |                                       config.max_grad_norm)
232 |     print ('*******the length', len(grads), '\n')
233 |     optimizer = tf.train.AdamOptimizer(self._lr)
234 |     self._train_op = optimizer.apply_gradients(
235 |         zip(grads, tvars),
236 |         global_step=tf.contrib.framework.get_or_create_global_step())
237 | 
238 |     self._new_lr = tf.placeholder(
239 |         tf.float32, shape=[], name="new_learning_rate")
240 |     self._lr_update = tf.assign(self._lr, self._new_lr)
241 | 
242 |   def assign_lr(self, session, lr_value):
243 |     session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
244 | 
245 |   @property
246 |   def input(self):
247 |     return self._input
248 | 
249 |   @property
250 |   def initial_state(self):
251 |     return self._initial_state
252 | 
253 |   @property
254 |   def cost(self):
255 |     return self._cost
256 | 
257 |   @property
258 |   def final_state(self):
259 |     return self._final_state
260 | 
261 |   @property
262 |   def accuracy(self):
263 |     return self._accuracy
264 | 
265 |   @property
266 |   def lr(self):
267 |     return self._lr
268 | 
269 |   @property
270 |   def train_op(self):
271 |     return self._train_op
272 | 
273 | 
274 | def run_epoch(session, model, eval_op=None, verbose=False):
275 |   start_time = time.time()
276 |   costs = 0.0
277 |   accuracy_list = []
278 |   iters = 0
279 |   state = session.run(model.initial_state)
280 |   # print ('at the very initial of the run_epoch\n', state[0].c)
281 |   eof_indicator = np.ones((model.input.batch_size), dtype=bool)
282 |   memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size])
283 |   # file_id = session.run(model.initial_file_id) #need to remove _
284 | 
285 |   fetches = {
286 |       "cost": model.cost,
287 |       "accuracy": model.accuracy,
288 |       "final_state": model.final_state,
289 |       "eof_indicator": model.eof_indicator,
290 |       "memory":model.output,
291 |   }
292 |   if eval_op is not None:
293 |     fetches["eval_op"] = eval_op
294 | 
295 |   for step in range(model.input.epoch_size):
296 |     feed_dict = {}
297 |     # current_file_id = file_id #session.run(model.file_id)
298 |     sub_cond = np.expand_dims(eof_indicator, axis = 1)
299 |     condition = np.repeat(sub_cond, model.size, axis = 1)
300 |     # zero_state = np.zeros_like(condition)
301 |     # zero_state = np.random.uniform(-0.05,0.05,condition.shape)
302 |     zero_state = session.run(model.initial_state)
303 | 
304 |     for i, (c, h) in enumerate(model.initial_state):
305 |       assert condition.shape == state[i].c.shape
306 |       feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c)
307 |       feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h)
308 |     
309 |     feed_dict[model.memory] = memory
310 |     vals = session.run(fetches, feed_dict)
311 |     
312 |     cost = vals["cost"]
313 |     accuracy = vals["accuracy"]
314 |     eof_indicator = vals["eof_indicator"]
315 |     state = vals["final_state"]  #use the final state as the initial state within a whole epoch
316 |     memory = vals["memory"]
317 | 
318 |     accuracy_list.append(accuracy)
319 |     costs += cost
320 |     iters += model.input.num_steps
321 | 
322 |     if verbose and step % (model.input.epoch_size // 10) == 10:
323 |       print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" %
324 |             (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list),
325 |              (time.time() - start_time)))
326 |   
327 |   print ('this run_epoch takes time %.2f' %(time.time() - start_time))
328 |   return np.exp(costs / iters), np.mean(accuracy_list)
329 | 
330 | 
331 | def main(_):
332 |   start_time = time.time()
333 |   fout = open(outfile, 'a')
334 |   print ('\n', time.asctime(time.localtime()), file=fout)
335 |   print ('start a new experiment %s'%outfile, file=fout)
336 |   print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout)
337 | 
338 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size \
339 |      = reader.input_data(N_filename, T_filename)
340 | 
341 |   train_data = (train_dataN, train_dataT)
342 |   valid_data = (valid_dataN, valid_dataT)
343 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof]
344 | 
345 |   config = get_config()
346 |   assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration
347 |   config.vocab_size = vocab_size  
348 |   eval_config = get_config()
349 |   eval_config.batch_size = config.batch_size * config.num_steps
350 |   eval_config.num_steps = 1
351 |   eval_config.vocab_size = vocab_size
352 | 
353 |   with tf.Graph().as_default():
354 |     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
355 | 
356 |     with tf.name_scope("Train"):
357 |       train_input = PTBInput(config=config, data=train_data, name="TrainInput")
358 |       with tf.variable_scope("Model", reuse=None, initializer=initializer):
359 |         m = PTBModel(is_training=True, config=config, input_=train_input)
360 | 
361 |     with tf.name_scope("Valid"):
362 |       valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
363 |       with tf.variable_scope("Model", reuse=True, initializer=initializer):
364 |         mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
365 | 
366 |     # with tf.name_scope("Test"):
367 |     #   test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput")
368 |     #   with tf.variable_scope("Model", reuse=True, initializer=initializer):
369 |     #     mtest = PTBModel(is_training=False, config=eval_config,
370 |     #                      input_=test_input)
371 | 
372 | 
373 |     print ('total trainable variables', len(tf.trainable_variables()), '\n\n')
374 |     max_valid = 0
375 |     max_step = 0
376 |     saver = tf.train.Saver()
377 | 
378 |     sv = tf.train.Supervisor(logdir=None, summary_op=None)
379 |     with sv.managed_session() as session:
380 | 
381 |       for i in range(config.max_max_epoch):
382 |         lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
383 |         m.assign_lr(session, config.learning_rate * lr_decay)
384 |         print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr)))
385 | 
386 |         train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True)
387 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy))
388 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout)
389 | 
390 |         if i > 5:
391 |           valid_perplexity, valid_accuracy = run_epoch(session, mvalid)
392 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy))
393 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout)
394 |           if valid_accuracy > max_valid:
395 |             max_valid = valid_accuracy
396 |             max_step = i + 1
397 | 
398 |       # test_perplexity, test_accuracy = run_epoch(session, mtest)
399 |       # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy))
400 | 
401 |       print ('max step %d, max valid %.3f' %(max_step, max_valid))
402 |       # print ('data path is', FLAGS.data_path)
403 |       print ('total time takes', time.time()-start_time)
404 |       print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout)
405 |       print ('total time takes', time.time()-start_time, file=fout)
406 |       fout.close()
407 | 
408 |       # if FLAGS.save_path:
409 |       #   print("Saving model to %s." % FLAGS.save_path)
410 |       #   save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False)
411 | 
412 | 
413 | if __name__ == "__main__":
414 |   tf.app.run()
415 | 


--------------------------------------------------------------------------------
/code/attention_N.py:
--------------------------------------------------------------------------------
  1 | # attentional LSTM, predict non terminal
  2 | # what we exactly use
  3 | # use reader_pointer_original, without parent
  4 | 
  5 | from __future__ import absolute_import
  6 | from __future__ import division
  7 | from __future__ import print_function
  8 | 
  9 | import inspect
 10 | import time
 11 | 
 12 | import numpy as np
 13 | import tensorflow as tf
 14 | 
 15 | import reader_pointer_original as reader
 16 | import os
 17 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 18 | 
 19 | os.environ['CUDA_VISIBLE_DEVICES']='0'
 20 | outfile = 'output_attention_N.txt'
 21 | 
 22 | N_filename = '../pickle_data/PY_non_terminal.pickle'
 23 | T_filename = '../pickle_data/PY_terminal_50k_whole.pickle'
 24 | 
 25 | flags = tf.flags
 26 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A'
 27 |                     "Model output directory.")
 28 | 
 29 | flags.DEFINE_string(
 30 |     "model", "small",
 31 |     "A type of model. Possible options are: small, medium, best.")
 32 | # flags.DEFINE_string("data_path", '../data/dataJS',
 33 | #                     "Where the training/test data is stored.")
 34 | flags.DEFINE_bool("use_fp16", False,
 35 |                   "Train using 16-bit floats instead of 32bit floats")
 36 | 
 37 | FLAGS = flags.FLAGS
 38 | logging = tf.logging
 39 | 
 40 | if FLAGS.model == "test":
 41 |   outfile = 'TESToutput.txt'
 42 | def data_type():
 43 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 44 | 
 45 | class SmallConfig(object):
 46 |   """Small config.  get best result as 0.733 """
 47 |   init_scale = 0.05
 48 |   learning_rate = 0.001
 49 |   max_grad_norm = 5
 50 |   num_layers = 1#1
 51 |   num_steps = 50
 52 |   attn_size = 50
 53 |   hidden_sizeN = 50
 54 |   hidden_sizeT = 50
 55 |   sizeH = 100
 56 |   max_epoch = 1#8
 57 |   max_max_epoch = 8#79
 58 |   keep_prob = 1.0#1.0
 59 |   lr_decay = 0.6#0.95
 60 |   batch_size = 128#80
 61 | 
 62 | class TestConfig(object):
 63 |   """Tiny config, for testing."""
 64 |   init_scale = 0.05
 65 |   learning_rate = 0.001
 66 |   max_grad_norm = 5
 67 |   num_layers = 1
 68 |   num_steps = 50
 69 |   attn_size = 50
 70 |   hidden_sizeN = 50
 71 |   hidden_sizeT = 50
 72 |   sizeH = 100
 73 |   max_epoch = 1
 74 |   max_max_epoch = 1
 75 |   keep_prob = 1.0
 76 |   lr_decay = 0.6
 77 |   batch_size = 128
 78 | 
 79 | 
 80 | def get_config():
 81 |   if FLAGS.model == "small":
 82 |     return SmallConfig()
 83 |   elif FLAGS.model == "medium":
 84 |     return MediumConfig()
 85 |   elif FLAGS.model == "best":
 86 |     return BestConfig()
 87 |   elif FLAGS.model == "test":
 88 |     return TestConfig()
 89 |   else:
 90 |     raise ValueError("Invalid model: %s", FLAGS.model)
 91 | 
 92 | 
 93 | class PTBInput(object):
 94 |   """The input data."""
 95 | 
 96 |   def __init__(self, config, data, name=None):
 97 |     self.batch_size = batch_size = config.batch_size
 98 |     self.attn_size = attn_size = config.attn_size
 99 |     self.num_steps = num_steps = config.num_steps
100 |     self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator = \
101 |             reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name)
102 |     if FLAGS.model == "test":
103 |       self.epoch_size = 16   #small epoch size for test
104 | 
105 | 
106 | class PTBModel(object):
107 | 
108 |   def __init__(self, is_training, config, input_):
109 |     self._input = input_
110 |     self.attn_size = attn_size = config.attn_size
111 |     batch_size = input_.batch_size
112 |     num_steps = input_.num_steps
113 |     self.sizeN = sizeN = config.hidden_sizeN
114 |     self.sizeT = sizeT = config.hidden_sizeT
115 |     self.size = size = config.sizeH
116 |     vocab_sizeN, vocab_sizeT = config.vocab_size
117 | 
118 |     # Slightly better results can be obtained with forget gate biases
119 |     # initialized to 1 but the hyperparameters of the model would need to be
120 |     # different than reported in the paper.
121 |     def lstm_cell():
122 |       if 'reuse' in inspect.getargspec(
123 |           tf.contrib.rnn.BasicLSTMCell.__init__).args:
124 |         return tf.contrib.rnn.BasicLSTMCell(
125 |             size, forget_bias=1.0, state_is_tuple=True,
126 |             reuse=tf.get_variable_scope().reuse)
127 |       else:
128 |         return tf.contrib.rnn.BasicLSTMCell(
129 |             size, forget_bias=1.0, state_is_tuple=True)
130 |     attn_cell = lstm_cell
131 |     if is_training and config.keep_prob < 1:
132 |       def attn_cell():
133 |         return tf.contrib.rnn.DropoutWrapper(
134 |             lstm_cell(), output_keep_prob=config.keep_prob)
135 |     cell = tf.contrib.rnn.MultiRNNCell(
136 |         [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
137 | 
138 |     state_variables = []
139 |     with tf.variable_scope("myCH0"):
140 |       for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())):
141 |         if i > 0: tf.get_variable_scope().reuse_variables()
142 |         myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer())
143 |         myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer())
144 |         myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)])
145 |         myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)])
146 |         state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor))
147 | 
148 |     self._initial_state = state_variables
149 | 
150 |     self.eof_indicator = input_.eof_indicator
151 | 
152 |     with tf.device("/cpu:0"):
153 |       embeddingN = tf.get_variable(
154 |           "embeddingN", [vocab_sizeN, sizeN], dtype=data_type())
155 |       inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN)
156 | 
157 |     with tf.device("/cpu:0"):
158 |       embeddingT = tf.get_variable(
159 |           "embeddingT", [vocab_sizeT, sizeT], dtype=data_type())
160 |       inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT)
161 | 
162 |     inputs = tf.concat([inputsN, inputsT], 2)
163 |     #inputs = tf.one_hot(input_.input_data, vocab_size) 
164 |     if is_training and config.keep_prob < 1:
165 |       inputs = tf.nn.dropout(inputs, config.keep_prob)
166 | 
167 |     outputs = []
168 |     attentions = []
169 |     state = self._initial_state
170 |     self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory")
171 |     valid_memory = self.memory[:,-attn_size:,:]
172 |     # print ("test test test,, state shape", np.array(state).shape)
173 |     with tf.variable_scope("RNN"):
174 |       for time_step in range(num_steps):
175 |         if time_step > 0: tf.get_variable_scope().reuse_variables()
176 |         (cell_output, state) = cell(inputs[:, time_step, :], state)
177 |         outputs.append(cell_output)
178 | 
179 |         wm = tf.get_variable("wm", [size, size], dtype=data_type())
180 |         wh = tf.get_variable("wh", [size, size], dtype=data_type())
181 |         wt = tf.get_variable("wt", [size, 1], dtype=data_type())
182 |         gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size]))
183 |         alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size]))
184 |         ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1])))
185 |         attentions.append(ct)
186 |         valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1)
187 | 
188 |     output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
189 |     attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size])
190 |     self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch
191 |     wa = tf.get_variable("wa", [size*2, size], dtype=data_type())
192 |     nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa))
193 | 
194 |     softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeN], dtype=data_type())
195 |     softmax_b = tf.get_variable("softmax_b", [vocab_sizeN], dtype=data_type())
196 |     logits = tf.matmul(nt, softmax_w) + softmax_b
197 |     labels = tf.reshape(input_.targetsN, [-1])
198 |     weights = tf.ones([batch_size * num_steps], dtype=data_type())
199 |     
200 |     loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [weights])
201 |     probs = tf.nn.softmax(logits)
202 |     correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), labels)
203 |     self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
204 | 
205 |     self._cost = cost = tf.reduce_sum(loss) / batch_size
206 |     self._final_state = state
207 | 
208 |     if not is_training:
209 |       return
210 | 
211 |     self._lr = tf.Variable(0.0, trainable=False)
212 |     tvars = tf.trainable_variables()
213 |     print ('tvars', len(tvars))
214 |     grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
215 |                                       config.max_grad_norm)
216 |     print ('*******the length', len(grads))
217 |     optimizer = tf.train.AdamOptimizer(self._lr)
218 |     self._train_op = optimizer.apply_gradients(
219 |         zip(grads, tvars),
220 |         global_step=tf.contrib.framework.get_or_create_global_step())
221 | 
222 |     self._new_lr = tf.placeholder(
223 |         tf.float32, shape=[], name="new_learning_rate")
224 |     self._lr_update = tf.assign(self._lr, self._new_lr)
225 | 
226 |   def assign_lr(self, session, lr_value):
227 |     session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
228 | 
229 |   @property
230 |   def input(self):
231 |     return self._input
232 | 
233 |   @property
234 |   def initial_state(self):
235 |     return self._initial_state
236 | 
237 |   @property
238 |   def cost(self):
239 |     return self._cost
240 | 
241 |   @property
242 |   def final_state(self):
243 |     return self._final_state
244 | 
245 |   @property
246 |   def accuracy(self):
247 |     return self._accuracy
248 | 
249 |   @property
250 |   def lr(self):
251 |     return self._lr
252 | 
253 |   @property
254 |   def train_op(self):
255 |     return self._train_op
256 | 
257 | 
258 | def run_epoch(session, model, eval_op=None, verbose=False):
259 |   """Runs the model on the given data."""
260 |   start_time = time.time()
261 |   costs = 0.0
262 |   accuracy_list = []
263 |   iters = 0
264 |   state = session.run(model.initial_state)
265 |   # print ('at the very initial of the run_epoch\n', state[0].c)
266 |   eof_indicator = np.ones((model.input.batch_size), dtype=bool)
267 |   memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size])
268 |   # file_id = session.run(model.initial_file_id) #need to remove _
269 | 
270 |   fetches = {
271 |       "cost": model.cost,
272 |       "accuracy": model.accuracy,
273 |       "final_state": model.final_state,
274 |       "eof_indicator": model.eof_indicator,
275 |       "memory":model.output,
276 |   }
277 |   if eval_op is not None:
278 |     fetches["eval_op"] = eval_op
279 | 
280 |   for step in range(model.input.epoch_size):
281 |     feed_dict = {}
282 |     # current_file_id = file_id #session.run(model.file_id)
283 |     sub_cond = np.expand_dims(eof_indicator, axis = 1)
284 |     condition = np.repeat(sub_cond, model.size, axis = 1)
285 |     # zero_state = np.zeros_like(condition)
286 |     # zero_state = np.random.uniform(-0.05,0.05,condition.shape)
287 |     zero_state = session.run(model.initial_state)
288 | 
289 |     for i, (c, h) in enumerate(model.initial_state):
290 |       assert condition.shape == state[i].c.shape
291 |       feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c)
292 |       feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h)
293 |     
294 |     feed_dict[model.memory] = memory
295 |     vals = session.run(fetches, feed_dict)
296 |     
297 |     cost = vals["cost"]
298 |     accuracy = vals["accuracy"]
299 |     eof_indicator = vals["eof_indicator"]
300 |     state = vals["final_state"]  #use the final state as the initial state within a whole epoch
301 |     memory = vals["memory"]
302 | 
303 |     accuracy_list.append(accuracy)
304 |     costs += cost
305 |     iters += model.input.num_steps
306 | 
307 |     if verbose and step % (model.input.epoch_size // 10) == 10:
308 |       print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" %
309 |             (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list),
310 |              (time.time() - start_time)))
311 |   
312 |   print ('this run_epoch takes time %.2f' %(time.time() - start_time))
313 |   return np.exp(costs / iters), np.mean(accuracy_list)
314 | 
315 | 
316 | def main(_):
317 |   start_time = time.time()
318 |   fout = open(outfile, 'a')
319 |   print ('\n', time.asctime(time.localtime()), file=fout)
320 |   print ('start a new experiment %s'%outfile, file=fout)
321 |   print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout)
322 | 
323 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size \
324 |      = reader.input_data(N_filename, T_filename)
325 | 
326 |   train_data = (train_dataN, train_dataT)
327 |   valid_data = (valid_dataN, valid_dataT)
328 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof]
329 | 
330 |   config = get_config()
331 |   assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration
332 |   config.vocab_size = vocab_size  
333 |   eval_config = get_config()
334 |   eval_config.batch_size = config.batch_size * config.num_steps
335 |   eval_config.num_steps = 1
336 |   eval_config.vocab_size = vocab_size
337 | 
338 |   with tf.Graph().as_default():
339 |     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
340 | 
341 |     with tf.name_scope("Train"):
342 |       train_input = PTBInput(config=config, data=train_data, name="TrainInput")
343 |       with tf.variable_scope("Model", reuse=None, initializer=initializer):
344 |         m = PTBModel(is_training=True, config=config, input_=train_input)
345 | 
346 |     with tf.name_scope("Valid"):
347 |       valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
348 |       with tf.variable_scope("Model", reuse=True, initializer=initializer):
349 |         mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
350 | 
351 |     # with tf.name_scope("Test"):
352 |     #   test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput")
353 |     #   with tf.variable_scope("Model", reuse=True, initializer=initializer):
354 |     #     mtest = PTBModel(is_training=False, config=eval_config,
355 |     #                      input_=test_input)
356 | 
357 | 
358 |     print ('total trainable variables', len(tf.trainable_variables()), '\n\n')
359 |     max_valid = 0
360 |     max_step = 0
361 |     saver = tf.train.Saver()
362 | 
363 |     sv = tf.train.Supervisor(logdir=None, summary_op=None)
364 |     with sv.managed_session() as session:
365 | 
366 |       for i in range(config.max_max_epoch):
367 |         lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
368 |         m.assign_lr(session, config.learning_rate * lr_decay)
369 |         print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr)))
370 | 
371 |         train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True)
372 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy))
373 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout)
374 | 
375 |         if i > 5:
376 |           valid_perplexity, valid_accuracy = run_epoch(session, mvalid)
377 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy))
378 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout)
379 |           if valid_accuracy > max_valid:
380 |             max_valid = valid_accuracy
381 |             max_step = i + 1
382 | 
383 |       # test_perplexity, test_accuracy = run_epoch(session, mtest)
384 |       # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy))
385 | 
386 |       print ('max step %d, max valid %.3f' %(max_step, max_valid))
387 |       # print ('data path is', FLAGS.data_path)
388 |       print ('total time takes', time.time()-start_time)
389 |       print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout)
390 |       print ('total time takes', time.time()-start_time, file=fout)
391 |       fout.close()
392 | 
393 |       # if FLAGS.save_path:
394 |       #   print("Saving model to %s." % FLAGS.save_path)
395 |       #   save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False)
396 | 
397 | 
398 | if __name__ == "__main__":
399 |   tf.app.run()
400 | 


--------------------------------------------------------------------------------
/code/attention_N_parent.py:
--------------------------------------------------------------------------------
  1 | # attentional LSTM, predict non terminal
  2 | # what we exactly use
  3 | # revise 01/09, add parent hidden states at output
  4 | 
  5 | from __future__ import absolute_import
  6 | from __future__ import division
  7 | from __future__ import print_function
  8 | 
  9 | import inspect
 10 | import time
 11 | 
 12 | import numpy as np
 13 | import tensorflow as tf
 14 | 
 15 | import reader_pointer as reader
 16 | import os
 17 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 18 | 
 19 | os.environ['CUDA_VISIBLE_DEVICES']='0'
 20 | outfile = 'output_attention_parent_N.txt'
 21 | 
 22 | N_filename = '../pickle_data/PY_non_terminal.pickle'
 23 | T_filename = '../pickle_data/PY_terminal_50k_whole.pickle'
 24 | 
 25 | flags = tf.flags
 26 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A'
 27 |                     "Model output directory.")
 28 | 
 29 | flags.DEFINE_string(
 30 |     "model", "small",
 31 |     "A type of model. Possible options are: small, medium, best.")
 32 | # flags.DEFINE_string("data_path", '../data/dataJS',
 33 | #                     "Where the training/test data is stored.")
 34 | flags.DEFINE_bool("use_fp16", False,
 35 |                   "Train using 16-bit floats instead of 32bit floats")
 36 | 
 37 | FLAGS = flags.FLAGS
 38 | logging = tf.logging
 39 | 
 40 | if FLAGS.model == "test":
 41 |   outfile = 'TESToutput.txt'
 42 | def data_type():
 43 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 44 | 
 45 | class SmallConfig(object):
 46 |   """Small config.  get best result as 0.733 """
 47 |   init_scale = 0.05
 48 |   learning_rate = 0.001
 49 |   max_grad_norm = 5
 50 |   num_layers = 1#1
 51 |   num_steps = 50
 52 |   attn_size = 50
 53 |   hidden_sizeN = 50
 54 |   hidden_sizeT = 50
 55 |   sizeH = 100
 56 |   max_epoch = 1#8
 57 |   max_max_epoch = 8#79
 58 |   keep_prob = 1.0#1.0
 59 |   lr_decay = 0.6#0.95
 60 |   batch_size = 80#80
 61 | 
 62 | class TestConfig(object):
 63 |   """Tiny config, for testing."""
 64 |   init_scale = 0.05
 65 |   learning_rate = 0.001
 66 |   max_grad_norm = 5
 67 |   num_layers = 1
 68 |   num_steps = 50
 69 |   attn_size = 50
 70 |   hidden_sizeN = 50
 71 |   hidden_sizeT = 50
 72 |   sizeH = 100
 73 |   max_epoch = 1
 74 |   max_max_epoch = 1
 75 |   keep_prob = 1.0
 76 |   lr_decay = 0.6
 77 |   batch_size = 80
 78 | 
 79 | 
 80 | def get_config():
 81 |   if FLAGS.model == "small":
 82 |     return SmallConfig()
 83 |   elif FLAGS.model == "medium":
 84 |     return MediumConfig()
 85 |   elif FLAGS.model == "best":
 86 |     return BestConfig()
 87 |   elif FLAGS.model == "test":
 88 |     return TestConfig()
 89 |   else:
 90 |     raise ValueError("Invalid model: %s", FLAGS.model)
 91 | 
 92 | 
 93 | class PTBInput(object):
 94 |   """The input data."""
 95 | 
 96 |   def __init__(self, config, data, name=None):
 97 |     self.batch_size = batch_size = config.batch_size
 98 |     self.attn_size = attn_size = config.attn_size
 99 |     self.num_steps = num_steps = config.num_steps
100 |     self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator, self.input_dataP = \
101 |             reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name)
102 |     if FLAGS.model == "test":
103 |       self.epoch_size = 16   #small epoch size for test
104 | 
105 | 
106 | class PTBModel(object):
107 | 
108 |   def __init__(self, is_training, config, input_):
109 |     self._input = input_
110 |     self.attn_size = attn_size = config.attn_size
111 |     batch_size = input_.batch_size
112 |     num_steps = input_.num_steps
113 |     self.sizeN = sizeN = config.hidden_sizeN
114 |     self.sizeT = sizeT = config.hidden_sizeT
115 |     self.size = size = config.sizeH
116 |     vocab_sizeN, vocab_sizeT = config.vocab_size
117 | 
118 |     # Slightly better results can be obtained with forget gate biases
119 |     # initialized to 1 but the hyperparameters of the model would need to be
120 |     # different than reported in the paper.
121 |     def lstm_cell():
122 |       if 'reuse' in inspect.getargspec(
123 |           tf.contrib.rnn.BasicLSTMCell.__init__).args:
124 |         return tf.contrib.rnn.BasicLSTMCell(
125 |             size, forget_bias=1.0, state_is_tuple=True,
126 |             reuse=tf.get_variable_scope().reuse)
127 |       else:
128 |         return tf.contrib.rnn.BasicLSTMCell(
129 |             size, forget_bias=1.0, state_is_tuple=True)
130 |     attn_cell = lstm_cell
131 |     if is_training and config.keep_prob < 1:
132 |       def attn_cell():
133 |         return tf.contrib.rnn.DropoutWrapper(
134 |             lstm_cell(), output_keep_prob=config.keep_prob)
135 |     cell = tf.contrib.rnn.MultiRNNCell(
136 |         [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
137 | 
138 |     state_variables = []
139 |     with tf.variable_scope("myCH0"):
140 |       for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())):
141 |         if i > 0: tf.get_variable_scope().reuse_variables()
142 |         myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer())
143 |         myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer())
144 |         myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)])
145 |         myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)])
146 |         state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor))
147 | 
148 |     self._initial_state = state_variables
149 | 
150 |     self.eof_indicator = input_.eof_indicator
151 | 
152 |     with tf.device("/cpu:0"):
153 |       embeddingN = tf.get_variable(
154 |           "embeddingN", [vocab_sizeN, sizeN], dtype=data_type())
155 |       inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN)
156 |       inputsP = tf.nn.embedding_lookup(embeddingN, input_.input_dataP)
157 | 
158 |     with tf.device("/cpu:0"):
159 |       embeddingT = tf.get_variable(
160 |           "embeddingT", [vocab_sizeT, sizeT], dtype=data_type())
161 |       inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT)
162 | 
163 |     inputs = tf.concat([inputsN, inputsT], 2)
164 |     #inputs = tf.one_hot(input_.input_data, vocab_size) 
165 |     if is_training and config.keep_prob < 1:
166 |       inputs = tf.nn.dropout(inputs, config.keep_prob)
167 | 
168 |     outputs = []
169 |     attentions = []
170 |     parents = []
171 |     state = self._initial_state
172 |     self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory")
173 |     valid_memory = self.memory[:,-attn_size:,:]
174 |     # print ("test test test,, state shape", np.array(state).shape)
175 |     with tf.variable_scope("RNN"):
176 |       for time_step in range(num_steps):
177 |         if time_step > 0: tf.get_variable_scope().reuse_variables()
178 |         (cell_output, state) = cell(inputs[:, time_step, :], state)
179 |         outputs.append(cell_output)
180 | 
181 |         # parent_index = input_.input_dataP[:, time_step]  # retrieval parent hidden state in batch
182 |         # cell_parent = tf.convert_to_tensor([valid_memory[i,-parent_index[i],:] for i in range(batch_size)])
183 |         cell_parent = inputsP[:, time_step, :]
184 |         parents.append(cell_parent)
185 | 
186 |         wm = tf.get_variable("wm", [size, size], dtype=data_type())
187 |         wh = tf.get_variable("wh", [size, size], dtype=data_type())
188 |         wt = tf.get_variable("wt", [size, 1], dtype=data_type())
189 |         gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size]))
190 |         alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size]))
191 |         ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1])))
192 |         attentions.append(ct)
193 |         valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1)
194 | 
195 |     output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
196 |     attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size])
197 |     parent = tf.reshape(tf.stack(axis=1, values=parents), [-1, sizeN])
198 |     zeros_parent = tf.zeros_like(parent, dtype=data_type())
199 | 
200 |     self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch
201 |     wa = tf.get_variable("wa", [size*2+sizeN, size], dtype=data_type())  # feed parent at the output
202 |     nt = tf.tanh(tf.matmul(tf.concat([output, attention, parent], axis=1), wa))
203 | 
204 |     softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeN], dtype=data_type())
205 |     softmax_b = tf.get_variable("softmax_b", [vocab_sizeN], dtype=data_type())
206 |     logits = tf.matmul(nt, softmax_w) + softmax_b
207 |     labels = tf.reshape(input_.targetsN, [-1])
208 |     weights = tf.ones([batch_size * num_steps], dtype=data_type())
209 |     
210 |     loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [weights])
211 |     probs = tf.nn.softmax(logits)
212 |     correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), labels)
213 |     self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
214 | 
215 |     self._cost = cost = tf.reduce_sum(loss) / batch_size
216 |     self._final_state = state
217 | 
218 |     if not is_training:
219 |       return
220 | 
221 |     self._lr = tf.Variable(0.0, trainable=False)
222 |     tvars = tf.trainable_variables()
223 |     print ('tvars', len(tvars))
224 |     grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
225 |                                       config.max_grad_norm)
226 |     print ('*******the length', len(grads))
227 |     optimizer = tf.train.AdamOptimizer(self._lr)
228 |     self._train_op = optimizer.apply_gradients(
229 |         zip(grads, tvars),
230 |         global_step=tf.contrib.framework.get_or_create_global_step())
231 | 
232 |     self._new_lr = tf.placeholder(
233 |         tf.float32, shape=[], name="new_learning_rate")
234 |     self._lr_update = tf.assign(self._lr, self._new_lr)
235 | 
236 |   def assign_lr(self, session, lr_value):
237 |     session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
238 | 
239 |   @property
240 |   def input(self):
241 |     return self._input
242 | 
243 |   @property
244 |   def initial_state(self):
245 |     return self._initial_state
246 | 
247 |   @property
248 |   def cost(self):
249 |     return self._cost
250 | 
251 |   @property
252 |   def final_state(self):
253 |     return self._final_state
254 | 
255 |   @property
256 |   def accuracy(self):
257 |     return self._accuracy
258 | 
259 |   @property
260 |   def lr(self):
261 |     return self._lr
262 | 
263 |   @property
264 |   def train_op(self):
265 |     return self._train_op
266 | 
267 | 
268 | def run_epoch(session, model, eval_op=None, verbose=False):
269 |   """Runs the model on the given data."""
270 |   start_time = time.time()
271 |   costs = 0.0
272 |   accuracy_list = []
273 |   iters = 0
274 |   state = session.run(model.initial_state)
275 |   # print ('at the very initial of the run_epoch\n', state[0].c)
276 |   eof_indicator = np.ones((model.input.batch_size), dtype=bool)
277 |   memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size])
278 |   # file_id = session.run(model.initial_file_id) #need to remove _
279 | 
280 |   fetches = {
281 |       "cost": model.cost,
282 |       "accuracy": model.accuracy,
283 |       "final_state": model.final_state,
284 |       "eof_indicator": model.eof_indicator,
285 |       "memory":model.output,
286 |   }
287 |   if eval_op is not None:
288 |     fetches["eval_op"] = eval_op
289 | 
290 |   for step in range(model.input.epoch_size):
291 |     feed_dict = {}
292 |     # current_file_id = file_id #session.run(model.file_id)
293 |     sub_cond = np.expand_dims(eof_indicator, axis = 1)
294 |     condition = np.repeat(sub_cond, model.size, axis = 1)
295 |     # zero_state = np.zeros_like(condition)
296 |     # zero_state = np.random.uniform(-0.05,0.05,condition.shape)
297 |     zero_state = session.run(model.initial_state)
298 | 
299 |     for i, (c, h) in enumerate(model.initial_state):
300 |       assert condition.shape == state[i].c.shape
301 |       feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c)
302 |       feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h)
303 |     
304 |     feed_dict[model.memory] = memory
305 |     vals = session.run(fetches, feed_dict)
306 |     
307 |     cost = vals["cost"]
308 |     accuracy = vals["accuracy"]
309 |     eof_indicator = vals["eof_indicator"]
310 |     state = vals["final_state"]  #use the final state as the initial state within a whole epoch
311 |     memory = vals["memory"]
312 | 
313 |     accuracy_list.append(accuracy)
314 |     costs += cost
315 |     iters += model.input.num_steps
316 | 
317 |     if verbose and step % (model.input.epoch_size // 10) == 10:
318 |       print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" %
319 |             (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list),
320 |              (time.time() - start_time)))
321 |   
322 |   print ('this run_epoch takes time %.2f' %(time.time() - start_time))
323 |   return np.exp(costs / iters), np.mean(accuracy_list)
324 | 
325 | 
326 | def main(_):
327 |   start_time = time.time()
328 |   fout = open(outfile, 'a')
329 |   print ('\n', time.asctime(time.localtime()), file=fout)
330 |   print ('start a new experiment %s'%outfile, file=fout)
331 |   print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout)
332 | 
333 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size, train_dataP, valid_dataP \
334 |      = reader.input_data(N_filename, T_filename)
335 | 
336 |   train_data = (train_dataN, train_dataT, train_dataP)
337 |   valid_data = (valid_dataN, valid_dataT, valid_dataP)
338 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof]
339 | 
340 |   config = get_config()
341 |   assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration
342 |   config.vocab_size = vocab_size  
343 |   eval_config = get_config()
344 |   eval_config.batch_size = config.batch_size * config.num_steps
345 |   eval_config.num_steps = 1
346 |   eval_config.vocab_size = vocab_size
347 | 
348 |   with tf.Graph().as_default():
349 |     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
350 | 
351 |     with tf.name_scope("Train"):
352 |       train_input = PTBInput(config=config, data=train_data, name="TrainInput")
353 |       with tf.variable_scope("Model", reuse=None, initializer=initializer):
354 |         m = PTBModel(is_training=True, config=config, input_=train_input)
355 | 
356 |     with tf.name_scope("Valid"):
357 |       valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
358 |       with tf.variable_scope("Model", reuse=True, initializer=initializer):
359 |         mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
360 | 
361 |     # with tf.name_scope("Test"):
362 |     #   test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput")
363 |     #   with tf.variable_scope("Model", reuse=True, initializer=initializer):
364 |     #     mtest = PTBModel(is_training=False, config=eval_config,
365 |     #                      input_=test_input)
366 | 
367 | 
368 |     print ('total trainable variables', len(tf.trainable_variables()), '\n\n')
369 |     max_valid = 0
370 |     max_step = 0
371 |     saver = tf.train.Saver()
372 | 
373 |     sv = tf.train.Supervisor(logdir=None, summary_op=None)
374 |     with sv.managed_session() as session:
375 | 
376 |       for i in range(config.max_max_epoch):
377 |         lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
378 |         m.assign_lr(session, config.learning_rate * lr_decay)
379 |         print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr)))
380 | 
381 |         train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True)
382 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy))
383 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout)
384 | 
385 |         if i > 5:
386 |           valid_perplexity, valid_accuracy = run_epoch(session, mvalid)
387 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy))
388 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout)
389 |           if valid_accuracy > max_valid:
390 |             max_valid = valid_accuracy
391 |             max_step = i + 1
392 | 
393 |       # test_perplexity, test_accuracy = run_epoch(session, mtest)
394 |       # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy))
395 | 
396 |       print ('max step %d, max valid %.3f' %(max_step, max_valid))
397 |       # print ('data path is', FLAGS.data_path)
398 |       print ('total time takes', time.time()-start_time)
399 |       print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout)
400 |       print ('total time takes', time.time()-start_time, file=fout)
401 |       fout.close()
402 | 
403 |       # if FLAGS.save_path:
404 |       #   print("Saving model to %s." % FLAGS.save_path)
405 |       #   save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False)
406 | 
407 | 
408 | if __name__ == "__main__":
409 |   tf.app.run()
410 | 


--------------------------------------------------------------------------------
/code/attention_parent.py:
--------------------------------------------------------------------------------
  1 | # attentional LSTM, count all unk as wrong, default predict termianl
  2 | # what we exactly use
  3 | # revise 01/09, add parent at output
  4 | 
  5 | from __future__ import absolute_import
  6 | from __future__ import division
  7 | from __future__ import print_function
  8 | 
  9 | import inspect
 10 | import time
 11 | 
 12 | import numpy as np
 13 | import tensorflow as tf
 14 | 
 15 | import reader_pointer as reader
 16 | import os
 17 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 18 | 
 19 | os.environ['CUDA_VISIBLE_DEVICES']='0'
 20 | outfile = 'output_attention_parent.txt'
 21 | 
 22 | N_filename = '../pickle_data/PY_non_terminal.pickle'
 23 | T_filename = '../pickle_data/PY_terminal_50k_whole.pickle'
 24 | 
 25 | flags = tf.flags
 26 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A'
 27 |                     "Model output directory.")
 28 | 
 29 | flags.DEFINE_string(
 30 |     "model", "small",
 31 |     "A type of model. Possible options are: small, medium, best.")
 32 | # flags.DEFINE_string("data_path", '../data/dataJS',
 33 | #                     "Where the training/test data is stored.")
 34 | flags.DEFINE_bool("use_fp16", False,
 35 |                   "Train using 16-bit floats instead of 32bit floats")
 36 | 
 37 | FLAGS = flags.FLAGS
 38 | logging = tf.logging
 39 | 
 40 | if FLAGS.model == "test":
 41 |   outfile = 'TESToutput.txt'
 42 | def data_type():
 43 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 44 | 
 45 | class SmallConfig(object):
 46 |   """Small config.  get best result as 0.733 """
 47 |   init_scale = 0.05
 48 |   learning_rate = 0.001
 49 |   max_grad_norm = 5
 50 |   num_layers = 1#1
 51 |   num_steps = 50
 52 |   attn_size = 50
 53 |   hidden_sizeN = 300
 54 |   hidden_sizeT = 500
 55 |   sizeH = 800
 56 |   max_epoch = 1#8
 57 |   max_max_epoch = 8#79
 58 |   keep_prob = 1.0#1.0
 59 |   lr_decay = 0.6#0.95
 60 |   batch_size = 128#80
 61 | 
 62 | class TestConfig(object):
 63 |   """Tiny config, for testing."""
 64 |   init_scale = 0.05
 65 |   learning_rate = 0.001
 66 |   max_grad_norm = 5
 67 |   num_layers = 1
 68 |   num_steps = 50
 69 |   attn_size = 50
 70 |   hidden_sizeN = 50
 71 |   hidden_sizeT = 50
 72 |   sizeH = 100
 73 |   max_epoch = 1
 74 |   max_max_epoch = 1
 75 |   keep_prob = 1.0
 76 |   lr_decay = 0.6
 77 |   batch_size = 128
 78 | 
 79 | 
 80 | def get_config():
 81 |   if FLAGS.model == "small":
 82 |     return SmallConfig()
 83 |   elif FLAGS.model == "medium":
 84 |     return MediumConfig()
 85 |   elif FLAGS.model == "best":
 86 |     return BestConfig()
 87 |   elif FLAGS.model == "test":
 88 |     return TestConfig()
 89 |   else:
 90 |     raise ValueError("Invalid model: %s", FLAGS.model)
 91 | 
 92 | 
 93 | class PTBInput(object):
 94 |   """The input data."""
 95 | 
 96 |   def __init__(self, config, data, name=None):
 97 |     self.batch_size = batch_size = config.batch_size
 98 |     self.attn_size = attn_size = config.attn_size
 99 |     self.num_steps = num_steps = config.num_steps
100 |     self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator, self.input_dataP = \
101 |             reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name)
102 |     if FLAGS.model == "test":
103 |       self.epoch_size = 16   #small epoch size for test
104 | 
105 | 
106 | class PTBModel(object):
107 | 
108 |   def __init__(self, is_training, config, input_):
109 |     self._input = input_
110 |     self.attn_size = attn_size = config.attn_size
111 |     batch_size = input_.batch_size
112 |     num_steps = input_.num_steps
113 |     self.sizeN = sizeN = config.hidden_sizeN
114 |     self.sizeT = sizeT = config.hidden_sizeT
115 |     self.size = size = config.sizeH
116 |     vocab_sizeN, vocab_sizeT = config.vocab_size
117 | 
118 |     # Slightly better results can be obtained with forget gate biases
119 |     # initialized to 1 but the hyperparameters of the model would need to be
120 |     # different than reported in the paper.
121 |     def lstm_cell():
122 |       if 'reuse' in inspect.getargspec(
123 |           tf.contrib.rnn.BasicLSTMCell.__init__).args:
124 |         return tf.contrib.rnn.BasicLSTMCell(
125 |             size, forget_bias=1.0, state_is_tuple=True,
126 |             reuse=tf.get_variable_scope().reuse)
127 |       else:
128 |         return tf.contrib.rnn.BasicLSTMCell(
129 |             size, forget_bias=1.0, state_is_tuple=True)
130 |     attn_cell = lstm_cell
131 |     if is_training and config.keep_prob < 1:
132 |       def attn_cell():
133 |         return tf.contrib.rnn.DropoutWrapper(
134 |             lstm_cell(), output_keep_prob=config.keep_prob)
135 |     cell = tf.contrib.rnn.MultiRNNCell(
136 |         [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
137 | 
138 |     state_variables = []
139 |     with tf.variable_scope("myCH0"):
140 |       for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())):
141 |         if i > 0: tf.get_variable_scope().reuse_variables()
142 |         myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer())
143 |         myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer())
144 |         myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)])
145 |         myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)])
146 |         state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor))
147 | 
148 |     self._initial_state = state_variables
149 | 
150 |     self.eof_indicator = input_.eof_indicator
151 | 
152 |     with tf.device("/cpu:0"):
153 |       embeddingN = tf.get_variable(
154 |           "embeddingN", [vocab_sizeN, sizeN], dtype=data_type())
155 |       inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN)
156 |       inputsP = tf.nn.embedding_lookup(embeddingN, input_.input_dataP)
157 | 
158 |     with tf.device("/cpu:0"):
159 |       embeddingT = tf.get_variable(
160 |           "embeddingT", [vocab_sizeT, sizeT], dtype=data_type())
161 |       inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT)
162 | 
163 |     inputs = tf.concat([inputsN, inputsT], 2)  ## feed parent at the input
164 |     #inputs = tf.one_hot(input_.input_data, vocab_size) 
165 |     if is_training and config.keep_prob < 1:
166 |       inputs = tf.nn.dropout(inputs, config.keep_prob)
167 | 
168 |     outputs = []
169 |     attentions = []
170 |     parents = []
171 |     state = self._initial_state
172 |     self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory")
173 |     valid_memory = self.memory[:,-attn_size:,:]
174 |     # print ("test test test,, state shape", np.array(state).shape)
175 |     with tf.variable_scope("RNN"):
176 |       for time_step in range(num_steps):
177 |         if time_step > 0: tf.get_variable_scope().reuse_variables()
178 |         (cell_output, state) = cell(inputs[:, time_step, :], state)
179 |         outputs.append(cell_output)
180 | 
181 |         # parent_index = input_.input_dataP[:, time_step]  # retrieval parent hidden state in batch
182 |         # cell_parent = tf.convert_to_tensor([valid_memory[i,-parent_index[i],:] for i in range(batch_size)])
183 |         cell_parent = inputsP[:, time_step, :]
184 |         parents.append(cell_parent)
185 | 
186 |         wm = tf.get_variable("wm", [size, size], dtype=data_type())
187 |         wh = tf.get_variable("wh", [size, size], dtype=data_type())
188 |         wt = tf.get_variable("wt", [size, 1], dtype=data_type())
189 |         gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size]))
190 |         alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size]))
191 |         ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1])))
192 |         attentions.append(ct)
193 |         valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1)
194 | 
195 |     output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
196 |     attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size])
197 |     parent = tf.reshape(tf.stack(axis=1, values=parents), [-1, sizeN])
198 | 
199 |     self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch
200 |     wa = tf.get_variable("wa", [size*2+sizeN, size], dtype=data_type())
201 |     nt = tf.tanh(tf.matmul(tf.concat([output, attention, parent], axis=1), wa))
202 | 
203 |     softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type())
204 |     softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type())
205 |     logits = tf.matmul(nt, softmax_w) + softmax_b
206 |     labels = tf.reshape(input_.targetsT, [-1])
207 |     weights = tf.ones([batch_size * num_steps], dtype=data_type())
208 |     
209 |     #counting unk as wrong
210 |     unk_id = vocab_sizeT - 2
211 |     unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape)
212 |     zero_weights = tf.zeros_like(labels, dtype=data_type())
213 |     wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape)
214 |     condition_tf = tf.equal(labels, unk_tf)
215 |     new_weights = tf.where(condition_tf, zero_weights, weights)
216 |     new_labels = tf.where(condition_tf, wrong_label, labels)
217 | 
218 |     
219 |     loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [new_weights])
220 |     probs = tf.nn.softmax(logits)
221 |     correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels)
222 |     self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
223 | 
224 |     self._cost = cost = tf.reduce_sum(loss) / batch_size
225 |     self._final_state = state
226 | 
227 |     if not is_training:
228 |       return
229 | 
230 |     self._lr = tf.Variable(0.0, trainable=False)
231 |     tvars = tf.trainable_variables()
232 |     print ('tvars', len(tvars))
233 |     grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
234 |                                       config.max_grad_norm)
235 |     print ('*******the length', len(grads), '\n')
236 |     optimizer = tf.train.AdamOptimizer(self._lr)
237 |     self._train_op = optimizer.apply_gradients(
238 |         zip(grads, tvars),
239 |         global_step=tf.contrib.framework.get_or_create_global_step())
240 | 
241 |     self._new_lr = tf.placeholder(
242 |         tf.float32, shape=[], name="new_learning_rate")
243 |     self._lr_update = tf.assign(self._lr, self._new_lr)
244 | 
245 |   def assign_lr(self, session, lr_value):
246 |     session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
247 | 
248 |   @property
249 |   def input(self):
250 |     return self._input
251 | 
252 |   @property
253 |   def initial_state(self):
254 |     return self._initial_state
255 | 
256 |   @property
257 |   def cost(self):
258 |     return self._cost
259 | 
260 |   @property
261 |   def final_state(self):
262 |     return self._final_state
263 | 
264 |   @property
265 |   def accuracy(self):
266 |     return self._accuracy
267 | 
268 |   @property
269 |   def lr(self):
270 |     return self._lr
271 | 
272 |   @property
273 |   def train_op(self):
274 |     return self._train_op
275 | 
276 | 
277 | def run_epoch(session, model, eval_op=None, verbose=False):
278 |   start_time = time.time()
279 |   costs = 0.0
280 |   accuracy_list = []
281 |   iters = 0
282 |   state = session.run(model.initial_state)
283 |   # print ('at the very initial of the run_epoch\n', state[0].c)
284 |   eof_indicator = np.ones((model.input.batch_size), dtype=bool)
285 |   memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size])
286 |   # file_id = session.run(model.initial_file_id) #need to remove _
287 | 
288 |   fetches = {
289 |       "cost": model.cost,
290 |       "accuracy": model.accuracy,
291 |       "final_state": model.final_state,
292 |       "eof_indicator": model.eof_indicator,
293 |       "memory":model.output,
294 |   }
295 |   if eval_op is not None:
296 |     fetches["eval_op"] = eval_op
297 | 
298 |   for step in range(model.input.epoch_size):
299 |     feed_dict = {}
300 |     # current_file_id = file_id #session.run(model.file_id)
301 |     sub_cond = np.expand_dims(eof_indicator, axis = 1)
302 |     condition = np.repeat(sub_cond, model.size, axis = 1)
303 |     # zero_state = np.zeros_like(condition)
304 |     # zero_state = np.random.uniform(-0.05,0.05,condition.shape)
305 |     zero_state = session.run(model.initial_state)
306 | 
307 |     for i, (c, h) in enumerate(model.initial_state):
308 |       assert condition.shape == state[i].c.shape
309 |       feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c)
310 |       feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h)
311 |     
312 |     feed_dict[model.memory] = memory
313 |     vals = session.run(fetches, feed_dict)
314 |     
315 |     cost = vals["cost"]
316 |     accuracy = vals["accuracy"]
317 |     eof_indicator = vals["eof_indicator"]
318 |     state = vals["final_state"]  #use the final state as the initial state within a whole epoch
319 |     memory = vals["memory"]
320 | 
321 |     accuracy_list.append(accuracy)
322 |     costs += cost
323 |     iters += model.input.num_steps
324 | 
325 |     if verbose and step % (model.input.epoch_size // 10) == 10:
326 |       print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" %
327 |             (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list),
328 |              (time.time() - start_time)))
329 |   
330 |   print ('this run_epoch takes time %.2f' %(time.time() - start_time))
331 |   return np.exp(costs / iters), np.mean(accuracy_list)
332 | 
333 | 
334 | def main(_):
335 |   start_time = time.time()
336 |   fout = open(outfile, 'a')
337 |   print ('\n', time.asctime(time.localtime()), file=fout)
338 |   print ('start a new experiment %s'%outfile, file=fout)
339 |   print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout)
340 | 
341 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size, train_dataP, valid_dataP \
342 |      = reader.input_data(N_filename, T_filename)
343 | 
344 |   train_data = (train_dataN, train_dataT, train_dataP)
345 |   valid_data = (valid_dataN, valid_dataT, valid_dataP)
346 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof]
347 | 
348 |   config = get_config()
349 |   assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration
350 |   config.vocab_size = vocab_size  
351 |   eval_config = get_config()
352 |   eval_config.batch_size = config.batch_size * config.num_steps
353 |   eval_config.num_steps = 1
354 |   eval_config.vocab_size = vocab_size
355 | 
356 |   with tf.Graph().as_default():
357 |     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
358 | 
359 |     with tf.name_scope("Train"):
360 |       train_input = PTBInput(config=config, data=train_data, name="TrainInput")
361 |       with tf.variable_scope("Model", reuse=None, initializer=initializer):
362 |         m = PTBModel(is_training=True, config=config, input_=train_input)
363 | 
364 |     with tf.name_scope("Valid"):
365 |       valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
366 |       with tf.variable_scope("Model", reuse=True, initializer=initializer):
367 |         mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
368 | 
369 |     # with tf.name_scope("Test"):
370 |     #   test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput")
371 |     #   with tf.variable_scope("Model", reuse=True, initializer=initializer):
372 |     #     mtest = PTBModel(is_training=False, config=eval_config,
373 |     #                      input_=test_input)
374 | 
375 | 
376 |     print ('total trainable variables', len(tf.trainable_variables()), '\n\n')
377 |     max_valid = 0
378 |     max_step = 0
379 |     saver = tf.train.Saver()
380 | 
381 |     sv = tf.train.Supervisor(logdir=None, summary_op=None)
382 |     with sv.managed_session() as session:
383 | 
384 |       for i in range(config.max_max_epoch):
385 |         lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
386 |         m.assign_lr(session, config.learning_rate * lr_decay)
387 |         print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr)))
388 | 
389 |         train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True)
390 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy))
391 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout)
392 | 
393 |         if i > 5:
394 |           valid_perplexity, valid_accuracy = run_epoch(session, mvalid)
395 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy))
396 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout)
397 |           if valid_accuracy > max_valid:
398 |             max_valid = valid_accuracy
399 |             max_step = i + 1
400 | 
401 |       # test_perplexity, test_accuracy = run_epoch(session, mtest)
402 |       # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy))
403 | 
404 |       print ('max step %d, max valid %.3f' %(max_step, max_valid))
405 |       # print ('data path is', FLAGS.data_path)
406 |       print ('total time takes', time.time()-start_time)
407 |       print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout)
408 |       print ('total time takes', time.time()-start_time, file=fout)
409 |       fout.close()
410 | 
411 |       # if FLAGS.save_path:
412 |       #   print("Saving model to %s." % FLAGS.save_path)
413 |       #   save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False)
414 | 
415 | 
416 | if __name__ == "__main__":
417 |   tf.app.run()
418 | 


--------------------------------------------------------------------------------
/code/myModel_commented.py:
--------------------------------------------------------------------------------
  1 | # a word w is composed of two kinds of information: type(N) and value(T), i.e., w_i = (N_i, T_i)
  2 | # task: given a sequence of words w_1 to w_(t-1), predict the next word value T_t
  3 | 
  4 | class my_Model(object):
  5 |   """This class is to build my lstm model, which mainly refers to The PTB model from official tensorflow example."""
  6 | 
  7 |   def __init__(self, is_training, config, input_):
  8 |     self._input = input_
  9 |     self.attn_size = attn_size = config.attn_size  # attention size
 10 |     batch_size = input_.batch_size
 11 |     num_steps = input_.num_steps  # the lstm unrolling length
 12 |     self.sizeN = sizeN = config.hidden_sizeN  # embedding size of type(N)
 13 |     self.sizeT = sizeT = config.hidden_sizeT  # embedding size of value(T)
 14 |     self.size = size = config.sizeH  # hidden size of the lstm cell
 15 |     (vocab_sizeN, vocab_sizeT) = config.vocab_size  # vocabulary size of type and value
 16 | 
 17 |     # from line 17 to line 33: copy from official PTB model which defines an lstm cell with drop-out and multi-layers
 18 |     def lstm_cell():
 19 |       if 'reuse' in inspect.getargspec(
 20 |           tf.contrib.rnn.BasicLSTMCell.__init__).args:
 21 |         return tf.contrib.rnn.BasicLSTMCell(
 22 |             size, forget_bias=1.0, state_is_tuple=True,
 23 |             reuse=tf.get_variable_scope().reuse)
 24 |       else:
 25 |         return tf.contrib.rnn.BasicLSTMCell(
 26 |             size, forget_bias=1.0, state_is_tuple=True)
 27 |     attn_cell = lstm_cell
 28 |     if is_training and config.keep_prob < 1:  # drop-out when training
 29 |       def attn_cell():
 30 |         return tf.contrib.rnn.DropoutWrapper(
 31 |             lstm_cell(), output_keep_prob=config.keep_prob)
 32 |     cell = tf.contrib.rnn.MultiRNNCell(
 33 |         [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)  #multi-layers
 34 | 
 35 |     # from line 35 to line 44: set the initial hidden states, which are two trainable vectors. Processing a new sentence starts from here.
 36 |     state_variables = []
 37 |     with tf.variable_scope("myCH0"):
 38 |       for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())):
 39 |         if i > 0: tf.get_variable_scope().reuse_variables()
 40 |         myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer())
 41 |         myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer())
 42 |         myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)])
 43 |         myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)])
 44 |         state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor))
 45 | 
 46 |     self._initial_state = state_variables
 47 | 
 48 |     self.eof_indicator = input_.eof_indicator  # indicate whether this is the end of a sentence
 49 | 
 50 |     with tf.device("/cpu:0"):
 51 |       embeddingN = tf.get_variable(
 52 |           "embeddingN", [vocab_sizeN, sizeN], dtype=data_type())
 53 |       inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN)  # input type embedding
 54 | 
 55 |     with tf.device("/cpu:0"):
 56 |       embeddingT = tf.get_variable(
 57 |           "embeddingT", [vocab_sizeT, sizeT], dtype=data_type())
 58 |       inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT)  # input value embedding
 59 | 
 60 |     inputs = tf.concat([inputsN, inputsT], 2) # concatenate the type and value embedding
 61 |     if is_training and config.keep_prob < 1:
 62 |       inputs = tf.nn.dropout(inputs, config.keep_prob)
 63 | 
 64 |     outputs = []  # store hidden state at each time_step
 65 |     attentions = []  # store context attention vector at each time_step
 66 |     alphas = []  # store attention scores at each time_step
 67 |     state = self._initial_state
 68 |     self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory")
 69 |     valid_memory = self.memory[:,-attn_size:,:]  # previous hidden states within the attention window
 70 | 
 71 |     # from line 72 to line 87: build the RNN model, and calculate attention
 72 |     with tf.variable_scope("RNN"):
 73 |       for time_step in range(num_steps):
 74 |         if time_step > 0: tf.get_variable_scope().reuse_variables()
 75 |         (cell_output, state) = cell(inputs[:, time_step, :], state)  # lstm_cell update function
 76 |         outputs.append(cell_output) # store hidden state
 77 | 
 78 |         # calculate attention scores alpha and context vector ct
 79 |         wm = tf.get_variable("wm", [size, size], dtype=data_type())
 80 |         wh = tf.get_variable("wh", [size, size], dtype=data_type())
 81 |         wt = tf.get_variable("wt", [size, 1], dtype=data_type())
 82 |         gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size]))
 83 |         alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) #the size of alpha: batch_size by attn_size
 84 |         alphas.append(alpha)
 85 |         ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1])))
 86 |         attentions.append(ct)
 87 |         valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) #move forward attention window
 88 | 
 89 |     output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])  # hidden states for all time_steps
 90 |     attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size])  # context vectors for all time_steps
 91 | 
 92 |     self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch
 93 |     wa = tf.get_variable("wa", [size*2, size], dtype=data_type())
 94 |     nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa))
 95 | 
 96 |     #compute w: the word distribution within the global vocabulary
 97 |     softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type())
 98 |     softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type())
 99 |     w_logits = tf.matmul(nt, softmax_w) + softmax_b
100 |     w_probs = tf.nn.softmax(w_logits)  # baseline model uses this
101 | 
102 |     #compute l: reuse attention scores as the location distribution for pointer network
103 |     l_logits_pre = tf.reshape(tf.stack(axis=1, values=alphas), [-1, attn_size]) #the size is batch_size*num_steps by attn_size
104 |     l_logits = tf.reverse(l_logits_pre, axis=[1])
105 | 
106 |     #compute d: a switching network to balance the above two distributions, based on hidden states and context
107 |     d_conditioned = tf.concat([output, attention], axis=1)
108 |     d_w = tf.get_variable("d_w1", [2*size, 1], dtype=data_type())
109 |     d_b = tf.get_variable("d_b1", [1], dtype=data_type())
110 |     d = tf.nn.sigmoid(tf.matmul(d_conditioned, d_w) + d_b)
111 | 
112 |     #concat w and l to construct f
113 |     f_logits = tf.concat([w_logits*d, l_logits*(1-d)], axis=1)
114 | 
115 |     labels = tf.reshape(input_.targetsT, [-1])
116 |     weights = tf.ones([batch_size * num_steps], dtype=data_type())
117 | 
118 |     # set mask for counting unk as wrong
119 |     unk_id = vocab_sizeT - 2
120 |     unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape)
121 |     zero_weights = tf.zeros_like(labels, dtype=data_type())
122 |     wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape)
123 |     condition_tf = tf.equal(labels, unk_tf)
124 |     new_weights = tf.where(condition_tf, zero_weights, weights)
125 |     new_labels = tf.where(condition_tf, wrong_label, labels)
126 | 
127 |     loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([f_logits], [labels], [new_weights])
128 |     probs = tf.nn.softmax(f_logits)
129 | 
130 |     correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels)
131 |     self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
132 | 
133 |     self._cost = cost = tf.reduce_sum(loss) / batch_size
134 |     self._final_state = state


--------------------------------------------------------------------------------
/code/pointer.py:
--------------------------------------------------------------------------------
  1 | # use word distribution and location information(pointer)
  2 | # 1-28, reverse l_logits
  3 | 
  4 | from __future__ import absolute_import
  5 | from __future__ import division
  6 | from __future__ import print_function
  7 | 
  8 | import inspect
  9 | import time
 10 | 
 11 | import numpy as np
 12 | import tensorflow as tf
 13 | 
 14 | import reader_pointer_original as reader
 15 | import os
 16 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 17 | 
 18 | os.environ['CUDA_VISIBLE_DEVICES']='0'
 19 | outfile = 'output_pointer.txt'
 20 | 
 21 | N_filename = '../pickle_data/small_JS_non_terminal.pickle'
 22 | T_filename = '../pickle_data/small_JS_terminal_1k_whole.pickle'
 23 | 
 24 | flags = tf.flags
 25 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A'
 26 |                     "Model output directory.")
 27 | 
 28 | flags.DEFINE_string(
 29 |     "model", "small",
 30 |     "A type of model. Possible options are: small, medium, best.")
 31 | # flags.DEFINE_string("data_path", '../data/dataJS',
 32 | #                     "Where the training/test data is stored.")
 33 | flags.DEFINE_bool("use_fp16", False,
 34 |                   "Train using 16-bit floats instead of 32bit floats")
 35 | 
 36 | FLAGS = flags.FLAGS
 37 | logging = tf.logging
 38 | 
 39 | if FLAGS.model == "test":
 40 |   outfile = 'TESToutput.txt'
 41 | def data_type():
 42 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 43 | 
 44 | class SmallConfig(object):
 45 |   """Small config.  get best result as 0.733 """
 46 |   init_scale = 0.05
 47 |   learning_rate = 0.001
 48 |   max_grad_norm = 5
 49 |   num_layers = 1#1
 50 |   num_steps = 50
 51 |   attn_size = 50
 52 |   hidden_sizeN = 300
 53 |   hidden_sizeT = 500
 54 |   sizeH = 800
 55 |   max_epoch = 1#8
 56 |   max_max_epoch = 8#79
 57 |   keep_prob = 1.0#1.0
 58 |   lr_decay = 0.6#0.95
 59 |   batch_size = 64#80
 60 | 
 61 | class TestConfig(object):
 62 |   """Tiny config, for testing."""
 63 |   init_scale = 0.05
 64 |   learning_rate = 0.001
 65 |   max_grad_norm = 5
 66 |   num_layers = 1
 67 |   num_steps = 50
 68 |   attn_size = 50
 69 |   hidden_sizeN = 300
 70 |   hidden_sizeT = 500
 71 |   sizeH = 800
 72 |   max_epoch = 1
 73 |   max_max_epoch = 1
 74 |   keep_prob = 1.0
 75 |   lr_decay = 0.6
 76 |   batch_size = 80
 77 | 
 78 | 
 79 | def get_config():
 80 |   if FLAGS.model == "small":
 81 |     return SmallConfig()
 82 |   elif FLAGS.model == "medium":
 83 |     return MediumConfig()
 84 |   elif FLAGS.model == "best":
 85 |     return BestConfig()
 86 |   elif FLAGS.model == "test":
 87 |     return TestConfig()
 88 |   else:
 89 |     raise ValueError("Invalid model: %s", FLAGS.model)
 90 | 
 91 | 
 92 | class PTBInput(object):
 93 |   """The input data."""
 94 | 
 95 |   def __init__(self, config, data, name=None):
 96 |     self.batch_size = batch_size = config.batch_size
 97 |     self.attn_size = attn_size = config.attn_size
 98 |     self.num_steps = num_steps = config.num_steps
 99 |     self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator = \
100 |             reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=False, name=name)
101 |     if FLAGS.model == "test":
102 |       self.epoch_size = 16   #small epoch size for test
103 | 
104 | 
105 | class PTBModel(object):
106 |   """The PTB model."""
107 | 
108 |   def __init__(self, is_training, config, input_):
109 |     self._input = input_
110 |     self.attn_size = attn_size = config.attn_size
111 |     batch_size = input_.batch_size
112 |     num_steps = input_.num_steps
113 |     self.sizeN = sizeN = config.hidden_sizeN
114 |     self.sizeT = sizeT = config.hidden_sizeT
115 |     self.size = size = config.sizeH
116 |     (vocab_sizeN, vocab_sizeT) = config.vocab_size
117 | 
118 |     # Slightly better results can be obtained with forget gate biases
119 |     # initialized to 1 but the hyperparameters of the model would need to be
120 |     # different than reported in the paper.
121 |     def lstm_cell():
122 |       if 'reuse' in inspect.getargspec(
123 |           tf.contrib.rnn.BasicLSTMCell.__init__).args:
124 |         return tf.contrib.rnn.BasicLSTMCell(
125 |             size, forget_bias=1.0, state_is_tuple=True,
126 |             reuse=tf.get_variable_scope().reuse)
127 |       else:
128 |         return tf.contrib.rnn.BasicLSTMCell(
129 |             size, forget_bias=1.0, state_is_tuple=True)
130 |     attn_cell = lstm_cell
131 |     if is_training and config.keep_prob < 1:
132 |       def attn_cell():
133 |         return tf.contrib.rnn.DropoutWrapper(
134 |             lstm_cell(), output_keep_prob=config.keep_prob)
135 |     cell = tf.contrib.rnn.MultiRNNCell(
136 |         [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
137 | 
138 |     state_variables = []
139 |     with tf.variable_scope("myCH0"):
140 |       for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())):
141 |         if i > 0: tf.get_variable_scope().reuse_variables()
142 |         myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer())
143 |         myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer())
144 |         myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)])
145 |         myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)])
146 |         state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor))
147 | 
148 |     self._initial_state = state_variables
149 | 
150 |     self.eof_indicator = input_.eof_indicator
151 | 
152 |     with tf.device("/cpu:0"):
153 |       embeddingN = tf.get_variable(
154 |           "embeddingN", [vocab_sizeN, sizeN], dtype=data_type())
155 |       inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN)
156 | 
157 |     with tf.device("/cpu:0"):
158 |       embeddingT = tf.get_variable(
159 |           "embeddingT", [vocab_sizeT, sizeT], dtype=data_type())
160 |       inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT)
161 | 
162 |     inputs = tf.concat([inputsN, inputsT], 2)
163 |     #inputs = tf.one_hot(input_.input_data, vocab_size) 
164 |     if is_training and config.keep_prob < 1:
165 |       inputs = tf.nn.dropout(inputs, config.keep_prob)
166 | 
167 |     outputs = []
168 |     attentions = []
169 |     alphas = []
170 |     state = self._initial_state
171 |     self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory")
172 |     valid_memory = self.memory[:,-attn_size:,:]
173 |     # print ("test test test,, state shape", np.array(state).shape)
174 |     with tf.variable_scope("RNN"):
175 |       for time_step in range(num_steps):
176 |         if time_step > 0: tf.get_variable_scope().reuse_variables()
177 |         (cell_output, state) = cell(inputs[:, time_step, :], state)
178 |         outputs.append(cell_output)
179 | 
180 |         wm = tf.get_variable("wm", [size, size], dtype=data_type())
181 |         wh = tf.get_variable("wh", [size, size], dtype=data_type())
182 |         wt = tf.get_variable("wt", [size, 1], dtype=data_type())
183 |         gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size]))
184 |         alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) #the size of alpha: batch_size by attn_size
185 |         alphas.append(alpha)
186 |         ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1])))
187 |         attentions.append(ct)
188 |         valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1)
189 | 
190 |     output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
191 |     attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size])
192 | 
193 |     self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch
194 |     wa = tf.get_variable("wa", [size*2, size], dtype=data_type())
195 |     nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa))
196 | 
197 |     #compute w
198 |     softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type())
199 |     softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type())
200 |     w_logits = tf.matmul(nt, softmax_w) + softmax_b
201 |     w_probs = tf.nn.softmax(w_logits)
202 | 
203 |     #compute l
204 |     l_logits_pre = tf.reshape(tf.stack(axis=1, values=alphas), [-1, attn_size]) #the size of alpha_reshaped: batch_size*num_steps by attn_size
205 |     l_logits = tf.reverse(l_logits_pre, axis=[1])
206 |     # l_probs = tf.nn.softmax(l_logits)
207 | 
208 |     #compute d
209 |     # input_reshaped = tf.reshape(inputs, [-1, size])
210 |     d_conditioned = tf.concat([output, attention], axis=1)
211 |     d_w = tf.get_variable("d_w1", [2*size, 1], dtype=data_type())
212 |     d_b = tf.get_variable("d_b1", [1], dtype=data_type())
213 |     d = tf.nn.sigmoid(tf.matmul(d_conditioned, d_w) + d_b)
214 | 
215 |     # d_conditioned = tf.concat([output, attention], axis=1)
216 |     # d_w1 = tf.get_variable("d_w1", [2*size, size], dtype=data_type())
217 |     # d_b1 = tf.get_variable("d_b1", [size], dtype=data_type())
218 |     # fc1 = tf.nn.relu(tf.matmul(d_conditioned, d_w1) + d_b1)
219 |     # d_w2 = tf.get_variable("d_w2", [size, 1], dtype=data_type())
220 |     # d_b2 = tf.get_variable("d_b2", [1], dtype=data_type())
221 |     # d = tf.nn.sigmoid(tf.matmul(fc1, d_w2) + d_b2)
222 | 
223 |     #concat w and l to construct f
224 |     f_logits = tf.concat([w_logits*d, l_logits*(1-d)], axis=1)
225 | 
226 |     labels = tf.reshape(input_.targetsT, [-1])
227 |     weights = tf.ones([batch_size * num_steps], dtype=data_type())
228 | 
229 |     #counting unk as wrong
230 |     unk_id = vocab_sizeT - 2
231 |     unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape)
232 |     zero_weights = tf.zeros_like(labels, dtype=data_type())
233 |     wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape)
234 |     condition_tf = tf.equal(labels, unk_tf)
235 |     new_weights = tf.where(condition_tf, zero_weights, weights)
236 |     new_labels = tf.where(condition_tf, wrong_label, labels) # only for computing the accuracy, can not be used to compute the loss(cause nan error)
237 | 
238 | 
239 |     loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([f_logits], [labels], [new_weights])
240 |     probs = tf.nn.softmax(f_logits)
241 | 
242 |     # condition = tf.not_equal(labels, 182)
243 |     # non_pad_len = tf.reduce_sum(tf.cast(condition, tf.float32))
244 |     # mask_labels = tf.where(condition, labels, tf.constant(250, shape = labels.get_shape())) #250 just do not belong to the vocab    
245 |     # correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), mask_labels)
246 |     # self._accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) /  non_pad_len # do not count predict <pad>(182)
247 | 
248 |     correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels)
249 |     self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
250 | 
251 |     self._cost = cost = tf.reduce_sum(loss) / batch_size
252 |     self._final_state = state
253 | 
254 |     if not is_training:
255 |       return
256 | 
257 |     self._lr = tf.Variable(0.0, trainable=False)
258 |     tvars = tf.trainable_variables()
259 |     print ('tvars', len(tvars))
260 |     grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
261 |                                       config.max_grad_norm)
262 |     print ('*******the length', len(grads))
263 |     optimizer = tf.train.AdamOptimizer(self._lr)
264 |     self._train_op = optimizer.apply_gradients(
265 |         zip(grads, tvars),
266 |         global_step=tf.contrib.framework.get_or_create_global_step())
267 | 
268 |     self._new_lr = tf.placeholder(
269 |         tf.float32, shape=[], name="new_learning_rate")
270 |     self._lr_update = tf.assign(self._lr, self._new_lr)
271 | 
272 |   def assign_lr(self, session, lr_value):
273 |     session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
274 | 
275 |   @property
276 |   def input(self):
277 |     return self._input
278 | 
279 |   @property
280 |   def initial_state(self):
281 |     return self._initial_state
282 | 
283 |   @property
284 |   def cost(self):
285 |     return self._cost
286 | 
287 |   @property
288 |   def final_state(self):
289 |     return self._final_state
290 | 
291 |   @property
292 |   def accuracy(self):
293 |     return self._accuracy
294 | 
295 |   @property
296 |   def lr(self):
297 |     return self._lr
298 | 
299 |   @property
300 |   def train_op(self):
301 |     return self._train_op
302 | 
303 | 
304 | def run_epoch(session, model, eval_op=None, verbose=False):
305 |   """Runs the model on the given data."""
306 |   start_time = time.time()
307 |   costs = 0.0
308 |   accuracy_list = []
309 |   iters = 0
310 |   state = session.run(model.initial_state)
311 |   # print ('at the very initial of the run_epoch\n', state[0].c)
312 |   eof_indicator = np.ones((model.input.batch_size), dtype=bool)
313 |   memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size])
314 |   # file_id = session.run(model.initial_file_id) #need to remove _
315 | 
316 |   fetches = {
317 |       "cost": model.cost,
318 |       "accuracy": model.accuracy,
319 |       "final_state": model.final_state,
320 |       "eof_indicator": model.eof_indicator,
321 |       "memory":model.output,
322 |   }
323 |   if eval_op is not None:
324 |     fetches["eval_op"] = eval_op
325 | 
326 |   for step in range(model.input.epoch_size):
327 |     feed_dict = {}
328 |     # current_file_id = file_id #session.run(model.file_id)
329 |     sub_cond = np.expand_dims(eof_indicator, axis = 1)
330 |     condition = np.repeat(sub_cond, model.size, axis = 1)
331 |     # zero_state = np.zeros_like(condition)
332 |     # zero_state = np.random.uniform(-0.05,0.05,condition.shape)
333 |     zero_state = session.run(model.initial_state)
334 | 
335 |     for i, (c, h) in enumerate(model.initial_state):
336 |       assert condition.shape == state[i].c.shape
337 |       feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c)
338 |       feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h)
339 |     
340 |     feed_dict[model.memory] = memory
341 |     vals = session.run(fetches, feed_dict)
342 |     
343 |     cost = vals["cost"]
344 |     accuracy = vals["accuracy"]
345 |     eof_indicator = vals["eof_indicator"]
346 |     state = vals["final_state"]  #use the final state as the initial state within a whole epoch
347 |     memory = vals["memory"]
348 | 
349 |     accuracy_list.append(accuracy)
350 |     costs += cost
351 |     iters += model.input.num_steps
352 | 
353 |     if verbose and step % (model.input.epoch_size // 10) == 10:
354 |       print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" %
355 |             (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list),
356 |              (time.time() - start_time)))
357 |       # print ('zero_state value', zero_state[0][0])
358 |       # print ('gradients value', session.run(model.grads))
359 |   
360 |   print ('this run_epoch takes time %.2f' %(time.time() - start_time))
361 |   return np.exp(costs / iters), np.mean(accuracy_list)
362 | 
363 | 
364 | 
365 | 
366 | def main(_):
367 |   start_time = time.time()
368 |   fout = open(outfile, 'a')
369 |   print ('\n', time.asctime(time.localtime()), file=fout)
370 |   print ('start a new experiment %s'%outfile, file=fout)
371 |   print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout)
372 |   print ('condition on two, two layers', file=fout)
373 | 
374 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size = reader.input_data(N_filename, T_filename)
375 | 
376 |   train_data = (train_dataN, train_dataT)
377 |   valid_data = (valid_dataN, valid_dataT)
378 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # N is [w, eof], T is [w, unk, eof]
379 | 
380 |   config = get_config()
381 |   assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration
382 |   config.vocab_size = vocab_size  
383 |   eval_config = get_config()
384 |   eval_config.batch_size = config.batch_size * config.num_steps
385 |   eval_config.num_steps = 1
386 |   eval_config.vocab_size = vocab_size
387 | 
388 |   with tf.Graph().as_default():
389 |     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
390 | 
391 |     with tf.name_scope("Train"):
392 |       train_input = PTBInput(config=config, data=train_data, name="TrainInput")
393 |       with tf.variable_scope("Model", reuse=None, initializer=initializer):
394 |         m = PTBModel(is_training=True, config=config, input_=train_input)
395 | 
396 |     with tf.name_scope("Valid"):
397 |       valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
398 |       with tf.variable_scope("Model", reuse=True, initializer=initializer):
399 |         mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
400 | 
401 |     # with tf.name_scope("Test"):
402 |     #   test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput")
403 |     #   with tf.variable_scope("Model", reuse=True, initializer=initializer):
404 |     #     mtest = PTBModel(is_training=False, config=eval_config,
405 |     #                      input_=test_input)
406 | 
407 | 
408 |     print ('total trainable variables', len(tf.trainable_variables()), '\n\n')
409 |     max_valid = 0
410 |     max_step = 0
411 |     saver = tf.train.Saver()
412 | 
413 |     sv = tf.train.Supervisor(logdir=None, summary_op=None)
414 |     with sv.managed_session() as session:
415 | 
416 |       for i in range(config.max_max_epoch):
417 |         lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
418 |         m.assign_lr(session, config.learning_rate * lr_decay)
419 |         print (outfile, "Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
420 | 
421 |         train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True)
422 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy))
423 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout)
424 | 
425 |         if i > 5:
426 |           valid_perplexity, valid_accuracy = run_epoch(session, mvalid)
427 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy))
428 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout)
429 |           if valid_accuracy > max_valid:
430 |             max_valid = valid_accuracy
431 |             max_step = i + 1
432 | 
433 |       # test_perplexity, test_accuracy = run_epoch(session, mtest)
434 |       # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy))
435 | 
436 |       print ('max step %d, max valid %.3f' %(max_step, max_valid))
437 |       # print ('data path is', FLAGS.data_path)
438 |       print ('total time takes', time.time()-start_time)
439 |       print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout)
440 |       print ('total time takes', time.time()-start_time, file=fout)
441 |       fout.close()
442 | 
443 |       # if FLAGS.save_path:
444 |       #   print("Saving model to %s." % FLAGS.save_path)
445 |       #   save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False)
446 | 
447 | 
448 | if __name__ == "__main__":
449 |   tf.app.run()
450 | 


--------------------------------------------------------------------------------
/code/pointer_parent.py:
--------------------------------------------------------------------------------
  1 | # use word distribution and location information(pointer)
  2 | # 1-28, reverse l_logits
  3 | 
  4 | from __future__ import absolute_import
  5 | from __future__ import division
  6 | from __future__ import print_function
  7 | 
  8 | import inspect
  9 | import time
 10 | 
 11 | import numpy as np
 12 | import tensorflow as tf
 13 | 
 14 | import reader_pointer as reader
 15 | import os
 16 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 17 | 
 18 | os.environ['CUDA_VISIBLE_DEVICES']='0'
 19 | outfile = 'output_pointer_parent.txt'
 20 | 
 21 | N_filename = '../pickle_data/JS_non_terminal.pickle'
 22 | T_filename = '../pickle_data/JS_terminal_50k_whole.pickle'
 23 | 
 24 | flags = tf.flags
 25 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A'
 26 |                     "Model output directory.")
 27 | 
 28 | flags.DEFINE_string(
 29 |     "model", "small",
 30 |     "A type of model. Possible options are: small, medium, best.")
 31 | # flags.DEFINE_string("data_path", '../data/dataJS',
 32 | #                     "Where the training/test data is stored.")
 33 | flags.DEFINE_bool("use_fp16", False,
 34 |                   "Train using 16-bit floats instead of 32bit floats")
 35 | 
 36 | FLAGS = flags.FLAGS
 37 | logging = tf.logging
 38 | 
 39 | if FLAGS.model == "test":
 40 |   outfile = 'TESToutput.txt'
 41 | def data_type():
 42 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 43 | 
 44 | class SmallConfig(object):
 45 |   """Small config.  get best result as 0.733 """
 46 |   init_scale = 0.05
 47 |   learning_rate = 0.001
 48 |   max_grad_norm = 5
 49 |   num_layers = 1#1
 50 |   num_steps = 50
 51 |   attn_size = 50
 52 |   hidden_sizeN = 300
 53 |   hidden_sizeT = 500
 54 |   sizeH = 800
 55 |   max_epoch = 1#8
 56 |   max_max_epoch = 8#79
 57 |   keep_prob = 1.0#1.0
 58 |   lr_decay = 0.6#0.95
 59 |   batch_size = 64#80
 60 | 
 61 | class TestConfig(object):
 62 |   """Tiny config, for testing."""
 63 |   init_scale = 0.05
 64 |   learning_rate = 0.001
 65 |   max_grad_norm = 5
 66 |   num_layers = 1
 67 |   num_steps = 50
 68 |   attn_size = 50
 69 |   hidden_sizeN = 300
 70 |   hidden_sizeT = 500
 71 |   sizeH = 800
 72 |   max_epoch = 1
 73 |   max_max_epoch = 1
 74 |   keep_prob = 1.0
 75 |   lr_decay = 0.6
 76 |   batch_size = 80
 77 | 
 78 | 
 79 | def get_config():
 80 |   if FLAGS.model == "small":
 81 |     return SmallConfig()
 82 |   elif FLAGS.model == "medium":
 83 |     return MediumConfig()
 84 |   elif FLAGS.model == "best":
 85 |     return BestConfig()
 86 |   elif FLAGS.model == "test":
 87 |     return TestConfig()
 88 |   else:
 89 |     raise ValueError("Invalid model: %s", FLAGS.model)
 90 | 
 91 | 
 92 | class PTBInput(object):
 93 |   """The input data."""
 94 | 
 95 |   def __init__(self, config, data, name=None):
 96 |     self.batch_size = batch_size = config.batch_size
 97 |     self.attn_size = attn_size = config.attn_size
 98 |     self.num_steps = num_steps = config.num_steps
 99 |     self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator, self.input_dataP = \
100 |             reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=False, name=name)
101 |     if FLAGS.model == "test":
102 |       self.epoch_size = 16   #small epoch size for test
103 | 
104 | 
105 | class PTBModel(object):
106 |   """The PTB model."""
107 | 
108 |   def __init__(self, is_training, config, input_):
109 |     self._input = input_
110 |     self.attn_size = attn_size = config.attn_size
111 |     batch_size = input_.batch_size
112 |     num_steps = input_.num_steps
113 |     self.sizeN = sizeN = config.hidden_sizeN
114 |     self.sizeT = sizeT = config.hidden_sizeT
115 |     self.size = size = config.sizeH
116 |     (vocab_sizeN, vocab_sizeT) = config.vocab_size
117 | 
118 |     # Slightly better results can be obtained with forget gate biases
119 |     # initialized to 1 but the hyperparameters of the model would need to be
120 |     # different than reported in the paper.
121 |     def lstm_cell():
122 |       if 'reuse' in inspect.getargspec(
123 |           tf.contrib.rnn.BasicLSTMCell.__init__).args:
124 |         return tf.contrib.rnn.BasicLSTMCell(
125 |             size, forget_bias=1.0, state_is_tuple=True,
126 |             reuse=tf.get_variable_scope().reuse)
127 |       else:
128 |         return tf.contrib.rnn.BasicLSTMCell(
129 |             size, forget_bias=1.0, state_is_tuple=True)
130 |     attn_cell = lstm_cell
131 |     if is_training and config.keep_prob < 1:
132 |       def attn_cell():
133 |         return tf.contrib.rnn.DropoutWrapper(
134 |             lstm_cell(), output_keep_prob=config.keep_prob)
135 |     cell = tf.contrib.rnn.MultiRNNCell(
136 |         [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
137 | 
138 |     state_variables = []
139 |     with tf.variable_scope("myCH0"):
140 |       for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())):
141 |         if i > 0: tf.get_variable_scope().reuse_variables()
142 |         myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer())
143 |         myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer())
144 |         myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)])
145 |         myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)])
146 |         state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor))
147 | 
148 |     self._initial_state = state_variables
149 | 
150 |     self.eof_indicator = input_.eof_indicator
151 | 
152 |     with tf.device("/cpu:0"):
153 |       embeddingN = tf.get_variable(
154 |           "embeddingN", [vocab_sizeN, sizeN], dtype=data_type())
155 |       inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN)
156 |       inputsP = tf.nn.embedding_lookup(embeddingN, input_.input_dataP)
157 |       inputsL = tf.nn.embedding_lookup(embeddingN, input_.targetsN) # target type information
158 | 
159 |     with tf.device("/cpu:0"):
160 |       embeddingT = tf.get_variable(
161 |           "embeddingT", [vocab_sizeT, sizeT], dtype=data_type())
162 |       inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT)
163 | 
164 |     inputs = tf.concat([inputsN, inputsT], 2)
165 |     inputsPL = tf.concat([inputsP, inputsL], 2)
166 |     #inputs = tf.one_hot(input_.input_data, vocab_size) 
167 |     if is_training and config.keep_prob < 1:
168 |       inputs = tf.nn.dropout(inputs, config.keep_prob)
169 | 
170 |     outputs = []
171 |     attentions = []
172 |     parents = []
173 |     alphas = []
174 |     state = self._initial_state
175 |     self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory")
176 |     valid_memory = self.memory[:,-attn_size:,:]
177 |     # print ("test test test,, state shape", np.array(state).shape)
178 |     with tf.variable_scope("RNN"):
179 |       for time_step in range(num_steps):
180 |         if time_step > 0: tf.get_variable_scope().reuse_variables()
181 |         (cell_output, state) = cell(inputs[:, time_step, :], state)
182 |         outputs.append(cell_output)
183 | 
184 |         cell_parent = inputsP[:, time_step, :]
185 |         parents.append(cell_parent)
186 | 
187 |         wm = tf.get_variable("wm", [size, size], dtype=data_type())
188 |         wh = tf.get_variable("wh", [size, size], dtype=data_type())
189 |         wt = tf.get_variable("wt", [size, 1], dtype=data_type())
190 |         gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size]))
191 |         alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) #the size of alpha: batch_size by attn_size
192 |         alphas.append(alpha)
193 |         ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1])))
194 |         attentions.append(ct)
195 |         valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1)
196 | 
197 |     output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
198 |     attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size])
199 |     parent = tf.reshape(tf.stack(axis=1, values=parents), [-1, sizeN])
200 | 
201 |     self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch
202 |     wa = tf.get_variable("wa", [size*2+sizeN, size], dtype=data_type())
203 |     nt = tf.tanh(tf.matmul(tf.concat([output, attention, parent], axis=1), wa))
204 | 
205 |     #compute w
206 |     softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type())
207 |     softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type())
208 |     w_logits = tf.matmul(nt, softmax_w) + softmax_b
209 |     w_probs = tf.nn.softmax(w_logits)
210 | 
211 |     #compute l
212 |     l_logits_pre = tf.reshape(tf.stack(axis=1, values=alphas), [-1, attn_size]) #the size of alpha_reshaped: batch_size*num_steps by attn_size
213 |     l_logits = tf.reverse(l_logits_pre, axis=[1])
214 |     # l_probs = tf.nn.softmax(l_logits)
215 | 
216 |     #compute d
217 |     # input_reshaped = tf.reshape(inputs, [-1, size])
218 |     d_conditioned = tf.concat([output, attention], axis=1)
219 |     d_w = tf.get_variable("d_w1", [2*size, 1], dtype=data_type())
220 |     d_b = tf.get_variable("d_b1", [1], dtype=data_type())
221 |     d = tf.nn.sigmoid(tf.matmul(d_conditioned, d_w) + d_b)
222 | 
223 |     # d_conditioned = tf.concat([output, attention], axis=1)
224 |     # d_w1 = tf.get_variable("d_w1", [2*size, size], dtype=data_type())
225 |     # d_b1 = tf.get_variable("d_b1", [size], dtype=data_type())
226 |     # fc1 = tf.nn.relu(tf.matmul(d_conditioned, d_w1) + d_b1)
227 |     # d_w2 = tf.get_variable("d_w2", [size, 1], dtype=data_type())
228 |     # d_b2 = tf.get_variable("d_b2", [1], dtype=data_type())
229 |     # d = tf.nn.sigmoid(tf.matmul(fc1, d_w2) + d_b2)
230 | 
231 |     #concat w and l to construct f
232 |     f_logits = tf.concat([w_logits*d, l_logits*(1-d)], axis=1)
233 | 
234 |     labels = tf.reshape(input_.targetsT, [-1])
235 |     weights = tf.ones([batch_size * num_steps], dtype=data_type())
236 | 
237 |     #counting unk as wrong
238 |     unk_id = vocab_sizeT - 2
239 |     unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape)
240 |     zero_weights = tf.zeros_like(labels, dtype=data_type())
241 |     wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape)
242 |     condition_tf = tf.equal(labels, unk_tf)
243 |     new_weights = tf.where(condition_tf, zero_weights, weights)
244 |     new_labels = tf.where(condition_tf, wrong_label, labels) # only for computing the accuracy, can not be used to compute the loss(cause nan error)
245 | 
246 | 
247 |     loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([f_logits], [labels], [new_weights])
248 |     probs = tf.nn.softmax(f_logits)
249 | 
250 |     # condition = tf.not_equal(labels, 182)
251 |     # non_pad_len = tf.reduce_sum(tf.cast(condition, tf.float32))
252 |     # mask_labels = tf.where(condition, labels, tf.constant(250, shape = labels.get_shape())) #250 just do not belong to the vocab    
253 |     # correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), mask_labels)
254 |     # self._accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) /  non_pad_len # do not count predict <pad>(182)
255 | 
256 |     correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels)
257 |     self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
258 | 
259 |     self._cost = cost = tf.reduce_sum(loss) / batch_size
260 |     self._final_state = state
261 | 
262 |     if not is_training:
263 |       return
264 | 
265 |     self._lr = tf.Variable(0.0, trainable=False)
266 |     tvars = tf.trainable_variables()
267 |     print ('tvars', len(tvars))
268 |     grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
269 |                                       config.max_grad_norm)
270 |     print ('*******the length', len(grads))
271 |     optimizer = tf.train.AdamOptimizer(self._lr)
272 |     self._train_op = optimizer.apply_gradients(
273 |         zip(grads, tvars),
274 |         global_step=tf.contrib.framework.get_or_create_global_step())
275 | 
276 |     self._new_lr = tf.placeholder(
277 |         tf.float32, shape=[], name="new_learning_rate")
278 |     self._lr_update = tf.assign(self._lr, self._new_lr)
279 | 
280 |   def assign_lr(self, session, lr_value):
281 |     session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
282 | 
283 |   @property
284 |   def input(self):
285 |     return self._input
286 | 
287 |   @property
288 |   def initial_state(self):
289 |     return self._initial_state
290 | 
291 |   @property
292 |   def cost(self):
293 |     return self._cost
294 | 
295 |   @property
296 |   def final_state(self):
297 |     return self._final_state
298 | 
299 |   @property
300 |   def accuracy(self):
301 |     return self._accuracy
302 | 
303 |   @property
304 |   def lr(self):
305 |     return self._lr
306 | 
307 |   @property
308 |   def train_op(self):
309 |     return self._train_op
310 | 
311 | 
312 | def run_epoch(session, model, eval_op=None, verbose=False):
313 |   """Runs the model on the given data."""
314 |   start_time = time.time()
315 |   costs = 0.0
316 |   accuracy_list = []
317 |   iters = 0
318 |   state = session.run(model.initial_state)
319 |   # print ('at the very initial of the run_epoch\n', state[0].c)
320 |   eof_indicator = np.ones((model.input.batch_size), dtype=bool)
321 |   memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size])
322 |   # file_id = session.run(model.initial_file_id) #need to remove _
323 | 
324 |   fetches = {
325 |       "cost": model.cost,
326 |       "accuracy": model.accuracy,
327 |       "final_state": model.final_state,
328 |       "eof_indicator": model.eof_indicator,
329 |       "memory":model.output,
330 |   }
331 |   if eval_op is not None:
332 |     fetches["eval_op"] = eval_op
333 | 
334 |   for step in range(model.input.epoch_size):
335 |     feed_dict = {}
336 |     # current_file_id = file_id #session.run(model.file_id)
337 |     sub_cond = np.expand_dims(eof_indicator, axis = 1)
338 |     condition = np.repeat(sub_cond, model.size, axis = 1)
339 |     # zero_state = np.zeros_like(condition)
340 |     # zero_state = np.random.uniform(-0.05,0.05,condition.shape)
341 |     zero_state = session.run(model.initial_state)
342 | 
343 |     for i, (c, h) in enumerate(model.initial_state):
344 |       assert condition.shape == state[i].c.shape
345 |       feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c)
346 |       feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h)
347 |     
348 |     feed_dict[model.memory] = memory
349 |     vals = session.run(fetches, feed_dict)
350 |     
351 |     cost = vals["cost"]
352 |     accuracy = vals["accuracy"]
353 |     eof_indicator = vals["eof_indicator"]
354 |     state = vals["final_state"]  #use the final state as the initial state within a whole epoch
355 |     memory = vals["memory"]
356 | 
357 |     accuracy_list.append(accuracy)
358 |     costs += cost
359 |     iters += model.input.num_steps
360 | 
361 |     if verbose and step % (model.input.epoch_size // 10) == 10:
362 |       print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" %
363 |             (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list),
364 |              (time.time() - start_time)))
365 |       # print ('zero_state value', zero_state[0][0])
366 |       # print ('gradients value', session.run(model.grads))
367 |   
368 |   print ('this run_epoch takes time %.2f' %(time.time() - start_time))
369 |   return np.exp(costs / iters), np.mean(accuracy_list)
370 | 
371 | 
372 | 
373 | 
374 | def main(_):
375 |   start_time = time.time()
376 |   fout = open(outfile, 'a')
377 |   print ('\n', time.asctime(time.localtime()), file=fout)
378 |   print ('start a new experiment %s'%outfile, file=fout)
379 |   print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout)
380 |   print ('condition on two, two layers', file=fout)
381 | 
382 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size, train_dataP, valid_dataP \
383 |    = reader.input_data(N_filename, T_filename)
384 | 
385 |   train_data = (train_dataN, train_dataT, train_dataP)
386 |   valid_data = (valid_dataN, valid_dataT, valid_dataP)
387 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # N is [w, eof], T is [w, unk, eof]
388 | 
389 |   config = get_config()
390 |   assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration
391 |   config.vocab_size = vocab_size  
392 |   eval_config = get_config()
393 |   eval_config.batch_size = config.batch_size * config.num_steps
394 |   eval_config.num_steps = 1
395 |   eval_config.vocab_size = vocab_size
396 | 
397 |   with tf.Graph().as_default():
398 |     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
399 | 
400 |     with tf.name_scope("Train"):
401 |       train_input = PTBInput(config=config, data=train_data, name="TrainInput")
402 |       with tf.variable_scope("Model", reuse=None, initializer=initializer):
403 |         m = PTBModel(is_training=True, config=config, input_=train_input)
404 | 
405 |     with tf.name_scope("Valid"):
406 |       valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
407 |       with tf.variable_scope("Model", reuse=True, initializer=initializer):
408 |         mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
409 | 
410 |     # with tf.name_scope("Test"):
411 |     #   test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput")
412 |     #   with tf.variable_scope("Model", reuse=True, initializer=initializer):
413 |     #     mtest = PTBModel(is_training=False, config=eval_config,
414 |     #                      input_=test_input)
415 | 
416 | 
417 |     print ('total trainable variables', len(tf.trainable_variables()), '\n\n')
418 |     max_valid = 0
419 |     max_step = 0
420 |     saver = tf.train.Saver()
421 | 
422 |     sv = tf.train.Supervisor(logdir=None, summary_op=None)
423 |     with sv.managed_session() as session:
424 | 
425 |       for i in range(config.max_max_epoch):
426 |         lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
427 |         m.assign_lr(session, config.learning_rate * lr_decay)
428 |         print (outfile, "Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
429 | 
430 |         train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True)
431 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy))
432 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout)
433 | 
434 |         if i > 5:
435 |           valid_perplexity, valid_accuracy = run_epoch(session, mvalid)
436 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy))
437 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout)
438 |           if valid_accuracy > max_valid:
439 |             max_valid = valid_accuracy
440 |             max_step = i + 1
441 | 
442 |       # test_perplexity, test_accuracy = run_epoch(session, mtest)
443 |       # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy))
444 | 
445 |       print ('max step %d, max valid %.3f' %(max_step, max_valid))
446 |       # print ('data path is', FLAGS.data_path)
447 |       print ('total time takes', time.time()-start_time)
448 |       print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout)
449 |       print ('total time takes', time.time()-start_time, file=fout)
450 |       fout.close()
451 | 
452 |       # if FLAGS.save_path:
453 |       #   print("Saving model to %s." % FLAGS.save_path)
454 |       #   save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False)
455 | 
456 | 
457 | if __name__ == "__main__":
458 |   tf.app.run()
459 | 


--------------------------------------------------------------------------------
/code/reader_pointer.py:
--------------------------------------------------------------------------------
  1 | # xxx revise it on 01/09, add parent
  2 | # Add attn_size in input_data, data_producer; add change_yT for indicating whether to remove the location of unk(just label it as unk)
  3 | # refactor the code of contructing the long line (def padding_and_concat)
  4 | 
  5 | from __future__ import absolute_import
  6 | from __future__ import division
  7 | from __future__ import print_function
  8 | 
  9 | import collections
 10 | from six.moves import cPickle as pickle
 11 | import tensorflow as tf
 12 | import numpy as np
 13 | import time
 14 | from collections import Counter, defaultdict
 15 | import os
 16 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 17 | 
 18 | def input_data(N_filename, T_filename):
 19 |   start_time = time.time()
 20 |   with open(N_filename, 'rb') as f:
 21 |     print ("reading data from ", N_filename)
 22 |     save = pickle.load(f)
 23 |     train_dataN = save['trainData']
 24 |     test_dataN = save['testData']
 25 |     train_dataP = save['trainParent']
 26 |     test_dataP = save['testParent']
 27 |     vocab_sizeN = save['vocab_size']
 28 |     print ('the vocab_sizeN is %d (not including the eof)' %vocab_sizeN)
 29 |     print ('the number of training data is %d' %(len(train_dataN)))
 30 |     print ('the number of test data is %d\n' %(len(test_dataN)))
 31 | 
 32 |   with open(T_filename, 'rb') as f:
 33 |     print ("reading data from ", T_filename)
 34 |     save = pickle.load(f)
 35 |     train_dataT = save['trainData']
 36 |     test_dataT = save['testData']
 37 |     vocab_sizeT = save['vocab_size']
 38 |     attn_size = save['attn_size']
 39 |     print ('the vocab_sizeT is %d (not including the unk and eof)' %vocab_sizeT)
 40 |     print ('the attn_size is %d' %attn_size)
 41 |     print ('the number of training data is %d' %(len(train_dataT)))
 42 |     print ('the number of test data is %d' %(len(test_dataT)))
 43 |     print ('Finish reading data and take %.2f\n'%(time.time()-start_time))
 44 | 
 45 |   return train_dataN, test_dataN, vocab_sizeN, train_dataT, test_dataT, vocab_sizeT, attn_size, train_dataP, test_dataP
 46 | 
 47 | 
 48 | def data_producer(raw_data, batch_size, num_steps, vocab_size, attn_size, change_yT=False, name=None, verbose=False):
 49 | 
 50 |   start_time = time.time()
 51 | 
 52 |   with tf.name_scope(name, "DataProducer", [raw_data, batch_size, num_steps, vocab_size]):
 53 |     (raw_dataN, raw_dataT, raw_dataP) = raw_data
 54 |     assert len(raw_dataN) == len(raw_dataT)
 55 | 
 56 |     (vocab_sizeN, vocab_sizeT) = vocab_size
 57 |     eof_N_id = vocab_sizeN - 1
 58 |     eof_T_id = vocab_sizeT - 1
 59 |     unk_id = vocab_sizeT - 2
 60 |     
 61 |     def padding_and_concat(data, width, pad_id):
 62 |       #the size of data: a list of list. This function will pad the data according to width
 63 |       long_line = list()
 64 |       for line in data:
 65 |         pad_len = width - (len(line) % width)
 66 |         new_line = line + [pad_id] * pad_len
 67 |         assert len(new_line) % width == 0
 68 |         long_line += new_line
 69 |       return long_line
 70 | 
 71 |     pad_start = time.time()
 72 |     long_lineN = padding_and_concat(raw_dataN, num_steps, pad_id=eof_N_id)
 73 |     long_lineT = padding_and_concat(raw_dataT, num_steps, pad_id=eof_T_id)
 74 |     long_lineP = padding_and_concat(raw_dataP, num_steps, pad_id=1)
 75 |     assert len(long_lineN) == len(long_lineT) 
 76 |     print('Pading three long lines and take %.2fs'%(time.time()-pad_start))
 77 | 
 78 |     # print statistics for long_lineT  
 79 |     if verbose:
 80 |       print('Start counting the statistics of T!!')
 81 |       verbose_start = time.time()
 82 |       cnt_T = Counter(long_lineT) 
 83 |       long_lineT_len = len(long_lineT)
 84 |       empty_cnt = cnt_T[0]
 85 |       unk_cnt = cnt_T[unk_id]
 86 |       eof_cnt = cnt_T[eof_T_id]
 87 |       l_cnt = sum(np.array(long_lineT) > eof_T_id)
 88 |       w_cnt = long_lineT_len - empty_cnt - unk_cnt - eof_cnt - l_cnt
 89 |       print('long_lineT_len: %d, empty: %.4f, unk: %.4f, location: %.4f, eof: %.4f, word (except Empty): %.4f'%
 90 |             (long_lineT_len, float(empty_cnt)/long_lineT_len, float(unk_cnt)/long_lineT_len, 
 91 |             float(l_cnt)/long_lineT_len, float(eof_cnt)/long_lineT_len, float(w_cnt)/long_lineT_len))
 92 |       print('the most common 5 of cnt_T', cnt_T.most_common(5))
 93 |       print('print verbose information and take %.2fs\n'%(time.time()-verbose_start))
 94 |     
 95 |     temp_len = len(long_lineN)
 96 |     n = temp_len // (batch_size * num_steps)
 97 |     long_lineN_truncated = np.array(long_lineN[0 : n * (batch_size * num_steps)])
 98 |     long_lineP_truncated = np.array(long_lineP[0 : n * (batch_size * num_steps)])
 99 |     long_lineT_truncated_x = np.array(long_lineT[0 : n * (batch_size * num_steps)])
100 |     long_lineT_truncated_y = np.array(long_lineT[0 : n * (batch_size * num_steps)])
101 | 
102 |     # long_lineP_truncated[long_lineP_truncated > attn_size] = attn_size  #if the parent location is too far
103 |     long_lineP_truncated = [long_lineN_truncated[i-j] for i,j in enumerate(long_lineP_truncated)] #only store parent N
104 | 
105 |     location_index = long_lineT_truncated_x > eof_T_id    
106 |     long_lineT_truncated_x[location_index] = unk_id
107 |     if change_yT:
108 |       long_lineT_truncated_y[location_index] = unk_id
109 | 
110 |     # print('count of greater than eof', sum(long_lineT_truncated_y > eof_T_id))
111 |     
112 |     tf_dataN = tf.convert_to_tensor(long_lineN_truncated, name="raw_dataN", dtype=tf.int32)
113 |     tf_dataP = tf.convert_to_tensor(long_lineP_truncated, name="raw_dataP", dtype=tf.int32)
114 |     tf_dataT_x = tf.convert_to_tensor(long_lineT_truncated_x, name="raw_dataT_x", dtype=tf.int32)
115 |     tf_dataT_y = tf.convert_to_tensor(long_lineT_truncated_y, name="raw_dataT_y", dtype=tf.int32)
116 |     
117 |     data_len = len(long_lineN_truncated)
118 |     batch_len = data_len // batch_size
119 |     # print ('the total data length is %d, batch_len is %d\n ' %(data_len, batch_len))
120 |     dataN = tf.reshape(tf_dataN[0 : batch_size * batch_len], [batch_size, batch_len])
121 |     dataP = tf.reshape(tf_dataP[0 : batch_size * batch_len], [batch_size, batch_len])
122 |     dataT_x = tf.reshape(tf_dataT_x[0 : batch_size * batch_len], [batch_size, batch_len])
123 |     dataT_y = tf.reshape(tf_dataT_y[0 : batch_size * batch_len], [batch_size, batch_len])
124 | 
125 |     epoch_size = (batch_len - 1) // num_steps  # how many batches to complete a epoch
126 |     assert epoch_size > 0
127 |     i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
128 |     per_start = time.time()
129 |     xN = tf.strided_slice(dataN, [0, i * num_steps],
130 |                          [batch_size, (i + 1) * num_steps])
131 |     xN.set_shape([batch_size, num_steps]) # need to assert all values in x[a,:,1] are the same
132 |     yN = tf.strided_slice(dataN, [0, i * num_steps + 1],
133 |                          [batch_size, (i + 1) * num_steps + 1])
134 |     yN.set_shape([batch_size, num_steps])
135 | 
136 |     xT = tf.strided_slice(dataT_x, [0, i * num_steps],
137 |                          [batch_size, (i + 1) * num_steps])
138 |     xT.set_shape([batch_size, num_steps]) # need to assert all values in x[a,:,1] are the same
139 |     yT = tf.strided_slice(dataT_y, [0, i * num_steps + 1],
140 |                          [batch_size, (i + 1) * num_steps + 1])
141 |     yT.set_shape([batch_size, num_steps])
142 | 
143 |     xP = tf.strided_slice(dataP, [0, i * num_steps],
144 |                          [batch_size, (i + 1) * num_steps])
145 |     xP.set_shape([batch_size, num_steps])
146 | 
147 |     eof_indicator = tf.equal(xN[:, num_steps - 1], tf.constant([eof_N_id]*batch_size))
148 |     print('Finish preparing input producer and takes %.2fs' %(time.time()-start_time))
149 |     print('Each produce data takes time %.2f\n' %(time.time()-per_start))
150 |     return xN, yN, xT, yT, epoch_size, eof_indicator, xP
151 | 
152 | if __name__ == '__main__':
153 |   N_filename = '../pickle_data/JS_non_terminal.pickle'
154 |   T_filename = '../pickle_data/JS_terminal_50k_whole.pickle'
155 | 
156 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size, train_dataP, valid_dataP \
157 |                                                      = input_data(N_filename, T_filename)
158 |   train_data = (train_dataN, train_dataT, train_dataP)
159 |   valid_data = (valid_dataN, valid_dataT, valid_dataP)
160 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # N is [w, eof], T is [w, unk, eof]
161 | 
162 |   input_dataN, targetsN, input_dataT, targetsT, epoch_size, eof_indicator, input_dataP = \
163 |       data_producer(train_data, batch_size=128, num_steps=50, vocab_size=vocab_size, attn_size=attn_size, change_yT=False, name='train', verbose=False)
164 |   # input_dataN1, targetsN1, input_dataT1, targetsT1, epoch_size1, eof_indicator1 = \
165 |   #     data_producer(valid_data, batch_size=128, num_steps=50, vocab_size=vocab_size, attn_size=attn_size, change_yT=False, name='test', verbose=False)      
166 | 
167 |   labels = tf.reshape(targetsT, [-1])
168 |   eof_id = vocab_size[1] -1
169 |   loss_condition = tf.greater(labels, tf.constant(value=eof_id, dtype=tf.int32, shape=labels.shape))
170 |   fetches = {
171 |       "labels":labels,
172 |       "loss_condition":loss_condition,}
173 |   # sess = tf.Session()  #there is no graph to run
174 |   # vals = sess.run(fetches)
175 |   # labels_np = vals["labels"]
176 |   # loss_condition_np = vals["loss_condition"]
177 |   print('*** Done! ***')


--------------------------------------------------------------------------------
/code/reader_pointer_original.py:
--------------------------------------------------------------------------------
  1 | # Yue revise it on 08/15
  2 | # Add attn_size in input_data, data_producer; add change_yT for indicating whether to remove the location of unk(just label it as unk)
  3 | # refactor the code of contructing the long line (def padding_and_concat)
  4 | 
  5 | from __future__ import absolute_import
  6 | from __future__ import division
  7 | from __future__ import print_function
  8 | 
  9 | import collections
 10 | from six.moves import cPickle as pickle
 11 | import tensorflow as tf
 12 | import numpy as np
 13 | import time
 14 | from collections import Counter
 15 | import os
 16 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 17 | 
 18 | def input_data(N_filename, T_filename):
 19 |   start_time = time.time()
 20 |   with open(N_filename, 'rb') as f:
 21 |     print ("reading data from ", N_filename)
 22 |     save = pickle.load(f)
 23 |     train_dataN = save['trainData']
 24 |     test_dataN = save['testData']
 25 |     vocab_sizeN = save['vocab_size']
 26 |     print ('the vocab_sizeN is %d (not including the eof)' %vocab_sizeN)
 27 |     print ('the number of training data is %d' %(len(train_dataN)))
 28 |     print ('the number of test data is %d\n' %(len(test_dataN)))
 29 | 
 30 |   with open(T_filename, 'rb') as f:
 31 |     print ("reading data from ", T_filename)
 32 |     save = pickle.load(f)
 33 |     train_dataT = save['trainData']
 34 |     test_dataT = save['testData']
 35 |     vocab_sizeT = save['vocab_size']
 36 |     attn_size = save['attn_size']
 37 |     print ('the vocab_sizeT is %d (not including the unk and eof)' %vocab_sizeT)
 38 |     print ('the attn_size is %d' %attn_size)
 39 |     print ('the number of training data is %d' %(len(train_dataT)))
 40 |     print ('the number of test data is %d' %(len(test_dataT)))
 41 |     print ('Finish reading data and take %.2f\n'%(time.time()-start_time))
 42 | 
 43 |   return train_dataN, test_dataN, vocab_sizeN, train_dataT, test_dataT, vocab_sizeT, attn_size
 44 | 
 45 | 
 46 | def data_producer(raw_data, batch_size, num_steps, vocab_size, attn_size, change_yT=False, name=None, verbose=False):
 47 | 
 48 |   start_time = time.time()
 49 | 
 50 |   with tf.name_scope(name, "DataProducer", [raw_data, batch_size, num_steps, vocab_size]):
 51 |     (raw_dataN, raw_dataT) = raw_data
 52 |     assert len(raw_dataN) == len(raw_dataT)
 53 | 
 54 |     (vocab_sizeN, vocab_sizeT) = vocab_size
 55 |     eof_N_id = vocab_sizeN - 1
 56 |     eof_T_id = vocab_sizeT - 1
 57 |     unk_id = vocab_sizeT - 2
 58 |     
 59 |     def padding_and_concat(data, width, pad_id):
 60 |       #the size of data: a list of list. This function will pad the data according to width
 61 |       long_line = list()
 62 |       for line in data:
 63 |         pad_len = width - (len(line) % width)
 64 |         new_line = line + [pad_id] * pad_len
 65 |         assert len(new_line) % width == 0
 66 |         long_line += new_line
 67 |       return long_line
 68 | 
 69 |     pad_start = time.time()
 70 |     long_lineN = padding_and_concat(raw_dataN, num_steps, pad_id=eof_N_id)
 71 |     long_lineT = padding_and_concat(raw_dataT, num_steps, pad_id=eof_T_id)
 72 |     assert len(long_lineN) == len(long_lineT) 
 73 |     print('Pading two long lines and take %.2fs'%(time.time()-pad_start))
 74 | 
 75 |     # print statistics for long_lineT  
 76 |     if verbose:
 77 |       print('Start counting the statistics of T!!')
 78 |       verbose_start = time.time()
 79 |       cnt_T = Counter(long_lineT) 
 80 |       long_lineT_len = len(long_lineT)
 81 |       empty_cnt = cnt_T[0]
 82 |       unk_cnt = cnt_T[unk_id]
 83 |       eof_cnt = cnt_T[eof_T_id]
 84 |       l_cnt = sum(np.array(long_lineT) > eof_T_id)
 85 |       w_cnt = long_lineT_len - empty_cnt - unk_cnt - eof_cnt - l_cnt
 86 |       print('long_lineT_len: %d, empty: %.4f, unk: %.4f, location: %.4f, eof: %.4f, word (except Empty): %.4f'%
 87 |             (long_lineT_len, float(empty_cnt)/long_lineT_len, float(unk_cnt)/long_lineT_len, 
 88 |             float(l_cnt)/long_lineT_len, float(eof_cnt)/long_lineT_len, float(w_cnt)/long_lineT_len))
 89 |       print('the most common 5 of cnt_T', cnt_T.most_common(5))
 90 |       print('print verbose information and take %.2fs\n'%(time.time()-verbose_start))
 91 |     
 92 |     temp_len = len(long_lineN)
 93 |     # print ('\nthe original data length is %d' %temp_len)
 94 |     n = temp_len // (batch_size * num_steps)
 95 |     long_lineN_truncated = np.array(long_lineN[0 : n * (batch_size * num_steps)])
 96 |     long_lineT_truncated_x = np.array(long_lineT[0 : n * (batch_size * num_steps)])
 97 |     long_lineT_truncated_y = np.array(long_lineT[0 : n * (batch_size * num_steps)])
 98 | 
 99 |     location_index = long_lineT_truncated_x > eof_T_id
100 |     
101 |     long_lineT_truncated_x[location_index] = unk_id
102 |     if change_yT:
103 |       long_lineT_truncated_y[location_index] = unk_id
104 | 
105 |     # print('count of greater than eof', sum(long_lineT_truncated_y > eof_T_id))
106 |     
107 |     tf_dataN = tf.convert_to_tensor(long_lineN_truncated, name="raw_dataN", dtype=tf.int32)
108 |     tf_dataT_x = tf.convert_to_tensor(long_lineT_truncated_x, name="raw_dataT_x", dtype=tf.int32)
109 |     tf_dataT_y = tf.convert_to_tensor(long_lineT_truncated_y, name="raw_dataT_y", dtype=tf.int32)
110 |     
111 |     data_len = len(long_lineN_truncated)
112 |     batch_len = data_len // batch_size
113 |     # print ('the total data length is %d, batch_len is %d\n ' %(data_len, batch_len))
114 |     dataN = tf.reshape(tf_dataN[0 : batch_size * batch_len], [batch_size, batch_len])
115 |     dataT_x = tf.reshape(tf_dataT_x[0 : batch_size * batch_len], [batch_size, batch_len])
116 |     dataT_y = tf.reshape(tf_dataT_y[0 : batch_size * batch_len], [batch_size, batch_len])
117 | 
118 |     epoch_size = (batch_len - 1) // num_steps
119 |     assert epoch_size > 0
120 |     i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
121 |     per_start = time.time()
122 |     xN = tf.strided_slice(dataN, [0, i * num_steps],
123 |                          [batch_size, (i + 1) * num_steps])
124 |     xN.set_shape([batch_size, num_steps]) # need to assert all values in x[a,:,1] are the same
125 |     yN = tf.strided_slice(dataN, [0, i * num_steps + 1],
126 |                          [batch_size, (i + 1) * num_steps + 1])
127 |     yN.set_shape([batch_size, num_steps])
128 | 
129 |     xT = tf.strided_slice(dataT_x, [0, i * num_steps],
130 |                          [batch_size, (i + 1) * num_steps])
131 |     xT.set_shape([batch_size, num_steps]) # need to assert all values in x[a,:,1] are the same
132 |     yT = tf.strided_slice(dataT_y, [0, i * num_steps + 1],
133 |                          [batch_size, (i + 1) * num_steps + 1])
134 |     yT.set_shape([batch_size, num_steps])
135 | 
136 |     eof_indicator = tf.equal(xN[:, num_steps - 1], tf.constant([eof_N_id]*batch_size))
137 |     print('Finish preparing input producer and takes %.2fs' %(time.time()-start_time))
138 |     print('Each produce data takes time %.2f\n' %(time.time()-per_start))
139 |     return xN, yN, xT, yT, epoch_size, eof_indicator
140 | 
141 | if __name__ == '__main__':
142 |   N_filename = '../pickle_data/JS_non_terminal.pickle'
143 |   T_filename = '../pickle_data/JS_terminal_50k_whole.pickle'
144 | 
145 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size = input_data(N_filename, T_filename)
146 |   train_data = (train_dataN, train_dataT)
147 |   valid_data = (valid_dataN, valid_dataT)
148 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # N is [w, eof], T is [w, unk, eof]
149 |   input_dataN, targetsN, input_dataT, targetsT, epoch_size, eof_indicator = \
150 |             data_producer(train_data, batch_size=128, num_steps=50, vocab_size=vocab_size, attn_size=attn_size, change_yT=False, name='train', verbose=False)
151 |   input_dataN1, targetsN1, input_dataT1, targetsT1, epoch_size1, eof_indicator1 = \
152 |             data_producer(valid_data, batch_size=128, num_steps=50, vocab_size=vocab_size, attn_size=attn_size, change_yT=False, name='test', verbose=False)      
153 | 
154 |   labels = tf.reshape(targetsT, [-1])
155 |   eof_id = vocab_size[1] -1
156 |   loss_condition = tf.greater(labels, tf.constant(value=eof_id, dtype=tf.int32, shape=labels.shape))
157 |   fetches = {
158 |       "labels":labels,
159 |       "loss_condition":loss_condition,}
160 |   # sess = tf.Session()
161 |   # vals = sess.run(fetches)
162 |   # labels_np = vals["labels"]
163 |   # loss_condition_np = vals["loss_condition"]
164 |   print('*** Done! ***')


--------------------------------------------------------------------------------
/code/vanillaLSTM.py:
--------------------------------------------------------------------------------
  1 | # vanilla LSTM, count all unk as wrong
  2 | 
  3 | from __future__ import absolute_import
  4 | from __future__ import division
  5 | from __future__ import print_function
  6 | 
  7 | import inspect
  8 | import time
  9 | 
 10 | import numpy as np
 11 | import tensorflow as tf
 12 | 
 13 | import reader_pointer_original as reader
 14 | import os
 15 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
 16 | 
 17 | os.environ['CUDA_VISIBLE_DEVICES']='0'
 18 | outfile = 'output_vanilla.txt'
 19 | 
 20 | N_filename = '../pickle_data/JS_non_terminal.pickle'
 21 | T_filename = '../pickle_data/JS_terminal_5k_whole.pickle'
 22 | 
 23 | flags = tf.flags
 24 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A'
 25 |                     "Model output directory.")
 26 | 
 27 | flags.DEFINE_string(
 28 |     "model", "small",
 29 |     "A type of model. Possible options are: small, medium, best.")
 30 | # flags.DEFINE_string("data_path", '../data/dataJS',
 31 | #                     "Where the training/test data is stored.")
 32 | flags.DEFINE_bool("use_fp16", False,
 33 |                   "Train using 16-bit floats instead of 32bit floats")
 34 | 
 35 | FLAGS = flags.FLAGS
 36 | logging = tf.logging
 37 | 
 38 | if FLAGS.model == "test":
 39 |   outfile = 'TESToutput.txt'
 40 | def data_type():
 41 |   return tf.float16 if FLAGS.use_fp16 else tf.float32
 42 | 
 43 | class SmallConfig(object):
 44 |   """Small config.  get best result as 0.733 """
 45 |   init_scale = 0.05
 46 |   learning_rate = 0.001
 47 |   max_grad_norm = 5
 48 |   num_layers = 1#1
 49 |   num_steps = 50
 50 |   attn_size = 50
 51 |   hidden_sizeN = 300
 52 |   hidden_sizeT = 500
 53 |   sizeH = 800
 54 |   max_epoch = 1#8
 55 |   max_max_epoch = 8#79
 56 |   keep_prob = 1.0#1.0
 57 |   lr_decay = 0.6#0.95
 58 |   batch_size = 64#80
 59 |   vocab_size = 95, 50001
 60 | 
 61 | class TestConfig(object):
 62 |   """Tiny config, for testing."""
 63 |   init_scale = 0.05
 64 |   learning_rate = 0.001
 65 |   max_grad_norm = 5
 66 |   num_layers = 1
 67 |   num_steps = 50
 68 |   attn_size = 50
 69 |   hidden_sizeN = 300
 70 |   hidden_sizeT = 500
 71 |   sizeH = 800
 72 |   max_epoch = 1
 73 |   max_max_epoch = 1
 74 |   keep_prob = 1.0
 75 |   lr_decay = 0.6
 76 |   batch_size = 80
 77 |   vocab_size = 95, 50001
 78 | 
 79 | 
 80 | def get_config():
 81 |   if FLAGS.model == "small":
 82 |     return SmallConfig()
 83 |   elif FLAGS.model == "medium":
 84 |     return MediumConfig()
 85 |   elif FLAGS.model == "best":
 86 |     return BestConfig()
 87 |   elif FLAGS.model == "test":
 88 |     return TestConfig()
 89 |   else:
 90 |     raise ValueError("Invalid model: %s", FLAGS.model)
 91 | 
 92 | 
 93 | class PTBInput(object):
 94 |   """The input data."""
 95 | 
 96 |   def __init__(self, config, data, name=None):
 97 |     self.batch_size = batch_size = config.batch_size
 98 |     self.attn_size = attn_size = config.attn_size
 99 |     self.num_steps = num_steps = config.num_steps
100 |     self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator = \
101 |             reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name)
102 |     if FLAGS.model == "test":
103 |       self.epoch_size = 16   #small epoch size for test
104 | 
105 | 
106 | class PTBModel(object):
107 | 
108 |   def __init__(self, is_training, config, input_):
109 |     self._input = input_
110 |     self.attn_size = attn_size = config.attn_size
111 |     batch_size = input_.batch_size
112 |     num_steps = input_.num_steps
113 |     self.sizeN = sizeN = config.hidden_sizeN
114 |     self.sizeT = sizeT = config.hidden_sizeT
115 |     self.size = size = config.sizeH
116 |     vocab_sizeN, vocab_sizeT = config.vocab_size
117 | 
118 |     # Slightly better results can be obtained with forget gate biases
119 |     # initialized to 1 but the hyperparameters of the model would need to be
120 |     # different than reported in the paper.
121 |     def lstm_cell():
122 |       if 'reuse' in inspect.getargspec(
123 |           tf.contrib.rnn.BasicLSTMCell.__init__).args:
124 |         return tf.contrib.rnn.BasicLSTMCell(
125 |             size, forget_bias=1.0, state_is_tuple=True,
126 |             reuse=tf.get_variable_scope().reuse)
127 |       else:
128 |         return tf.contrib.rnn.BasicLSTMCell(
129 |             size, forget_bias=1.0, state_is_tuple=True)
130 |     attn_cell = lstm_cell
131 |     if is_training and config.keep_prob < 1:
132 |       def attn_cell():
133 |         return tf.contrib.rnn.DropoutWrapper(
134 |             lstm_cell(), output_keep_prob=config.keep_prob)
135 |     cell = tf.contrib.rnn.MultiRNNCell(
136 |         [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True)
137 | 
138 |     state_variables = []
139 |     with tf.variable_scope("myCH0"):
140 |       for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())):
141 |         if i > 0: tf.get_variable_scope().reuse_variables()
142 |         myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer())
143 |         myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer())
144 |         myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)])
145 |         myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)])
146 |         state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor))
147 | 
148 |     self._initial_state = state_variables
149 | 
150 |     self.eof_indicator = input_.eof_indicator
151 | 
152 |     with tf.device("/cpu:0"):
153 |       embeddingN = tf.get_variable(
154 |           "embeddingN", [vocab_sizeN, sizeN], dtype=data_type())
155 |       inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN)
156 | 
157 |     with tf.device("/cpu:0"):
158 |       embeddingT = tf.get_variable(
159 |           "embeddingT", [vocab_sizeT, sizeT], dtype=data_type())
160 |       inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT)
161 | 
162 |     inputs = tf.concat([inputsN, inputsT], 2)
163 |     #inputs = tf.one_hot(input_.input_data, vocab_size) 
164 |     if is_training and config.keep_prob < 1:
165 |       inputs = tf.nn.dropout(inputs, config.keep_prob)
166 | 
167 |     outputs = []
168 |     attentions = []
169 |     state = self._initial_state
170 |     # self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory")
171 |     # valid_memory = self.memory[:,-attn_size:,:]
172 |     # print ("test test test,, state shape", np.array(state).shape)
173 |     with tf.variable_scope("RNN"):
174 |       for time_step in range(num_steps):
175 |         if time_step > 0: tf.get_variable_scope().reuse_variables()
176 |         (cell_output, state) = cell(inputs[:, time_step, :], state)
177 |         outputs.append(cell_output)
178 | 
179 |         # wm = tf.get_variable("wm", [size, size], dtype=data_type())
180 |         # wh = tf.get_variable("wh", [size, size], dtype=data_type())
181 |         # wt = tf.get_variable("wt", [size, 1], dtype=data_type())
182 |         # gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size]))
183 |         # alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size]))
184 |         # ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1])))
185 |         # attentions.append(ct)
186 |         # valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1)
187 | 
188 |     output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
189 |     # attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size])
190 |     self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch
191 |     # wa = tf.get_variable("wa", [size*2, size], dtype=data_type())
192 |     # nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa))
193 | 
194 |     softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type())
195 |     softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type())
196 |     logits = tf.matmul(output, softmax_w) + softmax_b
197 |     labels = tf.reshape(input_.targetsT, [-1])
198 |     weights = tf.ones([batch_size * num_steps], dtype=data_type())
199 |     
200 |     #counting unk as wrong
201 |     unk_id = vocab_sizeT - 2
202 |     unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape)
203 |     zero_weights = tf.zeros_like(labels, dtype=data_type())
204 |     wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape)
205 |     condition_tf = tf.equal(labels, unk_tf)
206 |     new_weights = tf.where(condition_tf, zero_weights, weights)
207 |     new_labels = tf.where(condition_tf, wrong_label, labels)
208 | 
209 |     
210 |     loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [new_weights])
211 |     probs = tf.nn.softmax(logits)
212 |     correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels)
213 |     self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
214 | 
215 |     self._cost = cost = tf.reduce_sum(loss) / batch_size
216 |     self._final_state = state
217 | 
218 |     if not is_training:
219 |       return
220 | 
221 |     self._lr = tf.Variable(0.0, trainable=False)
222 |     tvars = tf.trainable_variables()
223 |     print ('tvars', len(tvars))
224 |     grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
225 |                                       config.max_grad_norm)
226 |     print ('*******the length', len(grads))
227 |     optimizer = tf.train.AdamOptimizer(self._lr)
228 |     self._train_op = optimizer.apply_gradients(
229 |         zip(grads, tvars),
230 |         global_step=tf.contrib.framework.get_or_create_global_step())
231 | 
232 |     self._new_lr = tf.placeholder(
233 |         tf.float32, shape=[], name="new_learning_rate")
234 |     self._lr_update = tf.assign(self._lr, self._new_lr)
235 | 
236 |   def assign_lr(self, session, lr_value):
237 |     session.run(self._lr_update, feed_dict={self._new_lr: lr_value})
238 | 
239 |   @property
240 |   def input(self):
241 |     return self._input
242 | 
243 |   @property
244 |   def initial_state(self):
245 |     return self._initial_state
246 | 
247 |   @property
248 |   def cost(self):
249 |     return self._cost
250 | 
251 |   @property
252 |   def final_state(self):
253 |     return self._final_state
254 | 
255 |   @property
256 |   def accuracy(self):
257 |     return self._accuracy
258 | 
259 |   @property
260 |   def lr(self):
261 |     return self._lr
262 | 
263 |   @property
264 |   def train_op(self):
265 |     return self._train_op
266 | 
267 | 
268 | def run_epoch(session, model, eval_op=None, verbose=False):
269 |   """Runs the model on the given data."""
270 |   start_time = time.time()
271 |   costs = 0.0
272 |   accuracy_list = []
273 |   iters = 0
274 |   state = session.run(model.initial_state)
275 |   # print ('at the very initial of the run_epoch\n', state[0].c)
276 |   eof_indicator = np.ones((model.input.batch_size), dtype=bool)
277 |   # memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size])
278 |   # file_id = session.run(model.initial_file_id) #need to remove _
279 | 
280 |   fetches = {
281 |       "cost": model.cost,
282 |       "accuracy": model.accuracy,
283 |       "final_state": model.final_state,
284 |       "eof_indicator": model.eof_indicator,
285 |       # "memory":model.output,
286 |   }
287 |   if eval_op is not None:
288 |     fetches["eval_op"] = eval_op
289 | 
290 |   for step in range(model.input.epoch_size):
291 |     feed_dict = {}
292 |     # current_file_id = file_id #session.run(model.file_id)
293 |     sub_cond = np.expand_dims(eof_indicator, axis = 1)
294 |     condition = np.repeat(sub_cond, model.size, axis = 1)
295 |     # zero_state = np.zeros_like(condition)
296 |     # zero_state = np.random.uniform(-0.05,0.05,condition.shape)
297 |     zero_state = session.run(model.initial_state)
298 | 
299 |     for i, (c, h) in enumerate(model.initial_state):
300 |       assert condition.shape == state[i].c.shape
301 |       feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c)
302 |       feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h)
303 |     
304 |     # feed_dict[model.memory] = memory
305 |     vals = session.run(fetches, feed_dict)
306 |     
307 |     cost = vals["cost"]
308 |     accuracy = vals["accuracy"]
309 |     eof_indicator = vals["eof_indicator"]
310 |     state = vals["final_state"]  #use the final state as the initial state within a whole epoch
311 |     # memory = vals["memory"]
312 | 
313 |     accuracy_list.append(accuracy)
314 |     costs += cost
315 |     iters += model.input.num_steps
316 | 
317 |     if verbose and step % (model.input.epoch_size // 10) == 10:
318 |       print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" %
319 |             (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list),
320 |              (time.time() - start_time)))
321 |   
322 |   print ('this run_epoch takes time %.2f' %(time.time() - start_time))
323 |   return np.exp(costs / iters), np.mean(accuracy_list)
324 | 
325 | 
326 | def main(_):
327 |   start_time = time.time()
328 |   fout = open(outfile, 'a')
329 |   print ('\n', time.asctime(time.localtime()), file=fout)
330 |   print ('start a new experiment %s'%outfile, file=fout)
331 |   print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout)
332 | 
333 |   train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size = reader.input_data(N_filename, T_filename)
334 | 
335 |   train_data = (train_dataN, train_dataT)
336 |   valid_data = (valid_dataN, valid_dataT)
337 |   vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof]
338 | 
339 |   config = get_config()
340 |   assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration
341 |   config.vocab_size = vocab_size  
342 |   eval_config = get_config()
343 |   eval_config.batch_size = config.batch_size * config.num_steps
344 |   eval_config.num_steps = 1
345 |   eval_config.vocab_size = vocab_size
346 | 
347 |   with tf.Graph().as_default():
348 |     initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
349 | 
350 |     with tf.name_scope("Train"):
351 |       train_input = PTBInput(config=config, data=train_data, name="TrainInput")
352 |       with tf.variable_scope("Model", reuse=None, initializer=initializer):
353 |         m = PTBModel(is_training=True, config=config, input_=train_input)
354 | 
355 |     with tf.name_scope("Valid"):
356 |       valid_input = PTBInput(config=config, data=valid_data, name="ValidInput")
357 |       with tf.variable_scope("Model", reuse=True, initializer=initializer):
358 |         mvalid = PTBModel(is_training=False, config=config, input_=valid_input)
359 | 
360 |     # with tf.name_scope("Test"):
361 |     #   test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput")
362 |     #   with tf.variable_scope("Model", reuse=True, initializer=initializer):
363 |     #     mtest = PTBModel(is_training=False, config=eval_config,
364 |     #                      input_=test_input)
365 | 
366 | 
367 |     print ('total trainable variables', len(tf.trainable_variables()), '\n\n')
368 |     max_valid = 0
369 |     max_step = 0
370 |     saver = tf.train.Saver()
371 | 
372 |     sv = tf.train.Supervisor(logdir=None, summary_op=None)
373 |     with sv.managed_session() as session:
374 | 
375 |       for i in range(config.max_max_epoch):
376 |         lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
377 |         m.assign_lr(session, config.learning_rate * lr_decay)
378 |         print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr)))
379 | 
380 |         train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True)
381 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy))
382 |         print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout)
383 | 
384 |         if i > 5:
385 |           valid_perplexity, valid_accuracy = run_epoch(session, mvalid)
386 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy))
387 |           print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout)
388 |           if valid_accuracy > max_valid:
389 |             max_valid = valid_accuracy
390 |             max_step = i + 1
391 | 
392 |       # test_perplexity, test_accuracy = run_epoch(session, mtest)
393 |       # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy))
394 | 
395 |       print ('max step %d, max valid %.3f' %(max_step, max_valid))
396 |       # print ('data path is', FLAGS.data_path)
397 |       print ('total time takes', time.time()-start_time)
398 |       print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout)
399 |       print ('total time takes', time.time()-start_time, file=fout)
400 |       fout.close()
401 | 
402 |       # if FLAGS.save_path:
403 |       #   print("Saving model to %s." % FLAGS.save_path)
404 |       #   save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False)
405 | 
406 | 
407 | if __name__ == "__main__":
408 |   tf.app.run()
409 | 


--------------------------------------------------------------------------------
/preprocess_code/freq_dict.py:
--------------------------------------------------------------------------------
 1 | #freq_dict: each terminal's frequency; terminal_num: a set about all the terminals.
 2 | 
 3 | import numpy as np
 4 | from six.moves import cPickle as pickle
 5 | import json
 6 | from collections import Counter
 7 | import time
 8 | 
 9 | #attention line 28: for python dataset, not exclude the last one
10 | train_filename = '../json_data/programs_training.json'
11 | test_filename = '../json_data/programs_eval.json'
12 | target_filename = '../pickle_data/freq_dict_JS.pickle'
13 | 
14 | freq_dict = Counter()
15 | terminal_num = set()
16 | terminal_num.add('EmptY')
17 | 
18 | def process(filename):
19 |   with open(filename, encoding='latin-1') as lines:
20 |     print ('Start procesing %s !!!'%(filename))
21 |     line_index = 0
22 |     for line in lines:
23 |       line_index += 1
24 |       if line_index % 1000 == 0:
25 |         print ('Processing line:', line_index)
26 |       data = json.loads(line)
27 |       if len(data) < 3e4:
28 |         for i, dic in enumerate(data[:-1]):  #JS data[:-1] or PY data
29 |           if 'value' in dic.keys():
30 |             terminal_num.add(dic['value'])
31 |             freq_dict[dic['value']] += 1
32 |           else:          
33 |             freq_dict['EmptY'] += 1
34 | 
35 | def save(filename):
36 |   with open(filename, 'wb') as f:
37 |     save = {'freq_dict': freq_dict,'terminal_num': terminal_num}
38 |     pickle.dump(save, f, protocol=2)
39 | 
40 | 
41 | if __name__ == '__main__':
42 |   start_time = time.time()
43 |   process(train_filename)
44 |   process(test_filename)
45 |   save(target_filename)
46 |   print(freq_dict['EmptY'], freq_dict['Empty'], freq_dict['empty'], freq_dict['EMPTY'])
47 |   print('Finishing generating freq_dict and takes %.2f'%(time.time() - start_time))
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/preprocess_code/get_non_terminal.py:
--------------------------------------------------------------------------------
  1 | # rewrite on 2018/1/8 by xxx, add parent
  2 | 
  3 | import numpy as np
  4 | from six.moves import cPickle as pickle
  5 | import json
  6 | import time
  7 | from collections import Counter, defaultdict
  8 | 
  9 | #attention line 42: for python dataset, not exclude the last one
 10 | train_filename = '../json_data/programs_training.json'
 11 | test_filename = '../json_data/programs_eval.json'
 12 | target_filename = '../pickle_data/JS_non_terminal_small.pickle'
 13 | 
 14 | # global variables
 15 | typeDict = dict() #map N's name into its original ID(before expanding into 4*base_ID)
 16 | numID = set()  #the set to include all sparse ID
 17 | no_empty_set = set()
 18 | typeList = list()  #the set to include all Types
 19 | numType = 0
 20 | dicID = dict() #map sparse id to dense id (remove empty id inside 4*base_ID)
 21 | 
 22 | def process(filename):
 23 |   with open(filename, encoding='latin-1') as lines:
 24 |     print ('Start procesing %s !!!'%(filename))
 25 |     line_index = 0
 26 |     corpus_N = list()
 27 |     corpus_parent = list()
 28 | 
 29 |     for line in lines:
 30 |         line_index += 1
 31 |         if line_index % 1000 == 0:
 32 |             print ('Processing line: ', line_index)
 33 |         data = json.loads(line)
 34 |         line_N = list()
 35 |         has_sibling = Counter()
 36 |         parent_counter = defaultdict(lambda: 1) #default parent is previous 1
 37 |         parent_list = list()
 38 | 
 39 |         if len(data) >= 3e4:
 40 |             continue
 41 | 
 42 |         for i, dic in enumerate(data[:-1]):  #JS data[:-1] or PY data
 43 |             typeName = dic['type']
 44 |             if typeName in typeList:
 45 |                 base_ID = typeDict[typeName]         
 46 |             else:
 47 |                 typeList.append(typeName)
 48 |                 global numType
 49 |                 typeDict[typeName] = numType
 50 |                 base_ID = numType
 51 |                 numType = numType + 1
 52 | 
 53 |             #expand the ID into the range of 4*base_ID, according to whether it has sibling or children. Sibling information is got by the ancestor's children information
 54 |             if 'children' in dic.keys():
 55 |                     if has_sibling[i]:
 56 |                         ID = base_ID * 4 + 3
 57 |                     else:
 58 |                         ID = base_ID * 4 + 2
 59 | 
 60 |                     childs = dic['children']
 61 |                     for j in childs:
 62 |                         parent_counter[j] = j-i
 63 | 
 64 |                     if len(childs) > 1:
 65 |                         for j in childs:
 66 |                             has_sibling[j] = 1
 67 |             else:
 68 |                 if has_sibling[i]:
 69 |                     ID = base_ID * 4 + 1
 70 |                 else:
 71 |                     ID = base_ID * 4
 72 |             #recording the N which has non-empty T
 73 |             if 'value' in dic.keys():
 74 |                 no_empty_set.add(ID)
 75 | 
 76 |             line_N.append(ID)
 77 |             parent_list.append(parent_counter[i])
 78 |             numID.add(ID)
 79 | 
 80 |         corpus_N.append(line_N)
 81 |         corpus_parent.append(parent_list)
 82 |     return corpus_N, corpus_parent
 83 | 
 84 | 
 85 | 
 86 | def map_dense_id(data):
 87 |     result = list()
 88 |     for line_id in data:
 89 |         line_new_id = list()
 90 |         for i in line_id:
 91 |             if i in dicID.keys():
 92 |                 line_new_id.append(dicID[i])
 93 |             else:
 94 |                 dicID[i] = len(dicID)
 95 |                 line_new_id.append(dicID[i])
 96 |         result.append(line_new_id)
 97 |     return result
 98 | 
 99 | 
100 | def save(filename, typeDict, numType, dicID, vocab_size, trainData, testData, trainParent, testParent, empty_set_dense):
101 |   with open(filename, 'wb') as f:
102 |     save = {
103 |         # 'typeDict': typeDict,
104 |         # 'numType': numType,
105 |         # 'dicID': dicID,
106 |         'vocab_size': vocab_size,
107 |         'trainData': trainData,
108 |         'testData': testData,
109 |         'trainParent': trainParent,
110 |         'testParent': testParent,
111 |         # 'typeOnlyHasEmptyValue': empty_set_dense,
112 |     }
113 |     pickle.dump(save, f, protocol=2)
114 | 
115 | if __name__ == '__main__':
116 |     start_time = time.time()
117 |     trainData, trainParent = process(train_filename)
118 |     testData, testParent = process(test_filename)
119 |     trainData = map_dense_id(trainData)
120 |     testData = map_dense_id(testData)
121 |     vocab_size = len(numID)
122 |     assert len(dicID) == vocab_size
123 | 
124 |     #for print the N which can only has empty T
125 |     assert no_empty_set.issubset(numID)
126 |     empty_set = numID.difference(no_empty_set)
127 |     empty_set_dense = set()
128 |     # print(dicID)
129 |     for i in empty_set:
130 |         empty_set_dense.add(dicID[i])
131 |     print('The N set that can only has empty terminals: ',len(empty_set_dense), empty_set_dense)
132 |     print('The vocaburary:', vocab_size, numID)
133 | 
134 | 
135 |     save(target_filename, typeDict, numType, dicID, vocab_size, trainData, testData, trainParent, testParent,empty_set_dense)
136 |     print('Finishing generating terminals and takes %.2fs'%(time.time() - start_time))


--------------------------------------------------------------------------------
/preprocess_code/get_terminal_dict.py:
--------------------------------------------------------------------------------
 1 | #sort the freq_dict and get the terminal_dict for top <vocab_size> terminals (include EmptY)
 2 | 
 3 | import time
 4 | from six.moves import cPickle as pickle
 5 | import json
 6 | from collections import Counter
 7 | import operator
 8 | 
 9 | vocab_size = 10000
10 | total_length = 92758587 # JS: 160143814, PY 92758587
11 | freq_dict_filename = '../pickle_data/freq_dict_PY.pickle'
12 | target_filename = '../pickle_data/terminal_dict_10k_PY.pickle'
13 | 
14 | def restore_freq_dict(filename):
15 |   with open(filename, 'rb') as f:
16 |     save = pickle.load(f)
17 |     freq_dict = save['freq_dict']
18 |     terminal_num = save['terminal_num']
19 |     return freq_dict, terminal_num
20 | 
21 | def get_terminal_dict(vocab_size, freq_dict, verbose=False):
22 |   terminal_dict = dict()
23 |   sorted_freq_dict = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True)
24 |   if verbose == True:
25 |     for i in range(100):
26 |       print ('the %d frequent terminal: %s, its frequency: %.5f'%(i, sorted_freq_dict[i][0], float(sorted_freq_dict[i][1])/total_length))
27 |   new_freq_dict = sorted_freq_dict[:vocab_size]
28 |   for i, (terminal, frequent) in enumerate(new_freq_dict):
29 |     terminal_dict[terminal] = i
30 |   return terminal_dict, sorted_freq_dict
31 | 
32 | def save(filename, terminal_dict, terminal_num, sorted_freq_dict):
33 |   with open(filename, 'wb') as f:
34 |     save = {'terminal_dict': terminal_dict,'terminal_num': terminal_num, 'vocab_size': vocab_size, 'sorted_freq_dict': sorted_freq_dict,}
35 |     pickle.dump(save, f, protocol=2)
36 | 
37 | if __name__ == '__main__':
38 |   start_time = time.time()
39 |   freq_dict, terminal_num = restore_freq_dict(freq_dict_filename)
40 |   print(freq_dict['EmptY'], freq_dict['empty'])
41 |   terminal_dict, sorted_freq_dict = get_terminal_dict(vocab_size, freq_dict, True)
42 |   save(target_filename, terminal_dict, terminal_num, sorted_freq_dict)
43 |   print('Finishing generating terminal_dict and takes %.2f'%(time.time() - start_time))
44 | 
45 | 


--------------------------------------------------------------------------------
/preprocess_code/get_terminal_whole.py:
--------------------------------------------------------------------------------
  1 | #According to the terminal_dict you choose (i.e. 5k, 10k, 50k), parse the json file and turn them into ids that are stored in pickle file
  2 | #Output just one vector for terminal, the upper part is the word id while the lower part is the location
  3 | # 0108 revise the Empty into EmptY, normal to NormaL
  4 | # Here attn_size matters
  5 | 
  6 | import numpy as np
  7 | from six.moves import cPickle as pickle
  8 | import json
  9 | from collections import deque
 10 | import time
 11 | 
 12 | #attention line 48: for python dataset, not exclude the last one
 13 | terminal_dict_filename = '../pickle_data/terminal_dict_10k_PY.pickle'
 14 | train_filename = '../json_data/python100k_train.json'
 15 | test_filename = '../json_data/python50k_eval.json'
 16 | target_filename = '../pickle_data/PY_terminal_10k_whole.pickle'
 17 | 
 18 | 
 19 | def restore_terminal_dict(filename):
 20 |   with open(filename, 'rb') as f:
 21 |     save = pickle.load(f)
 22 |     terminal_dict = save['terminal_dict']
 23 |     terminal_num = save['terminal_num']
 24 |     vocab_size = save['vocab_size']
 25 |     return terminal_dict, terminal_num, vocab_size #vocab_size is 50k, and also the unk_id
 26 | 
 27 | def process(filename, terminal_dict, unk_id, attn_size, verbose=False, is_train=False):
 28 |   with open(filename, encoding='latin-1') as lines:
 29 |     print ('Start procesing %s !!!'%(filename))
 30 |     terminal_corpus = list()
 31 |     attn_que = deque(maxlen=attn_size)
 32 |     attn_success_total = 0
 33 |     attn_fail_total = 0
 34 |     length_total = 0
 35 |     line_index = 0
 36 |     for line in lines:
 37 |       line_index += 1
 38 |       # if is_train and line_index == 11:
 39 |       #   continue
 40 |       if line_index % 1000 == 0:
 41 |         print ('Processing line:', line_index)
 42 |       data = json.loads(line)
 43 |       if len(data) < 3e4:
 44 |         terminal_line = list()
 45 |         attn_que.clear() # have a new queue for each file
 46 |         attn_success_cnt  = 0
 47 |         attn_fail_cnt  = 0
 48 |         for i, dic in enumerate(data):      ##JS data[:-1] or PY data
 49 |           if 'value' in dic.keys():
 50 |             dic_value = dic['value']
 51 |             if dic_value in terminal_dict.keys():  #take long time!!!
 52 |               terminal_line.append(terminal_dict[dic_value])
 53 |               attn_que.append('NormaL')
 54 |             else:                       
 55 |               if dic_value in attn_que:
 56 |                 location_index = [len(attn_que)-ind for ind,x in enumerate(attn_que) if x==dic_value][-1] 
 57 |                 location_id = unk_id + 1 + (location_index)                
 58 |                 # print('\nattn_success!! its value is ', dic_value)
 59 |                 # print('The current file index: ', line_index, ', the location index', location_index,', the location_id: ', location_id, ',\n the attn_que', attn_que)
 60 |                 terminal_line.append(location_id)
 61 |                 attn_success_cnt += 1
 62 |               else:
 63 |                 attn_fail_cnt += 1
 64 |                 terminal_line.append(unk_id)
 65 |               attn_que.append(dic_value)
 66 |           else:
 67 |             terminal_line.append(terminal_dict['EmptY'])
 68 |             attn_que.append('EmptY')
 69 |         terminal_corpus.append(terminal_line)
 70 |         attn_success_total += attn_success_cnt
 71 |         attn_fail_total += attn_fail_cnt
 72 |         attn_total = attn_success_total + attn_fail_total
 73 |         length_total += len(data)
 74 |         # print ('Process line', line_index, 'attn_success_cnt', attn_success_cnt, 'attn_fail_cnt', attn_fail_cnt,'data length', len(data))
 75 |         if verbose and line_index % 1000 == 0:
 76 |           print('\nUntil line %d: attn_success_total: %d, attn_fail_total: %d, success/attn_total: %.4f, length_total: %d, attn_success percentage: %.4f, total unk percentage: %.4f\n'%
 77 |                 (line_index, attn_success_total, attn_fail_total, float(attn_success_total)/attn_total, length_total, 
 78 |                 float(attn_success_total)/length_total, float(attn_total)/length_total))
 79 |     with open('output.txt', 'a') as fout:
 80 |       fout.write('Statistics: attn_success_total: %d, attn_fail_total: %d, success/fail: %.4f, length_total: %d, attn_success percentage: %.4f, total unk percentage: %.4f\n'%
 81 |                 (attn_success_total, attn_fail_total, float(attn_success_total)/attn_fail_total, length_total, 
 82 |                 float(attn_success_total)/length_total, float(attn_success_total + attn_fail_total)/length_total))
 83 | 
 84 |     return terminal_corpus 
 85 | 
 86 | def save(filename, terminal_dict, terminal_num, vocab_size, attn_size, trainData, testData):
 87 |   with open(filename, 'wb') as f:
 88 |     save = {'terminal_dict': terminal_dict,
 89 |             'terminal_num': terminal_num,
 90 |             'vocab_size': vocab_size, 
 91 |             'attn_size': attn_size,
 92 |             'trainData': trainData, 
 93 |             'testData': testData,
 94 |             }
 95 |     pickle.dump(save, f, protocol=2)
 96 | 
 97 | if __name__ == '__main__':
 98 |   start_time = time.time()
 99 |   attn_size = 50
100 |   terminal_dict, terminal_num, vocab_size = restore_terminal_dict(terminal_dict_filename)
101 |   trainData = process(train_filename, terminal_dict, vocab_size, attn_size=attn_size, verbose=True, is_train=True)
102 |   testData = process(test_filename, terminal_dict, vocab_size, attn_size=attn_size, verbose=True, is_train=False)
103 |   save(target_filename, terminal_dict, terminal_num, vocab_size, attn_size, trainData, testData)
104 |   print('Finishing generating terminals and takes %.2f'%(time.time() - start_time))


--------------------------------------------------------------------------------
/preprocess_code/get_total_length.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import time
 3 | 
 4 | train_filename = '../json_data/programs_training.json'
 5 | test_filename = '../json_data/programs_eval.json'
 6 | 
 7 | def process(filename):
 8 |   with open(filename, encoding='latin-1') as lines:    
 9 |     print ('Start procesing %s !!!'%(filename))
10 |     length = 0
11 |     line_index = 0
12 |     for line in lines:
13 |       line_index += 1
14 |       if line_index % 1000 == 0:
15 |         print ('Processing line:', line_index)
16 |       data = json.loads(line)
17 |       if len(data) < 3e4:
18 |         length += len(data[:-1])  # total number of AST nodes
19 |     return length
20 | 
21 | if __name__ == '__main__':
22 |   start_time = time.time()
23 |   train_len = process(train_filename)
24 |   test_len = process(test_filename)
25 |   print('total_length is ', train_len + test_len)
26 |   print('Finishing counting the length and takes %.2f'%(time.time() - start_time))


--------------------------------------------------------------------------------
/preprocess_code/output.txt:
--------------------------------------------------------------------------------
 1 | Statistics: attn_success_total: 61250, attn_fail_total: 131890, success/fail: 0.4644, length_total: 800977, attn_success percentage: 0.0765, total unk percentage: 0.2411
 2 | Statistics: attn_success_total: 27749, attn_fail_total: 45969, success/fail: 0.6036, length_total: 336474, attn_success percentage: 0.0825, total unk percentage: 0.2191
 3 | Statistics: attn_success_total: 5826053, attn_fail_total: 9457247, success/fail: 0.6160, length_total: 62340693, attn_success percentage: 0.0935, total unk percentage: 0.2452
 4 | Statistics: attn_success_total: 2830099, attn_fail_total: 4609149, success/fail: 0.6140, length_total: 30417894, attn_success percentage: 0.0930, total unk percentage: 0.2446
 5 | Statistics: attn_success_total: 3239633, attn_fail_total: 6752116, success/fail: 0.4798, length_total: 62340693, attn_success percentage: 0.0520, total unk percentage: 0.1603
 6 | Statistics: attn_success_total: 1577885, attn_fail_total: 3284925, success/fail: 0.4803, length_total: 30417894, attn_success percentage: 0.0519, total unk percentage: 0.1599
 7 | Statistics: attn_success_total: 2004001, attn_fail_total: 5276976, success/fail: 0.3798, length_total: 62340693, attn_success percentage: 0.0321, total unk percentage: 0.1168
 8 | Statistics: attn_success_total: 983407, attn_fail_total: 2565952, success/fail: 0.3833, length_total: 30417894, attn_success percentage: 0.0323, total unk percentage: 0.1167
 9 | --------------------------
10 | Statistics: attn_success_total: 2007407, attn_fail_total: 5285969, success/fail: 0.3798, length_total: 62340693, attn_success percentage: 0.0322, total unk percentage: 0.1170
11 | Statistics: attn_success_total: 985513, attn_fail_total: 2570911, success/fail: 0.3833, length_total: 30417894, attn_success percentage: 0.0324, total unk percentage: 0.1169 #PY_test, attn_size is 50
12 | Statistics: attn_success_total: 1215723, attn_fail_total: 2340701, success/fail: 0.5194, length_total: 30417894, attn_success percentage: 0.0400, total unk percentage: 0.1169 #PY_test, attn_size is 100
13 | Statistics: attn_success_total: 1329858, attn_fail_total: 2226566, success/fail: 0.5973, length_total: 30417894, attn_success percentage: 0.0437, total unk percentage: 0.1169 #PY_test, attn_size is 150
14 | Statistics: attn_success_total: 2079826, attn_fail_total: 5445211, success/fail: 0.3820, length_total: 107104111, attn_success percentage: 0.0194, total unk percentage: 0.0703
15 | Statistics: attn_success_total: 1044410, attn_fail_total: 2673748, success/fail: 0.3906, length_total: 53188270, attn_success percentage: 0.0196, total unk percentage: 0.0699
16 | Statistics: attn_success_total: 2483781, attn_fail_total: 4809595, success/fail: 0.5164, length_total: 62340693, attn_success percentage: 0.0398, total unk percentage: 0.1170
17 | Statistics: attn_success_total: 1215723, attn_fail_total: 2340701, success/fail: 0.5194, length_total: 30417894, attn_success percentage: 0.0400, total unk percentage: 0.1169
18 | Statistics: attn_success_total: 2534934, attn_fail_total: 4990103, success/fail: 0.5080, length_total: 107104111, attn_success percentage: 0.0237, total unk percentage: 0.0703
19 | Statistics: attn_success_total: 1269273, attn_fail_total: 2448885, success/fail: 0.5183, length_total: 53188270, attn_success percentage: 0.0239, total unk percentage: 0.0699
20 | Statistics: attn_success_total: 3960793, attn_fail_total: 7909092, success/fail: 0.5008, length_total: 107104111, attn_success percentage: 0.0370, total unk percentage: 0.1108
21 | Statistics: attn_success_total: 1997242, attn_fail_total: 3899387, success/fail: 0.5122, length_total: 53188270, attn_success percentage: 0.0376, total unk percentage: 0.1109
22 | Statistics: attn_success_total: 8572892, attn_fail_total: 12842395, success/fail: 0.6675, length_total: 107104111, attn_success percentage: 0.0800, total unk percentage: 0.1999
23 | Statistics: attn_success_total: 4296929, attn_fail_total: 6352560, success/fail: 0.6764, length_total: 53188270, attn_success percentage: 0.0808, total unk percentage: 0.2002
24 | Statistics: attn_success_total: 5832257, attn_fail_total: 9473851, success/fail: 0.6156, length_total: 62340693, attn_success percentage: 0.0936, total unk percentage: 0.2455
25 | Statistics: attn_success_total: 2833262, attn_fail_total: 4617843, success/fail: 0.6135, length_total: 30417894, attn_success percentage: 0.0931, total unk percentage: 0.2450
26 | Statistics: attn_success_total: 3244113, attn_fail_total: 6763678, success/fail: 0.4796, length_total: 62340693, attn_success percentage: 0.0520, total unk percentage: 0.1605
27 | Statistics: attn_success_total: 1580175, attn_fail_total: 3290966, success/fail: 0.4802, length_total: 30417894, attn_success percentage: 0.0519, total unk percentage: 0.1601
28 | 


--------------------------------------------------------------------------------
/preprocess_code/utils.py:
--------------------------------------------------------------------------------
  1 | #Utilities for preprocess the data
  2 | 
  3 | import numpy as np
  4 | from six.moves import cPickle as pickle
  5 | import json
  6 | from collections import deque
  7 | import time
  8 | 
  9 | 
 10 | def read_N_pickle(filename):
 11 |   with open(filename, 'rb') as f:
 12 |     print ("Reading data from ", filename)
 13 |     save = pickle.load(f)
 14 |     train_data = save['trainData']
 15 |     test_data = save['testData']
 16 |     vocab_size = save['vocab_size']
 17 |     print ('the vocab_size is %d' %vocab_size)
 18 |     print ('the number of training data is %d' %(len(train_data)))
 19 |     print ('the number of test data is %d' %(len(test_data)))
 20 |     print ('Finish reading data!!')
 21 |     return train_data, test_data, vocab_size
 22 | 
 23 | def read_T_pickle(filename):
 24 |   with open(filename, 'rb') as f:
 25 |     print ("Reading data from ", filename)
 26 |     save = pickle.load(f)
 27 |     train_data = save['trainData']
 28 |     test_data = save['testData']
 29 |     vocab_size = save['vocab_size']
 30 |     attn_size = save['attn_size']
 31 |     print ('the vocab_size is %d' %vocab_size)
 32 |     print ('the attn_size is %d' %attn_size)
 33 |     print ('the number of training data is %d' %(len(train_data)))
 34 |     print ('the number of test data is %d' %(len(test_data)))
 35 |     print ('Finish reading data!!')
 36 |     return train_data, test_data, vocab_size, attn_size
 37 | 
 38 | 
 39 | def save(filename, terminal_dict, terminal_num, vocab_size, sorted_freq_dict):
 40 |   with open(filename, 'wb') as f:
 41 |     save = {'terminal_dict': terminal_dict,'terminal_num': terminal_num, 'vocab_size': vocab_size, 'sorted_freq_dict': sorted_freq_dict,}
 42 |     pickle.dump(save, f)
 43 | 
 44 | def change_protocol_for_N(filename):
 45 | 
 46 |     f = open(filename, 'rb')
 47 |     save = pickle.load(f)
 48 |     typeDict = save['typeDict']
 49 |     numType = save['numType']
 50 |     dicID = save['dicID']
 51 |     vocab_size = save['vocab_size']
 52 |     trainData = save['trainData']
 53 |     testData = save['testData']
 54 |     typeOnlyHasEmptyValue = save['typeOnlyHasEmptyValue']
 55 |     f.close()
 56 | 
 57 |     f = open(filename, 'wb')
 58 |     save = {
 59 |         'typeDict': typeDict,
 60 |         'numType': numType,
 61 |         'dicID': dicID,
 62 |         'vocab_size': vocab_size,
 63 |         'trainData': trainData,
 64 |         'testData': testData,
 65 |         'typeOnlyHasEmptyValue': typeOnlyHasEmptyValue,
 66 |         }
 67 |     pickle.dump(save, f, protocol=2)
 68 |     f.close()
 69 | 
 70 | 
 71 | def change_protocol_for_T(filename):
 72 |     f = open(filename, 'rb')
 73 |     save = pickle.load(f)
 74 |     terminal_dict = save['terminal_dict']
 75 |     terminal_num = save['terminal_num']
 76 |     vocab_size = save['vocab_size']
 77 |     attn_size = save['attn_size']
 78 |     trainData = save['trainData']
 79 |     testData = save['testData']
 80 |     f.close()
 81 | 
 82 |     f = open(target_filename, 'wb')
 83 |     save = {'terminal_dict': terminal_dict,
 84 |             'terminal_num': terminal_num,
 85 |             'vocab_size': vocab_size, 
 86 |             'attn_size': attn_size,
 87 |             'trainData': trainData, 
 88 |             'testData': testData,
 89 |             }
 90 |     pickle.dump(save, f, protocol=2)
 91 |     f.close()
 92 | 
 93 | if __name__ == '__main__':
 94 |     
 95 |     # train_filename = '../json_data/small_programs_training.json'
 96 |     # test_filename = '../json_data/small_programs_eval.json'
 97 |     # N_pickle_filename = '../pickle_data/JS_non_terminal.pickle'
 98 |     # T_pickle_filename = '../pickle_data/JS_terminal_1k.pickle'
 99 |     filename = '../pickle_data/PY_non_terminal.pickle'
100 |     read_N_pickle(filename)
101 |     # filename = '../pickle_data/JS_terminal_1k_whole.pickle'
102 |     # change_protocol_for_T(filename, target_filename)
103 | 
104 | 
105 |     # N_train_data, N_test_data, N_vocab_size = read_N_pickle(N_pickle_filename)
106 |     # T_train_data, T_test_data, T_vocab_size, attn_size = read_T_pickle(T_pickle_filename)
107 |     # print(len(N_train_data), len(T_train_data))
108 | 
109 | 


--------------------------------------------------------------------------------