├── README.md ├── code ├── attention.py ├── attention_N.py ├── attention_N_parent.py ├── attention_parent.py ├── myModel_commented.py ├── pointer.py ├── pointer_parent.py ├── reader_pointer.py ├── reader_pointer_original.py └── vanillaLSTM.py └── preprocess_code ├── freq_dict.py ├── get_non_terminal.py ├── get_terminal_dict.py ├── get_terminal_whole.py ├── get_total_length.py ├── output.txt └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # Code completion 2 | This is a repo holding codes for the paper: Code Completion with Neural Attention and Pointer Networks 3 | 4 | ## Descriptions for the directories 5 | ### code 6 | * attention.py: standard attention model for predicting terminals 7 | * attention_N.py: standard attention model for predicting non-terminals 8 | * attention_N_parent.py: parent attention model for predicting non-terminals 9 | * attention_parent.py: parent attention model for predicting terminals 10 | * myModel_commented.py: a good commented example for our model 11 | * pointer.py: our poirnter mixture network without parent attention 12 | * pointer_parent.py: our poirnter mixture network with parent attention 13 | * reader_pointer.py: reader for reading dataset (with parent) 14 | * reader_pointer_original.py: reader for reading dataset (original without parent) 15 | * vanillaLSTM.py: vanilla LSTM 16 | 17 | ### preprocess_code 18 | * freq_dict.py: generate the frequency dictionary for terminals 19 | * get_non_terminal.py: process the non-terminals (utilize AST information) 20 | * get_terminal_dict.py: get the terminal dictionary according to the vocabulary size 21 | * get_terminal_whole.py: the final step to process the terminals (recording location and parent information) 22 | * get_total_length.py: calculate the total length of the file 23 | * output.txt: some statistics for the terminals 24 | * utils.py: some utils to process the data 25 | 26 | ## Download the dataset 27 | This is the link for you to download the raw dataset: [JS & PY data](http://plml.ethz.ch/) 28 | If you do not want to get your hands dirty with data preprocess, you can download the pickle data after preprocessed here: [pickle data](https://drive.google.com/open?id=1EZZuL8Rl3tatvxpIClvO_a8JD_Oid_oY) 29 | 30 | ## How to run the code 31 | 1. Download the dataset 32 | 2. Preprocess the data into pickle files and store them in a proper directory 33 | 3. Simply adjust the parameter setting inside the code file and run using python3, e.g. python3 attention.py. 34 | -------------------------------------------------------------------------------- /code/attention.py: -------------------------------------------------------------------------------- 1 | # attentional LSTM, count all unk as wrong, default predict termianl 2 | # what we exactly use 3 | # use reader_pointer_original, without parent 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import inspect 10 | import time 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | import reader_pointer_original as reader 16 | import os 17 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 18 | 19 | os.environ['CUDA_VISIBLE_DEVICES']='0' 20 | outfile = 'output_attention.txt' 21 | 22 | N_filename = '../pickle_data/JS_non_terminal.pickle' 23 | T_filename = '../pickle_data/JS_terminal_50k_whole.pickle' 24 | 25 | flags = tf.flags 26 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A' 27 | "Model output directory.") 28 | 29 | flags.DEFINE_string( 30 | "model", "small", 31 | "A type of model. Possible options are: small, medium, best.") 32 | # flags.DEFINE_string("data_path", '../data/dataJS', 33 | # "Where the training/test data is stored.") 34 | flags.DEFINE_bool("use_fp16", False, 35 | "Train using 16-bit floats instead of 32bit floats") 36 | 37 | FLAGS = flags.FLAGS 38 | logging = tf.logging 39 | 40 | if FLAGS.model == "test": 41 | outfile = 'TESToutput.txt' 42 | def data_type(): 43 | return tf.float16 if FLAGS.use_fp16 else tf.float32 44 | 45 | class SmallConfig(object): 46 | """Small config. get best result as 0.733 """ 47 | init_scale = 0.05 48 | learning_rate = 0.001 49 | max_grad_norm = 5 50 | num_layers = 1#1 51 | num_steps = 50 52 | attn_size = 50 53 | hidden_sizeN = 300 54 | hidden_sizeT = 500 55 | sizeH = 800 56 | max_epoch = 1#8 57 | max_max_epoch = 8#79 58 | keep_prob = 1.0#1.0 59 | lr_decay = 0.6#0.95 60 | batch_size = 128#80 61 | 62 | class TestConfig(object): 63 | """Tiny config, for testing.""" 64 | init_scale = 0.05 65 | learning_rate = 0.001 66 | max_grad_norm = 5 67 | num_layers = 1 68 | num_steps = 50 69 | attn_size = 50 70 | hidden_sizeN = 50 71 | hidden_sizeT = 50 72 | sizeH = 100 73 | max_epoch = 1 74 | max_max_epoch = 1 75 | keep_prob = 1.0 76 | lr_decay = 0.6 77 | batch_size = 128 78 | 79 | 80 | def get_config(): 81 | if FLAGS.model == "small": 82 | return SmallConfig() 83 | elif FLAGS.model == "medium": 84 | return MediumConfig() 85 | elif FLAGS.model == "best": 86 | return BestConfig() 87 | elif FLAGS.model == "test": 88 | return TestConfig() 89 | else: 90 | raise ValueError("Invalid model: %s", FLAGS.model) 91 | 92 | 93 | class PTBInput(object): 94 | """The input data.""" 95 | 96 | def __init__(self, config, data, name=None): 97 | self.batch_size = batch_size = config.batch_size 98 | self.attn_size = attn_size = config.attn_size 99 | self.num_steps = num_steps = config.num_steps 100 | self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator = \ 101 | reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name) 102 | if FLAGS.model == "test": 103 | self.epoch_size = 16 #small epoch size for test 104 | 105 | 106 | class PTBModel(object): 107 | 108 | def __init__(self, is_training, config, input_): 109 | self._input = input_ 110 | self.attn_size = attn_size = config.attn_size 111 | batch_size = input_.batch_size 112 | num_steps = input_.num_steps 113 | self.sizeN = sizeN = config.hidden_sizeN 114 | self.sizeT = sizeT = config.hidden_sizeT 115 | self.size = size = config.sizeH 116 | vocab_sizeN, vocab_sizeT = config.vocab_size 117 | 118 | # Slightly better results can be obtained with forget gate biases 119 | # initialized to 1 but the hyperparameters of the model would need to be 120 | # different than reported in the paper. 121 | def lstm_cell(): 122 | if 'reuse' in inspect.getargspec( 123 | tf.contrib.rnn.BasicLSTMCell.__init__).args: 124 | return tf.contrib.rnn.BasicLSTMCell( 125 | size, forget_bias=1.0, state_is_tuple=True, 126 | reuse=tf.get_variable_scope().reuse) 127 | else: 128 | return tf.contrib.rnn.BasicLSTMCell( 129 | size, forget_bias=1.0, state_is_tuple=True) 130 | attn_cell = lstm_cell 131 | if is_training and config.keep_prob < 1: 132 | def attn_cell(): 133 | return tf.contrib.rnn.DropoutWrapper( 134 | lstm_cell(), output_keep_prob=config.keep_prob) 135 | cell = tf.contrib.rnn.MultiRNNCell( 136 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 137 | 138 | state_variables = [] 139 | with tf.variable_scope("myCH0"): 140 | for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())): 141 | if i > 0: tf.get_variable_scope().reuse_variables() 142 | myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer()) 143 | myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer()) 144 | myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)]) 145 | myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)]) 146 | state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor)) 147 | 148 | self._initial_state = state_variables 149 | 150 | self.eof_indicator = input_.eof_indicator 151 | 152 | with tf.device("/cpu:0"): 153 | embeddingN = tf.get_variable( 154 | "embeddingN", [vocab_sizeN, sizeN], dtype=data_type()) 155 | inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN) 156 | 157 | with tf.device("/cpu:0"): 158 | embeddingT = tf.get_variable( 159 | "embeddingT", [vocab_sizeT, sizeT], dtype=data_type()) 160 | inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT) 161 | 162 | inputs = tf.concat([inputsN, inputsT], 2) 163 | #inputs = tf.one_hot(input_.input_data, vocab_size) 164 | if is_training and config.keep_prob < 1: 165 | inputs = tf.nn.dropout(inputs, config.keep_prob) 166 | 167 | outputs = [] 168 | attentions = [] 169 | parents = [] 170 | state = self._initial_state 171 | self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory") 172 | valid_memory = self.memory[:,-attn_size:,:] 173 | # print ("test test test,, state shape", np.array(state).shape) 174 | with tf.variable_scope("RNN"): 175 | for time_step in range(num_steps): 176 | if time_step > 0: tf.get_variable_scope().reuse_variables() 177 | (cell_output, state) = cell(inputs[:, time_step, :], state) 178 | outputs.append(cell_output) 179 | 180 | # parent_index = input_.input_dataP[:, time_step] 181 | # cell_parent = [valid_memory[i,-parent_index[i],:] for i in range(batch_size)] 182 | # parents.append(tf.convert_to_tensor(cell_parent)) 183 | 184 | wm = tf.get_variable("wm", [size, size], dtype=data_type()) 185 | wh = tf.get_variable("wh", [size, size], dtype=data_type()) 186 | wt = tf.get_variable("wt", [size, 1], dtype=data_type()) 187 | gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size])) 188 | alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) 189 | ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1]))) 190 | attentions.append(ct) 191 | valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) 192 | 193 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 194 | attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size]) 195 | # parent = tf.reshape(tf.stack(axis=1, values=parents), [-1, size]) 196 | self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch 197 | wa = tf.get_variable("wa", [size*2, size], dtype=data_type()) 198 | nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa)) 199 | 200 | softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type()) 201 | softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type()) 202 | logits = tf.matmul(nt, softmax_w) + softmax_b 203 | labels = tf.reshape(input_.targetsT, [-1]) 204 | weights = tf.ones([batch_size * num_steps], dtype=data_type()) 205 | 206 | #counting unk as wrong 207 | unk_id = vocab_sizeT - 2 208 | unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape) 209 | zero_weights = tf.zeros_like(labels, dtype=data_type()) 210 | wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape) 211 | condition_tf = tf.equal(labels, unk_tf) 212 | new_weights = tf.where(condition_tf, zero_weights, weights) 213 | new_labels = tf.where(condition_tf, wrong_label, labels) 214 | 215 | 216 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [new_weights]) 217 | probs = tf.nn.softmax(logits) 218 | correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels) 219 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 220 | 221 | self._cost = cost = tf.reduce_sum(loss) / batch_size 222 | self._final_state = state 223 | 224 | if not is_training: 225 | return 226 | 227 | self._lr = tf.Variable(0.0, trainable=False) 228 | tvars = tf.trainable_variables() 229 | print ('tvars', len(tvars)) 230 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 231 | config.max_grad_norm) 232 | print ('*******the length', len(grads), '\n') 233 | optimizer = tf.train.AdamOptimizer(self._lr) 234 | self._train_op = optimizer.apply_gradients( 235 | zip(grads, tvars), 236 | global_step=tf.contrib.framework.get_or_create_global_step()) 237 | 238 | self._new_lr = tf.placeholder( 239 | tf.float32, shape=[], name="new_learning_rate") 240 | self._lr_update = tf.assign(self._lr, self._new_lr) 241 | 242 | def assign_lr(self, session, lr_value): 243 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 244 | 245 | @property 246 | def input(self): 247 | return self._input 248 | 249 | @property 250 | def initial_state(self): 251 | return self._initial_state 252 | 253 | @property 254 | def cost(self): 255 | return self._cost 256 | 257 | @property 258 | def final_state(self): 259 | return self._final_state 260 | 261 | @property 262 | def accuracy(self): 263 | return self._accuracy 264 | 265 | @property 266 | def lr(self): 267 | return self._lr 268 | 269 | @property 270 | def train_op(self): 271 | return self._train_op 272 | 273 | 274 | def run_epoch(session, model, eval_op=None, verbose=False): 275 | start_time = time.time() 276 | costs = 0.0 277 | accuracy_list = [] 278 | iters = 0 279 | state = session.run(model.initial_state) 280 | # print ('at the very initial of the run_epoch\n', state[0].c) 281 | eof_indicator = np.ones((model.input.batch_size), dtype=bool) 282 | memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size]) 283 | # file_id = session.run(model.initial_file_id) #need to remove _ 284 | 285 | fetches = { 286 | "cost": model.cost, 287 | "accuracy": model.accuracy, 288 | "final_state": model.final_state, 289 | "eof_indicator": model.eof_indicator, 290 | "memory":model.output, 291 | } 292 | if eval_op is not None: 293 | fetches["eval_op"] = eval_op 294 | 295 | for step in range(model.input.epoch_size): 296 | feed_dict = {} 297 | # current_file_id = file_id #session.run(model.file_id) 298 | sub_cond = np.expand_dims(eof_indicator, axis = 1) 299 | condition = np.repeat(sub_cond, model.size, axis = 1) 300 | # zero_state = np.zeros_like(condition) 301 | # zero_state = np.random.uniform(-0.05,0.05,condition.shape) 302 | zero_state = session.run(model.initial_state) 303 | 304 | for i, (c, h) in enumerate(model.initial_state): 305 | assert condition.shape == state[i].c.shape 306 | feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c) 307 | feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h) 308 | 309 | feed_dict[model.memory] = memory 310 | vals = session.run(fetches, feed_dict) 311 | 312 | cost = vals["cost"] 313 | accuracy = vals["accuracy"] 314 | eof_indicator = vals["eof_indicator"] 315 | state = vals["final_state"] #use the final state as the initial state within a whole epoch 316 | memory = vals["memory"] 317 | 318 | accuracy_list.append(accuracy) 319 | costs += cost 320 | iters += model.input.num_steps 321 | 322 | if verbose and step % (model.input.epoch_size // 10) == 10: 323 | print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" % 324 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list), 325 | (time.time() - start_time))) 326 | 327 | print ('this run_epoch takes time %.2f' %(time.time() - start_time)) 328 | return np.exp(costs / iters), np.mean(accuracy_list) 329 | 330 | 331 | def main(_): 332 | start_time = time.time() 333 | fout = open(outfile, 'a') 334 | print ('\n', time.asctime(time.localtime()), file=fout) 335 | print ('start a new experiment %s'%outfile, file=fout) 336 | print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout) 337 | 338 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size \ 339 | = reader.input_data(N_filename, T_filename) 340 | 341 | train_data = (train_dataN, train_dataT) 342 | valid_data = (valid_dataN, valid_dataT) 343 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof] 344 | 345 | config = get_config() 346 | assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration 347 | config.vocab_size = vocab_size 348 | eval_config = get_config() 349 | eval_config.batch_size = config.batch_size * config.num_steps 350 | eval_config.num_steps = 1 351 | eval_config.vocab_size = vocab_size 352 | 353 | with tf.Graph().as_default(): 354 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 355 | 356 | with tf.name_scope("Train"): 357 | train_input = PTBInput(config=config, data=train_data, name="TrainInput") 358 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 359 | m = PTBModel(is_training=True, config=config, input_=train_input) 360 | 361 | with tf.name_scope("Valid"): 362 | valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") 363 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 364 | mvalid = PTBModel(is_training=False, config=config, input_=valid_input) 365 | 366 | # with tf.name_scope("Test"): 367 | # test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput") 368 | # with tf.variable_scope("Model", reuse=True, initializer=initializer): 369 | # mtest = PTBModel(is_training=False, config=eval_config, 370 | # input_=test_input) 371 | 372 | 373 | print ('total trainable variables', len(tf.trainable_variables()), '\n\n') 374 | max_valid = 0 375 | max_step = 0 376 | saver = tf.train.Saver() 377 | 378 | sv = tf.train.Supervisor(logdir=None, summary_op=None) 379 | with sv.managed_session() as session: 380 | 381 | for i in range(config.max_max_epoch): 382 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 383 | m.assign_lr(session, config.learning_rate * lr_decay) 384 | print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr))) 385 | 386 | train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True) 387 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy)) 388 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout) 389 | 390 | if i > 5: 391 | valid_perplexity, valid_accuracy = run_epoch(session, mvalid) 392 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy)) 393 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout) 394 | if valid_accuracy > max_valid: 395 | max_valid = valid_accuracy 396 | max_step = i + 1 397 | 398 | # test_perplexity, test_accuracy = run_epoch(session, mtest) 399 | # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy)) 400 | 401 | print ('max step %d, max valid %.3f' %(max_step, max_valid)) 402 | # print ('data path is', FLAGS.data_path) 403 | print ('total time takes', time.time()-start_time) 404 | print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout) 405 | print ('total time takes', time.time()-start_time, file=fout) 406 | fout.close() 407 | 408 | # if FLAGS.save_path: 409 | # print("Saving model to %s." % FLAGS.save_path) 410 | # save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False) 411 | 412 | 413 | if __name__ == "__main__": 414 | tf.app.run() 415 | -------------------------------------------------------------------------------- /code/attention_N.py: -------------------------------------------------------------------------------- 1 | # attentional LSTM, predict non terminal 2 | # what we exactly use 3 | # use reader_pointer_original, without parent 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import inspect 10 | import time 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | import reader_pointer_original as reader 16 | import os 17 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 18 | 19 | os.environ['CUDA_VISIBLE_DEVICES']='0' 20 | outfile = 'output_attention_N.txt' 21 | 22 | N_filename = '../pickle_data/PY_non_terminal.pickle' 23 | T_filename = '../pickle_data/PY_terminal_50k_whole.pickle' 24 | 25 | flags = tf.flags 26 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A' 27 | "Model output directory.") 28 | 29 | flags.DEFINE_string( 30 | "model", "small", 31 | "A type of model. Possible options are: small, medium, best.") 32 | # flags.DEFINE_string("data_path", '../data/dataJS', 33 | # "Where the training/test data is stored.") 34 | flags.DEFINE_bool("use_fp16", False, 35 | "Train using 16-bit floats instead of 32bit floats") 36 | 37 | FLAGS = flags.FLAGS 38 | logging = tf.logging 39 | 40 | if FLAGS.model == "test": 41 | outfile = 'TESToutput.txt' 42 | def data_type(): 43 | return tf.float16 if FLAGS.use_fp16 else tf.float32 44 | 45 | class SmallConfig(object): 46 | """Small config. get best result as 0.733 """ 47 | init_scale = 0.05 48 | learning_rate = 0.001 49 | max_grad_norm = 5 50 | num_layers = 1#1 51 | num_steps = 50 52 | attn_size = 50 53 | hidden_sizeN = 50 54 | hidden_sizeT = 50 55 | sizeH = 100 56 | max_epoch = 1#8 57 | max_max_epoch = 8#79 58 | keep_prob = 1.0#1.0 59 | lr_decay = 0.6#0.95 60 | batch_size = 128#80 61 | 62 | class TestConfig(object): 63 | """Tiny config, for testing.""" 64 | init_scale = 0.05 65 | learning_rate = 0.001 66 | max_grad_norm = 5 67 | num_layers = 1 68 | num_steps = 50 69 | attn_size = 50 70 | hidden_sizeN = 50 71 | hidden_sizeT = 50 72 | sizeH = 100 73 | max_epoch = 1 74 | max_max_epoch = 1 75 | keep_prob = 1.0 76 | lr_decay = 0.6 77 | batch_size = 128 78 | 79 | 80 | def get_config(): 81 | if FLAGS.model == "small": 82 | return SmallConfig() 83 | elif FLAGS.model == "medium": 84 | return MediumConfig() 85 | elif FLAGS.model == "best": 86 | return BestConfig() 87 | elif FLAGS.model == "test": 88 | return TestConfig() 89 | else: 90 | raise ValueError("Invalid model: %s", FLAGS.model) 91 | 92 | 93 | class PTBInput(object): 94 | """The input data.""" 95 | 96 | def __init__(self, config, data, name=None): 97 | self.batch_size = batch_size = config.batch_size 98 | self.attn_size = attn_size = config.attn_size 99 | self.num_steps = num_steps = config.num_steps 100 | self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator = \ 101 | reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name) 102 | if FLAGS.model == "test": 103 | self.epoch_size = 16 #small epoch size for test 104 | 105 | 106 | class PTBModel(object): 107 | 108 | def __init__(self, is_training, config, input_): 109 | self._input = input_ 110 | self.attn_size = attn_size = config.attn_size 111 | batch_size = input_.batch_size 112 | num_steps = input_.num_steps 113 | self.sizeN = sizeN = config.hidden_sizeN 114 | self.sizeT = sizeT = config.hidden_sizeT 115 | self.size = size = config.sizeH 116 | vocab_sizeN, vocab_sizeT = config.vocab_size 117 | 118 | # Slightly better results can be obtained with forget gate biases 119 | # initialized to 1 but the hyperparameters of the model would need to be 120 | # different than reported in the paper. 121 | def lstm_cell(): 122 | if 'reuse' in inspect.getargspec( 123 | tf.contrib.rnn.BasicLSTMCell.__init__).args: 124 | return tf.contrib.rnn.BasicLSTMCell( 125 | size, forget_bias=1.0, state_is_tuple=True, 126 | reuse=tf.get_variable_scope().reuse) 127 | else: 128 | return tf.contrib.rnn.BasicLSTMCell( 129 | size, forget_bias=1.0, state_is_tuple=True) 130 | attn_cell = lstm_cell 131 | if is_training and config.keep_prob < 1: 132 | def attn_cell(): 133 | return tf.contrib.rnn.DropoutWrapper( 134 | lstm_cell(), output_keep_prob=config.keep_prob) 135 | cell = tf.contrib.rnn.MultiRNNCell( 136 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 137 | 138 | state_variables = [] 139 | with tf.variable_scope("myCH0"): 140 | for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())): 141 | if i > 0: tf.get_variable_scope().reuse_variables() 142 | myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer()) 143 | myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer()) 144 | myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)]) 145 | myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)]) 146 | state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor)) 147 | 148 | self._initial_state = state_variables 149 | 150 | self.eof_indicator = input_.eof_indicator 151 | 152 | with tf.device("/cpu:0"): 153 | embeddingN = tf.get_variable( 154 | "embeddingN", [vocab_sizeN, sizeN], dtype=data_type()) 155 | inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN) 156 | 157 | with tf.device("/cpu:0"): 158 | embeddingT = tf.get_variable( 159 | "embeddingT", [vocab_sizeT, sizeT], dtype=data_type()) 160 | inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT) 161 | 162 | inputs = tf.concat([inputsN, inputsT], 2) 163 | #inputs = tf.one_hot(input_.input_data, vocab_size) 164 | if is_training and config.keep_prob < 1: 165 | inputs = tf.nn.dropout(inputs, config.keep_prob) 166 | 167 | outputs = [] 168 | attentions = [] 169 | state = self._initial_state 170 | self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory") 171 | valid_memory = self.memory[:,-attn_size:,:] 172 | # print ("test test test,, state shape", np.array(state).shape) 173 | with tf.variable_scope("RNN"): 174 | for time_step in range(num_steps): 175 | if time_step > 0: tf.get_variable_scope().reuse_variables() 176 | (cell_output, state) = cell(inputs[:, time_step, :], state) 177 | outputs.append(cell_output) 178 | 179 | wm = tf.get_variable("wm", [size, size], dtype=data_type()) 180 | wh = tf.get_variable("wh", [size, size], dtype=data_type()) 181 | wt = tf.get_variable("wt", [size, 1], dtype=data_type()) 182 | gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size])) 183 | alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) 184 | ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1]))) 185 | attentions.append(ct) 186 | valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) 187 | 188 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 189 | attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size]) 190 | self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch 191 | wa = tf.get_variable("wa", [size*2, size], dtype=data_type()) 192 | nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa)) 193 | 194 | softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeN], dtype=data_type()) 195 | softmax_b = tf.get_variable("softmax_b", [vocab_sizeN], dtype=data_type()) 196 | logits = tf.matmul(nt, softmax_w) + softmax_b 197 | labels = tf.reshape(input_.targetsN, [-1]) 198 | weights = tf.ones([batch_size * num_steps], dtype=data_type()) 199 | 200 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [weights]) 201 | probs = tf.nn.softmax(logits) 202 | correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), labels) 203 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 204 | 205 | self._cost = cost = tf.reduce_sum(loss) / batch_size 206 | self._final_state = state 207 | 208 | if not is_training: 209 | return 210 | 211 | self._lr = tf.Variable(0.0, trainable=False) 212 | tvars = tf.trainable_variables() 213 | print ('tvars', len(tvars)) 214 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 215 | config.max_grad_norm) 216 | print ('*******the length', len(grads)) 217 | optimizer = tf.train.AdamOptimizer(self._lr) 218 | self._train_op = optimizer.apply_gradients( 219 | zip(grads, tvars), 220 | global_step=tf.contrib.framework.get_or_create_global_step()) 221 | 222 | self._new_lr = tf.placeholder( 223 | tf.float32, shape=[], name="new_learning_rate") 224 | self._lr_update = tf.assign(self._lr, self._new_lr) 225 | 226 | def assign_lr(self, session, lr_value): 227 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 228 | 229 | @property 230 | def input(self): 231 | return self._input 232 | 233 | @property 234 | def initial_state(self): 235 | return self._initial_state 236 | 237 | @property 238 | def cost(self): 239 | return self._cost 240 | 241 | @property 242 | def final_state(self): 243 | return self._final_state 244 | 245 | @property 246 | def accuracy(self): 247 | return self._accuracy 248 | 249 | @property 250 | def lr(self): 251 | return self._lr 252 | 253 | @property 254 | def train_op(self): 255 | return self._train_op 256 | 257 | 258 | def run_epoch(session, model, eval_op=None, verbose=False): 259 | """Runs the model on the given data.""" 260 | start_time = time.time() 261 | costs = 0.0 262 | accuracy_list = [] 263 | iters = 0 264 | state = session.run(model.initial_state) 265 | # print ('at the very initial of the run_epoch\n', state[0].c) 266 | eof_indicator = np.ones((model.input.batch_size), dtype=bool) 267 | memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size]) 268 | # file_id = session.run(model.initial_file_id) #need to remove _ 269 | 270 | fetches = { 271 | "cost": model.cost, 272 | "accuracy": model.accuracy, 273 | "final_state": model.final_state, 274 | "eof_indicator": model.eof_indicator, 275 | "memory":model.output, 276 | } 277 | if eval_op is not None: 278 | fetches["eval_op"] = eval_op 279 | 280 | for step in range(model.input.epoch_size): 281 | feed_dict = {} 282 | # current_file_id = file_id #session.run(model.file_id) 283 | sub_cond = np.expand_dims(eof_indicator, axis = 1) 284 | condition = np.repeat(sub_cond, model.size, axis = 1) 285 | # zero_state = np.zeros_like(condition) 286 | # zero_state = np.random.uniform(-0.05,0.05,condition.shape) 287 | zero_state = session.run(model.initial_state) 288 | 289 | for i, (c, h) in enumerate(model.initial_state): 290 | assert condition.shape == state[i].c.shape 291 | feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c) 292 | feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h) 293 | 294 | feed_dict[model.memory] = memory 295 | vals = session.run(fetches, feed_dict) 296 | 297 | cost = vals["cost"] 298 | accuracy = vals["accuracy"] 299 | eof_indicator = vals["eof_indicator"] 300 | state = vals["final_state"] #use the final state as the initial state within a whole epoch 301 | memory = vals["memory"] 302 | 303 | accuracy_list.append(accuracy) 304 | costs += cost 305 | iters += model.input.num_steps 306 | 307 | if verbose and step % (model.input.epoch_size // 10) == 10: 308 | print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" % 309 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list), 310 | (time.time() - start_time))) 311 | 312 | print ('this run_epoch takes time %.2f' %(time.time() - start_time)) 313 | return np.exp(costs / iters), np.mean(accuracy_list) 314 | 315 | 316 | def main(_): 317 | start_time = time.time() 318 | fout = open(outfile, 'a') 319 | print ('\n', time.asctime(time.localtime()), file=fout) 320 | print ('start a new experiment %s'%outfile, file=fout) 321 | print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout) 322 | 323 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size \ 324 | = reader.input_data(N_filename, T_filename) 325 | 326 | train_data = (train_dataN, train_dataT) 327 | valid_data = (valid_dataN, valid_dataT) 328 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof] 329 | 330 | config = get_config() 331 | assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration 332 | config.vocab_size = vocab_size 333 | eval_config = get_config() 334 | eval_config.batch_size = config.batch_size * config.num_steps 335 | eval_config.num_steps = 1 336 | eval_config.vocab_size = vocab_size 337 | 338 | with tf.Graph().as_default(): 339 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 340 | 341 | with tf.name_scope("Train"): 342 | train_input = PTBInput(config=config, data=train_data, name="TrainInput") 343 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 344 | m = PTBModel(is_training=True, config=config, input_=train_input) 345 | 346 | with tf.name_scope("Valid"): 347 | valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") 348 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 349 | mvalid = PTBModel(is_training=False, config=config, input_=valid_input) 350 | 351 | # with tf.name_scope("Test"): 352 | # test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput") 353 | # with tf.variable_scope("Model", reuse=True, initializer=initializer): 354 | # mtest = PTBModel(is_training=False, config=eval_config, 355 | # input_=test_input) 356 | 357 | 358 | print ('total trainable variables', len(tf.trainable_variables()), '\n\n') 359 | max_valid = 0 360 | max_step = 0 361 | saver = tf.train.Saver() 362 | 363 | sv = tf.train.Supervisor(logdir=None, summary_op=None) 364 | with sv.managed_session() as session: 365 | 366 | for i in range(config.max_max_epoch): 367 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 368 | m.assign_lr(session, config.learning_rate * lr_decay) 369 | print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr))) 370 | 371 | train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True) 372 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy)) 373 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout) 374 | 375 | if i > 5: 376 | valid_perplexity, valid_accuracy = run_epoch(session, mvalid) 377 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy)) 378 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout) 379 | if valid_accuracy > max_valid: 380 | max_valid = valid_accuracy 381 | max_step = i + 1 382 | 383 | # test_perplexity, test_accuracy = run_epoch(session, mtest) 384 | # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy)) 385 | 386 | print ('max step %d, max valid %.3f' %(max_step, max_valid)) 387 | # print ('data path is', FLAGS.data_path) 388 | print ('total time takes', time.time()-start_time) 389 | print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout) 390 | print ('total time takes', time.time()-start_time, file=fout) 391 | fout.close() 392 | 393 | # if FLAGS.save_path: 394 | # print("Saving model to %s." % FLAGS.save_path) 395 | # save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False) 396 | 397 | 398 | if __name__ == "__main__": 399 | tf.app.run() 400 | -------------------------------------------------------------------------------- /code/attention_N_parent.py: -------------------------------------------------------------------------------- 1 | # attentional LSTM, predict non terminal 2 | # what we exactly use 3 | # revise 01/09, add parent hidden states at output 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import inspect 10 | import time 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | import reader_pointer as reader 16 | import os 17 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 18 | 19 | os.environ['CUDA_VISIBLE_DEVICES']='0' 20 | outfile = 'output_attention_parent_N.txt' 21 | 22 | N_filename = '../pickle_data/PY_non_terminal.pickle' 23 | T_filename = '../pickle_data/PY_terminal_50k_whole.pickle' 24 | 25 | flags = tf.flags 26 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A' 27 | "Model output directory.") 28 | 29 | flags.DEFINE_string( 30 | "model", "small", 31 | "A type of model. Possible options are: small, medium, best.") 32 | # flags.DEFINE_string("data_path", '../data/dataJS', 33 | # "Where the training/test data is stored.") 34 | flags.DEFINE_bool("use_fp16", False, 35 | "Train using 16-bit floats instead of 32bit floats") 36 | 37 | FLAGS = flags.FLAGS 38 | logging = tf.logging 39 | 40 | if FLAGS.model == "test": 41 | outfile = 'TESToutput.txt' 42 | def data_type(): 43 | return tf.float16 if FLAGS.use_fp16 else tf.float32 44 | 45 | class SmallConfig(object): 46 | """Small config. get best result as 0.733 """ 47 | init_scale = 0.05 48 | learning_rate = 0.001 49 | max_grad_norm = 5 50 | num_layers = 1#1 51 | num_steps = 50 52 | attn_size = 50 53 | hidden_sizeN = 50 54 | hidden_sizeT = 50 55 | sizeH = 100 56 | max_epoch = 1#8 57 | max_max_epoch = 8#79 58 | keep_prob = 1.0#1.0 59 | lr_decay = 0.6#0.95 60 | batch_size = 80#80 61 | 62 | class TestConfig(object): 63 | """Tiny config, for testing.""" 64 | init_scale = 0.05 65 | learning_rate = 0.001 66 | max_grad_norm = 5 67 | num_layers = 1 68 | num_steps = 50 69 | attn_size = 50 70 | hidden_sizeN = 50 71 | hidden_sizeT = 50 72 | sizeH = 100 73 | max_epoch = 1 74 | max_max_epoch = 1 75 | keep_prob = 1.0 76 | lr_decay = 0.6 77 | batch_size = 80 78 | 79 | 80 | def get_config(): 81 | if FLAGS.model == "small": 82 | return SmallConfig() 83 | elif FLAGS.model == "medium": 84 | return MediumConfig() 85 | elif FLAGS.model == "best": 86 | return BestConfig() 87 | elif FLAGS.model == "test": 88 | return TestConfig() 89 | else: 90 | raise ValueError("Invalid model: %s", FLAGS.model) 91 | 92 | 93 | class PTBInput(object): 94 | """The input data.""" 95 | 96 | def __init__(self, config, data, name=None): 97 | self.batch_size = batch_size = config.batch_size 98 | self.attn_size = attn_size = config.attn_size 99 | self.num_steps = num_steps = config.num_steps 100 | self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator, self.input_dataP = \ 101 | reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name) 102 | if FLAGS.model == "test": 103 | self.epoch_size = 16 #small epoch size for test 104 | 105 | 106 | class PTBModel(object): 107 | 108 | def __init__(self, is_training, config, input_): 109 | self._input = input_ 110 | self.attn_size = attn_size = config.attn_size 111 | batch_size = input_.batch_size 112 | num_steps = input_.num_steps 113 | self.sizeN = sizeN = config.hidden_sizeN 114 | self.sizeT = sizeT = config.hidden_sizeT 115 | self.size = size = config.sizeH 116 | vocab_sizeN, vocab_sizeT = config.vocab_size 117 | 118 | # Slightly better results can be obtained with forget gate biases 119 | # initialized to 1 but the hyperparameters of the model would need to be 120 | # different than reported in the paper. 121 | def lstm_cell(): 122 | if 'reuse' in inspect.getargspec( 123 | tf.contrib.rnn.BasicLSTMCell.__init__).args: 124 | return tf.contrib.rnn.BasicLSTMCell( 125 | size, forget_bias=1.0, state_is_tuple=True, 126 | reuse=tf.get_variable_scope().reuse) 127 | else: 128 | return tf.contrib.rnn.BasicLSTMCell( 129 | size, forget_bias=1.0, state_is_tuple=True) 130 | attn_cell = lstm_cell 131 | if is_training and config.keep_prob < 1: 132 | def attn_cell(): 133 | return tf.contrib.rnn.DropoutWrapper( 134 | lstm_cell(), output_keep_prob=config.keep_prob) 135 | cell = tf.contrib.rnn.MultiRNNCell( 136 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 137 | 138 | state_variables = [] 139 | with tf.variable_scope("myCH0"): 140 | for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())): 141 | if i > 0: tf.get_variable_scope().reuse_variables() 142 | myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer()) 143 | myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer()) 144 | myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)]) 145 | myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)]) 146 | state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor)) 147 | 148 | self._initial_state = state_variables 149 | 150 | self.eof_indicator = input_.eof_indicator 151 | 152 | with tf.device("/cpu:0"): 153 | embeddingN = tf.get_variable( 154 | "embeddingN", [vocab_sizeN, sizeN], dtype=data_type()) 155 | inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN) 156 | inputsP = tf.nn.embedding_lookup(embeddingN, input_.input_dataP) 157 | 158 | with tf.device("/cpu:0"): 159 | embeddingT = tf.get_variable( 160 | "embeddingT", [vocab_sizeT, sizeT], dtype=data_type()) 161 | inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT) 162 | 163 | inputs = tf.concat([inputsN, inputsT], 2) 164 | #inputs = tf.one_hot(input_.input_data, vocab_size) 165 | if is_training and config.keep_prob < 1: 166 | inputs = tf.nn.dropout(inputs, config.keep_prob) 167 | 168 | outputs = [] 169 | attentions = [] 170 | parents = [] 171 | state = self._initial_state 172 | self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory") 173 | valid_memory = self.memory[:,-attn_size:,:] 174 | # print ("test test test,, state shape", np.array(state).shape) 175 | with tf.variable_scope("RNN"): 176 | for time_step in range(num_steps): 177 | if time_step > 0: tf.get_variable_scope().reuse_variables() 178 | (cell_output, state) = cell(inputs[:, time_step, :], state) 179 | outputs.append(cell_output) 180 | 181 | # parent_index = input_.input_dataP[:, time_step] # retrieval parent hidden state in batch 182 | # cell_parent = tf.convert_to_tensor([valid_memory[i,-parent_index[i],:] for i in range(batch_size)]) 183 | cell_parent = inputsP[:, time_step, :] 184 | parents.append(cell_parent) 185 | 186 | wm = tf.get_variable("wm", [size, size], dtype=data_type()) 187 | wh = tf.get_variable("wh", [size, size], dtype=data_type()) 188 | wt = tf.get_variable("wt", [size, 1], dtype=data_type()) 189 | gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size])) 190 | alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) 191 | ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1]))) 192 | attentions.append(ct) 193 | valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) 194 | 195 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 196 | attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size]) 197 | parent = tf.reshape(tf.stack(axis=1, values=parents), [-1, sizeN]) 198 | zeros_parent = tf.zeros_like(parent, dtype=data_type()) 199 | 200 | self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch 201 | wa = tf.get_variable("wa", [size*2+sizeN, size], dtype=data_type()) # feed parent at the output 202 | nt = tf.tanh(tf.matmul(tf.concat([output, attention, parent], axis=1), wa)) 203 | 204 | softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeN], dtype=data_type()) 205 | softmax_b = tf.get_variable("softmax_b", [vocab_sizeN], dtype=data_type()) 206 | logits = tf.matmul(nt, softmax_w) + softmax_b 207 | labels = tf.reshape(input_.targetsN, [-1]) 208 | weights = tf.ones([batch_size * num_steps], dtype=data_type()) 209 | 210 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [weights]) 211 | probs = tf.nn.softmax(logits) 212 | correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), labels) 213 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 214 | 215 | self._cost = cost = tf.reduce_sum(loss) / batch_size 216 | self._final_state = state 217 | 218 | if not is_training: 219 | return 220 | 221 | self._lr = tf.Variable(0.0, trainable=False) 222 | tvars = tf.trainable_variables() 223 | print ('tvars', len(tvars)) 224 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 225 | config.max_grad_norm) 226 | print ('*******the length', len(grads)) 227 | optimizer = tf.train.AdamOptimizer(self._lr) 228 | self._train_op = optimizer.apply_gradients( 229 | zip(grads, tvars), 230 | global_step=tf.contrib.framework.get_or_create_global_step()) 231 | 232 | self._new_lr = tf.placeholder( 233 | tf.float32, shape=[], name="new_learning_rate") 234 | self._lr_update = tf.assign(self._lr, self._new_lr) 235 | 236 | def assign_lr(self, session, lr_value): 237 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 238 | 239 | @property 240 | def input(self): 241 | return self._input 242 | 243 | @property 244 | def initial_state(self): 245 | return self._initial_state 246 | 247 | @property 248 | def cost(self): 249 | return self._cost 250 | 251 | @property 252 | def final_state(self): 253 | return self._final_state 254 | 255 | @property 256 | def accuracy(self): 257 | return self._accuracy 258 | 259 | @property 260 | def lr(self): 261 | return self._lr 262 | 263 | @property 264 | def train_op(self): 265 | return self._train_op 266 | 267 | 268 | def run_epoch(session, model, eval_op=None, verbose=False): 269 | """Runs the model on the given data.""" 270 | start_time = time.time() 271 | costs = 0.0 272 | accuracy_list = [] 273 | iters = 0 274 | state = session.run(model.initial_state) 275 | # print ('at the very initial of the run_epoch\n', state[0].c) 276 | eof_indicator = np.ones((model.input.batch_size), dtype=bool) 277 | memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size]) 278 | # file_id = session.run(model.initial_file_id) #need to remove _ 279 | 280 | fetches = { 281 | "cost": model.cost, 282 | "accuracy": model.accuracy, 283 | "final_state": model.final_state, 284 | "eof_indicator": model.eof_indicator, 285 | "memory":model.output, 286 | } 287 | if eval_op is not None: 288 | fetches["eval_op"] = eval_op 289 | 290 | for step in range(model.input.epoch_size): 291 | feed_dict = {} 292 | # current_file_id = file_id #session.run(model.file_id) 293 | sub_cond = np.expand_dims(eof_indicator, axis = 1) 294 | condition = np.repeat(sub_cond, model.size, axis = 1) 295 | # zero_state = np.zeros_like(condition) 296 | # zero_state = np.random.uniform(-0.05,0.05,condition.shape) 297 | zero_state = session.run(model.initial_state) 298 | 299 | for i, (c, h) in enumerate(model.initial_state): 300 | assert condition.shape == state[i].c.shape 301 | feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c) 302 | feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h) 303 | 304 | feed_dict[model.memory] = memory 305 | vals = session.run(fetches, feed_dict) 306 | 307 | cost = vals["cost"] 308 | accuracy = vals["accuracy"] 309 | eof_indicator = vals["eof_indicator"] 310 | state = vals["final_state"] #use the final state as the initial state within a whole epoch 311 | memory = vals["memory"] 312 | 313 | accuracy_list.append(accuracy) 314 | costs += cost 315 | iters += model.input.num_steps 316 | 317 | if verbose and step % (model.input.epoch_size // 10) == 10: 318 | print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" % 319 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list), 320 | (time.time() - start_time))) 321 | 322 | print ('this run_epoch takes time %.2f' %(time.time() - start_time)) 323 | return np.exp(costs / iters), np.mean(accuracy_list) 324 | 325 | 326 | def main(_): 327 | start_time = time.time() 328 | fout = open(outfile, 'a') 329 | print ('\n', time.asctime(time.localtime()), file=fout) 330 | print ('start a new experiment %s'%outfile, file=fout) 331 | print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout) 332 | 333 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size, train_dataP, valid_dataP \ 334 | = reader.input_data(N_filename, T_filename) 335 | 336 | train_data = (train_dataN, train_dataT, train_dataP) 337 | valid_data = (valid_dataN, valid_dataT, valid_dataP) 338 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof] 339 | 340 | config = get_config() 341 | assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration 342 | config.vocab_size = vocab_size 343 | eval_config = get_config() 344 | eval_config.batch_size = config.batch_size * config.num_steps 345 | eval_config.num_steps = 1 346 | eval_config.vocab_size = vocab_size 347 | 348 | with tf.Graph().as_default(): 349 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 350 | 351 | with tf.name_scope("Train"): 352 | train_input = PTBInput(config=config, data=train_data, name="TrainInput") 353 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 354 | m = PTBModel(is_training=True, config=config, input_=train_input) 355 | 356 | with tf.name_scope("Valid"): 357 | valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") 358 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 359 | mvalid = PTBModel(is_training=False, config=config, input_=valid_input) 360 | 361 | # with tf.name_scope("Test"): 362 | # test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput") 363 | # with tf.variable_scope("Model", reuse=True, initializer=initializer): 364 | # mtest = PTBModel(is_training=False, config=eval_config, 365 | # input_=test_input) 366 | 367 | 368 | print ('total trainable variables', len(tf.trainable_variables()), '\n\n') 369 | max_valid = 0 370 | max_step = 0 371 | saver = tf.train.Saver() 372 | 373 | sv = tf.train.Supervisor(logdir=None, summary_op=None) 374 | with sv.managed_session() as session: 375 | 376 | for i in range(config.max_max_epoch): 377 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 378 | m.assign_lr(session, config.learning_rate * lr_decay) 379 | print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr))) 380 | 381 | train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True) 382 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy)) 383 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout) 384 | 385 | if i > 5: 386 | valid_perplexity, valid_accuracy = run_epoch(session, mvalid) 387 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy)) 388 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout) 389 | if valid_accuracy > max_valid: 390 | max_valid = valid_accuracy 391 | max_step = i + 1 392 | 393 | # test_perplexity, test_accuracy = run_epoch(session, mtest) 394 | # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy)) 395 | 396 | print ('max step %d, max valid %.3f' %(max_step, max_valid)) 397 | # print ('data path is', FLAGS.data_path) 398 | print ('total time takes', time.time()-start_time) 399 | print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout) 400 | print ('total time takes', time.time()-start_time, file=fout) 401 | fout.close() 402 | 403 | # if FLAGS.save_path: 404 | # print("Saving model to %s." % FLAGS.save_path) 405 | # save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False) 406 | 407 | 408 | if __name__ == "__main__": 409 | tf.app.run() 410 | -------------------------------------------------------------------------------- /code/attention_parent.py: -------------------------------------------------------------------------------- 1 | # attentional LSTM, count all unk as wrong, default predict termianl 2 | # what we exactly use 3 | # revise 01/09, add parent at output 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import inspect 10 | import time 11 | 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | import reader_pointer as reader 16 | import os 17 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 18 | 19 | os.environ['CUDA_VISIBLE_DEVICES']='0' 20 | outfile = 'output_attention_parent.txt' 21 | 22 | N_filename = '../pickle_data/PY_non_terminal.pickle' 23 | T_filename = '../pickle_data/PY_terminal_50k_whole.pickle' 24 | 25 | flags = tf.flags 26 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A' 27 | "Model output directory.") 28 | 29 | flags.DEFINE_string( 30 | "model", "small", 31 | "A type of model. Possible options are: small, medium, best.") 32 | # flags.DEFINE_string("data_path", '../data/dataJS', 33 | # "Where the training/test data is stored.") 34 | flags.DEFINE_bool("use_fp16", False, 35 | "Train using 16-bit floats instead of 32bit floats") 36 | 37 | FLAGS = flags.FLAGS 38 | logging = tf.logging 39 | 40 | if FLAGS.model == "test": 41 | outfile = 'TESToutput.txt' 42 | def data_type(): 43 | return tf.float16 if FLAGS.use_fp16 else tf.float32 44 | 45 | class SmallConfig(object): 46 | """Small config. get best result as 0.733 """ 47 | init_scale = 0.05 48 | learning_rate = 0.001 49 | max_grad_norm = 5 50 | num_layers = 1#1 51 | num_steps = 50 52 | attn_size = 50 53 | hidden_sizeN = 300 54 | hidden_sizeT = 500 55 | sizeH = 800 56 | max_epoch = 1#8 57 | max_max_epoch = 8#79 58 | keep_prob = 1.0#1.0 59 | lr_decay = 0.6#0.95 60 | batch_size = 128#80 61 | 62 | class TestConfig(object): 63 | """Tiny config, for testing.""" 64 | init_scale = 0.05 65 | learning_rate = 0.001 66 | max_grad_norm = 5 67 | num_layers = 1 68 | num_steps = 50 69 | attn_size = 50 70 | hidden_sizeN = 50 71 | hidden_sizeT = 50 72 | sizeH = 100 73 | max_epoch = 1 74 | max_max_epoch = 1 75 | keep_prob = 1.0 76 | lr_decay = 0.6 77 | batch_size = 128 78 | 79 | 80 | def get_config(): 81 | if FLAGS.model == "small": 82 | return SmallConfig() 83 | elif FLAGS.model == "medium": 84 | return MediumConfig() 85 | elif FLAGS.model == "best": 86 | return BestConfig() 87 | elif FLAGS.model == "test": 88 | return TestConfig() 89 | else: 90 | raise ValueError("Invalid model: %s", FLAGS.model) 91 | 92 | 93 | class PTBInput(object): 94 | """The input data.""" 95 | 96 | def __init__(self, config, data, name=None): 97 | self.batch_size = batch_size = config.batch_size 98 | self.attn_size = attn_size = config.attn_size 99 | self.num_steps = num_steps = config.num_steps 100 | self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator, self.input_dataP = \ 101 | reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name) 102 | if FLAGS.model == "test": 103 | self.epoch_size = 16 #small epoch size for test 104 | 105 | 106 | class PTBModel(object): 107 | 108 | def __init__(self, is_training, config, input_): 109 | self._input = input_ 110 | self.attn_size = attn_size = config.attn_size 111 | batch_size = input_.batch_size 112 | num_steps = input_.num_steps 113 | self.sizeN = sizeN = config.hidden_sizeN 114 | self.sizeT = sizeT = config.hidden_sizeT 115 | self.size = size = config.sizeH 116 | vocab_sizeN, vocab_sizeT = config.vocab_size 117 | 118 | # Slightly better results can be obtained with forget gate biases 119 | # initialized to 1 but the hyperparameters of the model would need to be 120 | # different than reported in the paper. 121 | def lstm_cell(): 122 | if 'reuse' in inspect.getargspec( 123 | tf.contrib.rnn.BasicLSTMCell.__init__).args: 124 | return tf.contrib.rnn.BasicLSTMCell( 125 | size, forget_bias=1.0, state_is_tuple=True, 126 | reuse=tf.get_variable_scope().reuse) 127 | else: 128 | return tf.contrib.rnn.BasicLSTMCell( 129 | size, forget_bias=1.0, state_is_tuple=True) 130 | attn_cell = lstm_cell 131 | if is_training and config.keep_prob < 1: 132 | def attn_cell(): 133 | return tf.contrib.rnn.DropoutWrapper( 134 | lstm_cell(), output_keep_prob=config.keep_prob) 135 | cell = tf.contrib.rnn.MultiRNNCell( 136 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 137 | 138 | state_variables = [] 139 | with tf.variable_scope("myCH0"): 140 | for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())): 141 | if i > 0: tf.get_variable_scope().reuse_variables() 142 | myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer()) 143 | myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer()) 144 | myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)]) 145 | myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)]) 146 | state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor)) 147 | 148 | self._initial_state = state_variables 149 | 150 | self.eof_indicator = input_.eof_indicator 151 | 152 | with tf.device("/cpu:0"): 153 | embeddingN = tf.get_variable( 154 | "embeddingN", [vocab_sizeN, sizeN], dtype=data_type()) 155 | inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN) 156 | inputsP = tf.nn.embedding_lookup(embeddingN, input_.input_dataP) 157 | 158 | with tf.device("/cpu:0"): 159 | embeddingT = tf.get_variable( 160 | "embeddingT", [vocab_sizeT, sizeT], dtype=data_type()) 161 | inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT) 162 | 163 | inputs = tf.concat([inputsN, inputsT], 2) ## feed parent at the input 164 | #inputs = tf.one_hot(input_.input_data, vocab_size) 165 | if is_training and config.keep_prob < 1: 166 | inputs = tf.nn.dropout(inputs, config.keep_prob) 167 | 168 | outputs = [] 169 | attentions = [] 170 | parents = [] 171 | state = self._initial_state 172 | self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory") 173 | valid_memory = self.memory[:,-attn_size:,:] 174 | # print ("test test test,, state shape", np.array(state).shape) 175 | with tf.variable_scope("RNN"): 176 | for time_step in range(num_steps): 177 | if time_step > 0: tf.get_variable_scope().reuse_variables() 178 | (cell_output, state) = cell(inputs[:, time_step, :], state) 179 | outputs.append(cell_output) 180 | 181 | # parent_index = input_.input_dataP[:, time_step] # retrieval parent hidden state in batch 182 | # cell_parent = tf.convert_to_tensor([valid_memory[i,-parent_index[i],:] for i in range(batch_size)]) 183 | cell_parent = inputsP[:, time_step, :] 184 | parents.append(cell_parent) 185 | 186 | wm = tf.get_variable("wm", [size, size], dtype=data_type()) 187 | wh = tf.get_variable("wh", [size, size], dtype=data_type()) 188 | wt = tf.get_variable("wt", [size, 1], dtype=data_type()) 189 | gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size])) 190 | alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) 191 | ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1]))) 192 | attentions.append(ct) 193 | valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) 194 | 195 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 196 | attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size]) 197 | parent = tf.reshape(tf.stack(axis=1, values=parents), [-1, sizeN]) 198 | 199 | self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch 200 | wa = tf.get_variable("wa", [size*2+sizeN, size], dtype=data_type()) 201 | nt = tf.tanh(tf.matmul(tf.concat([output, attention, parent], axis=1), wa)) 202 | 203 | softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type()) 204 | softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type()) 205 | logits = tf.matmul(nt, softmax_w) + softmax_b 206 | labels = tf.reshape(input_.targetsT, [-1]) 207 | weights = tf.ones([batch_size * num_steps], dtype=data_type()) 208 | 209 | #counting unk as wrong 210 | unk_id = vocab_sizeT - 2 211 | unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape) 212 | zero_weights = tf.zeros_like(labels, dtype=data_type()) 213 | wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape) 214 | condition_tf = tf.equal(labels, unk_tf) 215 | new_weights = tf.where(condition_tf, zero_weights, weights) 216 | new_labels = tf.where(condition_tf, wrong_label, labels) 217 | 218 | 219 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [new_weights]) 220 | probs = tf.nn.softmax(logits) 221 | correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels) 222 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 223 | 224 | self._cost = cost = tf.reduce_sum(loss) / batch_size 225 | self._final_state = state 226 | 227 | if not is_training: 228 | return 229 | 230 | self._lr = tf.Variable(0.0, trainable=False) 231 | tvars = tf.trainable_variables() 232 | print ('tvars', len(tvars)) 233 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 234 | config.max_grad_norm) 235 | print ('*******the length', len(grads), '\n') 236 | optimizer = tf.train.AdamOptimizer(self._lr) 237 | self._train_op = optimizer.apply_gradients( 238 | zip(grads, tvars), 239 | global_step=tf.contrib.framework.get_or_create_global_step()) 240 | 241 | self._new_lr = tf.placeholder( 242 | tf.float32, shape=[], name="new_learning_rate") 243 | self._lr_update = tf.assign(self._lr, self._new_lr) 244 | 245 | def assign_lr(self, session, lr_value): 246 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 247 | 248 | @property 249 | def input(self): 250 | return self._input 251 | 252 | @property 253 | def initial_state(self): 254 | return self._initial_state 255 | 256 | @property 257 | def cost(self): 258 | return self._cost 259 | 260 | @property 261 | def final_state(self): 262 | return self._final_state 263 | 264 | @property 265 | def accuracy(self): 266 | return self._accuracy 267 | 268 | @property 269 | def lr(self): 270 | return self._lr 271 | 272 | @property 273 | def train_op(self): 274 | return self._train_op 275 | 276 | 277 | def run_epoch(session, model, eval_op=None, verbose=False): 278 | start_time = time.time() 279 | costs = 0.0 280 | accuracy_list = [] 281 | iters = 0 282 | state = session.run(model.initial_state) 283 | # print ('at the very initial of the run_epoch\n', state[0].c) 284 | eof_indicator = np.ones((model.input.batch_size), dtype=bool) 285 | memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size]) 286 | # file_id = session.run(model.initial_file_id) #need to remove _ 287 | 288 | fetches = { 289 | "cost": model.cost, 290 | "accuracy": model.accuracy, 291 | "final_state": model.final_state, 292 | "eof_indicator": model.eof_indicator, 293 | "memory":model.output, 294 | } 295 | if eval_op is not None: 296 | fetches["eval_op"] = eval_op 297 | 298 | for step in range(model.input.epoch_size): 299 | feed_dict = {} 300 | # current_file_id = file_id #session.run(model.file_id) 301 | sub_cond = np.expand_dims(eof_indicator, axis = 1) 302 | condition = np.repeat(sub_cond, model.size, axis = 1) 303 | # zero_state = np.zeros_like(condition) 304 | # zero_state = np.random.uniform(-0.05,0.05,condition.shape) 305 | zero_state = session.run(model.initial_state) 306 | 307 | for i, (c, h) in enumerate(model.initial_state): 308 | assert condition.shape == state[i].c.shape 309 | feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c) 310 | feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h) 311 | 312 | feed_dict[model.memory] = memory 313 | vals = session.run(fetches, feed_dict) 314 | 315 | cost = vals["cost"] 316 | accuracy = vals["accuracy"] 317 | eof_indicator = vals["eof_indicator"] 318 | state = vals["final_state"] #use the final state as the initial state within a whole epoch 319 | memory = vals["memory"] 320 | 321 | accuracy_list.append(accuracy) 322 | costs += cost 323 | iters += model.input.num_steps 324 | 325 | if verbose and step % (model.input.epoch_size // 10) == 10: 326 | print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" % 327 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list), 328 | (time.time() - start_time))) 329 | 330 | print ('this run_epoch takes time %.2f' %(time.time() - start_time)) 331 | return np.exp(costs / iters), np.mean(accuracy_list) 332 | 333 | 334 | def main(_): 335 | start_time = time.time() 336 | fout = open(outfile, 'a') 337 | print ('\n', time.asctime(time.localtime()), file=fout) 338 | print ('start a new experiment %s'%outfile, file=fout) 339 | print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout) 340 | 341 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size, train_dataP, valid_dataP \ 342 | = reader.input_data(N_filename, T_filename) 343 | 344 | train_data = (train_dataN, train_dataT, train_dataP) 345 | valid_data = (valid_dataN, valid_dataT, valid_dataP) 346 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof] 347 | 348 | config = get_config() 349 | assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration 350 | config.vocab_size = vocab_size 351 | eval_config = get_config() 352 | eval_config.batch_size = config.batch_size * config.num_steps 353 | eval_config.num_steps = 1 354 | eval_config.vocab_size = vocab_size 355 | 356 | with tf.Graph().as_default(): 357 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 358 | 359 | with tf.name_scope("Train"): 360 | train_input = PTBInput(config=config, data=train_data, name="TrainInput") 361 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 362 | m = PTBModel(is_training=True, config=config, input_=train_input) 363 | 364 | with tf.name_scope("Valid"): 365 | valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") 366 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 367 | mvalid = PTBModel(is_training=False, config=config, input_=valid_input) 368 | 369 | # with tf.name_scope("Test"): 370 | # test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput") 371 | # with tf.variable_scope("Model", reuse=True, initializer=initializer): 372 | # mtest = PTBModel(is_training=False, config=eval_config, 373 | # input_=test_input) 374 | 375 | 376 | print ('total trainable variables', len(tf.trainable_variables()), '\n\n') 377 | max_valid = 0 378 | max_step = 0 379 | saver = tf.train.Saver() 380 | 381 | sv = tf.train.Supervisor(logdir=None, summary_op=None) 382 | with sv.managed_session() as session: 383 | 384 | for i in range(config.max_max_epoch): 385 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 386 | m.assign_lr(session, config.learning_rate * lr_decay) 387 | print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr))) 388 | 389 | train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True) 390 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy)) 391 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout) 392 | 393 | if i > 5: 394 | valid_perplexity, valid_accuracy = run_epoch(session, mvalid) 395 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy)) 396 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout) 397 | if valid_accuracy > max_valid: 398 | max_valid = valid_accuracy 399 | max_step = i + 1 400 | 401 | # test_perplexity, test_accuracy = run_epoch(session, mtest) 402 | # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy)) 403 | 404 | print ('max step %d, max valid %.3f' %(max_step, max_valid)) 405 | # print ('data path is', FLAGS.data_path) 406 | print ('total time takes', time.time()-start_time) 407 | print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout) 408 | print ('total time takes', time.time()-start_time, file=fout) 409 | fout.close() 410 | 411 | # if FLAGS.save_path: 412 | # print("Saving model to %s." % FLAGS.save_path) 413 | # save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False) 414 | 415 | 416 | if __name__ == "__main__": 417 | tf.app.run() 418 | -------------------------------------------------------------------------------- /code/myModel_commented.py: -------------------------------------------------------------------------------- 1 | # a word w is composed of two kinds of information: type(N) and value(T), i.e., w_i = (N_i, T_i) 2 | # task: given a sequence of words w_1 to w_(t-1), predict the next word value T_t 3 | 4 | class my_Model(object): 5 | """This class is to build my lstm model, which mainly refers to The PTB model from official tensorflow example.""" 6 | 7 | def __init__(self, is_training, config, input_): 8 | self._input = input_ 9 | self.attn_size = attn_size = config.attn_size # attention size 10 | batch_size = input_.batch_size 11 | num_steps = input_.num_steps # the lstm unrolling length 12 | self.sizeN = sizeN = config.hidden_sizeN # embedding size of type(N) 13 | self.sizeT = sizeT = config.hidden_sizeT # embedding size of value(T) 14 | self.size = size = config.sizeH # hidden size of the lstm cell 15 | (vocab_sizeN, vocab_sizeT) = config.vocab_size # vocabulary size of type and value 16 | 17 | # from line 17 to line 33: copy from official PTB model which defines an lstm cell with drop-out and multi-layers 18 | def lstm_cell(): 19 | if 'reuse' in inspect.getargspec( 20 | tf.contrib.rnn.BasicLSTMCell.__init__).args: 21 | return tf.contrib.rnn.BasicLSTMCell( 22 | size, forget_bias=1.0, state_is_tuple=True, 23 | reuse=tf.get_variable_scope().reuse) 24 | else: 25 | return tf.contrib.rnn.BasicLSTMCell( 26 | size, forget_bias=1.0, state_is_tuple=True) 27 | attn_cell = lstm_cell 28 | if is_training and config.keep_prob < 1: # drop-out when training 29 | def attn_cell(): 30 | return tf.contrib.rnn.DropoutWrapper( 31 | lstm_cell(), output_keep_prob=config.keep_prob) 32 | cell = tf.contrib.rnn.MultiRNNCell( 33 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) #multi-layers 34 | 35 | # from line 35 to line 44: set the initial hidden states, which are two trainable vectors. Processing a new sentence starts from here. 36 | state_variables = [] 37 | with tf.variable_scope("myCH0"): 38 | for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())): 39 | if i > 0: tf.get_variable_scope().reuse_variables() 40 | myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer()) 41 | myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer()) 42 | myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)]) 43 | myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)]) 44 | state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor)) 45 | 46 | self._initial_state = state_variables 47 | 48 | self.eof_indicator = input_.eof_indicator # indicate whether this is the end of a sentence 49 | 50 | with tf.device("/cpu:0"): 51 | embeddingN = tf.get_variable( 52 | "embeddingN", [vocab_sizeN, sizeN], dtype=data_type()) 53 | inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN) # input type embedding 54 | 55 | with tf.device("/cpu:0"): 56 | embeddingT = tf.get_variable( 57 | "embeddingT", [vocab_sizeT, sizeT], dtype=data_type()) 58 | inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT) # input value embedding 59 | 60 | inputs = tf.concat([inputsN, inputsT], 2) # concatenate the type and value embedding 61 | if is_training and config.keep_prob < 1: 62 | inputs = tf.nn.dropout(inputs, config.keep_prob) 63 | 64 | outputs = [] # store hidden state at each time_step 65 | attentions = [] # store context attention vector at each time_step 66 | alphas = [] # store attention scores at each time_step 67 | state = self._initial_state 68 | self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory") 69 | valid_memory = self.memory[:,-attn_size:,:] # previous hidden states within the attention window 70 | 71 | # from line 72 to line 87: build the RNN model, and calculate attention 72 | with tf.variable_scope("RNN"): 73 | for time_step in range(num_steps): 74 | if time_step > 0: tf.get_variable_scope().reuse_variables() 75 | (cell_output, state) = cell(inputs[:, time_step, :], state) # lstm_cell update function 76 | outputs.append(cell_output) # store hidden state 77 | 78 | # calculate attention scores alpha and context vector ct 79 | wm = tf.get_variable("wm", [size, size], dtype=data_type()) 80 | wh = tf.get_variable("wh", [size, size], dtype=data_type()) 81 | wt = tf.get_variable("wt", [size, 1], dtype=data_type()) 82 | gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size])) 83 | alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) #the size of alpha: batch_size by attn_size 84 | alphas.append(alpha) 85 | ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1]))) 86 | attentions.append(ct) 87 | valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) #move forward attention window 88 | 89 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) # hidden states for all time_steps 90 | attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size]) # context vectors for all time_steps 91 | 92 | self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch 93 | wa = tf.get_variable("wa", [size*2, size], dtype=data_type()) 94 | nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa)) 95 | 96 | #compute w: the word distribution within the global vocabulary 97 | softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type()) 98 | softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type()) 99 | w_logits = tf.matmul(nt, softmax_w) + softmax_b 100 | w_probs = tf.nn.softmax(w_logits) # baseline model uses this 101 | 102 | #compute l: reuse attention scores as the location distribution for pointer network 103 | l_logits_pre = tf.reshape(tf.stack(axis=1, values=alphas), [-1, attn_size]) #the size is batch_size*num_steps by attn_size 104 | l_logits = tf.reverse(l_logits_pre, axis=[1]) 105 | 106 | #compute d: a switching network to balance the above two distributions, based on hidden states and context 107 | d_conditioned = tf.concat([output, attention], axis=1) 108 | d_w = tf.get_variable("d_w1", [2*size, 1], dtype=data_type()) 109 | d_b = tf.get_variable("d_b1", [1], dtype=data_type()) 110 | d = tf.nn.sigmoid(tf.matmul(d_conditioned, d_w) + d_b) 111 | 112 | #concat w and l to construct f 113 | f_logits = tf.concat([w_logits*d, l_logits*(1-d)], axis=1) 114 | 115 | labels = tf.reshape(input_.targetsT, [-1]) 116 | weights = tf.ones([batch_size * num_steps], dtype=data_type()) 117 | 118 | # set mask for counting unk as wrong 119 | unk_id = vocab_sizeT - 2 120 | unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape) 121 | zero_weights = tf.zeros_like(labels, dtype=data_type()) 122 | wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape) 123 | condition_tf = tf.equal(labels, unk_tf) 124 | new_weights = tf.where(condition_tf, zero_weights, weights) 125 | new_labels = tf.where(condition_tf, wrong_label, labels) 126 | 127 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([f_logits], [labels], [new_weights]) 128 | probs = tf.nn.softmax(f_logits) 129 | 130 | correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels) 131 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 132 | 133 | self._cost = cost = tf.reduce_sum(loss) / batch_size 134 | self._final_state = state -------------------------------------------------------------------------------- /code/pointer.py: -------------------------------------------------------------------------------- 1 | # use word distribution and location information(pointer) 2 | # 1-28, reverse l_logits 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import inspect 9 | import time 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import reader_pointer_original as reader 15 | import os 16 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 17 | 18 | os.environ['CUDA_VISIBLE_DEVICES']='0' 19 | outfile = 'output_pointer.txt' 20 | 21 | N_filename = '../pickle_data/small_JS_non_terminal.pickle' 22 | T_filename = '../pickle_data/small_JS_terminal_1k_whole.pickle' 23 | 24 | flags = tf.flags 25 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A' 26 | "Model output directory.") 27 | 28 | flags.DEFINE_string( 29 | "model", "small", 30 | "A type of model. Possible options are: small, medium, best.") 31 | # flags.DEFINE_string("data_path", '../data/dataJS', 32 | # "Where the training/test data is stored.") 33 | flags.DEFINE_bool("use_fp16", False, 34 | "Train using 16-bit floats instead of 32bit floats") 35 | 36 | FLAGS = flags.FLAGS 37 | logging = tf.logging 38 | 39 | if FLAGS.model == "test": 40 | outfile = 'TESToutput.txt' 41 | def data_type(): 42 | return tf.float16 if FLAGS.use_fp16 else tf.float32 43 | 44 | class SmallConfig(object): 45 | """Small config. get best result as 0.733 """ 46 | init_scale = 0.05 47 | learning_rate = 0.001 48 | max_grad_norm = 5 49 | num_layers = 1#1 50 | num_steps = 50 51 | attn_size = 50 52 | hidden_sizeN = 300 53 | hidden_sizeT = 500 54 | sizeH = 800 55 | max_epoch = 1#8 56 | max_max_epoch = 8#79 57 | keep_prob = 1.0#1.0 58 | lr_decay = 0.6#0.95 59 | batch_size = 64#80 60 | 61 | class TestConfig(object): 62 | """Tiny config, for testing.""" 63 | init_scale = 0.05 64 | learning_rate = 0.001 65 | max_grad_norm = 5 66 | num_layers = 1 67 | num_steps = 50 68 | attn_size = 50 69 | hidden_sizeN = 300 70 | hidden_sizeT = 500 71 | sizeH = 800 72 | max_epoch = 1 73 | max_max_epoch = 1 74 | keep_prob = 1.0 75 | lr_decay = 0.6 76 | batch_size = 80 77 | 78 | 79 | def get_config(): 80 | if FLAGS.model == "small": 81 | return SmallConfig() 82 | elif FLAGS.model == "medium": 83 | return MediumConfig() 84 | elif FLAGS.model == "best": 85 | return BestConfig() 86 | elif FLAGS.model == "test": 87 | return TestConfig() 88 | else: 89 | raise ValueError("Invalid model: %s", FLAGS.model) 90 | 91 | 92 | class PTBInput(object): 93 | """The input data.""" 94 | 95 | def __init__(self, config, data, name=None): 96 | self.batch_size = batch_size = config.batch_size 97 | self.attn_size = attn_size = config.attn_size 98 | self.num_steps = num_steps = config.num_steps 99 | self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator = \ 100 | reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=False, name=name) 101 | if FLAGS.model == "test": 102 | self.epoch_size = 16 #small epoch size for test 103 | 104 | 105 | class PTBModel(object): 106 | """The PTB model.""" 107 | 108 | def __init__(self, is_training, config, input_): 109 | self._input = input_ 110 | self.attn_size = attn_size = config.attn_size 111 | batch_size = input_.batch_size 112 | num_steps = input_.num_steps 113 | self.sizeN = sizeN = config.hidden_sizeN 114 | self.sizeT = sizeT = config.hidden_sizeT 115 | self.size = size = config.sizeH 116 | (vocab_sizeN, vocab_sizeT) = config.vocab_size 117 | 118 | # Slightly better results can be obtained with forget gate biases 119 | # initialized to 1 but the hyperparameters of the model would need to be 120 | # different than reported in the paper. 121 | def lstm_cell(): 122 | if 'reuse' in inspect.getargspec( 123 | tf.contrib.rnn.BasicLSTMCell.__init__).args: 124 | return tf.contrib.rnn.BasicLSTMCell( 125 | size, forget_bias=1.0, state_is_tuple=True, 126 | reuse=tf.get_variable_scope().reuse) 127 | else: 128 | return tf.contrib.rnn.BasicLSTMCell( 129 | size, forget_bias=1.0, state_is_tuple=True) 130 | attn_cell = lstm_cell 131 | if is_training and config.keep_prob < 1: 132 | def attn_cell(): 133 | return tf.contrib.rnn.DropoutWrapper( 134 | lstm_cell(), output_keep_prob=config.keep_prob) 135 | cell = tf.contrib.rnn.MultiRNNCell( 136 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 137 | 138 | state_variables = [] 139 | with tf.variable_scope("myCH0"): 140 | for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())): 141 | if i > 0: tf.get_variable_scope().reuse_variables() 142 | myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer()) 143 | myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer()) 144 | myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)]) 145 | myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)]) 146 | state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor)) 147 | 148 | self._initial_state = state_variables 149 | 150 | self.eof_indicator = input_.eof_indicator 151 | 152 | with tf.device("/cpu:0"): 153 | embeddingN = tf.get_variable( 154 | "embeddingN", [vocab_sizeN, sizeN], dtype=data_type()) 155 | inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN) 156 | 157 | with tf.device("/cpu:0"): 158 | embeddingT = tf.get_variable( 159 | "embeddingT", [vocab_sizeT, sizeT], dtype=data_type()) 160 | inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT) 161 | 162 | inputs = tf.concat([inputsN, inputsT], 2) 163 | #inputs = tf.one_hot(input_.input_data, vocab_size) 164 | if is_training and config.keep_prob < 1: 165 | inputs = tf.nn.dropout(inputs, config.keep_prob) 166 | 167 | outputs = [] 168 | attentions = [] 169 | alphas = [] 170 | state = self._initial_state 171 | self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory") 172 | valid_memory = self.memory[:,-attn_size:,:] 173 | # print ("test test test,, state shape", np.array(state).shape) 174 | with tf.variable_scope("RNN"): 175 | for time_step in range(num_steps): 176 | if time_step > 0: tf.get_variable_scope().reuse_variables() 177 | (cell_output, state) = cell(inputs[:, time_step, :], state) 178 | outputs.append(cell_output) 179 | 180 | wm = tf.get_variable("wm", [size, size], dtype=data_type()) 181 | wh = tf.get_variable("wh", [size, size], dtype=data_type()) 182 | wt = tf.get_variable("wt", [size, 1], dtype=data_type()) 183 | gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size])) 184 | alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) #the size of alpha: batch_size by attn_size 185 | alphas.append(alpha) 186 | ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1]))) 187 | attentions.append(ct) 188 | valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) 189 | 190 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 191 | attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size]) 192 | 193 | self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch 194 | wa = tf.get_variable("wa", [size*2, size], dtype=data_type()) 195 | nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa)) 196 | 197 | #compute w 198 | softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type()) 199 | softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type()) 200 | w_logits = tf.matmul(nt, softmax_w) + softmax_b 201 | w_probs = tf.nn.softmax(w_logits) 202 | 203 | #compute l 204 | l_logits_pre = tf.reshape(tf.stack(axis=1, values=alphas), [-1, attn_size]) #the size of alpha_reshaped: batch_size*num_steps by attn_size 205 | l_logits = tf.reverse(l_logits_pre, axis=[1]) 206 | # l_probs = tf.nn.softmax(l_logits) 207 | 208 | #compute d 209 | # input_reshaped = tf.reshape(inputs, [-1, size]) 210 | d_conditioned = tf.concat([output, attention], axis=1) 211 | d_w = tf.get_variable("d_w1", [2*size, 1], dtype=data_type()) 212 | d_b = tf.get_variable("d_b1", [1], dtype=data_type()) 213 | d = tf.nn.sigmoid(tf.matmul(d_conditioned, d_w) + d_b) 214 | 215 | # d_conditioned = tf.concat([output, attention], axis=1) 216 | # d_w1 = tf.get_variable("d_w1", [2*size, size], dtype=data_type()) 217 | # d_b1 = tf.get_variable("d_b1", [size], dtype=data_type()) 218 | # fc1 = tf.nn.relu(tf.matmul(d_conditioned, d_w1) + d_b1) 219 | # d_w2 = tf.get_variable("d_w2", [size, 1], dtype=data_type()) 220 | # d_b2 = tf.get_variable("d_b2", [1], dtype=data_type()) 221 | # d = tf.nn.sigmoid(tf.matmul(fc1, d_w2) + d_b2) 222 | 223 | #concat w and l to construct f 224 | f_logits = tf.concat([w_logits*d, l_logits*(1-d)], axis=1) 225 | 226 | labels = tf.reshape(input_.targetsT, [-1]) 227 | weights = tf.ones([batch_size * num_steps], dtype=data_type()) 228 | 229 | #counting unk as wrong 230 | unk_id = vocab_sizeT - 2 231 | unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape) 232 | zero_weights = tf.zeros_like(labels, dtype=data_type()) 233 | wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape) 234 | condition_tf = tf.equal(labels, unk_tf) 235 | new_weights = tf.where(condition_tf, zero_weights, weights) 236 | new_labels = tf.where(condition_tf, wrong_label, labels) # only for computing the accuracy, can not be used to compute the loss(cause nan error) 237 | 238 | 239 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([f_logits], [labels], [new_weights]) 240 | probs = tf.nn.softmax(f_logits) 241 | 242 | # condition = tf.not_equal(labels, 182) 243 | # non_pad_len = tf.reduce_sum(tf.cast(condition, tf.float32)) 244 | # mask_labels = tf.where(condition, labels, tf.constant(250, shape = labels.get_shape())) #250 just do not belong to the vocab 245 | # correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), mask_labels) 246 | # self._accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) / non_pad_len # do not count predict (182) 247 | 248 | correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels) 249 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 250 | 251 | self._cost = cost = tf.reduce_sum(loss) / batch_size 252 | self._final_state = state 253 | 254 | if not is_training: 255 | return 256 | 257 | self._lr = tf.Variable(0.0, trainable=False) 258 | tvars = tf.trainable_variables() 259 | print ('tvars', len(tvars)) 260 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 261 | config.max_grad_norm) 262 | print ('*******the length', len(grads)) 263 | optimizer = tf.train.AdamOptimizer(self._lr) 264 | self._train_op = optimizer.apply_gradients( 265 | zip(grads, tvars), 266 | global_step=tf.contrib.framework.get_or_create_global_step()) 267 | 268 | self._new_lr = tf.placeholder( 269 | tf.float32, shape=[], name="new_learning_rate") 270 | self._lr_update = tf.assign(self._lr, self._new_lr) 271 | 272 | def assign_lr(self, session, lr_value): 273 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 274 | 275 | @property 276 | def input(self): 277 | return self._input 278 | 279 | @property 280 | def initial_state(self): 281 | return self._initial_state 282 | 283 | @property 284 | def cost(self): 285 | return self._cost 286 | 287 | @property 288 | def final_state(self): 289 | return self._final_state 290 | 291 | @property 292 | def accuracy(self): 293 | return self._accuracy 294 | 295 | @property 296 | def lr(self): 297 | return self._lr 298 | 299 | @property 300 | def train_op(self): 301 | return self._train_op 302 | 303 | 304 | def run_epoch(session, model, eval_op=None, verbose=False): 305 | """Runs the model on the given data.""" 306 | start_time = time.time() 307 | costs = 0.0 308 | accuracy_list = [] 309 | iters = 0 310 | state = session.run(model.initial_state) 311 | # print ('at the very initial of the run_epoch\n', state[0].c) 312 | eof_indicator = np.ones((model.input.batch_size), dtype=bool) 313 | memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size]) 314 | # file_id = session.run(model.initial_file_id) #need to remove _ 315 | 316 | fetches = { 317 | "cost": model.cost, 318 | "accuracy": model.accuracy, 319 | "final_state": model.final_state, 320 | "eof_indicator": model.eof_indicator, 321 | "memory":model.output, 322 | } 323 | if eval_op is not None: 324 | fetches["eval_op"] = eval_op 325 | 326 | for step in range(model.input.epoch_size): 327 | feed_dict = {} 328 | # current_file_id = file_id #session.run(model.file_id) 329 | sub_cond = np.expand_dims(eof_indicator, axis = 1) 330 | condition = np.repeat(sub_cond, model.size, axis = 1) 331 | # zero_state = np.zeros_like(condition) 332 | # zero_state = np.random.uniform(-0.05,0.05,condition.shape) 333 | zero_state = session.run(model.initial_state) 334 | 335 | for i, (c, h) in enumerate(model.initial_state): 336 | assert condition.shape == state[i].c.shape 337 | feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c) 338 | feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h) 339 | 340 | feed_dict[model.memory] = memory 341 | vals = session.run(fetches, feed_dict) 342 | 343 | cost = vals["cost"] 344 | accuracy = vals["accuracy"] 345 | eof_indicator = vals["eof_indicator"] 346 | state = vals["final_state"] #use the final state as the initial state within a whole epoch 347 | memory = vals["memory"] 348 | 349 | accuracy_list.append(accuracy) 350 | costs += cost 351 | iters += model.input.num_steps 352 | 353 | if verbose and step % (model.input.epoch_size // 10) == 10: 354 | print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" % 355 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list), 356 | (time.time() - start_time))) 357 | # print ('zero_state value', zero_state[0][0]) 358 | # print ('gradients value', session.run(model.grads)) 359 | 360 | print ('this run_epoch takes time %.2f' %(time.time() - start_time)) 361 | return np.exp(costs / iters), np.mean(accuracy_list) 362 | 363 | 364 | 365 | 366 | def main(_): 367 | start_time = time.time() 368 | fout = open(outfile, 'a') 369 | print ('\n', time.asctime(time.localtime()), file=fout) 370 | print ('start a new experiment %s'%outfile, file=fout) 371 | print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout) 372 | print ('condition on two, two layers', file=fout) 373 | 374 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size = reader.input_data(N_filename, T_filename) 375 | 376 | train_data = (train_dataN, train_dataT) 377 | valid_data = (valid_dataN, valid_dataT) 378 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # N is [w, eof], T is [w, unk, eof] 379 | 380 | config = get_config() 381 | assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration 382 | config.vocab_size = vocab_size 383 | eval_config = get_config() 384 | eval_config.batch_size = config.batch_size * config.num_steps 385 | eval_config.num_steps = 1 386 | eval_config.vocab_size = vocab_size 387 | 388 | with tf.Graph().as_default(): 389 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 390 | 391 | with tf.name_scope("Train"): 392 | train_input = PTBInput(config=config, data=train_data, name="TrainInput") 393 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 394 | m = PTBModel(is_training=True, config=config, input_=train_input) 395 | 396 | with tf.name_scope("Valid"): 397 | valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") 398 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 399 | mvalid = PTBModel(is_training=False, config=config, input_=valid_input) 400 | 401 | # with tf.name_scope("Test"): 402 | # test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput") 403 | # with tf.variable_scope("Model", reuse=True, initializer=initializer): 404 | # mtest = PTBModel(is_training=False, config=eval_config, 405 | # input_=test_input) 406 | 407 | 408 | print ('total trainable variables', len(tf.trainable_variables()), '\n\n') 409 | max_valid = 0 410 | max_step = 0 411 | saver = tf.train.Saver() 412 | 413 | sv = tf.train.Supervisor(logdir=None, summary_op=None) 414 | with sv.managed_session() as session: 415 | 416 | for i in range(config.max_max_epoch): 417 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 418 | m.assign_lr(session, config.learning_rate * lr_decay) 419 | print (outfile, "Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 420 | 421 | train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True) 422 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy)) 423 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout) 424 | 425 | if i > 5: 426 | valid_perplexity, valid_accuracy = run_epoch(session, mvalid) 427 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy)) 428 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout) 429 | if valid_accuracy > max_valid: 430 | max_valid = valid_accuracy 431 | max_step = i + 1 432 | 433 | # test_perplexity, test_accuracy = run_epoch(session, mtest) 434 | # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy)) 435 | 436 | print ('max step %d, max valid %.3f' %(max_step, max_valid)) 437 | # print ('data path is', FLAGS.data_path) 438 | print ('total time takes', time.time()-start_time) 439 | print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout) 440 | print ('total time takes', time.time()-start_time, file=fout) 441 | fout.close() 442 | 443 | # if FLAGS.save_path: 444 | # print("Saving model to %s." % FLAGS.save_path) 445 | # save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False) 446 | 447 | 448 | if __name__ == "__main__": 449 | tf.app.run() 450 | -------------------------------------------------------------------------------- /code/pointer_parent.py: -------------------------------------------------------------------------------- 1 | # use word distribution and location information(pointer) 2 | # 1-28, reverse l_logits 3 | 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import inspect 9 | import time 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | import reader_pointer as reader 15 | import os 16 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 17 | 18 | os.environ['CUDA_VISIBLE_DEVICES']='0' 19 | outfile = 'output_pointer_parent.txt' 20 | 21 | N_filename = '../pickle_data/JS_non_terminal.pickle' 22 | T_filename = '../pickle_data/JS_terminal_50k_whole.pickle' 23 | 24 | flags = tf.flags 25 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A' 26 | "Model output directory.") 27 | 28 | flags.DEFINE_string( 29 | "model", "small", 30 | "A type of model. Possible options are: small, medium, best.") 31 | # flags.DEFINE_string("data_path", '../data/dataJS', 32 | # "Where the training/test data is stored.") 33 | flags.DEFINE_bool("use_fp16", False, 34 | "Train using 16-bit floats instead of 32bit floats") 35 | 36 | FLAGS = flags.FLAGS 37 | logging = tf.logging 38 | 39 | if FLAGS.model == "test": 40 | outfile = 'TESToutput.txt' 41 | def data_type(): 42 | return tf.float16 if FLAGS.use_fp16 else tf.float32 43 | 44 | class SmallConfig(object): 45 | """Small config. get best result as 0.733 """ 46 | init_scale = 0.05 47 | learning_rate = 0.001 48 | max_grad_norm = 5 49 | num_layers = 1#1 50 | num_steps = 50 51 | attn_size = 50 52 | hidden_sizeN = 300 53 | hidden_sizeT = 500 54 | sizeH = 800 55 | max_epoch = 1#8 56 | max_max_epoch = 8#79 57 | keep_prob = 1.0#1.0 58 | lr_decay = 0.6#0.95 59 | batch_size = 64#80 60 | 61 | class TestConfig(object): 62 | """Tiny config, for testing.""" 63 | init_scale = 0.05 64 | learning_rate = 0.001 65 | max_grad_norm = 5 66 | num_layers = 1 67 | num_steps = 50 68 | attn_size = 50 69 | hidden_sizeN = 300 70 | hidden_sizeT = 500 71 | sizeH = 800 72 | max_epoch = 1 73 | max_max_epoch = 1 74 | keep_prob = 1.0 75 | lr_decay = 0.6 76 | batch_size = 80 77 | 78 | 79 | def get_config(): 80 | if FLAGS.model == "small": 81 | return SmallConfig() 82 | elif FLAGS.model == "medium": 83 | return MediumConfig() 84 | elif FLAGS.model == "best": 85 | return BestConfig() 86 | elif FLAGS.model == "test": 87 | return TestConfig() 88 | else: 89 | raise ValueError("Invalid model: %s", FLAGS.model) 90 | 91 | 92 | class PTBInput(object): 93 | """The input data.""" 94 | 95 | def __init__(self, config, data, name=None): 96 | self.batch_size = batch_size = config.batch_size 97 | self.attn_size = attn_size = config.attn_size 98 | self.num_steps = num_steps = config.num_steps 99 | self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator, self.input_dataP = \ 100 | reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=False, name=name) 101 | if FLAGS.model == "test": 102 | self.epoch_size = 16 #small epoch size for test 103 | 104 | 105 | class PTBModel(object): 106 | """The PTB model.""" 107 | 108 | def __init__(self, is_training, config, input_): 109 | self._input = input_ 110 | self.attn_size = attn_size = config.attn_size 111 | batch_size = input_.batch_size 112 | num_steps = input_.num_steps 113 | self.sizeN = sizeN = config.hidden_sizeN 114 | self.sizeT = sizeT = config.hidden_sizeT 115 | self.size = size = config.sizeH 116 | (vocab_sizeN, vocab_sizeT) = config.vocab_size 117 | 118 | # Slightly better results can be obtained with forget gate biases 119 | # initialized to 1 but the hyperparameters of the model would need to be 120 | # different than reported in the paper. 121 | def lstm_cell(): 122 | if 'reuse' in inspect.getargspec( 123 | tf.contrib.rnn.BasicLSTMCell.__init__).args: 124 | return tf.contrib.rnn.BasicLSTMCell( 125 | size, forget_bias=1.0, state_is_tuple=True, 126 | reuse=tf.get_variable_scope().reuse) 127 | else: 128 | return tf.contrib.rnn.BasicLSTMCell( 129 | size, forget_bias=1.0, state_is_tuple=True) 130 | attn_cell = lstm_cell 131 | if is_training and config.keep_prob < 1: 132 | def attn_cell(): 133 | return tf.contrib.rnn.DropoutWrapper( 134 | lstm_cell(), output_keep_prob=config.keep_prob) 135 | cell = tf.contrib.rnn.MultiRNNCell( 136 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 137 | 138 | state_variables = [] 139 | with tf.variable_scope("myCH0"): 140 | for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())): 141 | if i > 0: tf.get_variable_scope().reuse_variables() 142 | myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer()) 143 | myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer()) 144 | myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)]) 145 | myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)]) 146 | state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor)) 147 | 148 | self._initial_state = state_variables 149 | 150 | self.eof_indicator = input_.eof_indicator 151 | 152 | with tf.device("/cpu:0"): 153 | embeddingN = tf.get_variable( 154 | "embeddingN", [vocab_sizeN, sizeN], dtype=data_type()) 155 | inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN) 156 | inputsP = tf.nn.embedding_lookup(embeddingN, input_.input_dataP) 157 | inputsL = tf.nn.embedding_lookup(embeddingN, input_.targetsN) # target type information 158 | 159 | with tf.device("/cpu:0"): 160 | embeddingT = tf.get_variable( 161 | "embeddingT", [vocab_sizeT, sizeT], dtype=data_type()) 162 | inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT) 163 | 164 | inputs = tf.concat([inputsN, inputsT], 2) 165 | inputsPL = tf.concat([inputsP, inputsL], 2) 166 | #inputs = tf.one_hot(input_.input_data, vocab_size) 167 | if is_training and config.keep_prob < 1: 168 | inputs = tf.nn.dropout(inputs, config.keep_prob) 169 | 170 | outputs = [] 171 | attentions = [] 172 | parents = [] 173 | alphas = [] 174 | state = self._initial_state 175 | self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory") 176 | valid_memory = self.memory[:,-attn_size:,:] 177 | # print ("test test test,, state shape", np.array(state).shape) 178 | with tf.variable_scope("RNN"): 179 | for time_step in range(num_steps): 180 | if time_step > 0: tf.get_variable_scope().reuse_variables() 181 | (cell_output, state) = cell(inputs[:, time_step, :], state) 182 | outputs.append(cell_output) 183 | 184 | cell_parent = inputsP[:, time_step, :] 185 | parents.append(cell_parent) 186 | 187 | wm = tf.get_variable("wm", [size, size], dtype=data_type()) 188 | wh = tf.get_variable("wh", [size, size], dtype=data_type()) 189 | wt = tf.get_variable("wt", [size, 1], dtype=data_type()) 190 | gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size])) 191 | alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) #the size of alpha: batch_size by attn_size 192 | alphas.append(alpha) 193 | ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1]))) 194 | attentions.append(ct) 195 | valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) 196 | 197 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 198 | attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size]) 199 | parent = tf.reshape(tf.stack(axis=1, values=parents), [-1, sizeN]) 200 | 201 | self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch 202 | wa = tf.get_variable("wa", [size*2+sizeN, size], dtype=data_type()) 203 | nt = tf.tanh(tf.matmul(tf.concat([output, attention, parent], axis=1), wa)) 204 | 205 | #compute w 206 | softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type()) 207 | softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type()) 208 | w_logits = tf.matmul(nt, softmax_w) + softmax_b 209 | w_probs = tf.nn.softmax(w_logits) 210 | 211 | #compute l 212 | l_logits_pre = tf.reshape(tf.stack(axis=1, values=alphas), [-1, attn_size]) #the size of alpha_reshaped: batch_size*num_steps by attn_size 213 | l_logits = tf.reverse(l_logits_pre, axis=[1]) 214 | # l_probs = tf.nn.softmax(l_logits) 215 | 216 | #compute d 217 | # input_reshaped = tf.reshape(inputs, [-1, size]) 218 | d_conditioned = tf.concat([output, attention], axis=1) 219 | d_w = tf.get_variable("d_w1", [2*size, 1], dtype=data_type()) 220 | d_b = tf.get_variable("d_b1", [1], dtype=data_type()) 221 | d = tf.nn.sigmoid(tf.matmul(d_conditioned, d_w) + d_b) 222 | 223 | # d_conditioned = tf.concat([output, attention], axis=1) 224 | # d_w1 = tf.get_variable("d_w1", [2*size, size], dtype=data_type()) 225 | # d_b1 = tf.get_variable("d_b1", [size], dtype=data_type()) 226 | # fc1 = tf.nn.relu(tf.matmul(d_conditioned, d_w1) + d_b1) 227 | # d_w2 = tf.get_variable("d_w2", [size, 1], dtype=data_type()) 228 | # d_b2 = tf.get_variable("d_b2", [1], dtype=data_type()) 229 | # d = tf.nn.sigmoid(tf.matmul(fc1, d_w2) + d_b2) 230 | 231 | #concat w and l to construct f 232 | f_logits = tf.concat([w_logits*d, l_logits*(1-d)], axis=1) 233 | 234 | labels = tf.reshape(input_.targetsT, [-1]) 235 | weights = tf.ones([batch_size * num_steps], dtype=data_type()) 236 | 237 | #counting unk as wrong 238 | unk_id = vocab_sizeT - 2 239 | unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape) 240 | zero_weights = tf.zeros_like(labels, dtype=data_type()) 241 | wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape) 242 | condition_tf = tf.equal(labels, unk_tf) 243 | new_weights = tf.where(condition_tf, zero_weights, weights) 244 | new_labels = tf.where(condition_tf, wrong_label, labels) # only for computing the accuracy, can not be used to compute the loss(cause nan error) 245 | 246 | 247 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([f_logits], [labels], [new_weights]) 248 | probs = tf.nn.softmax(f_logits) 249 | 250 | # condition = tf.not_equal(labels, 182) 251 | # non_pad_len = tf.reduce_sum(tf.cast(condition, tf.float32)) 252 | # mask_labels = tf.where(condition, labels, tf.constant(250, shape = labels.get_shape())) #250 just do not belong to the vocab 253 | # correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), mask_labels) 254 | # self._accuracy = tf.reduce_sum(tf.cast(correct_prediction, tf.float32)) / non_pad_len # do not count predict (182) 255 | 256 | correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels) 257 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 258 | 259 | self._cost = cost = tf.reduce_sum(loss) / batch_size 260 | self._final_state = state 261 | 262 | if not is_training: 263 | return 264 | 265 | self._lr = tf.Variable(0.0, trainable=False) 266 | tvars = tf.trainable_variables() 267 | print ('tvars', len(tvars)) 268 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 269 | config.max_grad_norm) 270 | print ('*******the length', len(grads)) 271 | optimizer = tf.train.AdamOptimizer(self._lr) 272 | self._train_op = optimizer.apply_gradients( 273 | zip(grads, tvars), 274 | global_step=tf.contrib.framework.get_or_create_global_step()) 275 | 276 | self._new_lr = tf.placeholder( 277 | tf.float32, shape=[], name="new_learning_rate") 278 | self._lr_update = tf.assign(self._lr, self._new_lr) 279 | 280 | def assign_lr(self, session, lr_value): 281 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 282 | 283 | @property 284 | def input(self): 285 | return self._input 286 | 287 | @property 288 | def initial_state(self): 289 | return self._initial_state 290 | 291 | @property 292 | def cost(self): 293 | return self._cost 294 | 295 | @property 296 | def final_state(self): 297 | return self._final_state 298 | 299 | @property 300 | def accuracy(self): 301 | return self._accuracy 302 | 303 | @property 304 | def lr(self): 305 | return self._lr 306 | 307 | @property 308 | def train_op(self): 309 | return self._train_op 310 | 311 | 312 | def run_epoch(session, model, eval_op=None, verbose=False): 313 | """Runs the model on the given data.""" 314 | start_time = time.time() 315 | costs = 0.0 316 | accuracy_list = [] 317 | iters = 0 318 | state = session.run(model.initial_state) 319 | # print ('at the very initial of the run_epoch\n', state[0].c) 320 | eof_indicator = np.ones((model.input.batch_size), dtype=bool) 321 | memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size]) 322 | # file_id = session.run(model.initial_file_id) #need to remove _ 323 | 324 | fetches = { 325 | "cost": model.cost, 326 | "accuracy": model.accuracy, 327 | "final_state": model.final_state, 328 | "eof_indicator": model.eof_indicator, 329 | "memory":model.output, 330 | } 331 | if eval_op is not None: 332 | fetches["eval_op"] = eval_op 333 | 334 | for step in range(model.input.epoch_size): 335 | feed_dict = {} 336 | # current_file_id = file_id #session.run(model.file_id) 337 | sub_cond = np.expand_dims(eof_indicator, axis = 1) 338 | condition = np.repeat(sub_cond, model.size, axis = 1) 339 | # zero_state = np.zeros_like(condition) 340 | # zero_state = np.random.uniform(-0.05,0.05,condition.shape) 341 | zero_state = session.run(model.initial_state) 342 | 343 | for i, (c, h) in enumerate(model.initial_state): 344 | assert condition.shape == state[i].c.shape 345 | feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c) 346 | feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h) 347 | 348 | feed_dict[model.memory] = memory 349 | vals = session.run(fetches, feed_dict) 350 | 351 | cost = vals["cost"] 352 | accuracy = vals["accuracy"] 353 | eof_indicator = vals["eof_indicator"] 354 | state = vals["final_state"] #use the final state as the initial state within a whole epoch 355 | memory = vals["memory"] 356 | 357 | accuracy_list.append(accuracy) 358 | costs += cost 359 | iters += model.input.num_steps 360 | 361 | if verbose and step % (model.input.epoch_size // 10) == 10: 362 | print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" % 363 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list), 364 | (time.time() - start_time))) 365 | # print ('zero_state value', zero_state[0][0]) 366 | # print ('gradients value', session.run(model.grads)) 367 | 368 | print ('this run_epoch takes time %.2f' %(time.time() - start_time)) 369 | return np.exp(costs / iters), np.mean(accuracy_list) 370 | 371 | 372 | 373 | 374 | def main(_): 375 | start_time = time.time() 376 | fout = open(outfile, 'a') 377 | print ('\n', time.asctime(time.localtime()), file=fout) 378 | print ('start a new experiment %s'%outfile, file=fout) 379 | print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout) 380 | print ('condition on two, two layers', file=fout) 381 | 382 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size, train_dataP, valid_dataP \ 383 | = reader.input_data(N_filename, T_filename) 384 | 385 | train_data = (train_dataN, train_dataT, train_dataP) 386 | valid_data = (valid_dataN, valid_dataT, valid_dataP) 387 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # N is [w, eof], T is [w, unk, eof] 388 | 389 | config = get_config() 390 | assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration 391 | config.vocab_size = vocab_size 392 | eval_config = get_config() 393 | eval_config.batch_size = config.batch_size * config.num_steps 394 | eval_config.num_steps = 1 395 | eval_config.vocab_size = vocab_size 396 | 397 | with tf.Graph().as_default(): 398 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 399 | 400 | with tf.name_scope("Train"): 401 | train_input = PTBInput(config=config, data=train_data, name="TrainInput") 402 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 403 | m = PTBModel(is_training=True, config=config, input_=train_input) 404 | 405 | with tf.name_scope("Valid"): 406 | valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") 407 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 408 | mvalid = PTBModel(is_training=False, config=config, input_=valid_input) 409 | 410 | # with tf.name_scope("Test"): 411 | # test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput") 412 | # with tf.variable_scope("Model", reuse=True, initializer=initializer): 413 | # mtest = PTBModel(is_training=False, config=eval_config, 414 | # input_=test_input) 415 | 416 | 417 | print ('total trainable variables', len(tf.trainable_variables()), '\n\n') 418 | max_valid = 0 419 | max_step = 0 420 | saver = tf.train.Saver() 421 | 422 | sv = tf.train.Supervisor(logdir=None, summary_op=None) 423 | with sv.managed_session() as session: 424 | 425 | for i in range(config.max_max_epoch): 426 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 427 | m.assign_lr(session, config.learning_rate * lr_decay) 428 | print (outfile, "Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 429 | 430 | train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True) 431 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy)) 432 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout) 433 | 434 | if i > 5: 435 | valid_perplexity, valid_accuracy = run_epoch(session, mvalid) 436 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy)) 437 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout) 438 | if valid_accuracy > max_valid: 439 | max_valid = valid_accuracy 440 | max_step = i + 1 441 | 442 | # test_perplexity, test_accuracy = run_epoch(session, mtest) 443 | # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy)) 444 | 445 | print ('max step %d, max valid %.3f' %(max_step, max_valid)) 446 | # print ('data path is', FLAGS.data_path) 447 | print ('total time takes', time.time()-start_time) 448 | print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout) 449 | print ('total time takes', time.time()-start_time, file=fout) 450 | fout.close() 451 | 452 | # if FLAGS.save_path: 453 | # print("Saving model to %s." % FLAGS.save_path) 454 | # save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False) 455 | 456 | 457 | if __name__ == "__main__": 458 | tf.app.run() 459 | -------------------------------------------------------------------------------- /code/reader_pointer.py: -------------------------------------------------------------------------------- 1 | # xxx revise it on 01/09, add parent 2 | # Add attn_size in input_data, data_producer; add change_yT for indicating whether to remove the location of unk(just label it as unk) 3 | # refactor the code of contructing the long line (def padding_and_concat) 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import collections 10 | from six.moves import cPickle as pickle 11 | import tensorflow as tf 12 | import numpy as np 13 | import time 14 | from collections import Counter, defaultdict 15 | import os 16 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 17 | 18 | def input_data(N_filename, T_filename): 19 | start_time = time.time() 20 | with open(N_filename, 'rb') as f: 21 | print ("reading data from ", N_filename) 22 | save = pickle.load(f) 23 | train_dataN = save['trainData'] 24 | test_dataN = save['testData'] 25 | train_dataP = save['trainParent'] 26 | test_dataP = save['testParent'] 27 | vocab_sizeN = save['vocab_size'] 28 | print ('the vocab_sizeN is %d (not including the eof)' %vocab_sizeN) 29 | print ('the number of training data is %d' %(len(train_dataN))) 30 | print ('the number of test data is %d\n' %(len(test_dataN))) 31 | 32 | with open(T_filename, 'rb') as f: 33 | print ("reading data from ", T_filename) 34 | save = pickle.load(f) 35 | train_dataT = save['trainData'] 36 | test_dataT = save['testData'] 37 | vocab_sizeT = save['vocab_size'] 38 | attn_size = save['attn_size'] 39 | print ('the vocab_sizeT is %d (not including the unk and eof)' %vocab_sizeT) 40 | print ('the attn_size is %d' %attn_size) 41 | print ('the number of training data is %d' %(len(train_dataT))) 42 | print ('the number of test data is %d' %(len(test_dataT))) 43 | print ('Finish reading data and take %.2f\n'%(time.time()-start_time)) 44 | 45 | return train_dataN, test_dataN, vocab_sizeN, train_dataT, test_dataT, vocab_sizeT, attn_size, train_dataP, test_dataP 46 | 47 | 48 | def data_producer(raw_data, batch_size, num_steps, vocab_size, attn_size, change_yT=False, name=None, verbose=False): 49 | 50 | start_time = time.time() 51 | 52 | with tf.name_scope(name, "DataProducer", [raw_data, batch_size, num_steps, vocab_size]): 53 | (raw_dataN, raw_dataT, raw_dataP) = raw_data 54 | assert len(raw_dataN) == len(raw_dataT) 55 | 56 | (vocab_sizeN, vocab_sizeT) = vocab_size 57 | eof_N_id = vocab_sizeN - 1 58 | eof_T_id = vocab_sizeT - 1 59 | unk_id = vocab_sizeT - 2 60 | 61 | def padding_and_concat(data, width, pad_id): 62 | #the size of data: a list of list. This function will pad the data according to width 63 | long_line = list() 64 | for line in data: 65 | pad_len = width - (len(line) % width) 66 | new_line = line + [pad_id] * pad_len 67 | assert len(new_line) % width == 0 68 | long_line += new_line 69 | return long_line 70 | 71 | pad_start = time.time() 72 | long_lineN = padding_and_concat(raw_dataN, num_steps, pad_id=eof_N_id) 73 | long_lineT = padding_and_concat(raw_dataT, num_steps, pad_id=eof_T_id) 74 | long_lineP = padding_and_concat(raw_dataP, num_steps, pad_id=1) 75 | assert len(long_lineN) == len(long_lineT) 76 | print('Pading three long lines and take %.2fs'%(time.time()-pad_start)) 77 | 78 | # print statistics for long_lineT 79 | if verbose: 80 | print('Start counting the statistics of T!!') 81 | verbose_start = time.time() 82 | cnt_T = Counter(long_lineT) 83 | long_lineT_len = len(long_lineT) 84 | empty_cnt = cnt_T[0] 85 | unk_cnt = cnt_T[unk_id] 86 | eof_cnt = cnt_T[eof_T_id] 87 | l_cnt = sum(np.array(long_lineT) > eof_T_id) 88 | w_cnt = long_lineT_len - empty_cnt - unk_cnt - eof_cnt - l_cnt 89 | print('long_lineT_len: %d, empty: %.4f, unk: %.4f, location: %.4f, eof: %.4f, word (except Empty): %.4f'% 90 | (long_lineT_len, float(empty_cnt)/long_lineT_len, float(unk_cnt)/long_lineT_len, 91 | float(l_cnt)/long_lineT_len, float(eof_cnt)/long_lineT_len, float(w_cnt)/long_lineT_len)) 92 | print('the most common 5 of cnt_T', cnt_T.most_common(5)) 93 | print('print verbose information and take %.2fs\n'%(time.time()-verbose_start)) 94 | 95 | temp_len = len(long_lineN) 96 | n = temp_len // (batch_size * num_steps) 97 | long_lineN_truncated = np.array(long_lineN[0 : n * (batch_size * num_steps)]) 98 | long_lineP_truncated = np.array(long_lineP[0 : n * (batch_size * num_steps)]) 99 | long_lineT_truncated_x = np.array(long_lineT[0 : n * (batch_size * num_steps)]) 100 | long_lineT_truncated_y = np.array(long_lineT[0 : n * (batch_size * num_steps)]) 101 | 102 | # long_lineP_truncated[long_lineP_truncated > attn_size] = attn_size #if the parent location is too far 103 | long_lineP_truncated = [long_lineN_truncated[i-j] for i,j in enumerate(long_lineP_truncated)] #only store parent N 104 | 105 | location_index = long_lineT_truncated_x > eof_T_id 106 | long_lineT_truncated_x[location_index] = unk_id 107 | if change_yT: 108 | long_lineT_truncated_y[location_index] = unk_id 109 | 110 | # print('count of greater than eof', sum(long_lineT_truncated_y > eof_T_id)) 111 | 112 | tf_dataN = tf.convert_to_tensor(long_lineN_truncated, name="raw_dataN", dtype=tf.int32) 113 | tf_dataP = tf.convert_to_tensor(long_lineP_truncated, name="raw_dataP", dtype=tf.int32) 114 | tf_dataT_x = tf.convert_to_tensor(long_lineT_truncated_x, name="raw_dataT_x", dtype=tf.int32) 115 | tf_dataT_y = tf.convert_to_tensor(long_lineT_truncated_y, name="raw_dataT_y", dtype=tf.int32) 116 | 117 | data_len = len(long_lineN_truncated) 118 | batch_len = data_len // batch_size 119 | # print ('the total data length is %d, batch_len is %d\n ' %(data_len, batch_len)) 120 | dataN = tf.reshape(tf_dataN[0 : batch_size * batch_len], [batch_size, batch_len]) 121 | dataP = tf.reshape(tf_dataP[0 : batch_size * batch_len], [batch_size, batch_len]) 122 | dataT_x = tf.reshape(tf_dataT_x[0 : batch_size * batch_len], [batch_size, batch_len]) 123 | dataT_y = tf.reshape(tf_dataT_y[0 : batch_size * batch_len], [batch_size, batch_len]) 124 | 125 | epoch_size = (batch_len - 1) // num_steps # how many batches to complete a epoch 126 | assert epoch_size > 0 127 | i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() 128 | per_start = time.time() 129 | xN = tf.strided_slice(dataN, [0, i * num_steps], 130 | [batch_size, (i + 1) * num_steps]) 131 | xN.set_shape([batch_size, num_steps]) # need to assert all values in x[a,:,1] are the same 132 | yN = tf.strided_slice(dataN, [0, i * num_steps + 1], 133 | [batch_size, (i + 1) * num_steps + 1]) 134 | yN.set_shape([batch_size, num_steps]) 135 | 136 | xT = tf.strided_slice(dataT_x, [0, i * num_steps], 137 | [batch_size, (i + 1) * num_steps]) 138 | xT.set_shape([batch_size, num_steps]) # need to assert all values in x[a,:,1] are the same 139 | yT = tf.strided_slice(dataT_y, [0, i * num_steps + 1], 140 | [batch_size, (i + 1) * num_steps + 1]) 141 | yT.set_shape([batch_size, num_steps]) 142 | 143 | xP = tf.strided_slice(dataP, [0, i * num_steps], 144 | [batch_size, (i + 1) * num_steps]) 145 | xP.set_shape([batch_size, num_steps]) 146 | 147 | eof_indicator = tf.equal(xN[:, num_steps - 1], tf.constant([eof_N_id]*batch_size)) 148 | print('Finish preparing input producer and takes %.2fs' %(time.time()-start_time)) 149 | print('Each produce data takes time %.2f\n' %(time.time()-per_start)) 150 | return xN, yN, xT, yT, epoch_size, eof_indicator, xP 151 | 152 | if __name__ == '__main__': 153 | N_filename = '../pickle_data/JS_non_terminal.pickle' 154 | T_filename = '../pickle_data/JS_terminal_50k_whole.pickle' 155 | 156 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size, train_dataP, valid_dataP \ 157 | = input_data(N_filename, T_filename) 158 | train_data = (train_dataN, train_dataT, train_dataP) 159 | valid_data = (valid_dataN, valid_dataT, valid_dataP) 160 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # N is [w, eof], T is [w, unk, eof] 161 | 162 | input_dataN, targetsN, input_dataT, targetsT, epoch_size, eof_indicator, input_dataP = \ 163 | data_producer(train_data, batch_size=128, num_steps=50, vocab_size=vocab_size, attn_size=attn_size, change_yT=False, name='train', verbose=False) 164 | # input_dataN1, targetsN1, input_dataT1, targetsT1, epoch_size1, eof_indicator1 = \ 165 | # data_producer(valid_data, batch_size=128, num_steps=50, vocab_size=vocab_size, attn_size=attn_size, change_yT=False, name='test', verbose=False) 166 | 167 | labels = tf.reshape(targetsT, [-1]) 168 | eof_id = vocab_size[1] -1 169 | loss_condition = tf.greater(labels, tf.constant(value=eof_id, dtype=tf.int32, shape=labels.shape)) 170 | fetches = { 171 | "labels":labels, 172 | "loss_condition":loss_condition,} 173 | # sess = tf.Session() #there is no graph to run 174 | # vals = sess.run(fetches) 175 | # labels_np = vals["labels"] 176 | # loss_condition_np = vals["loss_condition"] 177 | print('*** Done! ***') -------------------------------------------------------------------------------- /code/reader_pointer_original.py: -------------------------------------------------------------------------------- 1 | # Yue revise it on 08/15 2 | # Add attn_size in input_data, data_producer; add change_yT for indicating whether to remove the location of unk(just label it as unk) 3 | # refactor the code of contructing the long line (def padding_and_concat) 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import collections 10 | from six.moves import cPickle as pickle 11 | import tensorflow as tf 12 | import numpy as np 13 | import time 14 | from collections import Counter 15 | import os 16 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 17 | 18 | def input_data(N_filename, T_filename): 19 | start_time = time.time() 20 | with open(N_filename, 'rb') as f: 21 | print ("reading data from ", N_filename) 22 | save = pickle.load(f) 23 | train_dataN = save['trainData'] 24 | test_dataN = save['testData'] 25 | vocab_sizeN = save['vocab_size'] 26 | print ('the vocab_sizeN is %d (not including the eof)' %vocab_sizeN) 27 | print ('the number of training data is %d' %(len(train_dataN))) 28 | print ('the number of test data is %d\n' %(len(test_dataN))) 29 | 30 | with open(T_filename, 'rb') as f: 31 | print ("reading data from ", T_filename) 32 | save = pickle.load(f) 33 | train_dataT = save['trainData'] 34 | test_dataT = save['testData'] 35 | vocab_sizeT = save['vocab_size'] 36 | attn_size = save['attn_size'] 37 | print ('the vocab_sizeT is %d (not including the unk and eof)' %vocab_sizeT) 38 | print ('the attn_size is %d' %attn_size) 39 | print ('the number of training data is %d' %(len(train_dataT))) 40 | print ('the number of test data is %d' %(len(test_dataT))) 41 | print ('Finish reading data and take %.2f\n'%(time.time()-start_time)) 42 | 43 | return train_dataN, test_dataN, vocab_sizeN, train_dataT, test_dataT, vocab_sizeT, attn_size 44 | 45 | 46 | def data_producer(raw_data, batch_size, num_steps, vocab_size, attn_size, change_yT=False, name=None, verbose=False): 47 | 48 | start_time = time.time() 49 | 50 | with tf.name_scope(name, "DataProducer", [raw_data, batch_size, num_steps, vocab_size]): 51 | (raw_dataN, raw_dataT) = raw_data 52 | assert len(raw_dataN) == len(raw_dataT) 53 | 54 | (vocab_sizeN, vocab_sizeT) = vocab_size 55 | eof_N_id = vocab_sizeN - 1 56 | eof_T_id = vocab_sizeT - 1 57 | unk_id = vocab_sizeT - 2 58 | 59 | def padding_and_concat(data, width, pad_id): 60 | #the size of data: a list of list. This function will pad the data according to width 61 | long_line = list() 62 | for line in data: 63 | pad_len = width - (len(line) % width) 64 | new_line = line + [pad_id] * pad_len 65 | assert len(new_line) % width == 0 66 | long_line += new_line 67 | return long_line 68 | 69 | pad_start = time.time() 70 | long_lineN = padding_and_concat(raw_dataN, num_steps, pad_id=eof_N_id) 71 | long_lineT = padding_and_concat(raw_dataT, num_steps, pad_id=eof_T_id) 72 | assert len(long_lineN) == len(long_lineT) 73 | print('Pading two long lines and take %.2fs'%(time.time()-pad_start)) 74 | 75 | # print statistics for long_lineT 76 | if verbose: 77 | print('Start counting the statistics of T!!') 78 | verbose_start = time.time() 79 | cnt_T = Counter(long_lineT) 80 | long_lineT_len = len(long_lineT) 81 | empty_cnt = cnt_T[0] 82 | unk_cnt = cnt_T[unk_id] 83 | eof_cnt = cnt_T[eof_T_id] 84 | l_cnt = sum(np.array(long_lineT) > eof_T_id) 85 | w_cnt = long_lineT_len - empty_cnt - unk_cnt - eof_cnt - l_cnt 86 | print('long_lineT_len: %d, empty: %.4f, unk: %.4f, location: %.4f, eof: %.4f, word (except Empty): %.4f'% 87 | (long_lineT_len, float(empty_cnt)/long_lineT_len, float(unk_cnt)/long_lineT_len, 88 | float(l_cnt)/long_lineT_len, float(eof_cnt)/long_lineT_len, float(w_cnt)/long_lineT_len)) 89 | print('the most common 5 of cnt_T', cnt_T.most_common(5)) 90 | print('print verbose information and take %.2fs\n'%(time.time()-verbose_start)) 91 | 92 | temp_len = len(long_lineN) 93 | # print ('\nthe original data length is %d' %temp_len) 94 | n = temp_len // (batch_size * num_steps) 95 | long_lineN_truncated = np.array(long_lineN[0 : n * (batch_size * num_steps)]) 96 | long_lineT_truncated_x = np.array(long_lineT[0 : n * (batch_size * num_steps)]) 97 | long_lineT_truncated_y = np.array(long_lineT[0 : n * (batch_size * num_steps)]) 98 | 99 | location_index = long_lineT_truncated_x > eof_T_id 100 | 101 | long_lineT_truncated_x[location_index] = unk_id 102 | if change_yT: 103 | long_lineT_truncated_y[location_index] = unk_id 104 | 105 | # print('count of greater than eof', sum(long_lineT_truncated_y > eof_T_id)) 106 | 107 | tf_dataN = tf.convert_to_tensor(long_lineN_truncated, name="raw_dataN", dtype=tf.int32) 108 | tf_dataT_x = tf.convert_to_tensor(long_lineT_truncated_x, name="raw_dataT_x", dtype=tf.int32) 109 | tf_dataT_y = tf.convert_to_tensor(long_lineT_truncated_y, name="raw_dataT_y", dtype=tf.int32) 110 | 111 | data_len = len(long_lineN_truncated) 112 | batch_len = data_len // batch_size 113 | # print ('the total data length is %d, batch_len is %d\n ' %(data_len, batch_len)) 114 | dataN = tf.reshape(tf_dataN[0 : batch_size * batch_len], [batch_size, batch_len]) 115 | dataT_x = tf.reshape(tf_dataT_x[0 : batch_size * batch_len], [batch_size, batch_len]) 116 | dataT_y = tf.reshape(tf_dataT_y[0 : batch_size * batch_len], [batch_size, batch_len]) 117 | 118 | epoch_size = (batch_len - 1) // num_steps 119 | assert epoch_size > 0 120 | i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue() 121 | per_start = time.time() 122 | xN = tf.strided_slice(dataN, [0, i * num_steps], 123 | [batch_size, (i + 1) * num_steps]) 124 | xN.set_shape([batch_size, num_steps]) # need to assert all values in x[a,:,1] are the same 125 | yN = tf.strided_slice(dataN, [0, i * num_steps + 1], 126 | [batch_size, (i + 1) * num_steps + 1]) 127 | yN.set_shape([batch_size, num_steps]) 128 | 129 | xT = tf.strided_slice(dataT_x, [0, i * num_steps], 130 | [batch_size, (i + 1) * num_steps]) 131 | xT.set_shape([batch_size, num_steps]) # need to assert all values in x[a,:,1] are the same 132 | yT = tf.strided_slice(dataT_y, [0, i * num_steps + 1], 133 | [batch_size, (i + 1) * num_steps + 1]) 134 | yT.set_shape([batch_size, num_steps]) 135 | 136 | eof_indicator = tf.equal(xN[:, num_steps - 1], tf.constant([eof_N_id]*batch_size)) 137 | print('Finish preparing input producer and takes %.2fs' %(time.time()-start_time)) 138 | print('Each produce data takes time %.2f\n' %(time.time()-per_start)) 139 | return xN, yN, xT, yT, epoch_size, eof_indicator 140 | 141 | if __name__ == '__main__': 142 | N_filename = '../pickle_data/JS_non_terminal.pickle' 143 | T_filename = '../pickle_data/JS_terminal_50k_whole.pickle' 144 | 145 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size = input_data(N_filename, T_filename) 146 | train_data = (train_dataN, train_dataT) 147 | valid_data = (valid_dataN, valid_dataT) 148 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # N is [w, eof], T is [w, unk, eof] 149 | input_dataN, targetsN, input_dataT, targetsT, epoch_size, eof_indicator = \ 150 | data_producer(train_data, batch_size=128, num_steps=50, vocab_size=vocab_size, attn_size=attn_size, change_yT=False, name='train', verbose=False) 151 | input_dataN1, targetsN1, input_dataT1, targetsT1, epoch_size1, eof_indicator1 = \ 152 | data_producer(valid_data, batch_size=128, num_steps=50, vocab_size=vocab_size, attn_size=attn_size, change_yT=False, name='test', verbose=False) 153 | 154 | labels = tf.reshape(targetsT, [-1]) 155 | eof_id = vocab_size[1] -1 156 | loss_condition = tf.greater(labels, tf.constant(value=eof_id, dtype=tf.int32, shape=labels.shape)) 157 | fetches = { 158 | "labels":labels, 159 | "loss_condition":loss_condition,} 160 | # sess = tf.Session() 161 | # vals = sess.run(fetches) 162 | # labels_np = vals["labels"] 163 | # loss_condition_np = vals["loss_condition"] 164 | print('*** Done! ***') -------------------------------------------------------------------------------- /code/vanillaLSTM.py: -------------------------------------------------------------------------------- 1 | # vanilla LSTM, count all unk as wrong 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import inspect 8 | import time 9 | 10 | import numpy as np 11 | import tensorflow as tf 12 | 13 | import reader_pointer_original as reader 14 | import os 15 | os.environ['TF_CPP_MIN_LOG_LEVEL']='2' 16 | 17 | os.environ['CUDA_VISIBLE_DEVICES']='0' 18 | outfile = 'output_vanilla.txt' 19 | 20 | N_filename = '../pickle_data/JS_non_terminal.pickle' 21 | T_filename = '../pickle_data/JS_terminal_5k_whole.pickle' 22 | 23 | flags = tf.flags 24 | flags.DEFINE_string("save_path", None, #'./logs/modelT0A' 25 | "Model output directory.") 26 | 27 | flags.DEFINE_string( 28 | "model", "small", 29 | "A type of model. Possible options are: small, medium, best.") 30 | # flags.DEFINE_string("data_path", '../data/dataJS', 31 | # "Where the training/test data is stored.") 32 | flags.DEFINE_bool("use_fp16", False, 33 | "Train using 16-bit floats instead of 32bit floats") 34 | 35 | FLAGS = flags.FLAGS 36 | logging = tf.logging 37 | 38 | if FLAGS.model == "test": 39 | outfile = 'TESToutput.txt' 40 | def data_type(): 41 | return tf.float16 if FLAGS.use_fp16 else tf.float32 42 | 43 | class SmallConfig(object): 44 | """Small config. get best result as 0.733 """ 45 | init_scale = 0.05 46 | learning_rate = 0.001 47 | max_grad_norm = 5 48 | num_layers = 1#1 49 | num_steps = 50 50 | attn_size = 50 51 | hidden_sizeN = 300 52 | hidden_sizeT = 500 53 | sizeH = 800 54 | max_epoch = 1#8 55 | max_max_epoch = 8#79 56 | keep_prob = 1.0#1.0 57 | lr_decay = 0.6#0.95 58 | batch_size = 64#80 59 | vocab_size = 95, 50001 60 | 61 | class TestConfig(object): 62 | """Tiny config, for testing.""" 63 | init_scale = 0.05 64 | learning_rate = 0.001 65 | max_grad_norm = 5 66 | num_layers = 1 67 | num_steps = 50 68 | attn_size = 50 69 | hidden_sizeN = 300 70 | hidden_sizeT = 500 71 | sizeH = 800 72 | max_epoch = 1 73 | max_max_epoch = 1 74 | keep_prob = 1.0 75 | lr_decay = 0.6 76 | batch_size = 80 77 | vocab_size = 95, 50001 78 | 79 | 80 | def get_config(): 81 | if FLAGS.model == "small": 82 | return SmallConfig() 83 | elif FLAGS.model == "medium": 84 | return MediumConfig() 85 | elif FLAGS.model == "best": 86 | return BestConfig() 87 | elif FLAGS.model == "test": 88 | return TestConfig() 89 | else: 90 | raise ValueError("Invalid model: %s", FLAGS.model) 91 | 92 | 93 | class PTBInput(object): 94 | """The input data.""" 95 | 96 | def __init__(self, config, data, name=None): 97 | self.batch_size = batch_size = config.batch_size 98 | self.attn_size = attn_size = config.attn_size 99 | self.num_steps = num_steps = config.num_steps 100 | self.input_dataN, self.targetsN, self.input_dataT, self.targetsT, self.epoch_size, self.eof_indicator = \ 101 | reader.data_producer(data, batch_size, num_steps, config.vocab_size, config.attn_size, change_yT=True, name=name) 102 | if FLAGS.model == "test": 103 | self.epoch_size = 16 #small epoch size for test 104 | 105 | 106 | class PTBModel(object): 107 | 108 | def __init__(self, is_training, config, input_): 109 | self._input = input_ 110 | self.attn_size = attn_size = config.attn_size 111 | batch_size = input_.batch_size 112 | num_steps = input_.num_steps 113 | self.sizeN = sizeN = config.hidden_sizeN 114 | self.sizeT = sizeT = config.hidden_sizeT 115 | self.size = size = config.sizeH 116 | vocab_sizeN, vocab_sizeT = config.vocab_size 117 | 118 | # Slightly better results can be obtained with forget gate biases 119 | # initialized to 1 but the hyperparameters of the model would need to be 120 | # different than reported in the paper. 121 | def lstm_cell(): 122 | if 'reuse' in inspect.getargspec( 123 | tf.contrib.rnn.BasicLSTMCell.__init__).args: 124 | return tf.contrib.rnn.BasicLSTMCell( 125 | size, forget_bias=1.0, state_is_tuple=True, 126 | reuse=tf.get_variable_scope().reuse) 127 | else: 128 | return tf.contrib.rnn.BasicLSTMCell( 129 | size, forget_bias=1.0, state_is_tuple=True) 130 | attn_cell = lstm_cell 131 | if is_training and config.keep_prob < 1: 132 | def attn_cell(): 133 | return tf.contrib.rnn.DropoutWrapper( 134 | lstm_cell(), output_keep_prob=config.keep_prob) 135 | cell = tf.contrib.rnn.MultiRNNCell( 136 | [attn_cell() for _ in range(config.num_layers)], state_is_tuple=True) 137 | 138 | state_variables = [] 139 | with tf.variable_scope("myCH0"): 140 | for i, (state_c, state_h) in enumerate(cell.zero_state(batch_size, data_type())): 141 | if i > 0: tf.get_variable_scope().reuse_variables() 142 | myC0 = tf.get_variable("myC0", state_c.shape[1], initializer=tf.zeros_initializer()) 143 | myH0 = tf.get_variable("myH0", state_h.shape[1], initializer=tf.zeros_initializer()) 144 | myC0_tensor = tf.convert_to_tensor([myC0 for _ in range(batch_size)]) 145 | myH0_tensor = tf.convert_to_tensor([myH0 for _ in range(batch_size)]) 146 | state_variables.append(tf.contrib.rnn.LSTMStateTuple(myC0_tensor, myH0_tensor)) 147 | 148 | self._initial_state = state_variables 149 | 150 | self.eof_indicator = input_.eof_indicator 151 | 152 | with tf.device("/cpu:0"): 153 | embeddingN = tf.get_variable( 154 | "embeddingN", [vocab_sizeN, sizeN], dtype=data_type()) 155 | inputsN = tf.nn.embedding_lookup(embeddingN, input_.input_dataN) 156 | 157 | with tf.device("/cpu:0"): 158 | embeddingT = tf.get_variable( 159 | "embeddingT", [vocab_sizeT, sizeT], dtype=data_type()) 160 | inputsT = tf.nn.embedding_lookup(embeddingT, input_.input_dataT) 161 | 162 | inputs = tf.concat([inputsN, inputsT], 2) 163 | #inputs = tf.one_hot(input_.input_data, vocab_size) 164 | if is_training and config.keep_prob < 1: 165 | inputs = tf.nn.dropout(inputs, config.keep_prob) 166 | 167 | outputs = [] 168 | attentions = [] 169 | state = self._initial_state 170 | # self.memory = tf.placeholder(dtype=data_type(), shape=[batch_size, num_steps, size], name="memory") 171 | # valid_memory = self.memory[:,-attn_size:,:] 172 | # print ("test test test,, state shape", np.array(state).shape) 173 | with tf.variable_scope("RNN"): 174 | for time_step in range(num_steps): 175 | if time_step > 0: tf.get_variable_scope().reuse_variables() 176 | (cell_output, state) = cell(inputs[:, time_step, :], state) 177 | outputs.append(cell_output) 178 | 179 | # wm = tf.get_variable("wm", [size, size], dtype=data_type()) 180 | # wh = tf.get_variable("wh", [size, size], dtype=data_type()) 181 | # wt = tf.get_variable("wt", [size, 1], dtype=data_type()) 182 | # gt = tf.tanh(tf.matmul(tf.reshape(valid_memory, [-1, size]), wm) + tf.reshape(tf.tile(tf.matmul(cell_output, wh),[1, attn_size]), [-1, size])) 183 | # alpha = tf.nn.softmax(tf.reshape(tf.matmul(gt, wt), [-1,attn_size])) 184 | # ct = tf.squeeze(tf.matmul(tf.transpose(valid_memory, [0, 2, 1]), tf.reshape(alpha, [-1, attn_size, 1]))) 185 | # attentions.append(ct) 186 | # valid_memory = tf.concat([valid_memory[:,1:,:], tf.expand_dims(cell_output, axis=1)], axis=1) 187 | 188 | output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size]) 189 | # attention = tf.reshape(tf.stack(axis=1, values=attentions), [-1, size]) 190 | self.output = tf.reshape(output, [-1, num_steps, size]) #to record the memory for next batch 191 | # wa = tf.get_variable("wa", [size*2, size], dtype=data_type()) 192 | # nt = tf.tanh(tf.matmul(tf.concat([output, attention], axis=1), wa)) 193 | 194 | softmax_w = tf.get_variable("softmax_w", [size, vocab_sizeT], dtype=data_type()) 195 | softmax_b = tf.get_variable("softmax_b", [vocab_sizeT], dtype=data_type()) 196 | logits = tf.matmul(output, softmax_w) + softmax_b 197 | labels = tf.reshape(input_.targetsT, [-1]) 198 | weights = tf.ones([batch_size * num_steps], dtype=data_type()) 199 | 200 | #counting unk as wrong 201 | unk_id = vocab_sizeT - 2 202 | unk_tf = tf.constant(value=unk_id, dtype=tf.int32, shape=labels.shape) 203 | zero_weights = tf.zeros_like(labels, dtype=data_type()) 204 | wrong_label = tf.constant(value=-1, dtype=tf.int32, shape=labels.shape) 205 | condition_tf = tf.equal(labels, unk_tf) 206 | new_weights = tf.where(condition_tf, zero_weights, weights) 207 | new_labels = tf.where(condition_tf, wrong_label, labels) 208 | 209 | 210 | loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [labels], [new_weights]) 211 | probs = tf.nn.softmax(logits) 212 | correct_prediction = tf.equal(tf.cast(tf.argmax(probs, 1), dtype = tf.int32), new_labels) 213 | self._accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 214 | 215 | self._cost = cost = tf.reduce_sum(loss) / batch_size 216 | self._final_state = state 217 | 218 | if not is_training: 219 | return 220 | 221 | self._lr = tf.Variable(0.0, trainable=False) 222 | tvars = tf.trainable_variables() 223 | print ('tvars', len(tvars)) 224 | grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 225 | config.max_grad_norm) 226 | print ('*******the length', len(grads)) 227 | optimizer = tf.train.AdamOptimizer(self._lr) 228 | self._train_op = optimizer.apply_gradients( 229 | zip(grads, tvars), 230 | global_step=tf.contrib.framework.get_or_create_global_step()) 231 | 232 | self._new_lr = tf.placeholder( 233 | tf.float32, shape=[], name="new_learning_rate") 234 | self._lr_update = tf.assign(self._lr, self._new_lr) 235 | 236 | def assign_lr(self, session, lr_value): 237 | session.run(self._lr_update, feed_dict={self._new_lr: lr_value}) 238 | 239 | @property 240 | def input(self): 241 | return self._input 242 | 243 | @property 244 | def initial_state(self): 245 | return self._initial_state 246 | 247 | @property 248 | def cost(self): 249 | return self._cost 250 | 251 | @property 252 | def final_state(self): 253 | return self._final_state 254 | 255 | @property 256 | def accuracy(self): 257 | return self._accuracy 258 | 259 | @property 260 | def lr(self): 261 | return self._lr 262 | 263 | @property 264 | def train_op(self): 265 | return self._train_op 266 | 267 | 268 | def run_epoch(session, model, eval_op=None, verbose=False): 269 | """Runs the model on the given data.""" 270 | start_time = time.time() 271 | costs = 0.0 272 | accuracy_list = [] 273 | iters = 0 274 | state = session.run(model.initial_state) 275 | # print ('at the very initial of the run_epoch\n', state[0].c) 276 | eof_indicator = np.ones((model.input.batch_size), dtype=bool) 277 | # memory = np.zeros([model.input.batch_size, model.input.num_steps, model.size]) 278 | # file_id = session.run(model.initial_file_id) #need to remove _ 279 | 280 | fetches = { 281 | "cost": model.cost, 282 | "accuracy": model.accuracy, 283 | "final_state": model.final_state, 284 | "eof_indicator": model.eof_indicator, 285 | # "memory":model.output, 286 | } 287 | if eval_op is not None: 288 | fetches["eval_op"] = eval_op 289 | 290 | for step in range(model.input.epoch_size): 291 | feed_dict = {} 292 | # current_file_id = file_id #session.run(model.file_id) 293 | sub_cond = np.expand_dims(eof_indicator, axis = 1) 294 | condition = np.repeat(sub_cond, model.size, axis = 1) 295 | # zero_state = np.zeros_like(condition) 296 | # zero_state = np.random.uniform(-0.05,0.05,condition.shape) 297 | zero_state = session.run(model.initial_state) 298 | 299 | for i, (c, h) in enumerate(model.initial_state): 300 | assert condition.shape == state[i].c.shape 301 | feed_dict[c] = np.where(condition, zero_state[i][0], state[i].c) 302 | feed_dict[h] = np.where(condition, zero_state[i][1], state[i].h) 303 | 304 | # feed_dict[model.memory] = memory 305 | vals = session.run(fetches, feed_dict) 306 | 307 | cost = vals["cost"] 308 | accuracy = vals["accuracy"] 309 | eof_indicator = vals["eof_indicator"] 310 | state = vals["final_state"] #use the final state as the initial state within a whole epoch 311 | # memory = vals["memory"] 312 | 313 | accuracy_list.append(accuracy) 314 | costs += cost 315 | iters += model.input.num_steps 316 | 317 | if verbose and step % (model.input.epoch_size // 10) == 10: 318 | print("%.3f perplexity: %.3f accuracy: %.4f speed: %.0f wps" % 319 | (step * 1.0 / model.input.epoch_size, np.exp(costs / iters), np.mean(accuracy_list), 320 | (time.time() - start_time))) 321 | 322 | print ('this run_epoch takes time %.2f' %(time.time() - start_time)) 323 | return np.exp(costs / iters), np.mean(accuracy_list) 324 | 325 | 326 | def main(_): 327 | start_time = time.time() 328 | fout = open(outfile, 'a') 329 | print ('\n', time.asctime(time.localtime()), file=fout) 330 | print ('start a new experiment %s'%outfile, file=fout) 331 | print ('Using dataset %s and %s'%(N_filename, T_filename), file=fout) 332 | 333 | train_dataN, valid_dataN, vocab_sizeN, train_dataT, valid_dataT, vocab_sizeT, attn_size = reader.input_data(N_filename, T_filename) 334 | 335 | train_data = (train_dataN, train_dataT) 336 | valid_data = (valid_dataN, valid_dataT) 337 | vocab_size = (vocab_sizeN+1, vocab_sizeT+2) # plus EOF, N is [w, eof], T is [w, unk, eof] 338 | 339 | config = get_config() 340 | assert attn_size == config.attn_size #make sure the attn_size used in generate terminal is the same as the configuration 341 | config.vocab_size = vocab_size 342 | eval_config = get_config() 343 | eval_config.batch_size = config.batch_size * config.num_steps 344 | eval_config.num_steps = 1 345 | eval_config.vocab_size = vocab_size 346 | 347 | with tf.Graph().as_default(): 348 | initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale) 349 | 350 | with tf.name_scope("Train"): 351 | train_input = PTBInput(config=config, data=train_data, name="TrainInput") 352 | with tf.variable_scope("Model", reuse=None, initializer=initializer): 353 | m = PTBModel(is_training=True, config=config, input_=train_input) 354 | 355 | with tf.name_scope("Valid"): 356 | valid_input = PTBInput(config=config, data=valid_data, name="ValidInput") 357 | with tf.variable_scope("Model", reuse=True, initializer=initializer): 358 | mvalid = PTBModel(is_training=False, config=config, input_=valid_input) 359 | 360 | # with tf.name_scope("Test"): 361 | # test_input = PTBInput(config=eval_config, data=valid_data, name="TestInput") 362 | # with tf.variable_scope("Model", reuse=True, initializer=initializer): 363 | # mtest = PTBModel(is_training=False, config=eval_config, 364 | # input_=test_input) 365 | 366 | 367 | print ('total trainable variables', len(tf.trainable_variables()), '\n\n') 368 | max_valid = 0 369 | max_step = 0 370 | saver = tf.train.Saver() 371 | 372 | sv = tf.train.Supervisor(logdir=None, summary_op=None) 373 | with sv.managed_session() as session: 374 | 375 | for i in range(config.max_max_epoch): 376 | lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0) 377 | m.assign_lr(session, config.learning_rate * lr_decay) 378 | print (outfile, "Epoch: %d Learning rate: %.6f" % (i + 1, session.run(m.lr))) 379 | 380 | train_perplexity, train_accuracy = run_epoch(session, m, eval_op=m.train_op, verbose=True) 381 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy)) 382 | print("Epoch: %d Train Perplexity: %.3f Train Accuracy: %.3f" % (i + 1, train_perplexity, train_accuracy), file=fout) 383 | 384 | if i > 5: 385 | valid_perplexity, valid_accuracy = run_epoch(session, mvalid) 386 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy)) 387 | print("Epoch: %d Valid Perplexity: ~~%.3f Valid Accuracy: %.3f~" % (i + 1, valid_perplexity, valid_accuracy), file=fout) 388 | if valid_accuracy > max_valid: 389 | max_valid = valid_accuracy 390 | max_step = i + 1 391 | 392 | # test_perplexity, test_accuracy = run_epoch(session, mtest) 393 | # print("\nTest Perplexity: %.3f Test Accuracy: %.3f" % (test_perplexity, test_accuracy)) 394 | 395 | print ('max step %d, max valid %.3f' %(max_step, max_valid)) 396 | # print ('data path is', FLAGS.data_path) 397 | print ('total time takes', time.time()-start_time) 398 | print ('max step %d, max valid %.3f' %(max_step, max_valid), file=fout) 399 | print ('total time takes', time.time()-start_time, file=fout) 400 | fout.close() 401 | 402 | # if FLAGS.save_path: 403 | # print("Saving model to %s." % FLAGS.save_path) 404 | # save_path = saver.save(session, FLAGS.save_path, write_meta_graph=False, write_state=False) 405 | 406 | 407 | if __name__ == "__main__": 408 | tf.app.run() 409 | -------------------------------------------------------------------------------- /preprocess_code/freq_dict.py: -------------------------------------------------------------------------------- 1 | #freq_dict: each terminal's frequency; terminal_num: a set about all the terminals. 2 | 3 | import numpy as np 4 | from six.moves import cPickle as pickle 5 | import json 6 | from collections import Counter 7 | import time 8 | 9 | #attention line 28: for python dataset, not exclude the last one 10 | train_filename = '../json_data/programs_training.json' 11 | test_filename = '../json_data/programs_eval.json' 12 | target_filename = '../pickle_data/freq_dict_JS.pickle' 13 | 14 | freq_dict = Counter() 15 | terminal_num = set() 16 | terminal_num.add('EmptY') 17 | 18 | def process(filename): 19 | with open(filename, encoding='latin-1') as lines: 20 | print ('Start procesing %s !!!'%(filename)) 21 | line_index = 0 22 | for line in lines: 23 | line_index += 1 24 | if line_index % 1000 == 0: 25 | print ('Processing line:', line_index) 26 | data = json.loads(line) 27 | if len(data) < 3e4: 28 | for i, dic in enumerate(data[:-1]): #JS data[:-1] or PY data 29 | if 'value' in dic.keys(): 30 | terminal_num.add(dic['value']) 31 | freq_dict[dic['value']] += 1 32 | else: 33 | freq_dict['EmptY'] += 1 34 | 35 | def save(filename): 36 | with open(filename, 'wb') as f: 37 | save = {'freq_dict': freq_dict,'terminal_num': terminal_num} 38 | pickle.dump(save, f, protocol=2) 39 | 40 | 41 | if __name__ == '__main__': 42 | start_time = time.time() 43 | process(train_filename) 44 | process(test_filename) 45 | save(target_filename) 46 | print(freq_dict['EmptY'], freq_dict['Empty'], freq_dict['empty'], freq_dict['EMPTY']) 47 | print('Finishing generating freq_dict and takes %.2f'%(time.time() - start_time)) 48 | 49 | 50 | -------------------------------------------------------------------------------- /preprocess_code/get_non_terminal.py: -------------------------------------------------------------------------------- 1 | # rewrite on 2018/1/8 by xxx, add parent 2 | 3 | import numpy as np 4 | from six.moves import cPickle as pickle 5 | import json 6 | import time 7 | from collections import Counter, defaultdict 8 | 9 | #attention line 42: for python dataset, not exclude the last one 10 | train_filename = '../json_data/programs_training.json' 11 | test_filename = '../json_data/programs_eval.json' 12 | target_filename = '../pickle_data/JS_non_terminal_small.pickle' 13 | 14 | # global variables 15 | typeDict = dict() #map N's name into its original ID(before expanding into 4*base_ID) 16 | numID = set() #the set to include all sparse ID 17 | no_empty_set = set() 18 | typeList = list() #the set to include all Types 19 | numType = 0 20 | dicID = dict() #map sparse id to dense id (remove empty id inside 4*base_ID) 21 | 22 | def process(filename): 23 | with open(filename, encoding='latin-1') as lines: 24 | print ('Start procesing %s !!!'%(filename)) 25 | line_index = 0 26 | corpus_N = list() 27 | corpus_parent = list() 28 | 29 | for line in lines: 30 | line_index += 1 31 | if line_index % 1000 == 0: 32 | print ('Processing line: ', line_index) 33 | data = json.loads(line) 34 | line_N = list() 35 | has_sibling = Counter() 36 | parent_counter = defaultdict(lambda: 1) #default parent is previous 1 37 | parent_list = list() 38 | 39 | if len(data) >= 3e4: 40 | continue 41 | 42 | for i, dic in enumerate(data[:-1]): #JS data[:-1] or PY data 43 | typeName = dic['type'] 44 | if typeName in typeList: 45 | base_ID = typeDict[typeName] 46 | else: 47 | typeList.append(typeName) 48 | global numType 49 | typeDict[typeName] = numType 50 | base_ID = numType 51 | numType = numType + 1 52 | 53 | #expand the ID into the range of 4*base_ID, according to whether it has sibling or children. Sibling information is got by the ancestor's children information 54 | if 'children' in dic.keys(): 55 | if has_sibling[i]: 56 | ID = base_ID * 4 + 3 57 | else: 58 | ID = base_ID * 4 + 2 59 | 60 | childs = dic['children'] 61 | for j in childs: 62 | parent_counter[j] = j-i 63 | 64 | if len(childs) > 1: 65 | for j in childs: 66 | has_sibling[j] = 1 67 | else: 68 | if has_sibling[i]: 69 | ID = base_ID * 4 + 1 70 | else: 71 | ID = base_ID * 4 72 | #recording the N which has non-empty T 73 | if 'value' in dic.keys(): 74 | no_empty_set.add(ID) 75 | 76 | line_N.append(ID) 77 | parent_list.append(parent_counter[i]) 78 | numID.add(ID) 79 | 80 | corpus_N.append(line_N) 81 | corpus_parent.append(parent_list) 82 | return corpus_N, corpus_parent 83 | 84 | 85 | 86 | def map_dense_id(data): 87 | result = list() 88 | for line_id in data: 89 | line_new_id = list() 90 | for i in line_id: 91 | if i in dicID.keys(): 92 | line_new_id.append(dicID[i]) 93 | else: 94 | dicID[i] = len(dicID) 95 | line_new_id.append(dicID[i]) 96 | result.append(line_new_id) 97 | return result 98 | 99 | 100 | def save(filename, typeDict, numType, dicID, vocab_size, trainData, testData, trainParent, testParent, empty_set_dense): 101 | with open(filename, 'wb') as f: 102 | save = { 103 | # 'typeDict': typeDict, 104 | # 'numType': numType, 105 | # 'dicID': dicID, 106 | 'vocab_size': vocab_size, 107 | 'trainData': trainData, 108 | 'testData': testData, 109 | 'trainParent': trainParent, 110 | 'testParent': testParent, 111 | # 'typeOnlyHasEmptyValue': empty_set_dense, 112 | } 113 | pickle.dump(save, f, protocol=2) 114 | 115 | if __name__ == '__main__': 116 | start_time = time.time() 117 | trainData, trainParent = process(train_filename) 118 | testData, testParent = process(test_filename) 119 | trainData = map_dense_id(trainData) 120 | testData = map_dense_id(testData) 121 | vocab_size = len(numID) 122 | assert len(dicID) == vocab_size 123 | 124 | #for print the N which can only has empty T 125 | assert no_empty_set.issubset(numID) 126 | empty_set = numID.difference(no_empty_set) 127 | empty_set_dense = set() 128 | # print(dicID) 129 | for i in empty_set: 130 | empty_set_dense.add(dicID[i]) 131 | print('The N set that can only has empty terminals: ',len(empty_set_dense), empty_set_dense) 132 | print('The vocaburary:', vocab_size, numID) 133 | 134 | 135 | save(target_filename, typeDict, numType, dicID, vocab_size, trainData, testData, trainParent, testParent,empty_set_dense) 136 | print('Finishing generating terminals and takes %.2fs'%(time.time() - start_time)) -------------------------------------------------------------------------------- /preprocess_code/get_terminal_dict.py: -------------------------------------------------------------------------------- 1 | #sort the freq_dict and get the terminal_dict for top terminals (include EmptY) 2 | 3 | import time 4 | from six.moves import cPickle as pickle 5 | import json 6 | from collections import Counter 7 | import operator 8 | 9 | vocab_size = 10000 10 | total_length = 92758587 # JS: 160143814, PY 92758587 11 | freq_dict_filename = '../pickle_data/freq_dict_PY.pickle' 12 | target_filename = '../pickle_data/terminal_dict_10k_PY.pickle' 13 | 14 | def restore_freq_dict(filename): 15 | with open(filename, 'rb') as f: 16 | save = pickle.load(f) 17 | freq_dict = save['freq_dict'] 18 | terminal_num = save['terminal_num'] 19 | return freq_dict, terminal_num 20 | 21 | def get_terminal_dict(vocab_size, freq_dict, verbose=False): 22 | terminal_dict = dict() 23 | sorted_freq_dict = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True) 24 | if verbose == True: 25 | for i in range(100): 26 | print ('the %d frequent terminal: %s, its frequency: %.5f'%(i, sorted_freq_dict[i][0], float(sorted_freq_dict[i][1])/total_length)) 27 | new_freq_dict = sorted_freq_dict[:vocab_size] 28 | for i, (terminal, frequent) in enumerate(new_freq_dict): 29 | terminal_dict[terminal] = i 30 | return terminal_dict, sorted_freq_dict 31 | 32 | def save(filename, terminal_dict, terminal_num, sorted_freq_dict): 33 | with open(filename, 'wb') as f: 34 | save = {'terminal_dict': terminal_dict,'terminal_num': terminal_num, 'vocab_size': vocab_size, 'sorted_freq_dict': sorted_freq_dict,} 35 | pickle.dump(save, f, protocol=2) 36 | 37 | if __name__ == '__main__': 38 | start_time = time.time() 39 | freq_dict, terminal_num = restore_freq_dict(freq_dict_filename) 40 | print(freq_dict['EmptY'], freq_dict['empty']) 41 | terminal_dict, sorted_freq_dict = get_terminal_dict(vocab_size, freq_dict, True) 42 | save(target_filename, terminal_dict, terminal_num, sorted_freq_dict) 43 | print('Finishing generating terminal_dict and takes %.2f'%(time.time() - start_time)) 44 | 45 | -------------------------------------------------------------------------------- /preprocess_code/get_terminal_whole.py: -------------------------------------------------------------------------------- 1 | #According to the terminal_dict you choose (i.e. 5k, 10k, 50k), parse the json file and turn them into ids that are stored in pickle file 2 | #Output just one vector for terminal, the upper part is the word id while the lower part is the location 3 | # 0108 revise the Empty into EmptY, normal to NormaL 4 | # Here attn_size matters 5 | 6 | import numpy as np 7 | from six.moves import cPickle as pickle 8 | import json 9 | from collections import deque 10 | import time 11 | 12 | #attention line 48: for python dataset, not exclude the last one 13 | terminal_dict_filename = '../pickle_data/terminal_dict_10k_PY.pickle' 14 | train_filename = '../json_data/python100k_train.json' 15 | test_filename = '../json_data/python50k_eval.json' 16 | target_filename = '../pickle_data/PY_terminal_10k_whole.pickle' 17 | 18 | 19 | def restore_terminal_dict(filename): 20 | with open(filename, 'rb') as f: 21 | save = pickle.load(f) 22 | terminal_dict = save['terminal_dict'] 23 | terminal_num = save['terminal_num'] 24 | vocab_size = save['vocab_size'] 25 | return terminal_dict, terminal_num, vocab_size #vocab_size is 50k, and also the unk_id 26 | 27 | def process(filename, terminal_dict, unk_id, attn_size, verbose=False, is_train=False): 28 | with open(filename, encoding='latin-1') as lines: 29 | print ('Start procesing %s !!!'%(filename)) 30 | terminal_corpus = list() 31 | attn_que = deque(maxlen=attn_size) 32 | attn_success_total = 0 33 | attn_fail_total = 0 34 | length_total = 0 35 | line_index = 0 36 | for line in lines: 37 | line_index += 1 38 | # if is_train and line_index == 11: 39 | # continue 40 | if line_index % 1000 == 0: 41 | print ('Processing line:', line_index) 42 | data = json.loads(line) 43 | if len(data) < 3e4: 44 | terminal_line = list() 45 | attn_que.clear() # have a new queue for each file 46 | attn_success_cnt = 0 47 | attn_fail_cnt = 0 48 | for i, dic in enumerate(data): ##JS data[:-1] or PY data 49 | if 'value' in dic.keys(): 50 | dic_value = dic['value'] 51 | if dic_value in terminal_dict.keys(): #take long time!!! 52 | terminal_line.append(terminal_dict[dic_value]) 53 | attn_que.append('NormaL') 54 | else: 55 | if dic_value in attn_que: 56 | location_index = [len(attn_que)-ind for ind,x in enumerate(attn_que) if x==dic_value][-1] 57 | location_id = unk_id + 1 + (location_index) 58 | # print('\nattn_success!! its value is ', dic_value) 59 | # print('The current file index: ', line_index, ', the location index', location_index,', the location_id: ', location_id, ',\n the attn_que', attn_que) 60 | terminal_line.append(location_id) 61 | attn_success_cnt += 1 62 | else: 63 | attn_fail_cnt += 1 64 | terminal_line.append(unk_id) 65 | attn_que.append(dic_value) 66 | else: 67 | terminal_line.append(terminal_dict['EmptY']) 68 | attn_que.append('EmptY') 69 | terminal_corpus.append(terminal_line) 70 | attn_success_total += attn_success_cnt 71 | attn_fail_total += attn_fail_cnt 72 | attn_total = attn_success_total + attn_fail_total 73 | length_total += len(data) 74 | # print ('Process line', line_index, 'attn_success_cnt', attn_success_cnt, 'attn_fail_cnt', attn_fail_cnt,'data length', len(data)) 75 | if verbose and line_index % 1000 == 0: 76 | print('\nUntil line %d: attn_success_total: %d, attn_fail_total: %d, success/attn_total: %.4f, length_total: %d, attn_success percentage: %.4f, total unk percentage: %.4f\n'% 77 | (line_index, attn_success_total, attn_fail_total, float(attn_success_total)/attn_total, length_total, 78 | float(attn_success_total)/length_total, float(attn_total)/length_total)) 79 | with open('output.txt', 'a') as fout: 80 | fout.write('Statistics: attn_success_total: %d, attn_fail_total: %d, success/fail: %.4f, length_total: %d, attn_success percentage: %.4f, total unk percentage: %.4f\n'% 81 | (attn_success_total, attn_fail_total, float(attn_success_total)/attn_fail_total, length_total, 82 | float(attn_success_total)/length_total, float(attn_success_total + attn_fail_total)/length_total)) 83 | 84 | return terminal_corpus 85 | 86 | def save(filename, terminal_dict, terminal_num, vocab_size, attn_size, trainData, testData): 87 | with open(filename, 'wb') as f: 88 | save = {'terminal_dict': terminal_dict, 89 | 'terminal_num': terminal_num, 90 | 'vocab_size': vocab_size, 91 | 'attn_size': attn_size, 92 | 'trainData': trainData, 93 | 'testData': testData, 94 | } 95 | pickle.dump(save, f, protocol=2) 96 | 97 | if __name__ == '__main__': 98 | start_time = time.time() 99 | attn_size = 50 100 | terminal_dict, terminal_num, vocab_size = restore_terminal_dict(terminal_dict_filename) 101 | trainData = process(train_filename, terminal_dict, vocab_size, attn_size=attn_size, verbose=True, is_train=True) 102 | testData = process(test_filename, terminal_dict, vocab_size, attn_size=attn_size, verbose=True, is_train=False) 103 | save(target_filename, terminal_dict, terminal_num, vocab_size, attn_size, trainData, testData) 104 | print('Finishing generating terminals and takes %.2f'%(time.time() - start_time)) -------------------------------------------------------------------------------- /preprocess_code/get_total_length.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | train_filename = '../json_data/programs_training.json' 5 | test_filename = '../json_data/programs_eval.json' 6 | 7 | def process(filename): 8 | with open(filename, encoding='latin-1') as lines: 9 | print ('Start procesing %s !!!'%(filename)) 10 | length = 0 11 | line_index = 0 12 | for line in lines: 13 | line_index += 1 14 | if line_index % 1000 == 0: 15 | print ('Processing line:', line_index) 16 | data = json.loads(line) 17 | if len(data) < 3e4: 18 | length += len(data[:-1]) # total number of AST nodes 19 | return length 20 | 21 | if __name__ == '__main__': 22 | start_time = time.time() 23 | train_len = process(train_filename) 24 | test_len = process(test_filename) 25 | print('total_length is ', train_len + test_len) 26 | print('Finishing counting the length and takes %.2f'%(time.time() - start_time)) -------------------------------------------------------------------------------- /preprocess_code/output.txt: -------------------------------------------------------------------------------- 1 | Statistics: attn_success_total: 61250, attn_fail_total: 131890, success/fail: 0.4644, length_total: 800977, attn_success percentage: 0.0765, total unk percentage: 0.2411 2 | Statistics: attn_success_total: 27749, attn_fail_total: 45969, success/fail: 0.6036, length_total: 336474, attn_success percentage: 0.0825, total unk percentage: 0.2191 3 | Statistics: attn_success_total: 5826053, attn_fail_total: 9457247, success/fail: 0.6160, length_total: 62340693, attn_success percentage: 0.0935, total unk percentage: 0.2452 4 | Statistics: attn_success_total: 2830099, attn_fail_total: 4609149, success/fail: 0.6140, length_total: 30417894, attn_success percentage: 0.0930, total unk percentage: 0.2446 5 | Statistics: attn_success_total: 3239633, attn_fail_total: 6752116, success/fail: 0.4798, length_total: 62340693, attn_success percentage: 0.0520, total unk percentage: 0.1603 6 | Statistics: attn_success_total: 1577885, attn_fail_total: 3284925, success/fail: 0.4803, length_total: 30417894, attn_success percentage: 0.0519, total unk percentage: 0.1599 7 | Statistics: attn_success_total: 2004001, attn_fail_total: 5276976, success/fail: 0.3798, length_total: 62340693, attn_success percentage: 0.0321, total unk percentage: 0.1168 8 | Statistics: attn_success_total: 983407, attn_fail_total: 2565952, success/fail: 0.3833, length_total: 30417894, attn_success percentage: 0.0323, total unk percentage: 0.1167 9 | -------------------------- 10 | Statistics: attn_success_total: 2007407, attn_fail_total: 5285969, success/fail: 0.3798, length_total: 62340693, attn_success percentage: 0.0322, total unk percentage: 0.1170 11 | Statistics: attn_success_total: 985513, attn_fail_total: 2570911, success/fail: 0.3833, length_total: 30417894, attn_success percentage: 0.0324, total unk percentage: 0.1169 #PY_test, attn_size is 50 12 | Statistics: attn_success_total: 1215723, attn_fail_total: 2340701, success/fail: 0.5194, length_total: 30417894, attn_success percentage: 0.0400, total unk percentage: 0.1169 #PY_test, attn_size is 100 13 | Statistics: attn_success_total: 1329858, attn_fail_total: 2226566, success/fail: 0.5973, length_total: 30417894, attn_success percentage: 0.0437, total unk percentage: 0.1169 #PY_test, attn_size is 150 14 | Statistics: attn_success_total: 2079826, attn_fail_total: 5445211, success/fail: 0.3820, length_total: 107104111, attn_success percentage: 0.0194, total unk percentage: 0.0703 15 | Statistics: attn_success_total: 1044410, attn_fail_total: 2673748, success/fail: 0.3906, length_total: 53188270, attn_success percentage: 0.0196, total unk percentage: 0.0699 16 | Statistics: attn_success_total: 2483781, attn_fail_total: 4809595, success/fail: 0.5164, length_total: 62340693, attn_success percentage: 0.0398, total unk percentage: 0.1170 17 | Statistics: attn_success_total: 1215723, attn_fail_total: 2340701, success/fail: 0.5194, length_total: 30417894, attn_success percentage: 0.0400, total unk percentage: 0.1169 18 | Statistics: attn_success_total: 2534934, attn_fail_total: 4990103, success/fail: 0.5080, length_total: 107104111, attn_success percentage: 0.0237, total unk percentage: 0.0703 19 | Statistics: attn_success_total: 1269273, attn_fail_total: 2448885, success/fail: 0.5183, length_total: 53188270, attn_success percentage: 0.0239, total unk percentage: 0.0699 20 | Statistics: attn_success_total: 3960793, attn_fail_total: 7909092, success/fail: 0.5008, length_total: 107104111, attn_success percentage: 0.0370, total unk percentage: 0.1108 21 | Statistics: attn_success_total: 1997242, attn_fail_total: 3899387, success/fail: 0.5122, length_total: 53188270, attn_success percentage: 0.0376, total unk percentage: 0.1109 22 | Statistics: attn_success_total: 8572892, attn_fail_total: 12842395, success/fail: 0.6675, length_total: 107104111, attn_success percentage: 0.0800, total unk percentage: 0.1999 23 | Statistics: attn_success_total: 4296929, attn_fail_total: 6352560, success/fail: 0.6764, length_total: 53188270, attn_success percentage: 0.0808, total unk percentage: 0.2002 24 | Statistics: attn_success_total: 5832257, attn_fail_total: 9473851, success/fail: 0.6156, length_total: 62340693, attn_success percentage: 0.0936, total unk percentage: 0.2455 25 | Statistics: attn_success_total: 2833262, attn_fail_total: 4617843, success/fail: 0.6135, length_total: 30417894, attn_success percentage: 0.0931, total unk percentage: 0.2450 26 | Statistics: attn_success_total: 3244113, attn_fail_total: 6763678, success/fail: 0.4796, length_total: 62340693, attn_success percentage: 0.0520, total unk percentage: 0.1605 27 | Statistics: attn_success_total: 1580175, attn_fail_total: 3290966, success/fail: 0.4802, length_total: 30417894, attn_success percentage: 0.0519, total unk percentage: 0.1601 28 | -------------------------------------------------------------------------------- /preprocess_code/utils.py: -------------------------------------------------------------------------------- 1 | #Utilities for preprocess the data 2 | 3 | import numpy as np 4 | from six.moves import cPickle as pickle 5 | import json 6 | from collections import deque 7 | import time 8 | 9 | 10 | def read_N_pickle(filename): 11 | with open(filename, 'rb') as f: 12 | print ("Reading data from ", filename) 13 | save = pickle.load(f) 14 | train_data = save['trainData'] 15 | test_data = save['testData'] 16 | vocab_size = save['vocab_size'] 17 | print ('the vocab_size is %d' %vocab_size) 18 | print ('the number of training data is %d' %(len(train_data))) 19 | print ('the number of test data is %d' %(len(test_data))) 20 | print ('Finish reading data!!') 21 | return train_data, test_data, vocab_size 22 | 23 | def read_T_pickle(filename): 24 | with open(filename, 'rb') as f: 25 | print ("Reading data from ", filename) 26 | save = pickle.load(f) 27 | train_data = save['trainData'] 28 | test_data = save['testData'] 29 | vocab_size = save['vocab_size'] 30 | attn_size = save['attn_size'] 31 | print ('the vocab_size is %d' %vocab_size) 32 | print ('the attn_size is %d' %attn_size) 33 | print ('the number of training data is %d' %(len(train_data))) 34 | print ('the number of test data is %d' %(len(test_data))) 35 | print ('Finish reading data!!') 36 | return train_data, test_data, vocab_size, attn_size 37 | 38 | 39 | def save(filename, terminal_dict, terminal_num, vocab_size, sorted_freq_dict): 40 | with open(filename, 'wb') as f: 41 | save = {'terminal_dict': terminal_dict,'terminal_num': terminal_num, 'vocab_size': vocab_size, 'sorted_freq_dict': sorted_freq_dict,} 42 | pickle.dump(save, f) 43 | 44 | def change_protocol_for_N(filename): 45 | 46 | f = open(filename, 'rb') 47 | save = pickle.load(f) 48 | typeDict = save['typeDict'] 49 | numType = save['numType'] 50 | dicID = save['dicID'] 51 | vocab_size = save['vocab_size'] 52 | trainData = save['trainData'] 53 | testData = save['testData'] 54 | typeOnlyHasEmptyValue = save['typeOnlyHasEmptyValue'] 55 | f.close() 56 | 57 | f = open(filename, 'wb') 58 | save = { 59 | 'typeDict': typeDict, 60 | 'numType': numType, 61 | 'dicID': dicID, 62 | 'vocab_size': vocab_size, 63 | 'trainData': trainData, 64 | 'testData': testData, 65 | 'typeOnlyHasEmptyValue': typeOnlyHasEmptyValue, 66 | } 67 | pickle.dump(save, f, protocol=2) 68 | f.close() 69 | 70 | 71 | def change_protocol_for_T(filename): 72 | f = open(filename, 'rb') 73 | save = pickle.load(f) 74 | terminal_dict = save['terminal_dict'] 75 | terminal_num = save['terminal_num'] 76 | vocab_size = save['vocab_size'] 77 | attn_size = save['attn_size'] 78 | trainData = save['trainData'] 79 | testData = save['testData'] 80 | f.close() 81 | 82 | f = open(target_filename, 'wb') 83 | save = {'terminal_dict': terminal_dict, 84 | 'terminal_num': terminal_num, 85 | 'vocab_size': vocab_size, 86 | 'attn_size': attn_size, 87 | 'trainData': trainData, 88 | 'testData': testData, 89 | } 90 | pickle.dump(save, f, protocol=2) 91 | f.close() 92 | 93 | if __name__ == '__main__': 94 | 95 | # train_filename = '../json_data/small_programs_training.json' 96 | # test_filename = '../json_data/small_programs_eval.json' 97 | # N_pickle_filename = '../pickle_data/JS_non_terminal.pickle' 98 | # T_pickle_filename = '../pickle_data/JS_terminal_1k.pickle' 99 | filename = '../pickle_data/PY_non_terminal.pickle' 100 | read_N_pickle(filename) 101 | # filename = '../pickle_data/JS_terminal_1k_whole.pickle' 102 | # change_protocol_for_T(filename, target_filename) 103 | 104 | 105 | # N_train_data, N_test_data, N_vocab_size = read_N_pickle(N_pickle_filename) 106 | # T_train_data, T_test_data, T_vocab_size, attn_size = read_T_pickle(T_pickle_filename) 107 | # print(len(N_train_data), len(T_train_data)) 108 | 109 | --------------------------------------------------------------------------------