├── .gitignore ├── test_tree_gru.py ├── test_tree_lstm.py ├── README.md ├── simple_demo.py ├── modulo_demo.py ├── tree_gru.py ├── sentiment.py ├── test_tree_rnn.py ├── tree_lstm.py ├── data_utils.py └── tree_rnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | -------------------------------------------------------------------------------- /test_tree_gru.py: -------------------------------------------------------------------------------- 1 | import tree_gru 2 | import tree_rnn 3 | 4 | import numpy as np 5 | import theano 6 | 7 | 8 | def test(): 9 | # very simple for now... just checks compilation and training step 10 | # TODO: better tests 11 | root = tree_rnn.BinaryNode(0) 12 | c1 = tree_rnn.BinaryNode(1) 13 | cc1 = tree_rnn.BinaryNode(2) 14 | ccc1 = tree_rnn.BinaryNode(3) 15 | cccc1 = tree_rnn.BinaryNode(5) 16 | cccc2 = tree_rnn.BinaryNode(6) 17 | ccc1.add_left(cccc1) 18 | ccc1.add_right(cccc2) 19 | cc1.add_left(ccc1) 20 | c1.add_right(cc1) 21 | root.add_left(c1) 22 | 23 | # check child sum 24 | model = tree_gru.ChildSumTreeGRU(10, 20, 30, 1) 25 | model.train_step(root, np.array([0]).astype(theano.config.floatX)) 26 | 27 | # check n-ary 28 | model = tree_gru.NaryTreeGRU(10, 20, 30, 1) 29 | model.train_step(root, np.array([0]).astype(theano.config.floatX)) 30 | -------------------------------------------------------------------------------- /test_tree_lstm.py: -------------------------------------------------------------------------------- 1 | import tree_lstm 2 | import tree_rnn 3 | 4 | import numpy as np 5 | import theano 6 | 7 | 8 | def test(): 9 | # very simple for now... just checks compilation and training step 10 | # TODO: better tests 11 | root = tree_rnn.BinaryNode(0) 12 | c1 = tree_rnn.BinaryNode(1) 13 | cc1 = tree_rnn.BinaryNode(2) 14 | ccc1 = tree_rnn.BinaryNode(3) 15 | cccc1 = tree_rnn.BinaryNode(5) 16 | cccc2 = tree_rnn.BinaryNode(6) 17 | ccc1.add_left(cccc1) 18 | ccc1.add_right(cccc2) 19 | cc1.add_left(ccc1) 20 | c1.add_right(cc1) 21 | root.add_left(c1) 22 | 23 | # check child sum 24 | model = tree_lstm.ChildSumTreeLSTM(10, 20, 30, 1) 25 | model.train_step(root, np.array([0]).astype(theano.config.floatX)) 26 | 27 | # check n-ary 28 | model = tree_lstm.NaryTreeLSTM(10, 20, 30, 1) 29 | model.train_step(root, np.array([0]).astype(theano.config.floatX)) 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tree_rnn 2 | Theano implementation of Tree RNNs aka Recursive Neural Networks. 3 | 4 | Includes implementation of TreeLSTMs as described in "Improved 5 | Semantic Representations From Tree-Structured Long Short-Term 6 | Memory Networks" by Kai Sheng Tai, Richard Socher, and Christopher 7 | D. Manning. 8 | 9 | Also includes implementation of TreeGRUs derived using similar 10 | methods. 11 | 12 | You may immediately run "dummy" demos via simple_demo.py and 13 | modulo_demo.py. 14 | 15 | Code for evaluation on the Stanford Sentiment Treebank (used by 16 | the paper) is also available in sentiment.py. To run this, you'll 17 | need to download the relevant data. 
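If you just want to see a model run end-to-end without any data, here is a
minimal sketch (it mirrors the included test files; the constructor arguments
are num_emb, emb_dim, hidden_dim, output_dim) that builds a tiny tree and takes
one training step:

    import numpy as np
    import theano
    import tree_rnn
    import tree_lstm

    root = tree_rnn.BinaryNode(0)
    root.add_left(tree_rnn.BinaryNode(1))

    model = tree_lstm.ChildSumTreeLSTM(10, 20, 30, 1)
    loss, pred_y = model.train_step(
        root, np.array([0]).astype(theano.config.floatX))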
18 | 19 | Step-by-step for cloning this repo and getting the sentiment model 20 | running: 21 | 22 | From your shell, run 23 | 24 | git clone https://github.com/ofirnachum/tree_rnn.git 25 | git clone https://github.com/stanfordnlp/treelstm.git 26 | cd treelstm 27 | ./fetch_and_preprocess.sh 28 | 29 | This will download the datasets, the word vectors, and do some 30 | preprocessing on the data. Once this is complete, go into the 31 | tree_rnn directory and start a Python shell. In that shell, 32 | we'll preprocess the word vectors: 33 | 34 | import data_utils 35 | vocab = data_utils.Vocab() 36 | vocab.load('../treelstm/data/sst/vocab-cased.txt') 37 | words, embeddings = \ 38 | data_utils.read_embeddings_into_numpy( 39 | '../treelstm/data/glove/glove.840B.300d.txt', vocab=vocab) 40 | 41 | import numpy as np 42 | np.save('../treelstm/data/words.npy', words) 43 | np.save('../treelstm/data/glove.npy', embeddings) 44 | 45 | After exiting the Python shell, you can run the sentiment training 46 | directly 47 | 48 | python sentiment.py 49 | 50 | The first couple lines of output should be 51 | 52 | train 6920 53 | dev 872 54 | test 1821 55 | num emb 21701 56 | num labels 3 57 | epoch 0 58 | avg loss 16.7419t example 6919 of 6920 59 | dev score 0.586009174312 60 | epoch 1 61 | avg loss 13.8955t example 6919 of 6920 62 | dev score 0.69495412844 63 | epoch 2 64 | avg loss 12.9191t example 6919 of 6920 65 | dev score 0.730504587156 66 | -------------------------------------------------------------------------------- /simple_demo.py: -------------------------------------------------------------------------------- 1 | import tree_rnn 2 | import tree_lstm 3 | import tree_gru 4 | from test_tree_rnn import DummyBinaryRNN 5 | 6 | import theano 7 | import numpy as np 8 | import random 9 | 10 | NUM_EMB = 10 11 | EMB_DIM = 10 12 | HIDDEN_DIM = 10 13 | OUTPUT_DIM = 1 14 | 15 | NUM_ITER = 100000 16 | MAX_DEPTH = 4 17 | 18 | 19 | def get_trainable_model(): 20 | # change this to the model of your choosing 21 | return tree_rnn.TreeRNN(NUM_EMB, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM) 22 | 23 | 24 | def get_groundtruth_model(): 25 | model = DummyBinaryRNN(NUM_EMB, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM) 26 | model.embeddings.set_value( 27 | model.embeddings.get_value() + 28 | np.random.randint(low=-1, high=2, size=(NUM_EMB, EMB_DIM)).astype(theano.config.floatX)) 29 | return model 30 | 31 | 32 | def get_groundtruth_label(root_node, model): 33 | label = model.predict(root_node) 34 | return label.reshape((-1,)) 35 | 36 | 37 | def get_random_binary_tree(min_depth, max_depth, num_vals, child_prob=0.7, _cur_depth=0): 38 | root = tree_rnn.BinaryNode(int(random.random() * num_vals)) 39 | if max_depth <= 1: 40 | return root 41 | 42 | # left child 43 | if _cur_depth < min_depth or random.random() < child_prob: 44 | left_child = get_random_binary_tree(min_depth, max_depth - 1, num_vals, 45 | child_prob=child_prob, 46 | _cur_depth=_cur_depth + 1) 47 | root.add_left(left_child) 48 | 49 | # right child 50 | if _cur_depth < min_depth or random.random() < child_prob: 51 | right_child = get_random_binary_tree(min_depth, max_depth - 1, num_vals, 52 | child_prob=child_prob, 53 | _cur_depth=_cur_depth + 1) 54 | root.add_right(right_child) 55 | 56 | return root 57 | 58 | 59 | def main(num_iter=NUM_ITER): 60 | groundtruth_model = get_groundtruth_model() 61 | trainable_model = get_trainable_model() 62 | 63 | losses = [] 64 | for it in xrange(num_iter): 65 | tree = get_random_binary_tree(1, MAX_DEPTH, NUM_EMB, child_prob=0.7) 66 | label = 
get_groundtruth_label(tree, groundtruth_model) 67 | 68 | loss, pred_y = trainable_model.train_step(tree, label) 69 | losses.append(loss) 70 | 71 | if it % 1000 == 0: 72 | print 'iter', it, ':', np.mean(losses), pred_y, label 73 | losses = [] 74 | 75 | 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /modulo_demo.py: -------------------------------------------------------------------------------- 1 | import tree_rnn 2 | import tree_lstm 3 | import tree_gru 4 | from test_tree_rnn import DummyBinaryRNN 5 | 6 | import theano 7 | import numpy as np 8 | import random 9 | 10 | NUM_EMB = 4 11 | EMB_DIM = 2 12 | HIDDEN_DIM = 2 13 | OUTPUT_DIM = NUM_EMB 14 | 15 | NUM_ITER = 100000 16 | MAX_DEPTH = 4 17 | 18 | 19 | def get_trainable_model(): 20 | # change this to the model of your choosing 21 | model = tree_rnn.TreeRNN(NUM_EMB, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM, 22 | trainable_embeddings=False) 23 | model.embeddings.set_value( 24 | np.arange(NUM_EMB * EMB_DIM).reshape([NUM_EMB, EMB_DIM]). 25 | astype(theano.config.floatX)) 26 | return model 27 | 28 | 29 | def get_groundtruth_model(): 30 | model = DummyBinaryRNN(NUM_EMB, EMB_DIM, HIDDEN_DIM, OUTPUT_DIM) 31 | model.embeddings.set_value( 32 | np.arange(NUM_EMB * EMB_DIM).reshape([NUM_EMB, EMB_DIM]). 33 | astype(theano.config.floatX)) 34 | return model 35 | 36 | 37 | def get_groundtruth_label(root_node, model): 38 | root_emb = model.evaluate(root_node) 39 | label = np.zeros(OUTPUT_DIM).astype(theano.config.floatX) 40 | idx = int(np.sum(root_emb)) if np.all(np.isfinite(root_emb)) else 0 41 | label[idx % OUTPUT_DIM] = 1 42 | return label 43 | 44 | 45 | def get_random_binary_tree(min_depth, max_depth, num_vals, child_prob=0.7, _cur_depth=0): 46 | root = tree_rnn.BinaryNode(int(random.random() * num_vals)) 47 | if max_depth <= 1: 48 | return root 49 | 50 | # left child 51 | if _cur_depth < min_depth or random.random() < child_prob: 52 | left_child = get_random_binary_tree(min_depth, max_depth - 1, num_vals, 53 | child_prob=child_prob, 54 | _cur_depth=_cur_depth + 1) 55 | root.add_left(left_child) 56 | 57 | # right child 58 | if _cur_depth < min_depth or random.random() < child_prob: 59 | right_child = get_random_binary_tree(min_depth, max_depth - 1, num_vals, 60 | child_prob=child_prob, 61 | _cur_depth=_cur_depth + 1) 62 | root.add_right(right_child) 63 | 64 | return root 65 | 66 | 67 | def main(num_iter=NUM_ITER): 68 | groundtruth_model = get_groundtruth_model() 69 | trainable_model = get_trainable_model() 70 | 71 | losses = [] 72 | for it in xrange(num_iter): 73 | tree = get_random_binary_tree(1, MAX_DEPTH, NUM_EMB, child_prob=0.7) 74 | label = get_groundtruth_label(tree, groundtruth_model) 75 | 76 | loss, pred_y = trainable_model.train_step(tree, label) 77 | losses.append(loss) 78 | 79 | if it % 1000 == 0: 80 | print 'iter', it, ':', np.mean(losses), pred_y, label 81 | losses = [] 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | -------------------------------------------------------------------------------- /tree_gru.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Implementation of Tree GRUs, and adaptation of GRU RNNs to trees.""" 2 | 3 | import tree_rnn 4 | 5 | import theano 6 | from theano import tensor as T 7 | 8 | 9 | def _softmax(inp, exists, add_one=False): 10 | """Equivalent to T.nnet.softmax, but allowing for ignoring some columns. 11 | 12 | Also works on rows rather than columns. 
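    With add_one=True an extra 1 enters the normalizing sum, so the
    outputs sum to less than one; ChildSumTreeGRU uses this for the
    update gate z, which leaves a (1 - z.sum()) share of the weight
    for the candidate state h_hat.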
13 | 
14 |     """
15 |     inp = inp * exists.dimshuffle(0, 'x')
16 |     e_inp = T.exp(inp - inp.max(axis=0, keepdims=True)) * exists.dimshuffle(0, 'x')
17 |     if add_one:
18 |         return e_inp / (1 + e_inp.sum(axis=0, keepdims=True))
19 |     else:
20 |         return e_inp / e_inp.sum(axis=0, keepdims=True)
21 | 
22 | 
23 | class ChildSumTreeGRU(tree_rnn.TreeRNN):
24 |     def create_recursive_unit(self):
25 |         self.W_z = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim]))
26 |         self.U_z = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim]))
27 |         self.W_r = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim]))
28 |         self.U_r = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim]))
29 |         self.W_h = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim]))
30 |         self.U_h = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim]))
31 |         self.params.extend([
32 |             self.W_z, self.U_z,
33 |             self.W_r, self.U_r,
34 |             self.W_h, self.U_h])
35 | 
36 |         def unit(parent_x, child_h, child_exists):
37 |             z = _softmax(
38 |                 (T.dot(self.W_z, parent_x).dimshuffle('x', 0) +
39 |                  T.dot(child_h, self.U_z.T)),
40 |                 child_exists, add_one=True)
41 |             r = _softmax(
42 |                 (T.dot(self.W_r, parent_x).dimshuffle('x', 0) +
43 |                  T.dot(child_h, self.U_r.T)),
44 |                 child_exists, add_one=False)
45 |             h_hat = T.tanh(T.dot(self.W_h, parent_x) +
46 |                            T.dot(self.U_h, T.sum(r * child_h, axis=0)))
47 |             h = (1 - T.sum(z, axis=0)) * h_hat + T.sum(z * child_h, axis=0)
48 |             return h
49 | 
50 |         return unit
51 | 
52 |     def create_leaf_unit(self):
53 |         dummy = 0 * theano.shared(self.init_vector([self.degree, self.hidden_dim]))
54 |         def unit(leaf_x):
55 |             return self.recursive_unit(
56 |                 leaf_x,
57 |                 dummy,
58 |                 1 + dummy.sum(axis=1))
59 |         return unit
60 | 
61 | 
62 | class NaryTreeGRU(ChildSumTreeGRU):
63 |     # TODO: try a degree ** 2 version more analogous to the N-ary TreeLSTM
64 | 
65 |     def create_recursive_unit(self):
66 |         self.W_z = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim]))
67 |         self.U_z = theano.shared(self.init_matrix(
68 |             [self.degree, self.hidden_dim, self.hidden_dim]))
69 |         self.W_r = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim]))
70 |         self.U_r = theano.shared(self.init_matrix(
71 |             [self.degree, self.hidden_dim, self.hidden_dim]))
72 |         self.W_h = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim]))
73 |         self.U_h = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim]))
74 |         self.params.extend([
75 |             self.W_z, self.U_z,
76 |             self.W_r, self.U_r,
77 |             self.W_h, self.U_h])
78 | 
79 |         def unit(parent_x, child_h, child_exists):
80 |             (pre_z, pre_r), _ = theano.map(
81 |                 fn=lambda Uz, Ur, h: (T.dot(Uz, h), T.dot(Ur, h)),
82 |                 sequences=[self.U_z, self.U_r, child_h])
83 | 
84 |             z = _softmax(
85 |                 T.dot(self.W_z, parent_x).dimshuffle('x', 0) + pre_z,
86 |                 child_exists, add_one=True)
87 |             r = _softmax(
88 |                 T.dot(self.W_r, parent_x).dimshuffle('x', 0) + pre_r,
89 |                 child_exists, add_one=False)
90 |             h_hat = T.tanh(T.dot(self.W_h, parent_x) +
91 |                            T.dot(self.U_h, T.sum(r * child_h, axis=0)))
92 |             h = (1 - T.sum(z, axis=0)) * h_hat + T.sum(z * child_h, axis=0)
93 |             return h
94 | 
95 |         return unit
96 | 
--------------------------------------------------------------------------------
/sentiment.py:
--------------------------------------------------------------------------------
  1 | import data_utils
  2 | import tree_rnn
  3 | import tree_lstm
  4 | import tree_gru
  5 | 
  6 | import numpy as np
  7 | import theano
  8 | from theano import tensor as T
  9 | import random
 10 | import pickle
 11 | import os
 12 | 
 13 | DIR = '../treelstm/data/sst'
 14 | GLOVE_DIR = '../treelstm/data'  # should include .npy files of GloVe vecs and words
 15 | FINE_GRAINED = False
 16 | DEPENDENCY = False
 17 | SEED = 88
 18 | 
 19 | NUM_EPOCHS = 30
 20 | LEARNING_RATE = 0.01
 21 | 
 22 | EMB_DIM = 300
 23 | HIDDEN_DIM = 100
 24 | 
 25 | 
 26 | class SentimentModel(tree_lstm.ChildSumTreeLSTM):
 27 |     def train_step_inner(self, x, tree, y, y_exists):
 28 |         self._check_input(x, tree)
 29 |         return self._train(x, tree[:, :-1], y, y_exists)
 30 | 
 31 |     def train_step(self, root_node, label):
 32 |         x, tree, labels, labels_exist = \
 33 |             tree_rnn.gen_nn_inputs(root_node, max_degree=self.degree,
 34 |                                    only_leaves_have_vals=False,
 35 |                                    with_labels=True)
 36 |         y = np.zeros((len(labels), self.output_dim), dtype=theano.config.floatX)
 37 |         y[np.arange(len(labels)), labels.astype('int32')] = 1
 38 |         loss, pred_y = self.train_step_inner(x, tree, y, labels_exist)
 39 |         return loss, pred_y
 40 | 
 41 |     def loss_fn_multi(self, y, pred_y, y_exists):
 42 |         return T.sum(T.nnet.categorical_crossentropy(pred_y, y) * y_exists)
 43 | 
 44 | 
 45 | def get_model(num_emb, output_dim, max_degree):
 46 |     return SentimentModel(
 47 |         num_emb, EMB_DIM, HIDDEN_DIM, output_dim,
 48 |         degree=max_degree, learning_rate=LEARNING_RATE,
 49 |         trainable_embeddings=True,
 50 |         labels_on_nonroot_nodes=True,
 51 |         irregular_tree=DEPENDENCY)
 52 | 
 53 | def train():
 54 |     vocab, data = data_utils.read_sentiment_dataset(DIR, FINE_GRAINED, DEPENDENCY)
 55 | 
 56 |     train_set, dev_set, test_set = data['train'], data['dev'], data['test']
 57 |     max_degree = data['max_degree']
 58 |     print 'train', len(train_set)
 59 |     print 'dev', len(dev_set)
 60 |     print 'test', len(test_set)
 61 |     print 'max degree', max_degree
 62 | 
 63 |     num_emb = vocab.size()
 64 |     num_labels = 5 if FINE_GRAINED else 3
 65 |     for key, dataset in data.items():
 66 |         if key == 'max_degree':
 67 |             continue
 68 |         labels = [label for _, label in dataset]
 69 |         assert set(labels) <= set(xrange(num_labels)), set(labels)
 70 |     print 'num emb', num_emb
 71 |     print 'num labels', num_labels
 72 | 
 73 |     random.seed(SEED)
 74 |     np.random.seed(SEED)
 75 |     model = get_model(num_emb, num_labels, max_degree)
 76 | 
 77 |     # initialize model embeddings to GloVe
 78 |     embeddings = model.embeddings.get_value()
 79 |     glove_vecs = np.load(os.path.join(GLOVE_DIR, 'glove.npy'))
 80 |     glove_words = np.load(os.path.join(GLOVE_DIR, 'words.npy'))
 81 |     glove_word2idx = dict((word, i) for i, word in enumerate(glove_words))
 82 |     for i, word in enumerate(vocab.words):
 83 |         if word in glove_word2idx:
 84 |             embeddings[i] = glove_vecs[glove_word2idx[word]]
 85 |     glove_vecs, glove_words, glove_word2idx = [], [], []  # free the copies
 86 |     model.embeddings.set_value(embeddings)
 87 | 
 88 |     for epoch in xrange(NUM_EPOCHS):
 89 |         print 'epoch', epoch
 90 |         avg_loss = train_dataset(model, train_set)
 91 |         print 'avg loss', avg_loss
 92 |         dev_score = evaluate_dataset(model, dev_set)
 93 |         print 'dev score', dev_score
 94 | 
 95 |     print 'finished training'
 96 |     test_score = evaluate_dataset(model, test_set)
 97 |     print 'test score', test_score
 98 | 
 99 | 
100 | def train_dataset(model, data):
101 |     losses = []
102 |     avg_loss = 0.0
103 |     total_data = len(data)
104 |     for i, (tree, _) in enumerate(data):
105 |         loss, pred_y = model.train_step(tree, None)  # labels will be determined by model
106 |         losses.append(loss)
107 |         avg_loss = avg_loss * (len(losses) - 1) / len(losses) + loss / len(losses)
108 |         print 'avg loss %.2f at example %d of %d\r' % (avg_loss, i, total_data),
109 |     return np.mean(losses)
110 | 
111 | 
112 | def 
evaluate_dataset(model, data): 113 | num_correct = 0 114 | for tree, label in data: 115 | pred_y = model.predict(tree)[-1] # root pred is final row 116 | num_correct += (label == np.argmax(pred_y)) 117 | 118 | return float(num_correct) / len(data) 119 | 120 | 121 | if __name__ == '__main__': 122 | train() 123 | -------------------------------------------------------------------------------- /test_tree_rnn.py: -------------------------------------------------------------------------------- 1 | import tree_rnn 2 | 3 | import theano 4 | from theano import tensor as T 5 | import numpy as np 6 | from numpy.testing import assert_array_almost_equal 7 | 8 | 9 | class DummyTreeRNN(tree_rnn.TreeRNN): 10 | 11 | def create_recursive_unit(self): 12 | def unit(parent_x, child_h, child_exists): # assumes emb_dim == hidden_dim 13 | return parent_x + T.prod((child_h - 1) * child_exists.dimshuffle(0, 'x') + 1, 14 | axis=0) 15 | return unit 16 | 17 | def create_leaf_unit(self): 18 | def unit(leaf_x): # assumes emb_dim == hidden_dim 19 | return leaf_x 20 | return unit 21 | 22 | 23 | class DummyBinaryRNN(tree_rnn.TreeRNN): 24 | 25 | def create_recursive_unit(self): 26 | def unit(parent_x, child_h, child_exists): # assumes emb_dim == hidden_dim 27 | return (parent_x + child_exists[0] * child_h[0] + 28 | child_exists[1] * child_h[1] ** 2) 29 | return unit 30 | 31 | def create_leaf_unit(self): 32 | def unit(leaf_x): # assumes emb_dim == hidden_dim 33 | return leaf_x 34 | return unit 35 | 36 | 37 | def test_tree_rnn(): 38 | model = DummyTreeRNN(8, 2, 2, 1, degree=2) 39 | emb = model.embeddings.get_value() 40 | 41 | root = tree_rnn.Node(3) 42 | c1 = tree_rnn.Node(1) 43 | c2 = tree_rnn.Node(2) 44 | root.add_children([c1, c2]) 45 | 46 | root_emb = model.evaluate(root) 47 | expected = emb[3] + emb[1] * emb[2] 48 | assert_array_almost_equal(expected, root_emb) 49 | 50 | cc1 = tree_rnn.Node(5) 51 | cc2 = tree_rnn.Node(2) 52 | c2.add_children([cc1, cc2]) 53 | 54 | root_emb = model.evaluate(root) 55 | expected = emb[3] + (emb[2] + emb[5] * emb[2]) * emb[1] 56 | assert_array_almost_equal(expected, root_emb) 57 | 58 | ccc1 = tree_rnn.Node(5) 59 | ccc2 = tree_rnn.Node(4) 60 | cc1.add_children([ccc1, ccc2]) 61 | 62 | root_emb = model.evaluate(root) 63 | expected = emb[3] + (emb[2] + (emb[5] + emb[5] * emb[4]) * emb[2]) * emb[1] 64 | assert_array_almost_equal(expected, root_emb) 65 | 66 | # check step works without error 67 | model.train_step(root, np.array([0]).astype(theano.config.floatX)) 68 | 69 | # degree > 2 70 | model = DummyTreeRNN(10, 2, 2, 1, degree=3) 71 | emb = model.embeddings.get_value() 72 | 73 | root = tree_rnn.Node(0) 74 | c1 = tree_rnn.Node(1) 75 | c2 = tree_rnn.Node(2) 76 | c3 = tree_rnn.Node(3) 77 | root.add_children([c1, c2, c3]) 78 | 79 | cc1 = tree_rnn.Node(1) 80 | cc2 = tree_rnn.Node(2) 81 | cc3 = tree_rnn.Node(3) 82 | cc4 = tree_rnn.Node(4) 83 | cc5 = tree_rnn.Node(5) 84 | cc6 = tree_rnn.Node(6) 85 | cc7 = tree_rnn.Node(7) 86 | cc8 = tree_rnn.Node(8) 87 | cc9 = tree_rnn.Node(9) 88 | 89 | c1.add_children([cc1, cc2, cc3]) 90 | c2.add_children([cc4, cc5, cc6]) 91 | c3.add_children([cc7, cc8, cc9]) 92 | 93 | root_emb = model.evaluate(root) 94 | expected = \ 95 | emb[0] + ((emb[1] + emb[1] * emb[2] * emb[3]) * 96 | (emb[2] + emb[4] * emb[5] * emb[6]) * 97 | (emb[3] + emb[7] * emb[8] * emb[9])) 98 | assert_array_almost_equal(expected, root_emb) 99 | 100 | # check step works without error 101 | model.train_step(root, np.array([0]).astype(theano.config.floatX)) 102 | 103 | 104 | def test_tree_rnn_var_degree(): 
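    # Uses BinaryNode trees with some children missing, so this also
    # exercises the child_exists masking in the recursive unit.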
105 | model = DummyBinaryRNN(10, 2, 2, 1, degree=2) 106 | emb = model.embeddings.get_value() 107 | 108 | root = tree_rnn.BinaryNode(0) 109 | c1 = tree_rnn.BinaryNode(1) 110 | cc1 = tree_rnn.BinaryNode(2) 111 | ccc1 = tree_rnn.BinaryNode(3) 112 | cc1.add_left(ccc1) 113 | c1.add_right(cc1) 114 | root.add_left(c1) 115 | 116 | root_emb = model.evaluate(root) 117 | expected = emb[0] + (emb[1] + (emb[2] + emb[3]) ** 2) 118 | assert_array_almost_equal(expected, root_emb) 119 | 120 | cccc1 = tree_rnn.BinaryNode(5) 121 | cccc2 = tree_rnn.BinaryNode(6) 122 | ccc1.add_left(cccc1) 123 | ccc1.add_right(cccc2) 124 | 125 | root_emb = model.evaluate(root) 126 | expected = emb[0] + (emb[1] + (emb[2] + (emb[3] + emb[5] + emb[6] ** 2)) ** 2) 127 | assert_array_almost_equal(expected, root_emb) 128 | 129 | # check step works without error 130 | model.train_step(root, np.array([0]).astype(theano.config.floatX)) 131 | 132 | 133 | def test_irregular_tree(): 134 | model = DummyTreeRNN(8, 2, 2, 1, degree=4, irregular_tree=True) 135 | emb = model.embeddings.get_value() 136 | 137 | root = tree_rnn.Node(3) 138 | c1 = tree_rnn.Node(1) 139 | c2 = tree_rnn.Node(2) 140 | c3 = tree_rnn.Node(3) 141 | c4 = tree_rnn.Node(4) 142 | c5 = tree_rnn.Node(5) 143 | c6 = tree_rnn.Node(6) 144 | root.add_children([c1, c2, c3, c4]) 145 | c1.add_children([c5]) 146 | c5.add_children([c6]) 147 | 148 | root_emb = model.evaluate(root) 149 | expected = emb[3] + emb[2] * emb[3] * emb[4] * (emb[1] + emb[5] + emb[6]) 150 | assert_array_almost_equal(expected, root_emb) 151 | -------------------------------------------------------------------------------- /tree_lstm.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Implementation of Tree LSTMs described in http://arxiv.org/abs/1503.00075""" 2 | 3 | import tree_rnn 4 | 5 | import theano 6 | from theano import tensor as T 7 | 8 | 9 | class ChildSumTreeLSTM(tree_rnn.TreeRNN): 10 | def create_recursive_unit(self): 11 | self.W_i = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 12 | self.U_i = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim])) 13 | self.b_i = theano.shared(self.init_vector([self.hidden_dim])) 14 | self.W_f = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 15 | self.U_f = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim])) 16 | self.b_f = theano.shared(self.init_vector([self.hidden_dim])) 17 | self.W_o = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 18 | self.U_o = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim])) 19 | self.b_o = theano.shared(self.init_vector([self.hidden_dim])) 20 | self.W_u = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 21 | self.U_u = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim])) 22 | self.b_u = theano.shared(self.init_vector([self.hidden_dim])) 23 | self.params.extend([ 24 | self.W_i, self.U_i, self.b_i, 25 | self.W_f, self.U_f, self.b_f, 26 | self.W_o, self.U_o, self.b_o, 27 | self.W_u, self.U_u, self.b_u]) 28 | 29 | def unit(parent_x, child_h, child_c, child_exists): 30 | h_tilde = T.sum(child_h, axis=0) 31 | i = T.nnet.sigmoid(T.dot(self.W_i, parent_x) + T.dot(self.U_i, h_tilde) + self.b_i) 32 | o = T.nnet.sigmoid(T.dot(self.W_o, parent_x) + T.dot(self.U_o, h_tilde) + self.b_o) 33 | u = T.tanh(T.dot(self.W_u, parent_x) + T.dot(self.U_u, h_tilde) + self.b_u) 34 | 35 | f = (T.nnet.sigmoid( 36 | T.dot(self.W_f, parent_x).dimshuffle('x', 0) + 37 | T.dot(child_h, 
self.U_f.T) + 38 | self.b_f.dimshuffle('x', 0)) * 39 | child_exists.dimshuffle(0, 'x')) 40 | 41 | c = i * u + T.sum(f * child_c, axis=0) 42 | h = o * T.tanh(c) 43 | return h, c 44 | 45 | return unit 46 | 47 | def create_leaf_unit(self): 48 | dummy = 0 * theano.shared(self.init_vector([self.degree, self.hidden_dim])) 49 | def unit(leaf_x): 50 | return self.recursive_unit( 51 | leaf_x, 52 | dummy, 53 | dummy, 54 | dummy.sum(axis=1)) 55 | return unit 56 | 57 | def compute_tree(self, emb_x, tree): 58 | self.recursive_unit = self.create_recursive_unit() 59 | self.leaf_unit = self.create_leaf_unit() 60 | num_nodes = tree.shape[0] # num internal nodes 61 | num_leaves = self.num_words - num_nodes 62 | 63 | # compute leaf hidden states 64 | (leaf_h, leaf_c), _ = theano.map( 65 | fn=self.leaf_unit, 66 | sequences=[emb_x[:num_leaves]]) 67 | if self.irregular_tree: 68 | init_node_h = T.concatenate([leaf_h, leaf_h], axis=0) 69 | init_node_c = T.concatenate([leaf_c, leaf_c], axis=0) 70 | else: 71 | init_node_h = leaf_h 72 | init_node_c = leaf_c 73 | 74 | # use recurrence to compute internal node hidden states 75 | def _recurrence(cur_emb, node_info, t, node_h, node_c, last_h): 76 | child_exists = node_info > -1 77 | offset = num_leaves * int(self.irregular_tree) - child_exists * t 78 | child_h = node_h[node_info + offset] * child_exists.dimshuffle(0, 'x') 79 | child_c = node_c[node_info + offset] * child_exists.dimshuffle(0, 'x') 80 | parent_h, parent_c = self.recursive_unit(cur_emb, child_h, child_c, child_exists) 81 | node_h = T.concatenate([node_h, 82 | parent_h.reshape([1, self.hidden_dim])]) 83 | node_c = T.concatenate([node_c, 84 | parent_c.reshape([1, self.hidden_dim])]) 85 | return node_h[1:], node_c[1:], parent_h 86 | 87 | dummy = theano.shared(self.init_vector([self.hidden_dim])) 88 | (_, _, parent_h), _ = theano.scan( 89 | fn=_recurrence, 90 | outputs_info=[init_node_h, init_node_c, dummy], 91 | sequences=[emb_x[num_leaves:], tree, T.arange(num_nodes)], 92 | n_steps=num_nodes) 93 | 94 | return T.concatenate([leaf_h, parent_h], axis=0) 95 | 96 | 97 | class NaryTreeLSTM(ChildSumTreeLSTM): 98 | # we inherit from ChildSumTreeLSTM to re-use the compute_tree method 99 | 100 | def create_recursive_unit(self): 101 | self.W_i = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 102 | self.U_i = theano.shared(self.init_matrix( 103 | [self.degree, self.hidden_dim, self.hidden_dim])) 104 | self.b_i = theano.shared(self.init_vector([self.hidden_dim])) 105 | self.W_f = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 106 | self.U_f = theano.shared(self.init_matrix( 107 | [self.degree, self.degree, self.hidden_dim, self.hidden_dim])) 108 | self.b_f = theano.shared(self.init_vector([self.hidden_dim])) 109 | self.W_o = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 110 | self.U_o = theano.shared(self.init_matrix( 111 | [self.degree, self.hidden_dim, self.hidden_dim])) 112 | self.b_o = theano.shared(self.init_vector([self.hidden_dim])) 113 | self.W_u = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 114 | self.U_u = theano.shared(self.init_matrix( 115 | [self.degree, self.hidden_dim, self.hidden_dim])) 116 | self.b_u = theano.shared(self.init_vector([self.hidden_dim])) 117 | self.params.extend([ 118 | self.W_i, self.U_i, self.b_i, 119 | self.W_f, self.U_f, self.b_f, 120 | self.W_o, self.U_o, self.b_o, 121 | self.W_u, self.U_u, self.b_u]) 122 | 123 | def unit(parent_x, child_h, child_c, child_exists): 124 | (h_i, h_o, h_u), _ = theano.map( 125 | 
fn=lambda Ui, Uo, Uu, h, exists:
126 |                 (exists * T.dot(Ui, h), exists * T.dot(Uo, h), exists * T.dot(Uu, h)),
127 |                 sequences=[self.U_i, self.U_o, self.U_u, child_h, child_exists])
128 | 
129 |             i = T.nnet.sigmoid(T.dot(self.W_i, parent_x) + h_i.sum(axis=0) + self.b_i)
130 |             o = T.nnet.sigmoid(T.dot(self.W_o, parent_x) + h_o.sum(axis=0) + self.b_o)
131 |             u = T.tanh(T.dot(self.W_u, parent_x) + h_u.sum(axis=0) + self.b_u)
132 | 
133 |             def _sub_f(U):
134 |                 sub_h_f, _ = theano.map(
135 |                     fn=lambda sub_U, h, exists: exists * T.dot(sub_U, h),
136 |                     sequences=[U, child_h, child_exists])
137 |                 return sub_h_f.sum(axis=0)
138 | 
139 |             h_f, _ = theano.map(
140 |                 fn=lambda U: _sub_f(U),
141 |                 sequences=[self.U_f])
142 |             f = (T.nnet.sigmoid(
143 |                 T.dot(self.W_f, parent_x).dimshuffle('x', 0) + h_f +
144 |                 self.b_f.dimshuffle('x', 0)) *
145 |                 child_exists.dimshuffle(0, 'x'))
146 | 
147 |             c = i * u + T.sum(f * child_c, axis=0)
148 |             h = o * T.tanh(c)
149 |             return h, c
150 | 
151 |         return unit
152 | 
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
  1 | __doc__ = """Utilities for loading language datasets.
  2 | 
  3 | Basically porting http://github.com/stanfordnlp/treelstm/tree/master/util to Python.
  4 | 
  5 | """
  6 | 
  7 | import tree_rnn
  8 | 
  9 | import numpy as np
 10 | import os
 11 | 
 12 | 
 13 | def read_sentiment_dataset(data_dir, fine_grained=False, dependency=False):
 14 |     vocab = Vocab()
 15 |     vocab.load(os.path.join(data_dir, 'vocab-cased.txt'))
 16 | 
 17 |     train_dir = os.path.join(data_dir, 'train')
 18 |     dev_dir = os.path.join(data_dir, 'dev')
 19 |     test_dir = os.path.join(data_dir, 'test')
 20 | 
 21 |     data = {}
 22 |     overall_max_degree = 0
 23 |     for name, sub_dir in zip(['train', 'dev', 'test'], [train_dir, dev_dir, test_dir]):
 24 |         if dependency:
 25 |             max_degree, trees = read_trees(
 26 |                 os.path.join(sub_dir, 'dparents.txt'),
 27 |                 os.path.join(sub_dir, 'dlabels.txt'))
 28 |         else:
 29 |             max_degree, trees = read_trees(
 30 |                 os.path.join(sub_dir, 'parents.txt'),
 31 |                 os.path.join(sub_dir, 'labels.txt'))
 32 |         sentences = read_sentences(
 33 |             os.path.join(sub_dir, 'sents.txt'),
 34 |             vocab)
 35 | 
 36 |         this_dataset = zip(trees, sentences)
 37 |         if not fine_grained:  # remove all 'neutral' data
 38 |             this_dataset = [(tree, sentence) for tree, sentence in this_dataset
 39 |                             if tree.label != 0]
 40 | 
 41 |         for tree, sentence in this_dataset:
 42 |             _remap_tokens_and_labels(tree, sentence, fine_grained)
 43 | 
 44 |         data[name] = [(tree, tree.label) for tree, _ in this_dataset]
 45 |         overall_max_degree = max(overall_max_degree, max_degree)
 46 | 
 47 |     data['max_degree'] = overall_max_degree
 48 |     assert overall_max_degree == 2 or dependency
 49 |     return vocab, data
 50 | 
 51 | 
 52 | class Vocab(object):
 53 | 
 54 |     def __init__(self):
 55 |         self.words = []
 56 |         self.word2idx = {}
 57 |         self.unk_index = None
 58 |         self.start_index = None
 59 |         self.end_index = None
 60 |         self.unk_token = None
 61 |         self.start_token = None
 62 |         self.end_token = None
 63 | 
 64 |     def load(self, path):
 65 |         with open(path, 'r') as in_file:
 66 |             for line in in_file:
 67 |                 word = line.strip()
 68 |                 assert word not in self.word2idx
 69 |                 self.word2idx[word] = len(self.words)
 70 |                 self.words.append(word)
 71 | 
 72 |         for unk in ['<unk>', '<UNK>', 'UUUNKKK']:
 73 |             self.unk_index = self.word2idx.get(unk, self.unk_index)
 74 |             if self.unk_index is not None:
 75 |                 self.unk_token = unk
 76 |                 break
 77 | 
 78 |         for start in ['<s>', '<S>']:
 79 |             self.start_index = self.word2idx.get(start, self.start_index)
 80 |             if self.start_index is not None:
 81 |                 self.start_token = start
 82 |                 break
 83 | 
 84 |         for end in ['</s>', '</S>']:
 85 |             self.end_index = self.word2idx.get(end, self.end_index)
 86 |             if self.end_index is not None:
 87 |                 self.end_token = end
 88 |                 break
 89 | 
 90 |     def index(self, word):
 91 |         if self.unk_index is None:
 92 |             assert word in self.word2idx
 93 |         return self.word2idx.get(word, self.unk_index)
 94 | 
 95 |     def size(self):
 96 |         return len(self.words)
 97 | 
 98 | 
 99 | def read_trees(parents_file, labels_file):
100 |     trees = []
101 |     max_degree = 0
102 |     with open(parents_file, 'r') as parents_f:
103 |         with open(labels_file, 'r') as labels_f:
104 |             while True:
105 |                 cur_parents = parents_f.readline()
106 |                 cur_labels = labels_f.readline()
107 |                 if not cur_parents or not cur_labels:
108 |                     break
109 |                 cur_parents = [int(p) for p in cur_parents.strip().split()]
110 |                 cur_labels = [int(l) if l != '#' else None for l in cur_labels.strip().split()]
111 |                 cur_max_degree, cur_tree = read_tree(cur_parents, cur_labels)
112 |                 max_degree = max(max_degree, cur_max_degree)
113 |                 trees.append(cur_tree)
114 |     return max_degree, trees
115 | 
116 | 
117 | def read_tree(parents, labels):
118 |     nodes = {}
119 |     parents = [p - 1 for p in parents]  # input is 1-indexed; shift to 0-indexed
120 |     for i in xrange(len(parents)):
121 |         if i not in nodes:
122 |             idx = i
123 |             prev = None
124 |             while True:
125 |                 node = tree_rnn.Node(val=idx)  # for now, val is just idx
126 |                 if prev is not None:
127 |                     assert prev.val != node.val
128 |                     node.add_child(prev)
129 | 
130 |                 node.label = labels[idx]
131 |                 nodes[idx] = node
132 | 
133 |                 parent = parents[idx]
134 |                 if parent in nodes:
135 |                     nodes[parent].add_child(node)
136 |                     break
137 |                 elif parent == -1:
138 |                     root = node
139 |                     break
140 | 
141 |                 prev = node
142 |                 idx = parent
143 | 
144 |     # ensure tree is connected
145 |     num_roots = sum(node.parent is None for node in nodes.itervalues())
146 |     assert num_roots == 1, num_roots
147 | 
148 |     # overwrite vals to match sentence indices -
149 |     # only leaves correspond to sentence tokens
150 |     leaf_idx = 0
151 |     for node in nodes.itervalues():
152 |         if node.children:
153 |             node.val = None
154 |         else:
155 |             node.val = leaf_idx
156 |             leaf_idx += 1
157 | 
158 |     max_degree = max(len(node.children) for node in nodes.itervalues())
159 | 
160 |     return max_degree, root
161 | 
162 | 
163 | def read_sentences(path, vocab):
164 |     sentences = []
165 |     with open(path, 'r') as in_file:
166 |         for line in in_file:
167 |             tokens = line.strip().split()
168 |             sentences.append([vocab.index(tok) for tok in tokens])
169 |     return sentences
170 | 
171 | 
172 | def _remap_tokens_and_labels(tree, sentence, fine_grained):
173 |     # map leaf idx to word idx
174 |     if tree.val is not None:
175 |         tree.val = sentence[tree.val]
176 | 
177 |     # map label to suitable range
178 |     if tree.label is not None:
179 |         if fine_grained:
180 |             tree.label += 2
181 |         else:
182 |             if tree.label < 0:
183 |                 tree.label = 0
184 |             elif tree.label == 0:
185 |                 tree.label = 1
186 |             else:
187 |                 tree.label = 2
188 | 
189 |     [_remap_tokens_and_labels(child, sentence, fine_grained)
190 |      for child in tree.children
191 |      if child is not None]
192 | 
193 | 
194 | def read_embeddings_into_numpy(file_name, vocab=None):
195 |     """Reads GloVe vector files and returns numpy arrays.
196 | 
197 |     If vocab is given, only intersection of vocab and words is used.
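    The two returned arrays are aligned: the i-th row of the embedding
    array is the vector for the i-th word, which is what the README's
    preprocessing step relies on when saving words.npy and glove.npy.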
198 | 199 | """ 200 | words = [] 201 | array = [] 202 | with open(file_name, 'r') as in_file: 203 | for line in in_file: 204 | fields = line.strip().split() 205 | word = fields[0] 206 | if vocab and word not in vocab.word2idx: 207 | continue 208 | embedding = np.array([float(f) for f in fields[1:]]) 209 | words.append(word) 210 | array.append(embedding) 211 | 212 | return np.array(words), np.array(array) 213 | -------------------------------------------------------------------------------- /tree_rnn.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Tree RNNs aka Recursive Neural Networks.""" 2 | 3 | import numpy as np 4 | import theano 5 | from theano import tensor as T 6 | from theano.compat.python2x import OrderedDict 7 | 8 | 9 | theano.config.floatX = 'float32' 10 | 11 | 12 | class Node(object): 13 | def __init__(self, val=None): 14 | self.children = [] 15 | self.val = val 16 | self.idx = None 17 | self.height = 1 18 | self.size = 1 19 | self.num_leaves = 1 20 | self.parent = None 21 | self.label = None 22 | 23 | def _update(self): 24 | self.height = 1 + max([child.height for child in self.children if child] or [0]) 25 | self.size = 1 + sum(child.size for child in self.children if child) 26 | self.num_leaves = (all(child is None for child in self.children) + 27 | sum(child.num_leaves for child in self.children if child)) 28 | if self.parent is not None: 29 | self.parent._update() 30 | 31 | def add_child(self, child): 32 | self.children.append(child) 33 | child.parent = self 34 | self._update() 35 | 36 | def add_children(self, other_children): 37 | self.children.extend(other_children) 38 | for child in other_children: 39 | child.parent = self 40 | self._update() 41 | 42 | 43 | class BinaryNode(Node): 44 | def __init__(self, val=None): 45 | super(BinaryNode, self).__init__(val=val) 46 | 47 | def add_left(self, node): 48 | if not self.children: 49 | self.children = [None, None] 50 | self.children[0] = node 51 | node.parent = self 52 | self._update() 53 | 54 | def add_right(self, node): 55 | if not self.children: 56 | self.children = [None, None] 57 | self.children[1] = node 58 | node.parent = self 59 | self._update() 60 | 61 | def get_left(self): 62 | if not self.children: 63 | return None 64 | return self.children[0] 65 | 66 | def get_right(self): 67 | if not self.children: 68 | return None 69 | return self.children[1] 70 | 71 | 72 | def gen_nn_inputs(root_node, max_degree=None, only_leaves_have_vals=True, 73 | with_labels=False): 74 | """Given a root node, returns the appropriate inputs to NN. 75 | 76 | The NN takes in 77 | x: the values at the leaves (e.g. word indices) 78 | tree: a (n x degree) matrix that provides the computation order. 79 | Namely, a row tree[i] = [a, b, c] in tree signifies that a 80 | and b are children of c, and that the computation 81 | f(a, b) -> c should happen on step i. 
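    For example, a root with two leaf children yields (with
    only_leaves_have_vals=False) x = [leaf0 val, leaf1 val, root val]
    and tree = [[0, 1, 2]]: the root, at index 2, is computed from its
    children 0 and 1 on step 0. Missing children are padded with -1.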
82 | 83 | """ 84 | _clear_indices(root_node) 85 | x, leaf_labels = _get_leaf_vals(root_node) 86 | tree, internal_x, internal_labels = \ 87 | _get_tree_traversal(root_node, len(x), max_degree) 88 | assert all(v is not None for v in x) 89 | if not only_leaves_have_vals: 90 | assert all(v is not None for v in internal_x) 91 | x.extend(internal_x) 92 | if max_degree is not None: 93 | assert all(len(t) == max_degree + 1 for t in tree) 94 | if with_labels: 95 | labels = leaf_labels + internal_labels 96 | labels_exist = [l is not None for l in labels] 97 | labels = [l or 0 for l in labels] 98 | return (np.array(x, dtype='int32'), 99 | np.array(tree, dtype='int32'), 100 | np.array(labels, dtype=theano.config.floatX), 101 | np.array(labels_exist, dtype=theano.config.floatX)) 102 | return (np.array(x, dtype='int32'), 103 | np.array(tree, dtype='int32')) 104 | 105 | 106 | def _clear_indices(root_node): 107 | root_node.idx = None 108 | [_clear_indices(child) for child in root_node.children if child] 109 | 110 | 111 | def _get_leaf_vals(root_node): 112 | """Get leaf values in deep-to-shallow, left-to-right order.""" 113 | all_leaves = [] 114 | layer = [root_node] 115 | while layer: 116 | next_layer = [] 117 | for node in layer: 118 | if all(child is None for child in node.children): 119 | all_leaves.append(node) 120 | else: 121 | next_layer.extend([child for child in node.children[::-1] if child]) 122 | layer = next_layer 123 | 124 | vals = [] 125 | labels = [] 126 | for idx, leaf in enumerate(reversed(all_leaves)): 127 | leaf.idx = idx 128 | vals.append(leaf.val) 129 | labels.append(leaf.label) 130 | return vals, labels 131 | 132 | 133 | def _get_tree_traversal(root_node, start_idx=0, max_degree=None): 134 | """Get computation order of leaves -> root.""" 135 | if not root_node.children: 136 | return [], [], [] 137 | layers = [] 138 | layer = [root_node] 139 | while layer: 140 | layers.append(layer[:]) 141 | next_layer = [] 142 | [next_layer.extend([child for child in node.children if child]) 143 | for node in layer] 144 | layer = next_layer 145 | 146 | tree = [] 147 | internal_vals = [] 148 | labels = [] 149 | idx = start_idx 150 | for layer in reversed(layers): 151 | for node in layer: 152 | if node.idx is not None: 153 | # must be leaf 154 | assert all(child is None for child in node.children) 155 | continue 156 | 157 | child_idxs = [(child.idx if child else -1) 158 | for child in node.children] 159 | if max_degree is not None: 160 | child_idxs.extend([-1] * (max_degree - len(child_idxs))) 161 | assert not any(idx is None for idx in child_idxs) 162 | 163 | node.idx = idx 164 | tree.append(child_idxs + [node.idx]) 165 | internal_vals.append(node.val if node.val is not None else -1) 166 | labels.append(node.label) 167 | idx += 1 168 | 169 | return tree, internal_vals, labels 170 | 171 | 172 | class TreeRNN(object): 173 | """Data is represented in a tree structure. 174 | 175 | Every leaf and internal node has a data (provided by the input) 176 | and a memory or hidden state. The hidden state is computed based 177 | on its own data and the hidden states of its children. The 178 | hidden state of leaves is given by a custom init function. 179 | 180 | The entire tree's embedding is represented by the final 181 | state computed at the root. 
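    Typical use: construct the model (or a subclass), then call
    train_step(root_node, y) to take one gradient step, evaluate(root_node)
    to get the root's hidden state, or predict(root_node) to get the
    output distribution, where root_node is a Node or BinaryNode tree.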
182 | 183 | """ 184 | 185 | def __init__(self, num_emb, emb_dim, hidden_dim, output_dim, 186 | degree=2, learning_rate=0.01, momentum=0.9, 187 | trainable_embeddings=True, 188 | labels_on_nonroot_nodes=False, 189 | irregular_tree=False): 190 | assert emb_dim > 1 and hidden_dim > 1 191 | self.num_emb = num_emb 192 | self.emb_dim = emb_dim 193 | self.hidden_dim = hidden_dim 194 | self.output_dim = output_dim 195 | self.degree = degree 196 | self.learning_rate = learning_rate 197 | self.momentum = momentum 198 | self.irregular_tree = irregular_tree 199 | 200 | self.params = [] 201 | self.embeddings = theano.shared(self.init_matrix([self.num_emb, self.emb_dim])) 202 | if trainable_embeddings: 203 | self.params.append(self.embeddings) 204 | 205 | self.x = T.ivector(name='x') # word indices 206 | self.tree = T.imatrix(name='tree') # shape [None, self.degree] 207 | if labels_on_nonroot_nodes: 208 | self.y = T.fmatrix(name='y') # output shape [None, self.output_dim] 209 | self.y_exists = T.fvector(name='y_exists') # shape [None] 210 | else: 211 | self.y = T.fvector(name='y') # output shape [self.output_dim] 212 | 213 | self.num_words = self.x.shape[0] # total number of nodes (leaves + internal) in tree 214 | emb_x = self.embeddings[self.x] 215 | emb_x = emb_x * T.neq(self.x, -1).dimshuffle(0, 'x') # zero-out non-existent embeddings 216 | 217 | self.tree_states = self.compute_tree(emb_x, self.tree) 218 | self.final_state = self.tree_states[-1] 219 | if labels_on_nonroot_nodes: 220 | self.output_fn = self.create_output_fn_multi() 221 | self.pred_y = self.output_fn(self.tree_states) 222 | self.loss = self.loss_fn_multi(self.y, self.pred_y, self.y_exists) 223 | else: 224 | self.output_fn = self.create_output_fn() 225 | self.pred_y = self.output_fn(self.final_state) 226 | self.loss = self.loss_fn(self.y, self.pred_y) 227 | 228 | updates = self.gradient_descent(self.loss) 229 | 230 | train_inputs = [self.x, self.tree, self.y] 231 | if labels_on_nonroot_nodes: 232 | train_inputs.append(self.y_exists) 233 | self._train = theano.function(train_inputs, 234 | [self.loss, self.pred_y], 235 | updates=updates) 236 | 237 | self._evaluate = theano.function([self.x, self.tree], 238 | self.final_state) 239 | 240 | self._predict = theano.function([self.x, self.tree], 241 | self.pred_y) 242 | 243 | def _check_input(self, x, tree): 244 | assert np.array_equal(tree[:, -1], np.arange(len(x) - len(tree), len(x))) 245 | if not self.irregular_tree: 246 | assert np.all((tree[:, 0] + 1 >= np.arange(len(tree))) | 247 | (tree[:, 0] == -1)) 248 | assert np.all((tree[:, 1] + 1 >= np.arange(len(tree))) | 249 | (tree[:, 1] == -1)) 250 | 251 | def train_step_inner(self, x, tree, y): 252 | self._check_input(x, tree) 253 | return self._train(x, tree[:, :-1], y) 254 | 255 | def train_step(self, root_node, y): 256 | x, tree = gen_nn_inputs(root_node, max_degree=self.degree, only_leaves_have_vals=False) 257 | return self.train_step_inner(x, tree, y) 258 | 259 | def evaluate(self, root_node): 260 | x, tree = gen_nn_inputs(root_node, max_degree=self.degree, only_leaves_have_vals=False) 261 | self._check_input(x, tree) 262 | return self._evaluate(x, tree[:, :-1]) 263 | 264 | def predict(self, root_node): 265 | x, tree = gen_nn_inputs(root_node, max_degree=self.degree, only_leaves_have_vals=False) 266 | self._check_input(x, tree) 267 | return self._predict(x, tree[:, :-1]) 268 | 269 | def init_matrix(self, shape): 270 | return np.random.normal(scale=0.1, size=shape).astype(theano.config.floatX) 271 | 272 | def init_vector(self, shape): 273 
| return np.zeros(shape, dtype=theano.config.floatX) 274 | 275 | def create_output_fn(self): 276 | self.W_out = theano.shared(self.init_matrix([self.output_dim, self.hidden_dim])) 277 | self.b_out = theano.shared(self.init_vector([self.output_dim])) 278 | self.params.extend([self.W_out, self.b_out]) 279 | 280 | def fn(final_state): 281 | return T.nnet.softmax( 282 | T.dot(self.W_out, final_state) + self.b_out) 283 | return fn 284 | 285 | def create_output_fn_multi(self): 286 | self.W_out = theano.shared(self.init_matrix([self.output_dim, self.hidden_dim])) 287 | self.b_out = theano.shared(self.init_vector([self.output_dim])) 288 | self.params.extend([self.W_out, self.b_out]) 289 | 290 | def fn(tree_states): 291 | return T.nnet.softmax( 292 | T.dot(tree_states, self.W_out.T) + 293 | self.b_out.dimshuffle('x', 0)) 294 | return fn 295 | 296 | def create_recursive_unit(self): 297 | self.W_hx = theano.shared(self.init_matrix([self.hidden_dim, self.emb_dim])) 298 | self.W_hh = theano.shared(self.init_matrix([self.hidden_dim, self.hidden_dim])) 299 | self.b_h = theano.shared(self.init_vector([self.hidden_dim])) 300 | self.params.extend([self.W_hx, self.W_hh, self.b_h]) 301 | def unit(parent_x, child_h, child_exists): # very simple 302 | h_tilde = T.sum(child_h, axis=0) 303 | h = T.tanh(self.b_h + T.dot(self.W_hx, parent_x) + T.dot(self.W_hh, h_tilde)) 304 | return h 305 | return unit 306 | 307 | def create_leaf_unit(self): 308 | dummy = 0 * theano.shared(self.init_matrix([self.degree, self.hidden_dim])) 309 | def unit(leaf_x): 310 | return self.recursive_unit(leaf_x, dummy, dummy.sum(axis=1)) 311 | return unit 312 | 313 | def compute_tree(self, emb_x, tree): 314 | self.recursive_unit = self.create_recursive_unit() 315 | self.leaf_unit = self.create_leaf_unit() 316 | num_nodes = tree.shape[0] # num internal nodes 317 | num_leaves = self.num_words - num_nodes 318 | 319 | # compute leaf hidden states 320 | leaf_h, _ = theano.map( 321 | fn=self.leaf_unit, 322 | sequences=[emb_x[:num_leaves]]) 323 | if self.irregular_tree: 324 | init_node_h = T.concatenate([leaf_h, leaf_h], axis=0) 325 | else: 326 | init_node_h = leaf_h 327 | 328 | # use recurrence to compute internal node hidden states 329 | def _recurrence(cur_emb, node_info, t, node_h, last_h): 330 | child_exists = node_info > -1 331 | offset = num_leaves * int(self.irregular_tree) - child_exists * t 332 | child_h = node_h[node_info + offset] * child_exists.dimshuffle(0, 'x') 333 | parent_h = self.recursive_unit(cur_emb, child_h, child_exists) 334 | node_h = T.concatenate([node_h, 335 | parent_h.reshape([1, self.hidden_dim])]) 336 | return node_h[1:], parent_h 337 | 338 | dummy = theano.shared(self.init_vector([self.hidden_dim])) 339 | (_, parent_h), _ = theano.scan( 340 | fn=_recurrence, 341 | outputs_info=[init_node_h, dummy], 342 | sequences=[emb_x[num_leaves:], tree, T.arange(num_nodes)], 343 | n_steps=num_nodes) 344 | 345 | return T.concatenate([leaf_h, parent_h], axis=0) 346 | 347 | def loss_fn(self, y, pred_y): 348 | return T.sum(T.sqr(y - pred_y)) 349 | 350 | def loss_fn_multi(self, y, pred_y, y_exists): 351 | return T.sum(T.sum(T.sqr(y - pred_y), axis=1) * y_exists, axis=0) 352 | 353 | def gradient_descent(self, loss): 354 | """Momentum GD with gradient clipping.""" 355 | grad = T.grad(loss, self.params) 356 | self.momentum_velocity_ = [0.] 
* len(grad)  # placeholder floats; replaced just below with shared variables
357 |         # NOTE: this method runs once, at graph-construction time, so the
358 |         # velocities must live in theano shared variables to persist across
359 |         # training steps; plain Python floats would leave momentum at zero.
360 |         self.momentum_velocity_ = [
361 |             theano.shared(np.zeros_like(param.get_value()))
362 |             for param in self.params]
363 |         grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grad)))
364 |         updates = OrderedDict()
365 |         not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
366 |         scaling_den = T.maximum(5.0, grad_norm)
367 |         for n, (param, grad) in enumerate(zip(self.params, grad)):
368 |             # clip the global gradient norm at 5.0; if the norm is not
369 |             # finite, fall back to a small weight-decay step instead
370 |             grad = T.switch(not_finite, 0.1 * param,
371 |                             grad * (5.0 / scaling_den))
372 |             velocity = self.momentum_velocity_[n]
373 |             update_step = self.momentum * velocity - self.learning_rate * grad
374 |             updates[velocity] = update_step
375 |             updates[param] = param + update_step
376 |         return updates
377 | 
--------------------------------------------------------------------------------