├── __init__.py ├── .gitignore ├── data_utils.pyc ├── README.md ├── model_config.py ├── data_utils.py ├── preprocess-sick.py ├── relatedness_test.py ├── data_utils_test.py ├── treelstm.py ├── run_relatedness.py ├── download.py └── relatedness.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | jars/* 2 | sick/* 3 | glove/* 4 | *.pyc 5 | -------------------------------------------------------------------------------- /data_utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wolfhu/LSTMRelatedness/HEAD/data_utils.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LSTMRelatedness 2 | Attempt at using LSTMs to predict semantic relatedness of sentences (a la Tai et al. in Improved Semantic Representations From Tree-Structured Long Short-Term Memory Networks, Kai Sheng Tai, Richard Socher, and Christopher D. Manning) 3 | -------------------------------------------------------------------------------- /model_config.py: -------------------------------------------------------------------------------- 1 | 2 | class SmallConfig(object): 3 | """Small config.""" 4 | init_scale = 0.1 5 | learning_rate = 1.0 6 | max_grad_norm = 5 7 | num_layers = 2 8 | num_steps = 20 9 | hidden_size = 200 10 | max_epoch = 4 11 | max_max_epoch = 13 12 | keep_prob = 1.0 13 | lr_decay = 0.5 14 | batch_size = 20 15 | word_vec_size = 300 16 | vocab_size = 10000 17 | 18 | 19 | class MediumConfig(object): 20 | """Medium config.""" 21 | init_scale = 0.05 22 | learning_rate = 1.0 23 | max_grad_norm = 5 24 | num_layers = 2 25 | num_steps = 35 26 | hidden_size = 650 27 | max_epoch = 6 28 | max_max_epoch = 39 29 | keep_prob = 0.5 30 | lr_decay = 0.8 31 | batch_size = 20 32 | vocab_size = 10000 33 | word_vec_size = 300 34 | 35 | 36 | class LargeConfig(object): 37 | """Large config.""" 38 | init_scale = 0.04 39 | learning_rate = 1.0 40 | max_grad_norm = 10 41 | word_vec_size = 300 42 | num_layers = 2 43 | num_steps = 35 44 | hidden_size = 1500 45 | max_epoch = 14 46 | max_max_epoch = 55 47 | keep_prob = 0.35 48 | lr_decay = 1 / 1.15 49 | batch_size = 20 50 | vocab_size = 10000 51 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def load_pretrained_glove_vectors(glove_file): 5 | """ 6 | This loads the pretrained glove vectors found at 7 | http://nlp.stanford.edu/projects/glove/ into a dictionary and 8 | vocabulary wordlist. 
Returns the vector dictionary, keyed by
9 |     word index, and the vocabulary mapping each word to its index.
10 | 
11 |     :param glove_file: the filepath of the pretrained GloVe vectors
12 |     """
13 |     index = 0
14 |     vocabulary = {}
15 |     glove_vectors = {}
16 |     with open(glove_file, 'r') as fin:
17 |         for line in fin:
18 |             items = line.replace('\r', '').replace('\n', '').split(' ')
19 |             # if len(items) < 10: continue
20 |             word = items[0]
21 |             if word in vocabulary:
22 |                 wordindex = vocabulary[word]
23 |             else:
24 |                 wordindex = index
25 |                 vocabulary[word] = index
26 |                 index += 1
27 |             vect = np.array([np.float32(x) for x in items[1:] if x])  # skip empty fields from stray spaces
28 |             glove_vectors[wordindex] = vect
29 | 
30 |     return glove_vectors, vocabulary
31 | 
32 | def convert_sentence_to_glove_vectors(sentence, vocab, glove_vectors, vector_size=300):
33 |     # TODO: tokenize better
34 |     word_vectors = []
35 |     for word in sentence.split(" "):
36 |         word_vectors.append(convert_word_to_glove(word, vocab, glove_vectors, vector_size))
37 |     return np.array(word_vectors)
38 | 
39 | def convert_word_to_glove(word, vocab, glove_vectors, vector_size=300):
40 |     if word in vocab:
41 |         return glove_vectors[vocab[word]]
42 |     else:
43 |         # Unknown word: assign the next free index (indexes are 0-based) and a zero vector.
44 |         index = len(vocab)
45 |         vocab[word] = index
46 |         zeroes = np.zeros(vector_size)
47 |         glove_vectors[index] = zeroes
48 |         return zeroes
49 | 
50 | def load_sick_data(sick_path, vocab, glove_vectors, vector_size=300):
51 |     # TODO: this can probably be a lot more pythonic and efficient
52 |     l_sentences = []
53 |     r_sentences = []
54 |     relatedness = []
55 |     with open(sick_path) as f:
56 |         for line in f:
57 |             cols = line.split('\t')
58 |             # l_sentence =
59 | 
--------------------------------------------------------------------------------
/preprocess-sick.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocessing script for SICK data.
3 | 4 | """ 5 | 6 | import os 7 | import glob 8 | 9 | def make_dirs(dirs): 10 | for d in dirs: 11 | if not os.path.exists(d): 12 | os.makedirs(d) 13 | 14 | def build_vocab(filepaths, dst_path, lowercase=True): 15 | vocab = set() 16 | for filepath in filepaths: 17 | with open(filepath) as f: 18 | for line in f: 19 | if lowercase: 20 | line = line.lower() 21 | vocab |= set(line.split()) 22 | with open(dst_path, 'w') as f: 23 | for w in sorted(vocab): 24 | f.write(w + '\n') 25 | 26 | def split(filepath, dst_dir): 27 | with open(filepath) as datafile, \ 28 | open(os.path.join(dst_dir, 'a.txt'), 'w') as afile, \ 29 | open(os.path.join(dst_dir, 'b.txt'), 'w') as bfile, \ 30 | open(os.path.join(dst_dir, 'id.txt'), 'w') as idfile, \ 31 | open(os.path.join(dst_dir, 'sim.txt'), 'w') as simfile: 32 | datafile.readline() 33 | for line in datafile: 34 | i, a, b, sim, ent = line.strip().split('\t') 35 | idfile.write(i + '\n') 36 | afile.write(a + '\n') 37 | bfile.write(b + '\n') 38 | simfile.write(sim + '\n') 39 | 40 | if __name__ == '__main__': 41 | print('=' * 80) 42 | print('Preprocessing SICK dataset') 43 | print('=' * 80) 44 | 45 | base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 46 | data_dir = os.path.join(base_dir, 'data') 47 | sick_dir = os.path.join(data_dir, 'sick') 48 | lib_dir = os.path.join(base_dir, 'lib') 49 | train_dir = os.path.join(sick_dir, 'train') 50 | dev_dir = os.path.join(sick_dir, 'dev') 51 | test_dir = os.path.join(sick_dir, 'test') 52 | make_dirs([train_dir, dev_dir, test_dir]) 53 | 54 | # split into separate files 55 | split(os.path.join(sick_dir, 'SICK_train.txt'), train_dir) 56 | split(os.path.join(sick_dir, 'SICK_trial.txt'), dev_dir) 57 | split(os.path.join(sick_dir, 'SICK_test_annotated.txt'), test_dir) 58 | 59 | # get vocabulary 60 | build_vocab( 61 | glob.glob(os.path.join(sick_dir, '*/*.toks')), 62 | os.path.join(sick_dir, 'vocab.txt')) 63 | build_vocab( 64 | glob.glob(os.path.join(sick_dir, '*/*.toks')), 65 | os.path.join(sick_dir, 'vocab-cased.txt'), 66 | lowercase=False) 67 | -------------------------------------------------------------------------------- /relatedness_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from data_utils import * 5 | from relatedness import RelatednessModel 6 | from model_config import SmallConfig 7 | import tensorflow as tf 8 | 9 | comma_glove_vector = np.array([-0.082752, 0.67204, -0.14987, -0.064983, 0.056491, 10 | 0.40228, 0.0027747, -0.3311, -0.30691, 2.0817, 0.031819, 0.013643, 0.30265, 11 | 0.0071297, -0.5819, -0.2774, -0.062254, 1.1451, -0.24232, 0.1235, -0.12243, 12 | 0.33152, -0.006162, -0.30541, -0.13057, -0.054601, 0.037083, -0.070552, 13 | 0.5893, -0.30385, 0.2898, -0.14653, -0.27052, 0.37161, 0.32031, 14 | -0.29125, 0.0052483, -0.13212, -0.052736, 0.087349, -0.26668, -0.16897, 0.015162, 15 | -0.0083746, -0.14871, 0.23413, -0.20719, -0.091386, 0.40075, -0.17223, 0.18145, 16 | 0.37586, -0.28682, 0.37289, -0.16185, 0.18008, 0.3032, -0.13216, 0.18352, 17 | 0.095759, 0.094916, 0.008289, 0.11761, 0.34046, 0.03677, -0.29077, 18 | 0.058303, -0.027814, 0.082941, 0.1862, -0.031494, 0.27985, -0.074412, 19 | -0.13762, -0.21866, 0.18138, 0.040855, -0.113, 0.24107, 0.3657, -0.27525, 20 | -0.05684, 0.34872, 0.011884, 0.14517, -0.71395, 0.48497, 0.14807, 0.62287, 21 | 0.20599, 0.58379, -0.13438, 0.40207, 0.18311, 0.28021, -0.42349, -0.25626, 22 | 0.17715, -0.54095, 0.16596, -0.036058, 0.08499, -0.64989, 0.075549, -0.28831, 23 
| 0.40626, -0.2802, 0.094062, 0.32406, 0.28437, -0.26341, 0.11553, 0.071918, 24 | -0.47215, -0.18366, -0.34709, 0.29964, -0.66514, 0.002516, -0.42333, 0.27512, 25 | 0.36012, 0.16311, 0.23964, -0.05923, 0.3261, 0.20559, 0.038677, -0.045816, 26 | 0.089764, 0.43151, -0.15954, 0.08532, -0.26572, -0.15001, 0.084286, -0.16714, -0.43004, 27 | 0.060807, 0.13121, -0.24112, 0.66554, 0.4453, -0.18019, -0.13919, 0.56252, 0.21457, 28 | -0.46443, -0.012211, 0.029988, -0.051094, -0.20135, 0.80788, 0.47377, -0.057647, 29 | 0.46216, 0.16084, -0.20954, -0.05452, 0.15572, -0.13712, 0.12972, -0.011936, 30 | -0.003378, -0.13595, -0.080711, 0.20065, 0.054056, 0.046816, 0.059539, 0.046265, 0.17754, 31 | -0.31094, 0.28119, -0.24355, 0.085252, -0.21011, -0.19472, 0.0027297, -0.46341, 0.14789, -0.31517, 32 | -0.065939, 0.036106, 0.42903, -0.33759, 0.16432, 0.32568, -0.050392, -0.054297, 0.24074, 33 | 0.41923, 0.13012, -0.17167, -0.37808, -0.23089, -0.019477, -0.29291, -0.30824, 0.30297, 34 | -0.22659, 0.081574, -0.18516, -0.21408, 0.40616, -0.28974, 0.074174, -0.17795, 0.28595, 35 | -0.039626, -0.2339, -0.36054, -0.067503, -0.091065, 0.23438, -0.0041331, 0.003232, 0.0072134, 36 | 0.008697, 0.21614, 0.049904, 0.35582, 0.13748, 0.073361, 0.14166, 0.2412, -0.013322, 37 | 0.15613, 0.083381, 0.088146, -0.019357, 0.43795, 0.083961, 0.45309, -0.50489, 38 | -0.10865, -0.2527, -0.18251, 0.20441, 0.13319, 0.1294, 0.050594, -0.15612, -0.39543, 39 | 0.12538, 0.24881, -0.1927, -0.31847, -0.12719, 0.4341, 0.31177, -0.0040946, -0.2094, 40 | -0.079961, 0.1161, -0.050794, 0.015266, -0.2803, -0.12486, 0.23587, 0.2339, -0.14023, 41 | 0.028462, 0.56923, -0.1649, -0.036429, 0.010051, -0.17107, -0.042608, 0.044965, -0.4393, -0.26137, 42 | 0.30088, -0.060772, -0.45312, -0.19076, -0.20288, 0.27694, -0.060888, 0.11944, 0.62206, 43 | -0.19343, 0.47849, -0.30113, 0.059389, 0.074901, 0.061068, -0.4662, 0.40054, -0.19099, 44 | -0.14331,0.018267,-0.18643,0.20709,-0.35598,0.05338,-0.050821,-0.1918, -0.37846, -0.06589]) 45 | 46 | class RelatednessModelTest(unittest.TestCase): 47 | def test_load_glove_vectors(self): 48 | # Get a small config 49 | config = SmallConfig() 50 | glove, vocab = load_pretrained_glove_vectors('glove/glove.test.subset.txt') 51 | model = RelatednessModel(False, glove, vocab, config) 52 | # right now this just tests that the dang thing just runs 53 | sentence = ", , , ," 54 | with tf.Graph().as_default(), tf.Session() as session: 55 | model.process_sentence_pair(sentence, sentence, session) 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /data_utils_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from data_utils import * 5 | 6 | comma_glove_vector = np.array([-0.082752, 0.67204, -0.14987, -0.064983, 0.056491, 7 | 0.40228, 0.0027747, -0.3311, -0.30691, 2.0817, 0.031819, 0.013643, 0.30265, 8 | 0.0071297, -0.5819, -0.2774, -0.062254, 1.1451, -0.24232, 0.1235, -0.12243, 9 | 0.33152, -0.006162, -0.30541, -0.13057, -0.054601, 0.037083, -0.070552, 10 | 0.5893, -0.30385, 0.2898, -0.14653, -0.27052, 0.37161, 0.32031, 11 | -0.29125, 0.0052483, -0.13212, -0.052736, 0.087349, -0.26668, -0.16897, 0.015162, 12 | -0.0083746, -0.14871, 0.23413, -0.20719, -0.091386, 0.40075, -0.17223, 0.18145, 13 | 0.37586, -0.28682, 0.37289, -0.16185, 0.18008, 0.3032, -0.13216, 0.18352, 14 | 0.095759, 0.094916, 0.008289, 0.11761, 0.34046, 0.03677, -0.29077, 15 | 
0.058303, -0.027814, 0.082941, 0.1862, -0.031494, 0.27985, -0.074412, 16 | -0.13762, -0.21866, 0.18138, 0.040855, -0.113, 0.24107, 0.3657, -0.27525, 17 | -0.05684, 0.34872, 0.011884, 0.14517, -0.71395, 0.48497, 0.14807, 0.62287, 18 | 0.20599, 0.58379, -0.13438, 0.40207, 0.18311, 0.28021, -0.42349, -0.25626, 19 | 0.17715, -0.54095, 0.16596, -0.036058, 0.08499, -0.64989, 0.075549, -0.28831, 20 | 0.40626, -0.2802, 0.094062, 0.32406, 0.28437, -0.26341, 0.11553, 0.071918, 21 | -0.47215, -0.18366, -0.34709, 0.29964, -0.66514, 0.002516, -0.42333, 0.27512, 22 | 0.36012, 0.16311, 0.23964, -0.05923, 0.3261, 0.20559, 0.038677, -0.045816, 23 | 0.089764, 0.43151, -0.15954, 0.08532, -0.26572, -0.15001, 0.084286, -0.16714, -0.43004, 24 | 0.060807, 0.13121, -0.24112, 0.66554, 0.4453, -0.18019, -0.13919, 0.56252, 0.21457, 25 | -0.46443, -0.012211, 0.029988, -0.051094, -0.20135, 0.80788, 0.47377, -0.057647, 26 | 0.46216, 0.16084, -0.20954, -0.05452, 0.15572, -0.13712, 0.12972, -0.011936, 27 | -0.003378, -0.13595, -0.080711, 0.20065, 0.054056, 0.046816, 0.059539, 0.046265, 0.17754, 28 | -0.31094, 0.28119, -0.24355, 0.085252, -0.21011, -0.19472, 0.0027297, -0.46341, 0.14789, -0.31517, 29 | -0.065939, 0.036106, 0.42903, -0.33759, 0.16432, 0.32568, -0.050392, -0.054297, 0.24074, 30 | 0.41923, 0.13012, -0.17167, -0.37808, -0.23089, -0.019477, -0.29291, -0.30824, 0.30297, 31 | -0.22659, 0.081574, -0.18516, -0.21408, 0.40616, -0.28974, 0.074174, -0.17795, 0.28595, 32 | -0.039626, -0.2339, -0.36054, -0.067503, -0.091065, 0.23438, -0.0041331, 0.003232, 0.0072134, 33 | 0.008697, 0.21614, 0.049904, 0.35582, 0.13748, 0.073361, 0.14166, 0.2412, -0.013322, 34 | 0.15613, 0.083381, 0.088146, -0.019357, 0.43795, 0.083961, 0.45309, -0.50489, 35 | -0.10865, -0.2527, -0.18251, 0.20441, 0.13319, 0.1294, 0.050594, -0.15612, -0.39543, 36 | 0.12538, 0.24881, -0.1927, -0.31847, -0.12719, 0.4341, 0.31177, -0.0040946, -0.2094, 37 | -0.079961, 0.1161, -0.050794, 0.015266, -0.2803, -0.12486, 0.23587, 0.2339, -0.14023, 38 | 0.028462, 0.56923, -0.1649, -0.036429, 0.010051, -0.17107, -0.042608, 0.044965, -0.4393, -0.26137, 39 | 0.30088, -0.060772, -0.45312, -0.19076, -0.20288, 0.27694, -0.060888, 0.11944, 0.62206, 40 | -0.19343, 0.47849, -0.30113, 0.059389, 0.074901, 0.061068, -0.4662, 0.40054, -0.19099, 41 | -0.14331,0.018267,-0.18643,0.20709,-0.35598,0.05338,-0.050821,-0.1918, -0.37846, -0.06589]) 42 | 43 | class TestDataUtils(unittest.TestCase): 44 | 45 | def test_load_glove_vectors(self): 46 | glove, vocab = load_pretrained_glove_vectors('glove/glove.test.subset.txt') 47 | self.assertEqual(vocab[','], 0) 48 | np.testing.assert_allclose(glove[0], comma_glove_vector) 49 | print("Glove vector size: %s" % len(comma_glove_vector)) 50 | 51 | def test_convert_sentence_to_glove_vectors(self): 52 | glove, vocab = load_pretrained_glove_vectors('glove/glove.test.subset.txt') 53 | sentence = ", , , , ," 54 | vecs = convert_sentence_to_glove_vectors(sentence, vocab, glove) 55 | self.assertEqual(len(vecs), 5) 56 | for vec in vecs: np.testing.assert_allclose(vec, comma_glove_vector) 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /treelstm.py: -------------------------------------------------------------------------------- 1 | """Module for constructing Child Sum Tree LSTM Cells.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import math 7 | 8 | from six.moves import 
xrange # pylint: disable=redefined-builtin
9 | import tensorflow as tf
10 | 
11 | from tensorflow.models.rnn import linear
12 | from tensorflow.models.rnn import rnn_cell
13 | from tensorflow.models.rnn.rnn_cell import RNNCell
14 | 
15 | class ChildSumTreeLSTMCell(RNNCell):
16 |   """Child Sum Tree Long short-term memory unit recurrent
17 |   network cell.
18 | 
19 |   This implementation is based on:
20 | 
21 |     http://arxiv.org/pdf/1503.00075v3.pdf
22 | 
23 |   Kai Sheng Tai, Richard Socher, Christopher D. Manning
24 |   "Improved Semantic Representations From Tree-Structured Long
25 |   Short-Term Memory Networks." CoRR, 2015.
26 |   """
27 | 
28 |   def __init__(self, num_units, forget_bias=1.0):
29 |     self._num_units = num_units
30 |     self._forget_bias = forget_bias
31 | 
32 |   @property
33 |   def input_size(self):
34 |     return self._num_units
35 | 
36 |   @property
37 |   def output_size(self):
38 |     return self._num_units
39 | 
40 |   @property
41 |   def state_size(self):
42 |     return 2 * self._num_units
43 | 
44 |   def __call__(self, inputs, state, child_states=None, scope=None):
45 |     """Child-Sum Tree-LSTM cell; with no child states it acts as a plain LSTM."""
46 |     with tf.variable_scope(scope or type(self).__name__):  # "ChildSumTreeLSTMCell"
47 |       # Parameters of gates are concatenated into one multiply for efficiency.
48 |       c, h = tf.split(1, 2, state)
49 |       concat = linear.linear([inputs, h], 4 * self._num_units, True)
50 | 
51 |       # One forget gate (and memory cell) per child, as in Tai et al.
52 |       # This can be made more efficient since we're doing more than needs to be
53 |       # done, but for now w/e
54 |       fs = []
55 |       cs = []
56 |       for child_state in child_states or []:
57 |         c_k, h_k = tf.split(1, 2, child_state)
58 |         child_concat = linear.linear([inputs, h_k], 4 * self._num_units, True)
59 |         _, _, f_k, _ = tf.split(1, 4, child_concat)
60 |         fs.append(f_k)
61 |         cs.append(c_k)
62 | 
63 |       # i = input_gate, j = new_input, f = forget_gate, o = output_gate
64 |       # TODO: when children are present, h should be the sum of the child
65 |       # hidden states before computing i, j and o.
66 |       i, j, f, o = tf.split(1, 4, concat)
67 | 
68 |       # If no children, just treat it like a regular LSTM.
69 |       if not fs:
70 |         fs.append(f)
71 |         cs.append(c)
72 | 
73 |       new_c = sum(c_k * tf.sigmoid(f_k + self._forget_bias)
74 |                   for c_k, f_k in zip(cs, fs)) + tf.sigmoid(i) * tf.tanh(j)
75 |       new_h = tf.tanh(new_c) * tf.sigmoid(o)
76 | 
77 |       return new_h, tf.concat(1, [new_c, new_h])
78 | 
79 | 
80 | class ChildSumTreeLSTM(RNNCell):
81 |   """Tree cell composed of child cells feeding a root Child-Sum Tree-LSTM cell."""
82 | 
83 |   def __init__(self, size, keep_prob=1.0):
84 |     self._children = []
85 |     self._keep_prob = keep_prob
86 |     self._root = ChildSumTreeLSTMCell(size, forget_bias=0.0)
87 |     self._root = rnn_cell.DropoutWrapper(self._root, output_keep_prob=keep_prob)
88 | 
89 |   @property
90 |   def input_size(self):
91 |     return self._root.input_size
92 | 
93 |   @property
94 |   def output_size(self):
95 |     return self._root.output_size
96 | 
97 |   @property
98 |   def state_size(self):
99 |     return sum([cell.state_size for cell in self._children]) + self._root.state_size
100 | 
101 |   def __call__(self, inputs, state, scope=None):
102 |     """Run the child cells and then the root cell on inputs, starting from state."""
103 |     with tf.variable_scope(scope or type(self).__name__):  # "ChildSumTreeLSTM"
104 |       cur_state_pos = 0
105 |       cur_inp = inputs
106 |       new_states = []
107 |       # Can the number of cells be variable???
108 |       # The flat `state` is the concatenation of each child's (c, h) state
109 |       # followed by the root's, matching state_size above.
110 |       for i, cell in enumerate(self._children + [self._root]):
111 |         with tf.variable_scope("Cell%d" % i):
112 |           cur_state = tf.slice(state, [0, cur_state_pos], [-1, cell.state_size])
113 |           cur_state_pos += cell.state_size
114 |           cur_inp, new_state = cell(cur_inp, cur_state)
115 |           new_states.append(new_state)
116 |       return cur_inp, tf.concat(1, new_states)
117 | 
--------------------------------------------------------------------------------
/run_relatedness.py:
--------------------------------------------------------------------------------
1 | # Peter Henderson
2 | # ==============================================================================
3 | 
4 | from __future__ import absolute_import
5 | from __future__ import division
6 | from __future__ import print_function
7 | 
8 | import time
9 | 
10 | import tensorflow.python.platform
11 | 
12 | import numpy as np
13 | import tensorflow as tf
14 | 
15 | from tensorflow.models.rnn import rnn_cell
16 | from tensorflow.models.rnn.ptb import reader
17 | 
18 | from data_utils import load_pretrained_glove_vectors
19 | from model_config import *
20 | 
21 | flags = tf.flags
22 | logging = tf.logging
23 | 
24 | flags.DEFINE_string(
25 |     "model", "small",
26 |     "A type of model. Possible options are: small, medium, large.")
27 | flags.DEFINE_string("data_path", None, "data_path")
28 | flags.DEFINE_string("glove_path", None, "glove_path")
29 | 
30 | FLAGS = flags.FLAGS
31 | 
32 | def get_config():
33 |   if FLAGS.model == "small":
34 |     return SmallConfig()
35 |   elif FLAGS.model == "medium":
36 |     return MediumConfig()
37 |   elif FLAGS.model == "large":
38 |     return LargeConfig()
39 |   else:
40 |     raise ValueError("Invalid model: %s" % FLAGS.model)
41 | 
42 | def run_epoch(session, m, data, eval_op, verbose=False):
43 |   """Runs the model on the given data."""
44 |   epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
45 |   start_time = time.time()
46 |   costs = 0.0
47 |   iters = 0
48 |   state = m.initial_state.eval()
49 |   for step, (left_batch, right_batch, relatedness_scores) in enumerate(reader.ptb_iterator(data, m.batch_size,
50 |                                                                                            m.num_steps)):
51 |     cost, state, _ = session.run([m.cost, m.final_state, eval_op],
52 |                                  {m.l_inputs: left_batch,
53 |                                   m.r_inputs: right_batch,
54 |                                   m.targets: relatedness_scores,
55 |                                   m.initial_state: state})
56 |     costs += cost
57 |     iters += m.num_steps
58 | 
59 |     if verbose and step % (epoch_size // 10) == 10:
60 |       print("%.3f perplexity: %.3f speed: %.0f wps" %
61 |             (step * 1.0 / epoch_size, np.exp(costs / iters),
62 |              iters * m.batch_size / (time.time() - start_time)))
63 | 
64 |   return np.exp(costs / iters)
65 | 
66 | def main(unused_args):
67 |   if not FLAGS.data_path:
68 |     raise ValueError("Must set --data_path to the SICK data directory")
69 |   if not FLAGS.glove_path:
70 |     raise ValueError("Must set --glove_path to the pretrained GloVe vectors file")
71 | 
72 |   glove_vectors, vocabulary = load_pretrained_glove_vectors(FLAGS.glove_path)
73 | 
74 |   # TODO: read training, test, validation data
75 |   # train_data, test_data, valid_data = load_data(FLAGS.data_path)
76 | 
77 |   # training config params
78 |   config = get_config()
79 | 
80 |   # create a new config object for the eval, since we're changing variables
81 |   eval_config = get_config()
82 |   eval_config.batch_size = 1
83 |   eval_config.num_steps = 1
84 | 
85 |   # Generate a session
86 |   with tf.Graph().as_default(), tf.Session() as session:
87 |     initializer = tf.random_uniform_initializer(-config.init_scale,
88 |                                                 config.init_scale)
89 |     # TODO: initialize model
90 |     tf.initialize_all_variables().run()
91 | 
92 |     # For X epochs run an epoch and
decrease the learning 93 | # rate 94 | for i in range(config.max_max_epoch): 95 | 96 | lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0) 97 | m.assign_lr(session, config.learning_rate * lr_decay) 98 | 99 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 100 | train_perplexity = run_epoch(session, m, train_data, m.train_op, 101 | verbose=True) 102 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 103 | valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op()) 104 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 105 | 106 | test_perplexity = run_epoch(session, mtest, test_data, tf.no_op()) 107 | print("Test Perplexity: %.3f" % test_perplexity) 108 | 109 | 110 | if __name__ == "__main__": 111 | tf.app.run() 112 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads the following: 3 | - Stanford parser 4 | - Stanford POS tagger 5 | - Glove vectors 6 | - SICK dataset (semantic relatedness task) 7 | - Stanford Sentiment Treebank (sentiment classification task) 8 | 9 | """ 10 | 11 | from __future__ import print_function 12 | import urllib2 13 | import sys 14 | import os 15 | import shutil 16 | import zipfile 17 | import gzip 18 | 19 | def download(url, dirpath): 20 | filename = url.split('/')[-1] 21 | filepath = os.path.join(dirpath, filename) 22 | u = urllib2.urlopen(url) 23 | f = open(filepath, 'wb') 24 | filesize = int(u.info().getheaders("Content-Length")[0]) 25 | print("Downloading: %s Bytes: %s" % (filename, filesize)) 26 | 27 | downloaded = 0 28 | block_sz = 8192 29 | status_width = 70 30 | while True: 31 | buf = u.read(block_sz) 32 | if not buf: 33 | print('') 34 | break 35 | else: 36 | print('', end='\r') 37 | downloaded += len(buf) 38 | f.write(buf) 39 | status = (("[%-" + str(status_width + 1) + "s] %3.2f%%") % 40 | ('=' * int(float(downloaded) / filesize * status_width) + '>', downloaded * 100. 
/ filesize)) 41 | print(status, end='') 42 | sys.stdout.flush() 43 | f.close() 44 | return filepath 45 | 46 | def unzip(filepath): 47 | dirpath = os.path.dirname(filepath) 48 | with zipfile.ZipFile(filepath) as zf: 49 | zf.extractall(dirpath) 50 | os.remove(filepath) 51 | 52 | def download_tagger(dirpath): 53 | tagger_dir = 'stanford-tagger' 54 | if os.path.exists(os.path.join(dirpath, tagger_dir)): 55 | print('Found Stanford POS Tagger - skip') 56 | return 57 | url = 'http://nlp.stanford.edu/software/stanford-postagger-2015-01-29.zip' 58 | filepath = download(url, dirpath) 59 | zip_dir = '' 60 | with zipfile.ZipFile(filepath) as zf: 61 | zip_dir = zf.namelist()[0] 62 | zf.extractall(dirpath) 63 | os.remove(filepath) 64 | os.rename(os.path.join(dirpath, zip_dir), os.path.join(dirpath, tagger_dir)) 65 | 66 | def download_parser(dirpath): 67 | parser_dir = 'stanford-parser' 68 | if os.path.exists(os.path.join(dirpath, parser_dir)): 69 | print('Found Stanford Parser - skip') 70 | return 71 | url = 'http://nlp.stanford.edu/software/stanford-parser-full-2015-01-29.zip' 72 | filepath = download(url, dirpath) 73 | zip_dir = '' 74 | with zipfile.ZipFile(filepath) as zf: 75 | zip_dir = zf.namelist()[0] 76 | zf.extractall(dirpath) 77 | os.remove(filepath) 78 | os.rename(os.path.join(dirpath, zip_dir), os.path.join(dirpath, parser_dir)) 79 | 80 | def download_wordvecs(dirpath): 81 | if os.path.exists(dirpath): 82 | print('Found Glove vectors - skip') 83 | return 84 | else: 85 | os.makedirs(dirpath) 86 | url = 'http://www-nlp.stanford.edu/data/glove.840B.300d.txt.gz' 87 | filepath = download(url, dirpath) 88 | print('extracting ' + filepath) 89 | with gzip.open(filepath, 'rb') as gf: 90 | with open(filepath[:-3], 'w') as f: 91 | for line in gf: 92 | f.write(line) 93 | os.remove(filepath) 94 | 95 | def download_sick(dirpath): 96 | if os.path.exists(dirpath): 97 | print('Found SICK dataset - skip') 98 | return 99 | else: 100 | os.makedirs(dirpath) 101 | train_url = 'http://alt.qcri.org/semeval2014/task1/data/uploads/sick_train.zip' 102 | trial_url = 'http://alt.qcri.org/semeval2014/task1/data/uploads/sick_trial.zip' 103 | test_url = 'http://alt.qcri.org/semeval2014/task1/data/uploads/sick_test_annotated.zip' 104 | unzip(download(train_url, dirpath)) 105 | unzip(download(trial_url, dirpath)) 106 | unzip(download(test_url, dirpath)) 107 | 108 | def download_sst(dirpath): 109 | if os.path.exists(dirpath): 110 | print('Found SST dataset - skip') 111 | return 112 | url = 'http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip' 113 | parent_dir = os.path.dirname(dirpath) 114 | unzip(download(url, parent_dir)) 115 | os.rename( 116 | os.path.join(parent_dir, 'stanfordSentimentTreebank'), 117 | os.path.join(parent_dir, 'sst')) 118 | shutil.rmtree(os.path.join(parent_dir, '__MACOSX')) # remove extraneous dir 119 | 120 | if __name__ == '__main__': 121 | base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 122 | 123 | # data 124 | data_dir = os.path.join(base_dir, 'data') 125 | wordvec_dir = os.path.join(data_dir, 'glove') 126 | sick_dir = os.path.join(data_dir, 'sick') 127 | sst_dir = os.path.join(data_dir, 'sst') 128 | 129 | # libraries 130 | lib_dir = os.path.join(base_dir, 'lib') 131 | 132 | # download dependencies 133 | download_tagger(lib_dir) 134 | download_parser(lib_dir) 135 | download_wordvecs(wordvec_dir) 136 | download_sick(sick_dir) 137 | download_sst(sst_dir) 138 | -------------------------------------------------------------------------------- /relatedness.py: 
--------------------------------------------------------------------------------
1 | # Peter Henderson
2 | # ==============================================================================
3 | 
4 | """Example Tree-LSTM relatedness model, following Tai, Socher, and Manning,
5 | "Improved Semantic Representations From Tree-Structured Long Short-Term
6 | Memory Networks" (http://arxiv.org/pdf/1503.00075v3.pdf).
7 | 
8 | Much of the scaffolding is adapted from TensorFlow's PTB language model
9 | example (tensorflow/models/rnn/ptb/ptb_word_lm.py).
10 | 
11 | There are 3 supported model configurations (small, medium, large); see
12 | model_config.py. For reference, the PTB example this is adapted from reports
13 | the following word-level perplexities for the same configurations (they are
14 | not relatedness results):
15 | ===========================================
16 | | config | epochs | train | valid  | test
17 | ===========================================
18 | | small  | 13     | 37.99 | 121.39 | 115.91
19 | | medium | 39     | 48.45 | 86.16  | 82.07
20 | | large  | 55     | 37.87 | 82.62  | 78.29
21 | The exact results may vary depending on the random initialization.
22 | 
23 | The hyperparameters used in the model:
24 | - init_scale - the initial scale of the weights
25 | - learning_rate - the initial value of the learning rate
26 | - max_grad_norm - the maximum permissible norm of the gradient
27 | - num_layers - the number of LSTM layers
28 | - num_steps - the number of unrolled steps of LSTM
29 | - hidden_size - the number of LSTM units
30 | - max_epoch - the number of epochs trained with the initial learning rate
31 | - max_max_epoch - the total number of epochs for training
32 | - keep_prob - the probability of keeping weights in the dropout layer
33 | - lr_decay - the decay of the learning rate for each epoch after "max_epoch"
34 | - batch_size - the batch size
35 | - word_vec_size - the dimensionality of the pretrained GloVe word vectors
36 | 
37 | To run (after download.py and preprocess-sick.py have fetched and split the data):
38 |   python run_relatedness.py --model=small \
39 |     --data_path=<SICK dir> --glove_path=<GloVe vectors .txt>
40 | 
41 | """
42 | from __future__ import absolute_import
43 | from __future__ import division
44 | from __future__ import print_function
45 | 
46 | import time
47 | 
48 | import tensorflow.python.platform
49 | 
50 | import numpy as np
51 | import tensorflow as tf
52 | 
53 | from tensorflow.models.rnn import rnn_cell
54 | from tensorflow.models.rnn import rnn
55 | from tensorflow.models.rnn import seq2seq
56 | from data_utils import *
57 | 
58 | class RelatednessModel(object):
59 |   """The relatedness model."""
60 | 
61 |   def process_sentence_pair(self, lsentence_raw, rsentence_raw, session, prev_state=None):
62 |     """Run the left and right LSTMs over a single sentence pair.
63 |     TODO: this is inefficient, especially without symbolic compilation;
64 |     sentence pairs should really be batched. Inputs are plain strings."""
65 |     # convert each sentence into an array of word vectors
66 |     lsentence = convert_sentence_to_glove_vectors(lsentence_raw, self.vocabulary, self.glove_word_vectors, self.word_vec_size)
67 |     rsentence = convert_sentence_to_glove_vectors(rsentence_raw, self.vocabulary, self.glove_word_vectors, self.word_vec_size)
68 | 
69 |     # sentence_length x word_vec_size (e.g. 5 x 300)
70 |     _left_inputs = tf.placeholder(tf.float32, [len(lsentence), self.config.word_vec_size])
71 |     _right_inputs = tf.placeholder(tf.float32, [len(rsentence), self.config.word_vec_size])
72 | 
73 |     # _targets = tf.placeholder(tf.int32)
74 | 
75 |     # Apply dropout filter
76 |     # if self.is_training and self.config.keep_prob < 1:
77 |     #   left_inputs = [tf.nn.dropout(input_, self.config.keep_prob) for input_ in left_inputs]
78 |     #   right_inputs = [tf.nn.dropout(input_, self.config.keep_prob) for input_ in right_inputs]
79 | 
80 |     linputs = [tf.reshape(i, (1, self.config.word_vec_size)) for i in tf.split(0, len(lsentence), _left_inputs)]
81 |     rinputs = [tf.reshape(i, (1, self.config.word_vec_size)) for i in tf.split(0, len(rsentence), _right_inputs)]
82 | 
83 |     if prev_state is None:
84 |       prev_state = self.left_lstm_cell.zero_state(1, tf.float32)
85 | 
86 |     with tf.variable_scope("LeftLSTM"):
87 |       loutputs, lstates = rnn.rnn(self.left_lstm_cell, linputs, initial_state=prev_state, sequence_length=len(lsentence))
88 |     with tf.variable_scope("RightLSTM"):
89 |       routputs, rstates = rnn.rnn(self.right_lstm_cell, rinputs, initial_state=prev_state, sequence_length=len(rsentence))
90 | 
91 |     iop = tf.initialize_all_variables()
92 |     session.run(iop)
93 | 
94 |     # TODO: the actual loss function and relatedness softmax layer
95 |     louts = session.run(loutputs, feed_dict={_left_inputs: lsentence, _right_inputs: rsentence})
96 | 
97 | 
98 |     # outputs at each timestep of the sentence (i.e. each word)
99 |     print(louts)
100 |     print(len(louts))
101 |     # print(routputs)
102 | 
103 |   def __init__(self, is_training, glove_word_vectors, vocabulary, config):
104 |     self.size = config.hidden_size
105 |     self.config = config
106 |     self.is_training = is_training
107 |     self.word_vec_size = config.word_vec_size
108 |     vocab_size = config.vocab_size
109 |     self.glove_word_vectors = glove_word_vectors
110 |     self.vocabulary = vocabulary
111 | 
112 |     # Slightly better results can be obtained with forget gate biases
113 |     # initialized to 1 but the hyperparameters of the model would need to be
114 |     # different than reported in the paper.
115 | 116 | # TODO: these might be able to be improved if used the LSTMCell which has other features 117 | # to improve performance, but then need the sentence_length 118 | with tf.variable_scope("LeftLSTM"): 119 | self.left_lstm_cell = rnn_cell.BasicLSTMCell(self.size, forget_bias=1.0) 120 | with tf.variable_scope("RightLSTM"): 121 | self.right_lstm_cell = rnn_cell.BasicLSTMCell(self.size, forget_bias=1.0) 122 | if is_training and config.keep_prob < 1: 123 | with tf.variable_scope("LeftLSTM"): 124 | self.left_lstm_cell = rnn_cell.DropoutWrapper(self.left_lstm_cell, output_keep_prob=config.keep_prob) 125 | with tf.variable_scope("RightLSTM"): 126 | self.right_lstm_cell = rnn_cell.DropoutWrapper(self.right_lstm_cell, output_keep_prob=config.keep_prob) 127 | 128 | with tf.variable_scope("LeftLSTM"): 129 | self.left_lstm_cell = rnn_cell.MultiRNNCell([self.left_lstm_cell] * config.num_layers) 130 | with tf.variable_scope("RightLSTM"): 131 | self.right_lstm_cell = rnn_cell.MultiRNNCell([self.right_lstm_cell] * config.num_layers) 132 | 133 | # output = tf.reshape(tf.concat(1, outputs), [-1, size]) 134 | # # Need a simple network on top for the similarity 135 | # logits = tf.nn.xw_plus_b(output, 136 | # tf.get_variable("softmax_w", [size, vocab_size]), 137 | # tf.get_variable("softmax_b", [vocab_size])) 138 | # # TODO: replace this with softmax 139 | # loss = seq2seq.sequence_loss_by_example([logits], 140 | # [tf.reshape(self._targets, [-1])], 141 | # [tf.ones([batch_size * num_steps])], 142 | # vocab_size) 143 | # self._cost = cost = tf.reduce_sum(loss) / batch_size 144 | # self._final_state = states[-1] 145 | # 146 | # if not is_training: 147 | # return 148 | # 149 | # self._lr = tf.Variable(0.0, trainable=False) 150 | # tvars = tf.trainable_variables() 151 | # grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 152 | # config.max_grad_norm) 153 | # optimizer = tf.train.GradientDescentOptimizer(self.lr) 154 | # self._train_op = optimizer.apply_gradients(zip(grads, tvars)) 155 | 156 | def assign_lr(self, session, lr_value): 157 | session.run(tf.assign(self.lr, lr_value)) 158 | 159 | @property 160 | def input_data(self): 161 | return self._input_data 162 | 163 | @property 164 | def targets(self): 165 | return self._targets 166 | 167 | @property 168 | def initial_state(self): 169 | return self._initial_state 170 | 171 | @property 172 | def cost(self): 173 | return self._cost 174 | 175 | @property 176 | def final_state(self): 177 | return self._final_state 178 | 179 | @property 180 | def lr(self): 181 | return self._lr 182 | 183 | @property 184 | def train_op(self): 185 | return self._train_op 186 | --------------------------------------------------------------------------------
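
For reference, a minimal end-to-end sketch of how these pieces are meant to fit together, mirroring relatedness_test.py. It assumes the small glove/glove.test.subset.txt file used by the tests is present (the full vectors come from download.py), and it only exercises the forward pass, since the loss and similarity layers are still TODOs in relatedness.py:

import tensorflow as tf

from data_utils import load_pretrained_glove_vectors
from model_config import SmallConfig
from relatedness import RelatednessModel

# Load a small GloVe subset and its word-to-index vocabulary (mirrors the unit tests).
glove_vectors, vocabulary = load_pretrained_glove_vectors('glove/glove.test.subset.txt')

# Build the model with is_training=False and the small configuration.
model = RelatednessModel(False, glove_vectors, vocabulary, SmallConfig())

# Run both LSTMs over a toy sentence pair; "," is known to be in the test subset,
# and unknown words are mapped to zero vectors by convert_word_to_glove.
sentence = ", , , ,"
with tf.Graph().as_default(), tf.Session() as session:
    model.process_sentence_pair(sentence, sentence, session)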