├── __init__.py ├── .gitignore ├── data_utils.pyc ├── README.md ├── model_config.py ├── data_utils.py ├── preprocess-sick.py ├── relatedness_test.py ├── data_utils_test.py ├── treelstm.py ├── run_relatedness.py ├── download.py └── relatedness.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | jars/* 2 | sick/* 3 | glove/* 4 | *.pyc 5 | -------------------------------------------------------------------------------- /data_utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wolfhu/LSTMRelatedness/HEAD/data_utils.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LSTMRelatedness 2 | Attempt at using LSTMs to predict semantic relatedness of sentences (a la Tai et al. in Improved Semantic Representations From Tree-Structured Long Short-Term Memory Networks, Kai Sheng Tai, Richard Socher, and Christopher D. Manning) 3 | -------------------------------------------------------------------------------- /model_config.py: -------------------------------------------------------------------------------- 1 | 2 | class SmallConfig(object): 3 | """Small config.""" 4 | init_scale = 0.1 5 | learning_rate = 1.0 6 | max_grad_norm = 5 7 | num_layers = 2 8 | num_steps = 20 9 | hidden_size = 200 10 | max_epoch = 4 11 | max_max_epoch = 13 12 | keep_prob = 1.0 13 | lr_decay = 0.5 14 | batch_size = 20 15 | word_vec_size = 300 16 | vocab_size = 10000 17 | 18 | 19 | class MediumConfig(object): 20 | """Medium config.""" 21 | init_scale = 0.05 22 | learning_rate = 1.0 23 | max_grad_norm = 5 24 | num_layers = 2 25 | num_steps = 35 26 | hidden_size = 650 27 | max_epoch = 6 28 | max_max_epoch = 39 29 | keep_prob = 0.5 30 | lr_decay = 0.8 31 | batch_size = 20 32 | vocab_size = 10000 33 | word_vec_size = 300 34 | 35 | 36 | class LargeConfig(object): 37 | """Large config.""" 38 | init_scale = 0.04 39 | learning_rate = 1.0 40 | max_grad_norm = 10 41 | word_vec_size = 300 42 | num_layers = 2 43 | num_steps = 35 44 | hidden_size = 1500 45 | max_epoch = 14 46 | max_max_epoch = 55 47 | keep_prob = 0.35 48 | lr_decay = 1 / 1.15 49 | batch_size = 20 50 | vocab_size = 10000 51 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def load_pretrained_glove_vectors(glove_file): 5 | """ 6 | This loads the pretrained glove vectors found at 7 | http://nlp.stanford.edu/projects/glove/ into a dictionary and 8 | vocabulary wordlist. 
Returns the vector dictionary, keyed by
9 |     word index, and the vocabulary mapping each word to its index.
10 | 
11 |     :param glove_file: the filepath of the pretrained GloVe vectors
12 |     """
13 |     index = 0
14 |     vocabulary = {}
15 |     glove_vectors = {}
16 |     with open(glove_file, 'r') as fin:
17 |         for line in fin:
18 |             items = line.replace('\r', '').replace('\n', '').split(' ')
19 |             # if len(items) < 10: continue
20 |             word = items[0]
21 |             if word in vocabulary:
22 |                 wordindex = vocabulary[word]
23 |             else:
24 |                 wordindex = index
25 |                 vocabulary[word] = index
26 |                 index += 1
27 |             vect = np.array([np.float32(x) for x in items[1:] if x])  # skip empty fields from stray spaces
28 |             glove_vectors[wordindex] = vect
29 | 
30 |     return glove_vectors, vocabulary
31 | 
32 | def convert_sentence_to_glove_vectors(sentence, vocab, glove_vectors, vector_size=300):
33 |     # TODO: tokenize better
34 |     word_vectors = []
35 |     for word in sentence.split(" "):
36 |         word_vectors.append(convert_word_to_glove(word, vocab, glove_vectors, vector_size))
37 |     return np.array(word_vectors)
38 | 
39 | def convert_word_to_glove(word, vocab, glove_vectors, vector_size=300):
40 |     if word in vocab:
41 |         return glove_vectors[vocab[word]]
42 |     else:
43 |         # Unknown word: assign the next free index (indexes are 0-based) and a zero vector.
44 |         index = len(vocab)
45 |         vocab[word] = index
46 |         zeroes = np.zeros(vector_size)
47 |         glove_vectors[index] = zeroes
48 |         return zeroes
49 | 
50 | def load_sick_data(sick_path, vocab, glove_vectors, vector_size=300):
51 |     # TODO: this can probably be a lot more pythonic and efficient
52 |     l_sentences = []
53 |     r_sentences = []
54 |     relatedness = []
55 |     with open(sick_path) as f:
56 |         for line in f:
57 |             cols = line.split('\t')
58 |             # l_sentence =
59 | 
--------------------------------------------------------------------------------
/preprocess-sick.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocessing script for SICK data.
3 | 4 | """ 5 | 6 | import os 7 | import glob 8 | 9 | def make_dirs(dirs): 10 | for d in dirs: 11 | if not os.path.exists(d): 12 | os.makedirs(d) 13 | 14 | def build_vocab(filepaths, dst_path, lowercase=True): 15 | vocab = set() 16 | for filepath in filepaths: 17 | with open(filepath) as f: 18 | for line in f: 19 | if lowercase: 20 | line = line.lower() 21 | vocab |= set(line.split()) 22 | with open(dst_path, 'w') as f: 23 | for w in sorted(vocab): 24 | f.write(w + '\n') 25 | 26 | def split(filepath, dst_dir): 27 | with open(filepath) as datafile, \ 28 | open(os.path.join(dst_dir, 'a.txt'), 'w') as afile, \ 29 | open(os.path.join(dst_dir, 'b.txt'), 'w') as bfile, \ 30 | open(os.path.join(dst_dir, 'id.txt'), 'w') as idfile, \ 31 | open(os.path.join(dst_dir, 'sim.txt'), 'w') as simfile: 32 | datafile.readline() 33 | for line in datafile: 34 | i, a, b, sim, ent = line.strip().split('\t') 35 | idfile.write(i + '\n') 36 | afile.write(a + '\n') 37 | bfile.write(b + '\n') 38 | simfile.write(sim + '\n') 39 | 40 | if __name__ == '__main__': 41 | print('=' * 80) 42 | print('Preprocessing SICK dataset') 43 | print('=' * 80) 44 | 45 | base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 46 | data_dir = os.path.join(base_dir, 'data') 47 | sick_dir = os.path.join(data_dir, 'sick') 48 | lib_dir = os.path.join(base_dir, 'lib') 49 | train_dir = os.path.join(sick_dir, 'train') 50 | dev_dir = os.path.join(sick_dir, 'dev') 51 | test_dir = os.path.join(sick_dir, 'test') 52 | make_dirs([train_dir, dev_dir, test_dir]) 53 | 54 | # split into separate files 55 | split(os.path.join(sick_dir, 'SICK_train.txt'), train_dir) 56 | split(os.path.join(sick_dir, 'SICK_trial.txt'), dev_dir) 57 | split(os.path.join(sick_dir, 'SICK_test_annotated.txt'), test_dir) 58 | 59 | # get vocabulary 60 | build_vocab( 61 | glob.glob(os.path.join(sick_dir, '*/*.toks')), 62 | os.path.join(sick_dir, 'vocab.txt')) 63 | build_vocab( 64 | glob.glob(os.path.join(sick_dir, '*/*.toks')), 65 | os.path.join(sick_dir, 'vocab-cased.txt'), 66 | lowercase=False) 67 | -------------------------------------------------------------------------------- /relatedness_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from data_utils import * 5 | from relatedness import RelatednessModel 6 | from model_config import SmallConfig 7 | import tensorflow as tf 8 | 9 | comma_glove_vector = np.array([-0.082752, 0.67204, -0.14987, -0.064983, 0.056491, 10 | 0.40228, 0.0027747, -0.3311, -0.30691, 2.0817, 0.031819, 0.013643, 0.30265, 11 | 0.0071297, -0.5819, -0.2774, -0.062254, 1.1451, -0.24232, 0.1235, -0.12243, 12 | 0.33152, -0.006162, -0.30541, -0.13057, -0.054601, 0.037083, -0.070552, 13 | 0.5893, -0.30385, 0.2898, -0.14653, -0.27052, 0.37161, 0.32031, 14 | -0.29125, 0.0052483, -0.13212, -0.052736, 0.087349, -0.26668, -0.16897, 0.015162, 15 | -0.0083746, -0.14871, 0.23413, -0.20719, -0.091386, 0.40075, -0.17223, 0.18145, 16 | 0.37586, -0.28682, 0.37289, -0.16185, 0.18008, 0.3032, -0.13216, 0.18352, 17 | 0.095759, 0.094916, 0.008289, 0.11761, 0.34046, 0.03677, -0.29077, 18 | 0.058303, -0.027814, 0.082941, 0.1862, -0.031494, 0.27985, -0.074412, 19 | -0.13762, -0.21866, 0.18138, 0.040855, -0.113, 0.24107, 0.3657, -0.27525, 20 | -0.05684, 0.34872, 0.011884, 0.14517, -0.71395, 0.48497, 0.14807, 0.62287, 21 | 0.20599, 0.58379, -0.13438, 0.40207, 0.18311, 0.28021, -0.42349, -0.25626, 22 | 0.17715, -0.54095, 0.16596, -0.036058, 0.08499, -0.64989, 0.075549, -0.28831, 23 
| 0.40626, -0.2802, 0.094062, 0.32406, 0.28437, -0.26341, 0.11553, 0.071918, 24 | -0.47215, -0.18366, -0.34709, 0.29964, -0.66514, 0.002516, -0.42333, 0.27512, 25 | 0.36012, 0.16311, 0.23964, -0.05923, 0.3261, 0.20559, 0.038677, -0.045816, 26 | 0.089764, 0.43151, -0.15954, 0.08532, -0.26572, -0.15001, 0.084286, -0.16714, -0.43004, 27 | 0.060807, 0.13121, -0.24112, 0.66554, 0.4453, -0.18019, -0.13919, 0.56252, 0.21457, 28 | -0.46443, -0.012211, 0.029988, -0.051094, -0.20135, 0.80788, 0.47377, -0.057647, 29 | 0.46216, 0.16084, -0.20954, -0.05452, 0.15572, -0.13712, 0.12972, -0.011936, 30 | -0.003378, -0.13595, -0.080711, 0.20065, 0.054056, 0.046816, 0.059539, 0.046265, 0.17754, 31 | -0.31094, 0.28119, -0.24355, 0.085252, -0.21011, -0.19472, 0.0027297, -0.46341, 0.14789, -0.31517, 32 | -0.065939, 0.036106, 0.42903, -0.33759, 0.16432, 0.32568, -0.050392, -0.054297, 0.24074, 33 | 0.41923, 0.13012, -0.17167, -0.37808, -0.23089, -0.019477, -0.29291, -0.30824, 0.30297, 34 | -0.22659, 0.081574, -0.18516, -0.21408, 0.40616, -0.28974, 0.074174, -0.17795, 0.28595, 35 | -0.039626, -0.2339, -0.36054, -0.067503, -0.091065, 0.23438, -0.0041331, 0.003232, 0.0072134, 36 | 0.008697, 0.21614, 0.049904, 0.35582, 0.13748, 0.073361, 0.14166, 0.2412, -0.013322, 37 | 0.15613, 0.083381, 0.088146, -0.019357, 0.43795, 0.083961, 0.45309, -0.50489, 38 | -0.10865, -0.2527, -0.18251, 0.20441, 0.13319, 0.1294, 0.050594, -0.15612, -0.39543, 39 | 0.12538, 0.24881, -0.1927, -0.31847, -0.12719, 0.4341, 0.31177, -0.0040946, -0.2094, 40 | -0.079961, 0.1161, -0.050794, 0.015266, -0.2803, -0.12486, 0.23587, 0.2339, -0.14023, 41 | 0.028462, 0.56923, -0.1649, -0.036429, 0.010051, -0.17107, -0.042608, 0.044965, -0.4393, -0.26137, 42 | 0.30088, -0.060772, -0.45312, -0.19076, -0.20288, 0.27694, -0.060888, 0.11944, 0.62206, 43 | -0.19343, 0.47849, -0.30113, 0.059389, 0.074901, 0.061068, -0.4662, 0.40054, -0.19099, 44 | -0.14331,0.018267,-0.18643,0.20709,-0.35598,0.05338,-0.050821,-0.1918, -0.37846, -0.06589]) 45 | 46 | class RelatednessModelTest(unittest.TestCase): 47 | def test_load_glove_vectors(self): 48 | # Get a small config 49 | config = SmallConfig() 50 | glove, vocab = load_pretrained_glove_vectors('glove/glove.test.subset.txt') 51 | model = RelatednessModel(False, glove, vocab, config) 52 | # right now this just tests that the dang thing just runs 53 | sentence = ", , , ," 54 | with tf.Graph().as_default(), tf.Session() as session: 55 | model.process_sentence_pair(sentence, sentence, session) 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /data_utils_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | 4 | from data_utils import * 5 | 6 | comma_glove_vector = np.array([-0.082752, 0.67204, -0.14987, -0.064983, 0.056491, 7 | 0.40228, 0.0027747, -0.3311, -0.30691, 2.0817, 0.031819, 0.013643, 0.30265, 8 | 0.0071297, -0.5819, -0.2774, -0.062254, 1.1451, -0.24232, 0.1235, -0.12243, 9 | 0.33152, -0.006162, -0.30541, -0.13057, -0.054601, 0.037083, -0.070552, 10 | 0.5893, -0.30385, 0.2898, -0.14653, -0.27052, 0.37161, 0.32031, 11 | -0.29125, 0.0052483, -0.13212, -0.052736, 0.087349, -0.26668, -0.16897, 0.015162, 12 | -0.0083746, -0.14871, 0.23413, -0.20719, -0.091386, 0.40075, -0.17223, 0.18145, 13 | 0.37586, -0.28682, 0.37289, -0.16185, 0.18008, 0.3032, -0.13216, 0.18352, 14 | 0.095759, 0.094916, 0.008289, 0.11761, 0.34046, 0.03677, -0.29077, 15 | 
0.058303, -0.027814, 0.082941, 0.1862, -0.031494, 0.27985, -0.074412, 16 | -0.13762, -0.21866, 0.18138, 0.040855, -0.113, 0.24107, 0.3657, -0.27525, 17 | -0.05684, 0.34872, 0.011884, 0.14517, -0.71395, 0.48497, 0.14807, 0.62287, 18 | 0.20599, 0.58379, -0.13438, 0.40207, 0.18311, 0.28021, -0.42349, -0.25626, 19 | 0.17715, -0.54095, 0.16596, -0.036058, 0.08499, -0.64989, 0.075549, -0.28831, 20 | 0.40626, -0.2802, 0.094062, 0.32406, 0.28437, -0.26341, 0.11553, 0.071918, 21 | -0.47215, -0.18366, -0.34709, 0.29964, -0.66514, 0.002516, -0.42333, 0.27512, 22 | 0.36012, 0.16311, 0.23964, -0.05923, 0.3261, 0.20559, 0.038677, -0.045816, 23 | 0.089764, 0.43151, -0.15954, 0.08532, -0.26572, -0.15001, 0.084286, -0.16714, -0.43004, 24 | 0.060807, 0.13121, -0.24112, 0.66554, 0.4453, -0.18019, -0.13919, 0.56252, 0.21457, 25 | -0.46443, -0.012211, 0.029988, -0.051094, -0.20135, 0.80788, 0.47377, -0.057647, 26 | 0.46216, 0.16084, -0.20954, -0.05452, 0.15572, -0.13712, 0.12972, -0.011936, 27 | -0.003378, -0.13595, -0.080711, 0.20065, 0.054056, 0.046816, 0.059539, 0.046265, 0.17754, 28 | -0.31094, 0.28119, -0.24355, 0.085252, -0.21011, -0.19472, 0.0027297, -0.46341, 0.14789, -0.31517, 29 | -0.065939, 0.036106, 0.42903, -0.33759, 0.16432, 0.32568, -0.050392, -0.054297, 0.24074, 30 | 0.41923, 0.13012, -0.17167, -0.37808, -0.23089, -0.019477, -0.29291, -0.30824, 0.30297, 31 | -0.22659, 0.081574, -0.18516, -0.21408, 0.40616, -0.28974, 0.074174, -0.17795, 0.28595, 32 | -0.039626, -0.2339, -0.36054, -0.067503, -0.091065, 0.23438, -0.0041331, 0.003232, 0.0072134, 33 | 0.008697, 0.21614, 0.049904, 0.35582, 0.13748, 0.073361, 0.14166, 0.2412, -0.013322, 34 | 0.15613, 0.083381, 0.088146, -0.019357, 0.43795, 0.083961, 0.45309, -0.50489, 35 | -0.10865, -0.2527, -0.18251, 0.20441, 0.13319, 0.1294, 0.050594, -0.15612, -0.39543, 36 | 0.12538, 0.24881, -0.1927, -0.31847, -0.12719, 0.4341, 0.31177, -0.0040946, -0.2094, 37 | -0.079961, 0.1161, -0.050794, 0.015266, -0.2803, -0.12486, 0.23587, 0.2339, -0.14023, 38 | 0.028462, 0.56923, -0.1649, -0.036429, 0.010051, -0.17107, -0.042608, 0.044965, -0.4393, -0.26137, 39 | 0.30088, -0.060772, -0.45312, -0.19076, -0.20288, 0.27694, -0.060888, 0.11944, 0.62206, 40 | -0.19343, 0.47849, -0.30113, 0.059389, 0.074901, 0.061068, -0.4662, 0.40054, -0.19099, 41 | -0.14331,0.018267,-0.18643,0.20709,-0.35598,0.05338,-0.050821,-0.1918, -0.37846, -0.06589]) 42 | 43 | class TestDataUtils(unittest.TestCase): 44 | 45 | def test_load_glove_vectors(self): 46 | glove, vocab = load_pretrained_glove_vectors('glove/glove.test.subset.txt') 47 | self.assertEqual(vocab[','], 0) 48 | np.testing.assert_allclose(glove[0], comma_glove_vector) 49 | print("Glove vector size: %s" % len(comma_glove_vector)) 50 | 51 | def test_convert_sentence_to_glove_vectors(self): 52 | glove, vocab = load_pretrained_glove_vectors('glove/glove.test.subset.txt') 53 | sentence = ", , , , ," 54 | vecs = convert_sentence_to_glove_vectors(sentence, vocab, glove) 55 | self.assertEqual(len(vecs), 5) 56 | for vec in vecs: np.testing.assert_allclose(vec, comma_glove_vector) 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /treelstm.py: -------------------------------------------------------------------------------- 1 | """Module for constructing Child Sum Tree LSTM Cells.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import math 7 | 8 | from six.moves import 
xrange # pylint: disable=redefined-builtin
9 | import tensorflow as tf
10 | 
11 | from tensorflow.models.rnn import linear
12 | from tensorflow.models.rnn import rnn_cell
13 | from tensorflow.models.rnn.rnn_cell import RNNCell
14 | 
15 | class ChildSumTreeLSTMCell(RNNCell):
16 |   """Child Sum Tree Long short-term memory unit recurrent
17 |   network cell.
18 | 
19 |   This implementation is based on:
20 | 
21 |     http://arxiv.org/pdf/1503.00075v3.pdf
22 | 
23 |   Kai Sheng Tai, Richard Socher, Christopher D. Manning
24 |   "Improved Semantic Representations From Tree-Structured Long
25 |   Short-Term Memory Networks." CoRR, 2015.
26 |   """
27 | 
28 |   def __init__(self, num_units, forget_bias=1.0):
29 |     self._num_units = num_units
30 |     self._forget_bias = forget_bias
31 | 
32 |   @property
33 |   def input_size(self):
34 |     return self._num_units
35 | 
36 |   @property
37 |   def output_size(self):
38 |     return self._num_units
39 | 
40 |   @property
41 |   def state_size(self):
42 |     return 2 * self._num_units
43 | 
44 |   def __call__(self, inputs, state, child_states=None, scope=None):
45 |     """Child-Sum Tree-LSTM cell; with no child states it acts as a plain LSTM."""
46 |     with tf.variable_scope(scope or type(self).__name__):  # "ChildSumTreeLSTMCell"
47 |       # Parameters of gates are concatenated into one multiply for efficiency.
48 |       c, h = tf.split(1, 2, state)
49 |       concat = linear.linear([inputs, h], 4 * self._num_units, True)
50 | 
51 |       # One forget gate (and memory cell) per child, as in Tai et al.
52 |       # This can be made more efficient since we're doing more than needs to be
53 |       # done, but for now w/e
54 |       fs = []
55 |       cs = []
56 |       for child_state in child_states or []:
57 |         c_k, h_k = tf.split(1, 2, child_state)
58 |         child_concat = linear.linear([inputs, h_k], 4 * self._num_units, True)
59 |         _, _, f_k, _ = tf.split(1, 4, child_concat)
60 |         fs.append(f_k)
61 |         cs.append(c_k)
62 | 
63 |       # i = input_gate, j = new_input, f = forget_gate, o = output_gate
64 |       # TODO: when children are present, h should be the sum of the child
65 |       # hidden states before computing i, j and o.
66 |       i, j, f, o = tf.split(1, 4, concat)
67 | 
68 |       # If no children, just treat it like a regular LSTM.
69 |       if not fs:
70 |         fs.append(f)
71 |         cs.append(c)
72 | 
73 |       new_c = sum(c_k * tf.sigmoid(f_k + self._forget_bias)
74 |                   for c_k, f_k in zip(cs, fs)) + tf.sigmoid(i) * tf.tanh(j)
75 |       new_h = tf.tanh(new_c) * tf.sigmoid(o)
76 | 
77 |       return new_h, tf.concat(1, [new_c, new_h])
78 | 
79 | 
80 | class ChildSumTreeLSTM(RNNCell):
81 |   """Tree cell composed of child cells feeding a root Child-Sum Tree-LSTM cell."""
82 | 
83 |   def __init__(self, size, keep_prob=1.0):
84 |     self._children = []
85 |     self._keep_prob = keep_prob
86 |     self._root = ChildSumTreeLSTMCell(size, forget_bias=0.0)
87 |     self._root = rnn_cell.DropoutWrapper(self._root, output_keep_prob=keep_prob)
88 | 
89 |   @property
90 |   def input_size(self):
91 |     return self._root.input_size
92 | 
93 |   @property
94 |   def output_size(self):
95 |     return self._root.output_size
96 | 
97 |   @property
98 |   def state_size(self):
99 |     return sum([cell.state_size for cell in self._children]) + self._root.state_size
100 | 
101 |   def __call__(self, inputs, state, scope=None):
102 |     """Run the child cells and then the root cell on inputs, starting from state."""
103 |     with tf.variable_scope(scope or type(self).__name__):  # "ChildSumTreeLSTM"
104 |       cur_state_pos = 0
105 |       cur_inp = inputs
106 |       new_states = []
107 |       # Can the number of cells be variable???
108 |       # The flat `state` is the concatenation of each child's (c, h) state
109 |       # followed by the root's, matching state_size above.
110 |       for i, cell in enumerate(self._children + [self._root]):
111 |         with tf.variable_scope("Cell%d" % i):
112 |           cur_state = tf.slice(state, [0, cur_state_pos], [-1, cell.state_size])
113 |           cur_state_pos += cell.state_size
114 |           cur_inp, new_state = cell(cur_inp, cur_state)
115 |           new_states.append(new_state)
116 |       return cur_inp, tf.concat(1, new_states)
117 | 
--------------------------------------------------------------------------------
/run_relatedness.py:
--------------------------------------------------------------------------------
1 | # Peter Henderson
2 | # ==============================================================================
3 | 
4 | from __future__ import absolute_import
5 | from __future__ import division
6 | from __future__ import print_function
7 | 
8 | import time
9 | 
10 | import tensorflow.python.platform
11 | 
12 | import numpy as np
13 | import tensorflow as tf
14 | 
15 | from tensorflow.models.rnn import rnn_cell
16 | from tensorflow.models.rnn.ptb import reader
17 | 
18 | from data_utils import load_pretrained_glove_vectors
19 | from model_config import *
20 | 
21 | flags = tf.flags
22 | logging = tf.logging
23 | 
24 | flags.DEFINE_string(
25 |     "model", "small",
26 |     "A type of model. Possible options are: small, medium, large.")
27 | flags.DEFINE_string("data_path", None, "data_path")
28 | flags.DEFINE_string("glove_path", None, "glove_path")
29 | 
30 | FLAGS = flags.FLAGS
31 | 
32 | def get_config():
33 |   if FLAGS.model == "small":
34 |     return SmallConfig()
35 |   elif FLAGS.model == "medium":
36 |     return MediumConfig()
37 |   elif FLAGS.model == "large":
38 |     return LargeConfig()
39 |   else:
40 |     raise ValueError("Invalid model: %s" % FLAGS.model)
41 | 
42 | def run_epoch(session, m, data, eval_op, verbose=False):
43 |   """Runs the model on the given data."""
44 |   epoch_size = ((len(data) // m.batch_size) - 1) // m.num_steps
45 |   start_time = time.time()
46 |   costs = 0.0
47 |   iters = 0
48 |   state = m.initial_state.eval()
49 |   for step, (left_batch, right_batch, relatedness_scores) in enumerate(reader.ptb_iterator(data, m.batch_size,
50 |                                                                                            m.num_steps)):
51 |     cost, state, _ = session.run([m.cost, m.final_state, eval_op],
52 |                                  {m.l_inputs: left_batch,
53 |                                   m.r_inputs: right_batch,
54 |                                   m.targets: relatedness_scores,
55 |                                   m.initial_state: state})
56 |     costs += cost
57 |     iters += m.num_steps
58 | 
59 |     if verbose and step % (epoch_size // 10) == 10:
60 |       print("%.3f perplexity: %.3f speed: %.0f wps" %
61 |             (step * 1.0 / epoch_size, np.exp(costs / iters),
62 |              iters * m.batch_size / (time.time() - start_time)))
63 | 
64 |   return np.exp(costs / iters)
65 | 
66 | def main(unused_args):
67 |   if not FLAGS.data_path:
68 |     raise ValueError("Must set --data_path to the SICK data directory")
69 |   if not FLAGS.glove_path:
70 |     raise ValueError("Must set --glove_path to the pretrained GloVe vectors file")
71 | 
72 |   glove_vectors, vocabulary = load_pretrained_glove_vectors(FLAGS.glove_path)
73 | 
74 |   # TODO: read training, test, validation data
75 |   # train_data, test_data, valid_data = load_data(FLAGS.data_path)
76 | 
77 |   # training config params
78 |   config = get_config()
79 | 
80 |   # create a new config object for the eval, since we're changing variables
81 |   eval_config = get_config()
82 |   eval_config.batch_size = 1
83 |   eval_config.num_steps = 1
84 | 
85 |   # Generate a session
86 |   with tf.Graph().as_default(), tf.Session() as session:
87 |     initializer = tf.random_uniform_initializer(-config.init_scale,
88 |                                                 config.init_scale)
89 |     # TODO: initialize model
90 |     tf.initialize_all_variables().run()
91 | 
92 |     # For X epochs run an epoch and
decrease the learning 93 | # rate 94 | for i in range(config.max_max_epoch): 95 | 96 | lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0) 97 | m.assign_lr(session, config.learning_rate * lr_decay) 98 | 99 | print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr))) 100 | train_perplexity = run_epoch(session, m, train_data, m.train_op, 101 | verbose=True) 102 | print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity)) 103 | valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op()) 104 | print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity)) 105 | 106 | test_perplexity = run_epoch(session, mtest, test_data, tf.no_op()) 107 | print("Test Perplexity: %.3f" % test_perplexity) 108 | 109 | 110 | if __name__ == "__main__": 111 | tf.app.run() 112 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloads the following: 3 | - Stanford parser 4 | - Stanford POS tagger 5 | - Glove vectors 6 | - SICK dataset (semantic relatedness task) 7 | - Stanford Sentiment Treebank (sentiment classification task) 8 | 9 | """ 10 | 11 | from __future__ import print_function 12 | import urllib2 13 | import sys 14 | import os 15 | import shutil 16 | import zipfile 17 | import gzip 18 | 19 | def download(url, dirpath): 20 | filename = url.split('/')[-1] 21 | filepath = os.path.join(dirpath, filename) 22 | u = urllib2.urlopen(url) 23 | f = open(filepath, 'wb') 24 | filesize = int(u.info().getheaders("Content-Length")[0]) 25 | print("Downloading: %s Bytes: %s" % (filename, filesize)) 26 | 27 | downloaded = 0 28 | block_sz = 8192 29 | status_width = 70 30 | while True: 31 | buf = u.read(block_sz) 32 | if not buf: 33 | print('') 34 | break 35 | else: 36 | print('', end='\r') 37 | downloaded += len(buf) 38 | f.write(buf) 39 | status = (("[%-" + str(status_width + 1) + "s] %3.2f%%") % 40 | ('=' * int(float(downloaded) / filesize * status_width) + '>', downloaded * 100. 
/ filesize)) 41 | print(status, end='') 42 | sys.stdout.flush() 43 | f.close() 44 | return filepath 45 | 46 | def unzip(filepath): 47 | dirpath = os.path.dirname(filepath) 48 | with zipfile.ZipFile(filepath) as zf: 49 | zf.extractall(dirpath) 50 | os.remove(filepath) 51 | 52 | def download_tagger(dirpath): 53 | tagger_dir = 'stanford-tagger' 54 | if os.path.exists(os.path.join(dirpath, tagger_dir)): 55 | print('Found Stanford POS Tagger - skip') 56 | return 57 | url = 'http://nlp.stanford.edu/software/stanford-postagger-2015-01-29.zip' 58 | filepath = download(url, dirpath) 59 | zip_dir = '' 60 | with zipfile.ZipFile(filepath) as zf: 61 | zip_dir = zf.namelist()[0] 62 | zf.extractall(dirpath) 63 | os.remove(filepath) 64 | os.rename(os.path.join(dirpath, zip_dir), os.path.join(dirpath, tagger_dir)) 65 | 66 | def download_parser(dirpath): 67 | parser_dir = 'stanford-parser' 68 | if os.path.exists(os.path.join(dirpath, parser_dir)): 69 | print('Found Stanford Parser - skip') 70 | return 71 | url = 'http://nlp.stanford.edu/software/stanford-parser-full-2015-01-29.zip' 72 | filepath = download(url, dirpath) 73 | zip_dir = '' 74 | with zipfile.ZipFile(filepath) as zf: 75 | zip_dir = zf.namelist()[0] 76 | zf.extractall(dirpath) 77 | os.remove(filepath) 78 | os.rename(os.path.join(dirpath, zip_dir), os.path.join(dirpath, parser_dir)) 79 | 80 | def download_wordvecs(dirpath): 81 | if os.path.exists(dirpath): 82 | print('Found Glove vectors - skip') 83 | return 84 | else: 85 | os.makedirs(dirpath) 86 | url = 'http://www-nlp.stanford.edu/data/glove.840B.300d.txt.gz' 87 | filepath = download(url, dirpath) 88 | print('extracting ' + filepath) 89 | with gzip.open(filepath, 'rb') as gf: 90 | with open(filepath[:-3], 'w') as f: 91 | for line in gf: 92 | f.write(line) 93 | os.remove(filepath) 94 | 95 | def download_sick(dirpath): 96 | if os.path.exists(dirpath): 97 | print('Found SICK dataset - skip') 98 | return 99 | else: 100 | os.makedirs(dirpath) 101 | train_url = 'http://alt.qcri.org/semeval2014/task1/data/uploads/sick_train.zip' 102 | trial_url = 'http://alt.qcri.org/semeval2014/task1/data/uploads/sick_trial.zip' 103 | test_url = 'http://alt.qcri.org/semeval2014/task1/data/uploads/sick_test_annotated.zip' 104 | unzip(download(train_url, dirpath)) 105 | unzip(download(trial_url, dirpath)) 106 | unzip(download(test_url, dirpath)) 107 | 108 | def download_sst(dirpath): 109 | if os.path.exists(dirpath): 110 | print('Found SST dataset - skip') 111 | return 112 | url = 'http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip' 113 | parent_dir = os.path.dirname(dirpath) 114 | unzip(download(url, parent_dir)) 115 | os.rename( 116 | os.path.join(parent_dir, 'stanfordSentimentTreebank'), 117 | os.path.join(parent_dir, 'sst')) 118 | shutil.rmtree(os.path.join(parent_dir, '__MACOSX')) # remove extraneous dir 119 | 120 | if __name__ == '__main__': 121 | base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 122 | 123 | # data 124 | data_dir = os.path.join(base_dir, 'data') 125 | wordvec_dir = os.path.join(data_dir, 'glove') 126 | sick_dir = os.path.join(data_dir, 'sick') 127 | sst_dir = os.path.join(data_dir, 'sst') 128 | 129 | # libraries 130 | lib_dir = os.path.join(base_dir, 'lib') 131 | 132 | # download dependencies 133 | download_tagger(lib_dir) 134 | download_parser(lib_dir) 135 | download_wordvecs(wordvec_dir) 136 | download_sick(sick_dir) 137 | download_sst(sst_dir) 138 | -------------------------------------------------------------------------------- /relatedness.py: 
--------------------------------------------------------------------------------
1 | # Peter Henderson
2 | # ==============================================================================
3 | 
4 | """Example Tree-LSTM relatedness model, following Tai, Socher, and Manning,
5 | "Improved Semantic Representations From Tree-Structured Long Short-Term
6 | Memory Networks" (http://arxiv.org/pdf/1503.00075v3.pdf).
7 | 
8 | Much of the scaffolding is adapted from TensorFlow's PTB language model
9 | example (tensorflow/models/rnn/ptb/ptb_word_lm.py).
10 | 
11 | There are 3 supported model configurations (small, medium, large); see
12 | model_config.py. For reference, the PTB example this is adapted from reports
13 | the following word-level perplexities for the same configurations (they are
14 | not relatedness results):
15 | ===========================================
16 | | config | epochs | train | valid  | test
17 | ===========================================
18 | | small  | 13     | 37.99 | 121.39 | 115.91
19 | | medium | 39     | 48.45 | 86.16  | 82.07
20 | | large  | 55     | 37.87 | 82.62  | 78.29
21 | The exact results may vary depending on the random initialization.
22 | 
23 | The hyperparameters used in the model:
24 | - init_scale - the initial scale of the weights
25 | - learning_rate - the initial value of the learning rate
26 | - max_grad_norm - the maximum permissible norm of the gradient
27 | - num_layers - the number of LSTM layers
28 | - num_steps - the number of unrolled steps of LSTM
29 | - hidden_size - the number of LSTM units
30 | - max_epoch - the number of epochs trained with the initial learning rate
31 | - max_max_epoch - the total number of epochs for training
32 | - keep_prob - the probability of keeping weights in the dropout layer
33 | - lr_decay - the decay of the learning rate for each epoch after "max_epoch"
34 | - batch_size - the batch size
35 | - word_vec_size - the dimensionality of the pretrained GloVe word vectors
36 | 
37 | To run (after download.py and preprocess-sick.py have fetched and split the data):
38 |   python run_relatedness.py --model=small \
39 |     --data_path=<SICK dir> --glove_path=<GloVe vectors .txt>
40 | 
41 | """
42 | from __future__ import absolute_import
43 | from __future__ import division
44 | from __future__ import print_function
45 | 
46 | import time
47 | 
48 | import tensorflow.python.platform
49 | 
50 | import numpy as np
51 | import tensorflow as tf
52 | 
53 | from tensorflow.models.rnn import rnn_cell
54 | from tensorflow.models.rnn import rnn
55 | from tensorflow.models.rnn import seq2seq
56 | from data_utils import *
57 | 
58 | class RelatednessModel(object):
59 |   """The relatedness model."""
60 | 
61 |   def process_sentence_pair(self, lsentence_raw, rsentence_raw, session, prev_state=None):
62 |     """Run the left and right LSTMs over a single sentence pair.
63 |     TODO: this is inefficient, especially without symbolic compilation;
64 |     sentence pairs should really be batched. Inputs are plain strings."""
65 |     # convert each sentence into an array of word vectors
66 |     lsentence = convert_sentence_to_glove_vectors(lsentence_raw, self.vocabulary, self.glove_word_vectors, self.word_vec_size)
67 |     rsentence = convert_sentence_to_glove_vectors(rsentence_raw, self.vocabulary, self.glove_word_vectors, self.word_vec_size)
68 | 
69 |     # sentence_length x word_vec_size (e.g. 5 x 300)
70 |     _left_inputs = tf.placeholder(tf.float32, [len(lsentence), self.config.word_vec_size])
71 |     _right_inputs = tf.placeholder(tf.float32, [len(rsentence), self.config.word_vec_size])
72 | 
73 |     # _targets = tf.placeholder(tf.int32)
74 | 
75 |     # Apply dropout filter
76 |     # if self.is_training and self.config.keep_prob < 1:
77 |     #   left_inputs = [tf.nn.dropout(input_, self.config.keep_prob) for input_ in left_inputs]
78 |     #   right_inputs = [tf.nn.dropout(input_, self.config.keep_prob) for input_ in right_inputs]
79 | 
80 |     linputs = [tf.reshape(i, (1, self.config.word_vec_size)) for i in tf.split(0, len(lsentence), _left_inputs)]
81 |     rinputs = [tf.reshape(i, (1, self.config.word_vec_size)) for i in tf.split(0, len(rsentence), _right_inputs)]
82 | 
83 |     if prev_state is None:
84 |       prev_state = self.left_lstm_cell.zero_state(1, tf.float32)
85 | 
86 |     with tf.variable_scope("LeftLSTM"):
87 |       loutputs, lstates = rnn.rnn(self.left_lstm_cell, linputs, initial_state=prev_state, sequence_length=len(lsentence))
88 |     with tf.variable_scope("RightLSTM"):
89 |       routputs, rstates = rnn.rnn(self.right_lstm_cell, rinputs, initial_state=prev_state, sequence_length=len(rsentence))
90 | 
91 |     iop = tf.initialize_all_variables()
92 |     session.run(iop)
93 | 
94 |     # TODO: the actual loss function and relatedness softmax layer
95 |     louts = session.run(loutputs, feed_dict={_left_inputs: lsentence, _right_inputs: rsentence})
96 | 
97 | 
98 |     # outputs at each timestep of the sentence (i.e. each word)
99 |     print(louts)
100 |     print(len(louts))
101 |     # print(routputs)
102 | 
103 |   def __init__(self, is_training, glove_word_vectors, vocabulary, config):
104 |     self.size = config.hidden_size
105 |     self.config = config
106 |     self.is_training = is_training
107 |     self.word_vec_size = config.word_vec_size
108 |     vocab_size = config.vocab_size
109 |     self.glove_word_vectors = glove_word_vectors
110 |     self.vocabulary = vocabulary
111 | 
112 |     # Slightly better results can be obtained with forget gate biases
113 |     # initialized to 1 but the hyperparameters of the model would need to be
114 |     # different than reported in the paper.
115 | 116 | # TODO: these might be able to be improved if used the LSTMCell which has other features 117 | # to improve performance, but then need the sentence_length 118 | with tf.variable_scope("LeftLSTM"): 119 | self.left_lstm_cell = rnn_cell.BasicLSTMCell(self.size, forget_bias=1.0) 120 | with tf.variable_scope("RightLSTM"): 121 | self.right_lstm_cell = rnn_cell.BasicLSTMCell(self.size, forget_bias=1.0) 122 | if is_training and config.keep_prob < 1: 123 | with tf.variable_scope("LeftLSTM"): 124 | self.left_lstm_cell = rnn_cell.DropoutWrapper(self.left_lstm_cell, output_keep_prob=config.keep_prob) 125 | with tf.variable_scope("RightLSTM"): 126 | self.right_lstm_cell = rnn_cell.DropoutWrapper(self.right_lstm_cell, output_keep_prob=config.keep_prob) 127 | 128 | with tf.variable_scope("LeftLSTM"): 129 | self.left_lstm_cell = rnn_cell.MultiRNNCell([self.left_lstm_cell] * config.num_layers) 130 | with tf.variable_scope("RightLSTM"): 131 | self.right_lstm_cell = rnn_cell.MultiRNNCell([self.right_lstm_cell] * config.num_layers) 132 | 133 | # output = tf.reshape(tf.concat(1, outputs), [-1, size]) 134 | # # Need a simple network on top for the similarity 135 | # logits = tf.nn.xw_plus_b(output, 136 | # tf.get_variable("softmax_w", [size, vocab_size]), 137 | # tf.get_variable("softmax_b", [vocab_size])) 138 | # # TODO: replace this with softmax 139 | # loss = seq2seq.sequence_loss_by_example([logits], 140 | # [tf.reshape(self._targets, [-1])], 141 | # [tf.ones([batch_size * num_steps])], 142 | # vocab_size) 143 | # self._cost = cost = tf.reduce_sum(loss) / batch_size 144 | # self._final_state = states[-1] 145 | # 146 | # if not is_training: 147 | # return 148 | # 149 | # self._lr = tf.Variable(0.0, trainable=False) 150 | # tvars = tf.trainable_variables() 151 | # grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 152 | # config.max_grad_norm) 153 | # optimizer = tf.train.GradientDescentOptimizer(self.lr) 154 | # self._train_op = optimizer.apply_gradients(zip(grads, tvars)) 155 | 156 | def assign_lr(self, session, lr_value): 157 | session.run(tf.assign(self.lr, lr_value)) 158 | 159 | @property 160 | def input_data(self): 161 | return self._input_data 162 | 163 | @property 164 | def targets(self): 165 | return self._targets 166 | 167 | @property 168 | def initial_state(self): 169 | return self._initial_state 170 | 171 | @property 172 | def cost(self): 173 | return self._cost 174 | 175 | @property 176 | def final_state(self): 177 | return self._final_state 178 | 179 | @property 180 | def lr(self): 181 | return self._lr 182 | 183 | @property 184 | def train_op(self): 185 | return self._train_op 186 | --------------------------------------------------------------------------------
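
For reference, a minimal end-to-end sketch of how these pieces are meant to fit together, mirroring relatedness_test.py. It assumes the small glove/glove.test.subset.txt file used by the tests is present (the full vectors come from download.py), and it only exercises the forward pass, since the loss and similarity layers are still TODOs in relatedness.py:

import tensorflow as tf

from data_utils import load_pretrained_glove_vectors
from model_config import SmallConfig
from relatedness import RelatednessModel

# Load a small GloVe subset and its word-to-index vocabulary (mirrors the unit tests).
glove_vectors, vocabulary = load_pretrained_glove_vectors('glove/glove.test.subset.txt')

# Build the model with is_training=False and the small configuration.
model = RelatednessModel(False, glove_vectors, vocabulary, SmallConfig())

# Run both LSTMs over a toy sentence pair; "," is known to be in the test subset,
# and unknown words are mapped to zero vectors by convert_word_to_glove.
sentence = ", , , ,"
with tf.Graph().as_default(), tf.Session() as session:
    model.process_sentence_pair(sentence, sentence, session)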