├── .gitignore
├── README.md
├── attend.py
├── compare.py
├── embed.py
├── encode.py
├── propogate.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# quora-duplicate-questions
Detect duplicate questions on Quora.

**embed.py**
* embeddings from sparse encodings
* word embeddings via lstm encoding of character sequences
* word embeddings via dense layer + max pooling over character sequences
* word embeddings via convolution + max pooling over character sequences

**propogate.py**
* lstm layer
* bidirectional lstm layer
* time distributed dense layer
* temporal convolution layer
* dense layer

**attend.py**
* multiplicative attention
* additive attention
* concat attention
* dot attention
* cosine attention
* softmax attentive matching
* maxpool attentive matching
* argmax attentive matching

**encode.py**
* lstm encoder
* bidirectional lstm encoder
* max pooling encoder
* sum pooling encoder
* mean pooling encoder

**compare.py**
* cosine
* euclidian
* manhattan
* dot
* dense (learnable distance function)
* mahalanobis (with learnable covariance matrix) (TODO)
--------------------------------------------------------------------------------
/attend.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from propogate import time_distributed_dense_layer
from utils import shape


def multiplicative_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150,
                             scope='multiplicative-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(W*a_i, W*b_j). W is a learnable matrix. The rows of attn are
    softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        aW = time_distributed_dense_layer(a, hidden_units, bias=False, scope='dense', reuse=False)
        bW = time_distributed_dense_layer(b, hidden_units, bias=False, scope='dense', reuse=True)
        logits = tf.matmul(aW, tf.transpose(bW, (0, 2, 1)))
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def additive_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150,
                       scope='additive-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*a_i + W*b_j)). v is a learnable vector and W is a learnable
    matrix. The rows of attn are softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        aW = time_distributed_dense_layer(a, hidden_units, bias=False, scope='dense', reuse=False)
        bW = time_distributed_dense_layer(b, hidden_units, bias=False, scope='dense', reuse=True)
        aW = tf.expand_dims(aW, 2)
        bW = tf.expand_dims(bW, 1)
        v = tf.get_variable(
            name='dot_weights',
            initializer=tf.variance_scaling_initializer(),
            shape=[hidden_units]
        )
        logits = tf.einsum('ijkl,l->ijk', tf.nn.tanh(aW + bW), v)
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def concat_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150,
                     scope='concat-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*[a_i; b_j])). v is a learnable vector and W is a learnable
    matrix. The rows of attn are softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        a = tf.expand_dims(a, 2)
        b = tf.expand_dims(b, 1)
        c = tf.concat([a, b], axis=3)
        W = tf.get_variable(
            name='matmul_weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(c, -1), hidden_units]
        )
        cW = tf.einsum('ijkl,lm->ijkm', c, W)
        v = tf.get_variable(
            name='dot_weights',
            initializer=tf.ones_initializer(),
            shape=[hidden_units]
        )
        logits = tf.einsum('ijkl,l->ijk', tf.nn.tanh(cW), v)
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def dot_attention(a, b, a_lengths, b_lengths, max_seq_len):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(a_i, b_j). The rows of attn are softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    logits = tf.matmul(a, tf.transpose(b, (0, 2, 1)))
    logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
    attn = tf.exp(logits)
    attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
    return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def cosine_attention(a, b, a_lengths, b_lengths, max_seq_len):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(a_i, b_j) / (l2_norm(a_i)*l2_norm(b_j)). The rows of attn are softmax
    normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    a_norm = tf.nn.l2_normalize(a, dim=2)
    b_norm = tf.nn.l2_normalize(b, dim=2)
    logits = tf.matmul(a_norm, tf.transpose(b_norm, (0, 2, 1)))
    logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
    attn = tf.exp(logits)
    attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
    return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def mask_attention_weights(weights, a_lengths, b_lengths, max_seq_len):
    """
    Masks an attention matrix for sequences a and b of lengths a_lengths and b_lengths so that
    each max_seq_len by max_seq_len attention matrix contains zeros outside of the
    a_lengths by b_lengths submatrix in its top left corner.

    Args:
        weights: Tensor of shape [batch_size, max_seq_len, max_seq_len].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.

    Returns:
        Masked attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    a_mask = tf.expand_dims(tf.sequence_mask(a_lengths, maxlen=max_seq_len), 2)
    b_mask = tf.expand_dims(tf.sequence_mask(b_lengths, maxlen=max_seq_len), 1)
    seq_mask = tf.cast(tf.matmul(tf.cast(a_mask, tf.int32), tf.cast(b_mask, tf.int32)), tf.bool)
    return tf.where(seq_mask, weights, tf.zeros_like(weights))


def softmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                               attention_func_kwargs={}):
    """
    Matches each vector in a with a weighted sum of the vectors in b. The weighted sum is determined
    by the attention matrix. The attention matrix is computed using attention_func.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        attention_func: Function used to calculate attention matrix. Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.

    """
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **attention_func_kwargs)
    return tf.matmul(attn, b)


def maxpool_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                               attention_func_kwargs={}):
    """
    Matches each vector in a with a vector created by maxpooling over the weighted vectors in b.
    The weightings are determined by the attention matrix. The attention matrix is
    computed using attention_func.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        attention_func: Function used to calculate attention matrix. Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.

    """
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **attention_func_kwargs)
    return tf.reduce_max(tf.einsum('ijk,ikl->ijkl', attn, b), axis=2)


def argmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                              attention_func_kwargs={}):
    """
    Matches each vector in a with the vector in b that receives the largest attention weight.
    The attention matrix is computed using attention_func.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        attention_func: Function used to calculate attention matrix. Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.

    """
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **attention_func_kwargs)
    b_match_idx = tf.argmax(attn, axis=2)
    batch_index = tf.tile(tf.expand_dims(tf.range(shape(b, 0), dtype=tf.int64), 1), (1, max_seq_len))
    b_idx = tf.stack([batch_index, b_match_idx], axis=2)
    return tf.gather_nd(b, b_idx)
--------------------------------------------------------------------------------
/compare.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from utils import shape


def cosine(a_enc, b_enc):
    """
    Compare the encoded representations a_enc and b_enc via cosine similarity.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].

    Returns:
        Tensor of shape [batch_size].

    """
    a_norm = tf.nn.l2_normalize(a_enc, dim=1)
    b_norm = tf.nn.l2_normalize(b_enc, dim=1)
    return tf.reduce_sum(a_norm*b_norm, axis=1)


def euclidian(a_enc, b_enc):
    """
    Compare the encoded representations a_enc and b_enc via euclidean distance.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].

    Returns:
        Tensor of shape [batch_size].

    """
    return tf.sqrt(tf.reduce_sum(tf.square(a_enc - b_enc), axis=1))


def manhattan(a_enc, b_enc):
    """
    Compare the encoded representations a_enc and b_enc via manhattan distance.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].

    Returns:
        Tensor of shape [batch_size].

    """
    return tf.reduce_sum(tf.abs(a_enc - b_enc), axis=1)


def dot(a_enc, b_enc):
    """
    Compare the encoded representations a_enc and b_enc via dot product.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].

    Returns:
        Tensor of shape [batch_size].

    """
    return tf.reduce_sum(a_enc*b_enc, axis=1)


def dense(a_enc, b_enc, bias=True, activation=None, dropout=None, scope='dense', reuse=False):
    """
    Compare the encoded representations a_enc and b_enc using a learnable parameterized
    function in the form of a dense layer applied to the concatenation of a_enc and b_enc.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].
        activation: Activation function.
        dropout: Dropout keep prob. Float.

    Returns:
        Tensor of shape [batch_size].

    """
    with tf.variable_scope(scope, reuse=reuse):
        inputs = tf.concat([a_enc, b_enc], axis=1)
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), 1]
        )
        z = tf.matmul(inputs, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[1]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return tf.squeeze(z)
--------------------------------------------------------------------------------
/embed.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from encode import lstm_encoder
from propogate import temporal_convolution_layer
from utils import shape


def embedding_from_sparse_encodings(encodings, shape, embedding_matrix=None, scope='gather-embed',
                                    reuse=False):
    """
    Gathers embedding vectors corresponding to values in encodings. If embedding_matrix is passed,
    then it will be used to initialize the embedding matrix. Otherwise, the matrix will be
    initialized with random embeddings.

    Args:
        encodings: Tensor of shape [batch_size, sequence length].
        shape: Shape of 2D parameter matrix. The first dimension should contain
            the vocabulary size and the second dimension should be the size
            of the embedding dimension.
        embedding_matrix: numpy array of the embedding matrix.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, sequence length, shape[1]].

    """
    with tf.variable_scope(scope, reuse=reuse):
        # embedding_matrix is a numpy array, so it can't be used in a boolean expression;
        # choose the initializer explicitly instead.
        if embedding_matrix is not None:
            initializer = tf.constant_initializer(embedding_matrix)
        else:
            initializer = tf.contrib.layers.variance_scaling_initializer()
        W = tf.get_variable(
            name='weights',
            initializer=initializer,
            shape=shape
        )
        embeddings = tf.nn.embedding_lookup(W, encodings)
        return embeddings


def dense_word_embedding_from_chars(chars, embed_dim, bias=True, scope='dense-word-embed', reuse=False):
    """
    Word embeddings via dense transformation + maxpooling of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
        embed_dim: Dimension of word embeddings. Integer.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    with tf.variable_scope(scope, reuse=reuse):
        chars = tf.cast(chars, tf.float32)
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(chars, -1), embed_dim]
        )
        z = tf.einsum('ijkl,lm->ijkm', chars, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[embed_dim]
            )
            z = z + b
        dense_word_embedding = tf.reduce_max(z, 2)
        return dense_word_embedding


def lstm_word_embedding_from_chars(chars, lengths, embed_dim, scope='lstm-word-embed', reuse=False):
    """
    Word embeddings via LSTM encoding of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, num characters].
        lengths: Tensor of shape [batch_size, word_sequence length].
        embed_dim: Dimension of word embeddings. Integer.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    chars = tf.cast(chars, tf.float32)

    # this is super inefficient
    chars = tf.unstack(chars, axis=0)
    lengths = tf.unstack(lengths, axis=0)

    lstm_word_embeddings = []
    for i, (char, length) in enumerate(zip(chars, lengths)):
        temp_reuse = i != 0 or reuse
        embedding = lstm_encoder(char, length, embed_dim, 1.0, scope=scope, reuse=temp_reuse)
        lstm_word_embeddings.append(embedding)
    lstm_word_embeddings = tf.stack(lstm_word_embeddings, axis=0)

    return lstm_word_embeddings


def convolutional_word_embedding_from_chars(chars, embed_dim, convolution_width, bias=True,
                                            scope='conv-word-embed', reuse=False):
    """
    Word embeddings via convolution + maxpooling of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
        embed_dim: Dimension of word embeddings. Integer.
        convolution_width: Number of characters used in the convolution. Integer.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    chars = tf.cast(chars, tf.float32)

    # this is super inefficient
    chars = tf.unstack(chars, axis=0)

    conv_word_embeddings = []
    for i, char in enumerate(chars):
        temp_reuse = i != 0 or reuse
        conv = temporal_convolution_layer(
            char, embed_dim, convolution_width, scope=scope, reuse=temp_reuse)
        embedding = tf.reduce_max(conv, axis=1)
        conv_word_embeddings.append(embedding)
    conv_word_embeddings = tf.stack(conv_word_embeddings, axis=0)

    return conv_word_embeddings
--------------------------------------------------------------------------------
/encode.py:
--------------------------------------------------------------------------------
import tensorflow as tf


def lstm_encoder(inputs, lengths, state_size, keep_prob, scope='lstm-encoder', reuse=False):
    """
    LSTM encoder

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].
        lengths: Lengths of sequences in inputs. Tensor of shape [batch_size].
        state_size: LSTM state size.
        keep_prob: 1 - p, where p is the dropout probability.

    Returns:
        Tensor of shape [batch_size, state size] containing the final h states.

    """
    with tf.variable_scope(scope, reuse=reuse):
        cell_fw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        outputs, output_state = tf.nn.dynamic_rnn(
            inputs=inputs,
            cell=cell_fw,
            sequence_length=lengths,
            dtype=tf.float32
        )
        return output_state.h


def bidirectional_lstm_encoder(inputs, lengths, state_size, keep_prob, scope='bi-lstm-encoder', reuse=False):
    """
    Bidirectional LSTM encoder

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].
        lengths: Lengths of sequences in inputs. Tensor of shape [batch_size].
        state_size: LSTM state size.
        keep_prob: 1 - p, where p is the dropout probability.

    Returns:
        Tensor of shape [batch_size, 2*state size] containing the concatenated
        forward and backward lstm final h states.

    """
    with tf.variable_scope(scope, reuse=reuse):
        cell_fw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        cell_bw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        outputs, (output_fw, output_bw) = tf.nn.bidirectional_dynamic_rnn(
            inputs=inputs,
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            sequence_length=lengths,
            dtype=tf.float32
        )
        outputs = tf.concat(outputs, 2)
        output_state = tf.concat([output_fw.h, output_bw.h], axis=1)
        return output_state


def reduce_max_encoder(inputs):
    """
    Max pooling over the time dimension

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].

    Returns:
        Tensor of shape [batch_size, input_size].
    """
    return tf.reduce_max(inputs, axis=1)


def reduce_sum_encoder(inputs):
    """
    Sum pooling over the time dimension

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].

    Returns:
        Tensor of shape [batch_size, input_size].
    """
    return tf.reduce_sum(inputs, axis=1)


def reduce_mean_encoder(inputs, lengths):
    """
    Mean pooling over the time dimension

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].
        lengths: Lengths of sequences in inputs. Tensor of shape [batch_size].

    Returns:
        Tensor of shape [batch_size, input_size].
    """
    lengths = tf.cast(tf.expand_dims(lengths, 1), tf.float32)
    return tf.reduce_sum(inputs, axis=1) / lengths
--------------------------------------------------------------------------------
/propogate.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from utils import shape


def lstm_layer(inputs, lengths, state_size, keep_prob, scope='lstm-layer', reuse=False):
    """
    LSTM layer.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, ...].
        lengths: Tensor of shape [batch size].
        state_size: LSTM state size.
        keep_prob: 1 - p, where p is the dropout probability.

    Returns:
        Tensor of shape [batch size, max sequence length, state_size] containing the lstm
        outputs at each timestep.

    """
    with tf.variable_scope(scope, reuse=reuse):
        cell_fw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        outputs, output_state = tf.nn.dynamic_rnn(
            inputs=inputs,
            cell=cell_fw,
            sequence_length=lengths,
            dtype=tf.float32
        )
        return outputs


def bidirectional_lstm_layer(inputs, lengths, state_size, keep_prob, scope='bi-lstm-layer', reuse=False):
    """
    Bidirectional LSTM layer.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, ...].
        lengths: Tensor of shape [batch size].
        state_size: LSTM state size.
        keep_prob: 1 - p, where p is the dropout probability.

    Returns:
        Tensor of shape [batch size, max sequence length, 2*state_size] containing the concatenated
        forward and backward lstm outputs at each timestep.

    """
    with tf.variable_scope(scope, reuse=reuse):
        cell_fw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        cell_bw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        outputs, (output_fw, output_bw) = tf.nn.bidirectional_dynamic_rnn(
            inputs=inputs,
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            sequence_length=lengths,
            dtype=tf.float32
        )
        outputs = tf.concat(outputs, 2)
        return outputs


def time_distributed_dense_layer(inputs, output_units, bias=True, activation=None, dropout=None,
                                 scope='time-distributed-dense-layer', reuse=False):
    """
    Applies a shared dense layer to each timestep of a tensor of shape [batch_size, max_seq_len, input_units]
    to produce a tensor of shape [batch_size, max_seq_len, output_units].

    Args:
        inputs: Tensor of shape [batch size, max sequence length, ...].
        output_units: Number of output units.
        activation: activation function.
        dropout: dropout keep prob.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units]
        )
        z = tf.einsum('ijk,kl->ijl', inputs, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z


def temporal_convolution_layer(inputs, output_units, convolution_width, bias=True, activation=None,
                               dropout=None, scope='time-distributed-conv-layer', reuse=False):
    """
    Convolution over the temporal axis of sequence data.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, input_units].
        output_units: Output channels for convolution.
        convolution_width: Number of timesteps (words) to use in convolution.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[convolution_width, shape(inputs, 2), output_units]
        )

        z = tf.nn.convolution(inputs, W, padding='SAME', strides=[1])
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z


def dense_layer(inputs, output_units, bias=True, activation=None, dropout=None, scope='dense-layer',
                reuse=False):
    """
    Applies a dense layer to a 2D tensor of shape [batch_size, input_units]
    to produce a tensor of shape [batch_size, output_units].

    Args:
        inputs: Tensor of shape [batch size, input_units].
        output_units: Number of output units.
        activation: activation function.
        dropout: dropout keep prob.

    Returns:
        Tensor of shape [batch size, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units]
        )
        z = tf.matmul(inputs, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
def rank(tensor):
    """Get tensor rank as a python int"""
    return len(tensor.shape.as_list())


def shape(tensor, dim=None):
    """Get tensor shape/dimension as list/int"""
    if dim is None:
        return tensor.shape.as_list()
    return tensor.shape.as_list()[dim]
--------------------------------------------------------------------------------
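
Usage sketch (not part of the repository): a minimal, untested example of how these modules might be composed for the duplicate-question task, using shared word embeddings, a shared bidirectional LSTM, attentive matching in both directions, max pooling, and a distance-based comparison. The placeholder names, vocabulary size, embedding dimension, and hyperparameters below are illustrative assumptions.

import tensorflow as tf

from embed import embedding_from_sparse_encodings
from propogate import bidirectional_lstm_layer
from attend import softmax_attentive_matching, dot_attention
from encode import reduce_max_encoder
from compare import manhattan

max_seq_len, vocab_size, embed_dim = 40, 50000, 300  # assumed values

q1 = tf.placeholder(tf.int32, [None, max_seq_len])   # word ids for question 1
q2 = tf.placeholder(tf.int32, [None, max_seq_len])   # word ids for question 2
q1_len = tf.placeholder(tf.int32, [None])
q2_len = tf.placeholder(tf.int32, [None])

# shared word embeddings for both questions
e1 = embedding_from_sparse_encodings(q1, [vocab_size, embed_dim], scope='embed')
e2 = embedding_from_sparse_encodings(q2, [vocab_size, embed_dim], scope='embed', reuse=True)

# contextualize each question with a shared bidirectional LSTM
h1 = bidirectional_lstm_layer(e1, q1_len, state_size=128, keep_prob=1.0, scope='bilstm')
h2 = bidirectional_lstm_layer(e2, q2_len, state_size=128, keep_prob=1.0, scope='bilstm', reuse=True)

# attentive matching in both directions, then max-pool over time and compare
m1 = softmax_attentive_matching(h1, h2, q1_len, q2_len, max_seq_len, attention_func=dot_attention)
m2 = softmax_attentive_matching(h2, h1, q2_len, q1_len, max_seq_len, attention_func=dot_attention)

v1 = reduce_max_encoder(tf.concat([h1, m1], axis=2))
v2 = reduce_max_encoder(tf.concat([h2, m2], axis=2))

distance = manhattan(v1, v2)  # smaller distance suggests the questions are duplicates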