├── .gitignore
├── README.md
├── attend.py
├── compare.py
├── embed.py
├── encode.py
├── propogate.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# quora-duplicate-questions
Detect duplicate questions on Quora.

**embed.py**
* embeddings from sparse encodings
* word embeddings via lstm encoding of character sequences
* word embeddings via dense layer + max pooling over character sequences
* word embeddings via convolution + max pooling over character sequences

**propogate.py**
* lstm layer
* bidirectional lstm layer
* time distributed dense layer
* temporal convolution layer
* dense layer

**attend.py**
* multiplicative attention
* additive attention
* concat attention
* dot attention
* cosine attention
* softmax attentive matching
* maxpool attentive matching
* argmax attentive matching

**encode.py**
* lstm encoder
* bidirectional lstm encoder
* max pooling encoder
* sum pooling encoder
* mean pooling encoder

**compare.py**
* cosine
* euclidian
* manhattan
* dot
* dense (learnable distance function)
* mahalanobis (with learnable covariance matrix) (TODO)
--------------------------------------------------------------------------------
/attend.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from propogate import time_distributed_dense_layer
from utils import shape


def multiplicative_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150,
                             scope='multiplicative-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(W*a_i, W*b_j). W is a learnable matrix. The rows of attn are
    softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        aW = time_distributed_dense_layer(a, hidden_units, bias=False, scope='dense', reuse=False)
        bW = time_distributed_dense_layer(b, hidden_units, bias=False, scope='dense', reuse=True)
        logits = tf.matmul(aW, tf.transpose(bW, (0, 2, 1)))
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def additive_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150,
                       scope='additive-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*a_i + W*b_j)). v is a learnable vector and W is a learnable
    matrix. The rows of attn are softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        aW = time_distributed_dense_layer(a, hidden_units, bias=False, scope='dense', reuse=False)
        bW = time_distributed_dense_layer(b, hidden_units, bias=False, scope='dense', reuse=True)
        aW = tf.expand_dims(aW, 2)
        bW = tf.expand_dims(bW, 1)
        v = tf.get_variable(
            name='dot_weights',
            initializer=tf.variance_scaling_initializer(),
            shape=[hidden_units]
        )
        logits = tf.einsum('ijkl,l->ijk', tf.nn.tanh(aW + bW), v)
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def concat_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150,
                     scope='concat-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*[a_i; b_j])). v is a learnable vector and W is a learnable
    matrix. The rows of attn are softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        hidden_units: Number of hidden units. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        a = tf.expand_dims(a, 2)
        b = tf.expand_dims(b, 1)
        c = tf.concat([a, b], axis=3)
        W = tf.get_variable(
            name='matmul_weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(c, -1), hidden_units]
        )
        cW = tf.einsum('ijkl,lm->ijkm', c, W)
        v = tf.get_variable(
            name='dot_weights',
            initializer=tf.ones_initializer(),
            shape=[hidden_units]
        )
        logits = tf.einsum('ijkl,l->ijk', tf.nn.tanh(cW), v)
        logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
        attn = tf.exp(logits)
        attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
        return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def dot_attention(a, b, a_lengths, b_lengths, max_seq_len):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(a_i, b_j). The rows of attn are softmax normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    logits = tf.matmul(a, tf.transpose(b, (0, 2, 1)))
    logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
    attn = tf.exp(logits)
    attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
    return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def cosine_attention(a, b, a_lengths, b_lengths, max_seq_len):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(a_i, b_j) / (l2_norm(a_i)*l2_norm(b_j)). The rows of attn are softmax
    normalized.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.

    Returns:
        Attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    a_norm = tf.nn.l2_normalize(a, dim=2)
    b_norm = tf.nn.l2_normalize(b, dim=2)
    logits = tf.matmul(a_norm, tf.transpose(b_norm, (0, 2, 1)))
    logits = logits - tf.expand_dims(tf.reduce_max(logits, axis=2), 2)
    attn = tf.exp(logits)
    attn = mask_attention_weights(attn, a_lengths, b_lengths, max_seq_len)
    return attn / tf.expand_dims(tf.reduce_sum(attn, axis=2) + 1e-10, 2)


def mask_attention_weights(weights, a_lengths, b_lengths, max_seq_len):
    """
    Masks an attention matrix for sequences a and b of lengths a_lengths and b_lengths so that
    each max_seq_len by max_seq_len attention matrix contains zeros outside of the
    a_lengths by b_lengths submatrix in its top left corner.

    Args:
        weights: Tensor of shape [batch_size, max_seq_len, max_seq_len].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.

    Returns:
        Masked attention matrix. Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    a_mask = tf.expand_dims(tf.sequence_mask(a_lengths, maxlen=max_seq_len), 2)
    b_mask = tf.expand_dims(tf.sequence_mask(b_lengths, maxlen=max_seq_len), 1)
    seq_mask = tf.cast(tf.matmul(tf.cast(a_mask, tf.int32), tf.cast(b_mask, tf.int32)), tf.bool)
    return tf.where(seq_mask, weights, tf.zeros_like(weights))


def softmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                               attention_func_kwargs={}):
    """
    Matches each vector in a with a weighted sum of the vectors in b. The weighted sum is determined
    by the attention matrix. The attention matrix is computed using attention_func.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        attention_func: Function used to calculate attention matrix. Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.

    """
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **attention_func_kwargs)
    return tf.matmul(attn, b)


def maxpool_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                               attention_func_kwargs={}):
    """
    Matches each vector in a with a vector created by maxpooling over the weighted vectors in b.
    The weightings are determined by the attention matrix. The attention matrix is
    computed using attention_func.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        attention_func: Function used to calculate attention matrix. Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.

    """
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **attention_func_kwargs)
    return tf.reduce_max(tf.einsum('ijk,ikl->ijkl', attn, b), axis=2)


def argmax_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention,
                              attention_func_kwargs={}):
    """
    Matches each vector in a with the vector in b that receives the largest attention weight.
    The attention matrix is computed using attention_func.

    Args:
        a: Input sequence a. Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b. Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a. Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b. Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b. Integer.
        attention_func: Function used to calculate attention matrix. Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Keyword arguments to pass to attention_func.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.

    """
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **attention_func_kwargs)
    b_match_idx = tf.argmax(attn, axis=2)
    batch_index = tf.tile(tf.expand_dims(tf.range(shape(b, 0), dtype=tf.int64), 1), (1, max_seq_len))
    b_idx = tf.stack([batch_index, b_match_idx], axis=2)
    return tf.gather_nd(b, b_idx)
--------------------------------------------------------------------------------
/compare.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from utils import shape


def cosine(a_enc, b_enc):
    """
    Compare the encoded representations a_enc and b_enc via cosine similarity.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].

    Returns:
        Tensor of shape [batch_size].

    """
    a_norm = tf.nn.l2_normalize(a_enc, dim=1)
    b_norm = tf.nn.l2_normalize(b_enc, dim=1)
    return tf.reduce_sum(a_norm*b_norm, axis=1)


def euclidian(a_enc, b_enc):
    """
    Compare the encoded representations a_enc and b_enc via euclidean distance.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].

    Returns:
        Tensor of shape [batch_size].

    """
    return tf.sqrt(tf.reduce_sum(tf.square(a_enc - b_enc), axis=1))


def manhattan(a_enc, b_enc):
    """
    Compare the encoded representations a_enc and b_enc via manhattan distance.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].

    Returns:
        Tensor of shape [batch_size].

    """
    return tf.reduce_sum(tf.abs(a_enc - b_enc), axis=1)


def dot(a_enc, b_enc):
    """
    Compare the encoded representations a_enc and b_enc via dot product.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].

    Returns:
        Tensor of shape [batch_size].

    """
    return tf.reduce_sum(a_enc*b_enc, axis=1)


def dense(a_enc, b_enc, bias=True, activation=None, dropout=None, scope='dense', reuse=False):
    """
    Compare the encoded representations a_enc and b_enc using a learnable parameterized
    function in the form of a dense layer applied to the concatenation of a_enc and b_enc.

    Args:
        a_enc: Encoded representation of sequence a. Tensor of shape [batch_size, input_units].
        b_enc: Encoded representation of sequence b. Tensor of shape [batch_size, input_units].
        activation: Activation function.
        dropout: Dropout keep prob. Float.

    Returns:
        Tensor of shape [batch_size].

    """
    with tf.variable_scope(scope, reuse=reuse):
        inputs = tf.concat([a_enc, b_enc], axis=1)
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), 1]
        )
        z = tf.matmul(inputs, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[1]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return tf.squeeze(z)
--------------------------------------------------------------------------------
/embed.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from encode import lstm_encoder
from propogate import temporal_convolution_layer
from utils import shape


def embedding_from_sparse_encodings(encodings, shape, embedding_matrix=None, scope='gather-embed',
                                    reuse=False):
    """
    Gathers embedding vectors corresponding to values in encodings. If embedding_matrix is passed,
    then it will be used to initialize the embedding matrix. Otherwise, the matrix will be
    initialized with random embeddings.

    Args:
        encodings: Tensor of shape [batch_size, sequence length].
        shape: Shape of 2D parameter matrix. The first dimension should contain
            the vocabulary size and the second dimension should be the size
            of the embedding dimension.
        embedding_matrix: numpy array of the embedding matrix.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, sequence length, shape[1]].

    """
    with tf.variable_scope(scope, reuse=reuse):
        # embedding_matrix is a numpy array, so it can't be used in a boolean expression;
        # choose the initializer explicitly instead.
        if embedding_matrix is not None:
            initializer = tf.constant_initializer(embedding_matrix)
        else:
            initializer = tf.contrib.layers.variance_scaling_initializer()
        W = tf.get_variable(
            name='weights',
            initializer=initializer,
            shape=shape
        )
        embeddings = tf.nn.embedding_lookup(W, encodings)
        return embeddings


def dense_word_embedding_from_chars(chars, embed_dim, bias=True, scope='dense-word-embed', reuse=False):
    """
    Word embeddings via dense transformation + maxpooling of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
        embed_dim: Dimension of word embeddings. Integer.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    with tf.variable_scope(scope, reuse=reuse):
        chars = tf.cast(chars, tf.float32)
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(chars, -1), embed_dim]
        )
        z = tf.einsum('ijkl,lm->ijkm', chars, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[embed_dim]
            )
            z = z + b
        dense_word_embedding = tf.reduce_max(z, 2)
        return dense_word_embedding


def lstm_word_embedding_from_chars(chars, lengths, embed_dim, scope='lstm-word-embed', reuse=False):
    """
    Word embeddings via LSTM encoding of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, num characters].
        lengths: Tensor of shape [batch_size, word_sequence length].
        embed_dim: Dimension of word embeddings. Integer.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    chars = tf.cast(chars, tf.float32)

    # this is super inefficient
    chars = tf.unstack(chars, axis=0)
    lengths = tf.unstack(lengths, axis=0)

    lstm_word_embeddings = []
    for i, (char, length) in enumerate(zip(chars, lengths)):
        temp_reuse = i != 0 or reuse
        embedding = lstm_encoder(char, length, embed_dim, 1.0, scope=scope, reuse=temp_reuse)
        lstm_word_embeddings.append(embedding)
    lstm_word_embeddings = tf.stack(lstm_word_embeddings, axis=0)

    return lstm_word_embeddings


def convolutional_word_embedding_from_chars(chars, embed_dim, convolution_width, bias=True,
                                            scope='conv-word-embed', reuse=False):
    """
    Word embeddings via convolution + maxpooling of character sequences.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
        embed_dim: Dimension of word embeddings. Integer.
        convolution_width: Number of characters used in the convolution. Integer.

    Returns:
        Sequence of embedding vectors. Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    chars = tf.cast(chars, tf.float32)

    # this is super inefficient
    chars = tf.unstack(chars, axis=0)

    conv_word_embeddings = []
    for i, char in enumerate(chars):
        temp_reuse = i != 0 or reuse
        conv = temporal_convolution_layer(
            char, embed_dim, convolution_width, scope=scope, reuse=temp_reuse)
        embedding = tf.reduce_max(conv, axis=1)
        conv_word_embeddings.append(embedding)
    conv_word_embeddings = tf.stack(conv_word_embeddings, axis=0)

    return conv_word_embeddings
--------------------------------------------------------------------------------
/encode.py:
--------------------------------------------------------------------------------
import tensorflow as tf


def lstm_encoder(inputs, lengths, state_size, keep_prob, scope='lstm-encoder', reuse=False):
    """
    LSTM encoder

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].
        lengths: Lengths of sequences in inputs. Tensor of shape [batch_size].
        state_size: LSTM state size.
        keep_prob: 1 - p, where p is the dropout probability.

    Returns:
        Tensor of shape [batch_size, state size] containing the final h states.

    """
    with tf.variable_scope(scope, reuse=reuse):
        cell_fw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        outputs, output_state = tf.nn.dynamic_rnn(
            inputs=inputs,
            cell=cell_fw,
            sequence_length=lengths,
            dtype=tf.float32
        )
        return output_state.h


def bidirectional_lstm_encoder(inputs, lengths, state_size, keep_prob, scope='bi-lstm-encoder', reuse=False):
    """
    Bidirectional LSTM encoder

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].
        lengths: Lengths of sequences in inputs. Tensor of shape [batch_size].
        state_size: LSTM state size.
        keep_prob: 1 - p, where p is the dropout probability.

    Returns:
        Tensor of shape [batch_size, 2*state size] containing the concatenated
        forward and backward lstm final h states.

    """
    with tf.variable_scope(scope, reuse=reuse):
        cell_fw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        cell_bw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        outputs, (output_fw, output_bw) = tf.nn.bidirectional_dynamic_rnn(
            inputs=inputs,
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            sequence_length=lengths,
            dtype=tf.float32
        )
        outputs = tf.concat(outputs, 2)
        output_state = tf.concat([output_fw.h, output_bw.h], axis=1)
        return output_state


def reduce_max_encoder(inputs):
    """
    Max pooling over the time dimension

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].

    Returns:
        Tensor of shape [batch_size, input_size].
    """
    return tf.reduce_max(inputs, axis=1)


def reduce_sum_encoder(inputs):
    """
    Sum pooling over the time dimension

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].

    Returns:
        Tensor of shape [batch_size, input_size].
    """
    return tf.reduce_sum(inputs, axis=1)


def reduce_mean_encoder(inputs, lengths):
    """
    Mean pooling over the time dimension

    Args:
        inputs: Sequence data. Tensor of shape [batch_size, max_seq_len, input_size].
        lengths: Lengths of sequences in inputs. Tensor of shape [batch_size].

    Returns:
        Tensor of shape [batch_size, input_size].
    """
    lengths = tf.cast(tf.expand_dims(lengths, 1), tf.float32)
    return tf.reduce_sum(inputs, axis=1) / lengths
--------------------------------------------------------------------------------
/propogate.py:
--------------------------------------------------------------------------------
import tensorflow as tf

from utils import shape


def lstm_layer(inputs, lengths, state_size, keep_prob, scope='lstm-layer', reuse=False):
    """
    LSTM layer.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, ...].
        lengths: Tensor of shape [batch size].
        state_size: LSTM state size.
        keep_prob: 1 - p, where p is the dropout probability.

    Returns:
        Tensor of shape [batch size, max sequence length, state_size] containing the lstm
        outputs at each timestep.

    """
    with tf.variable_scope(scope, reuse=reuse):
        cell_fw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        outputs, output_state = tf.nn.dynamic_rnn(
            inputs=inputs,
            cell=cell_fw,
            sequence_length=lengths,
            dtype=tf.float32
        )
        return outputs


def bidirectional_lstm_layer(inputs, lengths, state_size, keep_prob, scope='bi-lstm-layer', reuse=False):
    """
    Bidirectional LSTM layer.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, ...].
        lengths: Tensor of shape [batch size].
        state_size: LSTM state size.
        keep_prob: 1 - p, where p is the dropout probability.

    Returns:
        Tensor of shape [batch size, max sequence length, 2*state_size] containing the concatenated
        forward and backward lstm outputs at each timestep.

    """
    with tf.variable_scope(scope, reuse=reuse):
        cell_fw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        cell_bw = tf.contrib.rnn.core_rnn_cell.DropoutWrapper(
            tf.contrib.rnn.core_rnn_cell.LSTMCell(
                state_size,
                reuse=reuse
            ),
            output_keep_prob=keep_prob
        )
        outputs, (output_fw, output_bw) = tf.nn.bidirectional_dynamic_rnn(
            inputs=inputs,
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            sequence_length=lengths,
            dtype=tf.float32
        )
        outputs = tf.concat(outputs, 2)
        return outputs


def time_distributed_dense_layer(inputs, output_units, bias=True, activation=None, dropout=None,
                                 scope='time-distributed-dense-layer', reuse=False):
    """
    Applies a shared dense layer to each timestep of a tensor of shape [batch_size, max_seq_len, input_units]
    to produce a tensor of shape [batch_size, max_seq_len, output_units].

    Args:
        inputs: Tensor of shape [batch size, max sequence length, ...].
        output_units: Number of output units.
        activation: activation function.
        dropout: dropout keep prob.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units]
        )
        z = tf.einsum('ijk,kl->ijl', inputs, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z


def temporal_convolution_layer(inputs, output_units, convolution_width, bias=True, activation=None,
                               dropout=None, scope='time-distributed-conv-layer', reuse=False):
    """
    Convolution over the temporal axis of sequence data.

    Args:
        inputs: Tensor of shape [batch size, max sequence length, input_units].
        output_units: Output channels for convolution.
        convolution_width: Number of timesteps (words) to use in convolution.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[convolution_width, shape(inputs, 2), output_units]
        )

        z = tf.nn.convolution(inputs, W, padding='SAME', strides=[1])
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z


def dense_layer(inputs, output_units, bias=True, activation=None, dropout=None, scope='dense-layer',
                reuse=False):
    """
    Applies a dense layer to a 2D tensor of shape [batch_size, input_units]
    to produce a tensor of shape [batch_size, output_units].

    Args:
        inputs: Tensor of shape [batch size, input_units].
        output_units: Number of output units.
        activation: activation function.
        dropout: dropout keep prob.

    Returns:
        Tensor of shape [batch size, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        W = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units]
        )
        z = tf.matmul(inputs, W)
        if bias:
            b = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units]
            )
            z = z + b
        z = activation(z) if activation else z
        z = tf.nn.dropout(z, dropout) if dropout else z
        return z
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
def rank(tensor):
    """Get tensor rank as a python int"""
    return len(tensor.shape.as_list())


def shape(tensor, dim=None):
    """Get tensor shape/dimension as list/int"""
    if dim is None:
        return tensor.shape.as_list()
    return tensor.shape.as_list()[dim]
--------------------------------------------------------------------------------
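
Usage sketch (not part of the repository): a minimal, untested example of how these modules might be composed for the duplicate-question task, using shared word embeddings, a shared bidirectional LSTM, attentive matching in both directions, max pooling, and a distance-based comparison. The placeholder names, vocabulary size, embedding dimension, and hyperparameters below are illustrative assumptions.

import tensorflow as tf

from embed import embedding_from_sparse_encodings
from propogate import bidirectional_lstm_layer
from attend import softmax_attentive_matching, dot_attention
from encode import reduce_max_encoder
from compare import manhattan

max_seq_len, vocab_size, embed_dim = 40, 50000, 300  # assumed values

q1 = tf.placeholder(tf.int32, [None, max_seq_len])   # word ids for question 1
q2 = tf.placeholder(tf.int32, [None, max_seq_len])   # word ids for question 2
q1_len = tf.placeholder(tf.int32, [None])
q2_len = tf.placeholder(tf.int32, [None])

# shared word embeddings for both questions
e1 = embedding_from_sparse_encodings(q1, [vocab_size, embed_dim], scope='embed')
e2 = embedding_from_sparse_encodings(q2, [vocab_size, embed_dim], scope='embed', reuse=True)

# contextualize each question with a shared bidirectional LSTM
h1 = bidirectional_lstm_layer(e1, q1_len, state_size=128, keep_prob=1.0, scope='bilstm')
h2 = bidirectional_lstm_layer(e2, q2_len, state_size=128, keep_prob=1.0, scope='bilstm', reuse=True)

# attentive matching in both directions, then max-pool over time and compare
m1 = softmax_attentive_matching(h1, h2, q1_len, q2_len, max_seq_len, attention_func=dot_attention)
m2 = softmax_attentive_matching(h2, h1, q2_len, q1_len, max_seq_len, attention_func=dot_attention)

v1 = reduce_max_encoder(tf.concat([h1, m1], axis=2))
v2 = reduce_max_encoder(tf.concat([h2, m2], axis=2))

distance = manhattan(v1, v2)  # smaller distance suggests the questions are duplicates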