├── Data └── README.md ├── LICENSE ├── README.md ├── bi_rnn.py ├── crf_defs.py ├── model_config.py ├── model_defs.py ├── model_use.py ├── training.py ├── training_crf.py └── utils.py /Data/README.md: -------------------------------------------------------------------------------- 1 | Put your data there 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #DeepCRF package 2 | 3 | Convolutions and RNN tagging models are implemented and tested 4 | 5 | CRF model is implemented and needs to be tested 6 | -------------------------------------------------------------------------------- /bi_rnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import tensorflow as tf 6 | 7 | from tensorflow.models.rnn import rnn_cell 8 | from tensorflow.python.ops import control_flow_ops 9 | 10 | 11 | def bi_rnn(cell_forward, cell_backward, inputs, initial_state=None, 12 | dtype=None, scope=None, reuse=False): 13 | if not (isinstance(cell_forward, rnn_cell.RNNCell) and 14 | isinstance(cell_backward, rnn_cell.RNNCell)): 15 | raise TypeError("cell must be an instance of RNNCell") 16 | if not isinstance(inputs, list): 17 | raise TypeError("inputs must be a list") 18 | if not inputs: 19 | raise ValueError("inputs must not be empty") 20 | outputs = [] 21 | states = [] 22 | with tf.variable_scope(scope or "RNN"): 23 | batch_size = tf.shape(inputs[0])[0] 24 | outputs_f = [0] * len(inputs) 25 | states_f = [0] * len(inputs) 26 | outputs_b = [0] * len(inputs) 27 | states_b = [0] * len(inputs) 28 | if initial_state is not None: 29 | state_f = initial_state 30 | state_b = initial_state 31 | else: 32 | if not dtype: 33 | raise ValueError("If no initial_state is provided, \ 34 | dtype must be.") 35 | state_f = cell_forward.zero_state(batch_size, dtype) 36 | state_b = cell_backward.zero_state(batch_size, dtype) 37 | for t, input_ in enumerate(inputs): 38 | if reuse or t > 0: 39 | tf.get_variable_scope().reuse_variables() 40 | 
output_f, state_f = cell_forward(inputs[t], state_f, 41 | scope='LSTM_f') 42 | output_b, state_b = cell_backward(inputs[-1 - t], state_b, 43 | scope='LSTM_b') 44 | outputs_f[t] = output_f 45 | outputs_b[-1 - t] = output_b 46 | states_f[t] = state_f 47 | states_b[-1 - t] = state_b 48 | for t in range(len(inputs)): 49 | outputs.append(tf.concat(1, [outputs_f[t], outputs_b[t]])) 50 | states.append(tf.concat(1, [states_f[t], states_b[t]])) 51 | return (outputs, states) 52 | -------------------------------------------------------------------------------- /crf_defs.py: -------------------------------------------------------------------------------- 1 | from model_defs import * 2 | from utils import * 3 | from tensorflow.models.rnn.rnn_cell import * 4 | 5 | ################################### 6 | # Building blocks # 7 | ################################### 8 | 9 | # takes features and outputs potentials 10 | def potentials_layer(in_layer, mask, config, params, reuse=False, name='Potentials'): 11 | batch_size = int(in_layer.get_shape()[0]) 12 | num_steps = int(in_layer.get_shape()[1]) 13 | input_size = int(in_layer.get_shape()[2]) 14 | pot_shape = [config.n_tags] * config.pot_window 15 | out_shape = [batch_size, num_steps] + pot_shape 16 | #~ pot_size = config.n_tags ** config.pot_window 17 | #~ if reuse: 18 | #~ tf.get_variable_scope().reuse_variables() 19 | #~ W_pot = params.W_pot 20 | #~ b_pot = params.b_pot 21 | #~ else: 22 | #~ W_pot = weight_variable([input_size, pot_size], name=name) 23 | #~ b_pot = bias_variable([pot_size], name=name) 24 | #~ flat_input = tf.reshape(in_layer, [-1, input_size]) 25 | #~ pre_scores = tf.matmul(flat_input, W_pot) + b_pot 26 | # BOGUS 27 | W_pot = False 28 | b_pot = False 29 | reshaped_in = tf.reshape(in_layer, [batch_size, num_steps, config.pot_window, -1]) 30 | pre_scores = tf.reduce_sum(reshaped_in, 2) 31 | # /BOGUS 32 | pots_layer = tf.reshape(pre_scores, out_shape) 33 | # define potentials for padding tokens 34 | padding_pot = np.zeros(pot_shape) 35 | num = config.pot_window / 2 36 | idx = [slice(None)] * num + [0] + [slice(None)] * num 37 | padding_pot[idx] += 10000 38 | pad_pot = tf.convert_to_tensor(padding_pot, tf.float32) 39 | pad_pots = tf.expand_dims(tf.expand_dims(pad_pot, 0), 0) 40 | pad_pots = tf.tile(pad_pots, [batch_size, num_steps] + [1] * config.pot_window) 41 | # expand mask 42 | mask_a = mask 43 | for _ in range(config.pot_window): 44 | mask_a = tf.expand_dims(mask_a, -1) 45 | mask_a = tf.tile(mask_a, [1, 1] + pot_shape) 46 | # combine 47 | pots_layer = (pots_layer * mask_a + (1 - mask_a) * pad_pots) 48 | return (pots_layer, W_pot, b_pot) 49 | 50 | 51 | # pseudo-likelihood criterion 52 | def pseudo_likelihood(potentials, pot_indices, targets, config): 53 | batch_size = int(potentials.get_shape()[0]) 54 | num_steps = int(potentials.get_shape()[1]) 55 | pots_shape = map(int, potentials.get_shape()[2:]) 56 | # move the current tag to the last dimension 57 | perm = range(len(potentials.get_shape())) 58 | mid = config.pot_window / 2 59 | perm[-1] = perm[-mid - 1] 60 | for i in range(-1, mid -1): 61 | perm[-mid + i] = perm[-mid + i] + 1 62 | perm_potentials = tf.transpose(potentials, perm=perm) 63 | # get conditional distribution of the current tag 64 | flat_pots = tf.reshape(perm_potentials, [-1, config.n_tags]) 65 | flat_cond = tf.gather(flat_pots, pot_indices) 66 | pre_cond = tf.nn.softmax(flat_cond) 67 | conditional = tf.reshape(pre_cond, [batch_size, num_steps, -1]) 68 | # compute pseudo-log-likelihood of sequence 69 | p_ll = 
tf.reduce_sum(targets * tf.log(conditional)) 70 | return (conditional, p_ll) 71 | 72 | 73 | # dynamic programming part 1: max sum 74 | class CRFMaxCell(RNNCell): 75 | """Dynamic programming for CRF""" 76 | def __init__(self, config): 77 | self._num_units = config.n_tags ** (config.pot_window - 1) 78 | self.n_tags = config.n_tags 79 | 80 | @property 81 | def input_size(self): 82 | return self._num_units 83 | 84 | @property 85 | def output_size(self): 86 | return self._num_units 87 | 88 | @property 89 | def state_size(self): 90 | return self._num_units 91 | 92 | def __call__(self, inputs, state, scope=None): 93 | """Summation for dynamic programming. Inputs are the 94 | log-potentials. States are the results of the summation at the 95 | last step""" 96 | with tf.variable_scope(scope or type(self).__name__): 97 | # add states and log-potentials 98 | multiples = [1] * (len(state.get_shape()) + 1) 99 | multiples[-1] = self.n_tags 100 | exp_state = tf.tile(tf.expand_dims(state, -1), multiples) 101 | added = exp_state + inputs 102 | # return maxes, arg_maxes along first dimension (after the batch dim) 103 | new_state = tf.reduce_max(added, 1) 104 | max_id = tf.argmax(added, 1) 105 | return new_state, max_id 106 | 107 | 108 | # max a posteriori tags assignment: implement dynamic programming 109 | def map_assignment(potentials, config): 110 | batch_size = int(potentials.get_shape()[0]) 111 | num_steps = int(potentials.get_shape()[1]) 112 | pots_shape = map(int, potentials.get_shape()[2:]) 113 | inputs_list = [tf.reshape(x, [batch_size] + pots_shape) 114 | for x in tf.split(1, num_steps, potentials)] 115 | # forward pass 116 | max_cell = CRFMaxCell(config) 117 | max_ids = [0] * len(inputs_list) 118 | # initial state: starts at 0 - 0 - 0 etc... 119 | state = tf.zeros(pots_shape[:-1]) 120 | for t, input_ in enumerate(inputs_list): 121 | state, max_id = max_cell(inputs_list[t], state) 122 | max_ids[t] = max_id 123 | # backward pass 124 | powers = tf.to_int64(map(float, range(batch_size))) * \ 125 | (config.n_tags ** (config.pot_window - 1)) 126 | outputs = [-1] * len(inputs_list) 127 | best_end = tf.argmax(tf.reshape(state, [batch_size, -1]), 1) 128 | current = best_end 129 | mid = config.pot_window / 2 130 | max_pow = (config.n_tags ** mid) 131 | for i, _ in enumerate(outputs): 132 | outputs[-1 - i] = (current / max_pow) 133 | prev_best = tf.gather(tf.reshape(max_ids[-1 - i], [-1]), current + powers) 134 | current = prev_best * max_pow + (current / config.n_tags) 135 | map_tags = tf.transpose(tf.pack(outputs)) 136 | return map_tags 137 | 138 | 139 | # dynamic programming part 2: sum product 140 | class CRFSumCell(RNNCell): 141 | """Dynamic programming for CRF""" 142 | def __init__(self, config): 143 | self._num_units = config.n_tags ** (config.pot_window - 1) 144 | self.n_tags = config.n_tags 145 | 146 | @property 147 | def input_size(self): 148 | return self._num_units 149 | 150 | @property 151 | def output_size(self): 152 | return self._num_units 153 | 154 | @property 155 | def state_size(self): 156 | return self._num_units 157 | 158 | def __call__(self, inputs, state, scope=None): 159 | """Summation for dynamic programming. Inputs are the 160 | log-potentials. 
States are the results of the summation at the 161 | last step""" 162 | with tf.variable_scope(scope or type(self).__name__): 163 | # add states and log-potentials 164 | multiples = [1] * (len(state.get_shape()) + 1) 165 | multiples[-1] = self.n_tags 166 | exp_state = tf.tile(tf.expand_dims(state, -1), multiples) 167 | added = exp_state + inputs 168 | # log-sum along first dimension (after the batch dim) 169 | max_val = tf.reduce_max(added) 170 | added_exp = tf.exp(added - max_val) 171 | summed_exp = tf.reduce_sum(added_exp, 1) 172 | new_state = tf.log(summed_exp) + max_val 173 | return new_state 174 | 175 | 176 | # computing the log partition for a sequence of length config.num_steps 177 | def log_partition(potentials, config): 178 | batch_size = int(potentials.get_shape()[0]) 179 | num_steps = int(potentials.get_shape()[1]) 180 | pots_shape = map(int, potentials.get_shape()[2:]) 181 | inputs_list = [tf.reshape(x, [batch_size] + pots_shape) 182 | for x in tf.split(1, num_steps, potentials)] 183 | # forward pass 184 | sum_cell = CRFSumCell(config) 185 | state = tf.zeros([batch_size] + pots_shape[:-1]) 186 | partial_sums = [0] * len(inputs_list) 187 | for t, input_ in enumerate(inputs_list): 188 | state = sum_cell(inputs_list[t], state) 189 | partial_sums[t] = state 190 | # sum at the end 191 | max_val = tf.reduce_max(state) 192 | state_exp = tf.exp(state - max_val) 193 | log_part = tf.log(tf.reduce_sum(tf.reshape(state_exp, [batch_size, -1]), 1)) + max_val 194 | return tf.reduce_sum(log_part) 195 | 196 | 197 | # compute the log to get the log-likelihood 198 | def log_score(potentials, window_indices, mask, config): 199 | batch_size = int(potentials.get_shape()[0]) 200 | num_steps = int(potentials.get_shape()[1]) 201 | pots_shape = map(int, potentials.get_shape()[2:]) 202 | flat_pots = tf.reshape(potentials, [-1]) 203 | flat_scores = tf.gather(flat_pots, window_indices) 204 | scores = tf.reshape(flat_scores, [batch_size, num_steps]) 205 | scores = tf.mul(scores, mask) 206 | return tf.reduce_sum(scores) 207 | 208 | 209 | # TODO: alpha-beta rec 210 | def marginals(potentials, config): 211 | batch_size = int(potentials.get_shape()[0]) 212 | num_steps = int(potentials.get_shape()[1]) 213 | pots_shape = map(int, potentials.get_shape()[2:]) 214 | inputs_list = [tf.reshape(x, [batch_size] + pots_shape) 215 | for x in tf.split(1, num_steps, potentials)] 216 | # forward and backwar pass 217 | sum_cell_f = CRFSumCell(config) 218 | sum_cell_b = CRFSumCell(config) 219 | state_f = tf.convert_to_tensor(np.zeros(pots_shape[:-1])) 220 | state_b = tf.convert_to_tensor(np.zeros(pots_shape[:-1])) 221 | partial_sums_f = [0] * len(inputs_list) 222 | partial_sums_b = [0] * len(inputs_list) 223 | for t, _ in enumerate(inputs_list): 224 | state_f = sum_cell_f(inputs_list[t], state_f) 225 | partial_sums_f[t] = state_f 226 | state_b = sum_cell_b(inputs_list[t], state_b) 227 | partial_sums_b[-1 - t] = state_b 228 | # TODO: compute marginals 229 | marginals = 0 230 | return marginals 231 | 232 | 233 | ################################### 234 | # Making a (deep) CRF # 235 | ################################### 236 | class CRF: 237 | def __init__(self, config): 238 | self.batch_size = config.batch_size 239 | self.num_steps = config.num_steps 240 | num_features = len(config.input_features) 241 | # input_ids <- batch.features 242 | self.input_ids = tf.placeholder(tf.int32, shape=[self.batch_size, 243 | self.num_steps, 244 | num_features]) 245 | # mask <- batch.mask 246 | self.mask = tf.placeholder(tf.float32, 
[self.batch_size, self.num_steps]) 247 | # pot_indices <- batch.tag_neighbours_lin 248 | self.pot_indices = tf.placeholder(tf.int32, 249 | [config.batch_size * config.num_steps]) 250 | # targets <- batch.tags_one_hot 251 | self.targets = tf.placeholder(tf.float32, [config.batch_size, 252 | config.num_steps, 253 | config.n_tags]) 254 | # window_indices <- batch.tag_windows_lin 255 | self.window_indices = tf.placeholder(tf.int32, 256 | [config.batch_size * config.num_steps]) 257 | 258 | def make(self, config, params, reuse=False, name='CRF'): 259 | # TODO: add marginal inference 260 | with tf.variable_scope(name): 261 | if reuse: 262 | tf.get_variable_scope().reuse_variables() 263 | # out_layer <- output of NN (TODO: add layers) 264 | (out_layer, embeddings) = feature_layer(self.input_ids, 265 | config, params, 266 | reuse=reuse) 267 | params.embeddings = embeddings 268 | if config.verbose: 269 | print('features layer done') 270 | self.out_layer = out_layer 271 | # pots_layer <- potentials 272 | (pots_layer, W_pot, b_pot) = potentials_layer(out_layer, 273 | self.mask, 274 | config, params, 275 | reuse=reuse) 276 | params.W_pot = W_pot 277 | params.b_pot = b_pot 278 | if config.verbose: 279 | print('potentials layer done') 280 | self.pots_layer = pots_layer 281 | # pseudo-log-likelihood 282 | conditional, pseudo_ll = pseudo_likelihood(pots_layer, 283 | self.pot_indices, 284 | self.targets, config) 285 | self.pseudo_ll = pseudo_ll 286 | # accuracy of p(t_i | t_{i-1}, t_{i+1}) 287 | correct_cond_pred = tf.equal(tf.argmax(conditional, 2), tf.argmax(self.targets, 2)) 288 | correct_cond_pred = tf.cast(correct_cond_pred,"float") 289 | cond_accuracy = tf.reduce_sum(correct_cond_pred * tf.reduce_sum(self.targets, 2)) /\ 290 | tf.reduce_sum(self.targets) 291 | self.cond_accuracy = cond_accuracy 292 | # log-likelihood 293 | log_sc = log_score(self.pots_layer, self.window_indices, 294 | self.mask, config) 295 | log_part = log_partition(self.pots_layer, config) 296 | log_likelihood = log_sc - log_part 297 | self.log_likelihood = log_likelihood 298 | # L1 regularization 299 | self.l1_norm = tf.reduce_sum(tf.zeros([1])) 300 | for feat in config.l1_list: 301 | self.l1_norm += config.l1_reg * \ 302 | tf.reduce_sum(tf.abs(params.embeddings[feat])) 303 | # L2 regularization 304 | self.l2_norm = tf.reduce_sum(tf.zeros([1])) 305 | for feat in config.l2_list: 306 | self.l2_norm += config.l2_reg * \ 307 | tf.reduce_sum(tf.mul(params.embeddings[feat], 308 | params.embeddings[feat])) 309 | # map assignment and accuracy of map assignment 310 | map_tags = map_assignment(self.pots_layer, config) 311 | correct_pred = tf.equal(map_tags, tf.argmax(self.targets, 2)) 312 | correct_pred = tf.cast(correct_pred,"float") 313 | accuracy = tf.reduce_sum(correct_pred * tf.reduce_sum(self.targets, 2)) /\ 314 | tf.reduce_sum(self.targets) 315 | self.map_tags = map_tags 316 | self.accuracy = accuracy 317 | 318 | def train_epoch(self, data, config, params, session, crit_type='likelihood'): 319 | batch_size = config.batch_size 320 | criterion = None 321 | if crit_type == 'pseudo': 322 | criterion = -self.pseudo_ll 323 | else: 324 | criterion = -self.log_likelihood 325 | criterion -= config.l1_reg * self.l1_norm + config.l1_reg * self.l2_norm 326 | train_step = tf.train.AdagradOptimizer(config.learning_rate).minimize(criterion) 327 | session.run(tf.initialize_all_variables()) 328 | # TODO: gradient clipping 329 | total_crit = 0. 
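        # One possible way to handle the gradient clipping TODO above (a sketch only,
        # with an arbitrary 5.0 clip norm): clip each gradient before applying it
        # instead of calling minimize() directly, e.g.
        #   opt = tf.train.AdagradOptimizer(config.learning_rate)
        #   grads_and_vars = opt.compute_gradients(criterion)
        #   clipped = [(tf.clip_by_norm(g, 5.0), v)
        #              for g, v in grads_and_vars if g is not None]
        #   train_step = opt.apply_gradients(clipped)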
330 | n_batches = len(data) / batch_size 331 | batch = Batch() 332 | for i in range(n_batches): 333 | batch.read(data, i * batch_size, config) 334 | f_dict = {self.input_ids: batch.features, 335 | self.pot_indices: batch.tag_neighbours_lin, 336 | self.window_indices: batch.tag_windows_lin, 337 | self.mask: batch.mask, 338 | self.targets: batch.tags_one_hot} 339 | train_step.run(feed_dict=f_dict) 340 | crit = criterion.eval(feed_dict=f_dict) 341 | total_crit += crit 342 | if i % 50 == 0: 343 | train_accuracy = self.accuracy.eval(feed_dict=f_dict) 344 | print i, n_batches, train_accuracy, crit 345 | print("step %d of %d, training accuracy %f, criterion %f" % 346 | (i, n_batches, train_accuracy, crit)) 347 | print 'total crit', total_crit / n_batches 348 | return total_crit / n_batches 349 | 350 | def validate_accuracy(self, data, config): 351 | batch_size = config.batch_size 352 | batch = Batch() 353 | total_accuracy = 0. 354 | total_cond_accuracy = 0. 355 | total = 0. 356 | for i in range(len(data) / batch_size): 357 | batch.read(data, i * batch_size, config) 358 | f_dict = {self.input_ids: batch.features, 359 | self.targets: batch.tags_one_hot, 360 | self.pot_indices: batch.tag_neighbours_lin} 361 | dev_accuracy = self.accuracy.eval(feed_dict=f_dict) 362 | dev_cond_accuracy = self.cond_accuracy.eval(feed_dict=f_dict) 363 | pll = self.pseudo_ll.eval(feed_dict=f_dict) 364 | ll = self.log_likelihood.eval(feed_dict=f_dict) 365 | total_accuracy += dev_accuracy 366 | total_cond_accuracy += dev_cond_accuracy 367 | total_pll += pll 368 | total_ll += ll 369 | total += 1 370 | if i % 100 == 0: 371 | print("%d of %d: \t map accuracy: %f \t cond accuracy: %f \ 372 | \t pseudo_ll: %f \t log_likelihood: %f" % (i, len(data) / batch_size, 373 | total_accuracy / total, 374 | total_cond_accuracy / total)) 375 | return (total_accuracy / total, total_cond_accuracy / total) 376 | 377 | -------------------------------------------------------------------------------- /model_config.py: -------------------------------------------------------------------------------- 1 | # All the model arguments / parameters / file locations in one file 2 | from os.path import join as pjoin 3 | from utils import * 4 | 5 | 6 | def base_config(input_features, l1_list, tag_list): 7 | return Config(input_features=input_features, l1_list=l1_list, 8 | tag_list=tag_list) 9 | 10 | 11 | def base_rnn_config(input_features, l1_list, tag_list): 12 | return Config(input_features=input_features, l1_list=l1_list, 13 | tag_list=tag_list, use_rnn=True) 14 | 15 | 16 | def base_convo_config(input_features, l1_list, tag_list): 17 | return Config(input_features=input_features, l1_list=l1_list, 18 | tag_list=tag_list, use_convo=True, 19 | num_epochs=15, num_predict=5, pred_window=3) 20 | 21 | 22 | def base_crf_config(input_features, l1_list, tag_list): 23 | config = Config(input_features=input_features, l1_list=l1_list, 24 | tag_list=tag_list, use_convo=True, 25 | num_epochs=6, num_predict=2, 26 | pred_window=3, 27 | pot_window=3) 28 | config.features_dim = config.n_tags ** config.pot_window * config.pot_window 29 | return config 30 | 31 | 32 | # file locations 33 | git_dir = '/home/jernite/Code/DeepCRF' 34 | 35 | train_file = pjoin(git_dir, 'Data/semeval_train/crfpp_text_batch_1.txt') 36 | dev_file = pjoin(git_dir, 'Data/semeval_dev/crfpp_text_batch_1.txt') 37 | vecs_file = pjoin(git_dir, 'Data/semeval_vecs.dat') 38 | 39 | train_spans_file = pjoin(git_dir, 'Data/semeval_train/crfpp_spans_batch_1.txt') 40 | dev_spans_file = pjoin(git_dir, 
'Data/semeval_dev/crfpp_spans_batch_1.txt') 41 | 42 | # feature names and tag list 43 | features = ['word', 'lemma', 'pos', 'normal', 'word_length', 44 | 'prefix', 'suffix', 'all_caps', 'capitalized', 'word_pos', 45 | 'sentence_pos', 'sentence_length', 'med_prefix', 46 | 'umls_match_tag_full', 'umls_match_tag_prefix', 47 | 'umls_match_tag_acro', 'label'] 48 | 49 | input_features = ['lemma', 'prefix', 'suffix', 'pos', 'umls_match_tag_full'] 50 | l1_list = ['lemma', 'prefix', 'suffix'] 51 | tag_list = ['
', 'B', 'I', 'O', 'ID', 'OD'] 52 | -------------------------------------------------------------------------------- /model_defs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import tensorflow as tf 4 | import tensorflow.python.platform 5 | from tensorflow.models.rnn import rnn 6 | from tensorflow.models.rnn import rnn_cell 7 | 8 | from bi_rnn import bi_rnn 9 | from utils import * 10 | 11 | ############################################### 12 | # NN creation functions # 13 | ############################################### 14 | class Parameters: 15 | def __init__(self, init={}, emb={}, w_c=False, b_c=False, w_p=False, 16 | b_p=False, w_po=False, b_po=False): 17 | self.init_dic = init 18 | self.embeddings = emb 19 | self.W_conv = w_c 20 | self.b_conv = b_c 21 | self.W_pred = w_p 22 | self.b_pred = b_p 23 | self.W_pot = w_po 24 | self.b_pot = b_po 25 | 26 | 27 | def device_for_node(n): 28 | if n.type == "MatMul": 29 | return "/gpu:0" 30 | else: 31 | return "/cpu:0" 32 | 33 | 34 | def conv2d(x, W): 35 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 36 | 37 | 38 | def weight_variable(shape, name='weight'): 39 | initial = tf.truncated_normal(shape, stddev=0.1) 40 | return tf.Variable(initial, name=name+'_W') 41 | 42 | 43 | def bias_variable(shape, name='weight'): 44 | initial = tf.constant(0.1, shape=shape) 45 | return tf.Variable(initial, name=name+'_b') 46 | 47 | 48 | def feature_layer(in_layer, config, params, reuse=False): 49 | in_features = config.input_features 50 | features_dim = config.features_dim 51 | batch_size = config.batch_size 52 | num_steps = config.num_steps 53 | feature_mappings = config.feature_maps 54 | # inputs 55 | num_features = len(in_features) 56 | input_ids = in_layer 57 | if reuse: 58 | tf.get_variable_scope().reuse_variables() 59 | param_vars = params.embeddings 60 | # lookup layer 61 | else: 62 | param_dic = params.init_dic 63 | param_vars = {} 64 | for feat in in_features: 65 | if feat in param_dic: 66 | param_vars[feat] = \ 67 | tf.Variable(tf.convert_to_tensor(param_dic[feat], 68 | dtype=tf.float32), 69 | name=feat + '_embedding', 70 | trainable=False) 71 | else: 72 | shape = [len(feature_mappings[feat]['reverse']), features_dim] 73 | initial = tf.truncated_normal(shape, stddev=0.1) 74 | param_vars[feat] = tf.Variable(initial, 75 | name=feat + '_embedding') 76 | params = [param_vars[feat] for feat in in_features] 77 | input_embeddings = tf.nn.embedding_lookup(params, input_ids, name='lookup') 78 | # add and return 79 | embedding_layer = tf.reduce_sum(input_embeddings, 2) 80 | return (embedding_layer, param_vars) 81 | 82 | 83 | def bi_lstm_layer(in_layer, config, reuse=False, name='Bi_LSTM'): 84 | num_units = config.rnn_hidden_units 85 | output_size = config.rnn_output_size 86 | batch_size = int(in_layer.get_shape()[0]) 87 | num_steps = int(in_layer.get_shape()[1]) 88 | input_size = int(in_layer.get_shape()[2]) 89 | initializer = tf.random_uniform_initializer(-0.1, 0.1) 90 | lstm_cell_f = rnn_cell.LSTMCell(num_units, input_size, use_peepholes=True, 91 | num_proj=output_size, cell_clip=1.0, 92 | initializer=initializer) 93 | lstm_cell_b = rnn_cell.LSTMCell(num_units, input_size, use_peepholes=True, 94 | num_proj=output_size, cell_clip=1.0, 95 | initializer=initializer) 96 | initial_state_f = lstm_cell_f.zero_state(batch_size, tf.float32) 97 | inputs_list = [tf.reshape(x, [batch_size, input_size]) 98 | for x in tf.split(1, num_steps, in_layer)] 99 | rnn_out, rnn_states = 
bi_rnn(lstm_cell_f, lstm_cell_b, inputs_list, 100 | initial_state=initial_state_f, scope=name, 101 | reuse=reuse) 102 | out_layer = tf.transpose(tf.pack(rnn_out), perm=[1, 0, 2]) 103 | return out_layer 104 | 105 | 106 | def convo_layer(in_layer, config, params, reuse=False, name='Convo'): 107 | conv_window = config.conv_window 108 | output_size = config.conv_dim 109 | batch_size = int(in_layer.get_shape()[0]) 110 | num_steps = int(in_layer.get_shape()[1]) 111 | input_size = int(in_layer.get_shape()[2]) 112 | if reuse: 113 | tf.get_variable_scope().reuse_variables() 114 | W_conv = params.W_conv 115 | b_conv = params.b_conv 116 | else: 117 | W_conv = weight_variable([conv_window, 1, input_size, output_size], 118 | name=name) 119 | b_conv = bias_variable([output_size], name=name) 120 | reshaped = tf.reshape(in_layer, [batch_size, num_steps, 1, input_size]) 121 | conv_layer = tf.nn.relu(tf.reshape(conv2d(reshaped, W_conv), 122 | [batch_size, num_steps, output_size], 123 | name=name) + b_conv) 124 | return (conv_layer, W_conv, b_conv) 125 | 126 | 127 | def predict_layer(in_layer, config, params, reuse=False, name='Predict'): 128 | n_outcomes = config.n_outcomes 129 | batch_size = int(in_layer.get_shape()[0]) 130 | num_steps = int(in_layer.get_shape()[1]) 131 | input_size = int(in_layer.get_shape()[2]) 132 | if reuse: 133 | tf.get_variable_scope().reuse_variables() 134 | W_pred = params.W_pred 135 | b_pred = params.b_pred 136 | else: 137 | W_pred = weight_variable([input_size, n_outcomes], name=name) 138 | b_pred = bias_variable([n_outcomes], name=name) 139 | flat_input = tf.reshape(in_layer, [-1, input_size]) 140 | pre_scores = tf.nn.softmax(tf.matmul(flat_input, W_pred) + b_pred) 141 | preds_layer = tf.reshape(pre_scores, [batch_size, num_steps, -1]) 142 | return (preds_layer, W_pred, b_pred) 143 | 144 | 145 | def optim_outputs(outcome, targets, config, params): 146 | batch_size = int(outcome.get_shape()[0]) 147 | num_steps = int(outcome.get_shape()[1]) 148 | n_outputs = int(outcome.get_shape()[2]) 149 | # We are currently using cross entropy as criterion 150 | criterion = -tf.reduce_sum(targets * tf.log(outcome)) 151 | for feat in config.l1_list: 152 | criterion += config.l1_reg * \ 153 | tf.reduce_sum(tf.abs(params.embeddings[feat])) 154 | # We also compute the per-tag accuracy 155 | correct_prediction = tf.equal(tf.argmax(outcome, 2), tf.argmax(targets, 2)) 156 | accuracy = tf.reduce_sum(tf.cast(correct_prediction, 157 | "float") * tf.reduce_sum(targets, 2)) /\ 158 | tf.reduce_sum(targets) 159 | return (criterion, accuracy) 160 | 161 | 162 | class SequNN: 163 | def __init__(self, config): 164 | self.batch_size = config.batch_size 165 | self.num_steps = config.num_steps 166 | num_features = len(config.input_features) 167 | # input_ids <- batch.features 168 | self.input_ids = tf.placeholder(tf.int32, shape=[self.batch_size, 169 | self.num_steps, 170 | num_features]) 171 | # targets <- batch.tag_windows_one_hot 172 | self.targets = tf.placeholder(tf.float32, shape=[self.batch_size, 173 | self.num_steps, 174 | config.n_outcomes]) 175 | 176 | def make(self, config, params, reuse=False, name='SequNN'): 177 | with tf.variable_scope(name): 178 | if reuse: 179 | tf.get_variable_scope().reuse_variables() 180 | (out_layer, embeddings) = feature_layer(self.input_ids, config, 181 | params, reuse=reuse) 182 | params.embeddings = embeddings 183 | if config.verbose: 184 | print('features layer done') 185 | if config.use_rnn: 186 | out_layer = bi_lstm_layer(embedding_layer, config, reuse=reuse) 187 | if 
config.verbose: 188 | print('rnn layer done') 189 | if config.use_convo: 190 | (out_layer, W_conv, b_conv) = convo_layer(out_layer, config, 191 | params, reuse=reuse) 192 | params.W_conv = W_conv 193 | params.b_conv = b_conv 194 | if config.verbose: 195 | print('convolution layer done') 196 | self.out_layer = out_layer 197 | (preds_layer, W_pred, b_pred) = predict_layer(out_layer, config, 198 | params, reuse=reuse) 199 | params.W_pred = W_pred 200 | params.b_pred = b_pred 201 | self.preds_layer = preds_layer 202 | (criterion, accuracy) = optim_outputs(preds_layer, config, params) 203 | if config.verbose: 204 | print('output layer done') 205 | self.criterion = criterion 206 | self.accuracy = accuracy 207 | 208 | def train_epoch(self, data, train_step, config, params): 209 | batch_size = config.batch_size 210 | train_step = tf.train.AdagradOptimizer(config.learning_rate).minimize(criterion) 211 | batch = Batch() 212 | for i in range(len(data) / batch_size): 213 | batch.read(data, i * batch_size, config) 214 | f_dict = {self.input_ids: batch.features, 215 | self.targets: batch.tag_windows_one_hot} 216 | if i % 100 == 0: 217 | train_accuracy = self.accuracy.eval(feed_dict=f_dict) 218 | print("step %d of %d, training accuracy %f, Lemma_l1 %f" % 219 | (i, len(data) / batch_size, train_accuracy, 220 | tf.reduce_sum(tf.abs(params.embeddings['lemma'])).eval())) 221 | train_step.run(feed_dict=f_dict) 222 | 223 | def validate_accuracy(self, data, config): 224 | batch_size = config.batch_size 225 | batch = Batch() 226 | total_accuracy = 0. 227 | total = 0. 228 | for i in range(len(data) / batch_size): 229 | batch.read(data, i * batch_size, config) 230 | f_dict = {self.input_ids: batch.features, 231 | self.targets: batch.tag_windows_one_hot} 232 | dev_accuracy = self.accuracy.eval(feed_dict=f_dict) 233 | total_accuracy += dev_accuracy 234 | total += 1 235 | if i % 100 == 0: 236 | print("%d of %d: \t:%f" % (i, len(data) / batch_size, 237 | total_accuracy / total)) 238 | return total_accuracy / total 239 | 240 | 241 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /model_use.py: -------------------------------------------------------------------------------- 1 | from random import shuffle 2 | 3 | from utils import * 4 | from model_defs import * 5 | 6 | 7 | ############################################### 8 | # NN usage functions # 9 | ############################################### 10 | # combines a sentence with the predicted marginals 11 | def fuse_preds(sentence, pred, config): 12 | res = [] 13 | mid = config.pred_window / 2 14 | for tok in zip(sentence, pred): 15 | tok_d = dict([(tag, 0) for tag in ['B', 'I', 'O', 'ID', 'OD']]) 16 | for lab, idx in config.label_dict.items(): 17 | tag = config.tag_list[idx[1]] 18 | if idx[0] >= 0: 19 | tok_d[tag] += tok[1][1][idx[0]] 20 | tok_d['word'] = tok[0]['word'] 21 | tok_d['label'] = tok[0]['label'].split('_')[mid] 22 | res += [tok_d] 23 | return res 24 | 25 | 26 | # tag a full dataset TODO: ensure compatibility with SequNN class 27 | def tag_dataset(pre_data, config, params, graph): 28 | save_num_steps = config.num_steps 29 | batch_size = config.batch_size 30 | batch = Batch() 31 | # first, sort by length for computational reasons 32 | num_dev = enumerate(pre_data) 33 | mixed = sorted(num_dev, key=lambda x: len(x[1])) 34 | mixed_data = [dat for i, dat in mixed] 35 | mixed_indices = [i for i, dat in mixed] 36 | # completing the last batch 37 | missing = (batch_size - (len(pre_data) % batch_size)) % batch_size 
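    # the last batch is completed by repeating the last (longest) sentence
    # 'missing' times so that every batch has exactly batch_size rows; the
    # duplicated predictions are dropped below by res = res[:len(pre_data)]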
38 | data = mixed_data + missing * [mixed_data[-1]] 39 | # tagging sentences 40 | res = [] 41 | config.num_steps = 0 42 | preds_layer_s = [] 43 | in_words = [] 44 | print 'processing %d sentences' % ((len(data) / batch_size) * batch_size,) 45 | for i in range(len(data) / batch_size): 46 | batch.read(data, i * batch_size, config, fill=True) 47 | if i % 100 == 0: 48 | print 'making features', i, 'of', len(data) / batch_size, 49 | print 'rnn size', config.num_steps 50 | n_words = len(batch.features[0]) 51 | if n_words > config.num_steps: 52 | config.num_steps = n_words 53 | tf.get_variable_scope().reuse_variables() 54 | (input_ids, targets, preds_layer, criterion, 55 | accuracy) = make_network(config, params, reuse=True) 56 | f_dict = {input_ids: batch.features} 57 | tmp_preds = [[(batch.tag_windows_one_hot[i][j].index(1), token_preds) 58 | for j, token_preds in enumerate(sentence) if 1 in batch.tag_windows_one_hot[i][j]] 59 | for i, sentence in enumerate(list(preds_layer.eval(feed_dict=f_dict)))] 60 | res += tmp_preds 61 | # re-order data 62 | res = res[:len(pre_data)] 63 | res = [dat for i, dat in sorted(zip(mixed_indices, res), key=lambda x:x[0])] 64 | config.num_steps = save_num_steps 65 | return res 66 | 67 | 68 | def train_model(train_data, dev_data, sequ_nn, config, params, graph): 69 | #~ train_data_32 = cut_and_pad(train_data, config) 70 | #~ dev_data_32 = cut_and_pad(dev_data, config) 71 | train_data_32 = cut_batches(train_data, config) 72 | dev_data_32 = cut_batches(dev_data, config) 73 | accuracies = [] 74 | preds = {} 75 | for i in range(config.num_epochs): 76 | print i 77 | shuffle(train_data_32) 78 | sequ_nn.train_epoch(train_data_32, config, params) 79 | train_acc = sequ_nn.validate_accuracy(train_data_32, config) 80 | dev_acc = sequ_nn.validate_accuracy(dev_data_32, config) 81 | accuracies += [(train_acc, dev_acc)] 82 | if i % config.num_predict == config.num_predict - 1: 83 | preds[i+1] = tag_dataset(dev_data, config, params, graph) 84 | return (accuracies, preds) 85 | 86 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | 3 | from model_config import * 4 | from model_defs import * 5 | from model_use import * 6 | 7 | ############################################### 8 | # Load the data # 9 | ############################################### 10 | config = base_convo_config(input_features, l1_list, tag_list) 11 | 12 | train_data = read_data(train_file, features, config) 13 | dev_data = read_data(dev_file, features, config) 14 | dev_spans = treat_spans(dev_spans_file) 15 | 16 | config.make_mappings(train_data + dev_data) 17 | 18 | if config.init_words: 19 | word_vectors = read_vectors(vecs_file, config.feature_maps['word']['reverse']) 20 | pre_trained = {'word': word_vectors} 21 | else: 22 | pre_trained = {} 23 | 24 | params = Parameters(init=pre_trained) 25 | 26 | ############################################### 27 | # make and test the NN # 28 | ############################################### 29 | 30 | graph = tf.Graph() 31 | sess = tf.InteractiveSession() 32 | 33 | (inputs, targets, preds_layer, criterion, accuracy) = make_network(config, params) 34 | train_step = tf.train.AdagradOptimizer(config.learning_rate).minimize(criterion) 35 | sess.run(tf.initialize_all_variables()) 36 | 37 | accuracies, preds = train_model(train_data, dev_data, inputs, targets, 38 | train_step, accuracy, config, params, graph) 39 | 40 | 
predictions = [fuse_preds(sent, pred, config) 41 | for sent, pred in zip(dev_data, preds[config.num_epochs])] 42 | 43 | merged = merge(predictions, dev_spans) 44 | 45 | if True: 46 | print '##### Parameters' 47 | pprint(config.to_string().splitlines()) 48 | print '##### Train/dev accuracies' 49 | pprint(accuracies) 50 | print '##### P-R-F curves' 51 | for i in range(10): 52 | evaluate(merged, 0.1 * i) 53 | 54 | #~ execfile('training.py') 55 | 56 | 57 | # code to assign computation nodes: 58 | #~ graph = tf.Graph() 59 | #~ with graph.as_default(): 60 | #~ with graph.device(device_for_node): 61 | -------------------------------------------------------------------------------- /training_crf.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from random import shuffle 3 | 4 | from model_config import * 5 | from crf_defs import * 6 | 7 | ############################################### 8 | # Load the data # 9 | ############################################### 10 | config = base_crf_config(input_features, l1_list, tag_list) 11 | 12 | train_data = read_data(train_file, features, config) 13 | dev_data = read_data(dev_file, features, config) 14 | dev_spans = treat_spans(dev_spans_file) 15 | 16 | config.make_mappings(train_data + dev_data) 17 | 18 | if config.init_words: 19 | word_vectors = read_vectors(vecs_file, config.feature_maps['word']['reverse']) 20 | pre_trained = {'word': word_vectors} 21 | else: 22 | pre_trained = {} 23 | 24 | params = Parameters(init=pre_trained) 25 | 26 | 27 | #~ train_data_32 = cut_batches(train_data, config) 28 | #~ dev_data_32 = cut_batches(dev_data, config) 29 | 30 | train_data_32 = cut_and_pad(train_data, config) 31 | dev_data_32 = cut_and_pad(dev_data, config) 32 | 33 | ############################################### 34 | # make and test the CRF # 35 | ############################################### 36 | 37 | sess = tf.InteractiveSession() 38 | 39 | ### pseudo_ll 40 | config.learning_rate = 1e-2 41 | config.l1_reg = 0 42 | config.l2_list = config.input_features 43 | config.l2_reg = 1e-2 44 | 45 | crf = CRF(config) 46 | crf.make(config, params) 47 | sess.run(tf.initialize_all_variables()) 48 | 49 | for i in range(2): 50 | print 'epoch ----------------', i 51 | shuffle(train_data_32) 52 | crf.train_epoch(train_data_32, config, params, sess, crit_type='pseudo') 53 | crf.validate_accuracy(train_data_32, config) 54 | crf.validate_accuracy(dev_data_32, config) 55 | 56 | 57 | ### log-likelihood 58 | config.learning_rate = 1e-3 59 | config.l1_reg = 1 60 | config.l2_list = config.input_features 61 | config.l2_reg = 2e-2 62 | 63 | crf = CRF(config) 64 | crf.make(config, params) 65 | sess.run(tf.initialize_all_variables()) 66 | 67 | for i in range(5): 68 | print 'epoch ----------------', i 69 | shuffle(train_data_32) 70 | crf.train_epoch(train_data_32, config, params) 71 | crf.validate_accuracy(train_data_32, config) 72 | crf.validate_accuracy(dev_data_32, config) 73 | 74 | 75 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # A few utility functions 2 | import itertools 3 | import numpy as np 4 | 5 | 6 | ############################################### 7 | # Generally useful functions # 8 | ############################################### 9 | # useful with reshape 10 | def linearize_indices(indices, dims): 11 | res = [] 12 | remain = indices 13 | for i, _ in enumerate(dims): 14 | res 
= [remain % dims[-i - 1]] + res 15 | remain = remain / dims[-i - 1] 16 | linearized = tf.transpose(tf.pack(res)) 17 | return linearized 18 | 19 | 20 | ############################################### 21 | # Data reading functions # 22 | ############################################### 23 | class Config: 24 | def __init__(self, batch_size=20, num_steps=32, learning_rate=1e-2, 25 | l1_reg=2e-3, l1_list=[], 26 | l2_reg=2e-3, l2_list=[], 27 | features_dim=50, init_words=False, input_features=[], 28 | use_rnn=False, rnn_hidden_units=100, rnn_output_size=50, 29 | use_convo=False, conv_window=5, conv_dim=50, 30 | pot_size=1, 31 | pred_window=1, tag_list=[], 32 | verbose=False, num_epochs=10, num_predict=5): 33 | # optimization parameters 34 | self.batch_size = batch_size 35 | self.num_steps = num_steps 36 | self.learning_rate = learning_rate 37 | # regularization parameters 38 | self.l1_reg = l1_reg 39 | self.l1_list = l1_list 40 | self.l2_reg = l2_reg 41 | self.l2_list = l2_list 42 | # input layer 43 | self.features_dim = features_dim 44 | self.init_words = init_words 45 | self.input_features = input_features 46 | # recurrent layer 47 | self.use_rnn = use_rnn 48 | self.rnn_hidden_units = rnn_hidden_units 49 | self.rnn_output_size = rnn_output_size 50 | # convolutional layer 51 | self.use_convo = use_convo 52 | self.conv_window = conv_window 53 | self.conv_dim = conv_dim 54 | # CRF parameters: 55 | self.pot_size = pot_size 56 | self.n_tags = len(tag_list) 57 | # output layer 58 | self.pred_window = pred_window 59 | self.tag_list = tag_list 60 | self.label_dict = {} 61 | tags_ct = 0 62 | for element in itertools.product(tag_list, repeat=pred_window): 63 | tag_st = '_'.join(element) 64 | mid = element[pred_window / 2] 65 | if mid == '
': 66 | self.label_dict[tag_st] = (-1, tag_list.index(mid)) 67 | else: 68 | self.label_dict[tag_st] = (tags_ct, tag_list.index(mid)) 69 | tags_ct += 1 70 | self.n_outcomes = tags_ct 71 | # misc parameters 72 | self.verbose = verbose 73 | self.num_epochs = num_epochs 74 | self.num_predict = num_predict 75 | 76 | def make_mappings(self, data): 77 | self.feature_maps = dict([(feat, {'lookup': {'_unk_': 0}, 78 | 'reverse': ['_unk_']}) 79 | for feat in data[0][0]]) 80 | for sentence in data: 81 | for token in sentence: 82 | for feat in data[0][0]: 83 | ft = token[feat] 84 | if ft not in self.feature_maps[feat]['lookup']: 85 | self.feature_maps[feat]['lookup'][ft] = \ 86 | len(self.feature_maps[feat]['reverse']) 87 | self.feature_maps[feat]['reverse'] += [ft] 88 | 89 | def to_string(self): 90 | st = '' 91 | for k, v in self.__dict__.items(): 92 | if k not in ['feature_maps', 'label_dict']: 93 | st += k + ' --- ' + str(v) + ' \n' 94 | return st 95 | 96 | 97 | class Batch: 98 | def __init__(self): 99 | # features: {'word': 'have', 'pos': 'VB', ...} -> 100 | # [1345, 12 * num_features + 1,...] 101 | self.features = [] 102 | # tags: 'B' -> 1 103 | self.tags = [] 104 | # tags_one_hot: 'B' -> [0, 1, 0, 0, 0, 0] 105 | self.tags_one_hot = [] 106 | # tag_windows: '
_B_O' -> [0, 1, 3] 107 | self.tag_windows = [] 108 | # tag_windows_lin: '
_B_O' -> num_values * token_id + 0 * config.n_tags **2 + 1 * config.n_tags + 3 109 | self.tag_windows_lin = [] 110 | # tag_windows_one_hot: '
_B_O' -> [0, ..., 0, 1, 0, ..., 0] 111 | self.tag_windows_one_hot = [] 112 | # tag_neighbours: '
_B_O' -> [0, 3] 113 | self.tag_neighbours = [] 114 | # tag_neighbours_linearized: '
_B_O' -> num_values * token_id + 0 * config.n_tags + 3 115 | self.tag_neighbours_lin = [] 116 | # mask:
-> 0, everything else -> 1 117 | def read(self, data, start, config, fill=False): 118 | num_features = len(config.input_features) 119 | batch_data = data[start:start + config.batch_size] 120 | batch_features = [[[config.feature_maps[feat]['lookup'][token[feat]] 121 | for feat in config.input_features] 122 | for token in sentence] 123 | for sentence in batch_data] 124 | batch_labels = [[config.label_dict[token['label']] 125 | for token in sentence] 126 | for sentence in batch_data] 127 | # multiply feature indices for use in tf.nn.embedding_lookup 128 | self.features = [[[num_features * ft + i for i, ft in enumerate(word)] 129 | for word in sentence] for sentence in batch_features] 130 | self.tags = [[label[1] for label in sentence] 131 | for sentence in batch_labels] 132 | self.tags_one_hot = [[[int(x == label[1] and x > 0) # TODO: count padding tokens? 133 | for x in range(config.n_tags)] 134 | for label in sentence] 135 | for sentence in batch_labels] 136 | self.tag_windows_one_hot = [[[int(x == label[0]) 137 | for x in range(config.n_outcomes)] 138 | for label in sentence] 139 | for sentence in batch_labels] 140 | if fill: 141 | max_len = max(config.conv_window, 142 | max([len(sentence) for sentence in batch_data]) + 2) 143 | for i in range(config.batch_size): 144 | current_len = len(batch_data[i]) 145 | pre_len = (max_len - current_len) / 2 146 | post_len = max_len - pre_len - current_len 147 | self.features[i] = [range(num_features)] * pre_len + \ 148 | self.features[i] + \ 149 | [range(num_features)] * post_len 150 | self.tags[i] = [0] * pre_len + self.tags[i] + [0] * post_len 151 | self.tags_one_hot[i] = [[0] * config.n_outcomes] * pre_len + \ 152 | self.tags_one_hot[i] + \ 153 | [[0] * config.n_outcomes] * post_len 154 | self.tag_windows_one_hot[i] = [[0] * config.n_outcomes] * pre_len + \ 155 | self.tag_windows_one_hot[i] + \ 156 | [[0] * config.n_outcomes] * post_len 157 | mid = config.pot_window / 2 158 | padded_tags = [[0] * mid + sentence + [0] * mid 159 | for sentence in self.tags] 160 | # get linearized window indices 161 | self.tag_windows = [[sent[i + j] for j in range(-mid, mid + 1)] 162 | for sent in padded_tags 163 | for i in range(mid, len(sent) - mid)] 164 | n_indices = config.n_tags ** config.pot_window 165 | self.tag_windows_lin = [sum([t * (config.n_tags ** (config.pot_window - 1 - i)) 166 | for i, t in enumerate(window)]) + i * n_indices 167 | for i, window in enumerate(self.tag_windows)] 168 | # get linearized potential indices 169 | self.tag_neighbours = [[sent[i + j] 170 | for j in range(-mid, 0) + range(1, mid + 1)] 171 | for sent in padded_tags 172 | for i in range(mid, len(sent) - mid)] 173 | max_pow = config.pot_window - 1 174 | n_indices = config.n_tags ** max_pow 175 | self.tag_neighbours_lin = [sum([idx * (config.n_tags) ** (max_pow - j - 1) 176 | for j, idx in enumerate(token)]) + i * n_indices 177 | for i, token in enumerate(self.tag_neighbours)] 178 | # make mask: 179 | self.mask = [[int(tag > 0) for tag in sent] for sent in self.tags] 180 | 181 | 182 | def aggregate_labels(sentence, config): 183 | pre_tags = ['
'] * (config.pred_window / 2) 184 | sentence_ext = pre_tags + [token['label'] 185 | for token in sentence] + pre_tags 186 | for i, token in enumerate(sentence): 187 | current = token['label'] 188 | sentence[i]['label'] = '_'.join([sentence_ext[i+j] 189 | for j in range(config.pred_window)]) 190 | 191 | 192 | def read_data(file_name, features, config): 193 | sentences = [] 194 | sentence = [] 195 | f = open(file_name) 196 | c = 0 197 | for line in f: 198 | c += 1 199 | if c % 100000 == 0: 200 | print c, 'lines read' 201 | if len(line.strip()) == 0 and len(sentence) > 0: 202 | sentences += [sentence[:]] 203 | sentence = [] 204 | else: 205 | sentence += [dict(zip(features, line.strip().split('\t')))] 206 | if len(sentence) > 0: 207 | sentences += [sentence[:]] 208 | f.close() 209 | foo = [aggregate_labels(sentence, config) for sentence in sentences] 210 | return sentences 211 | 212 | 213 | def show(sentence): 214 | return ' '.join([token['word']+'/'+token['label'] for token in sentence]) 215 | 216 | 217 | # read pre_trained word vectors 218 | def read_vectors(file_name, vocab): 219 | vectors = {} 220 | f = open(file_name) 221 | dim = int(f.readline().strip().split()[1]) 222 | for line in f: 223 | w = line.split()[0] 224 | vec = [float(x) for x in line.strip().split()[1:]] 225 | vectors[w] = np.array(vec) 226 | f.close() 227 | res = np.zeros((len(vocab), dim)) 228 | for i, w in enumerate(vocab): 229 | res[i] = vectors.get(w, np.zeros(dim)) 230 | return res 231 | 232 | 233 | # extract windows from data to fit into unrolled RNN. Independent sentences 234 | def cut_and_pad(data, config): 235 | pad_token = dict([(feat, '_unk_') for feat in data[0][0]]) 236 | pad_token['label'] = '_'.join(['
'] * config.pred_window) 237 | num_steps = config.num_steps 238 | res = [] 239 | seen = 0 240 | pad_len = max(config.pred_window, config.pot_window) / 2 241 | sen = [pad_token] * pad_len + data[0] + [pad_token] * pad_len 242 | while seen < len(data): 243 | if len(sen) < num_steps: 244 | if sen[0]['label'] == '
': 245 | new_sen = ((num_steps - len(sen)) / 2) * [pad_token] + sen 246 | else: 247 | new_sen = sen 248 | new_sen = new_sen + (num_steps - len(new_sen)) * [pad_token] 249 | res += [new_sen[:]] 250 | seen += 1 251 | if seen < len(data): 252 | sen = [pad_token] * pad_len + data[seen] + [pad_token] * pad_len 253 | else: 254 | res += [sen[:num_steps]] 255 | sen = sen[(2 * num_steps) / 3:] 256 | return res 257 | 258 | 259 | # extract windows from data to fit into unrolled RNN. Continuous model 260 | def cut_batches(data, config): 261 | pad_token = dict([(feat, '_unk_') for feat in data[0][0]]) 262 | pad_token['label'] = '_'.join(['
'] * config.pred_window) 263 | padding = [pad_token] * config.pred_window 264 | new_data = padding + [tok for sentence in data 265 | for tok in sentence + padding] 266 | step_size = (config.num_steps / 2) 267 | num_cuts = len(new_data) / step_size 268 | res = [new_data[i * step_size: i * step_size + config.num_steps] 269 | for i in range(num_cuts)] 270 | res[-1] = res[-1] + [pad_token] * (config.num_steps - len(res[-1])) 271 | return res 272 | 273 | 274 | ############################################### 275 | # NN evaluation functions # 276 | ############################################### 277 | def treat_spans(spans_file): 278 | span_lists = [] 279 | f = open(spans_file) 280 | y = [] 281 | for line in f: 282 | if line.strip() == '': 283 | span_lists += [y[:]] 284 | y = [] 285 | else: 286 | lsp = line.strip().split() 287 | y = y + [(int(lsp[0]), int(lsp[1]), lsp[2])] 288 | f.close() 289 | return span_lists 290 | 291 | 292 | def find_gold(sentence): 293 | gold = [] 294 | current_gold = [] 295 | for i, token in enumerate(sentence): 296 | if token['label'] == 'B' or token['label'] == 'O': 297 | if len(current_gold) > 0: 298 | gold += [tuple(current_gold)] 299 | current_gold = [] 300 | if 'I' in token['label'] or token['label'] == 'B': 301 | current_gold += [i] 302 | if len(current_gold) > 0: 303 | gold += [tuple(current_gold)] 304 | return gold 305 | 306 | 307 | def make_scores(token, thr): 308 | res = dict([(key, val) 309 | for key, val in token.items() 310 | if key in ['O', 'OD', 'I', 'ID', 'B'] and val > thr]) 311 | return res 312 | 313 | 314 | def find_mentions(sentence, thr=0.02): 315 | scores = [make_scores(token, thr) for token in sentence] 316 | found = [] 317 | working = [] 318 | for i, score in enumerate(scores): 319 | if 'B' in score or 'O' in score: 320 | for work in working: 321 | if work[0][-1] == i-1: 322 | sc = work[1] + np.log(score.get('B', 0) + 323 | score.get('O', 0)) 324 | sc /= (work[0][-1] + 2 - work[0][0]) 325 | found += [(tuple(work[0]), np.exp(sc))] 326 | if len(score) == 1 and 'O' in score: 327 | working = [] 328 | else: 329 | new_working = [] 330 | if 'B' in score: 331 | new_working = [[[i], np.log(score['B']), False]] 332 | for work in working: 333 | for tg, sc in score.items(): 334 | if tg == 'OD': 335 | new_working += [[work[0], work[1] + np.log(sc), True]] 336 | elif tg == 'ID' and work[2]: 337 | new_working += [[work[0] + [i], work[1] + np.log(sc), 338 | True]] 339 | elif tg == 'I' and not work[2]: 340 | new_working += [[work[0] + [i], work[1] + np.log(sc), 341 | False]] 342 | working = new_working[:] 343 | if len(working) > 1000: 344 | working = sorted(working, key=lambda x: x[1], 345 | reverse=True)[:1000] 346 | return sorted(found, key=lambda x: x[1], reverse=True) 347 | 348 | 349 | def read_sentence(sentence): 350 | return (sentence, find_gold(sentence), find_mentions(sentence)) 351 | 352 | 353 | def merge(sentences, spans): 354 | res = [] 355 | sent = read_sentence(sentences[0]) 356 | span = spans[0] 357 | for i, sp in enumerate(spans): 358 | if i == 0: 359 | continue 360 | if sp[0] == span[0]: 361 | sen = read_sentence(sentences[i]) 362 | gold = sorted(list(set(sen[1] + sent[1]))) 363 | sent = (sen[0], gold, sen[2]) 364 | else: 365 | res += [(sent, span)] 366 | sent = read_sentence(sentences[i]) 367 | span = spans[i] 368 | res += [(sent, span)] 369 | return res 370 | 371 | 372 | def evaluate(merged_sentences, threshold): 373 | TP = 0 374 | FP = 0 375 | FN = 0 376 | for sentence in merged_sentences: 377 | true_mentions = sentence[0][1] 378 | tp = 0 379 | 
for pred in sentence[0][2]: 380 | if pred[1] >= threshold: 381 | if pred[0] in true_mentions: 382 | tp += 1 383 | else: 384 | FP += 1 385 | TP += tp 386 | FN += len(true_mentions) - tp 387 | if (TP + FP) == 0: 388 | prec = 0 389 | recall = 0 390 | else: 391 | prec = float(TP) / (TP + FP) 392 | recall = float(TP) / (TP + FN) 393 | if prec == 0 or recall == 0: 394 | f1 = 0 395 | else: 396 | f1 = 2 * (prec * recall) / (prec + recall) 397 | print 'TH:', threshold, '\t', 'P:', prec, '\t', 'R:', recall, '\t', 'F:', f1 398 | --------------------------------------------------------------------------------
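A worked example of the linearized neighbour indices that Batch.read builds and
pseudo_likelihood() in crf_defs.py gathers (a sketch assuming the 6-tag list from
model_config.py, so B has index 1 and O has index 3, and pot_window = 3):

    n_tags = 6
    pot_window = 3
    max_pow = pot_window - 1        # number of neighbour positions kept
    n_indices = n_tags ** max_pow   # 36 possible (left, right) neighbour contexts

    def neighbour_index(token_position, left_tag, right_tag):
        # base-n_tags encoding of the neighbour tags, offset by the position
        # of the token in the flattened (batch_size * num_steps) layout
        return token_position * n_indices + left_tag * n_tags + right_tag

    print neighbour_index(5, 1, 3)  # token 5 with neighbours B and O -> 189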