├── Data └── README.md ├── LICENSE ├── README.md ├── bi_rnn.py ├── crf_defs.py ├── model_config.py ├── model_defs.py ├── model_use.py ├── training.py ├── training_crf.py └── utils.py /Data/README.md: -------------------------------------------------------------------------------- 1 | Put your data there 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #DeepCRF package 2 | 3 | Convolutions and RNN tagging models are implemented and tested 4 | 5 | CRF model is implemented and needs to be tested 6 | -------------------------------------------------------------------------------- /bi_rnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import tensorflow as tf 6 | 7 | from tensorflow.models.rnn import rnn_cell 8 | from tensorflow.python.ops import control_flow_ops 9 | 10 | 11 | def bi_rnn(cell_forward, cell_backward, inputs, initial_state=None, 12 | dtype=None, scope=None, reuse=False): 13 | if not (isinstance(cell_forward, rnn_cell.RNNCell) and 14 | isinstance(cell_backward, rnn_cell.RNNCell)): 15 | raise TypeError("cell must be an instance of RNNCell") 16 | if not isinstance(inputs, list): 17 | raise TypeError("inputs must be a list") 18 | if not inputs: 19 | raise ValueError("inputs must not be empty") 20 | outputs = [] 21 | states = [] 22 | with tf.variable_scope(scope or "RNN"): 23 | batch_size = tf.shape(inputs[0])[0] 24 | outputs_f = [0] * len(inputs) 25 | states_f = [0] * len(inputs) 26 | outputs_b = [0] * len(inputs) 27 | states_b = [0] * len(inputs) 28 | if initial_state is not None: 29 | state_f = initial_state 30 | state_b = initial_state 31 | else: 32 | if not dtype: 33 | raise ValueError("If no initial_state is provided, \ 34 | dtype must be.") 35 | state_f = cell_forward.zero_state(batch_size, dtype) 36 | state_b = cell_backward.zero_state(batch_size, dtype) 37 | for t, input_ in enumerate(inputs): 38 | if reuse or t > 0: 39 | tf.get_variable_scope().reuse_variables() 40 | 
output_f, state_f = cell_forward(inputs[t], state_f, 41 | scope='LSTM_f') 42 | output_b, state_b = cell_backward(inputs[-1 - t], state_b, 43 | scope='LSTM_b') 44 | outputs_f[t] = output_f 45 | outputs_b[-1 - t] = output_b 46 | states_f[t] = state_f 47 | states_b[-1 - t] = state_b 48 | for t in range(len(inputs)): 49 | outputs.append(tf.concat(1, [outputs_f[t], outputs_b[t]])) 50 | states.append(tf.concat(1, [states_f[t], states_b[t]])) 51 | return (outputs, states) 52 | -------------------------------------------------------------------------------- /crf_defs.py: -------------------------------------------------------------------------------- 1 | from model_defs import * 2 | from utils import * 3 | from tensorflow.models.rnn.rnn_cell import * 4 | 5 | ################################### 6 | # Building blocks # 7 | ################################### 8 | 9 | # takes features and outputs potentials 10 | def potentials_layer(in_layer, mask, config, params, reuse=False, name='Potentials'): 11 | batch_size = int(in_layer.get_shape()[0]) 12 | num_steps = int(in_layer.get_shape()[1]) 13 | input_size = int(in_layer.get_shape()[2]) 14 | pot_shape = [config.n_tags] * config.pot_window 15 | out_shape = [batch_size, num_steps] + pot_shape 16 | #~ pot_size = config.n_tags ** config.pot_window 17 | #~ if reuse: 18 | #~ tf.get_variable_scope().reuse_variables() 19 | #~ W_pot = params.W_pot 20 | #~ b_pot = params.b_pot 21 | #~ else: 22 | #~ W_pot = weight_variable([input_size, pot_size], name=name) 23 | #~ b_pot = bias_variable([pot_size], name=name) 24 | #~ flat_input = tf.reshape(in_layer, [-1, input_size]) 25 | #~ pre_scores = tf.matmul(flat_input, W_pot) + b_pot 26 | # BOGUS 27 | W_pot = False 28 | b_pot = False 29 | reshaped_in = tf.reshape(in_layer, [batch_size, num_steps, config.pot_window, -1]) 30 | pre_scores = tf.reduce_sum(reshaped_in, 2) 31 | # /BOGUS 32 | pots_layer = tf.reshape(pre_scores, out_shape) 33 | # define potentials for padding tokens 34 | padding_pot = np.zeros(pot_shape) 35 | num = config.pot_window / 2 36 | idx = [slice(None)] * num + [0] + [slice(None)] * num 37 | padding_pot[idx] += 10000 38 | pad_pot = tf.convert_to_tensor(padding_pot, tf.float32) 39 | pad_pots = tf.expand_dims(tf.expand_dims(pad_pot, 0), 0) 40 | pad_pots = tf.tile(pad_pots, [batch_size, num_steps] + [1] * config.pot_window) 41 | # expand mask 42 | mask_a = mask 43 | for _ in range(config.pot_window): 44 | mask_a = tf.expand_dims(mask_a, -1) 45 | mask_a = tf.tile(mask_a, [1, 1] + pot_shape) 46 | # combine 47 | pots_layer = (pots_layer * mask_a + (1 - mask_a) * pad_pots) 48 | return (pots_layer, W_pot, b_pot) 49 | 50 | 51 | # pseudo-likelihood criterion 52 | def pseudo_likelihood(potentials, pot_indices, targets, config): 53 | batch_size = int(potentials.get_shape()[0]) 54 | num_steps = int(potentials.get_shape()[1]) 55 | pots_shape = map(int, potentials.get_shape()[2:]) 56 | # move the current tag to the last dimension 57 | perm = range(len(potentials.get_shape())) 58 | mid = config.pot_window / 2 59 | perm[-1] = perm[-mid - 1] 60 | for i in range(-1, mid -1): 61 | perm[-mid + i] = perm[-mid + i] + 1 62 | perm_potentials = tf.transpose(potentials, perm=perm) 63 | # get conditional distribution of the current tag 64 | flat_pots = tf.reshape(perm_potentials, [-1, config.n_tags]) 65 | flat_cond = tf.gather(flat_pots, pot_indices) 66 | pre_cond = tf.nn.softmax(flat_cond) 67 | conditional = tf.reshape(pre_cond, [batch_size, num_steps, -1]) 68 | # compute pseudo-log-likelihood of sequence 69 | p_ll = 
tf.reduce_sum(targets * tf.log(conditional)) 70 | return (conditional, p_ll) 71 | 72 | 73 | # dynamic programming part 1: max sum 74 | class CRFMaxCell(RNNCell): 75 | """Dynamic programming for CRF""" 76 | def __init__(self, config): 77 | self._num_units = config.n_tags ** (config.pot_window - 1) 78 | self.n_tags = config.n_tags 79 | 80 | @property 81 | def input_size(self): 82 | return self._num_units 83 | 84 | @property 85 | def output_size(self): 86 | return self._num_units 87 | 88 | @property 89 | def state_size(self): 90 | return self._num_units 91 | 92 | def __call__(self, inputs, state, scope=None): 93 | """Summation for dynamic programming. Inputs are the 94 | log-potentials. States are the results of the summation at the 95 | last step""" 96 | with tf.variable_scope(scope or type(self).__name__): 97 | # add states and log-potentials 98 | multiples = [1] * (len(state.get_shape()) + 1) 99 | multiples[-1] = self.n_tags 100 | exp_state = tf.tile(tf.expand_dims(state, -1), multiples) 101 | added = exp_state + inputs 102 | # return maxes, arg_maxes along first dimension (after the batch dim) 103 | new_state = tf.reduce_max(added, 1) 104 | max_id = tf.argmax(added, 1) 105 | return new_state, max_id 106 | 107 | 108 | # max a posteriori tags assignment: implement dynamic programming 109 | def map_assignment(potentials, config): 110 | batch_size = int(potentials.get_shape()[0]) 111 | num_steps = int(potentials.get_shape()[1]) 112 | pots_shape = map(int, potentials.get_shape()[2:]) 113 | inputs_list = [tf.reshape(x, [batch_size] + pots_shape) 114 | for x in tf.split(1, num_steps, potentials)] 115 | # forward pass 116 | max_cell = CRFMaxCell(config) 117 | max_ids = [0] * len(inputs_list) 118 | # initial state: starts at 0 - 0 - 0 etc... 119 | state = tf.zeros(pots_shape[:-1]) 120 | for t, input_ in enumerate(inputs_list): 121 | state, max_id = max_cell(inputs_list[t], state) 122 | max_ids[t] = max_id 123 | # backward pass 124 | powers = tf.to_int64(map(float, range(batch_size))) * \ 125 | (config.n_tags ** (config.pot_window - 1)) 126 | outputs = [-1] * len(inputs_list) 127 | best_end = tf.argmax(tf.reshape(state, [batch_size, -1]), 1) 128 | current = best_end 129 | mid = config.pot_window / 2 130 | max_pow = (config.n_tags ** mid) 131 | for i, _ in enumerate(outputs): 132 | outputs[-1 - i] = (current / max_pow) 133 | prev_best = tf.gather(tf.reshape(max_ids[-1 - i], [-1]), current + powers) 134 | current = prev_best * max_pow + (current / config.n_tags) 135 | map_tags = tf.transpose(tf.pack(outputs)) 136 | return map_tags 137 | 138 | 139 | # dynamic programming part 2: sum product 140 | class CRFSumCell(RNNCell): 141 | """Dynamic programming for CRF""" 142 | def __init__(self, config): 143 | self._num_units = config.n_tags ** (config.pot_window - 1) 144 | self.n_tags = config.n_tags 145 | 146 | @property 147 | def input_size(self): 148 | return self._num_units 149 | 150 | @property 151 | def output_size(self): 152 | return self._num_units 153 | 154 | @property 155 | def state_size(self): 156 | return self._num_units 157 | 158 | def __call__(self, inputs, state, scope=None): 159 | """Summation for dynamic programming. Inputs are the 160 | log-potentials. 
States are the results of the summation at the 161 | last step""" 162 | with tf.variable_scope(scope or type(self).__name__): 163 | # add states and log-potentials 164 | multiples = [1] * (len(state.get_shape()) + 1) 165 | multiples[-1] = self.n_tags 166 | exp_state = tf.tile(tf.expand_dims(state, -1), multiples) 167 | added = exp_state + inputs 168 | # log-sum along first dimension (after the batch dim) 169 | max_val = tf.reduce_max(added) 170 | added_exp = tf.exp(added - max_val) 171 | summed_exp = tf.reduce_sum(added_exp, 1) 172 | new_state = tf.log(summed_exp) + max_val 173 | return new_state 174 | 175 | 176 | # computing the log partition for a sequence of length config.num_steps 177 | def log_partition(potentials, config): 178 | batch_size = int(potentials.get_shape()[0]) 179 | num_steps = int(potentials.get_shape()[1]) 180 | pots_shape = map(int, potentials.get_shape()[2:]) 181 | inputs_list = [tf.reshape(x, [batch_size] + pots_shape) 182 | for x in tf.split(1, num_steps, potentials)] 183 | # forward pass 184 | sum_cell = CRFSumCell(config) 185 | state = tf.zeros([batch_size] + pots_shape[:-1]) 186 | partial_sums = [0] * len(inputs_list) 187 | for t, input_ in enumerate(inputs_list): 188 | state = sum_cell(inputs_list[t], state) 189 | partial_sums[t] = state 190 | # sum at the end 191 | max_val = tf.reduce_max(state) 192 | state_exp = tf.exp(state - max_val) 193 | log_part = tf.log(tf.reduce_sum(tf.reshape(state_exp, [batch_size, -1]), 1)) + max_val 194 | return tf.reduce_sum(log_part) 195 | 196 | 197 | # compute the log to get the log-likelihood 198 | def log_score(potentials, window_indices, mask, config): 199 | batch_size = int(potentials.get_shape()[0]) 200 | num_steps = int(potentials.get_shape()[1]) 201 | pots_shape = map(int, potentials.get_shape()[2:]) 202 | flat_pots = tf.reshape(potentials, [-1]) 203 | flat_scores = tf.gather(flat_pots, window_indices) 204 | scores = tf.reshape(flat_scores, [batch_size, num_steps]) 205 | scores = tf.mul(scores, mask) 206 | return tf.reduce_sum(scores) 207 | 208 | 209 | # TODO: alpha-beta rec 210 | def marginals(potentials, config): 211 | batch_size = int(potentials.get_shape()[0]) 212 | num_steps = int(potentials.get_shape()[1]) 213 | pots_shape = map(int, potentials.get_shape()[2:]) 214 | inputs_list = [tf.reshape(x, [batch_size] + pots_shape) 215 | for x in tf.split(1, num_steps, potentials)] 216 | # forward and backwar pass 217 | sum_cell_f = CRFSumCell(config) 218 | sum_cell_b = CRFSumCell(config) 219 | state_f = tf.convert_to_tensor(np.zeros(pots_shape[:-1])) 220 | state_b = tf.convert_to_tensor(np.zeros(pots_shape[:-1])) 221 | partial_sums_f = [0] * len(inputs_list) 222 | partial_sums_b = [0] * len(inputs_list) 223 | for t, _ in enumerate(inputs_list): 224 | state_f = sum_cell_f(inputs_list[t], state_f) 225 | partial_sums_f[t] = state_f 226 | state_b = sum_cell_b(inputs_list[t], state_b) 227 | partial_sums_b[-1 - t] = state_b 228 | # TODO: compute marginals 229 | marginals = 0 230 | return marginals 231 | 232 | 233 | ################################### 234 | # Making a (deep) CRF # 235 | ################################### 236 | class CRF: 237 | def __init__(self, config): 238 | self.batch_size = config.batch_size 239 | self.num_steps = config.num_steps 240 | num_features = len(config.input_features) 241 | # input_ids <- batch.features 242 | self.input_ids = tf.placeholder(tf.int32, shape=[self.batch_size, 243 | self.num_steps, 244 | num_features]) 245 | # mask <- batch.mask 246 | self.mask = tf.placeholder(tf.float32, 
[self.batch_size, self.num_steps]) 247 | # pot_indices <- batch.tag_neighbours_lin 248 | self.pot_indices = tf.placeholder(tf.int32, 249 | [config.batch_size * config.num_steps]) 250 | # targets <- batch.tags_one_hot 251 | self.targets = tf.placeholder(tf.float32, [config.batch_size, 252 | config.num_steps, 253 | config.n_tags]) 254 | # window_indices <- batch.tag_windows_lin 255 | self.window_indices = tf.placeholder(tf.int32, 256 | [config.batch_size * config.num_steps]) 257 | 258 | def make(self, config, params, reuse=False, name='CRF'): 259 | # TODO: add marginal inference 260 | with tf.variable_scope(name): 261 | if reuse: 262 | tf.get_variable_scope().reuse_variables() 263 | # out_layer <- output of NN (TODO: add layers) 264 | (out_layer, embeddings) = feature_layer(self.input_ids, 265 | config, params, 266 | reuse=reuse) 267 | params.embeddings = embeddings 268 | if config.verbose: 269 | print('features layer done') 270 | self.out_layer = out_layer 271 | # pots_layer <- potentials 272 | (pots_layer, W_pot, b_pot) = potentials_layer(out_layer, 273 | self.mask, 274 | config, params, 275 | reuse=reuse) 276 | params.W_pot = W_pot 277 | params.b_pot = b_pot 278 | if config.verbose: 279 | print('potentials layer done') 280 | self.pots_layer = pots_layer 281 | # pseudo-log-likelihood 282 | conditional, pseudo_ll = pseudo_likelihood(pots_layer, 283 | self.pot_indices, 284 | self.targets, config) 285 | self.pseudo_ll = pseudo_ll 286 | # accuracy of p(t_i | t_{i-1}, t_{i+1}) 287 | correct_cond_pred = tf.equal(tf.argmax(conditional, 2), tf.argmax(self.targets, 2)) 288 | correct_cond_pred = tf.cast(correct_cond_pred,"float") 289 | cond_accuracy = tf.reduce_sum(correct_cond_pred * tf.reduce_sum(self.targets, 2)) /\ 290 | tf.reduce_sum(self.targets) 291 | self.cond_accuracy = cond_accuracy 292 | # log-likelihood 293 | log_sc = log_score(self.pots_layer, self.window_indices, 294 | self.mask, config) 295 | log_part = log_partition(self.pots_layer, config) 296 | log_likelihood = log_sc - log_part 297 | self.log_likelihood = log_likelihood 298 | # L1 regularization 299 | self.l1_norm = tf.reduce_sum(tf.zeros([1])) 300 | for feat in config.l1_list: 301 | self.l1_norm += config.l1_reg * \ 302 | tf.reduce_sum(tf.abs(params.embeddings[feat])) 303 | # L2 regularization 304 | self.l2_norm = tf.reduce_sum(tf.zeros([1])) 305 | for feat in config.l2_list: 306 | self.l2_norm += config.l2_reg * \ 307 | tf.reduce_sum(tf.mul(params.embeddings[feat], 308 | params.embeddings[feat])) 309 | # map assignment and accuracy of map assignment 310 | map_tags = map_assignment(self.pots_layer, config) 311 | correct_pred = tf.equal(map_tags, tf.argmax(self.targets, 2)) 312 | correct_pred = tf.cast(correct_pred,"float") 313 | accuracy = tf.reduce_sum(correct_pred * tf.reduce_sum(self.targets, 2)) /\ 314 | tf.reduce_sum(self.targets) 315 | self.map_tags = map_tags 316 | self.accuracy = accuracy 317 | 318 | def train_epoch(self, data, config, params, session, crit_type='likelihood'): 319 | batch_size = config.batch_size 320 | criterion = None 321 | if crit_type == 'pseudo': 322 | criterion = -self.pseudo_ll 323 | else: 324 | criterion = -self.log_likelihood 325 | criterion -= config.l1_reg * self.l1_norm + config.l1_reg * self.l2_norm 326 | train_step = tf.train.AdagradOptimizer(config.learning_rate).minimize(criterion) 327 | session.run(tf.initialize_all_variables()) 328 | # TODO: gradient clipping 329 | total_crit = 0. 
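        # One possible way to handle the gradient clipping TODO above (a sketch only,
        # with an arbitrary 5.0 clip norm): clip each gradient before applying it
        # instead of calling minimize() directly, e.g.
        #   opt = tf.train.AdagradOptimizer(config.learning_rate)
        #   grads_and_vars = opt.compute_gradients(criterion)
        #   clipped = [(tf.clip_by_norm(g, 5.0), v)
        #              for g, v in grads_and_vars if g is not None]
        #   train_step = opt.apply_gradients(clipped)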
330 | n_batches = len(data) / batch_size 331 | batch = Batch() 332 | for i in range(n_batches): 333 | batch.read(data, i * batch_size, config) 334 | f_dict = {self.input_ids: batch.features, 335 | self.pot_indices: batch.tag_neighbours_lin, 336 | self.window_indices: batch.tag_windows_lin, 337 | self.mask: batch.mask, 338 | self.targets: batch.tags_one_hot} 339 | train_step.run(feed_dict=f_dict) 340 | crit = criterion.eval(feed_dict=f_dict) 341 | total_crit += crit 342 | if i % 50 == 0: 343 | train_accuracy = self.accuracy.eval(feed_dict=f_dict) 344 | print i, n_batches, train_accuracy, crit 345 | print("step %d of %d, training accuracy %f, criterion %f" % 346 | (i, n_batches, train_accuracy, crit)) 347 | print 'total crit', total_crit / n_batches 348 | return total_crit / n_batches 349 | 350 | def validate_accuracy(self, data, config): 351 | batch_size = config.batch_size 352 | batch = Batch() 353 | total_accuracy = 0. 354 | total_cond_accuracy = 0. 355 | total = 0. 356 | for i in range(len(data) / batch_size): 357 | batch.read(data, i * batch_size, config) 358 | f_dict = {self.input_ids: batch.features, 359 | self.targets: batch.tags_one_hot, 360 | self.pot_indices: batch.tag_neighbours_lin} 361 | dev_accuracy = self.accuracy.eval(feed_dict=f_dict) 362 | dev_cond_accuracy = self.cond_accuracy.eval(feed_dict=f_dict) 363 | pll = self.pseudo_ll.eval(feed_dict=f_dict) 364 | ll = self.log_likelihood.eval(feed_dict=f_dict) 365 | total_accuracy += dev_accuracy 366 | total_cond_accuracy += dev_cond_accuracy 367 | total_pll += pll 368 | total_ll += ll 369 | total += 1 370 | if i % 100 == 0: 371 | print("%d of %d: \t map accuracy: %f \t cond accuracy: %f \ 372 | \t pseudo_ll: %f \t log_likelihood: %f" % (i, len(data) / batch_size, 373 | total_accuracy / total, 374 | total_cond_accuracy / total)) 375 | return (total_accuracy / total, total_cond_accuracy / total) 376 | 377 | -------------------------------------------------------------------------------- /model_config.py: -------------------------------------------------------------------------------- 1 | # All the model arguments / parameters / file locations in one file 2 | from os.path import join as pjoin 3 | from utils import * 4 | 5 | 6 | def base_config(input_features, l1_list, tag_list): 7 | return Config(input_features=input_features, l1_list=l1_list, 8 | tag_list=tag_list) 9 | 10 | 11 | def base_rnn_config(input_features, l1_list, tag_list): 12 | return Config(input_features=input_features, l1_list=l1_list, 13 | tag_list=tag_list, use_rnn=True) 14 | 15 | 16 | def base_convo_config(input_features, l1_list, tag_list): 17 | return Config(input_features=input_features, l1_list=l1_list, 18 | tag_list=tag_list, use_convo=True, 19 | num_epochs=15, num_predict=5, pred_window=3) 20 | 21 | 22 | def base_crf_config(input_features, l1_list, tag_list): 23 | config = Config(input_features=input_features, l1_list=l1_list, 24 | tag_list=tag_list, use_convo=True, 25 | num_epochs=6, num_predict=2, 26 | pred_window=3, 27 | pot_window=3) 28 | config.features_dim = config.n_tags ** config.pot_window * config.pot_window 29 | return config 30 | 31 | 32 | # file locations 33 | git_dir = '/home/jernite/Code/DeepCRF' 34 | 35 | train_file = pjoin(git_dir, 'Data/semeval_train/crfpp_text_batch_1.txt') 36 | dev_file = pjoin(git_dir, 'Data/semeval_dev/crfpp_text_batch_1.txt') 37 | vecs_file = pjoin(git_dir, 'Data/semeval_vecs.dat') 38 | 39 | train_spans_file = pjoin(git_dir, 'Data/semeval_train/crfpp_spans_batch_1.txt') 40 | dev_spans_file = pjoin(git_dir, 
'Data/semeval_dev/crfpp_spans_batch_1.txt') 41 | 42 | # feature names and tag list 43 | features = ['word', 'lemma', 'pos', 'normal', 'word_length', 44 | 'prefix', 'suffix', 'all_caps', 'capitalized', 'word_pos', 45 | 'sentence_pos', 'sentence_length', 'med_prefix', 46 | 'umls_match_tag_full', 'umls_match_tag_prefix', 47 | 'umls_match_tag_acro', 'label'] 48 | 49 | input_features = ['lemma', 'prefix', 'suffix', 'pos', 'umls_match_tag_full'] 50 | l1_list = ['lemma', 'prefix', 'suffix'] 51 | tag_list = ['
', 'B', 'I', 'O', 'ID', 'OD'] 52 | -------------------------------------------------------------------------------- /model_defs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import tensorflow as tf 4 | import tensorflow.python.platform 5 | from tensorflow.models.rnn import rnn 6 | from tensorflow.models.rnn import rnn_cell 7 | 8 | from bi_rnn import bi_rnn 9 | from utils import * 10 | 11 | ############################################### 12 | # NN creation functions # 13 | ############################################### 14 | class Parameters: 15 | def __init__(self, init={}, emb={}, w_c=False, b_c=False, w_p=False, 16 | b_p=False, w_po=False, b_po=False): 17 | self.init_dic = init 18 | self.embeddings = emb 19 | self.W_conv = w_c 20 | self.b_conv = b_c 21 | self.W_pred = w_p 22 | self.b_pred = b_p 23 | self.W_pot = w_po 24 | self.b_pot = b_po 25 | 26 | 27 | def device_for_node(n): 28 | if n.type == "MatMul": 29 | return "/gpu:0" 30 | else: 31 | return "/cpu:0" 32 | 33 | 34 | def conv2d(x, W): 35 | return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME') 36 | 37 | 38 | def weight_variable(shape, name='weight'): 39 | initial = tf.truncated_normal(shape, stddev=0.1) 40 | return tf.Variable(initial, name=name+'_W') 41 | 42 | 43 | def bias_variable(shape, name='weight'): 44 | initial = tf.constant(0.1, shape=shape) 45 | return tf.Variable(initial, name=name+'_b') 46 | 47 | 48 | def feature_layer(in_layer, config, params, reuse=False): 49 | in_features = config.input_features 50 | features_dim = config.features_dim 51 | batch_size = config.batch_size 52 | num_steps = config.num_steps 53 | feature_mappings = config.feature_maps 54 | # inputs 55 | num_features = len(in_features) 56 | input_ids = in_layer 57 | if reuse: 58 | tf.get_variable_scope().reuse_variables() 59 | param_vars = params.embeddings 60 | # lookup layer 61 | else: 62 | param_dic = params.init_dic 63 | param_vars = {} 64 | for feat in in_features: 65 | if feat in param_dic: 66 | param_vars[feat] = \ 67 | tf.Variable(tf.convert_to_tensor(param_dic[feat], 68 | dtype=tf.float32), 69 | name=feat + '_embedding', 70 | trainable=False) 71 | else: 72 | shape = [len(feature_mappings[feat]['reverse']), features_dim] 73 | initial = tf.truncated_normal(shape, stddev=0.1) 74 | param_vars[feat] = tf.Variable(initial, 75 | name=feat + '_embedding') 76 | params = [param_vars[feat] for feat in in_features] 77 | input_embeddings = tf.nn.embedding_lookup(params, input_ids, name='lookup') 78 | # add and return 79 | embedding_layer = tf.reduce_sum(input_embeddings, 2) 80 | return (embedding_layer, param_vars) 81 | 82 | 83 | def bi_lstm_layer(in_layer, config, reuse=False, name='Bi_LSTM'): 84 | num_units = config.rnn_hidden_units 85 | output_size = config.rnn_output_size 86 | batch_size = int(in_layer.get_shape()[0]) 87 | num_steps = int(in_layer.get_shape()[1]) 88 | input_size = int(in_layer.get_shape()[2]) 89 | initializer = tf.random_uniform_initializer(-0.1, 0.1) 90 | lstm_cell_f = rnn_cell.LSTMCell(num_units, input_size, use_peepholes=True, 91 | num_proj=output_size, cell_clip=1.0, 92 | initializer=initializer) 93 | lstm_cell_b = rnn_cell.LSTMCell(num_units, input_size, use_peepholes=True, 94 | num_proj=output_size, cell_clip=1.0, 95 | initializer=initializer) 96 | initial_state_f = lstm_cell_f.zero_state(batch_size, tf.float32) 97 | inputs_list = [tf.reshape(x, [batch_size, input_size]) 98 | for x in tf.split(1, num_steps, in_layer)] 99 | rnn_out, rnn_states = 
bi_rnn(lstm_cell_f, lstm_cell_b, inputs_list, 100 | initial_state=initial_state_f, scope=name, 101 | reuse=reuse) 102 | out_layer = tf.transpose(tf.pack(rnn_out), perm=[1, 0, 2]) 103 | return out_layer 104 | 105 | 106 | def convo_layer(in_layer, config, params, reuse=False, name='Convo'): 107 | conv_window = config.conv_window 108 | output_size = config.conv_dim 109 | batch_size = int(in_layer.get_shape()[0]) 110 | num_steps = int(in_layer.get_shape()[1]) 111 | input_size = int(in_layer.get_shape()[2]) 112 | if reuse: 113 | tf.get_variable_scope().reuse_variables() 114 | W_conv = params.W_conv 115 | b_conv = params.b_conv 116 | else: 117 | W_conv = weight_variable([conv_window, 1, input_size, output_size], 118 | name=name) 119 | b_conv = bias_variable([output_size], name=name) 120 | reshaped = tf.reshape(in_layer, [batch_size, num_steps, 1, input_size]) 121 | conv_layer = tf.nn.relu(tf.reshape(conv2d(reshaped, W_conv), 122 | [batch_size, num_steps, output_size], 123 | name=name) + b_conv) 124 | return (conv_layer, W_conv, b_conv) 125 | 126 | 127 | def predict_layer(in_layer, config, params, reuse=False, name='Predict'): 128 | n_outcomes = config.n_outcomes 129 | batch_size = int(in_layer.get_shape()[0]) 130 | num_steps = int(in_layer.get_shape()[1]) 131 | input_size = int(in_layer.get_shape()[2]) 132 | if reuse: 133 | tf.get_variable_scope().reuse_variables() 134 | W_pred = params.W_pred 135 | b_pred = params.b_pred 136 | else: 137 | W_pred = weight_variable([input_size, n_outcomes], name=name) 138 | b_pred = bias_variable([n_outcomes], name=name) 139 | flat_input = tf.reshape(in_layer, [-1, input_size]) 140 | pre_scores = tf.nn.softmax(tf.matmul(flat_input, W_pred) + b_pred) 141 | preds_layer = tf.reshape(pre_scores, [batch_size, num_steps, -1]) 142 | return (preds_layer, W_pred, b_pred) 143 | 144 | 145 | def optim_outputs(outcome, targets, config, params): 146 | batch_size = int(outcome.get_shape()[0]) 147 | num_steps = int(outcome.get_shape()[1]) 148 | n_outputs = int(outcome.get_shape()[2]) 149 | # We are currently using cross entropy as criterion 150 | criterion = -tf.reduce_sum(targets * tf.log(outcome)) 151 | for feat in config.l1_list: 152 | criterion += config.l1_reg * \ 153 | tf.reduce_sum(tf.abs(params.embeddings[feat])) 154 | # We also compute the per-tag accuracy 155 | correct_prediction = tf.equal(tf.argmax(outcome, 2), tf.argmax(targets, 2)) 156 | accuracy = tf.reduce_sum(tf.cast(correct_prediction, 157 | "float") * tf.reduce_sum(targets, 2)) /\ 158 | tf.reduce_sum(targets) 159 | return (criterion, accuracy) 160 | 161 | 162 | class SequNN: 163 | def __init__(self, config): 164 | self.batch_size = config.batch_size 165 | self.num_steps = config.num_steps 166 | num_features = len(config.input_features) 167 | # input_ids <- batch.features 168 | self.input_ids = tf.placeholder(tf.int32, shape=[self.batch_size, 169 | self.num_steps, 170 | num_features]) 171 | # targets <- batch.tag_windows_one_hot 172 | self.targets = tf.placeholder(tf.float32, shape=[self.batch_size, 173 | self.num_steps, 174 | config.n_outcomes]) 175 | 176 | def make(self, config, params, reuse=False, name='SequNN'): 177 | with tf.variable_scope(name): 178 | if reuse: 179 | tf.get_variable_scope().reuse_variables() 180 | (out_layer, embeddings) = feature_layer(self.input_ids, config, 181 | params, reuse=reuse) 182 | params.embeddings = embeddings 183 | if config.verbose: 184 | print('features layer done') 185 | if config.use_rnn: 186 | out_layer = bi_lstm_layer(embedding_layer, config, reuse=reuse) 187 | if 
config.verbose: 188 | print('rnn layer done') 189 | if config.use_convo: 190 | (out_layer, W_conv, b_conv) = convo_layer(out_layer, config, 191 | params, reuse=reuse) 192 | params.W_conv = W_conv 193 | params.b_conv = b_conv 194 | if config.verbose: 195 | print('convolution layer done') 196 | self.out_layer = out_layer 197 | (preds_layer, W_pred, b_pred) = predict_layer(out_layer, config, 198 | params, reuse=reuse) 199 | params.W_pred = W_pred 200 | params.b_pred = b_pred 201 | self.preds_layer = preds_layer 202 | (criterion, accuracy) = optim_outputs(preds_layer, config, params) 203 | if config.verbose: 204 | print('output layer done') 205 | self.criterion = criterion 206 | self.accuracy = accuracy 207 | 208 | def train_epoch(self, data, train_step, config, params): 209 | batch_size = config.batch_size 210 | train_step = tf.train.AdagradOptimizer(config.learning_rate).minimize(criterion) 211 | batch = Batch() 212 | for i in range(len(data) / batch_size): 213 | batch.read(data, i * batch_size, config) 214 | f_dict = {self.input_ids: batch.features, 215 | self.targets: batch.tag_windows_one_hot} 216 | if i % 100 == 0: 217 | train_accuracy = self.accuracy.eval(feed_dict=f_dict) 218 | print("step %d of %d, training accuracy %f, Lemma_l1 %f" % 219 | (i, len(data) / batch_size, train_accuracy, 220 | tf.reduce_sum(tf.abs(params.embeddings['lemma'])).eval())) 221 | train_step.run(feed_dict=f_dict) 222 | 223 | def validate_accuracy(self, data, config): 224 | batch_size = config.batch_size 225 | batch = Batch() 226 | total_accuracy = 0. 227 | total = 0. 228 | for i in range(len(data) / batch_size): 229 | batch.read(data, i * batch_size, config) 230 | f_dict = {self.input_ids: batch.features, 231 | self.targets: batch.tag_windows_one_hot} 232 | dev_accuracy = self.accuracy.eval(feed_dict=f_dict) 233 | total_accuracy += dev_accuracy 234 | total += 1 235 | if i % 100 == 0: 236 | print("%d of %d: \t:%f" % (i, len(data) / batch_size, 237 | total_accuracy / total)) 238 | return total_accuracy / total 239 | 240 | 241 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /model_use.py: -------------------------------------------------------------------------------- 1 | from random import shuffle 2 | 3 | from utils import * 4 | from model_defs import * 5 | 6 | 7 | ############################################### 8 | # NN usage functions # 9 | ############################################### 10 | # combines a sentence with the predicted marginals 11 | def fuse_preds(sentence, pred, config): 12 | res = [] 13 | mid = config.pred_window / 2 14 | for tok in zip(sentence, pred): 15 | tok_d = dict([(tag, 0) for tag in ['B', 'I', 'O', 'ID', 'OD']]) 16 | for lab, idx in config.label_dict.items(): 17 | tag = config.tag_list[idx[1]] 18 | if idx[0] >= 0: 19 | tok_d[tag] += tok[1][1][idx[0]] 20 | tok_d['word'] = tok[0]['word'] 21 | tok_d['label'] = tok[0]['label'].split('_')[mid] 22 | res += [tok_d] 23 | return res 24 | 25 | 26 | # tag a full dataset TODO: ensure compatibility with SequNN class 27 | def tag_dataset(pre_data, config, params, graph): 28 | save_num_steps = config.num_steps 29 | batch_size = config.batch_size 30 | batch = Batch() 31 | # first, sort by length for computational reasons 32 | num_dev = enumerate(pre_data) 33 | mixed = sorted(num_dev, key=lambda x: len(x[1])) 34 | mixed_data = [dat for i, dat in mixed] 35 | mixed_indices = [i for i, dat in mixed] 36 | # completing the last batch 37 | missing = (batch_size - (len(pre_data) % batch_size)) % batch_size 
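    # the last batch is completed by repeating the last (longest) sentence
    # 'missing' times so that every batch has exactly batch_size rows; the
    # duplicated predictions are dropped below by res = res[:len(pre_data)]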
38 | data = mixed_data + missing * [mixed_data[-1]] 39 | # tagging sentences 40 | res = [] 41 | config.num_steps = 0 42 | preds_layer_s = [] 43 | in_words = [] 44 | print 'processing %d sentences' % ((len(data) / batch_size) * batch_size,) 45 | for i in range(len(data) / batch_size): 46 | batch.read(data, i * batch_size, config, fill=True) 47 | if i % 100 == 0: 48 | print 'making features', i, 'of', len(data) / batch_size, 49 | print 'rnn size', config.num_steps 50 | n_words = len(batch.features[0]) 51 | if n_words > config.num_steps: 52 | config.num_steps = n_words 53 | tf.get_variable_scope().reuse_variables() 54 | (input_ids, targets, preds_layer, criterion, 55 | accuracy) = make_network(config, params, reuse=True) 56 | f_dict = {input_ids: batch.features} 57 | tmp_preds = [[(batch.tag_windows_one_hot[i][j].index(1), token_preds) 58 | for j, token_preds in enumerate(sentence) if 1 in batch.tag_windows_one_hot[i][j]] 59 | for i, sentence in enumerate(list(preds_layer.eval(feed_dict=f_dict)))] 60 | res += tmp_preds 61 | # re-order data 62 | res = res[:len(pre_data)] 63 | res = [dat for i, dat in sorted(zip(mixed_indices, res), key=lambda x:x[0])] 64 | config.num_steps = save_num_steps 65 | return res 66 | 67 | 68 | def train_model(train_data, dev_data, sequ_nn, config, params, graph): 69 | #~ train_data_32 = cut_and_pad(train_data, config) 70 | #~ dev_data_32 = cut_and_pad(dev_data, config) 71 | train_data_32 = cut_batches(train_data, config) 72 | dev_data_32 = cut_batches(dev_data, config) 73 | accuracies = [] 74 | preds = {} 75 | for i in range(config.num_epochs): 76 | print i 77 | shuffle(train_data_32) 78 | sequ_nn.train_epoch(train_data_32, config, params) 79 | train_acc = sequ_nn.validate_accuracy(train_data_32, config) 80 | dev_acc = sequ_nn.validate_accuracy(dev_data_32, config) 81 | accuracies += [(train_acc, dev_acc)] 82 | if i % config.num_predict == config.num_predict - 1: 83 | preds[i+1] = tag_dataset(dev_data, config, params, graph) 84 | return (accuracies, preds) 85 | 86 | -------------------------------------------------------------------------------- /training.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | 3 | from model_config import * 4 | from model_defs import * 5 | from model_use import * 6 | 7 | ############################################### 8 | # Load the data # 9 | ############################################### 10 | config = base_convo_config(input_features, l1_list, tag_list) 11 | 12 | train_data = read_data(train_file, features, config) 13 | dev_data = read_data(dev_file, features, config) 14 | dev_spans = treat_spans(dev_spans_file) 15 | 16 | config.make_mappings(train_data + dev_data) 17 | 18 | if config.init_words: 19 | word_vectors = read_vectors(vecs_file, config.feature_maps['word']['reverse']) 20 | pre_trained = {'word': word_vectors} 21 | else: 22 | pre_trained = {} 23 | 24 | params = Parameters(init=pre_trained) 25 | 26 | ############################################### 27 | # make and test the NN # 28 | ############################################### 29 | 30 | graph = tf.Graph() 31 | sess = tf.InteractiveSession() 32 | 33 | (inputs, targets, preds_layer, criterion, accuracy) = make_network(config, params) 34 | train_step = tf.train.AdagradOptimizer(config.learning_rate).minimize(criterion) 35 | sess.run(tf.initialize_all_variables()) 36 | 37 | accuracies, preds = train_model(train_data, dev_data, inputs, targets, 38 | train_step, accuracy, config, params, graph) 39 | 40 | 
predictions = [fuse_preds(sent, pred, config) 41 | for sent, pred in zip(dev_data, preds[config.num_epochs])] 42 | 43 | merged = merge(predictions, dev_spans) 44 | 45 | if True: 46 | print '##### Parameters' 47 | pprint(config.to_string().splitlines()) 48 | print '##### Train/dev accuracies' 49 | pprint(accuracies) 50 | print '##### P-R-F curves' 51 | for i in range(10): 52 | evaluate(merged, 0.1 * i) 53 | 54 | #~ execfile('training.py') 55 | 56 | 57 | # code to assign computation nodes: 58 | #~ graph = tf.Graph() 59 | #~ with graph.as_default(): 60 | #~ with graph.device(device_for_node): 61 | -------------------------------------------------------------------------------- /training_crf.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | from random import shuffle 3 | 4 | from model_config import * 5 | from crf_defs import * 6 | 7 | ############################################### 8 | # Load the data # 9 | ############################################### 10 | config = base_crf_config(input_features, l1_list, tag_list) 11 | 12 | train_data = read_data(train_file, features, config) 13 | dev_data = read_data(dev_file, features, config) 14 | dev_spans = treat_spans(dev_spans_file) 15 | 16 | config.make_mappings(train_data + dev_data) 17 | 18 | if config.init_words: 19 | word_vectors = read_vectors(vecs_file, config.feature_maps['word']['reverse']) 20 | pre_trained = {'word': word_vectors} 21 | else: 22 | pre_trained = {} 23 | 24 | params = Parameters(init=pre_trained) 25 | 26 | 27 | #~ train_data_32 = cut_batches(train_data, config) 28 | #~ dev_data_32 = cut_batches(dev_data, config) 29 | 30 | train_data_32 = cut_and_pad(train_data, config) 31 | dev_data_32 = cut_and_pad(dev_data, config) 32 | 33 | ############################################### 34 | # make and test the CRF # 35 | ############################################### 36 | 37 | sess = tf.InteractiveSession() 38 | 39 | ### pseudo_ll 40 | config.learning_rate = 1e-2 41 | config.l1_reg = 0 42 | config.l2_list = config.input_features 43 | config.l2_reg = 1e-2 44 | 45 | crf = CRF(config) 46 | crf.make(config, params) 47 | sess.run(tf.initialize_all_variables()) 48 | 49 | for i in range(2): 50 | print 'epoch ----------------', i 51 | shuffle(train_data_32) 52 | crf.train_epoch(train_data_32, config, params, sess, crit_type='pseudo') 53 | crf.validate_accuracy(train_data_32, config) 54 | crf.validate_accuracy(dev_data_32, config) 55 | 56 | 57 | ### log-likelihood 58 | config.learning_rate = 1e-3 59 | config.l1_reg = 1 60 | config.l2_list = config.input_features 61 | config.l2_reg = 2e-2 62 | 63 | crf = CRF(config) 64 | crf.make(config, params) 65 | sess.run(tf.initialize_all_variables()) 66 | 67 | for i in range(5): 68 | print 'epoch ----------------', i 69 | shuffle(train_data_32) 70 | crf.train_epoch(train_data_32, config, params) 71 | crf.validate_accuracy(train_data_32, config) 72 | crf.validate_accuracy(dev_data_32, config) 73 | 74 | 75 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # A few utility functions 2 | import itertools 3 | import numpy as np 4 | 5 | 6 | ############################################### 7 | # Generally useful functions # 8 | ############################################### 9 | # useful with reshape 10 | def linearize_indices(indices, dims): 11 | res = [] 12 | remain = indices 13 | for i, _ in enumerate(dims): 14 | res 
= [remain % dims[-i - 1]] + res 15 | remain = remain / dims[-i - 1] 16 | linearized = tf.transpose(tf.pack(res)) 17 | return linearized 18 | 19 | 20 | ############################################### 21 | # Data reading functions # 22 | ############################################### 23 | class Config: 24 | def __init__(self, batch_size=20, num_steps=32, learning_rate=1e-2, 25 | l1_reg=2e-3, l1_list=[], 26 | l2_reg=2e-3, l2_list=[], 27 | features_dim=50, init_words=False, input_features=[], 28 | use_rnn=False, rnn_hidden_units=100, rnn_output_size=50, 29 | use_convo=False, conv_window=5, conv_dim=50, 30 | pot_size=1, 31 | pred_window=1, tag_list=[], 32 | verbose=False, num_epochs=10, num_predict=5): 33 | # optimization parameters 34 | self.batch_size = batch_size 35 | self.num_steps = num_steps 36 | self.learning_rate = learning_rate 37 | # regularization parameters 38 | self.l1_reg = l1_reg 39 | self.l1_list = l1_list 40 | self.l2_reg = l2_reg 41 | self.l2_list = l2_list 42 | # input layer 43 | self.features_dim = features_dim 44 | self.init_words = init_words 45 | self.input_features = input_features 46 | # recurrent layer 47 | self.use_rnn = use_rnn 48 | self.rnn_hidden_units = rnn_hidden_units 49 | self.rnn_output_size = rnn_output_size 50 | # convolutional layer 51 | self.use_convo = use_convo 52 | self.conv_window = conv_window 53 | self.conv_dim = conv_dim 54 | # CRF parameters: 55 | self.pot_size = pot_size 56 | self.n_tags = len(tag_list) 57 | # output layer 58 | self.pred_window = pred_window 59 | self.tag_list = tag_list 60 | self.label_dict = {} 61 | tags_ct = 0 62 | for element in itertools.product(tag_list, repeat=pred_window): 63 | tag_st = '_'.join(element) 64 | mid = element[pred_window / 2] 65 | if mid == '
': 66 | self.label_dict[tag_st] = (-1, tag_list.index(mid)) 67 | else: 68 | self.label_dict[tag_st] = (tags_ct, tag_list.index(mid)) 69 | tags_ct += 1 70 | self.n_outcomes = tags_ct 71 | # misc parameters 72 | self.verbose = verbose 73 | self.num_epochs = num_epochs 74 | self.num_predict = num_predict 75 | 76 | def make_mappings(self, data): 77 | self.feature_maps = dict([(feat, {'lookup': {'_unk_': 0}, 78 | 'reverse': ['_unk_']}) 79 | for feat in data[0][0]]) 80 | for sentence in data: 81 | for token in sentence: 82 | for feat in data[0][0]: 83 | ft = token[feat] 84 | if ft not in self.feature_maps[feat]['lookup']: 85 | self.feature_maps[feat]['lookup'][ft] = \ 86 | len(self.feature_maps[feat]['reverse']) 87 | self.feature_maps[feat]['reverse'] += [ft] 88 | 89 | def to_string(self): 90 | st = '' 91 | for k, v in self.__dict__.items(): 92 | if k not in ['feature_maps', 'label_dict']: 93 | st += k + ' --- ' + str(v) + ' \n' 94 | return st 95 | 96 | 97 | class Batch: 98 | def __init__(self): 99 | # features: {'word': 'have', 'pos': 'VB', ...} -> 100 | # [1345, 12 * num_features + 1,...] 101 | self.features = [] 102 | # tags: 'B' -> 1 103 | self.tags = [] 104 | # tags_one_hot: 'B' -> [0, 1, 0, 0, 0, 0] 105 | self.tags_one_hot = [] 106 | # tag_windows: '
_B_O' -> [0, 1, 3] 107 | self.tag_windows = [] 108 | # tag_windows_lin: '
_B_O' -> num_values * token_id + 0 * config.n_tags **2 + 1 * config.n_tags + 3 109 | self.tag_windows_lin = [] 110 | # tag_windows_one_hot: '
_B_O' -> [0, ..., 0, 1, 0, ..., 0] 111 | self.tag_windows_one_hot = [] 112 | # tag_neighbours: '
_B_O' -> [0, 3] 113 | self.tag_neighbours = [] 114 | # tag_neighbours_linearized: '
_B_O' -> num_values * token_id + 0 * config.n_tags + 3 115 | self.tag_neighbours_lin = [] 116 | # mask:
-> 0, everything else -> 1 117 | def read(self, data, start, config, fill=False): 118 | num_features = len(config.input_features) 119 | batch_data = data[start:start + config.batch_size] 120 | batch_features = [[[config.feature_maps[feat]['lookup'][token[feat]] 121 | for feat in config.input_features] 122 | for token in sentence] 123 | for sentence in batch_data] 124 | batch_labels = [[config.label_dict[token['label']] 125 | for token in sentence] 126 | for sentence in batch_data] 127 | # multiply feature indices for use in tf.nn.embedding_lookup 128 | self.features = [[[num_features * ft + i for i, ft in enumerate(word)] 129 | for word in sentence] for sentence in batch_features] 130 | self.tags = [[label[1] for label in sentence] 131 | for sentence in batch_labels] 132 | self.tags_one_hot = [[[int(x == label[1] and x > 0) # TODO: count padding tokens? 133 | for x in range(config.n_tags)] 134 | for label in sentence] 135 | for sentence in batch_labels] 136 | self.tag_windows_one_hot = [[[int(x == label[0]) 137 | for x in range(config.n_outcomes)] 138 | for label in sentence] 139 | for sentence in batch_labels] 140 | if fill: 141 | max_len = max(config.conv_window, 142 | max([len(sentence) for sentence in batch_data]) + 2) 143 | for i in range(config.batch_size): 144 | current_len = len(batch_data[i]) 145 | pre_len = (max_len - current_len) / 2 146 | post_len = max_len - pre_len - current_len 147 | self.features[i] = [range(num_features)] * pre_len + \ 148 | self.features[i] + \ 149 | [range(num_features)] * post_len 150 | self.tags[i] = [0] * pre_len + self.tags[i] + [0] * post_len 151 | self.tags_one_hot[i] = [[0] * config.n_outcomes] * pre_len + \ 152 | self.tags_one_hot[i] + \ 153 | [[0] * config.n_outcomes] * post_len 154 | self.tag_windows_one_hot[i] = [[0] * config.n_outcomes] * pre_len + \ 155 | self.tag_windows_one_hot[i] + \ 156 | [[0] * config.n_outcomes] * post_len 157 | mid = config.pot_window / 2 158 | padded_tags = [[0] * mid + sentence + [0] * mid 159 | for sentence in self.tags] 160 | # get linearized window indices 161 | self.tag_windows = [[sent[i + j] for j in range(-mid, mid + 1)] 162 | for sent in padded_tags 163 | for i in range(mid, len(sent) - mid)] 164 | n_indices = config.n_tags ** config.pot_window 165 | self.tag_windows_lin = [sum([t * (config.n_tags ** (config.pot_window - 1 - i)) 166 | for i, t in enumerate(window)]) + i * n_indices 167 | for i, window in enumerate(self.tag_windows)] 168 | # get linearized potential indices 169 | self.tag_neighbours = [[sent[i + j] 170 | for j in range(-mid, 0) + range(1, mid + 1)] 171 | for sent in padded_tags 172 | for i in range(mid, len(sent) - mid)] 173 | max_pow = config.pot_window - 1 174 | n_indices = config.n_tags ** max_pow 175 | self.tag_neighbours_lin = [sum([idx * (config.n_tags) ** (max_pow - j - 1) 176 | for j, idx in enumerate(token)]) + i * n_indices 177 | for i, token in enumerate(self.tag_neighbours)] 178 | # make mask: 179 | self.mask = [[int(tag > 0) for tag in sent] for sent in self.tags] 180 | 181 | 182 | def aggregate_labels(sentence, config): 183 | pre_tags = ['
'] * (config.pred_window / 2) 184 | sentence_ext = pre_tags + [token['label'] 185 | for token in sentence] + pre_tags 186 | for i, token in enumerate(sentence): 187 | current = token['label'] 188 | sentence[i]['label'] = '_'.join([sentence_ext[i+j] 189 | for j in range(config.pred_window)]) 190 | 191 | 192 | def read_data(file_name, features, config): 193 | sentences = [] 194 | sentence = [] 195 | f = open(file_name) 196 | c = 0 197 | for line in f: 198 | c += 1 199 | if c % 100000 == 0: 200 | print c, 'lines read' 201 | if len(line.strip()) == 0 and len(sentence) > 0: 202 | sentences += [sentence[:]] 203 | sentence = [] 204 | else: 205 | sentence += [dict(zip(features, line.strip().split('\t')))] 206 | if len(sentence) > 0: 207 | sentences += [sentence[:]] 208 | f.close() 209 | foo = [aggregate_labels(sentence, config) for sentence in sentences] 210 | return sentences 211 | 212 | 213 | def show(sentence): 214 | return ' '.join([token['word']+'/'+token['label'] for token in sentence]) 215 | 216 | 217 | # read pre_trained word vectors 218 | def read_vectors(file_name, vocab): 219 | vectors = {} 220 | f = open(file_name) 221 | dim = int(f.readline().strip().split()[1]) 222 | for line in f: 223 | w = line.split()[0] 224 | vec = [float(x) for x in line.strip().split()[1:]] 225 | vectors[w] = np.array(vec) 226 | f.close() 227 | res = np.zeros((len(vocab), dim)) 228 | for i, w in enumerate(vocab): 229 | res[i] = vectors.get(w, np.zeros(dim)) 230 | return res 231 | 232 | 233 | # extract windows from data to fit into unrolled RNN. Independent sentences 234 | def cut_and_pad(data, config): 235 | pad_token = dict([(feat, '_unk_') for feat in data[0][0]]) 236 | pad_token['label'] = '_'.join(['
'] * config.pred_window) 237 | num_steps = config.num_steps 238 | res = [] 239 | seen = 0 240 | pad_len = max(config.pred_window, config.pot_window) / 2 241 | sen = [pad_token] * pad_len + data[0] + [pad_token] * pad_len 242 | while seen < len(data): 243 | if len(sen) < num_steps: 244 | if sen[0]['label'] == '
': 245 | new_sen = ((num_steps - len(sen)) / 2) * [pad_token] + sen 246 | else: 247 | new_sen = sen 248 | new_sen = new_sen + (num_steps - len(new_sen)) * [pad_token] 249 | res += [new_sen[:]] 250 | seen += 1 251 | if seen < len(data): 252 | sen = [pad_token] * pad_len + data[seen] + [pad_token] * pad_len 253 | else: 254 | res += [sen[:num_steps]] 255 | sen = sen[(2 * num_steps) / 3:] 256 | return res 257 | 258 | 259 | # extract windows from data to fit into unrolled RNN. Continuous model 260 | def cut_batches(data, config): 261 | pad_token = dict([(feat, '_unk_') for feat in data[0][0]]) 262 | pad_token['label'] = '_'.join(['
'] * config.pred_window) 263 | padding = [pad_token] * config.pred_window 264 | new_data = padding + [tok for sentence in data 265 | for tok in sentence + padding] 266 | step_size = (config.num_steps / 2) 267 | num_cuts = len(new_data) / step_size 268 | res = [new_data[i * step_size: i * step_size + config.num_steps] 269 | for i in range(num_cuts)] 270 | res[-1] = res[-1] + [pad_token] * (config.num_steps - len(res[-1])) 271 | return res 272 | 273 | 274 | ############################################### 275 | # NN evaluation functions # 276 | ############################################### 277 | def treat_spans(spans_file): 278 | span_lists = [] 279 | f = open(spans_file) 280 | y = [] 281 | for line in f: 282 | if line.strip() == '': 283 | span_lists += [y[:]] 284 | y = [] 285 | else: 286 | lsp = line.strip().split() 287 | y = y + [(int(lsp[0]), int(lsp[1]), lsp[2])] 288 | f.close() 289 | return span_lists 290 | 291 | 292 | def find_gold(sentence): 293 | gold = [] 294 | current_gold = [] 295 | for i, token in enumerate(sentence): 296 | if token['label'] == 'B' or token['label'] == 'O': 297 | if len(current_gold) > 0: 298 | gold += [tuple(current_gold)] 299 | current_gold = [] 300 | if 'I' in token['label'] or token['label'] == 'B': 301 | current_gold += [i] 302 | if len(current_gold) > 0: 303 | gold += [tuple(current_gold)] 304 | return gold 305 | 306 | 307 | def make_scores(token, thr): 308 | res = dict([(key, val) 309 | for key, val in token.items() 310 | if key in ['O', 'OD', 'I', 'ID', 'B'] and val > thr]) 311 | return res 312 | 313 | 314 | def find_mentions(sentence, thr=0.02): 315 | scores = [make_scores(token, thr) for token in sentence] 316 | found = [] 317 | working = [] 318 | for i, score in enumerate(scores): 319 | if 'B' in score or 'O' in score: 320 | for work in working: 321 | if work[0][-1] == i-1: 322 | sc = work[1] + np.log(score.get('B', 0) + 323 | score.get('O', 0)) 324 | sc /= (work[0][-1] + 2 - work[0][0]) 325 | found += [(tuple(work[0]), np.exp(sc))] 326 | if len(score) == 1 and 'O' in score: 327 | working = [] 328 | else: 329 | new_working = [] 330 | if 'B' in score: 331 | new_working = [[[i], np.log(score['B']), False]] 332 | for work in working: 333 | for tg, sc in score.items(): 334 | if tg == 'OD': 335 | new_working += [[work[0], work[1] + np.log(sc), True]] 336 | elif tg == 'ID' and work[2]: 337 | new_working += [[work[0] + [i], work[1] + np.log(sc), 338 | True]] 339 | elif tg == 'I' and not work[2]: 340 | new_working += [[work[0] + [i], work[1] + np.log(sc), 341 | False]] 342 | working = new_working[:] 343 | if len(working) > 1000: 344 | working = sorted(working, key=lambda x: x[1], 345 | reverse=True)[:1000] 346 | return sorted(found, key=lambda x: x[1], reverse=True) 347 | 348 | 349 | def read_sentence(sentence): 350 | return (sentence, find_gold(sentence), find_mentions(sentence)) 351 | 352 | 353 | def merge(sentences, spans): 354 | res = [] 355 | sent = read_sentence(sentences[0]) 356 | span = spans[0] 357 | for i, sp in enumerate(spans): 358 | if i == 0: 359 | continue 360 | if sp[0] == span[0]: 361 | sen = read_sentence(sentences[i]) 362 | gold = sorted(list(set(sen[1] + sent[1]))) 363 | sent = (sen[0], gold, sen[2]) 364 | else: 365 | res += [(sent, span)] 366 | sent = read_sentence(sentences[i]) 367 | span = spans[i] 368 | res += [(sent, span)] 369 | return res 370 | 371 | 372 | def evaluate(merged_sentences, threshold): 373 | TP = 0 374 | FP = 0 375 | FN = 0 376 | for sentence in merged_sentences: 377 | true_mentions = sentence[0][1] 378 | tp = 0 379 | 
for pred in sentence[0][2]: 380 | if pred[1] >= threshold: 381 | if pred[0] in true_mentions: 382 | tp += 1 383 | else: 384 | FP += 1 385 | TP += tp 386 | FN += len(true_mentions) - tp 387 | if (TP + FP) == 0: 388 | prec = 0 389 | recall = 0 390 | else: 391 | prec = float(TP) / (TP + FP) 392 | recall = float(TP) / (TP + FN) 393 | if prec == 0 or recall == 0: 394 | f1 = 0 395 | else: 396 | f1 = 2 * (prec * recall) / (prec + recall) 397 | print 'TH:', threshold, '\t', 'P:', prec, '\t', 'R:', recall, '\t', 'F:', f1 398 | --------------------------------------------------------------------------------
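A worked example of the linearized neighbour indices that Batch.read builds and
pseudo_likelihood() in crf_defs.py gathers (a sketch assuming the 6-tag list from
model_config.py, so B has index 1 and O has index 3, and pot_window = 3):

    n_tags = 6
    pot_window = 3
    max_pow = pot_window - 1        # number of neighbour positions kept
    n_indices = n_tags ** max_pow   # 36 possible (left, right) neighbour contexts

    def neighbour_index(token_position, left_tag, right_tag):
        # base-n_tags encoding of the neighbour tags, offset by the position
        # of the token in the flattened (batch_size * num_steps) layout
        return token_position * n_indices + left_tag * n_tags + right_tag

    print neighbour_index(5, 1, 3)  # token 5 with neighbours B and O -> 189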