├── README.md ├── data_utils_sentihood.py ├── delayed_entnet_sentihood.py ├── main.py └── vocab_processor.py /README.md: -------------------------------------------------------------------------------- 1 | # delayed-memory-update-entnet 2 | Recurrent Entity Networks with Delayed Memory Update for Targeted Aspect-based Sentiment Analysis, published at NAACL 2018 3 | 4 | ``` 5 | Python-2.7.12 6 | TensorFlow-1.4.1 7 | Numpy-1.14.2 8 | ``` 9 | 10 | ```shell 11 | $ python main.py --embedding_file_path PATH/TO/GLOVE_EMBEDDING_FILE 12 | ``` 13 | 14 | Note that the code expects the first line of the embedding file to specify the vocabulary size and embedding dimension; such a header can be prepended with: 15 | ```shell 16 | sed -i '1i VOCAB_SIZE DIM_SIZE' PATH/TO/GLOVE_EMBEDDING_FILE 17 | ``` 18 | 19 | ``` 20 | @InProceedings{Liu+:2018, 21 | author = {Liu, Fei and Cohn, Trevor and Baldwin, Timothy}, 22 | title = {Recurrent Entity Networks with Delayed Memory Update for Targeted Aspect-based Sentiment Analysis}, 23 | booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 24 | year = {2018}, 25 | address = {New Orleans, USA}, 26 | pages = {278--283} 27 | } 28 | ``` 29 | -------------------------------------------------------------------------------- /data_utils_sentihood.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import os, sys 4 | import re 5 | import numpy as np 6 | import xml.etree.ElementTree 7 | from collections import defaultdict 8 | import nltk 9 | # from vocab_processor import * 10 | import operator 11 | import json 12 | 13 | def vectorize_data(sentences, max_sentence_len, max_target_len, max_aspect_len, 14 | word_processor, label_processor): 15 | ret_sentences = word_processor.transform( 16 | [text for _, text, _, _, _ in sentences] 17 | ) 18 | # [None, max_sentence_len] 19 | assert ret_sentences.shape[1] == max_sentence_len 20 | 21 | ret_loc_indicator = np.zeros((len(sentences), 1), dtype=np.int32) 22 | for i, (_, _, target, _, _) in enumerate(sentences): 23 | assert target.lower() in ['location1', 'location2'] 24 | ret_loc_indicator[i, :] = [0 if target.lower() == 'location1' else 1] 25 | 26 | ret_targets = word_processor.transform( 27 | [[target] for _, _, target, _, _ in sentences] 28 | ) 29 | assert ret_targets.shape[1] == max_sentence_len 30 | ret_targets = ret_targets[:, :max_target_len] 31 | 32 | ret_aspects = word_processor.transform( 33 | [aspect_term for _, _, _, aspect_term, _ in sentences] 34 | ) 35 | assert ret_aspects.shape[1] == max_sentence_len 36 | ret_aspects = ret_aspects[:, :max_aspect_len] 37 | 38 | ret_label = label_processor.transform( 39 | [label for _, _, _, _, label in sentences] 40 | ) 41 | # [None, 1] 42 | 43 | ret_ids = [sent_id for sent_id, _, _, _, _ in sentences] 44 | return ret_sentences, ret_targets, ret_loc_indicator, ret_aspects, ret_label, np.array(ret_ids, dtype=np.object) 45 | 46 | def load_task(data_dir, aspect2idx): 47 | in_file = os.path.join(data_dir, 'sentihood-train.json') 48 | train = parse_sentihood_json(in_file) 49 | in_file = os.path.join(data_dir, 'sentihood-dev.json') 50 | dev = parse_sentihood_json(in_file) 51 | in_file = os.path.join(data_dir, 'sentihood-test.json') 52 | test = parse_sentihood_json(in_file) 53 | 54 | train = convert_input(train, aspect2idx) 55 | train_aspect_idx = get_aspect_idx(train, aspect2idx) 56 | train = tokenize(train) 57 | dev = convert_input(dev, aspect2idx) 58
| dev_aspect_idx = get_aspect_idx(dev, aspect2idx) 59 | dev = tokenize(dev) 60 | test = convert_input(test, aspect2idx) 61 | test_aspect_idx = get_aspect_idx(test, aspect2idx) 62 | test = tokenize(test) 63 | 64 | return (train, train_aspect_idx), (dev, dev_aspect_idx), (test, test_aspect_idx) 65 | 66 | def get_aspect_idx(data, aspect2idx): 67 | ret = [] 68 | for _, _, _, aspect, _ in data: 69 | ret.append(aspect2idx[aspect]) 70 | assert len(data) == len(ret) 71 | return np.array(ret) 72 | 73 | def remove_replacement(data, replacement): 74 | ret_data = [] 75 | ret_indices = [] 76 | for sent in data: 77 | text = sent[0] 78 | assert replacement in text 79 | index = text.index(replacement) 80 | new_text = text[:index] + text[index+1:] 81 | ret_data.append(( 82 | new_text, sent[1], sent[2] 83 | )) 84 | ret_indices.append(index) 85 | return ret_data, ret_indices 86 | 87 | def lower_case(data): 88 | ret = [] 89 | for sent_id, text, target, aspect, sentiment in data: 90 | new_text = map(lambda x: x.lower(), text) 91 | new_aspect = map(lambda x: x.lower(), aspect) 92 | ret.append((sent_id, new_text, target.lower(), new_aspect, sentiment)) 93 | return ret 94 | 95 | def parse_sentihood_json(in_file): 96 | with open(in_file) as f: 97 | data = json.load(f) 98 | ret = [] 99 | for d in data: 100 | text = d['text'] 101 | sent_id = d['id'] 102 | opinions = [] 103 | targets = set() 104 | for opinion in d['opinions']: 105 | sentiment = opinion['sentiment'] 106 | aspect = opinion['aspect'] 107 | target_entity = opinion['target_entity'] 108 | targets.add(target_entity) 109 | opinions.append((target_entity, aspect, sentiment)) 110 | ret.append((sent_id, text, opinions)) 111 | return ret 112 | 113 | def get_all_aspects(data): 114 | aspects = set() 115 | for sent_id, text, opinions in data: 116 | for target_entity, aspect, sentiment in opinions: 117 | aspects.add(aspect) 118 | return aspects 119 | 120 | def convert_input(data, all_aspects): 121 | ret = [] 122 | for sent_id, text, opinions in data: 123 | for target_entity, aspect, sentiment in opinions: 124 | if aspect not in all_aspects: 125 | continue 126 | ret.append((sent_id, text, target_entity, aspect, sentiment)) 127 | assert 'LOCATION1' in text 128 | targets = set(['LOCATION1']) 129 | if 'LOCATION2' in text: 130 | targets.add('LOCATION2') 131 | for target in targets: 132 | aspects = set([a for t, a, _ in opinions if t == target]) 133 | none_aspects = [a for a in all_aspects if a not in aspects] 134 | for aspect in none_aspects: 135 | ret.append((sent_id, text, target, aspect, 'None')) 136 | return ret 137 | 138 | def tokenize(data): 139 | ret = [] 140 | for sent_id, text, target_entity, aspect, sentiment in data: 141 | new_text = nltk.word_tokenize(text) 142 | new_aspect = aspect.split('-') 143 | ret.append((sent_id, new_text, target_entity, new_aspect, sentiment)) 144 | return ret 145 | -------------------------------------------------------------------------------- /delayed_entnet_sentihood.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | from six.moves import range 7 | 8 | from tensorflow import name_scope 9 | 10 | from functools import partial 11 | 12 | from tensorflow.contrib.rnn import LSTMStateTuple 13 | from tensorflow.contrib.rnn.ops import gen_gru_ops 14 | from tensorflow.python.ops import init_ops 15 | 16 | 17 | class DynamicMemoryCell(tf.contrib.rnn.RNNCell): 18 | """ 19 | 
Implementation of a dynamic memory cell as a gated recurrent network. 20 | The cell's hidden state is divided into blocks and each block's weights are tied. 21 | """ 22 | 23 | def __init__(self, 24 | num_blocks, 25 | num_units_per_block, 26 | keys, 27 | initializer=None, 28 | recurrent_initializer=None, 29 | activation=tf.nn.relu,): 30 | self._num_blocks = num_blocks # M 31 | self._num_units_per_block = num_units_per_block # d 32 | self._keys = keys 33 | self._activation = activation # \phi 34 | self._initializer = initializer 35 | self._recurrent_initializer = recurrent_initializer 36 | 37 | @property 38 | def state_size(self): 39 | "Return the total state size of the cell (memory blocks plus their delayed states), across all blocks." 40 | return self._num_blocks * self._num_units_per_block * 2 41 | 42 | @property 43 | def output_size(self): 44 | "Return the total output size of the cell, across all blocks." 45 | return self._num_blocks * self._num_units_per_block 46 | 47 | def zero_state(self, batch_size, dtype): 48 | "Initialize the memory blocks to the key values and the delayed states to zeros." 49 | zero_state = tf.concat([tf.expand_dims(key, axis=0) for key in self._keys], axis=1) 50 | zero_state_batch = tf.tile(zero_state, [batch_size, 1]) 51 | return tf.concat( 52 | values=[ 53 | zero_state_batch, 54 | tf.zeros( 55 | shape=[batch_size, self._num_blocks * self._num_units_per_block], 56 | dtype=tf.float32, 57 | ), 58 | ], 59 | axis=1 60 | ) 61 | 62 | def get_gate(self, state_j, key_j, inputs, v=None, prev_a=None): 63 | """ 64 | Implements the gate (scalar for each block); the last term is the delayed-memory contribution. Equation 2: 65 | 66 | g_j <- \sigma(s_t^T h_j + s_t^T w_j + a_{t-1}^T v) 67 | """ 68 | a = tf.reduce_sum(inputs * state_j, axis=1) 69 | b = tf.reduce_sum(inputs * key_j, axis=1) 70 | assert v is not None 71 | c = tf.reduce_sum(prev_a * v, axis=1) 72 | return tf.sigmoid(a + b + c) 73 | 74 | def get_candidate(self, state_j, key_j, inputs, U, V, W, U_bias): 75 | """ 76 | Represents the new memory candidate that will be weighted by the 77 | gate value and combined with the existing memory. Equation 3: 78 | 79 | h_j^~ <- \phi(U h_j + V w_j + W s_t) 80 | """ 81 | key_V = tf.matmul(key_j, V) 82 | state_U = tf.matmul(state_j, U) + U_bias 83 | inputs_W = tf.matmul(inputs, W) 84 | return self._activation(state_U + inputs_W + key_V) 85 | 86 | def __call__(self, inputs, state, scope=None): 87 | with tf.variable_scope(scope or type(self).__name__, initializer=self._initializer): 88 | U = tf.get_variable('U', [self._num_units_per_block, self._num_units_per_block], 89 | initializer=self._recurrent_initializer) 90 | V = tf.get_variable('V', [self._num_units_per_block, self._num_units_per_block], 91 | initializer=self._recurrent_initializer) 92 | W = tf.get_variable('W', [self._num_units_per_block, self._num_units_per_block], 93 | initializer=self._recurrent_initializer) 94 | 95 | U_bias = tf.get_variable('U_bias', [self._num_units_per_block]) 96 | 97 | state, state_a = tf.split( 98 | value=state, 99 | num_or_size_splits=[ 100 | self._num_blocks * self._num_units_per_block, 101 | self._num_blocks * self._num_units_per_block 102 | ], 103 | axis=1, 104 | ) 105 | state_a = tf.split(state_a, self._num_blocks, axis=1) 106 | assert len(state_a) == self._num_blocks 107 | 108 | # Split the hidden state into blocks (each U, V, W are shared across blocks).
109 | state = tf.split(state, self._num_blocks, axis=1) 110 | assert len(state) == self._num_blocks 111 | 112 | next_states = [] 113 | next_a_states = [] 114 | for j, state_j in enumerate(state): # Hidden State (j) 115 | key_j = tf.expand_dims(self._keys[j], axis=0) 116 | candidate_j = self.get_candidate(state_j, key_j, inputs, U, V, W, U_bias) 117 | 118 | reuse = False 119 | if j != 0: 120 | reuse = True 121 | with tf.variable_scope("entnet_gru", reuse=reuse) as gru_scope: 122 | w_ru = tf.get_variable( 123 | "w_ru", 124 | [self._num_units_per_block * 2, self._num_units_per_block * 2] 125 | ) 126 | b_ru = tf.get_variable( 127 | "b_ru", [self._num_units_per_block * 2], 128 | initializer=init_ops.constant_initializer(1.0)) 129 | w_c = tf.get_variable("w_c", 130 | [self._num_units_per_block * 2, self._num_units_per_block] 131 | ) 132 | b_c = tf.get_variable( 133 | "b_c", [self._num_units_per_block], 134 | initializer=init_ops.constant_initializer(0.0)) 135 | _gru_block_cell = gen_gru_ops.gru_block_cell # pylint: disable=invalid-name 136 | _, _, _, new_a = _gru_block_cell( 137 | x=candidate_j, h_prev=state_a[j], 138 | w_ru=w_ru, w_c=w_c, b_ru=b_ru, b_c=b_c) 139 | 140 | v_a = tf.get_variable( 141 | "v_a", [self._num_units_per_block], 142 | initializer=self._initializer, 143 | ) 144 | 145 | next_a_states.append(new_a) 146 | 147 | gate_j = self.get_gate(state_j, key_j, inputs, v_a, new_a) 148 | 149 | # Equation 4: h_j <- h_j + g_j * h_j^~ 150 | # Perform an update of the hidden state (memory). 151 | state_j_next = state_j + tf.expand_dims(gate_j, -1) * candidate_j 152 | 153 | # Equation 5: h_j <- h_j / \norm{h_j} 154 | # Forget previous memories by normalization. 155 | state_j_next_norm = tf.norm( 156 | tensor=state_j_next, 157 | ord='euclidean', 158 | axis=-1, 159 | keep_dims=True) 160 | state_j_next_norm = tf.where( 161 | tf.greater(state_j_next_norm, 0.0), 162 | state_j_next_norm, 163 | tf.ones_like(state_j_next_norm)) 164 | state_j_next = state_j_next / state_j_next_norm 165 | 166 | next_states.append(state_j_next) 167 | state_next = tf.concat(next_states, axis=1) 168 | state_a_next = tf.concat(next_a_states, axis=1) 169 | return state_next, tf.concat(values=[state_next, state_a_next], axis=1) 170 | 171 | def zero_nil_slot(t, name=None): 172 | """ 173 | Overwrites the nil_slot (first row) of the input Tensor with zeros. 174 | 175 | The nil_slot is a dummy slot and should not be trained and influence 176 | the training algorithm. 177 | """ 178 | with name_scope(values=[t], name=name, default_name="zero_nil_slot") as name: 179 | t = tf.convert_to_tensor(t, name="t") 180 | s = tf.shape(t)[1] 181 | z = tf.zeros(tf.stack([1, s])) 182 | return tf.concat( 183 | axis=0, values=[z, tf.slice(t, [1, 0], [-1, -1])], name=name 184 | ) 185 | 186 | def prelu(features, alpha, scope=None): 187 | """ 188 | Implementation of [Parametric ReLU](https://arxiv.org/abs/1502.01852) borrowed from Keras. 
189 | """ 190 | with tf.variable_scope(scope, 'PReLU'): 191 | pos = tf.nn.relu(features) 192 | neg = alpha * (features - tf.abs(features)) * 0.5 193 | return pos + neg 194 | 195 | 196 | class Delayed_EntNet_Sentihood(object): 197 | def __init__(self, 198 | batch_size, vocab_size, target_len, aspect_len, sentence_len, 199 | answer_size, embedding_size, 200 | weight_tying="adj", 201 | hops=3, 202 | embedding_mat=None, 203 | update_embeddings=False, 204 | softmax_mask=True, 205 | max_grad_norm=5.0, 206 | n_keys=6, 207 | tied_keys=[], 208 | l2_final_layer=0.0, 209 | initializer=tf.contrib.layers.xavier_initializer(), 210 | optimizer=tf.train.AdamOptimizer(learning_rate=1e-2), 211 | global_step=None, 212 | session=None, 213 | name='Delayed_EntNet_Sentihood'): 214 | 215 | print name 216 | 217 | self._batch_size = batch_size 218 | self._vocab_size = vocab_size 219 | self._target_len = target_len 220 | self._aspect_len = aspect_len 221 | self._sentence_len = sentence_len 222 | self._embedding_size = embedding_size 223 | self._answer_size = answer_size 224 | self._max_grad_norm = max_grad_norm 225 | self._init = initializer 226 | self._opt = optimizer 227 | self._global_step = global_step 228 | self._name = name 229 | self._embedding_mat = embedding_mat 230 | self._update_embeddings = update_embeddings 231 | 232 | assert len(tied_keys) <= n_keys 233 | self._n_keys = n_keys 234 | self._tied_keys = tied_keys 235 | self._l2_final_layer = l2_final_layer 236 | 237 | self._build_inputs() 238 | self._build_vars() 239 | 240 | logits = self._inference_adj( 241 | self._sentences, 242 | self._targets, 243 | self._aspects, 244 | self._entnet_input_keep_prob, 245 | self._entnet_output_keep_prob, 246 | self._entnet_state_keep_prob, 247 | self._final_layer_keep_prob, 248 | ) 249 | 250 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits( 251 | logits=logits, labels=tf.cast(self._answers_one_hot, tf.float32), 252 | name="cross_entropy" 253 | ) 254 | cross_entropy_mean = tf.reduce_mean( 255 | cross_entropy, name="cross_entropy_mean" 256 | ) 257 | 258 | # l2 regularization 259 | trainable_variables = tf.trainable_variables() 260 | l2_loss_final_layer = 0.0 261 | assert self._l2_final_layer >= 0 262 | 263 | if self._l2_final_layer > 0: 264 | final_layer_weights = [ tf.nn.l2_loss(v) for v in trainable_variables 265 | if 'R:0' in v.name] 266 | assert len(final_layer_weights) == 1 267 | l2_loss_final_layer = self._l2_final_layer * tf.add_n(final_layer_weights) 268 | 269 | # loss op 270 | loss_op = cross_entropy_mean + l2_loss_final_layer 271 | 272 | # gradient pipeline 273 | grads_and_vars = self._opt.compute_gradients(loss_op) 274 | 275 | grads_and_vars = [(tf.clip_by_norm(g, self._max_grad_norm), v) for g,v in grads_and_vars] 276 | nil_grads_and_vars = [] 277 | for g, v in grads_and_vars: 278 | if v.name in self._nil_vars: 279 | nil_grads_and_vars.append((zero_nil_slot(g), v)) 280 | else: 281 | nil_grads_and_vars.append((g, v)) 282 | train_op = self._opt.apply_gradients(nil_grads_and_vars, global_step=self._global_step, name="train_op") 283 | 284 | # predict ops 285 | predict_op = tf.argmax(logits, 1, name="predict_op") 286 | predict_proba_op = tf.nn.softmax(logits, name="predict_proba_op") 287 | 288 | # assign ops 289 | self.loss_op = loss_op 290 | self.predict_op = predict_op 291 | self.predict_proba_op = predict_proba_op 292 | self.train_op = train_op 293 | 294 | init_op = tf.global_variables_initializer() 295 | self._sess = session 296 | self._sess.run(init_op, feed_dict={self._input_embedding: 
self._embedding_mat}) 297 | 298 | def _build_inputs(self): 299 | self._sentences = tf.placeholder( 300 | tf.int32, [None, self._sentence_len], 301 | name="sentences" 302 | ) 303 | self._targets = tf.placeholder( 304 | tf.int32, [None, self._target_len], 305 | name="targets" 306 | ) 307 | self._aspects = tf.placeholder( 308 | tf.int32, [None, self._aspect_len], 309 | name="aspects" 310 | ) 311 | self._answers = tf.placeholder( 312 | tf.int32, [None], 313 | name="answers" 314 | ) 315 | self._answers_one_hot = tf.one_hot( 316 | indices=self._answers, 317 | depth=self._answer_size, 318 | ) 319 | self._input_embedding = tf.placeholder( 320 | tf.float32, shape=self._embedding_mat.shape, 321 | name="input_embedding" 322 | ) 323 | self._entnet_input_keep_prob = tf.placeholder( 324 | tf.float32, shape=[], 325 | name="entnet_input_keep_prob" 326 | ) 327 | self._entnet_output_keep_prob = tf.placeholder( 328 | tf.float32, shape=[], 329 | name="entnet_output_keep_prob" 330 | ) 331 | self._entnet_state_keep_prob = tf.placeholder( 332 | tf.float32, shape=[], 333 | name="entnet_state_keep_prob" 334 | ) 335 | self._final_layer_keep_prob = tf.placeholder( 336 | tf.float32, shape=[], 337 | name="final_layer_keep_prob" 338 | ) 339 | 340 | def _build_vars(self): 341 | with tf.variable_scope(self._name): 342 | self._embedding = tf.get_variable( 343 | name="embedding", 344 | dtype=tf.float32, 345 | initializer=self._input_embedding, 346 | trainable=self._update_embeddings, 347 | ) 348 | 349 | self._free_keys_embedding = tf.get_variable( 350 | name="free_keys_embedding", 351 | dtype=tf.float32, 352 | shape=[self._n_keys - len(self._tied_keys), self._embedding_size], 353 | initializer=self._init, 354 | trainable=True, 355 | ) 356 | 357 | self._nil_vars = set([self._embedding.name]) 358 | 359 | def _mask_embedding(self, embedding): 360 | vocab_size, embedding_size = self._embedding_mat.shape 361 | embedding_mask = tf.constant( 362 | value=[0 if i == 0 else 1 for i in range(vocab_size)], 363 | shape=[vocab_size, 1], 364 | dtype=tf.float32, 365 | name="embedding_mask", 366 | ) 367 | return embedding * embedding_mask 368 | 369 | def _inference_adj(self, sentences, targets, aspects, 370 | entnet_input_keep_prob, entnet_output_keep_prob, 371 | entnet_state_keep_prob, final_layer_keep_prob): 372 | with tf.variable_scope(self._name): 373 | masked_embedding = self._mask_embedding(self._embedding) 374 | 375 | batch_size = tf.shape(sentences)[0] 376 | 377 | targets_emb = tf.nn.embedding_lookup(masked_embedding, targets) 378 | # [None, entity_size, emb_size] 379 | targets_emb = tf.reduce_mean( 380 | input_tensor=targets_emb, 381 | axis=1, 382 | keep_dims=True, 383 | ) 384 | # [None, 1, emb_size] 385 | aspects_emb = tf.nn.embedding_lookup(masked_embedding, aspects) 386 | # [None, aspect_size, emb_size] 387 | aspects_emb = tf.reduce_mean( 388 | input_tensor=aspects_emb, 389 | axis=1, 390 | keep_dims=True, 391 | ) 392 | # [None, 1, emb_size] 393 | 394 | sentences_emb = tf.nn.embedding_lookup(masked_embedding, sentences) 395 | # [None, memory_size, emb_size] 396 | 397 | sentences_len = self._sentence_length(sentences_emb) 398 | # [None] 399 | 400 | tied_keys_emb = tf.nn.embedding_lookup(masked_embedding, self._tied_keys) 401 | # [len(self._tied_keys), max_key_len, emb_size] 402 | tied_keys_emb = tf.reduce_mean( 403 | input_tensor=tied_keys_emb, 404 | axis=1, 405 | ) 406 | # [len(self._tied_keys), emb_size] 407 | free_keys_emb = self._free_keys_embedding 408 | # [n_keys - len(self._tied_keys), emb_size] 409 | 410 | keys_emb = 
tf.concat( 411 | values=[tied_keys_emb, free_keys_emb], 412 | axis=0, 413 | name="keys_emb", 414 | ) 415 | # [n_keys, emb_size] 416 | 417 | batched_keys_emb = tf.tile( 418 | input=tf.expand_dims(input=keys_emb, axis=0), 419 | multiples=[batch_size, 1, 1] 420 | ) 421 | # [None, n_keys, emb_size] 422 | 423 | keys = tf.split(keys_emb, self._n_keys, axis=0) 424 | # list of [1, emb_size] 425 | keys = [tf.squeeze(key, axis=0) for key in keys] 426 | # list of [emb_size] 427 | 428 | alpha = tf.get_variable( 429 | name='alpha', 430 | shape=self._embedding_size, 431 | initializer=tf.constant_initializer(1.0) 432 | ) 433 | activation = partial(prelu, alpha=alpha) 434 | 435 | cell_fw = DynamicMemoryCell( 436 | num_blocks=self._n_keys, 437 | num_units_per_block=self._embedding_size, 438 | keys=keys, 439 | initializer=self._init, 440 | recurrent_initializer=self._init, 441 | activation=activation, 442 | ) 443 | initial_state_fw = cell_fw.zero_state(batch_size, tf.float32) 444 | sentences_emb_shape = sentences_emb.get_shape() 445 | cell_fw = tf.contrib.rnn.DropoutWrapper( 446 | cell=cell_fw, 447 | input_keep_prob=entnet_input_keep_prob, 448 | output_keep_prob=entnet_output_keep_prob, 449 | state_keep_prob=entnet_state_keep_prob, 450 | variational_recurrent=True, 451 | input_size=(sentences_emb_shape[2]), 452 | dtype=tf.float32, 453 | ) 454 | 455 | cell_bw = DynamicMemoryCell( 456 | num_blocks=self._n_keys, 457 | num_units_per_block=self._embedding_size, 458 | keys=keys, 459 | initializer=self._init, 460 | recurrent_initializer=self._init, 461 | activation=activation, 462 | ) 463 | initial_state_bw = cell_bw.zero_state(batch_size, tf.float32) 464 | cell_bw = tf.contrib.rnn.DropoutWrapper( 465 | cell=cell_bw, 466 | input_keep_prob=entnet_input_keep_prob, 467 | output_keep_prob=entnet_output_keep_prob, 468 | state_keep_prob=entnet_state_keep_prob, 469 | variational_recurrent=True, 470 | input_size=(sentences_emb_shape[2]), 471 | dtype=tf.float32, 472 | ) 473 | (_, _), (last_state_fw, last_state_bw) = tf.nn.bidirectional_dynamic_rnn( 474 | cell_fw=cell_fw, 475 | cell_bw=cell_bw, 476 | inputs=sentences_emb, 477 | sequence_length=sentences_len, 478 | initial_state_fw=initial_state_fw, 479 | initial_state_bw=initial_state_bw, 480 | ) 481 | 482 | last_state_fw, _ = tf.split( 483 | value=last_state_fw, 484 | num_or_size_splits=[ 485 | self._n_keys * self._embedding_size, 486 | self._n_keys * self._embedding_size, 487 | ], 488 | axis=1 489 | ) 490 | last_state_bw, _ = tf.split( 491 | value=last_state_bw, 492 | num_or_size_splits=[ 493 | self._n_keys * self._embedding_size, 494 | self._n_keys * self._embedding_size, 495 | ], 496 | axis=1 497 | ) 498 | # last_state_f/bw: [None, emb_size * n_keys] 499 | 500 | last_state_fw = tf.stack( 501 | tf.split(last_state_fw, self._n_keys, axis=1), axis=1) 502 | # [None, n_keys, emb_size] 503 | last_state_bw = tf.stack( 504 | tf.split(last_state_bw, self._n_keys, axis=1), axis=1) 505 | # [None, n_keys, emb_size] 506 | 507 | last_state = last_state_fw + last_state_bw 508 | # [None, n_keys, emb_size] 509 | 510 | asp_att = tf.concat(values=[targets_emb, aspects_emb], axis=2) 511 | # [None, 1, emb_size * 2] 512 | W_asp_att = tf.get_variable( 513 | name='W_asp_att', 514 | shape=[self._embedding_size, self._embedding_size * 2], 515 | dtype=tf.float32, 516 | initializer=self._init, 517 | ) 518 | temp = tf.tensordot( 519 | batched_keys_emb, W_asp_att, [[2], [0]] 520 | ) 521 | # [None, n_keys, emb_size * 2] 522 | attention = tf.reduce_sum(temp * asp_att, axis=2) 523 | # [None, n_keys] 
524 | attention_max = tf.reduce_max(attention, axis=-1, keep_dims=True) 525 | # [None, 1] 526 | attention = tf.nn.softmax(attention - attention_max) 527 | # [None, n_keys] 528 | attention = tf.expand_dims(attention, axis=2) 529 | # [None, n_keys, 1] 530 | 531 | u = tf.reduce_sum(last_state * attention, axis=1) 532 | # [None, emb_size] 533 | 534 | R = tf.get_variable('R', [self._embedding_size, self._answer_size]) 535 | H = tf.get_variable('H', [self._embedding_size, self._embedding_size]) 536 | 537 | a = tf.squeeze(aspects_emb, axis=1) 538 | # [None, emb_size] 539 | hidden = activation(a + tf.matmul(u, H)) 540 | # [None, emb_size] 541 | hidden = tf.nn.dropout(x=hidden, keep_prob=final_layer_keep_prob) 542 | # [None, emb_size] 543 | y = tf.matmul(hidden, R) 544 | # [None, answer_size] 545 | 546 | return y 547 | 548 | def _get_mini_batch_start_end(self, n_train, batch_size=None): 549 | ''' 550 | Args: 551 | n_train: int, number of training instances 552 | batch_size: int (or None if full batch) 553 | 554 | Returns: 555 | batches: list of tuples of (start, end) of each mini batch 556 | ''' 557 | mini_batch_size = n_train if batch_size is None else batch_size 558 | batches = zip( 559 | range(0, n_train, mini_batch_size), 560 | list(range(mini_batch_size, n_train, mini_batch_size)) + [n_train] 561 | ) 562 | return batches 563 | 564 | def fit(self, sentences, targets, aspects, answers, entnet_input_keep_prob, 565 | entnet_output_keep_prob, entnet_state_keep_prob, 566 | final_layer_keep_prob, batch_size=None): 567 | assert len(sentences) == len(targets) 568 | assert len(sentences) == len(aspects) 569 | assert len(sentences) == len(answers) 570 | batches = self._get_mini_batch_start_end(len(sentences), batch_size) 571 | total_loss = 0. 572 | for start, end in batches: 573 | feed_dict = { 574 | self._sentences: sentences[start:end], 575 | self._targets: targets[start:end], 576 | self._aspects: aspects[start:end], 577 | self._answers: answers[start:end], 578 | self._entnet_input_keep_prob: entnet_input_keep_prob, 579 | self._entnet_output_keep_prob: entnet_output_keep_prob, 580 | self._entnet_state_keep_prob: entnet_state_keep_prob, 581 | self._final_layer_keep_prob: final_layer_keep_prob, 582 | } 583 | loss, _ = self._sess.run( 584 | [self.loss_op, self.train_op], 585 | feed_dict=feed_dict 586 | ) 587 | total_loss += loss * len(sentences[start:end]) 588 | return total_loss 589 | 590 | def predict(self, sentences, targets, aspects, batch_size=None): 591 | assert len(sentences) == len(targets) 592 | assert len(sentences) == len(aspects) 593 | batches = self._get_mini_batch_start_end(len(sentences), batch_size) 594 | predictions, predictions_prob = [], [] 595 | for start, end in batches: 596 | feed_dict = { 597 | self._sentences: sentences[start:end], 598 | self._targets: targets[start:end], 599 | self._aspects: aspects[start:end], 600 | self._entnet_input_keep_prob: 1.0, 601 | self._entnet_output_keep_prob: 1.0, 602 | self._entnet_state_keep_prob: 1.0, 603 | self._final_layer_keep_prob: 1.0, 604 | } 605 | prediction, prediction_prob = self._sess.run( 606 | [self.predict_op, self.predict_proba_op], 607 | feed_dict=feed_dict 608 | ) 609 | predictions.extend(prediction) 610 | predictions_prob.extend(prediction_prob) 611 | return predictions, np.array(predictions_prob) 612 | 613 | def _sentence_length(self, sentences): 614 | ''' 615 | sentences: (None, sentence_len, embedding_size) 616 | ''' 617 | used = tf.sign(tf.reduce_max(tf.abs(sentences), reduction_indices=2)) 618 | length = tf.reduce_sum(used,
reduction_indices=1) 619 | length = tf.cast(length, tf.int32) 620 | return length 621 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | from data_utils_sentihood import * 5 | from vocab_processor import * 6 | from sklearn import metrics 7 | from delayed_entnet_sentihood import Delayed_EntNet_Sentihood 8 | from itertools import chain 9 | from six.moves import range 10 | from collections import defaultdict 11 | 12 | import tensorflow as tf 13 | import numpy as np 14 | 15 | import sys 16 | import random 17 | import logging 18 | import cPickle as pickle 19 | 20 | import pprint 21 | pp = pprint.PrettyPrinter() 22 | 23 | tf.flags.DEFINE_float("learning_rate", 0.05, "Learning rate for the optimizer.") 24 | tf.flags.DEFINE_float("max_grad_norm", 5.0, "Clip gradients to this norm.") 25 | tf.flags.DEFINE_integer("evaluation_interval", 1, "Evaluate and print results every x epochs") 26 | tf.flags.DEFINE_integer("batch_size", 128, "Batch size for training.") 27 | tf.flags.DEFINE_integer("epochs", 800, "Number of epochs to train for.") 28 | tf.flags.DEFINE_integer("embedding_size", 20, "Embedding size for embedding matrices.") 29 | tf.flags.DEFINE_integer("sentence_len", 50, "Maximum len of sentence.") 30 | tf.flags.DEFINE_string("task", "Sentihood", "Sentihood") 31 | tf.flags.DEFINE_integer("random_state", 67, "Random state.") 32 | tf.flags.DEFINE_string("data_dir", "data/sentihood/", "Directory containing Sentihood data") 33 | tf.flags.DEFINE_string("opt", "ftrl", "Optimizer [ftrl]") 34 | tf.flags.DEFINE_string("embedding_file_path", None, "Embedding file path [None]") 35 | tf.flags.DEFINE_boolean("update_embeddings", False, "Update embeddings [False]") 36 | tf.flags.DEFINE_boolean("case_folding", True, "Case folding [True]") 37 | tf.flags.DEFINE_integer("n_cpus", 6, "N CPUs [6]") 38 | tf.flags.DEFINE_integer("n_keys", 7, "Number of keys [7]") 39 | tf.flags.DEFINE_integer("n_tied", 2, "Number of tied keys [2]") 40 | tf.flags.DEFINE_float("entnet_input_keep_prob", 0.8, "entnet input keep prob [0.8]") 41 | tf.flags.DEFINE_float("entnet_output_keep_prob", 1.0, "entnet output keep prob [1.0]") 42 | tf.flags.DEFINE_float("entnet_state_keep_prob", 1.0, "entnet state keep prob [1.0]") 43 | tf.flags.DEFINE_float("final_layer_keep_prob", 0.8, "final layer keep prob [0.8]") 44 | tf.flags.DEFINE_float("l2_final_layer", 1e-3, "Lambda L2 final layer [1e-3]") 45 | 46 | FLAGS = tf.flags.FLAGS 47 | 48 | if __name__ == "__main__": 49 | logger = logging.getLogger() 50 | logger.setLevel(logging.DEBUG) 51 | ch = logging.StreamHandler() 52 | ch.setLevel(logging.DEBUG) 53 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 54 | ch.setFormatter(formatter) 55 | logger.addHandler(ch) 56 | 57 | logger.info(" ".join(sys.argv)) 58 | logger.info("Started Task: %s" % FLAGS.task) 59 | 60 | logger.info(pp.pformat(FLAGS.__flags)) 61 | 62 | session_conf = tf.ConfigProto( 63 | intra_op_parallelism_threads=FLAGS.n_cpus, 64 | inter_op_parallelism_threads=FLAGS.n_cpus, 65 | ) 66 | 67 | aspect2idx = { 68 | 'general': 0, 69 | 'price': 1, 70 | 'transit-location': 2, 71 | 'safety': 3, 72 | } 73 | 74 | assert FLAGS.n_keys >= 2 75 | assert FLAGS.n_tied == 2 76 | 77 | with tf.Session(config=session_conf) as sess: 78 | 79 | np.random.seed(FLAGS.random_state) 80 | 81 | # task data 82 | (train, 
train_aspect_idx), (val, val_aspect_idx), (test, test_aspect_idx) = load_task(FLAGS.data_dir, aspect2idx) 83 | 84 | if FLAGS.case_folding: 85 | train = lower_case(train) 86 | val = lower_case(val) 87 | test = lower_case(test) 88 | 89 | data = train + val + test 90 | 91 | max_sentence_len = max(map(lambda x: len(x[1]), data)) 92 | max_sentence_len = min(FLAGS.sentence_len, max_sentence_len) 93 | logger.info('Max sentence len: %d' % max_sentence_len) 94 | max_target_len = 1 # should be one 95 | max_aspect_len = max(map(lambda x: len(x), [d[3] for d in data])) 96 | assert max_aspect_len == 2 97 | logger.info('Max target size: %d' % max_target_len) 98 | logger.info('Max aspect size: %d' % max_aspect_len) 99 | 100 | assert FLAGS.embedding_file_path is not None 101 | word_vocab = EmbeddingVocabulary( 102 | in_file=FLAGS.embedding_file_path, 103 | ) 104 | word_vocab_processor = EmbeddingVocabularyProcessor( 105 | max_document_length=max_sentence_len, 106 | vocabulary=word_vocab, 107 | ) 108 | embedding_mat = word_vocab.embeddings 109 | embedding_size = word_vocab.embeddings.shape[1] 110 | 111 | label_vocab = LabelVocabulary() 112 | label_vocab_processor = LabelVocabularyProcessor( 113 | vocabulary=label_vocab, 114 | min_frequency=0, 115 | ) 116 | 117 | positive_idx = label_vocab.get('Positive') 118 | negative_idx = label_vocab.get('Negative') 119 | none_idx = label_vocab.get('None') 120 | 121 | train_sentences, train_targets, train_loc_indicators, train_aspects, train_labels, train_ids = vectorize_data( 122 | train, 123 | max_sentence_len, 124 | max_target_len, 125 | max_aspect_len, 126 | word_vocab_processor, 127 | label_vocab_processor, 128 | ) 129 | 130 | val_sentences, val_targets, val_loc_indicators, val_aspects, val_labels, val_ids = vectorize_data( 131 | val, 132 | max_sentence_len, 133 | max_target_len, 134 | max_aspect_len, 135 | word_vocab_processor, 136 | label_vocab_processor, 137 | ) 138 | 139 | test_sentences, test_targets, test_loc_indicators, test_aspects, test_labels, test_ids = vectorize_data( 140 | test, 141 | max_sentence_len, 142 | max_target_len, 143 | max_aspect_len, 144 | word_vocab_processor, 145 | label_vocab_processor, 146 | ) 147 | 148 | target_terms = [['location1'], ['location2']] 149 | target_terms = word_vocab_processor.transform(target_terms)[:, :max_target_len] 150 | 151 | sentence_len = max_sentence_len 152 | vocab_size = len(word_vocab) 153 | answer_size = len(label_vocab) 154 | 155 | logger.info("Training sentences shape " + str(train_sentences.shape)) 156 | logger.info("Training targets shape " + str(train_targets.shape)) 157 | logger.info("Training aspects shape " + str(train_aspects.shape)) 158 | logger.info("Validation sentences shape " + str(val_sentences.shape)) 159 | logger.info("Validation targets shape " + str(val_targets.shape)) 160 | logger.info("Validation aspects shape " + str(val_aspects.shape)) 161 | logger.info("Test sentences shape " + str(test_sentences.shape)) 162 | logger.info("Test targets shape " + str(test_targets.shape)) 163 | logger.info("Test aspects shape " + str(test_aspects.shape)) 164 | 165 | # params 166 | n_train = train_sentences.shape[0] 167 | n_val = val_sentences.shape[0] 168 | n_test = test_sentences.shape[0] 169 | 170 | logger.info("Training Size %d" % n_train) 171 | logger.info("Validation Size %d" % n_val) 172 | logger.info("Testing Size %d" % n_test) 173 | 174 | tf.set_random_seed(FLAGS.random_state) 175 | batch_size = FLAGS.batch_size 176 | 177 | global_step = None 178 | optimizer = None 179 | 180 | train_positive_idx 
= np.where(train_labels == positive_idx)[0] 181 | train_negative_idx = np.where(train_labels == negative_idx)[0] 182 | train_none_idx = np.where(train_labels == none_idx)[0] 183 | 184 | train_positive_sentences = train_sentences[train_positive_idx] 185 | train_positive_targets = train_targets[train_positive_idx] 186 | train_positive_aspects = train_aspects[train_positive_idx] 187 | train_positive_labels = train_labels[train_positive_idx] 188 | 189 | train_negative_sentences = train_sentences[train_negative_idx] 190 | train_negative_targets = train_targets[train_negative_idx] 191 | train_negative_aspects = train_aspects[train_negative_idx] 192 | train_negative_labels = train_labels[train_negative_idx] 193 | 194 | train_none_sentences = train_sentences[train_none_idx] 195 | train_none_targets = train_targets[train_none_idx] 196 | train_none_aspects = train_aspects[train_none_idx] 197 | train_none_labels = train_labels[train_none_idx] 198 | 199 | assert len(train_none_idx) > len(train_positive_idx) 200 | assert len(train_positive_idx) > len(train_negative_idx) 201 | 202 | n_positive_train = len(train_positive_idx) 203 | n_negative_train = len(train_negative_idx) 204 | n_none_train = len(train_none_idx) 205 | n_train = n_negative_train # down-sampling 206 | 207 | logger.info("Positive training Size %d" % n_positive_train) 208 | logger.info("Negative training Size %d" % n_negative_train) 209 | logger.info("None training Size %d" % n_none_train) 210 | 211 | if FLAGS.opt == 'adam': 212 | optimizer = tf.train.AdamOptimizer( 213 | learning_rate=FLAGS.learning_rate, epsilon=FLAGS.epsilon) 214 | elif FLAGS.opt == 'ftrl': 215 | optimizer = tf.train.FtrlOptimizer( 216 | learning_rate=FLAGS.learning_rate 217 | ) 218 | 219 | batches = zip( 220 | range(0, max(1, n_train-batch_size), batch_size), 221 | range(batch_size, max(batch_size + 1, n_train), batch_size) 222 | ) 223 | batches = [(start, end) for start, end in batches] 224 | 225 | model = Delayed_EntNet_Sentihood( 226 | batch_size, 227 | vocab_size, 228 | max_target_len, 229 | max_aspect_len, 230 | sentence_len, 231 | answer_size, 232 | embedding_size, 233 | session=sess, 234 | embedding_mat=word_vocab.embeddings, 235 | update_embeddings=FLAGS.update_embeddings, 236 | n_keys=FLAGS.n_keys, 237 | tied_keys=target_terms, 238 | l2_final_layer=FLAGS.l2_final_layer, 239 | max_grad_norm=FLAGS.max_grad_norm, 240 | optimizer=optimizer, 241 | global_step=global_step 242 | ) 243 | for t in range(1, FLAGS.epochs+1): 244 | np.random.shuffle(batches) 245 | total_cost = 0.0 246 | total_training_instances = 0 247 | 248 | for start, end in batches: 249 | # train negative 250 | sentences = train_negative_sentences[start:end] 251 | targets = train_negative_targets[start:end] 252 | aspects = train_negative_aspects[start:end] 253 | answers = train_negative_labels[start:end] 254 | cost_t = model.fit(sentences, targets, aspects, answers, 255 | FLAGS.entnet_input_keep_prob, 256 | FLAGS.entnet_output_keep_prob, 257 | FLAGS.entnet_state_keep_prob, 258 | FLAGS.final_layer_keep_prob) 259 | total_cost += cost_t 260 | total_training_instances += len(train_negative_sentences[start:end]) 261 | 262 | # train positive 263 | positive_start = random.randint(0, n_positive_train - batch_size) 264 | positive_end = positive_start + batch_size 265 | sentences = train_positive_sentences[positive_start:positive_end] 266 | targets = train_positive_targets[positive_start:positive_end] 267 | aspects = train_positive_aspects[positive_start:positive_end] 268 | answers = 
train_positive_labels[positive_start:positive_end] 269 | cost_t = model.fit(sentences, targets, aspects, answers, 270 | FLAGS.entnet_input_keep_prob, 271 | FLAGS.entnet_output_keep_prob, 272 | FLAGS.entnet_state_keep_prob, 273 | FLAGS.final_layer_keep_prob) 274 | total_cost += cost_t 275 | total_training_instances += len(train_positive_sentences[positive_start:positive_end]) 276 | 277 | # train none 278 | none_start = random.randint(0, n_none_train - batch_size) 279 | none_end = none_start + batch_size 280 | sentences = train_none_sentences[none_start:none_end] 281 | targets = train_none_targets[none_start:none_end] 282 | aspects = train_none_aspects[none_start:none_end] 283 | answers = train_none_labels[none_start:none_end] 284 | cost_t = model.fit(sentences, targets, aspects, answers, 285 | FLAGS.entnet_input_keep_prob, 286 | FLAGS.entnet_output_keep_prob, 287 | FLAGS.entnet_state_keep_prob, 288 | FLAGS.final_layer_keep_prob) 289 | 290 | total_cost += cost_t 291 | total_training_instances += len(train_none_sentences[none_start:none_end]) 292 | 293 | if t % FLAGS.evaluation_interval == 0: 294 | train_preds, train_preds_prob = model.predict( 295 | train_sentences, train_targets, train_aspects, 296 | batch_size=batch_size, 297 | ) 298 | 299 | train_acc = metrics.accuracy_score( 300 | train_labels, np.array(train_preds) 301 | ) 302 | 303 | val_preds, val_preds_prob = model.predict( 304 | val_sentences, val_targets, val_aspects, 305 | batch_size=batch_size, 306 | ) 307 | 308 | val_acc = metrics.accuracy_score( 309 | val_labels, np.array(val_preds) 310 | ) 311 | 312 | test_preds, test_preds_prob = model.predict( 313 | test_sentences, test_targets, test_aspects, 314 | batch_size=batch_size 315 | ) 316 | test_acc = metrics.accuracy_score( 317 | test_labels, np.array(test_preds) 318 | ) 319 | 320 | assert total_training_instances != 0 321 | 322 | logger.info('-----------------------') 323 | logger.info('Epoch %d' % t) 324 | logger.info('Avg Cost: %f' % (total_cost / total_training_instances)) 325 | logger.info('Training Accuracy: %f' % train_acc) 326 | logger.info('Validation Accuracy: %f' % val_acc) 327 | logger.info('Test Accuracy: %f' % test_acc) 328 | logger.info('-----------------------') 329 | -------------------------------------------------------------------------------- /vocab_processor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import collections 4 | 5 | class LabelVocabulary(tf.contrib.learn.preprocessing.CategoricalVocabulary): 6 | def __init__(self, support_reverse=True): 7 | self._mapping = {} 8 | self._support_reverse = support_reverse 9 | if support_reverse: 10 | self._reverse_mapping = [] 11 | self._freq = collections.defaultdict(int) 12 | self._freeze = False 13 | 14 | def get(self, category): 15 | """Returns the category's id in the vocabulary. 16 | If category is new, creates a new id for it. 17 | Args: 18 | category: string or integer to lookup in vocabulary. 19 | Returns: 20 | integer, id in the vocabulary. 21 | """ 22 | if category not in self._mapping: 23 | if self._freeze: 24 | assert False # should not happen 25 | # return self._mapping[self._unknown_token] 26 | self._mapping[category] = len(self._mapping) 27 | if self._support_reverse: 28 | self._reverse_mapping.append(category) 29 | return self._mapping[category] 30 | 31 | def trim(self, min_frequency, max_frequency=-1): 32 | """Trims vocabulary for minimum frequency. 33 | Remaps ids to 1..n in order of decreasing frequency,
34 | where n is the number of elements left. 35 | Args: 36 | min_frequency: minimum frequency to keep. 37 | max_frequency: optional, maximum frequency to keep. 38 | Useful to remove very frequent categories (like stop words). 39 | """ 40 | # no need to trim for label vocab 41 | return 42 | 43 | class EmbeddingVocabulary(tf.contrib.learn.preprocessing.CategoricalVocabulary): 44 | def __init__(self, 45 | in_file, 46 | binary=False, 47 | padding_token="", 48 | unknown_token="", 49 | support_reverse=True): 50 | self._unknown_token = unknown_token 51 | self._padding_token = padding_token 52 | self._mapping = {padding_token: 0, unknown_token: 1} 53 | self._support_reverse = support_reverse 54 | if support_reverse: 55 | self._reverse_mapping = [padding_token, unknown_token] 56 | # no need to count frequency 57 | # self._freq = collections.defaultdict(int) 58 | self._load_embeddings(in_file, binary=binary) 59 | # freeze the vocabulary once the embeddings have been loaded 60 | self._freeze = True 61 | 62 | def _load_embeddings(self, in_file, binary=False): 63 | # emb = word2vec.Word2Vec.load_word2vec_format(in_file, binary=binary) 64 | with open(in_file) as in_f: 65 | nb_words, nb_dim = None, None 66 | for line in in_f: 67 | line = line.strip() 68 | attrs = line.split(' ') 69 | if len(attrs) == 2: 70 | nb_words = int(attrs[0]) 71 | nb_dim = int(attrs[1]) 72 | self._embeddings = np.zeros((nb_words + 2, nb_dim), dtype=np.float32) 73 | continue 74 | word = attrs[0] 75 | emb = map(float, attrs[1:]) 76 | self._mapping[word] = len(self._mapping) if not self._support_reverse else len(self._reverse_mapping) 77 | self._embeddings[self._mapping[word], :] = emb 78 | if self._support_reverse: 79 | self._reverse_mapping.append(word) 80 | 81 | unk = np.mean(self._embeddings[2:], axis=0) 82 | self._embeddings[self._mapping[self._unknown_token]] = unk 83 | 84 | def _get_mean_embeddings(self, emb): 85 | syn0 = emb.syn0 86 | return np.mean(syn0, axis=0) 87 | 88 | @property 89 | def embeddings(self): 90 | return self._embeddings 91 | 92 | def freeze(self, freeze=True): 93 | """Freezes the vocabulary, after which new words return unknown token id. 94 | Args: 95 | freeze: True to freeze, False to unfreeze. 96 | """ 97 | self._freeze = True # should always be True after __init__ 98 | 99 | def get(self, category): 100 | """Returns word's id in the vocabulary. 101 | Once the vocabulary is frozen (always the case after __init__), unknown words map to the unknown token id. 102 | Args: 103 | category: string or integer to lookup in vocabulary. 104 | Returns: 105 | integer, id in the vocabulary. 106 | """ 107 | if category not in self._mapping: 108 | if self._freeze: 109 | return self._mapping[self._unknown_token] 110 | assert False # should not happen 111 | self._mapping[category] = len(self._mapping) 112 | if self._support_reverse: 113 | self._reverse_mapping.append(category) 114 | return self._mapping[category] 115 | 116 | def add(self, category, count=1): 117 | """Adds count of the category to the frequency table. 118 | Args: 119 | category: string or integer, category to add frequency to. 120 | count: optional integer, how many to add. 121 | """ 122 | # do nothing 123 | return 124 | 125 | def trim(self, min_frequency, max_frequency=-1): 126 | """Trims vocabulary for minimum frequency. 127 | Remaps ids to 1..n in order of decreasing frequency, 128 | where n is the number of elements left. 129 | Args: 130 | min_frequency: minimum frequency to keep. 131 | max_frequency: optional, maximum frequency to keep. 132 | Useful to remove very frequent categories (like stop words).
133 | """ 134 | # don't trim embedding vocab 135 | return 136 | 137 | class EmbeddingVocabularyProcessor(tf.contrib.learn.preprocessing.VocabularyProcessor): 138 | 139 | def __init__(self, 140 | max_document_length, 141 | vocabulary, 142 | min_frequency=0, 143 | tokenizer_fn=None): 144 | self.max_document_length = max_document_length 145 | self.vocabulary_ = vocabulary # EmbeddingVocabulary object 146 | self.min_frequency = min_frequency 147 | 148 | @staticmethod 149 | def tokenize(sentence): 150 | # for value in iterator: 151 | # yield value.split(' ') 152 | return sentence.split(' ') 153 | 154 | def fit(self, sentences, unused_y=None): 155 | # do nothing given that the embeddings have already been 156 | # initialized in EmbeddingVocabulary 157 | for sentence in sentences: 158 | for token in sentence: 159 | self.vocabulary_.add(token) 160 | if self.min_frequency > 0: 161 | self.vocabulary_.trim(self.min_frequency) 162 | self.vocabulary_.freeze() 163 | return self 164 | 165 | def transform(self, sentences): 166 | ''' 167 | Args: 168 | sentences: list of list of words 169 | Returns: 170 | indices: list of list of word indices 171 | ''' 172 | word_ids = np.zeros((len(sentences), self.max_document_length), np.int32) 173 | for i, sentence in enumerate(sentences): 174 | # word_ids = np.zeros(self.max_document_length, np.int32) 175 | for j, token in enumerate(sentence): 176 | if j >= self.max_document_length: 177 | break 178 | word_ids[i, j] = self.vocabulary_.get(token) 179 | return word_ids 180 | 181 | def reverse(self, sentences): 182 | """Reverses output of vocabulary mapping to words. 183 | Args: 184 | sentences: list of list of word indices 185 | Returns: 186 | output: list of list of words 187 | """ 188 | output = [] 189 | for sentence in sentences: 190 | output.append( 191 | [self.vocabulary_.reverse(word_id) for word_id in sentence] 192 | ) 193 | return output 194 | 195 | class LabelVocabularyProcessor(tf.contrib.learn.preprocessing.VocabularyProcessor): 196 | 197 | def __init__(self, 198 | # max_document_length, 199 | vocabulary, 200 | min_frequency=0, 201 | tokenizer_fn=None): 202 | self.vocabulary_ = vocabulary # EmbeddingVocabulary object 203 | self.min_frequency = min_frequency 204 | 205 | @staticmethod 206 | def tokenize(sentence): 207 | # for value in iterator: 208 | # yield value.split(' ') 209 | return sentence.split(' ') 210 | 211 | def fit(self, sentences, unused_y=None): 212 | # do nothing given that the embeddings have already been 213 | # initialized in EmbeddingVocabulary 214 | for label in sentences: 215 | self.vocabulary_.add(token) 216 | if self.min_frequency > 0: 217 | self.vocabulary_.trim(self.min_frequency) 218 | self.vocabulary_.freeze() 219 | return self 220 | 221 | def transform(self, sentences): 222 | ''' 223 | Args: 224 | sentences: list of list of words 225 | Returns: 226 | indices: list of list of word indices 227 | ''' 228 | label_ids = np.full((len(sentences)), -1, dtype=np.int32) 229 | for i, label in enumerate(sentences): 230 | label_ids[i] = self.vocabulary_.get(label) 231 | # for j, token in enumerate(sentence): 232 | # if j >= self.max_document_length: 233 | # break 234 | # label_ids[i, j] = self.vocabulary_.get(token) 235 | return label_ids 236 | 237 | def reverse(self, sentences): 238 | """Reverses output of vocabulary mapping to words. 
239 | Args: 240 | sentences: list of label ids 241 | Returns: 242 | output: list of labels 243 | """ 244 | output = [] 245 | for label_id in sentences: 246 | output.append( 247 | self.vocabulary_.reverse(label_id) 248 | ) 249 | return output 250 | --------------------------------------------------------------------------------
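For illustration, a minimal usage sketch assuming a hypothetical `glove.6B.300d.txt` file (400000-word vocabulary, 300-dimensional vectors). `EmbeddingVocabulary._load_embeddings` treats any line with exactly two space-separated fields as the `VOCAB_SIZE DIM_SIZE` header and every other line as `word v1 ... vD`, so the header must be in place before training:

```shell
# Hypothetical file name and sizes; substitute your own GloVe file.
# 1) Prepend the "VOCAB_SIZE DIM_SIZE" header expected by _load_embeddings.
sed -i '1i 400000 300' glove.6B.300d.txt
# 2) Train and evaluate; the Sentihood JSON files are read from --data_dir.
python main.py --embedding_file_path glove.6B.300d.txt --data_dir data/sentihood/
```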