├── README.md ├── data_utils_sentihood.py ├── delayed_entnet_sentihood.py ├── main.py └── vocab_processor.py /README.md: -------------------------------------------------------------------------------- 1 | # delayed-memory-update-entnet 2 | Recurrent Entity Networks with Delayed Memory Update for Targeted Aspect-based Sentiment Analysis, published at NAACL 2018 3 | 4 | ``` 5 | Python-2.7.12 6 | TensorFlow-1.4.1 7 | Numpy-1.14.2 8 | ``` 9 | 10 | ```shell 11 | $ python main.py --embedding_file_path PATH/TO/GLOVE_EMBEDDING_FILE 12 | ``` 13 | 14 | Note that the code expects the first line of the embedding file to specify the vocabulary size and embedding dimension; such a header can be prepended with: 15 | ```shell 16 | sed -i '1i VOCAB_SIZE DIM_SIZE' PATH/TO/GLOVE_EMBEDDING_FILE 17 | ``` 18 | 19 | ``` 20 | @InProceedings{Liu+:2018, 21 | author = {Liu, Fei and Cohn, Trevor and Baldwin, Timothy}, 22 | title = {Recurrent Entity Networks with Delayed Memory Update for Targeted Aspect-based Sentiment Analysis}, 23 | booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies}, 24 | year = {2018}, 25 | address = {New Orleans, USA}, 26 | pages = {278--283} 27 | } 28 | ``` 29 | -------------------------------------------------------------------------------- /data_utils_sentihood.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | import os, sys 4 | import re 5 | import numpy as np 6 | import xml.etree.ElementTree 7 | from collections import defaultdict 8 | import nltk 9 | # from vocab_processor import * 10 | import operator 11 | import json 12 | 13 | def vectorize_data(sentences, max_sentence_len, max_target_len, max_aspect_len, 14 | word_processor, label_processor): 15 | ret_sentences = word_processor.transform( 16 | [text for _, text, _, _, _ in sentences] 17 | ) 18 | # [None, max_sentence_len] 19 | assert ret_sentences.shape[1] == max_sentence_len 20 | 21 | ret_loc_indicator = np.zeros((len(sentences), 1), dtype=np.int32) 22 | for i, (_, _, target, _, _) in enumerate(sentences): 23 | assert target.lower() in ['location1', 'location2'] 24 | ret_loc_indicator[i, :] = [0 if target.lower() == 'location1' else 1] 25 | 26 | ret_targets = word_processor.transform( 27 | [[target] for _, _, target, _, _ in sentences] 28 | ) 29 | assert ret_targets.shape[1] == max_sentence_len 30 | ret_targets = ret_targets[:, :max_target_len] 31 | 32 | ret_aspects = word_processor.transform( 33 | [aspect_term for _, _, _, aspect_term, _ in sentences] 34 | ) 35 | assert ret_aspects.shape[1] == max_sentence_len 36 | ret_aspects = ret_aspects[:, :max_aspect_len] 37 | 38 | ret_label = label_processor.transform( 39 | [label for _, _, _, _, label in sentences] 40 | ) 41 | # [None, 1] 42 | 43 | ret_ids = [sent_id for sent_id, _, _, _, _ in sentences] 44 | return ret_sentences, ret_targets, ret_loc_indicator, ret_aspects, ret_label, np.array(ret_ids, dtype=np.object) 45 | 46 | def load_task(data_dir, aspect2idx): 47 | in_file = os.path.join(data_dir, 'sentihood-train.json') 48 | train = parse_sentihood_json(in_file) 49 | in_file = os.path.join(data_dir, 'sentihood-dev.json') 50 | dev = parse_sentihood_json(in_file) 51 | in_file = os.path.join(data_dir, 'sentihood-test.json') 52 | test = parse_sentihood_json(in_file) 53 | 54 | train = convert_input(train, aspect2idx) 55 | train_aspect_idx = get_aspect_idx(train, aspect2idx) 56 | train = tokenize(train) 57 | dev = convert_input(dev, aspect2idx) 58
| dev_aspect_idx = get_aspect_idx(dev, aspect2idx) 59 | dev = tokenize(dev) 60 | test = convert_input(test, aspect2idx) 61 | test_aspect_idx = get_aspect_idx(test, aspect2idx) 62 | test = tokenize(test) 63 | 64 | return (train, train_aspect_idx), (dev, dev_aspect_idx), (test, test_aspect_idx) 65 | 66 | def get_aspect_idx(data, aspect2idx): 67 | ret = [] 68 | for _, _, _, aspect, _ in data: 69 | ret.append(aspect2idx[aspect]) 70 | assert len(data) == len(ret) 71 | return np.array(ret) 72 | 73 | def remove_replacement(data, replacement): 74 | ret_data = [] 75 | ret_indices = [] 76 | for sent in data: 77 | text = sent[0] 78 | assert replacement in text 79 | index = text.index(replacement) 80 | new_text = text[:index] + text[index+1:] 81 | ret_data.append(( 82 | new_text, sent[1], sent[2] 83 | )) 84 | ret_indices.append(index) 85 | return ret_data, ret_indices 86 | 87 | def lower_case(data): 88 | ret = [] 89 | for sent_id, text, target, aspect, sentiment in data: 90 | new_text = map(lambda x: x.lower(), text) 91 | new_aspect = map(lambda x: x.lower(), aspect) 92 | ret.append((sent_id, new_text, target.lower(), new_aspect, sentiment)) 93 | return ret 94 | 95 | def parse_sentihood_json(in_file): 96 | with open(in_file) as f: 97 | data = json.load(f) 98 | ret = [] 99 | for d in data: 100 | text = d['text'] 101 | sent_id = d['id'] 102 | opinions = [] 103 | targets = set() 104 | for opinion in d['opinions']: 105 | sentiment = opinion['sentiment'] 106 | aspect = opinion['aspect'] 107 | target_entity = opinion['target_entity'] 108 | targets.add(target_entity) 109 | opinions.append((target_entity, aspect, sentiment)) 110 | ret.append((sent_id, text, opinions)) 111 | return ret 112 | 113 | def get_all_aspects(data): 114 | aspects = set() 115 | for sent_id, text, opinions in data: 116 | for target_entity, aspect, sentiment in opinions: 117 | aspects.add(aspect) 118 | return aspects 119 | 120 | def convert_input(data, all_aspects): 121 | ret = [] 122 | for sent_id, text, opinions in data: 123 | for target_entity, aspect, sentiment in opinions: 124 | if aspect not in all_aspects: 125 | continue 126 | ret.append((sent_id, text, target_entity, aspect, sentiment)) 127 | assert 'LOCATION1' in text 128 | targets = set(['LOCATION1']) 129 | if 'LOCATION2' in text: 130 | targets.add('LOCATION2') 131 | for target in targets: 132 | aspects = set([a for t, a, _ in opinions if t == target]) 133 | none_aspects = [a for a in all_aspects if a not in aspects] 134 | for aspect in none_aspects: 135 | ret.append((sent_id, text, target, aspect, 'None')) 136 | return ret 137 | 138 | def tokenize(data): 139 | ret = [] 140 | for sent_id, text, target_entity, aspect, sentiment in data: 141 | new_text = nltk.word_tokenize(text) 142 | new_aspect = aspect.split('-') 143 | ret.append((sent_id, new_text, target_entity, new_aspect, sentiment)) 144 | return ret 145 | -------------------------------------------------------------------------------- /delayed_entnet_sentihood.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | from six.moves import range 7 | 8 | from tensorflow import name_scope 9 | 10 | from functools import partial 11 | 12 | from tensorflow.contrib.rnn import LSTMStateTuple 13 | from tensorflow.contrib.rnn.ops import gen_gru_ops 14 | from tensorflow.python.ops import init_ops 15 | 16 | 17 | class DynamicMemoryCell(tf.contrib.rnn.RNNCell): 18 | """ 19 | 
Implementation of a dynamic memory cell as a gated recurrent network. 20 | The cell's hidden state is divided into blocks and each block's weights are tied. 21 | """ 22 | 23 | def __init__(self, 24 | num_blocks, 25 | num_units_per_block, 26 | keys, 27 | initializer=None, 28 | recurrent_initializer=None, 29 | activation=tf.nn.relu,): 30 | self._num_blocks = num_blocks # M 31 | self._num_units_per_block = num_units_per_block # d 32 | self._keys = keys 33 | self._activation = activation # \phi 34 | self._initializer = initializer 35 | self._recurrent_initializer = recurrent_initializer 36 | 37 | @property 38 | def state_size(self): 39 | "Return the total state size of the cell (memory blocks plus their delayed states), across all blocks." 40 | return self._num_blocks * self._num_units_per_block * 2 41 | 42 | @property 43 | def output_size(self): 44 | "Return the total output size of the cell, across all blocks." 45 | return self._num_blocks * self._num_units_per_block 46 | 47 | def zero_state(self, batch_size, dtype): 48 | "Initialize the memory blocks to the key values and the delayed states to zeros." 49 | zero_state = tf.concat([tf.expand_dims(key, axis=0) for key in self._keys], axis=1) 50 | zero_state_batch = tf.tile(zero_state, [batch_size, 1]) 51 | return tf.concat( 52 | values=[ 53 | zero_state_batch, 54 | tf.zeros( 55 | shape=[batch_size, self._num_blocks * self._num_units_per_block], 56 | dtype=tf.float32, 57 | ), 58 | ], 59 | axis=1 60 | ) 61 | 62 | def get_gate(self, state_j, key_j, inputs, v=None, prev_a=None): 63 | """ 64 | Implements the gate (scalar for each block); the last term is the delayed-memory contribution. Equation 2: 65 | 66 | g_j <- \sigma(s_t^T h_j + s_t^T w_j + a_{t-1}^T v) 67 | """ 68 | a = tf.reduce_sum(inputs * state_j, axis=1) 69 | b = tf.reduce_sum(inputs * key_j, axis=1) 70 | assert v is not None 71 | c = tf.reduce_sum(prev_a * v, axis=1) 72 | return tf.sigmoid(a + b + c) 73 | 74 | def get_candidate(self, state_j, key_j, inputs, U, V, W, U_bias): 75 | """ 76 | Represents the new memory candidate that will be weighted by the 77 | gate value and combined with the existing memory. Equation 3: 78 | 79 | h_j^~ <- \phi(U h_j + V w_j + W s_t) 80 | """ 81 | key_V = tf.matmul(key_j, V) 82 | state_U = tf.matmul(state_j, U) + U_bias 83 | inputs_W = tf.matmul(inputs, W) 84 | return self._activation(state_U + inputs_W + key_V) 85 | 86 | def __call__(self, inputs, state, scope=None): 87 | with tf.variable_scope(scope or type(self).__name__, initializer=self._initializer): 88 | U = tf.get_variable('U', [self._num_units_per_block, self._num_units_per_block], 89 | initializer=self._recurrent_initializer) 90 | V = tf.get_variable('V', [self._num_units_per_block, self._num_units_per_block], 91 | initializer=self._recurrent_initializer) 92 | W = tf.get_variable('W', [self._num_units_per_block, self._num_units_per_block], 93 | initializer=self._recurrent_initializer) 94 | 95 | U_bias = tf.get_variable('U_bias', [self._num_units_per_block]) 96 | 97 | state, state_a = tf.split( 98 | value=state, 99 | num_or_size_splits=[ 100 | self._num_blocks * self._num_units_per_block, 101 | self._num_blocks * self._num_units_per_block 102 | ], 103 | axis=1, 104 | ) 105 | state_a = tf.split(state_a, self._num_blocks, axis=1) 106 | assert len(state_a) == self._num_blocks 107 | 108 | # Split the hidden state into blocks (each U, V, W are shared across blocks).
109 | state = tf.split(state, self._num_blocks, axis=1) 110 | assert len(state) == self._num_blocks 111 | 112 | next_states = [] 113 | next_a_states = [] 114 | for j, state_j in enumerate(state): # Hidden State (j) 115 | key_j = tf.expand_dims(self._keys[j], axis=0) 116 | candidate_j = self.get_candidate(state_j, key_j, inputs, U, V, W, U_bias) 117 | 118 | reuse = False 119 | if j != 0: 120 | reuse = True 121 | with tf.variable_scope("entnet_gru", reuse=reuse) as gru_scope: 122 | w_ru = tf.get_variable( 123 | "w_ru", 124 | [self._num_units_per_block * 2, self._num_units_per_block * 2] 125 | ) 126 | b_ru = tf.get_variable( 127 | "b_ru", [self._num_units_per_block * 2], 128 | initializer=init_ops.constant_initializer(1.0)) 129 | w_c = tf.get_variable("w_c", 130 | [self._num_units_per_block * 2, self._num_units_per_block] 131 | ) 132 | b_c = tf.get_variable( 133 | "b_c", [self._num_units_per_block], 134 | initializer=init_ops.constant_initializer(0.0)) 135 | _gru_block_cell = gen_gru_ops.gru_block_cell # pylint: disable=invalid-name 136 | _, _, _, new_a = _gru_block_cell( 137 | x=candidate_j, h_prev=state_a[j], 138 | w_ru=w_ru, w_c=w_c, b_ru=b_ru, b_c=b_c) 139 | 140 | v_a = tf.get_variable( 141 | "v_a", [self._num_units_per_block], 142 | initializer=self._initializer, 143 | ) 144 | 145 | next_a_states.append(new_a) 146 | 147 | gate_j = self.get_gate(state_j, key_j, inputs, v_a, new_a) 148 | 149 | # Equation 4: h_j <- h_j + g_j * h_j^~ 150 | # Perform an update of the hidden state (memory). 151 | state_j_next = state_j + tf.expand_dims(gate_j, -1) * candidate_j 152 | 153 | # Equation 5: h_j <- h_j / \norm{h_j} 154 | # Forget previous memories by normalization. 155 | state_j_next_norm = tf.norm( 156 | tensor=state_j_next, 157 | ord='euclidean', 158 | axis=-1, 159 | keep_dims=True) 160 | state_j_next_norm = tf.where( 161 | tf.greater(state_j_next_norm, 0.0), 162 | state_j_next_norm, 163 | tf.ones_like(state_j_next_norm)) 164 | state_j_next = state_j_next / state_j_next_norm 165 | 166 | next_states.append(state_j_next) 167 | state_next = tf.concat(next_states, axis=1) 168 | state_a_next = tf.concat(next_a_states, axis=1) 169 | return state_next, tf.concat(values=[state_next, state_a_next], axis=1) 170 | 171 | def zero_nil_slot(t, name=None): 172 | """ 173 | Overwrites the nil_slot (first row) of the input Tensor with zeros. 174 | 175 | The nil_slot is a dummy slot and should not be trained and influence 176 | the training algorithm. 177 | """ 178 | with name_scope(values=[t], name=name, default_name="zero_nil_slot") as name: 179 | t = tf.convert_to_tensor(t, name="t") 180 | s = tf.shape(t)[1] 181 | z = tf.zeros(tf.stack([1, s])) 182 | return tf.concat( 183 | axis=0, values=[z, tf.slice(t, [1, 0], [-1, -1])], name=name 184 | ) 185 | 186 | def prelu(features, alpha, scope=None): 187 | """ 188 | Implementation of [Parametric ReLU](https://arxiv.org/abs/1502.01852) borrowed from Keras. 
189 | """ 190 | with tf.variable_scope(scope, 'PReLU'): 191 | pos = tf.nn.relu(features) 192 | neg = alpha * (features - tf.abs(features)) * 0.5 193 | return pos + neg 194 | 195 | 196 | class Delayed_EntNet_Sentihood(object): 197 | def __init__(self, 198 | batch_size, vocab_size, target_len, aspect_len, sentence_len, 199 | answer_size, embedding_size, 200 | weight_tying="adj", 201 | hops=3, 202 | embedding_mat=None, 203 | update_embeddings=False, 204 | softmax_mask=True, 205 | max_grad_norm=5.0, 206 | n_keys=6, 207 | tied_keys=[], 208 | l2_final_layer=0.0, 209 | initializer=tf.contrib.layers.xavier_initializer(), 210 | optimizer=tf.train.AdamOptimizer(learning_rate=1e-2), 211 | global_step=None, 212 | session=None, 213 | name='Delayed_EntNet_Sentihood'): 214 | 215 | print name 216 | 217 | self._batch_size = batch_size 218 | self._vocab_size = vocab_size 219 | self._target_len = target_len 220 | self._aspect_len = aspect_len 221 | self._sentence_len = sentence_len 222 | self._embedding_size = embedding_size 223 | self._answer_size = answer_size 224 | self._max_grad_norm = max_grad_norm 225 | self._init = initializer 226 | self._opt = optimizer 227 | self._global_step = global_step 228 | self._name = name 229 | self._embedding_mat = embedding_mat 230 | self._update_embeddings = update_embeddings 231 | 232 | assert len(tied_keys) <= n_keys 233 | self._n_keys = n_keys 234 | self._tied_keys = tied_keys 235 | self._l2_final_layer = l2_final_layer 236 | 237 | self._build_inputs() 238 | self._build_vars() 239 | 240 | logits = self._inference_adj( 241 | self._sentences, 242 | self._targets, 243 | self._aspects, 244 | self._entnet_input_keep_prob, 245 | self._entnet_output_keep_prob, 246 | self._entnet_state_keep_prob, 247 | self._final_layer_keep_prob, 248 | ) 249 | 250 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits( 251 | logits=logits, labels=tf.cast(self._answers_one_hot, tf.float32), 252 | name="cross_entropy" 253 | ) 254 | cross_entropy_mean = tf.reduce_mean( 255 | cross_entropy, name="cross_entropy_mean" 256 | ) 257 | 258 | # l2 regularization 259 | trainable_variables = tf.trainable_variables() 260 | l2_loss_final_layer = 0.0 261 | assert self._l2_final_layer >= 0 262 | 263 | if self._l2_final_layer > 0: 264 | final_layer_weights = [ tf.nn.l2_loss(v) for v in trainable_variables 265 | if 'R:0' in v.name] 266 | assert len(final_layer_weights) == 1 267 | l2_loss_final_layer = self._l2_final_layer * tf.add_n(final_layer_weights) 268 | 269 | # loss op 270 | loss_op = cross_entropy_mean + l2_loss_final_layer 271 | 272 | # gradient pipeline 273 | grads_and_vars = self._opt.compute_gradients(loss_op) 274 | 275 | grads_and_vars = [(tf.clip_by_norm(g, self._max_grad_norm), v) for g,v in grads_and_vars] 276 | nil_grads_and_vars = [] 277 | for g, v in grads_and_vars: 278 | if v.name in self._nil_vars: 279 | nil_grads_and_vars.append((zero_nil_slot(g), v)) 280 | else: 281 | nil_grads_and_vars.append((g, v)) 282 | train_op = self._opt.apply_gradients(nil_grads_and_vars, global_step=self._global_step, name="train_op") 283 | 284 | # predict ops 285 | predict_op = tf.argmax(logits, 1, name="predict_op") 286 | predict_proba_op = tf.nn.softmax(logits, name="predict_proba_op") 287 | 288 | # assign ops 289 | self.loss_op = loss_op 290 | self.predict_op = predict_op 291 | self.predict_proba_op = predict_proba_op 292 | self.train_op = train_op 293 | 294 | init_op = tf.global_variables_initializer() 295 | self._sess = session 296 | self._sess.run(init_op, feed_dict={self._input_embedding: 
self._embedding_mat}) 297 | 298 | def _build_inputs(self): 299 | self._sentences = tf.placeholder( 300 | tf.int32, [None, self._sentence_len], 301 | name="sentences" 302 | ) 303 | self._targets = tf.placeholder( 304 | tf.int32, [None, self._target_len], 305 | name="targets" 306 | ) 307 | self._aspects = tf.placeholder( 308 | tf.int32, [None, self._aspect_len], 309 | name="aspects" 310 | ) 311 | self._answers = tf.placeholder( 312 | tf.int32, [None], 313 | name="answers" 314 | ) 315 | self._answers_one_hot = tf.one_hot( 316 | indices=self._answers, 317 | depth=self._answer_size, 318 | ) 319 | self._input_embedding = tf.placeholder( 320 | tf.float32, shape=self._embedding_mat.shape, 321 | name="input_embedding" 322 | ) 323 | self._entnet_input_keep_prob = tf.placeholder( 324 | tf.float32, shape=[], 325 | name="entnet_input_keep_prob" 326 | ) 327 | self._entnet_output_keep_prob = tf.placeholder( 328 | tf.float32, shape=[], 329 | name="entnet_output_keep_prob" 330 | ) 331 | self._entnet_state_keep_prob = tf.placeholder( 332 | tf.float32, shape=[], 333 | name="entnet_state_keep_prob" 334 | ) 335 | self._final_layer_keep_prob = tf.placeholder( 336 | tf.float32, shape=[], 337 | name="final_layer_keep_prob" 338 | ) 339 | 340 | def _build_vars(self): 341 | with tf.variable_scope(self._name): 342 | self._embedding = tf.get_variable( 343 | name="embedding", 344 | dtype=tf.float32, 345 | initializer=self._input_embedding, 346 | trainable=self._update_embeddings, 347 | ) 348 | 349 | self._free_keys_embedding = tf.get_variable( 350 | name="free_keys_embedding", 351 | dtype=tf.float32, 352 | shape=[self._n_keys - len(self._tied_keys), self._embedding_size], 353 | initializer=self._init, 354 | trainable=True, 355 | ) 356 | 357 | self._nil_vars = set([self._embedding.name]) 358 | 359 | def _mask_embedding(self, embedding): 360 | vocab_size, embedding_size = self._embedding_mat.shape 361 | embedding_mask = tf.constant( 362 | value=[0 if i == 0 else 1 for i in range(vocab_size)], 363 | shape=[vocab_size, 1], 364 | dtype=tf.float32, 365 | name="embedding_mask", 366 | ) 367 | return embedding * embedding_mask 368 | 369 | def _inference_adj(self, sentences, targets, aspects, 370 | entnet_input_keep_prob, entnet_output_keep_prob, 371 | entnet_state_keep_prob, final_layer_keep_prob): 372 | with tf.variable_scope(self._name): 373 | masked_embedding = self._mask_embedding(self._embedding) 374 | 375 | batch_size = tf.shape(sentences)[0] 376 | 377 | targets_emb = tf.nn.embedding_lookup(masked_embedding, targets) 378 | # [None, entity_size, emb_size] 379 | targets_emb = tf.reduce_mean( 380 | input_tensor=targets_emb, 381 | axis=1, 382 | keep_dims=True, 383 | ) 384 | # [None, 1, emb_size] 385 | aspects_emb = tf.nn.embedding_lookup(masked_embedding, aspects) 386 | # [None, aspect_size, emb_size] 387 | aspects_emb = tf.reduce_mean( 388 | input_tensor=aspects_emb, 389 | axis=1, 390 | keep_dims=True, 391 | ) 392 | # [None, 1, emb_size] 393 | 394 | sentences_emb = tf.nn.embedding_lookup(masked_embedding, sentences) 395 | # [None, memory_size, emb_size] 396 | 397 | sentences_len = self._sentence_length(sentences_emb) 398 | # [None] 399 | 400 | tied_keys_emb = tf.nn.embedding_lookup(masked_embedding, self._tied_keys) 401 | # [len(self._tied_keys), max_key_len, emb_size] 402 | tied_keys_emb = tf.reduce_mean( 403 | input_tensor=tied_keys_emb, 404 | axis=1, 405 | ) 406 | # [len(self._tied_keys), emb_size] 407 | free_keys_emb = self._free_keys_embedding 408 | # [n_keys - len(self._tied_keys), emb_size] 409 | 410 | keys_emb = 
tf.concat( 411 | values=[tied_keys_emb, free_keys_emb], 412 | axis=0, 413 | name="keys_emb", 414 | ) 415 | # [n_keys, emb_size] 416 | 417 | batched_keys_emb = tf.tile( 418 | input=tf.expand_dims(input=keys_emb, axis=0), 419 | multiples=[batch_size, 1, 1] 420 | ) 421 | # [None, n_keys, emb_size] 422 | 423 | keys = tf.split(keys_emb, self._n_keys, axis=0) 424 | # list of [1, emb_size] 425 | keys = [tf.squeeze(key, axis=0) for key in keys] 426 | # list of [emb_size] 427 | 428 | alpha = tf.get_variable( 429 | name='alpha', 430 | shape=self._embedding_size, 431 | initializer=tf.constant_initializer(1.0) 432 | ) 433 | activation = partial(prelu, alpha=alpha) 434 | 435 | cell_fw = DynamicMemoryCell( 436 | num_blocks=self._n_keys, 437 | num_units_per_block=self._embedding_size, 438 | keys=keys, 439 | initializer=self._init, 440 | recurrent_initializer=self._init, 441 | activation=activation, 442 | ) 443 | initial_state_fw = cell_fw.zero_state(batch_size, tf.float32) 444 | sentences_emb_shape = sentences_emb.get_shape() 445 | cell_fw = tf.contrib.rnn.DropoutWrapper( 446 | cell=cell_fw, 447 | input_keep_prob=entnet_input_keep_prob, 448 | output_keep_prob=entnet_output_keep_prob, 449 | state_keep_prob=entnet_state_keep_prob, 450 | variational_recurrent=True, 451 | input_size=(sentences_emb_shape[2]), 452 | dtype=tf.float32, 453 | ) 454 | 455 | cell_bw = DynamicMemoryCell( 456 | num_blocks=self._n_keys, 457 | num_units_per_block=self._embedding_size, 458 | keys=keys, 459 | initializer=self._init, 460 | recurrent_initializer=self._init, 461 | activation=activation, 462 | ) 463 | initial_state_bw = cell_bw.zero_state(batch_size, tf.float32) 464 | cell_bw = tf.contrib.rnn.DropoutWrapper( 465 | cell=cell_bw, 466 | input_keep_prob=entnet_input_keep_prob, 467 | output_keep_prob=entnet_output_keep_prob, 468 | state_keep_prob=entnet_state_keep_prob, 469 | variational_recurrent=True, 470 | input_size=(sentences_emb_shape[2]), 471 | dtype=tf.float32, 472 | ) 473 | (_, _), (last_state_fw, last_state_bw) = tf.nn.bidirectional_dynamic_rnn( 474 | cell_fw=cell_fw, 475 | cell_bw=cell_bw, 476 | inputs=sentences_emb, 477 | sequence_length=sentences_len, 478 | initial_state_fw=initial_state_fw, 479 | initial_state_bw=initial_state_bw, 480 | ) 481 | 482 | last_state_fw, _ = tf.split( 483 | value=last_state_fw, 484 | num_or_size_splits=[ 485 | self._n_keys * self._embedding_size, 486 | self._n_keys * self._embedding_size, 487 | ], 488 | axis=1 489 | ) 490 | last_state_bw, _ = tf.split( 491 | value=last_state_bw, 492 | num_or_size_splits=[ 493 | self._n_keys * self._embedding_size, 494 | self._n_keys * self._embedding_size, 495 | ], 496 | axis=1 497 | ) 498 | # last_state_f/bw: [None, emb_size * n_keys] 499 | 500 | last_state_fw = tf.stack( 501 | tf.split(last_state_fw, self._n_keys, axis=1), axis=1) 502 | # [None, n_keys, emb_size] 503 | last_state_bw = tf.stack( 504 | tf.split(last_state_bw, self._n_keys, axis=1), axis=1) 505 | # [None, n_keys, emb_size] 506 | 507 | last_state = last_state_fw + last_state_bw 508 | # [None, n_keys, emb_size] 509 | 510 | asp_att = tf.concat(values=[targets_emb, aspects_emb], axis=2) 511 | # [None, 1, emb_size * 2] 512 | W_asp_att = tf.get_variable( 513 | name='W_asp_att', 514 | shape=[self._embedding_size, self._embedding_size * 2], 515 | dtype=tf.float32, 516 | initializer=self._init, 517 | ) 518 | temp = tf.tensordot( 519 | batched_keys_emb, W_asp_att, [[2], [0]] 520 | ) 521 | # [None, n_keys, emb_size * 2] 522 | attention = tf.reduce_sum(temp * asp_att, axis=2) 523 | # [None, n_keys] 
524 | attention_max = tf.reduce_max(attention, axis=-1, keep_dims=True) 525 | # [None, 1] 526 | attention = tf.nn.softmax(attention - attention_max) 527 | # [None, n_keys] 528 | attention = tf.expand_dims(attention, axis=2) 529 | # [None, n_keys, 1] 530 | 531 | u = tf.reduce_sum(last_state * attention, axis=1) 532 | # [None, emb_size] 533 | 534 | R = tf.get_variable('R', [self._embedding_size, self._answer_size]) 535 | H = tf.get_variable('H', [self._embedding_size, self._embedding_size]) 536 | 537 | a = tf.squeeze(aspects_emb, axis=1) 538 | # [None, emb_size] 539 | hidden = activation(a + tf.matmul(u, H)) 540 | # [None, emb_size] 541 | hidden = tf.nn.dropout(x=hidden, keep_prob=final_layer_keep_prob) 542 | # [None, emb_size] 543 | y = tf.matmul(hidden, R) 544 | # [None, answer_size] 545 | 546 | return y 547 | 548 | def _get_mini_batch_start_end(self, n_train, batch_size=None): 549 | ''' 550 | Args: 551 | n_train: int, number of training instances 552 | batch_size: int (or None if full batch) 553 | 554 | Returns: 555 | batches: list of tuples of (start, end) of each mini batch 556 | ''' 557 | mini_batch_size = n_train if batch_size is None else batch_size 558 | batches = zip( 559 | range(0, n_train, mini_batch_size), 560 | list(range(mini_batch_size, n_train, mini_batch_size)) + [n_train] 561 | ) 562 | return batches 563 | 564 | def fit(self, sentences, targets, aspects, answers, entnet_input_keep_prob, 565 | entnet_output_keep_prob, entnet_state_keep_prob, 566 | final_layer_keep_prob, batch_size=None): 567 | assert len(sentences) == len(targets) 568 | assert len(sentences) == len(aspects) 569 | assert len(sentences) == len(answers) 570 | batches = self._get_mini_batch_start_end(len(sentences), batch_size) 571 | total_loss = 0. 572 | for start, end in batches: 573 | feed_dict = { 574 | self._sentences: sentences[start:end], 575 | self._targets: targets[start:end], 576 | self._aspects: aspects[start:end], 577 | self._answers: answers[start:end], 578 | self._entnet_input_keep_prob: entnet_input_keep_prob, 579 | self._entnet_output_keep_prob: entnet_output_keep_prob, 580 | self._entnet_state_keep_prob: entnet_state_keep_prob, 581 | self._final_layer_keep_prob: final_layer_keep_prob, 582 | } 583 | loss, _ = self._sess.run( 584 | [self.loss_op, self.train_op], 585 | feed_dict=feed_dict 586 | ) 587 | total_loss += loss * len(sentences[start:end]) 588 | return total_loss 589 | 590 | def predict(self, sentences, targets, aspects, batch_size=None): 591 | assert len(sentences) == len(targets) 592 | assert len(sentences) == len(aspects) 593 | batches = self._get_mini_batch_start_end(len(sentences), batch_size) 594 | predictions, predictions_prob = [], [] 595 | for start, end in batches: 596 | feed_dict = { 597 | self._sentences: sentences[start:end], 598 | self._targets: targets[start:end], 599 | self._aspects: aspects[start:end], 600 | self._entnet_input_keep_prob: 1.0, 601 | self._entnet_output_keep_prob: 1.0, 602 | self._entnet_state_keep_prob: 1.0, 603 | self._final_layer_keep_prob: 1.0, 604 | } 605 | prediction, prediction_prob = self._sess.run( 606 | [self.predict_op, self.predict_proba_op], 607 | feed_dict=feed_dict 608 | ) 609 | predictions.extend(prediction) 610 | predictions_prob.extend(prediction_prob) 611 | return predictions, np.array(predictions_prob) 612 | 613 | def _sentence_length(self, sentences): 614 | ''' 615 | sentences: (None, sentence_len, embedding_size) 616 | ''' 617 | used = tf.sign(tf.reduce_max(tf.abs(sentences), reduction_indices=2)) 618 | length = tf.reduce_sum(used,
reduction_indices=1) 619 | length = tf.cast(length, tf.int32) 620 | return length 621 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | 4 | from data_utils_sentihood import * 5 | from vocab_processor import * 6 | from sklearn import metrics 7 | from delayed_entnet_sentihood import Delayed_EntNet_Sentihood 8 | from itertools import chain 9 | from six.moves import range 10 | from collections import defaultdict 11 | 12 | import tensorflow as tf 13 | import numpy as np 14 | 15 | import sys 16 | import random 17 | import logging 18 | import cPickle as pickle 19 | 20 | import pprint 21 | pp = pprint.PrettyPrinter() 22 | 23 | tf.flags.DEFINE_float("learning_rate", 0.05, "Learning rate for the optimizer.") 24 | tf.flags.DEFINE_float("max_grad_norm", 5.0, "Clip gradients to this norm.") 25 | tf.flags.DEFINE_integer("evaluation_interval", 1, "Evaluate and print results every x epochs") 26 | tf.flags.DEFINE_integer("batch_size", 128, "Batch size for training.") 27 | tf.flags.DEFINE_integer("epochs", 800, "Number of epochs to train for.") 28 | tf.flags.DEFINE_integer("embedding_size", 20, "Embedding size for embedding matrices.") 29 | tf.flags.DEFINE_integer("sentence_len", 50, "Maximum len of sentence.") 30 | tf.flags.DEFINE_string("task", "Sentihood", "Sentihood") 31 | tf.flags.DEFINE_integer("random_state", 67, "Random state.") 32 | tf.flags.DEFINE_string("data_dir", "data/sentihood/", "Directory containing Sentihood data") 33 | tf.flags.DEFINE_string("opt", "ftrl", "Optimizer [ftrl]") 34 | tf.flags.DEFINE_string("embedding_file_path", None, "Embedding file path [None]") 35 | tf.flags.DEFINE_boolean("update_embeddings", False, "Update embeddings [False]") 36 | tf.flags.DEFINE_boolean("case_folding", True, "Case folding [True]") 37 | tf.flags.DEFINE_integer("n_cpus", 6, "N CPUs [6]") 38 | tf.flags.DEFINE_integer("n_keys", 7, "Number of keys [7]") 39 | tf.flags.DEFINE_integer("n_tied", 2, "Number of tied keys [2]") 40 | tf.flags.DEFINE_float("entnet_input_keep_prob", 0.8, "entnet input keep prob [0.8]") 41 | tf.flags.DEFINE_float("entnet_output_keep_prob", 1.0, "entnet output keep prob [1.0]") 42 | tf.flags.DEFINE_float("entnet_state_keep_prob", 1.0, "entnet state keep prob [1.0]") 43 | tf.flags.DEFINE_float("final_layer_keep_prob", 0.8, "final layer keep prob [0.8]") 44 | tf.flags.DEFINE_float("l2_final_layer", 1e-3, "Lambda L2 final layer [1e-3]") 45 | 46 | FLAGS = tf.flags.FLAGS 47 | 48 | if __name__ == "__main__": 49 | logger = logging.getLogger() 50 | logger.setLevel(logging.DEBUG) 51 | ch = logging.StreamHandler() 52 | ch.setLevel(logging.DEBUG) 53 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 54 | ch.setFormatter(formatter) 55 | logger.addHandler(ch) 56 | 57 | logger.info(" ".join(sys.argv)) 58 | logger.info("Started Task: %s" % FLAGS.task) 59 | 60 | logger.info(pp.pformat(FLAGS.__flags)) 61 | 62 | session_conf = tf.ConfigProto( 63 | intra_op_parallelism_threads=FLAGS.n_cpus, 64 | inter_op_parallelism_threads=FLAGS.n_cpus, 65 | ) 66 | 67 | aspect2idx = { 68 | 'general': 0, 69 | 'price': 1, 70 | 'transit-location': 2, 71 | 'safety': 3, 72 | } 73 | 74 | assert FLAGS.n_keys >= 2 75 | assert FLAGS.n_tied == 2 76 | 77 | with tf.Session(config=session_conf) as sess: 78 | 79 | np.random.seed(FLAGS.random_state) 80 | 81 | # task data 82 | (train, 
train_aspect_idx), (val, val_aspect_idx), (test, test_aspect_idx) = load_task(FLAGS.data_dir, aspect2idx) 83 | 84 | if FLAGS.case_folding: 85 | train = lower_case(train) 86 | val = lower_case(val) 87 | test = lower_case(test) 88 | 89 | data = train + val + test 90 | 91 | max_sentence_len = max(map(lambda x: len(x[1]), data)) 92 | max_sentence_len = min(FLAGS.sentence_len, max_sentence_len) 93 | logger.info('Max sentence len: %d' % max_sentence_len) 94 | max_target_len = 1 # should be one 95 | max_aspect_len = max(map(lambda x: len(x), [d[3] for d in data])) 96 | assert max_aspect_len == 2 97 | logger.info('Max target size: %d' % max_target_len) 98 | logger.info('Max aspect size: %d' % max_aspect_len) 99 | 100 | assert FLAGS.embedding_file_path is not None 101 | word_vocab = EmbeddingVocabulary( 102 | in_file=FLAGS.embedding_file_path, 103 | ) 104 | word_vocab_processor = EmbeddingVocabularyProcessor( 105 | max_document_length=max_sentence_len, 106 | vocabulary=word_vocab, 107 | ) 108 | embedding_mat = word_vocab.embeddings 109 | embedding_size = word_vocab.embeddings.shape[1] 110 | 111 | label_vocab = LabelVocabulary() 112 | label_vocab_processor = LabelVocabularyProcessor( 113 | vocabulary=label_vocab, 114 | min_frequency=0, 115 | ) 116 | 117 | positive_idx = label_vocab.get('Positive') 118 | negative_idx = label_vocab.get('Negative') 119 | none_idx = label_vocab.get('None') 120 | 121 | train_sentences, train_targets, train_loc_indicators, train_aspects, train_labels, train_ids = vectorize_data( 122 | train, 123 | max_sentence_len, 124 | max_target_len, 125 | max_aspect_len, 126 | word_vocab_processor, 127 | label_vocab_processor, 128 | ) 129 | 130 | val_sentences, val_targets, val_loc_indicators, val_aspects, val_labels, val_ids = vectorize_data( 131 | val, 132 | max_sentence_len, 133 | max_target_len, 134 | max_aspect_len, 135 | word_vocab_processor, 136 | label_vocab_processor, 137 | ) 138 | 139 | test_sentences, test_targets, test_loc_indicators, test_aspects, test_labels, test_ids = vectorize_data( 140 | test, 141 | max_sentence_len, 142 | max_target_len, 143 | max_aspect_len, 144 | word_vocab_processor, 145 | label_vocab_processor, 146 | ) 147 | 148 | target_terms = [['location1'], ['location2']] 149 | target_terms = word_vocab_processor.transform(target_terms)[:, :max_target_len] 150 | 151 | sentence_len = max_sentence_len 152 | vocab_size = len(word_vocab) 153 | answer_size = len(label_vocab) 154 | 155 | logger.info("Training sentences shape " + str(train_sentences.shape)) 156 | logger.info("Training targets shape " + str(train_targets.shape)) 157 | logger.info("Training aspects shape " + str(train_aspects.shape)) 158 | logger.info("Validation sentences shape " + str(val_sentences.shape)) 159 | logger.info("Validation targets shape " + str(val_targets.shape)) 160 | logger.info("Validation aspects shape " + str(val_aspects.shape)) 161 | logger.info("Test sentences shape " + str(test_sentences.shape)) 162 | logger.info("Test targets shape " + str(test_targets.shape)) 163 | logger.info("Test aspects shape " + str(test_aspects.shape)) 164 | 165 | # params 166 | n_train = train_sentences.shape[0] 167 | n_val = val_sentences.shape[0] 168 | n_test = test_sentences.shape[0] 169 | 170 | logger.info("Training Size %d" % n_train) 171 | logger.info("Validation Size %d" % n_val) 172 | logger.info("Testing Size %d" % n_test) 173 | 174 | tf.set_random_seed(FLAGS.random_state) 175 | batch_size = FLAGS.batch_size 176 | 177 | global_step = None 178 | optimizer = None 179 | 180 | train_positive_idx 
= np.where(train_labels == positive_idx)[0] 181 | train_negative_idx = np.where(train_labels == negative_idx)[0] 182 | train_none_idx = np.where(train_labels == none_idx)[0] 183 | 184 | train_positive_sentences = train_sentences[train_positive_idx] 185 | train_positive_targets = train_targets[train_positive_idx] 186 | train_positive_aspects = train_aspects[train_positive_idx] 187 | train_positive_labels = train_labels[train_positive_idx] 188 | 189 | train_negative_sentences = train_sentences[train_negative_idx] 190 | train_negative_targets = train_targets[train_negative_idx] 191 | train_negative_aspects = train_aspects[train_negative_idx] 192 | train_negative_labels = train_labels[train_negative_idx] 193 | 194 | train_none_sentences = train_sentences[train_none_idx] 195 | train_none_targets = train_targets[train_none_idx] 196 | train_none_aspects = train_aspects[train_none_idx] 197 | train_none_labels = train_labels[train_none_idx] 198 | 199 | assert len(train_none_idx) > len(train_positive_idx) 200 | assert len(train_positive_idx) > len(train_negative_idx) 201 | 202 | n_positive_train = len(train_positive_idx) 203 | n_negative_train = len(train_negative_idx) 204 | n_none_train = len(train_none_idx) 205 | n_train = n_negative_train # down-sampling 206 | 207 | logger.info("Positive training Size %d" % n_positive_train) 208 | logger.info("Negative training Size %d" % n_negative_train) 209 | logger.info("None training Size %d" % n_none_train) 210 | 211 | if FLAGS.opt == 'adam': 212 | optimizer = tf.train.AdamOptimizer( 213 | learning_rate=FLAGS.learning_rate, epsilon=FLAGS.epsilon) 214 | elif FLAGS.opt == 'ftrl': 215 | optimizer = tf.train.FtrlOptimizer( 216 | learning_rate=FLAGS.learning_rate 217 | ) 218 | 219 | batches = zip( 220 | range(0, max(1, n_train-batch_size), batch_size), 221 | range(batch_size, max(batch_size + 1, n_train), batch_size) 222 | ) 223 | batches = [(start, end) for start, end in batches] 224 | 225 | model = Delayed_EntNet_Sentihood( 226 | batch_size, 227 | vocab_size, 228 | max_target_len, 229 | max_aspect_len, 230 | sentence_len, 231 | answer_size, 232 | embedding_size, 233 | session=sess, 234 | embedding_mat=word_vocab.embeddings, 235 | update_embeddings=FLAGS.update_embeddings, 236 | n_keys=FLAGS.n_keys, 237 | tied_keys=target_terms, 238 | l2_final_layer=FLAGS.l2_final_layer, 239 | max_grad_norm=FLAGS.max_grad_norm, 240 | optimizer=optimizer, 241 | global_step=global_step 242 | ) 243 | for t in range(1, FLAGS.epochs+1): 244 | np.random.shuffle(batches) 245 | total_cost = 0.0 246 | total_training_instances = 0 247 | 248 | for start, end in batches: 249 | # train negative 250 | sentences = train_negative_sentences[start:end] 251 | targets = train_negative_targets[start:end] 252 | aspects = train_negative_aspects[start:end] 253 | answers = train_negative_labels[start:end] 254 | cost_t = model.fit(sentences, targets, aspects, answers, 255 | FLAGS.entnet_input_keep_prob, 256 | FLAGS.entnet_output_keep_prob, 257 | FLAGS.entnet_state_keep_prob, 258 | FLAGS.final_layer_keep_prob) 259 | total_cost += cost_t 260 | total_training_instances += len(train_negative_sentences[start:end]) 261 | 262 | # train positive 263 | positive_start = random.randint(0, n_positive_train - batch_size) 264 | positive_end = positive_start + batch_size 265 | sentences = train_positive_sentences[positive_start:positive_end] 266 | targets = train_positive_targets[positive_start:positive_end] 267 | aspects = train_positive_aspects[positive_start:positive_end] 268 | answers = 
train_positive_labels[positive_start:positive_end] 269 | cost_t = model.fit(sentences, targets, aspects, answers, 270 | FLAGS.entnet_input_keep_prob, 271 | FLAGS.entnet_output_keep_prob, 272 | FLAGS.entnet_state_keep_prob, 273 | FLAGS.final_layer_keep_prob) 274 | total_cost += cost_t 275 | total_training_instances += len(train_positive_sentences[positive_start:positive_end]) 276 | 277 | # train none 278 | none_start = random.randint(0, n_none_train - batch_size) 279 | none_end = none_start + batch_size 280 | sentences = train_none_sentences[none_start:none_end] 281 | targets = train_none_targets[none_start:none_end] 282 | aspects = train_none_aspects[none_start:none_end] 283 | answers = train_none_labels[none_start:none_end] 284 | cost_t = model.fit(sentences, targets, aspects, answers, 285 | FLAGS.entnet_input_keep_prob, 286 | FLAGS.entnet_output_keep_prob, 287 | FLAGS.entnet_state_keep_prob, 288 | FLAGS.final_layer_keep_prob) 289 | 290 | total_cost += cost_t 291 | total_training_instances += len(train_none_sentences[none_start:none_end]) 292 | 293 | if t % FLAGS.evaluation_interval == 0: 294 | train_preds, train_preds_prob = model.predict( 295 | train_sentences, train_targets, train_aspects, 296 | batch_size=batch_size, 297 | ) 298 | 299 | train_acc = metrics.accuracy_score( 300 | train_labels, np.array(train_preds) 301 | ) 302 | 303 | val_preds, val_preds_prob = model.predict( 304 | val_sentences, val_targets, val_aspects, 305 | batch_size=batch_size, 306 | ) 307 | 308 | val_acc = metrics.accuracy_score( 309 | val_labels, np.array(val_preds) 310 | ) 311 | 312 | test_preds, test_preds_prob = model.predict( 313 | test_sentences, test_targets, test_aspects, 314 | batch_size=batch_size 315 | ) 316 | test_acc = metrics.accuracy_score( 317 | test_labels, np.array(test_preds) 318 | ) 319 | 320 | assert total_training_instances != 0 321 | 322 | logger.info('-----------------------') 323 | logger.info('Epoch %d' % t) 324 | logger.info('Avg Cost: %f' % (total_cost / total_training_instances)) 325 | logger.info('Training Accuracy: %f' % train_acc) 326 | logger.info('Validation Accuracy: %f' % val_acc) 327 | logger.info('Test Accuracy: %f' % test_acc) 328 | logger.info('-----------------------') 329 | -------------------------------------------------------------------------------- /vocab_processor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import collections 4 | 5 | class LabelVocabulary(tf.contrib.learn.preprocessing.CategoricalVocabulary): 6 | def __init__(self, support_reverse=True): 7 | self._mapping = {} 8 | self._support_reverse = support_reverse 9 | if support_reverse: 10 | self._reverse_mapping = [] 11 | self._freq = collections.defaultdict(int) 12 | self._freeze = False 13 | 14 | def get(self, category): 15 | """Returns the category's id in the vocabulary. 16 | If category is new, creates a new id for it. 17 | Args: 18 | category: string or integer to lookup in vocabulary. 19 | Returns: 20 | integer, id in the vocabulary. 21 | """ 22 | if category not in self._mapping: 23 | if self._freeze: 24 | assert False # should not happen 25 | # return self._mapping[self._unknown_token] 26 | self._mapping[category] = len(self._mapping) 27 | if self._support_reverse: 28 | self._reverse_mapping.append(category) 29 | return self._mapping[category] 30 | 31 | def trim(self, min_frequency, max_frequency=-1): 32 | """Trims vocabulary for minimum frequency. 33 | Remaps ids to 1..n in order of decreasing frequency,
34 | where n is the number of elements left. 35 | Args: 36 | min_frequency: minimum frequency to keep. 37 | max_frequency: optional, maximum frequency to keep. 38 | Useful to remove very frequent categories (like stop words). 39 | """ 40 | # no need to trim for label vocab 41 | return 42 | 43 | class EmbeddingVocabulary(tf.contrib.learn.preprocessing.CategoricalVocabulary): 44 | def __init__(self, 45 | in_file, 46 | binary=False, 47 | padding_token="", 48 | unknown_token="", 49 | support_reverse=True): 50 | self._unknown_token = unknown_token 51 | self._padding_token = padding_token 52 | self._mapping = {padding_token: 0, unknown_token: 1} 53 | self._support_reverse = support_reverse 54 | if support_reverse: 55 | self._reverse_mapping = [padding_token, unknown_token] 56 | # no need to count frequency 57 | # self._freq = collections.defaultdict(int) 58 | self._load_embeddings(in_file, binary=binary) 59 | # freeze the vocabulary once the embeddings have been loaded 60 | self._freeze = True 61 | 62 | def _load_embeddings(self, in_file, binary=False): 63 | # emb = word2vec.Word2Vec.load_word2vec_format(in_file, binary=binary) 64 | with open(in_file) as in_f: 65 | nb_words, nb_dim = None, None 66 | for line in in_f: 67 | line = line.strip() 68 | attrs = line.split(' ') 69 | if len(attrs) == 2: 70 | nb_words = int(attrs[0]) 71 | nb_dim = int(attrs[1]) 72 | self._embeddings = np.zeros((nb_words + 2, nb_dim), dtype=np.float32) 73 | continue 74 | word = attrs[0] 75 | emb = map(float, attrs[1:]) 76 | self._mapping[word] = len(self._mapping) if not self._support_reverse else len(self._reverse_mapping) 77 | self._embeddings[self._mapping[word], :] = emb 78 | if self._support_reverse: 79 | self._reverse_mapping.append(word) 80 | 81 | unk = np.mean(self._embeddings[2:], axis=0) 82 | self._embeddings[self._mapping[self._unknown_token]] = unk 83 | 84 | def _get_mean_embeddings(self, emb): 85 | syn0 = emb.syn0 86 | return np.mean(syn0, axis=0) 87 | 88 | @property 89 | def embeddings(self): 90 | return self._embeddings 91 | 92 | def freeze(self, freeze=True): 93 | """Freezes the vocabulary, after which new words return unknown token id. 94 | Args: 95 | freeze: True to freeze, False to unfreeze. 96 | """ 97 | self._freeze = True # should always be True after __init__ 98 | 99 | def get(self, category): 100 | """Returns word's id in the vocabulary. 101 | Once the vocabulary is frozen (always the case after __init__), unknown words map to the unknown token id. 102 | Args: 103 | category: string or integer to lookup in vocabulary. 104 | Returns: 105 | integer, id in the vocabulary. 106 | """ 107 | if category not in self._mapping: 108 | if self._freeze: 109 | return self._mapping[self._unknown_token] 110 | assert False # should not happen 111 | self._mapping[category] = len(self._mapping) 112 | if self._support_reverse: 113 | self._reverse_mapping.append(category) 114 | return self._mapping[category] 115 | 116 | def add(self, category, count=1): 117 | """Adds count of the category to the frequency table. 118 | Args: 119 | category: string or integer, category to add frequency to. 120 | count: optional integer, how many to add. 121 | """ 122 | # do nothing 123 | return 124 | 125 | def trim(self, min_frequency, max_frequency=-1): 126 | """Trims vocabulary for minimum frequency. 127 | Remaps ids to 1..n in order of decreasing frequency, 128 | where n is the number of elements left. 129 | Args: 130 | min_frequency: minimum frequency to keep. 131 | max_frequency: optional, maximum frequency to keep. 132 | Useful to remove very frequent categories (like stop words).
133 | """ 134 | # don't trim embedding vocab 135 | return 136 | 137 | class EmbeddingVocabularyProcessor(tf.contrib.learn.preprocessing.VocabularyProcessor): 138 | 139 | def __init__(self, 140 | max_document_length, 141 | vocabulary, 142 | min_frequency=0, 143 | tokenizer_fn=None): 144 | self.max_document_length = max_document_length 145 | self.vocabulary_ = vocabulary # EmbeddingVocabulary object 146 | self.min_frequency = min_frequency 147 | 148 | @staticmethod 149 | def tokenize(sentence): 150 | # for value in iterator: 151 | # yield value.split(' ') 152 | return sentence.split(' ') 153 | 154 | def fit(self, sentences, unused_y=None): 155 | # do nothing given that the embeddings have already been 156 | # initialized in EmbeddingVocabulary 157 | for sentence in sentences: 158 | for token in sentence: 159 | self.vocabulary_.add(token) 160 | if self.min_frequency > 0: 161 | self.vocabulary_.trim(self.min_frequency) 162 | self.vocabulary_.freeze() 163 | return self 164 | 165 | def transform(self, sentences): 166 | ''' 167 | Args: 168 | sentences: list of list of words 169 | Returns: 170 | indices: list of list of word indices 171 | ''' 172 | word_ids = np.zeros((len(sentences), self.max_document_length), np.int32) 173 | for i, sentence in enumerate(sentences): 174 | # word_ids = np.zeros(self.max_document_length, np.int32) 175 | for j, token in enumerate(sentence): 176 | if j >= self.max_document_length: 177 | break 178 | word_ids[i, j] = self.vocabulary_.get(token) 179 | return word_ids 180 | 181 | def reverse(self, sentences): 182 | """Reverses output of vocabulary mapping to words. 183 | Args: 184 | sentences: list of list of word indices 185 | Returns: 186 | output: list of list of words 187 | """ 188 | output = [] 189 | for sentence in sentences: 190 | output.append( 191 | [self.vocabulary_.reverse(word_id) for word_id in sentence] 192 | ) 193 | return output 194 | 195 | class LabelVocabularyProcessor(tf.contrib.learn.preprocessing.VocabularyProcessor): 196 | 197 | def __init__(self, 198 | # max_document_length, 199 | vocabulary, 200 | min_frequency=0, 201 | tokenizer_fn=None): 202 | self.vocabulary_ = vocabulary # EmbeddingVocabulary object 203 | self.min_frequency = min_frequency 204 | 205 | @staticmethod 206 | def tokenize(sentence): 207 | # for value in iterator: 208 | # yield value.split(' ') 209 | return sentence.split(' ') 210 | 211 | def fit(self, sentences, unused_y=None): 212 | # do nothing given that the embeddings have already been 213 | # initialized in EmbeddingVocabulary 214 | for label in sentences: 215 | self.vocabulary_.add(token) 216 | if self.min_frequency > 0: 217 | self.vocabulary_.trim(self.min_frequency) 218 | self.vocabulary_.freeze() 219 | return self 220 | 221 | def transform(self, sentences): 222 | ''' 223 | Args: 224 | sentences: list of list of words 225 | Returns: 226 | indices: list of list of word indices 227 | ''' 228 | label_ids = np.full((len(sentences)), -1, dtype=np.int32) 229 | for i, label in enumerate(sentences): 230 | label_ids[i] = self.vocabulary_.get(label) 231 | # for j, token in enumerate(sentence): 232 | # if j >= self.max_document_length: 233 | # break 234 | # label_ids[i, j] = self.vocabulary_.get(token) 235 | return label_ids 236 | 237 | def reverse(self, sentences): 238 | """Reverses output of vocabulary mapping to words. 
239 | Args: 240 | sentences: list of label ids 241 | Returns: 242 | output: list of labels 243 | """ 244 | output = [] 245 | for label_id in sentences: 246 | output.append( 247 | self.vocabulary_.reverse(label_id) 248 | ) 249 | return output 250 | --------------------------------------------------------------------------------
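For illustration, a minimal usage sketch assuming a hypothetical `glove.6B.300d.txt` file (400000-word vocabulary, 300-dimensional vectors). `EmbeddingVocabulary._load_embeddings` treats any line with exactly two space-separated fields as the `VOCAB_SIZE DIM_SIZE` header and every other line as `word v1 ... vD`, so the header must be in place before training:

```shell
# Hypothetical file name and sizes; substitute your own GloVe file.
# 1) Prepend the "VOCAB_SIZE DIM_SIZE" header expected by _load_embeddings.
sed -i '1i 400000 300' glove.6B.300d.txt
# 2) Train and evaluate; the Sentihood JSON files are read from --data_dir.
python main.py --embedding_file_path glove.6B.300d.txt --data_dir data/sentihood/
```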