├── README.md ├── models ├── LPNet.py ├── layers.py └── ops.py ├── prepare ├── README.md ├── convert_charades.py ├── download_activitynet_video.py ├── extract_activitynet.py ├── extract_activitynet_org.py ├── extract_charades.py ├── extract_tacos.py ├── extract_tacos_org.py ├── feature_extractor.py └── videotransforms.py ├── run_activitynet.py ├── run_charades.py ├── run_tacos.py ├── statistic ├── convert_tacos.py ├── stat_activitynet.py ├── stat_charades.py └── stat_tacos.py └── utils ├── data_utils.py ├── prepro_activitynet.py ├── prepro_charades.py ├── prepro_tacos.py └── runner_utils.py /README.md: -------------------------------------------------------------------------------- 1 | # The code of LPNet. 2 | ## under update... 3 | -------------------------------------------------------------------------------- /models/LPNet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from models.ops import create_optimizer, count_params, regularizer 5 | from models.layers import word_embedding_lookup, char_embedding_lookup, conv1d, video_query_attention, highlight_layer 6 | from models.layers import context_query_concat, feature_encoder, conditioned_predictor, localization_loss, iou_regression 7 | from models.layers import generate_proposal_boxes, dynamic_head 8 | from models.layers import bilstm, multi_modal_sa, st_video_encoder, boundary_predictor, boundary_loss 9 | 10 | class LPNet: 11 | def __init__(self, configs, graph): 12 | self.configs = configs 13 | graph = graph if graph is not None else tf.Graph() 14 | with graph.as_default(): 15 | self.global_step = tf.train.create_global_step() 16 | self._add_placeholders() 17 | self._build_model() 18 | if configs.mode == 'train': 19 | print('\x1b[1;33m' + 'Total trainable parameters: {}'.format(count_params()) + '\x1b[0m', flush=True) 20 | else: 21 | print('\x1b[1;33m' + 'Total parameters: {}'.format(count_params()) + '\x1b[0m', flush=True) 22 | 23 | def _add_placeholders(self): 24 | self.video_inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, self.configs.video_feature_dim], 25 | name='video_inputs') 26 | self.video_seq_length = tf.placeholder(dtype=tf.int32, shape=[None], name='video_sequence_length') 27 | self.word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='word_ids') 28 | self.char_ids = tf.placeholder(dtype=tf.int32, shape=[None, None, None], name='char_ids') 29 | self.highlight_labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='highlight_labels') 30 | 31 | self.is_training = tf.placeholder(tf.bool, shape=[]) 32 | # self.dx1 = tf.placeholder(dtype=tf.int32, shape=[None, None], name='dx') 33 | # self.dy1 = tf.placeholder(dtype=tf.int32, shape=[None, None], name='dy') 34 | # self.mask1 = tf.placeholder(dtype=tf.float32, shape=[None, None, None, 1], name='batch_mask') 35 | 36 | self.y1 = tf.placeholder(dtype=tf.float32, shape=[None, None], name='start_indexes') 37 | self.y2 = tf.placeholder(dtype=tf.float32, shape=[None, None], name='end_indexes') 38 | # hyper-parameters 39 | self.drop_rate = tf.placeholder_with_default(input=0.0, shape=[], name='dropout_rate') 40 | # create mask 41 | self.v_mask = tf.sequence_mask(lengths=self.video_seq_length, maxlen=tf.reduce_max(self.video_seq_length), 42 | dtype=tf.int32) 43 | self.q_mask = tf.cast(tf.cast(self.word_ids, dtype=tf.bool), dtype=tf.int32) 44 | 45 | self.q_length = tf.reduce_sum(self.q_mask, -1) 46 | self.v_length = tf.reduce_sum(self.v_mask, -1) 47 | 
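    # Shapes and masks defined above:
    #   - video_inputs: [batch, max_frames, video_feature_dim] visual features;
    #     video_seq_length holds the true (un-padded) frame count of each video,
    #     from which v_mask / v_length are derived via tf.sequence_mask.
    #   - word_ids / char_ids: the tokenized query; word id 0 is padding, so q_mask
    #     is simply the non-zero positions and q_length their count.
    #   - y1 / y2: per-frame start / end supervision consumed by boundary_loss below;
    #     highlight_labels: per-frame labels consumed by the highlighting layer.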
48 | def _build_model(self): 49 | # word embedding & visual features 50 | init_word_vectors = np.load(os.path.join(self.configs.save_dir, 'word_vectors.npz'))['vectors'] 51 | word_emb = word_embedding_lookup(self.word_ids, dim=self.configs.word_dim, drop_rate=self.drop_rate, 52 | vectors=init_word_vectors, finetune=False, reuse=False, name='word_embeddings') 53 | char_emb = char_embedding_lookup(self.char_ids, char_size=self.configs.char_size, dim=self.configs.char_dim, 54 | kernels=[1, 2, 3, 4], filters=[10, 20, 30, 40], drop_rate=self.drop_rate, 55 | activation=tf.nn.relu, reuse=False, name='char_embeddings') 56 | word_emb = tf.concat([word_emb, char_emb], axis=-1) 57 | video_features = tf.nn.dropout(self.video_inputs, rate=self.drop_rate) 58 | 59 | # feature projection (map both word and video feature to the same dimension) 60 | vfeats = conv1d(video_features, dim=self.configs.hidden_size, use_bias=True, reuse=False, name='video_conv1d') 61 | qfeats = conv1d(word_emb, dim=self.configs.hidden_size, use_bias=True, reuse=False, name='query_conv1d') 62 | 63 | vfeats0 = feature_encoder(vfeats, hidden_size=self.configs.hidden_size, num_heads=self.configs.num_heads, 64 | max_position_length=self.configs.max_position_length, drop_rate=self.drop_rate, 65 | mask=self.v_mask, reuse=False, name='feature_encoder') 66 | qfeats0 = feature_encoder(qfeats, hidden_size=self.configs.hidden_size, num_heads=self.configs.num_heads, 67 | max_position_length=self.configs.max_position_length, drop_rate=self.drop_rate, 68 | mask=self.q_mask, reuse=True, name='feature_encoder') 69 | 70 | # # video query attention 71 | outputs, self.vq_score = video_query_attention(vfeats0, qfeats0, self.v_mask, self.q_mask, reuse=False, 72 | drop_rate=self.drop_rate, name='video_query_attention') 73 | 74 | # # weighted pooling and concatenation 75 | outputs0 = context_query_concat(outputs, qfeats0, q_mask=self.q_mask, reuse=False, name='context_query_concat') 76 | 77 | self.highlight_loss, self.highlight_scores = highlight_layer(outputs0, self.highlight_labels, mask=self.v_mask, 78 | reuse=False, name='highlighting_layer') 79 | outputs0 = tf.multiply(outputs0, tf.expand_dims(self.highlight_scores, axis=-1)) 80 | 81 | start_logits, end_logits = conditioned_predictor(outputs0, hidden_size=self.configs.hidden_size, 82 | seq_len=self.video_seq_length, mask=self.v_mask, 83 | reuse=False, name='conditioned_predictor') 84 | # compute localization loss 85 | self.start_prob, self.end_prob, self.start_index, self.end_index, self.loss = boundary_loss( 86 | start_logits, end_logits, self.y1, self.y2, self.v_length,self.configs) 87 | 88 | 89 | self.proposal_box, self.dx, self.dy, self.boxes = generate_proposal_boxes(vfeats0, self.v_length, self.configs) 90 | self.reg_loss, self.l1reg_loss, self.l1_loss, self.iou_loss, self.regular, self.regular2, self.train, abc, self.px, self.py = dynamic_head( 91 | configs=self.configs, 92 | proposal_box=self.proposal_box, 93 | v_mask=self.v_mask, 94 | q_mask=self.q_mask, 95 | vfeats0=outputs, 96 | qfeats0=qfeats0, 97 | drop_rate=self.drop_rate, 98 | dx=self.dx, 99 | dy=self.dy, 100 | boxes=self.boxes, 101 | y1=self.y1, 102 | y2=self.y2, 103 | train=self.is_training) 104 | 105 | self.my_loss = 5 * self.iou_loss + self.regular #+ self.l1_loss #+ 100*self.reg_loss + self.highlight_loss + 0.2*self.loss#+ self.regular + self.regular2 + self.l1_loss 106 | self.reg_loss = 100 * self.reg_loss + self.highlight_loss + self.loss -------------------------------------------------------------------------------- 
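The repository's training scripts (`run_charades.py`, `run_tacos.py`, `run_activitynet.py`) are listed in the tree but not shown here, so the following is only a minimal sketch of how the TF1 graph defined in `LPNet` might be driven. The `configs` fields, the batch dictionary keys, and the loss weighting are assumptions for illustration, not the repository's actual runner code.

```python
import tensorflow as tf
from models.LPNet import LPNet
from models.ops import create_optimizer

# `configs` is assumed to provide the fields referenced in LPNet (mode, save_dir
# containing word_vectors.npz, video_feature_dim, word_dim, char_size, char_dim,
# hidden_size, num_heads, max_position_length, ...) plus the optimizer settings below.
graph = tf.Graph()
model = LPNet(configs, graph)

with graph.as_default():
    # Combine the two loss groups defined at the end of _build_model;
    # the 1:1 weighting here is illustrative only.
    total_loss = model.my_loss + model.reg_loss
    train_op = create_optimizer(total_loss, init_lr=configs.init_lr,
                                num_train_steps=configs.num_train_steps,
                                num_warmup_steps=None)
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)
    for batch in train_batches:  # assumed iterable of pre-processed batches
        _, loss_value = sess.run([train_op, total_loss], feed_dict={
            model.video_inputs: batch['vfeats'],          # [B, T, video_feature_dim]
            model.video_seq_length: batch['vfeat_lens'],  # [B]
            model.word_ids: batch['word_ids'],            # [B, L]
            model.char_ids: batch['char_ids'],            # [B, L, C]
            model.y1: batch['y1'],                        # [B, T] start supervision
            model.y2: batch['y2'],                        # [B, T] end supervision
            model.highlight_labels: batch['h_labels'],    # [B, T]
            model.is_training: True,
            model.drop_rate: configs.drop_rate,
        })
```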
/models/ops.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7) 6 | 7 | 8 | def count_params(scope=None): 9 | if scope is None: 10 | return int(np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])) 11 | 12 | else: 13 | return int(np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables(scope)])) 14 | 15 | 16 | def get_shape_list(tensor): 17 | shape = tensor.shape.as_list() 18 | non_static_indexes = [] 19 | 20 | for (index, dim) in enumerate(shape): 21 | if dim is None: 22 | non_static_indexes.append(index) 23 | 24 | if not non_static_indexes: 25 | return shape 26 | 27 | dyn_shape = tf.shape(tensor) 28 | for index in non_static_indexes: 29 | shape[index] = dyn_shape[index] 30 | 31 | return shape 32 | 33 | 34 | def mask_logits(inputs, mask, mask_value=-1e30): 35 | mask = tf.cast(mask, tf.float32) 36 | return inputs * mask + mask_value * (1.0 - mask) 37 | 38 | 39 | def ndim(x): 40 | return x.get_shape().ndims 41 | 42 | 43 | def dot(x, y): 44 | if ndim(x) is not None and (ndim(x) > 2 or ndim(y) > 2): 45 | x_shape = [] 46 | 47 | for i, s in zip(x.get_shape().as_list(), tf.unstack(tf.shape(x))): 48 | if i is not None: 49 | x_shape.append(i) 50 | else: 51 | x_shape.append(s) 52 | 53 | x_shape = tuple(x_shape) 54 | y_shape = [] 55 | 56 | for i, s in zip(y.get_shape().as_list(), tf.unstack(tf.shape(y))): 57 | if i is not None: 58 | y_shape.append(i) 59 | else: 60 | y_shape.append(s) 61 | 62 | y_shape = tuple(y_shape) 63 | y_permute_dim = list(range(ndim(y))) 64 | y_permute_dim = [y_permute_dim.pop(-2)] + y_permute_dim 65 | xt = tf.reshape(x, [-1, x_shape[-1]]) 66 | yt = tf.reshape(tf.transpose(y, perm=y_permute_dim), [y_shape[-2], -1]) 67 | return tf.reshape(tf.matmul(xt, yt), x_shape[:-1] + y_shape[:-2] + y_shape[-1:]) 68 | 69 | if isinstance(x, tf.SparseTensor): 70 | out = tf.sparse_tensor_dense_matmul(x, y) 71 | 72 | else: 73 | out = tf.matmul(x, y) 74 | 75 | return out 76 | 77 | 78 | def batch_dot(x, y, axes=None): 79 | if isinstance(axes, int): 80 | axes = (axes, axes) 81 | 82 | x_ndim = ndim(x) 83 | y_ndim = ndim(y) 84 | 85 | if x_ndim > y_ndim: 86 | diff = x_ndim - y_ndim 87 | y = tf.reshape(y, tf.concat([tf.shape(y), [1] * diff], axis=0)) 88 | 89 | elif y_ndim > x_ndim: 90 | diff = y_ndim - x_ndim 91 | x = tf.reshape(x, tf.concat([tf.shape(x), [1] * diff], axis=0)) 92 | 93 | else: 94 | diff = 0 95 | 96 | if ndim(x) == 2 and ndim(y) == 2: 97 | if axes[0] == axes[1]: 98 | out = tf.reduce_sum(tf.multiply(x, y), axes[0]) 99 | 100 | else: 101 | out = tf.reduce_sum(tf.multiply(tf.transpose(x, [1, 0]), y), axes[1]) 102 | 103 | else: 104 | if axes is not None: 105 | adj_x = None if axes[0] == ndim(x) - 1 else True 106 | adj_y = True if axes[1] == ndim(y) - 1 else None 107 | 108 | else: 109 | adj_x = None 110 | adj_y = None 111 | 112 | out = tf.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y) 113 | 114 | if diff: 115 | if x_ndim > y_ndim: 116 | idx = x_ndim + y_ndim - 3 117 | 118 | else: 119 | idx = x_ndim - 1 120 | 121 | out = tf.squeeze(out, list(range(idx, idx + diff))) 122 | 123 | if ndim(out) == 1: 124 | out = tf.expand_dims(out, 1) 125 | 126 | return out 127 | 128 | 129 | def trilinear_attention(args, v_maxlen, q_maxlen, drop_rate=0.0, reuse=None, name='efficient_trilinear'): 130 | assert len(args) == 2, 'just use for computing attention with two input' 131 | arg0_shape = 
args[0].get_shape().as_list() 132 | arg1_shape = args[1].get_shape().as_list() 133 | 134 | if len(arg0_shape) != 3 or len(arg1_shape) != 3: 135 | raise ValueError('`args` must be 3 dims (batch_size, len, dimension)') 136 | 137 | if arg0_shape[2] != arg1_shape[2]: 138 | raise ValueError('the last dimension of `args` must equal') 139 | 140 | arg_size = arg0_shape[2] 141 | dtype = args[0].dtype 142 | drop_args = [tf.nn.dropout(arg, rate=drop_rate) for arg in args] 143 | 144 | with tf.variable_scope(name, reuse=reuse): 145 | weights4arg0 = tf.get_variable('linear_kernel4arg0', [arg_size, 1], dtype=dtype, regularizer=regularizer) 146 | weights4arg1 = tf.get_variable('linear_kernel4arg1', [arg_size, 1], dtype=dtype, regularizer=regularizer) 147 | weights4mlu = tf.get_variable('linear_kernel4mul', [1, 1, arg_size], dtype=dtype, regularizer=regularizer) 148 | 149 | subres0 = tf.tile(dot(drop_args[0], weights4arg0), [1, 1, q_maxlen]) 150 | subres1 = tf.tile(tf.transpose(dot(drop_args[1], weights4arg1), perm=(0, 2, 1)), [1, v_maxlen, 1]) 151 | subres2 = batch_dot(drop_args[0] * weights4mlu, tf.transpose(drop_args[1], perm=(0, 2, 1))) 152 | res = subres0 + subres1 + subres2 153 | 154 | return res 155 | 156 | 157 | def create_adam_optimizer(loss,init_lr,num_train_steps,num_warmup_steps = False,clip_norm=1.0): 158 | """Creates an optimizer training op.""" 159 | 160 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 161 | learning_rate = tf.train.polynomial_decay(learning_rate, 162 | global_step, 163 | num_train_steps, 164 | end_learning_rate=0.0, 165 | power=1.0, 166 | cycle=False) 167 | 168 | optimizer = AdamWeightDecayOptimizer( 169 | learning_rate=learning_rate, 170 | weight_decay_rate=0.01, 171 | beta_1=0.9, 172 | beta_2=0.999, 173 | epsilon=1e-6, 174 | exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']) 175 | 176 | tvars = tf.trainable_variables() 177 | grads = tf.gradients(loss, tvars) 178 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) 179 | train_op = optimizer.apply_gradients(zip(grads, tvars), 180 | global_step=global_step) 181 | 182 | # Normally the global step update is done inside of `apply_gradients`. However, `AdamWeightDecayOptimizer` doesn't 183 | # do this. But if you use a different optimizer, you should probably take this line out. 
184 | new_global_step = global_step + 1 185 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 186 | 187 | return train_op 188 | 189 | 190 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, clip_norm=1.0): 191 | """Creates an optimizer training op.""" 192 | global_step = tf.train.get_or_create_global_step() 193 | 194 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 195 | learning_rate = tf.train.polynomial_decay(learning_rate, 196 | global_step, 197 | num_train_steps, 198 | end_learning_rate=0.0, 199 | power=1.0, 200 | cycle=False) 201 | 202 | if num_warmup_steps: 203 | global_steps_int = tf.cast(global_step, tf.int32) 204 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 205 | 206 | global_steps_float = tf.cast(global_steps_int, tf.float32) 207 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 208 | 209 | warmup_percent_done = global_steps_float / warmup_steps_float 210 | warmup_learning_rate = init_lr * warmup_percent_done 211 | 212 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 213 | learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 214 | 215 | optimizer = AdamWeightDecayOptimizer(learning_rate=learning_rate, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, 216 | epsilon=1e-6, exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']) 217 | 218 | tvars = tf.trainable_variables() 219 | grads = tf.gradients(loss, tvars) 220 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) 221 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) 222 | 223 | # Normally the global step update is done inside of `apply_gradients`. However, `AdamWeightDecayOptimizer` doesn't 224 | # do this. But if you use a different optimizer, you should probably take this line out. 
225 | new_global_step = global_step + 1 226 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 227 | 228 | return train_op 229 | 230 | 231 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 232 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 233 | 234 | def __init__(self, learning_rate, weight_decay_rate=0.0, beta_1=0.9, beta_2=0.999, epsilon=1e-6, 235 | exclude_from_weight_decay=None, name='AdamWeightDecayOptimizer'): 236 | """Constructs a AdamWeightDecayOptimizer.""" 237 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 238 | 239 | self.learning_rate = learning_rate 240 | self.weight_decay_rate = weight_decay_rate 241 | self.beta_1 = beta_1 242 | self.beta_2 = beta_2 243 | self.epsilon = epsilon 244 | self.exclude_from_weight_decay = exclude_from_weight_decay 245 | 246 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 247 | """See base class.""" 248 | assignments = [] 249 | for (grad, param) in grads_and_vars: 250 | if grad is None or param is None: 251 | continue 252 | 253 | param_name = self._get_variable_name(param.name) 254 | 255 | m = tf.get_variable(name=param_name + '/adam_m', 256 | shape=param.shape.as_list(), 257 | dtype=tf.float32, 258 | trainable=False, 259 | initializer=tf.zeros_initializer()) 260 | 261 | v = tf.get_variable(name=param_name + '/adam_v', 262 | shape=param.shape.as_list(), 263 | dtype=tf.float32, 264 | trainable=False, 265 | initializer=tf.zeros_initializer()) 266 | 267 | next_m = (tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 268 | next_v = (tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, tf.square(grad))) 269 | 270 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 271 | if self._do_use_weight_decay(param_name): 272 | update += self.weight_decay_rate * param 273 | 274 | update_with_lr = self.learning_rate * update 275 | next_param = param - update_with_lr 276 | assignments.extend([param.assign(next_param), m.assign(next_m), v.assign(next_v)]) 277 | 278 | return tf.group(*assignments, name=name) 279 | 280 | def _do_use_weight_decay(self, param_name): 281 | """Whether to use L2 weight decay for `param_name`.""" 282 | if not self.weight_decay_rate: 283 | return False 284 | 285 | if self.exclude_from_weight_decay: 286 | for r in self.exclude_from_weight_decay: 287 | if re.search(r, param_name) is not None: 288 | return False 289 | 290 | return True 291 | 292 | @staticmethod 293 | def _get_variable_name(param_name): 294 | """Get the variable name from the tensor name.""" 295 | m = re.match("^(.*):\\d+$", param_name) 296 | 297 | if m is not None: 298 | param_name = m.group(1) 299 | 300 | return param_name 301 | 302 | def _apply_dense(self, grad, var): 303 | pass 304 | 305 | def _resource_apply_dense(self, grad, handle): 306 | pass 307 | 308 | def _resource_apply_sparse(self, grad, handle, indices): 309 | pass 310 | 311 | def _apply_sparse(self, grad, var): 312 | pass 313 | -------------------------------------------------------------------------------- /prepare/README.md: -------------------------------------------------------------------------------- 1 | # Extract Features 2 | 3 | - We use the pre-trained 3D ConvNets ([here](https://github.com/piergiaj/pytorch-i3d)) to prepare the visual features, the 4 | extraction codes are placed in this folder. 
Please download the pre-trained weights [`rgb_charades.pt`]( 5 | https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_charades.pt) and [`rgb_imagenet.pt`]( 6 | https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_imagenet.pt). 7 | - The pre-trained GloVe embeddings are available [here](https://nlp.stanford.edu/projects/glove/); please download 8 | `glove.840B.300d.zip`, unzip it, and put it under the `data/` folder. 9 | 10 | ## Charades-STA 11 | The train/test datasets of Charades-STA are available at [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL) 12 | ([`charades_sta_train.txt`](https://drive.google.com/file/d/1ZjG7wJpPSMIBYnW7BAG2u9VVEoNvFm5c/view) and 13 | [`charades_sta_test.txt`](https://drive.google.com/file/d/1QG4MXFkoj6JFU0YK5olTY75xTARKSW5e/view)). 14 | 15 | The `charades.json` file is required ([here](https://github.com/piergiaj/super-events-cvpr18/blob/master/data/charades.json)), 16 | as it contains the video length information. Download it and place it in the same directory as the train/test datasets. 17 | 18 | The videos/frames of the Charades-STA dataset are available [here](https://allenai.org/plato/charades/); please download 19 | either `RGB frames at 24fps (76 GB)` (image frames) or `Data (original size) (55 GB)` (videos). For the latter, the 20 | extractor will automatically decompose the videos into images. 21 | ```shell script 22 | # download RGB frames 23 | wget http://ai2-website.s3.amazonaws.com/data/Charades_v1_rgb.tar 24 | # or, download videos 25 | wget http://ai2-website.s3.amazonaws.com/data/Charades_v1.zip 26 | ``` 27 | 28 | Extract visual features for Charades-STA: 29 | ```shell script 30 | # use the weights fine-tuned on Charades or the weights pre-trained on ImageNet 31 | python3 extract_charades.py --use_finetuned --load_model /rgb_charades.pt \ # rgb_imagenet.pt 32 | --video_dir \ 33 | --dataset_dir \ 34 | --images_dir \ # if images do not exist, decompose the video into images 35 | --save_dir \ 36 | --fps 24 --strides 24 --remove_images # whether to remove the extracted images to free space 37 | ``` 38 | 39 | ## TACoS 40 | The TACoS dataset is from [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL), while its videos come from the MPII 41 | Cooking Composite Activities dataset, which can be downloaded [here]( 42 | https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/human-activity-recognition/mpii-cooking-composite-activities/). 43 | Note that we also use the processed TACoS dataset from [[microsoft/2D-TAN]](https://github.com/microsoft/2D-TAN). 
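All extraction scripts in this folder save one `<video_id>.npy` feature array per video plus a `feature_shapes.json` index in `--save_dir`. A minimal sketch for sanity-checking the output (the path below is a placeholder):

```python
import os
import json
import numpy as np

save_dir = "path/to/extracted/features"  # the --save_dir passed to the extraction script

with open(os.path.join(save_dir, "feature_shapes.json"), mode="r", encoding="utf-8") as f:
    feature_shapes = json.load(f)  # {video_id: number of feature vectors}

video_id = next(iter(feature_shapes))
features = np.load(os.path.join(save_dir, "{}.npy".format(video_id)))
print(video_id, features.shape, feature_shapes[video_id])  # shape[0] should match the index
```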
44 | 45 | Extract visual features for TACoS: 46 | ```shell script 47 | python3 extract_tacos.py --load_model /rgb_imagenet.pt \ 48 | --video_dir \ 49 | --dataset_dir \ 50 | --images_dir \ # if images not exist, decompose video into images 51 | --save_dir \ 52 | --strides 16 --remove_images # whether remove extracted images to release space 53 | ``` 54 | 55 | (Optional) Convert the pre-trained C3D visual features from [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL) 56 | ([Interval64_128_256_512_overlap0.8_c3d_fc6.tar](https://drive.google.com/file/d/1zQp0aYGFCm8PqqHOh4UtXfy2U3pJMBeu/view), 57 | [Interval128_256_overlap0.8_c3d_fc6.tar](https://drive.google.com/file/d/1zC-UrspRf42Qiu5prQw4fQrbgLQfJN-P/view)): 58 | ```shell script 59 | python3 extract_tacos_org.py --data_path \ 60 | --feature_path \ 61 | --save_dir \ 62 | --sample_rate 64 # sliding windows 63 | ``` 64 | 65 | ## ActivityNet Captions 66 | The train/test sets of ActivityNet Caption are available at [here]( 67 | https://cs.stanford.edu/people/ranjaykrishna/densevid/). The videos can be downloaded using: 68 | ```shell script 69 | python3 download_activitynet_video.py --video_dir \ 70 | --dataset_dir \ 71 | --bash_file 72 | ``` 73 | It will generate a bash file which contains the commands to download all the videos. Suppose the generated bash file is 74 | `video_downloader.sh`, then simply run `bash video_downloader.sh`, it will download the videos and save them into 75 | `video_dir` automatically. 76 | 77 | Extract visual features for ActivityNet Captions: 78 | ```shell script 79 | python3 extract_activitynet.py --load_model /rgb_imagenet.pt \ 80 | --video_dir \ 81 | --dataset_dir \ 82 | --images_dir \ # if images not exist, decompose video into images 83 | --save_dir \ 84 | --strides 16 --remove_images # whether remove extracted images to release space 85 | ``` 86 | 87 | (Optional) We also have the codes to convert the C3D visual features provided in [ActivityNet official website]( 88 | http://activity-net.org/challenges/2016/download.html): 89 | 90 | - download the C3D visual features 91 | ```shell script 92 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-00 93 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-01 94 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-02 95 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-03 96 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-04 97 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-05 98 | cat activitynet_v1-3.part-* > features.zip && unzip features.zip 99 | rm features.zip 100 | rm activitynet_v1-3.part-* 101 | ``` 102 | - convert the features as 103 | ```shell script 104 | python3 extract_activitynet_org.py --dataset_dir \ 105 | --hdf5_file \ 106 | --save_dir 107 | ``` 108 | -------------------------------------------------------------------------------- /prepare/convert_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | from tqdm import tqdm 6 | from argparse import ArgumentParser 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--dataset_dir", type=str, 
required=True, help="dataset path") 10 | parser.add_argument("--hdf5_file", type=str, required=True, help="downloaded activitynet features") 11 | parser.add_argument("--save_dir", type=str, required=True, help="save dir") 12 | args = parser.parse_args() 13 | 14 | with open(os.path.join(args.dataset_dir, "charades.json"), mode="r", encoding="utf-8") as f: 15 | all_data = json.load(f) 16 | # with open(os.path.join(args.dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 17 | # val_data = json.load(f) 18 | # with open(os.path.join(args.dataset_dir, "val_2.json"), mode="r", encoding="utf-8") as f: 19 | # test_data = json.load(f) 20 | 21 | video_ids = list(set(list(all_data.keys()))) 22 | # print(video_ids) 23 | # print(len(video_ids)) #9948 24 | 25 | with h5py.File(args.hdf5_file, mode="r") as f: 26 | print(type(f)) # 27 | print(len(f)) #9846 28 | # print(f.keys()) 29 | print(type(f['001YG'])) 30 | print(f['001YG'].shape) 31 | print(f['ZZ3HT'].shape) 32 | if not os.path.exists(args.save_dir): 33 | os.makedirs(args.save_dir) 34 | 35 | feature_shapes = dict() 36 | with h5py.File(args.hdf5_file, mode="r") as f: 37 | group_key = list(f.keys()) 38 | for key in tqdm(group_key, total=len(group_key), desc="extract features"): 39 | video_id = key 40 | if video_id not in video_ids: 41 | continue 42 | data = f[key] 43 | feature_shapes[video_id] = data.shape[0] 44 | np.save(os.path.join(args.save_dir, video_id), arr=data) 45 | # with h5py.File(args.hdf5_file, mode="r") as f: 46 | # group_key = list(f.keys()) 47 | # for key in tqdm(group_key, total=len(group_key), desc="extract features"): 48 | # video_id = key 49 | # if video_id not in video_ids: 50 | # continue 51 | # data = f[key]["c3d_features"][()] 52 | # feature_shapes[video_id] = data.shape[0] 53 | # np.save(os.path.join(args.save_dir, video_id), arr=data) 54 | 55 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 56 | json.dump(feature_shapes, f) 57 | -------------------------------------------------------------------------------- /prepare/download_activitynet_video.py: -------------------------------------------------------------------------------- 1 | """ 2 | Codes are modified from https://github.com/waybarrios/Anet_tools2.0 3 | """ 4 | import os 5 | import glob 6 | import json 7 | from argparse import ArgumentParser 8 | 9 | 10 | def crosscheck_videos(video_path, all_video_ids): 11 | # Get existing videos 12 | existing_videos = glob.glob("%s/*.mp4" % video_path) 13 | for idx, vid in enumerate(existing_videos): 14 | basename = os.path.basename(vid).split(".mp4")[0] 15 | if len(basename) == 13: 16 | existing_videos[idx] = basename[2:] 17 | elif len(basename) == 11: 18 | existing_videos[idx] = basename 19 | else: 20 | raise RuntimeError("Unknown filename format: %s", vid) 21 | 22 | non_existing_videos = [] 23 | for vid in all_video_ids: 24 | if vid in existing_videos: 25 | continue 26 | else: 27 | non_existing_videos.append(vid) 28 | 29 | return non_existing_videos 30 | 31 | 32 | def main(video_dir, dataset_dir, bash_file): 33 | with open(os.path.join(dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 34 | train_ids = list(json.load(f).keys()) 35 | train_ids = [vid[2:] if len(vid) == 13 else vid for vid in train_ids] 36 | 37 | with open(os.path.join(dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 38 | val_ids = list(json.load(f).keys()) 39 | val_ids = [vid[2:] if len(vid) == 13 else vid for vid in val_ids] 40 | 41 | with open(os.path.join(dataset_dir, "val_2.json"), 
mode="r", encoding="utf-8") as f: 42 | test_ids = list(json.load(f).keys()) 43 | test_ids = [vid[2:] if len(vid) == 13 else vid for vid in test_ids] 44 | 45 | all_video_ids = list(set(train_ids + val_ids + test_ids)) 46 | print("train_video_ids", len(train_ids)) 47 | print("val_1_video_ids", len(val_ids)) 48 | print("val_2_video_ids", len(test_ids)) 49 | print("all_video_ids", len(all_video_ids)) 50 | 51 | non_existing_videos = crosscheck_videos(video_dir, all_video_ids) 52 | 53 | # save command to bash file 54 | with open(bash_file + '.sh', mode="w", encoding="utf-8") as f: 55 | f.write("#!/usr/bin/env bash\n\n") # write bash file header 56 | filename = os.path.join(video_dir, "v_%s.mp4") 57 | cmd_base = "youtube-dl -f best -f mp4 " 58 | cmd_base += '"https://www.youtube.com/watch?v=%s" ' 59 | cmd_base += '-o "%s"' % filename 60 | 61 | for vid in non_existing_videos: 62 | cmd = cmd_base % (vid, vid) 63 | f.write("%s\n" % cmd) 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = ArgumentParser(description="Script to double check video content.") 68 | parser.add_argument("--video_dir", type=str, required=True, help="where to save the downloaded videos") 69 | parser.add_argument("--dataset_dir", type=str, required=True, help="where are the annotation files") 70 | parser.add_argument("--bash_file", type=str, required=True, help="where to save command list script") 71 | 72 | args = vars(parser.parse_args()) 73 | main(**args) 74 | """ 75 | After running this python file, it will generate an script file. Using the terminal to run this script, it will 76 | automatically download all the required videos from YouTube. 77 | """ 78 | -------------------------------------------------------------------------------- /prepare/extract_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import json 5 | import torch 6 | import argparse 7 | import subprocess 8 | import numpy as np 9 | from . 
import videotransforms 10 | from .feature_extractor import InceptionI3d 11 | from torchvision import transforms 12 | from torch.autograd import Variable 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 19 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 20 | parser.add_argument("--fps", type=int, default=None, help="frames per second") 21 | parser.add_argument("--video_format", type=str, default="mp4", help="video format") 22 | parser.add_argument("--strides", type=int, default=16, help="window size") 23 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 24 | args = parser.parse_args() 25 | 26 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 27 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 28 | 29 | 30 | def load_images(img_dir, vid, start_frame, lengths): 31 | img_frames, raw_height, raw_width = [], None, None 32 | for x in range(start_frame, start_frame + lengths): 33 | image = cv2.imread(os.path.join(img_dir, "{}-{}.jpg".format(vid, str(x).zfill(6))))[:, :, [2, 1, 0]] 34 | width, height, channel = image.shape 35 | raw_width, raw_height = width, height 36 | # resize image 37 | scale = 1 + (224.0 - min(width, height)) / min(width, height) 38 | image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale) 39 | # normalize image to [0, 1] 40 | image = (image / 255.0) * 2 - 1 41 | img_frames.append(image) 42 | return img_frames, raw_width, raw_height 43 | 44 | 45 | def extract_features(image_tensor, model, strides): 46 | b, c, t, h, w = image_tensor.shape 47 | extracted_features = [] 48 | for start in range(0, t, strides): 49 | end = min(t - 1, start + strides) 50 | if end - start < strides: 51 | start = max(0, end - strides) 52 | ip = Variable(torch.from_numpy(image_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 53 | feature = model.extract_features(ip).data.cpu().numpy() 54 | extracted_features.append(feature) 55 | extracted_features = np.concatenate(extracted_features, axis=0) 56 | return extracted_features 57 | 58 | 59 | if not os.path.exists(args.video_dir): 60 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 61 | 62 | if not os.path.exists(args.images_dir): 63 | os.makedirs(args.images_dir) 64 | 65 | if not os.path.exists(args.save_dir): 66 | os.makedirs(args.save_dir) 67 | 68 | # create I3D model and load pre-trained model 69 | i3d_model = InceptionI3d(400, in_channels=3) 70 | i3d_model.load_state_dict(torch.load(args.load_model)) 71 | i3d_model.cuda() 72 | i3d_model.train(False) 73 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 74 | 75 | # extract images and features 76 | feature_shapes = dict() 77 | video_paths = glob.glob(os.path.join(args.video_dir, "*.{}".format(args.video_format))) 78 | for idx, video_path in enumerate(video_paths): 79 | video_id = os.path.basename(video_path)[0:-4] # remove suffix 80 | image_dir = os.path.join(args.images_dir, video_id) 81 | 82 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_paths), video_id), flush=True) 83 | 84 | if 
os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 85 | print("the visual features for video {} are exist in {}...\n".format(video_id, args.save_dir), flush=True) 86 | continue 87 | 88 | # extract images 89 | if os.path.exists(image_dir): 90 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 91 | else: 92 | os.makedirs(image_dir) 93 | print("extract images with fps={}...".format(args.fps), flush=True) 94 | if args.fps is None or args.fps <= 0: 95 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format( 96 | video_path, image_dir, video_id), shell=True) 97 | else: 98 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps={} {}/{}-%6d.jpg".format( 99 | video_path, args.fps, image_dir, video_id), shell=True) 100 | 101 | # process extracted images 102 | print("load RGB frames...", flush=True) 103 | num_frames = len(os.listdir(image_dir)) 104 | 105 | if num_frames < 10000: 106 | frames, raw_w, raw_h = load_images(image_dir, video_id, 1, num_frames) 107 | frames = np.asarray(frames, dtype=np.float32) 108 | imgs = video_transforms(frames) 109 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 110 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 111 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 112 | 113 | print("extract visual features...", flush=True) 114 | features = extract_features(img_tensor, i3d_model, args.strides) 115 | np.save(os.path.join(args.save_dir, video_id), arr=features) 116 | print("extracted features shape: {}".format(features.shape), flush=True) 117 | feature_shapes[video_id] = features.shape[0] 118 | 119 | else: 120 | all_features = [] 121 | for start_idx in range(1, num_frames, 10000): 122 | end_idx = min(start_idx + 10000, num_frames + 1) 123 | cur_num_frames = end_idx - start_idx 124 | if cur_num_frames < args.strides: 125 | cur_num_frames = args.strides 126 | start_idx = end_idx - cur_num_frames 127 | frames, raw_w, raw_h = load_images(image_dir, video_id, start_idx, cur_num_frames) 128 | frames = np.asarray(frames, dtype=np.float32) 129 | imgs = video_transforms(frames) 130 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 131 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 132 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 133 | print("extract visual features...", flush=True) 134 | features = extract_features(img_tensor, i3d_model, args.strides) 135 | all_features.append(features) 136 | all_features = np.concatenate(all_features, axis=0) 137 | np.save(os.path.join(args.save_dir, video_id), arr=all_features) 138 | print("extracted features shape: {}".format(all_features.shape), flush=True) 139 | feature_shapes[video_id] = all_features.shape[0] 140 | 141 | if args.remove_images: 142 | # remove extract images to release memory space 143 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 144 | 145 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 146 | json.dump(feature_shapes, f) 147 | -------------------------------------------------------------------------------- /prepare/extract_activitynet_org.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | from tqdm import tqdm 6 | from argparse 
import ArgumentParser 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--dataset_dir", type=str, required=True, help="dataset path") 10 | parser.add_argument("--hdf5_file", type=str, required=True, help="downloaded activitynet features") 11 | parser.add_argument("--save_dir", type=str, required=True, help="save dir") 12 | args = parser.parse_args() 13 | 14 | with open(os.path.join(args.dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 15 | train_data = json.load(f) 16 | with open(os.path.join(args.dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 17 | val_data = json.load(f) 18 | with open(os.path.join(args.dataset_dir, "val_2.json"), mode="r", encoding="utf-8") as f: 19 | test_data = json.load(f) 20 | 21 | video_ids = list(set(list(train_data.keys()) + list(val_data.keys()) + list(test_data.keys()))) 22 | print(video_ids) 23 | print(len(video_ids)) 24 | 25 | if not os.path.exists(args.save_dir): 26 | os.makedirs(args.save_dir) 27 | 28 | feature_shapes = dict() 29 | with h5py.File(args.hdf5_file, mode="r") as f: 30 | group_key = list(f.keys()) 31 | for key in tqdm(group_key, total=len(group_key), desc="extract features"): 32 | video_id = key 33 | if video_id not in video_ids: 34 | continue 35 | data = f[key]["c3d_features"][()] 36 | feature_shapes[video_id] = data.shape[0] 37 | np.save(os.path.join(args.save_dir, video_id), arr=data) 38 | 39 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 40 | json.dump(feature_shapes, f) 41 | -------------------------------------------------------------------------------- /prepare/extract_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | import torch 5 | import argparse 6 | import subprocess 7 | import numpy as np 8 | from . 
import videotransforms 9 | from .feature_extractor import InceptionI3d 10 | from torchvision import transforms 11 | from torch.autograd import Variable 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 15 | parser.add_argument("--use_finetuned", action="store_true", help="whether use fine-tuned feature extractor") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--dataset_dir", type=str, required=True, help="where are located the dataset files") 19 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 20 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 21 | parser.add_argument("--fps", type=int, default=24, help="frames per second") 22 | parser.add_argument("--video_format", type=str, default="mp4", help="video format") 23 | parser.add_argument("--strides", type=int, default=24, help="window size") 24 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 25 | args = parser.parse_args() 26 | 27 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 28 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 29 | 30 | 31 | if not os.path.exists(args.video_dir): 32 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 33 | 34 | if not os.path.exists(args.images_dir): 35 | os.makedirs(args.images_dir) 36 | 37 | if not os.path.exists(args.save_dir): 38 | os.makedirs(args.save_dir) 39 | 40 | # create I3D model and load pre-trained model 41 | i3d_model = InceptionI3d(400, in_channels=3) 42 | if args.use_fine_tuned: 43 | i3d_model.replace_logits(157) # charades has 157 activity types 44 | i3d_model.load_state_dict(torch.load(args.load_model)) 45 | i3d_model.cuda() 46 | i3d_model.train(False) 47 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 48 | 49 | # load video ids 50 | video_ids = [] 51 | for filename in ["charades_sta_train.txt", "charades_sta_test.txt"]: 52 | with open(os.path.join(args.dataset_dir, filename), mode="r", encoding="utf-8") as f: 53 | for line in f: 54 | line = line.lstrip().rstrip() 55 | if len(line) == 0: 56 | continue 57 | vid = line.split("##")[0].split(" ")[0] 58 | video_ids.append(vid) 59 | video_ids = list(set(video_ids)) 60 | 61 | # extract images and features 62 | feature_shapes = dict() 63 | for idx, video_id in enumerate(video_ids): 64 | video_path = os.path.join(args.video_dir, "{}.mp4".format(video_id)) 65 | image_dir = os.path.join(args.images_dir, video_id) 66 | 67 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_ids), video_id), flush=True) 68 | 69 | if os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 70 | print("the visual features for video {} are exist in {}...\n".format(video_id, args.save_dir), flush=True) 71 | continue 72 | 73 | # extract images 74 | if os.path.exists(image_dir): 75 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 76 | else: 77 | os.makedirs(image_dir) 78 | print("extract images with fps={}...".format(args.fps), flush=True) 79 | if args.fps is None or args.fps <= 0: 80 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format( 81 | video_path, image_dir, video_id), 
shell=True) 82 | else: 83 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps={} {}/{}-%6d.jpg".format( 84 | video_path, args.fps, image_dir, video_id), shell=True) 85 | 86 | # process extracted images 87 | print("load RGB frames...", flush=True) 88 | num_frames = len(os.listdir(image_dir)) 89 | frames, raw_w, raw_h = [], None, None 90 | for i in range(1, num_frames + 1): 91 | # cv2.imread() read image with BGR format by default, so we convert it to RGB format 92 | img = cv2.imread(os.path.join(image_dir, "{}-{}.jpg".format(video_id, str(i).zfill(6))))[:, :, [2, 1, 0]] 93 | w, h, c = img.shape 94 | raw_w, raw_h = w, h 95 | if w < 226 or h < 226: 96 | d = 226. - min(w, h) 97 | sc = 1 + d / min(w, h) 98 | img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc) 99 | img = (img / 255.) * 2 - 1 100 | frames.append(img) 101 | frames = np.asarray(frames, dtype=np.float32) 102 | imgs = video_transforms(frames) 103 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 104 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 105 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 106 | 107 | if args.remove_images: 108 | # remove extract images to release memory space 109 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 110 | 111 | print("extract visual visual features...", flush=True) 112 | b, c, t, h, w = img_tensor.shape 113 | features = [] 114 | for start in range(0, t, args.strides): 115 | end = min(t - 1, start + args.strides) 116 | if end - start < args.strides: 117 | start = max(0, end - args.strides) 118 | ip = Variable(torch.from_numpy(img_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 119 | feature = i3d_model.extract_features(ip).data.cpu().numpy() 120 | features.append(feature) 121 | features = np.concatenate(features, axis=0) 122 | np.save(os.path.join(args.save_dir, video_id), arr=features) 123 | print("extracted feature shape: {}\n".format(features.shape), flush=True) 124 | feature_shapes[video_id] = features.shape[0] 125 | 126 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 127 | json.dump(feature_shapes, f) 128 | -------------------------------------------------------------------------------- /prepare/extract_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import json 5 | import torch 6 | import argparse 7 | import subprocess 8 | import numpy as np 9 | from . 
import videotransforms 10 | from .feature_extractor import InceptionI3d 11 | from torchvision import transforms 12 | from torch.autograd import Variable 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 19 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 20 | parser.add_argument("--fps", type=float, default=None, help="frames per second") # TACoS's default fps is 29.4 21 | parser.add_argument("--video_format", type=str, default="avi", help="video format") 22 | parser.add_argument("--strides", type=int, default=16, help="window size") 23 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 24 | args = parser.parse_args() 25 | 26 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 27 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 28 | 29 | 30 | def load_images(img_dir, vid, start_frame, lengths): 31 | img_frames, raw_height, raw_width = [], None, None 32 | for x in range(start_frame, start_frame + lengths): 33 | image = cv2.imread(os.path.join(img_dir, "{}-{}.jpg".format(vid, str(x).zfill(6))))[:, :, [2, 1, 0]] 34 | width, height, channel = image.shape 35 | raw_width, raw_height = width, height 36 | # resize image 37 | scale = 1 + (224.0 - min(width, height)) / min(width, height) 38 | image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale) 39 | # normalize image to [0, 1] 40 | image = (image / 255.0) * 2 - 1 41 | img_frames.append(image) 42 | return img_frames, raw_width, raw_height 43 | 44 | 45 | def extract_features(image_tensor, model, strides): 46 | b, c, t, h, w = image_tensor.shape 47 | extracted_features = [] 48 | for start in range(0, t, strides): 49 | end = min(t - 1, start + strides) 50 | if end - start < strides: 51 | start = max(0, end - strides) 52 | ip = Variable(torch.from_numpy(image_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 53 | feature = model.extract_features(ip).data.cpu().numpy() 54 | extracted_features.append(feature) 55 | extracted_features = np.concatenate(extracted_features, axis=0) 56 | return extracted_features 57 | 58 | 59 | if not os.path.exists(args.video_dir): 60 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 61 | 62 | if not os.path.exists(args.images_dir): 63 | os.makedirs(args.images_dir) 64 | 65 | if not os.path.exists(args.save_dir): 66 | os.makedirs(args.save_dir) 67 | 68 | # create I3D model and load pre-trained model 69 | i3d_model = InceptionI3d(400, in_channels=3) 70 | i3d_model.load_state_dict(torch.load(args.load_model)) 71 | i3d_model.cuda() 72 | i3d_model.train(False) 73 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 74 | 75 | # extract images and features 76 | feature_shapes = dict() 77 | video_paths = glob.glob(os.path.join(args.video_dir, "*.{}".format(args.video_format))) 78 | for idx, video_path in enumerate(video_paths): 79 | video_id = os.path.basename(video_path)[0:-4] # remove suffix 80 | image_dir = os.path.join(args.images_dir, video_id) 81 | 82 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_paths), video_id), flush=True) 83 | 84 | if 
os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 85 | print("the visual features for video {} are exist in {}...".format(video_id, args.save_dir), flush=True) 86 | continue 87 | 88 | # extract images 89 | if os.path.exists(image_dir): 90 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 91 | else: 92 | os.makedirs(image_dir) 93 | print("extract images with fps={}...".format(args.fps), flush=True) 94 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format(video_path, image_dir, 95 | video_id), shell=True) 96 | 97 | # process extracted images 98 | print("load RGB frames...", flush=True) 99 | num_frames = len(os.listdir(image_dir)) 100 | 101 | if num_frames < 10000: 102 | frames, raw_w, raw_h = load_images(image_dir, video_id, 1, num_frames) 103 | frames = np.asarray(frames, dtype=np.float32) 104 | imgs = video_transforms(frames) 105 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 106 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 107 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 108 | 109 | print("extract visual features...", flush=True) 110 | features = extract_features(img_tensor, i3d_model, args.strides) 111 | np.save(os.path.join(args.save_dir, video_id), arr=features) 112 | print("extracted features shape: {}".format(features.shape), flush=True) 113 | feature_shapes[video_id] = features.shape[0] 114 | 115 | else: 116 | all_features = [] 117 | for start_idx in range(1, num_frames, 10000): 118 | end_idx = min(start_idx + 10000, num_frames + 1) 119 | cur_num_frames = end_idx - start_idx 120 | if cur_num_frames < args.strides: 121 | cur_num_frames = args.strides 122 | start_idx = end_idx - cur_num_frames 123 | frames, raw_w, raw_h = load_images(image_dir, video_id, start_idx, cur_num_frames) 124 | frames = np.asarray(frames, dtype=np.float32) 125 | imgs = video_transforms(frames) 126 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 127 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 128 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 129 | print("extract visual features...", flush=True) 130 | features = extract_features(img_tensor, i3d_model, args.strides) 131 | all_features.append(features) 132 | all_features = np.concatenate(all_features, axis=0) 133 | np.save(os.path.join(args.save_dir, video_id), arr=all_features) 134 | print("extracted features shape: {}".format(all_features.shape), flush=True) 135 | feature_shapes[video_id] = all_features.shape[0] 136 | 137 | if args.remove_images: 138 | # remove extract images to release memory space 139 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 140 | 141 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 142 | json.dump(feature_shapes, f) 143 | -------------------------------------------------------------------------------- /prepare/extract_tacos_org.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | # 1. step download pre-trained C3D features from https://github.com/jiyanggao/TALL 8 | # 2. 
convert the features 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--data_path", type=str, required=True, default="tacos dataset") 12 | parser.add_argument("--feature_path", type=str, required=True, help="pre-trained C3D features") 13 | parser.add_argument("--save_dir", type=str, required=True, help="extracted feature save path") 14 | parser.add_argument("--sample_rate", type=int, default=64, help="sample rate [64 | 128 | 256 | 512]") 15 | args = parser.parse_args() 16 | 17 | stride = args.sample_rate // 5 # due to 0.8 overlap of the pre-trained C3D features 18 | 19 | if not os.path.exists(args.save_dir): 20 | os.makedirs(args.save_dir) 21 | 22 | with open(os.path.join(args.data_path, "train.json"), mode="r", encoding="utf-8") as f: 23 | dataset = json.load(f) 24 | with open(os.path.join(args.data_path, "val.json"), mode="r", encoding="utf-8") as f: 25 | dataset.update(json.load(f)) 26 | with open(os.path.join(args.data_path, "test.json"), mode="r", encoding="utf-8") as f: 27 | dataset.update(json.load(f)) 28 | 29 | feature_shapes = dict() 30 | for video_id, annotations in tqdm(dataset.items(), total=len(dataset), desc=""): 31 | video_features = [] 32 | num_frames = annotations["num_frames"] - 16 # trick from 2D-TAN 33 | for idx in range(0, (num_frames - args.sample_rate) // stride + 1): 34 | s_idx = idx * stride + 1 35 | e_idx = s_idx + args.sample_rate 36 | feature_path = os.path.join(args.feature_path, "{}_{}_{}.npy".format(video_id, s_idx, e_idx)) 37 | feature = np.load(feature_path) 38 | video_features.append(feature) 39 | video_features = np.stack(video_features, axis=0) 40 | np.save(os.path.join(args.save_dir, video_id), arr=video_features) 41 | feature_shapes[video_id] = video_features.shape[0] 42 | 43 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 44 | json.dump(feature_shapes, f) 45 | -------------------------------------------------------------------------------- /prepare/feature_extractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloaded from https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py 3 | Minor modification are applied to fit our requirements 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class MaxPool3dSamePadding(nn.MaxPool3d): 11 | 12 | def compute_pad(self, dim, s): 13 | if s % self.stride[dim] == 0: 14 | return max(self.kernel_size[dim] - self.stride[dim], 0) 15 | else: 16 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 17 | 18 | def forward(self, x): 19 | # compute 'same' padding 20 | (batch, channel, t, h, w) = x.size() 21 | pad_t = self.compute_pad(0, t) 22 | pad_h = self.compute_pad(1, h) 23 | pad_w = self.compute_pad(2, w) 24 | 25 | pad_t_f = pad_t // 2 26 | pad_t_b = pad_t - pad_t_f 27 | pad_h_f = pad_h // 2 28 | pad_h_b = pad_h - pad_h_f 29 | pad_w_f = pad_w // 2 30 | pad_w_b = pad_w - pad_w_f 31 | 32 | pad = [pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b] 33 | x = F.pad(x, pad) 34 | return super(MaxPool3dSamePadding, self).forward(x) 35 | 36 | 37 | class Unit3D(nn.Module): 38 | 39 | def __init__(self, in_channels, 40 | output_channels, 41 | kernel_shape=(1, 1, 1), 42 | stride=(1, 1, 1), 43 | padding=0, 44 | activation_fn=None, 45 | use_batch_norm=True, 46 | use_bias=False, 47 | name='unit_3d'): 48 | 49 | """Initializes Unit3D module.""" 50 | super(Unit3D, self).__init__() 51 | 52 | self._output_channels = output_channels 53 | 
self._kernel_shape = kernel_shape 54 | self._stride = stride 55 | self._use_batch_norm = use_batch_norm 56 | self._activation_fn = activation_fn 57 | self._use_bias = use_bias 58 | self.name = name 59 | self.padding = padding 60 | 61 | self.conv3d = nn.Conv3d(in_channels=in_channels, 62 | out_channels=self._output_channels, 63 | kernel_size=self._kernel_shape, 64 | stride=self._stride, 65 | padding=0, 66 | # we always want padding to be 0 here. We will dynamically pad based on input size 67 | # in forward function 68 | bias=self._use_bias) 69 | 70 | if self._use_batch_norm: 71 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 72 | 73 | def compute_pad(self, dim, s): 74 | if s % self._stride[dim] == 0: 75 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 76 | else: 77 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 78 | 79 | def forward(self, x): 80 | # compute 'same' padding 81 | (batch, channel, t, h, w) = x.size() 82 | pad_t = self.compute_pad(0, t) 83 | pad_h = self.compute_pad(1, h) 84 | pad_w = self.compute_pad(2, w) 85 | 86 | pad_t_f = pad_t // 2 87 | pad_t_b = pad_t - pad_t_f 88 | pad_h_f = pad_h // 2 89 | pad_h_b = pad_h - pad_h_f 90 | pad_w_f = pad_w // 2 91 | pad_w_b = pad_w - pad_w_f 92 | 93 | pad = [pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b] 94 | x = F.pad(x, pad) 95 | 96 | x = self.conv3d(x) 97 | if self._use_batch_norm: 98 | x = self.bn(x) 99 | if self._activation_fn is not None: 100 | x = self._activation_fn(x) 101 | return x 102 | 103 | 104 | class InceptionModule(nn.Module): 105 | def __init__(self, in_channels, out_channels, name): 106 | super(InceptionModule, self).__init__() 107 | 108 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 109 | activation_fn=F.relu, name=name + '/Branch_0/Conv3d_0a_1x1') 110 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 111 | activation_fn=F.relu, name=name + '/Branch_1/Conv3d_0a_1x1') 112 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], 113 | activation_fn=F.relu, name=name + '/Branch_1/Conv3d_0b_3x3') 114 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 115 | activation_fn=F.relu, name=name + '/Branch_2/Conv3d_0a_1x1') 116 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], 117 | activation_fn=F.relu, name=name + '/Branch_2/Conv3d_0b_3x3') 118 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], 119 | stride=(1, 1, 1), padding=0) 120 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, 121 | activation_fn=F.relu, name=name + '/Branch_3/Conv3d_0b_1x1') 122 | self.name = name 123 | 124 | def forward(self, x): 125 | b0 = self.b0(x) 126 | b1 = self.b1b(self.b1a(x)) 127 | b2 = self.b2b(self.b2a(x)) 128 | b3 = self.b3b(self.b3a(x)) 129 | return torch.cat([b0, b1, b2, b3], dim=1) 130 | 131 | 132 | class InceptionI3d(nn.Module): 133 | """Inception-v1 I3D architecture. 134 | The model is introduced in: 135 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 136 | Joao Carreira, Andrew Zisserman 137 | https://arxiv.org/pdf/1705.07750v1.pdf. 
138 | See also the Inception architecture, introduced in: 139 | Going deeper with convolutions 140 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 141 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 142 | http://arxiv.org/pdf/1409.4842v1.pdf. 143 | """ 144 | 145 | # Endpoints of the model in order. During construction, all the endpoints up 146 | # to a designated `final_endpoint` are returned in a dictionary as the 147 | # second return value. 148 | VALID_ENDPOINTS = ( 149 | 'Conv3d_1a_7x7', 150 | 'MaxPool3d_2a_3x3', 151 | 'Conv3d_2b_1x1', 152 | 'Conv3d_2c_3x3', 153 | 'MaxPool3d_3a_3x3', 154 | 'Mixed_3b', 155 | 'Mixed_3c', 156 | 'MaxPool3d_4a_3x3', 157 | 'Mixed_4b', 158 | 'Mixed_4c', 159 | 'Mixed_4d', 160 | 'Mixed_4e', 161 | 'Mixed_4f', 162 | 'MaxPool3d_5a_2x2', 163 | 'Mixed_5b', 164 | 'Mixed_5c', 165 | 'Logits', 166 | 'Predictions', 167 | ) 168 | 169 | def __init__(self, num_classes=400, spatial_squeeze=True, 170 | final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5): 171 | """Initializes I3D model instance. 172 | Args: 173 | num_classes: The number of outputs in the logit layer (default 400, which 174 | matches the Kinetics dataset). 175 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 176 | before returning (default True). 177 | final_endpoint: The model contains many possible endpoints. 178 | `final_endpoint` specifies the last endpoint for the model to be built 179 | up to. In addition to the output at `final_endpoint`, all the outputs 180 | at endpoints up to `final_endpoint` will also be returned, in a 181 | dictionary. `final_endpoint` must be one of 182 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 183 | name: A string (optional). The name of this module. 184 | Raises: 185 | ValueError: if `final_endpoint` is not recognized. 
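        Example (illustrative; assumes a single 64-frame RGB clip):
            >>> import torch
            >>> model = InceptionI3d(num_classes=400, in_channels=3)
            >>> clip = torch.randn(1, 3, 64, 224, 224)  # [batch, channels, time, height, width]
            >>> feats = model.extract_features(clip)  # -> [time', 1024] pooled features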
186 | """ 187 | 188 | if final_endpoint not in self.VALID_ENDPOINTS: 189 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 190 | 191 | super(InceptionI3d, self).__init__() 192 | self._num_classes = num_classes 193 | self._spatial_squeeze = spatial_squeeze 194 | self._final_endpoint = final_endpoint 195 | self.logits = None 196 | 197 | if self._final_endpoint not in self.VALID_ENDPOINTS: 198 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 199 | 200 | self.end_points = {} 201 | end_point = 'Conv3d_1a_7x7' 202 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], 203 | activation_fn=F.relu, stride=(2, 2, 2), padding=3, # padding=(3, 3, 3), 204 | name=name + end_point) 205 | if self._final_endpoint == end_point: 206 | return 207 | 208 | end_point = 'MaxPool3d_2a_3x3' 209 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 210 | padding=0) 211 | if self._final_endpoint == end_point: 212 | return 213 | 214 | end_point = 'Conv3d_2b_1x1' 215 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, 216 | activation_fn=F.relu, name=name + end_point) 217 | if self._final_endpoint == end_point: 218 | return 219 | 220 | end_point = 'Conv3d_2c_3x3' 221 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, 222 | activation_fn=F.relu, name=name + end_point) 223 | if self._final_endpoint == end_point: 224 | return 225 | 226 | end_point = 'MaxPool3d_3a_3x3' 227 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 228 | padding=0) 229 | if self._final_endpoint == end_point: 230 | return 231 | 232 | end_point = 'Mixed_3b' 233 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 234 | if self._final_endpoint == end_point: 235 | return 236 | 237 | end_point = 'Mixed_3c' 238 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 239 | if self._final_endpoint == end_point: 240 | return 241 | 242 | end_point = 'MaxPool3d_4a_3x3' 243 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), 244 | padding=0) 245 | if self._final_endpoint == end_point: 246 | return 247 | 248 | end_point = 'Mixed_4b' 249 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 250 | if self._final_endpoint == end_point: 251 | return 252 | 253 | end_point = 'Mixed_4c' 254 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 255 | if self._final_endpoint == end_point: 256 | return 257 | 258 | end_point = 'Mixed_4d' 259 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 260 | if self._final_endpoint == end_point: 261 | return 262 | 263 | end_point = 'Mixed_4e' 264 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 265 | if self._final_endpoint == end_point: 266 | return 267 | 268 | end_point = 'Mixed_4f' 269 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], 270 | name + end_point) 271 | if self._final_endpoint == end_point: 272 | return 273 | 274 | end_point = 'MaxPool3d_5a_2x2' 275 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 
2], stride=(2, 2, 2), 276 | padding=0) 277 | if self._final_endpoint == end_point: 278 | return 279 | 280 | end_point = 'Mixed_5b' 281 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], 282 | name + end_point) 283 | if self._final_endpoint == end_point: 284 | return 285 | 286 | end_point = 'Mixed_5c' 287 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], 288 | name + end_point) 289 | if self._final_endpoint == end_point: 290 | return 291 | 292 | # end_point = 'Logits' 293 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) 294 | self.dropout = nn.Dropout(dropout_keep_prob) 295 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 296 | kernel_shape=[1, 1, 1], 297 | padding=0, 298 | use_batch_norm=False, 299 | use_bias=True, 300 | name='logits') 301 | 302 | self.build() 303 | 304 | def replace_logits(self, num_classes): 305 | self._num_classes = num_classes 306 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 307 | kernel_shape=[1, 1, 1], 308 | padding=0, 309 | use_batch_norm=False, 310 | use_bias=True, 311 | name='logits') 312 | 313 | def build(self): 314 | for k in self.end_points.keys(): 315 | self.add_module(k, self.end_points[k]) 316 | 317 | def forward(self, x): 318 | for end_point in self.VALID_ENDPOINTS: 319 | if end_point in self.end_points: 320 | x = self._modules[end_point](x) # use _modules to work with data parallel 321 | x = self.avg_pool(x) 322 | logits = self.logits(self.dropout(x)) 323 | if self._spatial_squeeze: 324 | logits = logits.squeeze(3).squeeze(3) # [batch, classes, time, 1, 1] -> [batch, classes, time] 325 | # logits is batch X time X classes, which is what we want to work with 326 | return logits 327 | 328 | def extract_features(self, x): 329 | for end_point in self.VALID_ENDPOINTS: 330 | if end_point in self.end_points: 331 | x = self._modules[end_point](x) 332 | # x = [batch_size, channels, time, height, width] 333 | x = self.avg_pool(x) # 384 + 384 + 128 + 128 = 1024 334 | x = x.squeeze(0).permute(1, 2, 3, 0) # x = [time, height, width, channels] 335 | x = x.squeeze(1).squeeze(1) # x = [time, channels] 336 | return x 337 | -------------------------------------------------------------------------------- /prepare/videotransforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numbers 3 | import random 4 | 5 | 6 | class RandomCrop(object): 7 | """Crop the given video sequences (t x h x w) at a random location. 8 | Args: 9 | size (sequence or int): Desired output size of the crop. If size is an 10 | int instead of sequence like (h, w), a square crop (size, size) is 11 | made. 12 | """ 13 | 14 | def __init__(self, size): 15 | if isinstance(size, numbers.Number): 16 | self.size = (size, size) 17 | else: 18 | self.size = size 19 | 20 | @staticmethod 21 | def get_params(img, output_size): 22 | """Get parameters for ``crop`` for a random crop. 23 | Args: 24 | img (numpy array): Video clip of shape (t, h, w, c) to be cropped. 25 | output_size (tuple): Expected output size of the crop. 26 | Returns: 27 | tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
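        Example (illustrative): for a clip of shape (t, 240, 320, 3) and output_size (224, 224),
        this returns (i, j, 224, 224) with 0 <= i <= 16 and 0 <= j <= 96.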
28 | """ 29 | t, h, w, c = img.shape 30 | th, tw = output_size 31 | if w == tw and h == th: 32 | return 0, 0, h, w 33 | 34 | i = random.randint(0, h - th) if h != th else 0 35 | j = random.randint(0, w - tw) if w != tw else 0 36 | return i, j, th, tw 37 | 38 | def __call__(self, imgs): 39 | 40 | i, j, h, w = self.get_params(imgs, self.size) 41 | 42 | imgs = imgs[:, i:i + h, j:j + w, :] 43 | return imgs 44 | 45 | def __repr__(self): 46 | return self.__class__.__name__ + '(size={0})'.format(self.size) 47 | 48 | 49 | class CenterCrop(object): 50 | """Crops the given seq Images at the center. 51 | Args: 52 | size (sequence or int): Desired output size of the crop. If size is an 53 | int instead of sequence like (h, w), a square crop (size, size) is 54 | made. 55 | """ 56 | 57 | def __init__(self, size): 58 | if isinstance(size, numbers.Number): 59 | self.size = (size, size) 60 | else: 61 | self.size = size 62 | 63 | def __call__(self, imgs): 64 | """ 65 | Args: 66 | imgs (PIL Image): Image to be cropped. 67 | Returns: 68 | PIL Image: Cropped image. 69 | """ 70 | t, h, w, c = imgs.shape 71 | th, tw = self.size 72 | i = int(np.round((h - th) / 2.)) 73 | j = int(np.round((w - tw) / 2.)) 74 | 75 | return imgs[:, i:i + th, j:j + tw, :] 76 | 77 | def __repr__(self): 78 | return self.__class__.__name__ + '(size={0})'.format(self.size) 79 | 80 | 81 | class RandomHorizontalFlip(object): 82 | """Horizontally flip the given seq Images randomly with a given probability. 83 | Args: 84 | p (float): probability of the image being flipped. Default value is 0.5 85 | """ 86 | 87 | def __init__(self, p=0.5): 88 | self.p = p 89 | 90 | def __call__(self, imgs): 91 | """ 92 | Args: 93 | imgs (seq Images): seq Images to be flipped. 94 | Returns: 95 | seq Images: Randomly flipped seq images. 
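    Note: the flip in __call__ below is applied along the width axis (axis=2 of the t x h x w x c clip array).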
96 | """ 97 | if random.random() < self.p: 98 | # t x h x w 99 | return np.flip(imgs, axis=2).copy() 100 | return imgs 101 | 102 | def __repr__(self): 103 | return self.__class__.__name__ + '(p={})'.format(self.p) 104 | -------------------------------------------------------------------------------- /run_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | from argparse import ArgumentParser 7 | from models.LPNet import LPNet 8 | from utils.prepro_activitynet import prepro_activitynet 9 | from utils.data_utils import load_video_features, load_json, write_json, batch_iter 10 | from utils.runner_utils import write_tf_summary, eval_test, get_feed_dict 11 | import json 12 | 13 | parser = ArgumentParser() 14 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 15 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 16 | parser.add_argument("--mode", type=str, default="train", help="prepro | train | test") 17 | parser.add_argument("--feature", type=str, default='new', help="[new | org]") 18 | parser.add_argument("--root", type=str, default='data/ActivityNet', help="root directory for store raw data") 19 | parser.add_argument("--wordvec_path", type=str, default='data/glove.840B.300d.txt', help="glove word embedding path") 20 | parser.add_argument("--home_dir", type=str, default=None, help="home directory for saving models") 21 | parser.add_argument("--save_dir", type=str, default=None, help="directory for saving processed dataset") 22 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 23 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 24 | parser.add_argument("--epochs", type=int, default=100, help="number of epochs") 25 | parser.add_argument("--batch_size", type=int, default=16, help="batch size") 26 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 27 | parser.add_argument("--video_feature_dim", type=int, default=1024, help="video feature input dimension") 28 | parser.add_argument("--char_dim", type=int, default=100, help="character dimension") 29 | parser.add_argument("--hidden_size", type=int, default=256, help="hidden size") 30 | parser.add_argument("--max_position_length", type=int, default=512, help="max position length") 31 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 32 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 33 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 34 | parser.add_argument("--drop_rate", type=float, default=0.1, help="dropout rate") 35 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 36 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 37 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 38 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 39 | parser.add_argument("--eval_period", type=int, default=37421, help="evaluation period") 40 | configs = parser.parse_args() 41 | 42 | # os environment 43 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 44 | os.environ["CUDA_VISIBLE_DEVICES"] = configs.gpu_idx 45 | 46 | np.random.seed(configs.seed) 47 
| tf.set_random_seed(configs.seed) 48 | tf.random.set_random_seed(configs.seed) 49 | 50 | class MyEncoder(json.JSONEncoder): 51 | def default(self, obj): 52 | if isinstance(obj, np.integer): 53 | return int(obj) 54 | elif isinstance(obj, np.floating): 55 | return float(obj) 56 | elif isinstance(obj, np.ndarray): 57 | return obj.tolist() 58 | if isinstance(obj, time): 59 | return obj.__str__() 60 | else: 61 | return super(NpEncoder, self).default(obj) 62 | 63 | # specify the dataset directory 64 | if configs.home_dir is None: 65 | configs.home_dir = "ckpt/activitynet_{}_{}".format(configs.feature, configs.max_position_length) 66 | configs.save_dir = "datasets/activitynet_{}/{}".format(configs.feature, configs.max_position_length) 67 | configs.video_feature_dim = 1024 if configs.feature == "new" else 500 68 | 69 | if configs.mode.lower() == "prepro": 70 | prepro_activitynet(configs) 71 | 72 | elif configs.mode.lower() == "train": 73 | video_feature_path = os.path.join(configs.root, "activitynet_features_{}".format(configs.feature)) 74 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 75 | 76 | train_set = load_json(os.path.join(configs.save_dir, "train_set.json")) 77 | test_set = load_json(os.path.join(configs.save_dir, "test2_set.json")) 78 | num_train_batches = math.ceil(len(train_set) / configs.batch_size) 79 | 80 | if configs.num_train_steps is None: 81 | configs.num_train_steps = num_train_batches * configs.epochs 82 | if configs.char_size is None: 83 | configs.char_size = len(load_json(os.path.join(configs.save_dir, "char_dict.json"))) 84 | 85 | log_dir = os.path.join(configs.home_dir, "event") 86 | model_dir = os.path.join(configs.home_dir, "model") 87 | if not os.path.exists(model_dir): 88 | os.makedirs(model_dir) 89 | if not os.path.exists(log_dir): 90 | os.makedirs(log_dir) 91 | 92 | # write configs to json file 93 | write_json(vars(configs), save_path=os.path.join(model_dir, "configs.json"), pretty=True) 94 | 95 | with tf.Graph().as_default() as graph: 96 | model = LPNet(configs, graph=graph) 97 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 98 | sess_config.gpu_options.allow_growth = True 99 | 100 | with tf.Session(config=sess_config) as sess: 101 | learning_rate = tf.train.exponential_decay(learning_rate=configs.init_lr, global_step=model.global_step, decay_steps=100000, decay_rate=0.9,staircase=True) 102 | 103 | optimizer = tf.train.AdamOptimizer(learning_rate, 104 | beta1=0.9, 105 | beta2=0.999, 106 | name='AdamOptimizer') 107 | # train_op = optimizer.minimize(model.my_loss, global_step=model.global_step) 108 | trainable_vars = tf.trainable_variables() 109 | freeze_bbox_var_list = [ 110 | t for t in trainable_vars 111 | if not t.name.startswith(u'proposal_box') 112 | ] 113 | bbox_var_list = [ 114 | t for t in trainable_vars if t.name.startswith(u'proposal_box') 115 | ] 116 | train_op1 = optimizer.minimize(model.reg_loss, 117 | global_step=model.global_step, 118 | var_list=freeze_bbox_var_list) 119 | train_op2 = optimizer.minimize(model.my_loss, 120 | global_step=model.global_step, 121 | var_list=bbox_var_list) 122 | 123 | saver = tf.train.Saver(max_to_keep=5) 124 | writer = tf.summary.FileWriter(log_dir) 125 | sess.run(tf.global_variables_initializer()) 126 | 127 | best_r1i7 = -1.0 128 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 129 | l = 0 130 | r = 0 131 | o = 0 132 | for epoch in range(configs.epochs): 133 | for data in 
tqdm(batch_iter(train_set, video_features, configs.batch_size, configs.extend, True, True), 134 | total=num_train_batches, desc="Epoch %d / %d" % (epoch + 1, configs.epochs)): 135 | 136 | # run the model 137 | feed_dict = get_feed_dict(data, model, configs.drop_rate) 138 | 139 | _, _, loss, rloss, iloss, lloss, global_step = sess.run([train_op1, train_op2, model.my_loss, model.reg_loss, model.iou_loss, model.l1_loss, model.global_step], feed_dict=feed_dict) 140 | 141 | if global_step % configs.period == 0: 142 | # write_tf_summary(writer, [("train/my_loss", loss)], global_step) 143 | write_tf_summary(writer, [("train/my_loss", loss), 144 | ("train/reg_loss", rloss), 145 | ("train/iou_loss", iloss), 146 | ("train/l1_loss", lloss)], 147 | global_step) 148 | # evaluate 149 | # if global_step % configs.eval_period == 0 or global_step % num_train_batches == 0: 150 | if (global_step/2+1) % num_train_batches == 0: 151 | 152 | r1i3, r1i5, r1i7, mi, value_pairs, score_str = eval_test( 153 | sess=sess, model=model, dataset=test_set, video_features=video_features, 154 | configs=configs, epoch=epoch + 1, global_step=global_step, name="test") 155 | 156 | write_tf_summary(writer, value_pairs, global_step) 157 | score_writer.write(score_str) 158 | score_writer.flush() 159 | 160 | # save the model according to the result of Rank@1, IoU=0.7 161 | if r1i7 > best_r1i7: 162 | best_r1i7 = r1i7 163 | filename = os.path.join(model_dir, "model_{}.ckpt".format(global_step)) 164 | saver.save(sess, filename) 165 | 166 | score_writer.close() 167 | 168 | elif configs.mode.lower() == "test": 169 | 170 | # load previous configs 171 | model_dir = os.path.join(configs.home_dir, "model") 172 | pre_configs = load_json(os.path.join(model_dir, "configs.json")) 173 | parser.set_defaults(**pre_configs) 174 | configs = parser.parse_args() 175 | 176 | # load video features 177 | video_feature_path = os.path.join(configs.root, "activitynet_features_{}".format(configs.feature)) 178 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 179 | 180 | # load test dataset 181 | test_set = load_json(os.path.join(configs.save_dir, "test2_set.json")) 182 | 183 | # restore model and evaluate 184 | with tf.Graph().as_default() as graph: 185 | model = LPNet(configs, graph=graph) 186 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 187 | sess_config.gpu_options.allow_growth = True 188 | 189 | with tf.Session(config=sess_config) as sess: 190 | saver = tf.train.Saver() 191 | sess.run(tf.global_variables_initializer()) 192 | saver.restore(sess, tf.train.latest_checkpoint(model_dir)) 193 | 194 | r1i3, r1i5, r1i7, mi, *_ = eval_test(sess, model, dataset=test_set, video_features=video_features, 195 | configs=configs, name="test") 196 | 197 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 198 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 199 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 200 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi[0]) + "\x1b[0m", flush=True) 201 | 202 | else: 203 | raise ValueError("Unknown mode {}!!!".format(configs.mode)) 204 | -------------------------------------------------------------------------------- /run_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import tensorflow as tf 5 | 
from tensorflow.python.keras.backend import learning_phase 6 | from tqdm import tqdm 7 | from argparse import ArgumentParser 8 | from models.LPNet import LPNet 9 | from utils.prepro_charades import prepro_charades 10 | from utils.data_utils import load_video_features, load_json, write_json, batch_iter 11 | from utils.runner_utils import write_tf_summary, eval_test, get_feed_dict 12 | import json 13 | import scipy.signal as signal 14 | 15 | parser = ArgumentParser() 16 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 17 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 18 | parser.add_argument("--mode", type=str, default="train", help="prepro | train | test") 19 | parser.add_argument("--feature", type=str, default='c3d', help="[finetune | raw]") 20 | parser.add_argument("--root", type=str, default='data/Charades', help="root directory for store raw data") 21 | parser.add_argument("--wordvec_path", type=str, default='data/glove.840B.300d.txt', help="glove word embedding path") 22 | parser.add_argument("--home_dir", type=str, default=None, help="home directory for saving models") 23 | parser.add_argument("--save_dir", type=str, default=None, help="directory for saving processed dataset") 24 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 25 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 26 | parser.add_argument("--epochs", type=int, default=200, help="number of epochs") 27 | parser.add_argument("--batch_size", type=int, default=32, help="batch size") 28 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 29 | parser.add_argument("--video_feature_dim", type=int, default=500, help="video feature input dimension") 30 | parser.add_argument("--char_dim", type=int, default=50, help="character dimension") 31 | parser.add_argument("--hidden_size", type=int, default=256, help="hidden size") 32 | parser.add_argument("--max_position_length", type=int, default=256, help="max position length") 33 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 34 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 35 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 36 | parser.add_argument("--drop_rate", type=float, default=0.1, help="dropout rate") 37 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 38 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 39 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 40 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 41 | parser.add_argument("--eval_period", type=int, default=1000, help="evaluation period") 42 | configs = parser.parse_args() 43 | 44 | # os environment 45 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 46 | os.environ["CUDA_VISIBLE_DEVICES"] = configs.gpu_idx 47 | 48 | np.random.seed(configs.seed) 49 | tf.set_random_seed(configs.seed) 50 | tf.random.set_random_seed(configs.seed) 51 | 52 | # specify the dataset and model directory 53 | if configs.home_dir is None: 54 | configs.home_dir = "ckpt/charades_{}_{}".format(configs.feature, configs.max_position_length) 55 | configs.save_dir = "datasets/charades_{}/{}".format(configs.feature, configs.max_position_length) 56 | 57 | class 
MyEncoder(json.JSONEncoder): 58 | def default(self, obj): 59 | if isinstance(obj, np.integer): 60 | return int(obj) 61 | elif isinstance(obj, np.floating): 62 | return float(obj) 63 | elif isinstance(obj, np.ndarray): 64 | return obj.tolist() 65 | if isinstance(obj, time): 66 | return obj.__str__() 67 | else: 68 | return super(NpEncoder, self).default(obj) 69 | 70 | if configs.mode.lower() == "prepro": 71 | prepro_charades(configs) 72 | 73 | elif configs.mode.lower() == "train": 74 | video_feature_path = os.path.join(configs.root, "charades_features_{}".format(configs.feature)) 75 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 76 | train_set = load_json(os.path.join(configs.save_dir, "train_set.json")) 77 | test_set = load_json(os.path.join(configs.save_dir, "test_set.json")) 78 | # demo_set =load_json(os.path.join(configs.save_dir, "demo_set.json")) 79 | num_train_batches = math.ceil(len(train_set) / configs.batch_size) 80 | 81 | if configs.num_train_steps is None: 82 | configs.num_train_steps = num_train_batches * configs.epochs 83 | if configs.char_size is None: 84 | configs.char_size = len(load_json(os.path.join(configs.save_dir, "char_dict.json"))) 85 | 86 | log_dir = os.path.join(configs.home_dir, "event") 87 | model_dir = os.path.join(configs.home_dir, "model") 88 | if not os.path.exists(model_dir): 89 | os.makedirs(model_dir) 90 | if not os.path.exists(log_dir): 91 | os.makedirs(log_dir) 92 | 93 | # write configs to json file 94 | write_json(vars(configs), save_path=os.path.join(model_dir, "configs.json"), pretty=True) 95 | 96 | with tf.Graph().as_default() as graph: 97 | model = LPNet(configs, graph=graph) 98 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 99 | sess_config.gpu_options.allow_growth = True 100 | 101 | 102 | 103 | with tf.Session(config=sess_config) as sess: 104 | learning_rate = tf.train.exponential_decay(learning_rate=configs.init_lr, global_step=model.global_step, decay_steps=100000, decay_rate=0.9, staircase=True) 105 | optimizer = tf.train.AdamOptimizer(learning_rate, 106 | beta1=0.9, 107 | beta2=0.999, 108 | name='AdamOptimizer') 109 | 110 | trainable_vars = tf.trainable_variables() 111 | freeze_bbox_var_list = [t for t in trainable_vars if not t.name.startswith(u'proposal_box')] 112 | bbox_var_list = [t for t in trainable_vars if t.name.startswith(u'proposal_box')] 113 | # print(freeze_bbox_var_list) 114 | train_op1 = optimizer.minimize(model.reg_loss, global_step=model.global_step, var_list=freeze_bbox_var_list) 115 | train_op2 = optimizer.minimize(model.my_loss, var_list=bbox_var_list) 116 | 117 | writer = tf.summary.FileWriter(log_dir) 118 | sess.run(tf.global_variables_initializer()) 119 | saver_all = tf.train.Saver(max_to_keep=5) 120 | 121 | best_r1i7 = -1.0 122 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 123 | l = 0 124 | r = 0 125 | o = 0 126 | for epoch in range(configs.epochs): 127 | for data in tqdm(batch_iter(train_set, video_features, configs.batch_size, configs.extend, train=True, shuffle=True), 128 | total=num_train_batches, desc="Epoch %d / %d" % (epoch + 1, configs.epochs)): 129 | 130 | # run the model 131 | feed_dict = get_feed_dict(data, model, configs.drop_rate) 132 | 133 | _, _, train, hloss, lloss, loss, rloss, iloss, l1loss, global_step = sess.run([train_op1, train_op2, model.train, model.highlight_loss, model.loss, model.my_loss, model.reg_loss, model.iou_loss, model.l1_loss, 
model.global_step], feed_dict=feed_dict) 134 | # print(train) 135 | if global_step % configs.period == 0: 136 | write_tf_summary(writer, [("train/my_loss", loss),("train/reg_loss", rloss),("train/iou_loss", iloss),("train/l1_loss", l1loss),("train/loss", lloss),("train/highlight_loss", hloss)], global_step) 137 | 138 | if (global_step + 1)% num_train_batches == 0: 139 | # if global_step % 800 == 0: 140 | lr = sess.run(learning_rate) 141 | print(lr) 142 | r1i3, r1i5, r1i7, mi, value_pairs, score_str = eval_test( 143 | sess=sess, model=model, dataset=test_set, video_features=video_features, 144 | configs=configs, epoch=epoch + 1, global_step=global_step, name="test") 145 | 146 | write_tf_summary(writer, value_pairs, global_step) 147 | score_writer.write(score_str) 148 | score_writer.flush() 149 | 150 | #save the model according to the result of Rank@1, IoU=0.7 151 | if r1i7 > best_r1i7: 152 | best_r1i7 = r1i7 153 | filename = os.path.join(model_dir, "model_{}.ckpt".format(global_step)) 154 | saver_all.save(sess, filename) 155 | 156 | print(iloss) 157 | score_writer.close() 158 | 159 | elif configs.mode.lower() == "test": 160 | 161 | # load previous configs 162 | model_dir = os.path.join(configs.home_dir, "model") 163 | pre_configs = load_json(os.path.join(model_dir, "configs.json")) 164 | parser.set_defaults(**pre_configs) 165 | configs = parser.parse_args() 166 | 167 | # load video features 168 | video_feature_path = os.path.join(configs.root, "charades_features_{}".format(configs.feature)) 169 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 170 | 171 | # load test dataset 172 | test_set = load_json(os.path.join(configs.save_dir, "test_set.json")) 173 | # restore model and evaluate 174 | with tf.Graph().as_default() as graph: 175 | model = LPNet(configs, graph=graph) 176 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 177 | sess_config.gpu_options.allow_growth = True 178 | 179 | with tf.Session(config=sess_config) as sess: 180 | saver = tf.train.Saver() 181 | sess.run(tf.global_variables_initializer()) 182 | # saver.restore(sess, tf.train.latest_checkpoint(model_dir)) 183 | ## print(tf.train.latest_checkpoint(model_dir)) 184 | saver.restore( 185 | sess, tf.train.latest_checkpoint(model_dir)) 186 | 187 | r1i3, r1i5, r1i7, mi, *_ = eval_test( 188 | sess, 189 | model, 190 | dataset=test_set, 191 | video_features=video_features, 192 | configs=configs, 193 | name="test") 194 | 195 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 196 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 197 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 198 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi[0]) + "\x1b[0m", flush=True) 199 | 200 | else: 201 | raise ValueError("Unknown mode {}!!!".format(configs.mode)) 202 | -------------------------------------------------------------------------------- /run_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | from argparse import ArgumentParser 7 | from models.LPNet import LPNet 8 | from utils.prepro_tacos import prepro_tacos 9 | from utils.data_utils import load_video_features, load_json, write_json, batch_iter 10 | from utils.runner_utils import write_tf_summary, eval_test, 
get_feed_dict 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 14 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 15 | parser.add_argument("--mode", type=str, default="train", help="prepro | train | test") 16 | parser.add_argument("--feature", type=str, default="org", help="[new | org], org: the visual feature from Gao et al.") 17 | parser.add_argument("--root", type=str, default='data/TACoS', help="root directory for store raw data") 18 | parser.add_argument("--wordvec_path", type=str, default="data/glove.840B.300d.txt", help="glove word embedding path") 19 | parser.add_argument("--home_dir", type=str, default=None, help="home directory for saving models") 20 | parser.add_argument("--save_dir", type=str, default=None, help="directory for saving processed dataset") 21 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 22 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 23 | parser.add_argument("--epochs", type=int, default=100, help="number of epochs") 24 | parser.add_argument("--batch_size", type=int, default=16, help="batch size") 25 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 26 | parser.add_argument("--video_feature_dim", type=int, default=1024, help="video feature input dimension") 27 | parser.add_argument("--char_dim", type=int, default=50, help="character dimension") 28 | parser.add_argument("--hidden_size", type=int, default=256, help="hidden size") 29 | parser.add_argument("--max_position_length", type=int, default=512, help="max position length") 30 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 31 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 32 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 33 | parser.add_argument("--drop_rate", type=float, default=0.1, help="dropout rate") 34 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 35 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 36 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 37 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 38 | parser.add_argument("--eval_period", type=int, default=None, help="evaluation period") 39 | configs = parser.parse_args() 40 | 41 | # os environment 42 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 43 | os.environ["CUDA_VISIBLE_DEVICES"] = configs.gpu_idx 44 | 45 | np.random.seed(configs.seed) 46 | tf.set_random_seed(configs.seed) 47 | tf.random.set_random_seed(configs.seed) 48 | 49 | # specify the dataset directory 50 | if configs.home_dir is None: 51 | configs.home_dir = "ckpt/tacos_{}_{}".format(configs.feature, configs.max_position_length) 52 | configs.save_dir = "datasets/tacos_{}/{}".format(configs.feature, configs.max_position_length) 53 | configs.video_feature_dim = 1024 if configs.feature == "new" else 4096 54 | 55 | if configs.mode.lower() == "prepro": 56 | prepro_tacos(configs) 57 | 58 | elif configs.mode.lower() == "train": 59 | video_feature_path = os.path.join(configs.root, "tacos_features_{}".format(configs.feature)) 60 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 61 | 62 | train_set = 
load_json(os.path.join(configs.save_dir, "train_set.json")) 63 | test_set = load_json(os.path.join(configs.save_dir, "test_set.json")) 64 | num_train_batches = math.ceil(len(train_set) / configs.batch_size) 65 | 66 | if configs.eval_period is None: 67 | configs.eval_period = num_train_batches 68 | if configs.num_train_steps is None: 69 | configs.num_train_steps = num_train_batches * configs.epochs 70 | if configs.char_size is None: 71 | configs.char_size = len(load_json(os.path.join(configs.save_dir, "char_dict.json"))) 72 | 73 | log_dir = os.path.join(configs.home_dir, "event") 74 | model_dir = os.path.join(configs.home_dir, "model") 75 | if not os.path.exists(model_dir): 76 | os.makedirs(model_dir) 77 | if not os.path.exists(log_dir): 78 | os.makedirs(log_dir) 79 | 80 | # write configs to json file 81 | write_json(vars(configs), save_path=os.path.join(model_dir, "configs.json"), pretty=True) 82 | 83 | with tf.Graph().as_default() as graph: 84 | model = LPNet(configs, graph=graph) 85 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 86 | sess_config.gpu_options.allow_growth = True 87 | 88 | with tf.Session(config=sess_config) as sess: 89 | optimizer = tf.train.AdamOptimizer(configs.init_lr, 90 | beta1=0.9, 91 | beta2=0.999, 92 | name='AdamOptimizer') 93 | # train_op = optimizer.minimize(model.my_loss, global_step=model.global_step) 94 | trainable_vars = tf.trainable_variables() 95 | freeze_bbox_var_list = [t for t in trainable_vars if not t.name.startswith(u'proposal_box')] 96 | bbox_var_list = [t for t in trainable_vars if t.name.startswith(u'proposal_box')] 97 | train_op1 = optimizer.minimize(model.reg_loss, global_step=model.global_step, var_list=freeze_bbox_var_list) 98 | train_op2 = optimizer.minimize(model.my_loss, var_list=bbox_var_list) 99 | saver = tf.train.Saver(max_to_keep=5) 100 | writer = tf.summary.FileWriter(log_dir) 101 | sess.run(tf.global_variables_initializer()) 102 | 103 | best_r1i7 = -1.0 104 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 105 | 106 | for epoch in range(configs.epochs): 107 | for data in tqdm(batch_iter(train_set, video_features, configs.batch_size, configs.extend, True, True), 108 | total=num_train_batches, desc="Epoch %d / %d" % (epoch + 1, configs.epochs)): 109 | 110 | # run the model 111 | feed_dict = get_feed_dict(data, model, configs.drop_rate) 112 | # _, loss, h_loss,lloss, rloss, global_step = sess.run([train_op, model.my_loss, model.highlight_loss, model.loss, model.reg_loss, 113 | # model.global_step], feed_dict=feed_dict) 114 | # if global_step % configs.period == 0: 115 | # write_tf_summary(writer, [("train/my_loss", loss), ("train/highlight_loss", h_loss),("train/reg_loss", rloss), ("train/cls_loss", lloss)], global_step) 116 | 117 | _, _, loss, rloss, iloss, lloss, kloss, hloss, global_step = sess.run( 118 | [ 119 | train_op1, train_op2, 120 | model.my_loss, 121 | model.reg_loss, model.iou_loss, model.l1_loss, model.loss, model.highlight_loss, 122 | model.global_step 123 | ], 124 | feed_dict=feed_dict) 125 | if global_step % configs.period == 0: 126 | # write_tf_summary(writer, [("train/my_loss", loss)], global_step) 127 | write_tf_summary(writer, [("train/my_loss", loss), 128 | ("train/reg_loss", rloss), 129 | ("train/iou_loss", iloss), 130 | ("train/l1_loss", lloss), 131 | ("train/kl_loss", kloss), 132 | ("train/hl_loss", hloss)], 133 | global_step) 134 | 135 | # evaluate 136 | if global_step % num_train_batches == 0: 137 | 138 | r1i3, r1i5, r1i7, mi, 
value_pairs, score_str = eval_test( 139 | sess=sess, model=model, dataset=test_set, video_features=video_features, 140 | configs=configs, epoch=epoch + 1, global_step=global_step, name="test") 141 | 142 | write_tf_summary(writer, value_pairs, global_step) 143 | score_writer.write(score_str) 144 | score_writer.flush() 145 | 146 | # save the model according to the result of Rank@1, IoU=0.7 147 | if r1i7 > best_r1i7: 148 | best_r1i7 = r1i7 149 | filename = os.path.join(model_dir, "model_{}.ckpt".format(global_step)) 150 | saver.save(sess, filename) 151 | 152 | score_writer.close() 153 | 154 | elif configs.mode.lower() == "test": 155 | 156 | # load previous configs 157 | model_dir = os.path.join(configs.home_dir, "model") 158 | pre_configs = load_json(os.path.join(model_dir, "configs.json")) 159 | parser.set_defaults(**pre_configs) 160 | configs = parser.parse_args() 161 | 162 | # load video features 163 | video_feature_path = os.path.join(configs.root, "tacos_features_{}".format(configs.feature)) 164 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 165 | 166 | # load test dataset 167 | test_set = load_json(os.path.join(configs.save_dir, "test_set.json")) 168 | 169 | # restore model and evaluate 170 | with tf.Graph().as_default() as graph: 171 | model = LPNet(configs, graph=graph) 172 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 173 | sess_config.gpu_options.allow_growth = True 174 | 175 | with tf.Session(config=sess_config) as sess: 176 | saver = tf.train.Saver() 177 | sess.run(tf.global_variables_initializer()) 178 | saver.restore(sess, tf.train.latest_checkpoint(model_dir)) 179 | 180 | r1i3, r1i5, r1i7, mi, *_ = eval_test(sess, model, dataset=test_set, video_features=video_features, 181 | configs=configs, name="test") 182 | 183 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 184 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 185 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 186 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi[0]) + "\x1b[0m", flush=True) 187 | 188 | else: 189 | raise ValueError("Unknown mode {}!!!".format(configs.mode)) 190 | -------------------------------------------------------------------------------- /statistic/convert_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | import argparse 5 | import subprocess 6 | import numpy as np 7 | from tqdm import tqdm 8 | from nltk.tokenize import word_tokenize 9 | from moviepy.editor import VideoFileClip 10 | 11 | 12 | def extract_video_to_images(video_dir, video_names, save_dir): 13 | if not os.path.exists(video_dir): 14 | raise ValueError("The video directory '{}' does not exist!!!".format(video_dir)) 15 | 16 | if not os.path.exists(save_dir): 17 | os.makedirs(save_dir) 18 | 19 | for video_name in tqdm(video_names, total=len(video_names), desc="extract video to images"): 20 | video_path = os.path.join(video_dir, video_name) 21 | video_id = video_name[0:-4] 22 | image_dir = os.path.join(save_dir, video_id) 23 | 24 | if os.path.exists(image_dir): 25 | continue 26 | else: 27 | os.makedirs(image_dir) 28 | 29 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps=29.4 {}/{}-%6d.jpg".format( 30 | video_path, image_dir, video_id), shell=True) 31 | 32 | 33 | def 
load_frames_and_times(image_dir, video_dir, video_names): 34 | dirs = glob.glob(os.path.join(image_dir, "*/")) 35 | video_frames = dict() 36 | 37 | for directory in dirs: 38 | vid = os.path.basename(directory[0:-1]) 39 | num_frames = len(glob.glob(os.path.join(directory, "*.jpg"))) 40 | video_frames[vid] = num_frames 41 | 42 | video_times = dict() 43 | fps = None 44 | 45 | for video_name in video_names: 46 | video_id = video_name[0:-4] 47 | clip = VideoFileClip(os.path.join(video_dir, video_name)) 48 | fps = clip.fps # all the videos with the same fps 49 | duration = clip.duration 50 | video_times[video_id] = duration 51 | 52 | return video_frames, video_times, fps 53 | 54 | 55 | def load_video_names(dataset_dir): 56 | video_names = [] 57 | video_files = ["TACoS_train_videos.txt", "TACoS_val_videos.txt", "TACoS_test_videos.txt"] 58 | 59 | for video_file in video_files: 60 | with open(os.path.join(dataset_dir, video_file), mode="r", encoding="utf-8") as f: 61 | for line in f: 62 | line = line.strip() 63 | 64 | if len(line) == 0: 65 | continue 66 | 67 | video_names.append(line) 68 | 69 | return video_names 70 | 71 | 72 | def read_data(filename): 73 | results = [] 74 | with open(filename, mode="r", encoding="utf-8") as f: 75 | for line in f: 76 | line = line.strip() 77 | 78 | if len(line) == 0: 79 | continue 80 | 81 | video, text = line.split(":") 82 | 83 | if text.endswith("#"): 84 | text = text[0:-1] 85 | 86 | sentences = [sentence.strip().lower() for sentence in text.split("#")] 87 | vid, start_frame, end_frame = video.split("_") 88 | vid = vid[0:-4] 89 | start_frame = int(start_frame) 90 | end_frame = int(end_frame) 91 | 92 | result = (vid, start_frame, end_frame, sentences) 93 | results.append(result) 94 | 95 | return results 96 | 97 | 98 | def reconstruct_tacos_dataset(dataset, video_frames, fps): 99 | temp_data = dict() 100 | for data in dataset: 101 | vid, start_frame, end_frame, sentences = data 102 | temp_data[vid] = temp_data.get(vid, []) + [(start_frame, end_frame, sentences)] 103 | 104 | new_dataset = dict() 105 | for vid, records in temp_data.items(): 106 | num_frames = video_frames[vid] 107 | timestamps, sentences = [], [] 108 | 109 | for record in records: 110 | start_frame, end_frame, sents = record 111 | 112 | for sent in sents: 113 | timestamps.append([start_frame, end_frame]) 114 | sentences.append(sent) 115 | 116 | new_dataset[vid] = {"timestamps": timestamps, "sentences": sentences, "fps": fps, "num_frames": num_frames} 117 | return new_dataset 118 | 119 | 120 | def stat_data_info(data, fps): 121 | num_samples, query_lengths, num_words, moment_lengths = 0, [], [], [] 122 | for record in data: 123 | moment_length = float(record[2] - record[1]) / fps 124 | num_samples += len(record[-1]) 125 | 126 | for sentence in record[-1]: 127 | words = word_tokenize(sentence) 128 | query_lengths.append(len(words)) 129 | num_words.extend(words) 130 | 131 | moment_lengths.append(moment_length) 132 | return num_samples, query_lengths, num_words, moment_lengths 133 | 134 | 135 | def main(): 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument("--video_dir", type=str, required=True, help="TACoS video directory") 138 | parser.add_argument("--dataset_dir", type=str, required=True, help="TACoS dataset directory") 139 | parser.add_argument("--save_dir", type=str, required=True, help="directory to save extracted images") 140 | args = parser.parse_args() 141 | 142 | # load video ids 143 | video_names = load_video_names(args.dataset_dir) 144 | 145 | # extract video information 146 | 
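# The extraction below runs in two passes: extract_video_to_images first dumps frames at 29.4 fps
# into save_dir/<video_id>/ via ffmpeg, then load_frames_and_times counts the dumped .jpg files
# per video and reads fps/duration with moviepy.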
extract_video_to_images(args.video_dir, video_names, args.save_dir) 147 | video_frames, video_times, fps = load_frames_and_times(args.save_dir, args.video_dir, video_names) 148 | 149 | # load TACoS datasets 150 | train_data = read_data(os.path.join(args.dataset_dir, "TACoS_train_samples.txt")) 151 | val_data = read_data(os.path.join(args.dataset_dir, "TACoS_val_samples.txt")) 152 | test_data = read_data(os.path.join(args.dataset_dir, "TACoS_test_samples.txt")) 153 | 154 | train_set = reconstruct_tacos_dataset(train_data, video_frames, fps) 155 | val_set = reconstruct_tacos_dataset(val_data, video_frames, fps) 156 | test_set = reconstruct_tacos_dataset(test_data, video_frames, fps) 157 | 158 | with open(os.path.join(args.dataset_dir, "train.json"), mode="w", encoding="utf-8") as f: 159 | json.dump(train_set, f) 160 | 161 | with open(os.path.join(args.dataset_dir, "val.json"), mode="w", encoding="utf-8") as f: 162 | json.dump(val_set, f) 163 | 164 | with open(os.path.join(args.dataset_dir, "test.json"), mode="w", encoding="utf-8") as f: 165 | json.dump(test_set, f) 166 | 167 | # statistics 168 | train_samples, train_query_lengths, train_num_words, train_moment_lengths = stat_data_info(train_data, fps) 169 | val_samples, val_query_lengths, val_num_words, val_moment_lengths = stat_data_info(val_data, fps) 170 | test_samples, test_query_lengths, test_num_words, test_moment_lengths = stat_data_info(test_data, fps) 171 | query_lengths = train_query_lengths + val_query_lengths + test_query_lengths 172 | num_words = train_num_words + val_num_words + test_num_words 173 | moment_lengths = train_moment_lengths + val_moment_lengths + test_moment_lengths 174 | durations = list(video_times.values()) 175 | 176 | # print 177 | print("Training samples:", train_samples) 178 | print("Validation samples:", val_samples) 179 | print("Test samples:", test_samples) 180 | print("Vocabulary size:", len(set(num_words))) 181 | print("Average video length:", np.mean(durations)) 182 | print("Average query length:", np.mean(query_lengths)) 183 | print("Average moment length:", np.mean(moment_lengths)) 184 | print("Std. 
of moment length:", np.std(moment_lengths)) 185 | 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /statistic/stat_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | from nltk.tokenize import word_tokenize 6 | 7 | 8 | def stat_data_info(data): 9 | num_videos, num_anns, video_lengths, query_lengths, moment_lengths, num_words = 0, 0, list(), list(), list(), list() 10 | for key, value in data.items(): 11 | num_videos += 1 12 | num_anns += len(value["timestamps"]) 13 | video_lengths.append(float(value["duration"])) 14 | 15 | for val in value["timestamps"]: 16 | moment_lengths.append(val[1] - val[0]) 17 | 18 | for sentence in value["sentences"]: 19 | words = word_tokenize(sentence.strip().lower()) 20 | num_words.extend(words) 21 | query_lengths.append(len(words)) 22 | 23 | return num_videos, num_anns, video_lengths, query_lengths, moment_lengths, num_words 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--dataset_dir", type=str, required=True, help="ActivityNet Caption dataset directory") 29 | args = parser.parse_args() 30 | 31 | with open(os.path.join(args.dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 32 | train_data = json.load(f) 33 | 34 | with open(os.path.join(args.dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 35 | test_data = json.load(f) 36 | 37 | with open(os.path.join(args.dataset_dir, "val_2.json"), mode="r", encoding="utf-8") as f: 38 | test2_data = json.load(f) 39 | 40 | (train_num_videos, train_num_anns, train_video_lengths, train_query_lengths, train_moment_lengths, 41 | train_num_words) = stat_data_info(train_data) 42 | 43 | (test_num_videos, test_num_anns, test_video_lengths, test_query_lengths, test_moment_lengths, 44 | test_num_words) = stat_data_info(test_data) 45 | 46 | (test2_num_videos, test2_num_anns, test2_video_lengths, test2_query_lengths, test2_moment_lengths, 47 | test2_num_words) = stat_data_info(test2_data) 48 | 49 | video_lengths = train_video_lengths + test_video_lengths + test2_video_lengths 50 | query_lengths = train_query_lengths + test_query_lengths + test2_query_lengths 51 | moment_lengths = train_moment_lengths + test_moment_lengths + test2_moment_lengths 52 | num_words = train_num_words + test_num_words + test2_num_words 53 | 54 | print("Training videos:", train_num_videos) 55 | print("Test videos:", test_num_videos) 56 | print("Training samples:", train_num_anns) 57 | print("Test samples:", test_num_anns) 58 | print("Vocabulary size:", len(set(num_words))) 59 | print("Average video length:", np.mean(video_lengths)) 60 | print("Average query length:", np.mean(query_lengths)) 61 | print("Average moment length:", np.mean(moment_lengths)) 62 | print("Std. 
of moment length:", np.std(moment_lengths)) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /statistic/stat_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | from nltk.tokenize import word_tokenize 6 | 7 | 8 | def load_charades_sta_data(charades_sta_file, charades): 9 | with open(charades_sta_file, mode="r", encoding="utf-8") as f_sta: 10 | vids, data = [], [] 11 | for line in f_sta: 12 | line = line.lstrip().rstrip() 13 | 14 | if len(line) == 0: 15 | continue 16 | 17 | video_info, sentence = line.split("##") 18 | vid, start_time, end_time = video_info.split(" ") 19 | words = word_tokenize(sentence.lower(), language="english") 20 | start_time, end_time = float(start_time), float(end_time) 21 | duration = float(charades[vid]["duration"]) 22 | 23 | vids.append(vid) 24 | data.append((vid, start_time, end_time, duration, words)) 25 | 26 | return vids, data 27 | 28 | 29 | def stat_data_info(data): 30 | query_lengths, moment_lengths, num_words = [], [], [] 31 | 32 | for record in data: 33 | moment_length = record[2] - record[1] 34 | moment_lengths.append(moment_length) 35 | query_lengths.append(len(record[-1])) 36 | num_words.extend(record[-1]) 37 | 38 | return query_lengths, moment_lengths, num_words 39 | 40 | 41 | def main(): 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--dataset_dir", type=str, required=True, help="Charades-STA dataset directory") 44 | args = parser.parse_args() 45 | 46 | with open(os.path.join(args.dataset_dir, "charades.json"), mode="r", encoding="utf-8") as f: 47 | charades = json.load(f) 48 | 49 | train_vids, train_data = load_charades_sta_data(os.path.join(args.dataset_dir, "charades_sta_train.txt"), charades) 50 | test_vids, test_data = load_charades_sta_data(os.path.join(args.dataset_dir, "charades_sta_test.txt"), charades) 51 | 52 | num_train_videos = len(set(train_vids)) 53 | num_test_videos = len(set(test_vids)) 54 | num_train_anns = len(train_data) 55 | num_test_anns = len(test_data) 56 | 57 | vids = list(set(train_vids + test_vids)) 58 | video_lengths = [] 59 | for vid in vids: 60 | duration = charades[vid]["duration"] 61 | video_lengths.append(float(duration)) 62 | 63 | train_query_lengths, train_moment_lengths, train_num_words = stat_data_info(train_data) 64 | test_query_lengths, test_moment_lengths, test_num_words = stat_data_info(test_data) 65 | query_lengths = train_query_lengths + test_query_lengths 66 | moment_lengths = train_moment_lengths + test_moment_lengths 67 | num_words = train_num_words + test_num_words 68 | 69 | print("Training videos:", num_train_videos) 70 | print("Test videos:", num_test_videos) 71 | print("Training samples:", num_train_anns) 72 | print("Test samples:", num_test_anns) 73 | print("Vocabulary size:", len(set(num_words))) 74 | print("Average video length:", np.mean(video_lengths)) 75 | print("min video length:", np.min(video_lengths)) 76 | print("max video length:", np.max(video_lengths)) 77 | print("min video length:", np.min(video_lengths)) 78 | print("median video length:", np.median(video_lengths)) 79 | print("Average query length:", np.mean(query_lengths)) 80 | print("Average moment length:", np.mean(moment_lengths)) 81 | print("Std. 
of moment length:", np.std(moment_lengths)) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /statistic/stat_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | import argparse 5 | import subprocess 6 | import numpy as np 7 | from tqdm import tqdm 8 | from nltk.tokenize import word_tokenize 9 | from moviepy.editor import VideoFileClip 10 | 11 | 12 | def extract_video_to_images(video_dir, video_names, save_dir): 13 | if not os.path.exists(video_dir): 14 | raise ValueError("The video directory '{}' does not exist!!!".format(video_dir)) 15 | 16 | if not os.path.exists(save_dir): 17 | os.makedirs(save_dir) 18 | 19 | for video_name in tqdm(video_names, total=len(video_names), desc="extract video to images"): 20 | video_path = os.path.join(video_dir, video_name) 21 | video_id = video_name[0:-4] 22 | image_dir = os.path.join(save_dir, video_id) 23 | 24 | if os.path.exists(image_dir): 25 | continue 26 | else: 27 | os.makedirs(image_dir) 28 | 29 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps=29.4 {}/{}-%6d.jpg".format( 30 | video_path, image_dir, video_id), shell=True) 31 | 32 | 33 | def load_frames_and_times(image_dir, video_dir, video_names): 34 | dirs = glob.glob(os.path.join(image_dir, "*/")) 35 | video_frames = dict() 36 | 37 | for directory in dirs: 38 | vid = os.path.basename(directory[0:-1]) 39 | num_frames = len(glob.glob(os.path.join(directory, "*.jpg"))) 40 | video_frames[vid] = num_frames 41 | 42 | video_times = dict() 43 | fps = None 44 | 45 | for video_name in video_names: 46 | video_id = video_name[0:-4] 47 | clip = VideoFileClip(os.path.join(video_dir, video_name)) 48 | fps = clip.fps # all the videos with the same fps 49 | duration = clip.duration 50 | video_times[video_id] = duration 51 | 52 | return video_frames, video_times, fps 53 | 54 | 55 | def load_video_names(dataset_dir): 56 | video_names = [] 57 | video_files = ["TACoS_train_videos.txt", "TACoS_val_videos.txt", "TACoS_test_videos.txt"] 58 | 59 | for video_file in video_files: 60 | with open(os.path.join(dataset_dir, video_file), mode="r", encoding="utf-8") as f: 61 | for line in f: 62 | line = line.strip() 63 | 64 | if len(line) == 0: 65 | continue 66 | 67 | video_names.append(line) 68 | 69 | return video_names 70 | 71 | 72 | def read_data(filename): 73 | results = [] 74 | with open(filename, mode="r", encoding="utf-8") as f: 75 | for line in f: 76 | line = line.strip() 77 | 78 | if len(line) == 0: 79 | continue 80 | 81 | video, text = line.split(":") 82 | 83 | if text.endswith("#"): 84 | text = text[0:-1] 85 | 86 | sentences = [sentence.strip().lower() for sentence in text.split("#")] 87 | vid, start_frame, end_frame = video.split("_") 88 | vid = vid[0:-4] 89 | start_frame = int(start_frame) 90 | end_frame = int(end_frame) 91 | 92 | result = (vid, start_frame, end_frame, sentences) 93 | results.append(result) 94 | 95 | return results 96 | 97 | 98 | def reconstruct_tacos_dataset(dataset, video_frames, fps): 99 | temp_data = dict() 100 | for data in dataset: 101 | vid, start_frame, end_frame, sentences = data 102 | temp_data[vid] = temp_data.get(vid, []) + [(start_frame, end_frame, sentences)] 103 | 104 | new_dataset = dict() 105 | for vid, records in temp_data.items(): 106 | num_frames = video_frames[vid] 107 | timestamps, sentences = [], [] 108 | 109 | for record in records: 110 | start_frame, end_frame, sents = 
record 111 | 112 | for sent in sents: 113 | timestamps.append([start_frame, end_frame]) 114 | sentences.append(sent) 115 | 116 | new_dataset[vid] = {"timestamps": timestamps, "sentences": sentences, "fps": fps, "num_frames": num_frames} 117 | return new_dataset 118 | 119 | 120 | def stat_data_info(data, fps): 121 | num_samples, query_lengths, num_words, moment_lengths = 0, [], [], [] 122 | for record in data: 123 | moment_length = float(record[2] - record[1]) / fps 124 | num_samples += len(record[-1]) 125 | 126 | for sentence in record[-1]: 127 | words = word_tokenize(sentence) 128 | query_lengths.append(len(words)) 129 | num_words.extend(words) 130 | 131 | moment_lengths.append(moment_length) 132 | return num_samples, query_lengths, num_words, moment_lengths 133 | 134 | 135 | def main(): 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument("--video_dir", type=str, required=True, help="TACoS video directory") 138 | parser.add_argument("--dataset_dir", type=str, required=True, help="TACoS dataset directory") 139 | parser.add_argument("--save_dir", type=str, required=True, help="directory to save extracted images") 140 | args = parser.parse_args() 141 | 142 | # load video ids 143 | video_names = load_video_names(args.dataset_dir) 144 | 145 | # extract video information 146 | extract_video_to_images(args.video_dir, video_names, args.save_dir) 147 | video_frames, video_times, fps = load_frames_and_times(args.save_dir, args.video_dir, video_names) 148 | 149 | # load TACoS datasets 150 | train_data = read_data(os.path.join(args.dataset_dir, "TACoS_train_samples.txt")) 151 | val_data = read_data(os.path.join(args.dataset_dir, "TACoS_val_samples.txt")) 152 | test_data = read_data(os.path.join(args.dataset_dir, "TACoS_test_samples.txt")) 153 | 154 | train_set = reconstruct_tacos_dataset(train_data, video_frames, fps) 155 | val_set = reconstruct_tacos_dataset(val_data, video_frames, fps) 156 | test_set = reconstruct_tacos_dataset(test_data, video_frames, fps) 157 | 158 | with open(os.path.join(args.dataset_dir, "train.json"), mode="w", encoding="utf-8") as f: 159 | json.dump(train_set, f) 160 | 161 | with open(os.path.join(args.dataset_dir, "val.json"), mode="w", encoding="utf-8") as f: 162 | json.dump(val_set, f) 163 | 164 | with open(os.path.join(args.dataset_dir, "test.json"), mode="w", encoding="utf-8") as f: 165 | json.dump(test_set, f) 166 | 167 | # statistics 168 | train_samples, train_query_lengths, train_num_words, train_moment_lengths = stat_data_info(train_data, fps) 169 | val_samples, val_query_lengths, val_num_words, val_moment_lengths = stat_data_info(val_data, fps) 170 | test_samples, test_query_lengths, test_num_words, test_moment_lengths = stat_data_info(test_data, fps) 171 | query_lengths = train_query_lengths + val_query_lengths + test_query_lengths 172 | num_words = train_num_words + val_num_words + test_num_words 173 | moment_lengths = train_moment_lengths + val_moment_lengths + test_moment_lengths 174 | durations = list(video_times.values()) 175 | 176 | # print 177 | print("Training samples:", train_samples) 178 | print("Validation samples:", val_samples) 179 | print("Test samples:", test_samples) 180 | print("Vocabulary size:", len(set(num_words))) 181 | print("Average video length:", np.mean(durations)) 182 | print("Average query length:", np.mean(query_lengths)) 183 | print("Average moment length:", np.mean(moment_lengths)) 184 | print("Std. 
of moment length:", np.std(moment_lengths)) 185 | 186 | print("Max moment length:", np.max(moment_lengths)) 187 | print("Min moment length:", np.mean(moment_lengths)) 188 | 189 | if __name__ == "__main__": 190 | main() 191 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import random 5 | import codecs 6 | import numpy as np 7 | from tqdm import tqdm 8 | import tensorflow as tf 9 | 10 | glove_sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)} 11 | PAD, UNK = "", "" 12 | 13 | 14 | def load_glove(glove_path, dim): 15 | vocab = list() 16 | with codecs.open(glove_path, mode="r", encoding="utf-8") as f: 17 | total = glove_sizes[glove_path.split(".")[-3]] 18 | 19 | for line in tqdm(f, total=total, desc="load glove vocabulary"): 20 | line = line.lstrip().rstrip().split(" ") 21 | 22 | if len(line) == 2 or len(line) != dim + 1: 23 | continue 24 | 25 | word = line[0] 26 | vocab.append(word) 27 | 28 | return set(vocab) 29 | 30 | 31 | def filter_glove_embedding(word_dict, glove_path, dim): 32 | vectors = np.zeros(shape=[len(word_dict), dim], dtype=np.float32) 33 | 34 | with codecs.open(glove_path, mode="r", encoding="utf-8") as f: 35 | total = glove_sizes[glove_path.split(".")[-3]] 36 | 37 | for line in tqdm(f, total=total, desc="load glove embeddings"): 38 | line = line.lstrip().rstrip().split(" ") 39 | 40 | if len(line) == 2 or len(line) != dim + 1: 41 | continue 42 | 43 | word = line[0] 44 | 45 | if word in word_dict: 46 | vector = [float(x) for x in line[1:]] 47 | word_index = word_dict[word] 48 | vectors[word_index] = np.asarray(vector) 49 | 50 | return np.asarray(vectors) 51 | 52 | 53 | def load_video_features(root, max_position_length): 54 | video_features = dict() 55 | filenames = glob.glob(os.path.join(root, "*.npy")) 56 | 57 | for filename in tqdm(filenames, total=len(filenames), desc="load video features"): 58 | video_id = filename.split("/")[-1].split(".")[0] 59 | feature = np.load(filename) 60 | 61 | if max_position_length is None: 62 | video_features[video_id] = feature 63 | 64 | else: 65 | new_feature = visual_feature_sampling(feature, max_num_clips=max_position_length) 66 | video_features[video_id] = new_feature 67 | 68 | return video_features 69 | 70 | 71 | def visual_feature_sampling(visual_feature, max_num_clips): 72 | num_clips = visual_feature.shape[0] 73 | 74 | if num_clips <= max_num_clips: 75 | return visual_feature 76 | 77 | idxs = np.arange(0, max_num_clips + 1, 1.0) / max_num_clips * num_clips 78 | idxs = np.round(idxs).astype(np.int32) 79 | idxs[idxs > num_clips - 1] = num_clips - 1 80 | 81 | new_visual_feature = [] 82 | for i in range(max_num_clips): 83 | s_idx, e_idx = idxs[i], idxs[i + 1] 84 | 85 | if s_idx < e_idx: 86 | new_visual_feature.append(np.mean(visual_feature[s_idx:e_idx], axis=0)) 87 | 88 | else: 89 | new_visual_feature.append(visual_feature[s_idx]) 90 | 91 | new_visual_feature = np.asarray(new_visual_feature) 92 | 93 | return new_visual_feature 94 | 95 | 96 | def iou(pred, gt): # require pred and gt is numpy 97 | assert isinstance(pred, list) and isinstance(gt, list) 98 | 99 | pred_is_list = isinstance(pred[0], list) 100 | gt_is_list = isinstance(gt[0], list) 101 | 102 | if not pred_is_list: 103 | pred = [pred] 104 | 105 | if not gt_is_list: 106 | gt = [gt] 107 | 108 | pred, gt = np.array(pred), np.array(gt) 109 | 110 | inter_left = 
np.maximum(pred[:, 0, None], gt[None, :, 0]) 111 | inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1]) 112 | inter = np.maximum(0.0, inter_right - inter_left) 113 | 114 | union_left = np.minimum(pred[:, 0, None], gt[None, :, 0]) 115 | union_right = np.maximum(pred[:, 1, None], gt[None, :, 1]) 116 | union = np.maximum(1e-12, union_right - union_left) 117 | 118 | overlap = 1.0 * inter / union 119 | 120 | if not gt_is_list: 121 | overlap = overlap[:, 0] 122 | 123 | if not pred_is_list: 124 | overlap = overlap[0] 125 | 126 | return overlap 127 | 128 | 129 | def time_to_index(start_time, end_time, feature_shape, duration): 130 | s_times = np.arange(0, feature_shape).astype(np.float32) * duration / float(feature_shape) 131 | e_times = np.arange(1, feature_shape + 1).astype(np.float32) * duration / float(feature_shape) 132 | 133 | candidates = np.stack([np.repeat(s_times[:, None], repeats=feature_shape, axis=1), 134 | np.repeat(e_times[None, :], repeats=feature_shape, axis=0)], axis=2).reshape((-1, 2)) 135 | 136 | overlaps = iou(candidates.tolist(), [start_time, end_time]).reshape(feature_shape, feature_shape) 137 | start_index = np.argmax(overlaps) // feature_shape 138 | end_index = np.argmax(overlaps) % feature_shape 139 | 140 | return start_index, end_index 141 | 142 | 143 | def load_video_ids(root): 144 | video_ids = [] 145 | filenames = glob.glob(os.path.join(root, "*.npy")) 146 | 147 | for filename in filenames: 148 | basename = os.path.basename(filename) 149 | vid = basename[0:-4] 150 | video_ids.append(vid) 151 | 152 | return video_ids 153 | 154 | 155 | def write_json(dataset, save_path, pretty=False): 156 | with codecs.open(filename=save_path, mode="w", encoding="utf-8") as f: 157 | if pretty: 158 | json.dump(dataset, f, indent=4, sort_keys=True) 159 | else: 160 | json.dump(dataset, f) 161 | 162 | 163 | def load_json(filename): 164 | with codecs.open(filename=filename, mode="r", encoding="utf-8") as f: 165 | data = json.load(f) 166 | return data 167 | 168 | 169 | def word_convert(word, word_lower=True, char_lower=True): 170 | if char_lower: 171 | chars = [c for c in word.lower()] 172 | else: 173 | chars = [c for c in word] 174 | 175 | if word_lower: 176 | word = word.lower() 177 | 178 | return word, chars 179 | 180 | 181 | def create_vocabularies(configs, word_counter, char_counter): 182 | # generate word dict and vectors 183 | emb_vocab = load_glove(configs.wordvec_path, configs.word_dim) 184 | 185 | word_vocab = list() 186 | for word, _ in word_counter.most_common(): 187 | if word in emb_vocab: 188 | word_vocab.append(word) 189 | 190 | tmp_word_dict = dict([(word, index) for index, word in enumerate(word_vocab)]) 191 | vectors = filter_glove_embedding(tmp_word_dict, configs.wordvec_path, configs.word_dim) 192 | 193 | word_vocab = [PAD, UNK] + word_vocab 194 | word_dict = dict([(word, idx) for idx, word in enumerate(word_vocab)]) 195 | 196 | # generate character dict 197 | char_vocab = [PAD, UNK] + [char for char, count in char_counter.most_common() if count >= 5] 198 | char_dict = dict([(char, idx) for idx, char in enumerate(char_vocab)]) 199 | 200 | return word_dict, char_dict, vectors 201 | 202 | 203 | def boolean_string(bool_str): 204 | bool_str = bool_str.lower() 205 | 206 | if bool_str not in {"false", "true"}: 207 | raise ValueError("Not a valid boolean string!!!") 208 | 209 | return bool_str == "true" 210 | 211 | 212 | def pad_sequences(sequences, pad_tok=None, max_length=None): 213 | if pad_tok is None: 214 | pad_tok = 0 # 0: "PAD" for words and chars, "PAD" for tags 215 
| 216 | if max_length is None: 217 | max_length = max([len(seq) for seq in sequences]) 218 | 219 | sequence_padded, sequence_length = [], [] 220 | 221 | for seq in sequences: 222 | seq_ = seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0) 223 | sequence_padded.append(seq_) 224 | sequence_length.append(min(len(seq), max_length)) 225 | 226 | return sequence_padded, sequence_length 227 | 228 | 229 | def pad_char_sequences(sequences, max_length=None, max_length_2=None): 230 | sequence_padded, sequence_length = [], [] 231 | 232 | if max_length is None: 233 | max_length = max(map(lambda x: len(x), sequences)) 234 | 235 | if max_length_2 is None: 236 | max_length_2 = max([max(map(lambda x: len(x), seq)) for seq in sequences]) 237 | 238 | for seq in sequences: 239 | sp, sl = pad_sequences(seq, max_length=max_length_2) 240 | sequence_padded.append(sp) 241 | sequence_length.append(sl) 242 | 243 | sequence_padded, _ = pad_sequences(sequence_padded, pad_tok=[0] * max_length_2, max_length=max_length) 244 | sequence_length, _ = pad_sequences(sequence_length, max_length=max_length) 245 | 246 | return sequence_padded, sequence_length 247 | 248 | 249 | def pad_video_sequence(sequences, max_length=None): 250 | if max_length is None: 251 | max_length = max([vfeat.shape[0] for vfeat in sequences]) 252 | 253 | feature_length = sequences[0].shape[1] 254 | sequence_padded, sequence_length = [], [] 255 | 256 | for seq in sequences: 257 | add_length = max_length - seq.shape[0] 258 | sequence_length.append(seq.shape[0]) 259 | 260 | if add_length > 0: 261 | add_feature = np.zeros(shape=[add_length, feature_length], dtype=np.float32) 262 | seq_ = np.concatenate([seq, add_feature], axis=0) 263 | 264 | else: 265 | seq_ = seq 266 | 267 | sequence_padded.append(seq_) 268 | 269 | return sequence_padded, sequence_length 270 | 271 | def pad_mask_sequence(seq, max_length=None): 272 | 273 | feature_length = len(seq) 274 | 275 | add_length = max_length - feature_length 276 | # sequence_length.append(seq.shape[0]) 277 | 278 | if add_length > 0: 279 | add_feature = np.zeros(shape=[add_length], dtype=np.int32) 280 | seq_ = np.concatenate([seq, add_feature], axis=0) 281 | 282 | else: 283 | seq_ = seq 284 | 285 | return seq_, feature_length 286 | def sliding_window(length): 287 | dx_ = [] 288 | dy_ = [] 289 | x5 = 0 290 | x0 = 0 291 | x1 = 0 292 | x2 = 0 293 | x3 = 0 294 | x4 = 0 295 | # print(5 > length) 296 | # for i in range(int((length - 3) / 1)): 297 | # y5 = x5 + 3 298 | # dx_.append(x5) 299 | # dy_.append(y5) 300 | # x5 = x5 + 1 301 | # # for i in range(int((length - 32) / 12)): 302 | # y0 = x0 + 47 303 | # dx_.append(x0) 304 | # dy_.append(y0) 305 | # x0 = x0 + 12 306 | # for i in range(int((length - 64) / 24)): 307 | # y1 = x1 + 95 308 | # dx_.append(x1) 309 | # dy_.append(y1) 310 | # x1 = x1 + 24 311 | 312 | 313 | for i in range(int((length - 6) / 2)): 314 | y0 = x0 + 7 315 | dx_.append(x0) 316 | dy_.append(y0) 317 | x0 = x0 + 2 318 | for i in range(int((length - 12) / 4)): 319 | y1 = x1 + 15 320 | dx_.append(x1) 321 | dy_.append(y1) 322 | x1 = x1 + 4 323 | for i in range(int((length - 24) / 8)): 324 | y2 = x2 + 31 325 | dx_.append(x2) 326 | dy_.append(y2) 327 | x2 = x2 + 8 328 | for i in range(int((length - 48) / 16)): 329 | y3 = x3 + 63 330 | dx_.append(x3) 331 | dy_.append(y3) 332 | x3 = x3 + 16 333 | for i in range(int((length - 96) / 32)): 334 | y4 = x4 + 127 335 | dx_.append(x4) 336 | dy_.append(y4) 337 | x4 = x4 + 32 338 | # dx_ = np.reshape(dx_ * batch_size, [batch_size, -1]) 339 | # dy_ = np.reshape(dy_ 
* batch_size, [batch_size, -1]) 340 | # dx = tf.cast(tf.convert_to_tensor(dx_), tf.int32) 341 | # dy = tf.cast(tf.convert_to_tensor(dy_), tf.int32) 342 | # mask_dx = tf.sequence_mask(lengths=dx, maxlen=length, dtype=tf.float32) 343 | # mask_dy = tf.sequence_mask(lengths=dy + 1, maxlen=length, dtype=tf.float32) 344 | # mask = mask_dy - mask_dx 345 | # dx = np.concatenate(dx_, np.zeros(batch_max_length-len(dx)), axis=0) 346 | # dy = np.concatenate(dy_, np.zeros(batch_max_length-len(dy)), axis=0) 347 | # print(len(dy_)) 348 | if len(dx_)==0: 349 | dx_.append(0) 350 | dy_.append(length-1) 351 | 352 | return dx_, dy_ 353 | 354 | def proposal_mask(dx, dy, length): 355 | 356 | mask_dx = np.concatenate((np.ones(dx), np.zeros(length-dx)), axis=0) 357 | mask_dy = np.concatenate((np.ones(dy+1), np.zeros(length-dy-1)), axis=0) 358 | mask = mask_dy - mask_dx 359 | return mask 360 | 361 | 362 | 363 | def batch_iter(dataset, all_video_features, batch_size, extend=0.2, train=True, shuffle=False): 364 | if shuffle: 365 | random.shuffle(dataset) 366 | 367 | for index in range(0, len(dataset), batch_size): 368 | batch_data = dataset[index:(index + batch_size)] 369 | video_ids, word_ids, char_ids, start_indexes, end_indexes = [], [], [], [], [] 370 | 371 | for data in batch_data: 372 | video_ids.append(data["video_id"].split('.')[0]) 373 | word_ids.append(data["word_ids"]) 374 | char_ids.append(data["char_ids"]) 375 | start_indexes.append(data["start_index"]) 376 | end_indexes.append(data["end_index"]) 377 | 378 | true_batch_size = len(batch_data) 379 | 380 | # add by xsn 381 | if true_batch_size < batch_size: 382 | break 383 | 384 | # process word ids 385 | word_ids, _ = pad_sequences(word_ids) 386 | word_ids = np.asarray(word_ids, dtype=np.int32) 387 | 388 | # process char ids 389 | char_ids, _ = pad_char_sequences(char_ids) 390 | char_ids = np.asarray(char_ids, dtype=np.int32) 391 | 392 | # process video features 393 | video_features = [all_video_features[video_id] for video_id in video_ids] 394 | max_length = max([vfeat.shape[0] for vfeat in video_features]) 395 | vfeat_lens = [vfeat.shape[0] for vfeat in video_features] 396 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) 397 | # for bbox proposals 398 | # batch_mask = [] 399 | # dx = [] 400 | # dy = [] 401 | # for vfeat in video_features: 402 | # length = vfeat.shape[0] 403 | # # print(length) 404 | # dx_, dy_ = sliding_window(length) 405 | # dx_, _ = pad_mask_sequence(dx_, max_length=233) 406 | # dy_, _ = pad_mask_sequence(dy_, max_length=233) 407 | # dx.append(dx_) 408 | # dy.append(dy_) 409 | # dx_new = np.reshape(dx_, [len(dx_),1]) 410 | # dy_new = np.reshape(dy_, [len(dy_),1]) 411 | # dxy = np.concatenate((dx_new, dy_new), -1) 412 | # masks = [np.reshape(proposal_mask(x, y, length),[length,1]) for x,y in dxy] 413 | # masks, video_seq_length = pad_video_sequence(masks, max_length=max_length) 414 | # batch_mask.append(masks) 415 | # dx = np.asarray(dx, dtype=np.int32) 416 | # dy = np.asarray(dy, dtype=np.int32) 417 | # batch_mask = np.asarray(batch_mask, dtype=np.float32) 418 | # print(np.shape(dy)) 419 | video_features, video_seq_length = pad_video_sequence(video_features, max_length=max_length) 420 | video_features = np.asarray(video_features, dtype=np.float32) 421 | video_seq_length = np.asarray(video_seq_length, dtype=np.int32) 422 | 423 | epsilon = 1E-8 424 | 425 | # soft label 426 | y = (1 - (max_length-3) * epsilon - 0.5)/ 2 427 | start_label = np.ones(shape=[true_batch_size, max_length], dtype=np.int32) * epsilon 428 | end_label = 
np.ones(shape=[true_batch_size, max_length], dtype=np.int32) * epsilon 429 | 430 | # generate labels 431 | # start_label = np.zeros(shape=[true_batch_size, max_length], dtype=np.int32) 432 | # end_label = np.zeros(shape=[true_batch_size, max_length], dtype=np.int32) 433 | highlight_labels = np.zeros(shape=[true_batch_size, max_length], dtype=np.int32) 434 | 435 | 436 | for idx in range(true_batch_size): 437 | st, et = start_indexes[idx], end_indexes[idx] 438 | if st > 0: 439 | start_label[idx][st - 1] = y 440 | if st < max_length-1: 441 | start_label[idx][st + 1] = y 442 | start_label[idx][st] = 0.5 443 | 444 | if et > 0: 445 | end_label[idx][et - 1] = y 446 | if et < max_length-1: 447 | end_label[idx][et + 1] = y 448 | end_label[idx][et] = 0.5 449 | 450 | # start_label[idx][st] = 1 451 | # end_label[idx][et] = 1 452 | cur_max_len = vfeat_lens[idx] 453 | extend_len = round(extend * float(et - st + 1)) 454 | if extend_len > 0: 455 | st_ = max(0, st - extend_len) 456 | et_ = min(et + extend_len, cur_max_len - 1) 457 | highlight_labels[idx][st_:(et_ + 1)] = 1 458 | else: 459 | highlight_labels[idx][st:(et + 1)] = 1 460 | 461 | # yield (batch_data, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, 462 | # highlight_labels, dx, dy, batch_mask) 463 | if train is True: 464 | is_training = True 465 | else: 466 | is_training = False 467 | yield (batch_data, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, 468 | highlight_labels, is_training) -------------------------------------------------------------------------------- /utils/prepro_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from nltk.tokenize import word_tokenize 7 | from utils.data_utils import load_video_ids, create_vocabularies, load_json, write_json, UNK, time_to_index 8 | 9 | 10 | def read_activitynet_data(activitynet_dir, feature_name, max_sentence_length): 11 | with open(os.path.join(activitynet_dir, "captions", "train.json"), mode="r", encoding="utf-8") as f: 12 | train_data = json.load(f) 13 | 14 | with open(os.path.join(activitynet_dir, "captions", "val_1.json"), mode="r", encoding="utf-8") as f: 15 | test_data = json.load(f) # used as test set follow Yuan et al. 
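# ActivityNet Captions provides two validation annotation files; val_1.json is used as the test split above, and val_2.json below is kept as a second evaluation split ("test2").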
16 | 17 | with open(os.path.join(activitynet_dir, "captions", "val_2.json"), mode="r", encoding="utf-8") as f: 18 | test2_data = json.load(f) 19 | 20 | video_ids = load_video_ids(os.path.join(activitynet_dir, "activitynet_features_{}".format(feature_name))) 21 | 22 | def load_information(data, valid_vids): 23 | vids, results = [], [] 24 | 25 | for vid, records in data.items(): 26 | if vid not in valid_vids: 27 | continue # handle missing video records 28 | 29 | vids.append(vid) 30 | 31 | for timestamp, sentence in zip(records["timestamps"], records["sentences"]): 32 | duration = float(records["duration"]) 33 | start_time = max(0.0, float(timestamp[0])) 34 | end_time = min(float(timestamp[1]), duration) 35 | words = word_tokenize(sentence.strip().lower(), language="english") 36 | 37 | if max_sentence_length is not None: 38 | words = words[0:max_sentence_length] 39 | 40 | results.append((vid, start_time, end_time, duration, words)) 41 | 42 | return vids, results 43 | 44 | train_vids, train_data = load_information(train_data, video_ids) 45 | test_vids, test_data = load_information(test_data, video_ids) 46 | test2_vids, test2_data = load_information(test2_data, video_ids) 47 | filtered_video_ids = list(set(train_vids + test_vids + test2_vids)) 48 | return train_data, test_data, test2_data, filtered_video_ids 49 | 50 | 51 | def generate_dataset(data, feature_shapes, word_dict, char_dict, scope): 52 | dataset = list() 53 | 54 | for record in tqdm(data, total=len(data), desc="process {} data".format(scope)): 55 | video_id, start_time, end_time, duration, words = record 56 | feature_shape = feature_shapes[video_id] 57 | 58 | # compute best start and end indices 59 | start_index, end_index = time_to_index(start_time, end_time, feature_shape, duration) 60 | 61 | # convert words and characters 62 | word_indices, char_indices = list(), list() 63 | for word in words: 64 | word_index = word_dict[word] if word in word_dict else word_dict[UNK] 65 | char_index = [char_dict[char] if char in char_dict else char_dict[UNK] for char in word] 66 | word_indices.append(word_index) 67 | char_indices.append(char_index) 68 | 69 | example = {"video_id": str(video_id), "start_time": float(start_time), "end_time": float(end_time), 70 | "duration": float(duration), "start_index": int(start_index), "end_index": int(end_index), 71 | "feature_shape": int(feature_shape), "word_ids": word_indices, "char_ids": char_indices} 72 | dataset.append(example) 73 | 74 | return dataset 75 | 76 | 77 | def prepro_activitynet(configs): 78 | 79 | if not os.path.exists(configs.save_dir): 80 | os.makedirs(configs.save_dir) 81 | 82 | # train/test data format: (video_id, start_time, end_time, duration, words) 83 | train_data, test_data, test2_data, _ = read_activitynet_data(configs.root, configs.feature, 84 | configs.max_position_length) 85 | 86 | # load features and sample feature shapes if possible 87 | features_path = os.path.join(configs.root, "activitynet_features_{}/feature_shapes.json".format(configs.feature)) 88 | feature_shapes = dict() 89 | for vid, length in load_json(features_path).items(): 90 | if configs.max_position_length is not None and length > configs.max_position_length: 91 | length = configs.max_position_length 92 | feature_shapes[vid] = length 93 | 94 | # generate token dicts and load pre-trained vectors 95 | word_counter, char_counter = Counter(), Counter() 96 | for data in [train_data, test_data, test2_data]: 97 | for record in data: 98 | words = record[-1] 99 | for word in words: 100 | word_counter[word] += 1 101 | for 
char in list(word): 102 | char_counter[char] += 1 103 | word_dict, char_dict, word_vectors = create_vocabularies(configs, word_counter, char_counter) 104 | 105 | # generate datasets 106 | train_set = generate_dataset(train_data, feature_shapes, word_dict, char_dict, "train") 107 | test_set = generate_dataset(test_data, feature_shapes, word_dict, char_dict, "test") 108 | test2_set = generate_dataset(test2_data, feature_shapes, word_dict, char_dict, "test2") 109 | 110 | # save to directory 111 | write_json(word_dict, save_path=os.path.join(configs.save_dir, "word_dict.json")) 112 | write_json(char_dict, save_path=os.path.join(configs.save_dir, "char_dict.json")) 113 | np.savez_compressed(os.path.join(configs.save_dir, "word_vectors.npz"), vectors=word_vectors) 114 | write_json(train_set, save_path=os.path.join(configs.save_dir, "train_set.json")) 115 | write_json(test_set, save_path=os.path.join(configs.save_dir, "test_set.json")) 116 | write_json(test2_set, save_path=os.path.join(configs.save_dir, "test2_set.json")) 117 | -------------------------------------------------------------------------------- /utils/prepro_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from nltk.tokenize import word_tokenize 7 | from utils.data_utils import create_vocabularies, load_json, write_json, UNK, time_to_index 8 | 9 | 10 | def read_charades_data(charades_dir, max_sentence_length): 11 | # load charades json file 12 | with open(os.path.join(charades_dir, "charades.json"), mode="r", encoding="utf-8") as f: 13 | charades = json.load(f) 14 | 15 | def load_information(charades_sta_dir): 16 | with open(charades_sta_dir, mode="r", encoding="utf-8") as f_sta: 17 | vids, data = [], [] 18 | 19 | for line in f_sta: 20 | line = line.lstrip().rstrip() 21 | 22 | if len(line) == 0: 23 | continue 24 | 25 | video_info, sentence = line.split("##") 26 | vid, start_time, end_time = video_info.split(" ") 27 | words = word_tokenize(sentence.lower(), language="english") 28 | 29 | if max_sentence_length is not None: 30 | words = words[0:max_sentence_length] 31 | 32 | duration = float(charades[vid]["duration"]) 33 | start_time = max(0.0, float(start_time)) 34 | end_time = min(float(end_time), duration) 35 | 36 | vids.append(vid) 37 | data.append((vid, start_time, end_time, duration, words)) 38 | 39 | return vids, data 40 | 41 | # load train and test dataset 42 | train_vids, train_data = load_information(os.path.join(charades_dir, "charades_sta_train.txt")) 43 | test_vids, test_data = load_information(os.path.join(charades_dir, "charades_sta_test.txt")) 44 | 45 | video_ids = list(set(train_vids + test_vids)) 46 | 47 | return train_data, test_data, video_ids 48 | 49 | 50 | def generate_dataset(data, feature_shapes, word_dict, char_dict, scope): 51 | dataset = list() 52 | for record in tqdm(data, total=len(data), desc="process {} data".format(scope)): 53 | 54 | video_id, start_time, end_time, duration, words = record 55 | if video_id not in list(feature_shapes.keys()): 56 | continue 57 | feature_shape = feature_shapes[video_id] 58 | 59 | # compute best start and end indices 60 | start_index, end_index = time_to_index(start_time, end_time, feature_shape, duration) 61 | 62 | # convert words and characters 63 | word_indices, char_indices = list(), list() 64 | for word in words: 65 | word_index = word_dict[word] if word in word_dict else word_dict[UNK] 66 | char_index = 
[char_dict[char] if char in char_dict else char_dict[UNK] for char in word] 67 | word_indices.append(word_index) 68 | char_indices.append(char_index) 69 | 70 | example = {"video_id": str(video_id), "start_time": float(start_time), "end_time": float(end_time), 71 | "duration": float(duration), "start_index": int(start_index), "end_index": int(end_index), 72 | "feature_shape": int(feature_shape), "word_ids": word_indices, "char_ids": char_indices} 73 | dataset.append(example) 74 | 75 | return dataset 76 | 77 | 78 | def prepro_charades(configs): 79 | 80 | if not os.path.exists(configs.save_dir): 81 | os.makedirs(configs.save_dir) 82 | 83 | # train/test data format: (video_id, start_time, end_time, duration, words) 84 | train_data, test_data, _ = read_charades_data(configs.root, configs.max_position_length) 85 | 86 | # load features and sample feature shapes if possible 87 | features_path = os.path.join(configs.root, "charades_features_{}/feature_shapes.json".format(configs.feature)) 88 | feature_shapes = dict() 89 | for vid, length in load_json(features_path).items(): 90 | if configs.max_position_length is not None and length > configs.max_position_length: 91 | length = configs.max_position_length 92 | feature_shapes[vid] = length 93 | 94 | # generate token dicts and load pre-trained vectors 95 | word_counter, char_counter = Counter(), Counter() 96 | for data in [train_data, test_data]: 97 | for record in data: 98 | words = record[-1] 99 | for word in words: 100 | word_counter[word] += 1 101 | for char in list(word): 102 | char_counter[char] += 1 103 | word_dict, char_dict, word_vectors = create_vocabularies(configs, word_counter, char_counter) 104 | 105 | # generate datasets 106 | train_set = generate_dataset(train_data, feature_shapes, word_dict, char_dict, "train") 107 | test_set = generate_dataset(test_data, feature_shapes, word_dict, char_dict, "test") 108 | 109 | # save to directory 110 | write_json(word_dict, save_path=os.path.join(configs.save_dir, "word_dict.json")) 111 | write_json(char_dict, save_path=os.path.join(configs.save_dir, "char_dict.json")) 112 | np.savez_compressed(os.path.join(configs.save_dir, "word_vectors.npz"), vectors=word_vectors) 113 | write_json(train_set, save_path=os.path.join(configs.save_dir, "train_set.json")) 114 | write_json(test_set, save_path=os.path.join(configs.save_dir, "test_set.json")) 115 | -------------------------------------------------------------------------------- /utils/prepro_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from nltk.tokenize import word_tokenize 7 | from utils.data_utils import create_vocabularies, load_json, write_json, UNK, time_to_index 8 | 9 | 10 | def read_tacos_data(tacos_dir, max_sentence_length): 11 | with open(os.path.join(tacos_dir, "train.json"), mode="r", encoding="utf-8") as f: 12 | train_data = json.load(f) 13 | 14 | with open(os.path.join(tacos_dir, "val.json"), mode="r", encoding="utf-8") as f: 15 | val_data = json.load(f) 16 | 17 | with open(os.path.join(tacos_dir, "test.json"), mode="r", encoding="utf-8") as f: 18 | test_data = json.load(f) 19 | 20 | def load_information(data): 21 | results = [] 22 | for vid, records in data.items(): 23 | if vid.endswith(".avi"): 24 | vid = vid[0:-4] 25 | 26 | duration = float(records["num_frames"]) / float(records["fps"]) 27 | 28 | for timestamp, sentence in zip(records["timestamps"], records["sentences"]): 29 | 
start_time = max(0.0, float(timestamp[0]) / float(records["fps"])) 30 | end_time = min(float(timestamp[1]) / float(records["fps"]), duration) 31 | words = word_tokenize(sentence.strip().lower(), language="english") 32 | 33 | if max_sentence_length is not None: 34 | words = words[0:max_sentence_length] 35 | 36 | results.append((vid, start_time, end_time, duration, words)) 37 | 38 | return results 39 | 40 | train_data = load_information(train_data) 41 | val_data = load_information(val_data) 42 | test_data = load_information(test_data) 43 | return train_data, val_data, test_data 44 | 45 | 46 | def generate_dataset(data, feature_shapes, word_dict, char_dict, scope): 47 | dataset = list() 48 | for record in tqdm(data, total=len(data), desc="process {} data".format(scope)): 49 | video_id, start_time, end_time, duration, words = record 50 | # video_id = video_id + '.avi' 51 | video_id = video_id 52 | feature_shape = feature_shapes[video_id] 53 | 54 | # compute best start and end indices 55 | start_index, end_index = time_to_index(start_time, end_time, feature_shape, duration) 56 | 57 | # convert words and characters 58 | word_indices, char_indices = list(), list() 59 | for word in words: 60 | word_index = word_dict[word] if word in word_dict else word_dict[UNK] 61 | char_index = [char_dict[char] if char in char_dict else char_dict[UNK] for char in word] 62 | word_indices.append(word_index) 63 | char_indices.append(char_index) 64 | 65 | example = {"video_id": str(video_id), "start_time": float(start_time), "end_time": float(end_time), 66 | "duration": float(duration), "start_index": int(start_index), "end_index": int(end_index), 67 | "feature_shape": int(feature_shape), "word_ids": word_indices, "char_ids": char_indices} 68 | dataset.append(example) 69 | 70 | return dataset 71 | 72 | 73 | def prepro_tacos(configs): 74 | 75 | if not os.path.exists(configs.save_dir): 76 | os.makedirs(configs.save_dir) 77 | 78 | # train/test data format: (video_id, start_time, end_time, duration, words) 79 | train_data, val_data, test_data = read_tacos_data(configs.root, configs.max_position_length) 80 | 81 | # load features and sample feature shapes if possible 82 | features_path = os.path.join(configs.root, "tacos_features_{}/feature_shapes.json".format(configs.feature)) 83 | feature_shapes = dict() 84 | for vid, length in load_json(features_path).items(): 85 | if configs.max_position_length is not None and length > configs.max_position_length: 86 | length = configs.max_position_length 87 | feature_shapes[vid] = length 88 | 89 | # generate token dicts and load pre-trained vectors 90 | word_counter, char_counter = Counter(), Counter() 91 | for data in [train_data, val_data, test_data]: 92 | for record in data: 93 | words = record[-1] 94 | for word in words: 95 | word_counter[word] += 1 96 | for char in list(word): 97 | char_counter[char] += 1 98 | word_dict, char_dict, word_vectors = create_vocabularies(configs, word_counter, char_counter) 99 | 100 | # generate datasets 101 | train_set = generate_dataset(train_data, feature_shapes, word_dict, char_dict, "train") 102 | val_set = generate_dataset(val_data, feature_shapes, word_dict, char_dict, "val") 103 | test_set = generate_dataset(test_data, feature_shapes, word_dict, char_dict, "test") 104 | 105 | # save to directory 106 | write_json(word_dict, save_path=os.path.join(configs.save_dir, "word_dict.json")) 107 | write_json(char_dict, save_path=os.path.join(configs.save_dir, "char_dict.json")) 108 | np.savez_compressed(os.path.join(configs.save_dir, 
"word_vectors.npz"), vectors=word_vectors) 109 | write_json(train_set, save_path=os.path.join(configs.save_dir, "train_set.json")) 110 | write_json(val_set, save_path=os.path.join(configs.save_dir, "val_set.json")) 111 | write_json(test_set, save_path=os.path.join(configs.save_dir, "test_set.json")) 112 | -------------------------------------------------------------------------------- /utils/runner_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import tensorflow as tf 4 | from tqdm import tqdm 5 | from utils.data_utils import batch_iter 6 | import pickle 7 | import os 8 | 9 | def write_tf_summary(writer, value_pairs, global_step): 10 | for tag, value in value_pairs: 11 | summ = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 12 | writer.add_summary(summ, global_step=global_step) 13 | writer.flush() 14 | 15 | 16 | def calculate_iou_accuracy(ious, threshold): 17 | total_size = float(len(ious)) 18 | count = 0 19 | 20 | for iou in ious: 21 | if iou >= threshold: 22 | count += 1 23 | 24 | return float(count) / total_size * 100.0 25 | 26 | 27 | def calculate_iou(i0, i1): 28 | union = (min(i0[0], i1[0]), max(i0[1], i1[1])) 29 | inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) 30 | 31 | iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0]) 32 | 33 | return max(0.0, iou) 34 | 35 | 36 | def convert_to_time(start_index, end_index, num_features, duration): 37 | s_times = np.arange(0, num_features).astype(np.float32) * duration / float(num_features) 38 | e_times = np.arange(1, num_features + 1).astype(np.float32) * duration / float(num_features) 39 | if start_index >= num_features: 40 | start_index = num_features - 1 41 | if end_index >= num_features: 42 | end_index = num_features - 1 43 | if start_index < 0: 44 | start_index = 0 45 | if end_index <0: 46 | end_index = 0 47 | start_time = s_times[start_index] 48 | end_time = e_times[end_index] 49 | 50 | return start_time, end_time 51 | 52 | 53 | def get_feed_dict(batch_data, model, drop_rate=None, mode='train'): 54 | if mode == 'train': # training 55 | #(_, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, highlight_labels, dx, dy, batch_mask) = batch_data 56 | 57 | # feed_dict = {model.video_inputs: video_features, model.video_seq_length: video_seq_length, 58 | # model.word_ids: word_ids, model.char_ids: char_ids, model.y1: start_label, model.y2: end_label, 59 | # model.drop_rate: drop_rate, model.highlight_labels: highlight_labels, 60 | # model.dx1 : dx, model.dy1 : dy, model.mask1 : batch_mask} 61 | (_, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, highlight_labels, is_training) = batch_data 62 | 63 | feed_dict = {model.video_inputs: video_features, model.video_seq_length: video_seq_length, 64 | model.word_ids: word_ids, model.char_ids: char_ids, model.y1: start_label, model.y2: end_label, 65 | model.drop_rate: drop_rate, model.highlight_labels: highlight_labels, model.is_training:is_training} 66 | 67 | return feed_dict 68 | 69 | else: # eval 70 | # raw_data, video_features, word_ids, char_ids, video_seq_length, _, _, _, dx, dy, batch_mask = batch_data 71 | 72 | # feed_dict = {model.video_inputs: video_features, model.video_seq_length: video_seq_length, 73 | # model.word_ids: word_ids, model.char_ids: char_ids, 74 | # model.dx1 : dx, model.dy1 : dy, model.mask1 : batch_mask} 75 | 76 | # raw_data, video_features, word_ids, char_ids, video_seq_length, *_ = batch_data 77 | # feed_dict = 
{model.video_inputs: video_features, model.video_seq_length: video_seq_length, 78 | # model.word_ids: word_ids, model.char_ids: char_ids} 79 | 80 | raw_data, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, highlight_labels, is_training = batch_data 81 | feed_dict = {model.video_inputs: video_features, model.video_seq_length: video_seq_length, 82 | model.word_ids: word_ids, model.char_ids: char_ids, model.y1: start_label, model.y2: end_label, model.is_training:is_training} 83 | return raw_data, feed_dict 84 | 85 | 86 | # def eval_test(sess, model, dataset, video_features, configs, epoch=None, global_step=None, name="test"): 87 | # num_test_batches = math.ceil(len(dataset) / configs.batch_size) 88 | # ious = list() 89 | # extent = list() 90 | # prob = list() 91 | 92 | # for data in tqdm(batch_iter(dataset, video_features, configs.batch_size, configs.extend, False), 93 | # total=num_test_batches, desc="evaluate {}".format(name)): 94 | 95 | # raw_data, feed_dict = get_feed_dict(data, model, mode=name) 96 | # # start_indexes, end_indexes = sess.run([model.start_index, model.end_index], feed_dict=feed_dict) 97 | # # iou_loss = sess.run([model.iou_loss], feed_dict=feed_dict) 98 | # start_indexes, end_indexes, start_prob, end_prob, iou_loss = sess.run([model.dx1, model.dy1, model.start_prob, model.end_prob, model.iou_loss], feed_dict=feed_dict) 99 | 100 | # # print(y1) 101 | # prob.append(iou_loss) 102 | # for record, start_index_, end_index_ in zip(raw_data, start_indexes, end_indexes): 103 | # for start_index, end_index in zip(start_index_, end_index_): 104 | # # print(record["feature_shape"]) 62 105 | # start_time, end_time = convert_to_time(start_index, end_index, record["feature_shape"], record["duration"]) 106 | # iou = calculate_iou(i0=[start_time, end_time], i1=[record["start_time"], record["end_time"]]) 107 | # ious.append(iou) 108 | # s = start_time - record["start_time"] 109 | # e = end_time - record["end_time"] 110 | # seg = record["end_time"] - record["start_time"] 111 | # d = record["duration"] 112 | # item = [s,e,seg,d] 113 | # extent.append(item) 114 | 115 | 116 | # # r1i3 = calculate_iou_accuracy(ious, threshold=0.1) 117 | # r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 118 | # r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 119 | # r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 120 | # mi = np.mean(ious) * 100.0 121 | 122 | # value_pairs = 0 123 | 124 | # # write the scores 125 | # score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 126 | # score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 127 | # score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 128 | # score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 129 | # print("在这里", mi, type(mi), np.shape(ious)) 130 | # score_str += "mean IoU: {:.2f}\n".format(mi) 131 | # # return extent, r1i3, r1i5, r1i7, mi, value_pairs, score_str 132 | # return r1i3, r1i5, r1i7, mi, value_pairs, score_str 133 | 134 | 135 | def eval_test(sess, model, dataset, video_features, configs, epoch=None, global_step=None, name="test"): 136 | num_test_batches = math.ceil(len(dataset) / configs.batch_size) 137 | ious = list() 138 | extent = list() 139 | prob = list() 140 | pse = list() 141 | 142 | # query_txts = ["person reading a book.", "person opens the door."] 143 | # fps_list = [30.00, 19.75] 144 | for data in tqdm(batch_iter(dataset, video_features, configs.batch_size, configs.extend, train=False, shuffle=False), 145 | total=num_test_batches, desc="evaluate {}".format(name)): 146 | 147 | 
raw_data, feed_dict = get_feed_dict(data, model, mode=name) 148 | # start_indexes, end_indexes = sess.run([model.start_index, model.end_index], feed_dict=feed_dict) 149 | # start_indexes, end_indexes, dx, dy, length= sess.run([model.px, model.py, model.dx, model.dy, model.video_seq_length], feed_dict=feed_dict) 150 | start_indexes, end_indexes, proposal_box = sess.run([model.px, model.py, model.proposal_box], feed_dict=feed_dict) 151 | 152 | # iou_loss = sess.run([model.iou_loss], feed_dict=feed_dict) 153 | # start_indexes, end_indexes, start_prob, end_prob, iou_loss = sess.run([model.dx, model.dy, model.start_prob, model.end_prob, model.iou_loss], feed_dict=feed_dict) 154 | 155 | # print(proposal_box) 156 | # np.savetxt('tocos_pre6.out', proposal_box) 157 | 158 | # print(np.shape(start_indexes)) 159 | # prob.append(iou_loss) 160 | i = 0 161 | for record, start_index, end_index in zip(raw_data, start_indexes, end_indexes): 162 | # print(record["feature_shape"]) 62 163 | start_time, end_time = convert_to_time(start_index, end_index, record["feature_shape"], record["duration"]) 164 | # print(start_time, end_time) 165 | # prediction_result = {'video_path':"/home/xsn/VSLNet/nlvl/charaders/videos/" + record["video_id"] + ".mp4", 166 | # 'fps':fps_list[i], 167 | # 'query_txt':query_txts[i], 168 | # 'prediction':[start_time[0], end_time[0]], 169 | # 'ground_truth':[record["start_time"], record["end_time"]]} 170 | 171 | # with open("prediction_result_"+str(i)+".pkl",'wb') as f: 172 | # pickle.dump(prediction_result, f) 173 | # i = i + 1 174 | 175 | iou = calculate_iou(i0=[start_time, end_time], i1=[record["start_time"], record["end_time"]]) 176 | ious.append(iou) 177 | 178 | # print(record.keys()) #dict_keys(['video_id', 'start_time', 'end_time', 'duration', 'start_index', 'end_index', 'feature_shape', 'word_ids', 'char_ids']) 179 | s = record["start_time"]/record["duration"] 180 | e = record["end_time"]/record["duration"] 181 | p = (e+s)/2 182 | l = (e-s)/2 183 | item = [p, l] 184 | extent.append(item) 185 | 186 | # print(record.keys()) #dict_keys(['video_id', 'start_time', 'end_time', 'duration', 'start_index', 'end_index', 'feature_shape', 'word_ids', 'char_ids']) 187 | # s = start_time - record["start_time"] 188 | # e = end_time - record["end_time"] 189 | # seg = record["end_time"] - record["start_time"] 190 | # d = record["duration"] 191 | # item = [s,e,seg,d] 192 | # extent.append(item) 193 | if iou > 0.8: 194 | s = record["start_time"] 195 | e = record["end_time"] 196 | ps = float(start_time[0]) 197 | # print(type(end_time)) 198 | if isinstance(end_time, np.ndarray): 199 | pe = float(end_time[0]) 200 | else: 201 | pe = float(end_time) 202 | vid = record["video_id"] 203 | d = record["duration"] 204 | item = [s,e,ps,pe,vid, d] 205 | # print(type(s), type(e), type(ps), type(pe), type(vid)) 206 | if s > 3.0 and e < record['duration']-3.0: 207 | pse.append(item) 208 | 209 | # np.savetxt('gth0.8.out', pse) 210 | # np.savetxt('t_real_proposal.out', extent) 211 | # r1i3 = calculate_iou_accuracy(ious, threshold=0.1) 212 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 213 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 214 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 215 | mi = np.mean(ious) * 100.0 216 | 217 | # value_pairs = [("{}/Rank@1, IoU=0.3".format(name), r1i3), ("{}/Rank@1, IoU=0.5".format(name), r1i5), 218 | # ("{}/Rank@1, IoU=0.7".format(name), r1i7), ("{}/mean IoU".format(name), mi[0])] 219 | value_pairs = [("{}/Rank@1, IoU=0.3".format(name), r1i3), 220 | ("{}/Rank@1, 
IoU=0.5".format(name), r1i5), 221 | ("{}/Rank@1, IoU=0.7".format(name), r1i7), 222 | ("{}/mean IoU".format(name), mi)] 223 | # write the scores 224 | score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 225 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 226 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 227 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 228 | # print("在这里", mi, type(mi), np.shape(ious)) 229 | # score_str += "mean IoU: {:.2f}\n".format(mi[0]) 230 | score_str += "mean IoU: {}\n".format(mi) 231 | # return extent, r1i3, r1i5, r1i7, mi, value_pairs, score_str 232 | # return pse, r1i3, r1i5, r1i7, mi, value_pairs, score_str 233 | return r1i3, r1i5, r1i7, mi, value_pairs, score_str 234 | --------------------------------------------------------------------------------