├── README.md ├── models ├── LPNet.py ├── layers.py └── ops.py ├── prepare ├── README.md ├── convert_charades.py ├── download_activitynet_video.py ├── extract_activitynet.py ├── extract_activitynet_org.py ├── extract_charades.py ├── extract_tacos.py ├── extract_tacos_org.py ├── feature_extractor.py └── videotransforms.py ├── run_activitynet.py ├── run_charades.py ├── run_tacos.py ├── statistic ├── convert_tacos.py ├── stat_activitynet.py ├── stat_charades.py └── stat_tacos.py └── utils ├── data_utils.py ├── prepro_activitynet.py ├── prepro_charades.py ├── prepro_tacos.py └── runner_utils.py /README.md: -------------------------------------------------------------------------------- 1 | # The code of LPNet. 2 | ## under update... 3 | -------------------------------------------------------------------------------- /models/LPNet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from models.ops import create_optimizer, count_params, regularizer 5 | from models.layers import word_embedding_lookup, char_embedding_lookup, conv1d, video_query_attention, highlight_layer 6 | from models.layers import context_query_concat, feature_encoder, conditioned_predictor, localization_loss, iou_regression 7 | from models.layers import generate_proposal_boxes, dynamic_head 8 | from models.layers import bilstm, multi_modal_sa, st_video_encoder, boundary_predictor, boundary_loss 9 | 10 | class LPNet: 11 | def __init__(self, configs, graph): 12 | self.configs = configs 13 | graph = graph if graph is not None else tf.Graph() 14 | with graph.as_default(): 15 | self.global_step = tf.train.create_global_step() 16 | self._add_placeholders() 17 | self._build_model() 18 | if configs.mode == 'train': 19 | print('\x1b[1;33m' + 'Total trainable parameters: {}'.format(count_params()) + '\x1b[0m', flush=True) 20 | else: 21 | print('\x1b[1;33m' + 'Total parameters: {}'.format(count_params()) + '\x1b[0m', flush=True) 22 | 23 | def _add_placeholders(self): 24 | self.video_inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, self.configs.video_feature_dim], 25 | name='video_inputs') 26 | self.video_seq_length = tf.placeholder(dtype=tf.int32, shape=[None], name='video_sequence_length') 27 | self.word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='word_ids') 28 | self.char_ids = tf.placeholder(dtype=tf.int32, shape=[None, None, None], name='char_ids') 29 | self.highlight_labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='highlight_labels') 30 | 31 | self.is_training = tf.placeholder(tf.bool, shape=[]) 32 | # self.dx1 = tf.placeholder(dtype=tf.int32, shape=[None, None], name='dx') 33 | # self.dy1 = tf.placeholder(dtype=tf.int32, shape=[None, None], name='dy') 34 | # self.mask1 = tf.placeholder(dtype=tf.float32, shape=[None, None, None, 1], name='batch_mask') 35 | 36 | self.y1 = tf.placeholder(dtype=tf.float32, shape=[None, None], name='start_indexes') 37 | self.y2 = tf.placeholder(dtype=tf.float32, shape=[None, None], name='end_indexes') 38 | # hyper-parameters 39 | self.drop_rate = tf.placeholder_with_default(input=0.0, shape=[], name='dropout_rate') 40 | # create mask 41 | self.v_mask = tf.sequence_mask(lengths=self.video_seq_length, maxlen=tf.reduce_max(self.video_seq_length), 42 | dtype=tf.int32) 43 | self.q_mask = tf.cast(tf.cast(self.word_ids, dtype=tf.bool), dtype=tf.int32) 44 | 45 | self.q_length = tf.reduce_sum(self.q_mask, -1) 46 | self.v_length = tf.reduce_sum(self.v_mask, -1) 47 | 
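    # Shapes and masks defined above:
    #   - video_inputs: [batch, max_frames, video_feature_dim] visual features;
    #     video_seq_length holds the true (un-padded) frame count of each video,
    #     from which v_mask / v_length are derived via tf.sequence_mask.
    #   - word_ids / char_ids: the tokenized query; word id 0 is padding, so q_mask
    #     is simply the non-zero positions and q_length their count.
    #   - y1 / y2: per-frame start / end supervision consumed by boundary_loss below;
    #     highlight_labels: per-frame labels consumed by the highlighting layer.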
48 | def _build_model(self): 49 | # word embedding & visual features 50 | init_word_vectors = np.load(os.path.join(self.configs.save_dir, 'word_vectors.npz'))['vectors'] 51 | word_emb = word_embedding_lookup(self.word_ids, dim=self.configs.word_dim, drop_rate=self.drop_rate, 52 | vectors=init_word_vectors, finetune=False, reuse=False, name='word_embeddings') 53 | char_emb = char_embedding_lookup(self.char_ids, char_size=self.configs.char_size, dim=self.configs.char_dim, 54 | kernels=[1, 2, 3, 4], filters=[10, 20, 30, 40], drop_rate=self.drop_rate, 55 | activation=tf.nn.relu, reuse=False, name='char_embeddings') 56 | word_emb = tf.concat([word_emb, char_emb], axis=-1) 57 | video_features = tf.nn.dropout(self.video_inputs, rate=self.drop_rate) 58 | 59 | # feature projection (map both word and video feature to the same dimension) 60 | vfeats = conv1d(video_features, dim=self.configs.hidden_size, use_bias=True, reuse=False, name='video_conv1d') 61 | qfeats = conv1d(word_emb, dim=self.configs.hidden_size, use_bias=True, reuse=False, name='query_conv1d') 62 | 63 | vfeats0 = feature_encoder(vfeats, hidden_size=self.configs.hidden_size, num_heads=self.configs.num_heads, 64 | max_position_length=self.configs.max_position_length, drop_rate=self.drop_rate, 65 | mask=self.v_mask, reuse=False, name='feature_encoder') 66 | qfeats0 = feature_encoder(qfeats, hidden_size=self.configs.hidden_size, num_heads=self.configs.num_heads, 67 | max_position_length=self.configs.max_position_length, drop_rate=self.drop_rate, 68 | mask=self.q_mask, reuse=True, name='feature_encoder') 69 | 70 | # # video query attention 71 | outputs, self.vq_score = video_query_attention(vfeats0, qfeats0, self.v_mask, self.q_mask, reuse=False, 72 | drop_rate=self.drop_rate, name='video_query_attention') 73 | 74 | # # weighted pooling and concatenation 75 | outputs0 = context_query_concat(outputs, qfeats0, q_mask=self.q_mask, reuse=False, name='context_query_concat') 76 | 77 | self.highlight_loss, self.highlight_scores = highlight_layer(outputs0, self.highlight_labels, mask=self.v_mask, 78 | reuse=False, name='highlighting_layer') 79 | outputs0 = tf.multiply(outputs0, tf.expand_dims(self.highlight_scores, axis=-1)) 80 | 81 | start_logits, end_logits = conditioned_predictor(outputs0, hidden_size=self.configs.hidden_size, 82 | seq_len=self.video_seq_length, mask=self.v_mask, 83 | reuse=False, name='conditioned_predictor') 84 | # compute localization loss 85 | self.start_prob, self.end_prob, self.start_index, self.end_index, self.loss = boundary_loss( 86 | start_logits, end_logits, self.y1, self.y2, self.v_length,self.configs) 87 | 88 | 89 | self.proposal_box, self.dx, self.dy, self.boxes = generate_proposal_boxes(vfeats0, self.v_length, self.configs) 90 | self.reg_loss, self.l1reg_loss, self.l1_loss, self.iou_loss, self.regular, self.regular2, self.train, abc, self.px, self.py = dynamic_head( 91 | configs=self.configs, 92 | proposal_box=self.proposal_box, 93 | v_mask=self.v_mask, 94 | q_mask=self.q_mask, 95 | vfeats0=outputs, 96 | qfeats0=qfeats0, 97 | drop_rate=self.drop_rate, 98 | dx=self.dx, 99 | dy=self.dy, 100 | boxes=self.boxes, 101 | y1=self.y1, 102 | y2=self.y2, 103 | train=self.is_training) 104 | 105 | self.my_loss = 5 * self.iou_loss + self.regular #+ self.l1_loss #+ 100*self.reg_loss + self.highlight_loss + 0.2*self.loss#+ self.regular + self.regular2 + self.l1_loss 106 | self.reg_loss = 100 * self.reg_loss + self.highlight_loss + self.loss -------------------------------------------------------------------------------- 
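The repository's training scripts (`run_charades.py`, `run_tacos.py`, `run_activitynet.py`) are listed in the tree but not shown here, so the following is only a minimal sketch of how the TF1 graph defined in `LPNet` might be driven. The `configs` fields, the batch dictionary keys, and the loss weighting are assumptions for illustration, not the repository's actual runner code.

```python
import tensorflow as tf
from models.LPNet import LPNet
from models.ops import create_optimizer

# `configs` is assumed to provide the fields referenced in LPNet (mode, save_dir
# containing word_vectors.npz, video_feature_dim, word_dim, char_size, char_dim,
# hidden_size, num_heads, max_position_length, ...) plus the optimizer settings below.
graph = tf.Graph()
model = LPNet(configs, graph)

with graph.as_default():
    # Combine the two loss groups defined at the end of _build_model;
    # the 1:1 weighting here is illustrative only.
    total_loss = model.my_loss + model.reg_loss
    train_op = create_optimizer(total_loss, init_lr=configs.init_lr,
                                num_train_steps=configs.num_train_steps,
                                num_warmup_steps=None)
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)
    for batch in train_batches:  # assumed iterable of pre-processed batches
        _, loss_value = sess.run([train_op, total_loss], feed_dict={
            model.video_inputs: batch['vfeats'],          # [B, T, video_feature_dim]
            model.video_seq_length: batch['vfeat_lens'],  # [B]
            model.word_ids: batch['word_ids'],            # [B, L]
            model.char_ids: batch['char_ids'],            # [B, L, C]
            model.y1: batch['y1'],                        # [B, T] start supervision
            model.y2: batch['y2'],                        # [B, T] end supervision
            model.highlight_labels: batch['h_labels'],    # [B, T]
            model.is_training: True,
            model.drop_rate: configs.drop_rate,
        })
```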
/models/ops.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7) 6 | 7 | 8 | def count_params(scope=None): 9 | if scope is None: 10 | return int(np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])) 11 | 12 | else: 13 | return int(np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables(scope)])) 14 | 15 | 16 | def get_shape_list(tensor): 17 | shape = tensor.shape.as_list() 18 | non_static_indexes = [] 19 | 20 | for (index, dim) in enumerate(shape): 21 | if dim is None: 22 | non_static_indexes.append(index) 23 | 24 | if not non_static_indexes: 25 | return shape 26 | 27 | dyn_shape = tf.shape(tensor) 28 | for index in non_static_indexes: 29 | shape[index] = dyn_shape[index] 30 | 31 | return shape 32 | 33 | 34 | def mask_logits(inputs, mask, mask_value=-1e30): 35 | mask = tf.cast(mask, tf.float32) 36 | return inputs * mask + mask_value * (1.0 - mask) 37 | 38 | 39 | def ndim(x): 40 | return x.get_shape().ndims 41 | 42 | 43 | def dot(x, y): 44 | if ndim(x) is not None and (ndim(x) > 2 or ndim(y) > 2): 45 | x_shape = [] 46 | 47 | for i, s in zip(x.get_shape().as_list(), tf.unstack(tf.shape(x))): 48 | if i is not None: 49 | x_shape.append(i) 50 | else: 51 | x_shape.append(s) 52 | 53 | x_shape = tuple(x_shape) 54 | y_shape = [] 55 | 56 | for i, s in zip(y.get_shape().as_list(), tf.unstack(tf.shape(y))): 57 | if i is not None: 58 | y_shape.append(i) 59 | else: 60 | y_shape.append(s) 61 | 62 | y_shape = tuple(y_shape) 63 | y_permute_dim = list(range(ndim(y))) 64 | y_permute_dim = [y_permute_dim.pop(-2)] + y_permute_dim 65 | xt = tf.reshape(x, [-1, x_shape[-1]]) 66 | yt = tf.reshape(tf.transpose(y, perm=y_permute_dim), [y_shape[-2], -1]) 67 | return tf.reshape(tf.matmul(xt, yt), x_shape[:-1] + y_shape[:-2] + y_shape[-1:]) 68 | 69 | if isinstance(x, tf.SparseTensor): 70 | out = tf.sparse_tensor_dense_matmul(x, y) 71 | 72 | else: 73 | out = tf.matmul(x, y) 74 | 75 | return out 76 | 77 | 78 | def batch_dot(x, y, axes=None): 79 | if isinstance(axes, int): 80 | axes = (axes, axes) 81 | 82 | x_ndim = ndim(x) 83 | y_ndim = ndim(y) 84 | 85 | if x_ndim > y_ndim: 86 | diff = x_ndim - y_ndim 87 | y = tf.reshape(y, tf.concat([tf.shape(y), [1] * diff], axis=0)) 88 | 89 | elif y_ndim > x_ndim: 90 | diff = y_ndim - x_ndim 91 | x = tf.reshape(x, tf.concat([tf.shape(x), [1] * diff], axis=0)) 92 | 93 | else: 94 | diff = 0 95 | 96 | if ndim(x) == 2 and ndim(y) == 2: 97 | if axes[0] == axes[1]: 98 | out = tf.reduce_sum(tf.multiply(x, y), axes[0]) 99 | 100 | else: 101 | out = tf.reduce_sum(tf.multiply(tf.transpose(x, [1, 0]), y), axes[1]) 102 | 103 | else: 104 | if axes is not None: 105 | adj_x = None if axes[0] == ndim(x) - 1 else True 106 | adj_y = True if axes[1] == ndim(y) - 1 else None 107 | 108 | else: 109 | adj_x = None 110 | adj_y = None 111 | 112 | out = tf.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y) 113 | 114 | if diff: 115 | if x_ndim > y_ndim: 116 | idx = x_ndim + y_ndim - 3 117 | 118 | else: 119 | idx = x_ndim - 1 120 | 121 | out = tf.squeeze(out, list(range(idx, idx + diff))) 122 | 123 | if ndim(out) == 1: 124 | out = tf.expand_dims(out, 1) 125 | 126 | return out 127 | 128 | 129 | def trilinear_attention(args, v_maxlen, q_maxlen, drop_rate=0.0, reuse=None, name='efficient_trilinear'): 130 | assert len(args) == 2, 'just use for computing attention with two input' 131 | arg0_shape = 
args[0].get_shape().as_list() 132 | arg1_shape = args[1].get_shape().as_list() 133 | 134 | if len(arg0_shape) != 3 or len(arg1_shape) != 3: 135 | raise ValueError('`args` must be 3 dims (batch_size, len, dimension)') 136 | 137 | if arg0_shape[2] != arg1_shape[2]: 138 | raise ValueError('the last dimension of `args` must equal') 139 | 140 | arg_size = arg0_shape[2] 141 | dtype = args[0].dtype 142 | drop_args = [tf.nn.dropout(arg, rate=drop_rate) for arg in args] 143 | 144 | with tf.variable_scope(name, reuse=reuse): 145 | weights4arg0 = tf.get_variable('linear_kernel4arg0', [arg_size, 1], dtype=dtype, regularizer=regularizer) 146 | weights4arg1 = tf.get_variable('linear_kernel4arg1', [arg_size, 1], dtype=dtype, regularizer=regularizer) 147 | weights4mlu = tf.get_variable('linear_kernel4mul', [1, 1, arg_size], dtype=dtype, regularizer=regularizer) 148 | 149 | subres0 = tf.tile(dot(drop_args[0], weights4arg0), [1, 1, q_maxlen]) 150 | subres1 = tf.tile(tf.transpose(dot(drop_args[1], weights4arg1), perm=(0, 2, 1)), [1, v_maxlen, 1]) 151 | subres2 = batch_dot(drop_args[0] * weights4mlu, tf.transpose(drop_args[1], perm=(0, 2, 1))) 152 | res = subres0 + subres1 + subres2 153 | 154 | return res 155 | 156 | 157 | def create_adam_optimizer(loss,init_lr,num_train_steps,num_warmup_steps = False,clip_norm=1.0): 158 | """Creates an optimizer training op.""" 159 | 160 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 161 | learning_rate = tf.train.polynomial_decay(learning_rate, 162 | global_step, 163 | num_train_steps, 164 | end_learning_rate=0.0, 165 | power=1.0, 166 | cycle=False) 167 | 168 | optimizer = AdamWeightDecayOptimizer( 169 | learning_rate=learning_rate, 170 | weight_decay_rate=0.01, 171 | beta_1=0.9, 172 | beta_2=0.999, 173 | epsilon=1e-6, 174 | exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']) 175 | 176 | tvars = tf.trainable_variables() 177 | grads = tf.gradients(loss, tvars) 178 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) 179 | train_op = optimizer.apply_gradients(zip(grads, tvars), 180 | global_step=global_step) 181 | 182 | # Normally the global step update is done inside of `apply_gradients`. However, `AdamWeightDecayOptimizer` doesn't 183 | # do this. But if you use a different optimizer, you should probably take this line out. 
184 | new_global_step = global_step + 1 185 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 186 | 187 | return train_op 188 | 189 | 190 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, clip_norm=1.0): 191 | """Creates an optimizer training op.""" 192 | global_step = tf.train.get_or_create_global_step() 193 | 194 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 195 | learning_rate = tf.train.polynomial_decay(learning_rate, 196 | global_step, 197 | num_train_steps, 198 | end_learning_rate=0.0, 199 | power=1.0, 200 | cycle=False) 201 | 202 | if num_warmup_steps: 203 | global_steps_int = tf.cast(global_step, tf.int32) 204 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 205 | 206 | global_steps_float = tf.cast(global_steps_int, tf.float32) 207 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 208 | 209 | warmup_percent_done = global_steps_float / warmup_steps_float 210 | warmup_learning_rate = init_lr * warmup_percent_done 211 | 212 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 213 | learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 214 | 215 | optimizer = AdamWeightDecayOptimizer(learning_rate=learning_rate, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, 216 | epsilon=1e-6, exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']) 217 | 218 | tvars = tf.trainable_variables() 219 | grads = tf.gradients(loss, tvars) 220 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) 221 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) 222 | 223 | # Normally the global step update is done inside of `apply_gradients`. However, `AdamWeightDecayOptimizer` doesn't 224 | # do this. But if you use a different optimizer, you should probably take this line out. 
225 | new_global_step = global_step + 1 226 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 227 | 228 | return train_op 229 | 230 | 231 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 232 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 233 | 234 | def __init__(self, learning_rate, weight_decay_rate=0.0, beta_1=0.9, beta_2=0.999, epsilon=1e-6, 235 | exclude_from_weight_decay=None, name='AdamWeightDecayOptimizer'): 236 | """Constructs a AdamWeightDecayOptimizer.""" 237 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 238 | 239 | self.learning_rate = learning_rate 240 | self.weight_decay_rate = weight_decay_rate 241 | self.beta_1 = beta_1 242 | self.beta_2 = beta_2 243 | self.epsilon = epsilon 244 | self.exclude_from_weight_decay = exclude_from_weight_decay 245 | 246 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 247 | """See base class.""" 248 | assignments = [] 249 | for (grad, param) in grads_and_vars: 250 | if grad is None or param is None: 251 | continue 252 | 253 | param_name = self._get_variable_name(param.name) 254 | 255 | m = tf.get_variable(name=param_name + '/adam_m', 256 | shape=param.shape.as_list(), 257 | dtype=tf.float32, 258 | trainable=False, 259 | initializer=tf.zeros_initializer()) 260 | 261 | v = tf.get_variable(name=param_name + '/adam_v', 262 | shape=param.shape.as_list(), 263 | dtype=tf.float32, 264 | trainable=False, 265 | initializer=tf.zeros_initializer()) 266 | 267 | next_m = (tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 268 | next_v = (tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, tf.square(grad))) 269 | 270 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 271 | if self._do_use_weight_decay(param_name): 272 | update += self.weight_decay_rate * param 273 | 274 | update_with_lr = self.learning_rate * update 275 | next_param = param - update_with_lr 276 | assignments.extend([param.assign(next_param), m.assign(next_m), v.assign(next_v)]) 277 | 278 | return tf.group(*assignments, name=name) 279 | 280 | def _do_use_weight_decay(self, param_name): 281 | """Whether to use L2 weight decay for `param_name`.""" 282 | if not self.weight_decay_rate: 283 | return False 284 | 285 | if self.exclude_from_weight_decay: 286 | for r in self.exclude_from_weight_decay: 287 | if re.search(r, param_name) is not None: 288 | return False 289 | 290 | return True 291 | 292 | @staticmethod 293 | def _get_variable_name(param_name): 294 | """Get the variable name from the tensor name.""" 295 | m = re.match("^(.*):\\d+$", param_name) 296 | 297 | if m is not None: 298 | param_name = m.group(1) 299 | 300 | return param_name 301 | 302 | def _apply_dense(self, grad, var): 303 | pass 304 | 305 | def _resource_apply_dense(self, grad, handle): 306 | pass 307 | 308 | def _resource_apply_sparse(self, grad, handle, indices): 309 | pass 310 | 311 | def _apply_sparse(self, grad, var): 312 | pass 313 | -------------------------------------------------------------------------------- /prepare/README.md: -------------------------------------------------------------------------------- 1 | # Extract Features 2 | 3 | - We use the pre-trained 3D ConvNets ([here](https://github.com/piergiaj/pytorch-i3d)) to prepare the visual features, the 4 | extraction codes are placed in this folder. 
Please download the pre-trained weights [`rgb_charades.pt`]( 5 | https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_charades.pt) and [`rgb_imagenet.pt`]( 6 | https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_imagenet.pt). 7 | - The pre-trained GloVe embeddings are available [here](https://nlp.stanford.edu/projects/glove/); please download 8 | `glove.840B.300d.zip`, unzip it, and put it under the `data/` folder. 9 | 10 | ## Charades-STA 11 | The train/test datasets of Charades-STA are available at [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL) 12 | ([`charades_sta_train.txt`](https://drive.google.com/file/d/1ZjG7wJpPSMIBYnW7BAG2u9VVEoNvFm5c/view) and 13 | [`charades_sta_test.txt`](https://drive.google.com/file/d/1QG4MXFkoj6JFU0YK5olTY75xTARKSW5e/view)). 14 | 15 | The `charades.json` file is required ([here](https://github.com/piergiaj/super-events-cvpr18/blob/master/data/charades.json)), 16 | as it contains the video length information. Download it and place it in the same directory as the train/test datasets. 17 | 18 | The videos/frames of the Charades-STA dataset are available [here](https://allenai.org/plato/charades/); please download 19 | either `RGB frames at 24fps (76 GB)` (image frames) or `Data (original size) (55 GB)` (videos). For the latter, the 20 | extractor will automatically decompose the videos into images. 21 | ```shell script 22 | # download RGB frames 23 | wget http://ai2-website.s3.amazonaws.com/data/Charades_v1_rgb.tar 24 | # or, download videos 25 | wget http://ai2-website.s3.amazonaws.com/data/Charades_v1.zip 26 | ``` 27 | 28 | Extract visual features for Charades-STA: 29 | ```shell script 30 | # use the weights fine-tuned on Charades or the weights pre-trained on ImageNet 31 | python3 extract_charades.py --use_finetuned --load_model /rgb_charades.pt \ # rgb_imagenet.pt 32 | --video_dir \ 33 | --dataset_dir \ 34 | --images_dir \ # if images do not exist, decompose the video into images 35 | --save_dir \ 36 | --fps 24 --strides 24 --remove_images # whether to remove the extracted images to free space 37 | ``` 38 | 39 | ## TACoS 40 | The TACoS dataset is from [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL), while its videos come from the MPII 41 | Cooking Composite Activities dataset, which can be downloaded [here]( 42 | https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/human-activity-recognition/mpii-cooking-composite-activities/). 43 | Note that we also use the processed TACoS dataset from [[microsoft/2D-TAN]](https://github.com/microsoft/2D-TAN). 
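All extraction scripts in this folder save one `<video_id>.npy` feature array per video plus a `feature_shapes.json` index in `--save_dir`. A minimal sketch for sanity-checking the output (the path below is a placeholder):

```python
import os
import json
import numpy as np

save_dir = "path/to/extracted/features"  # the --save_dir passed to the extraction script

with open(os.path.join(save_dir, "feature_shapes.json"), mode="r", encoding="utf-8") as f:
    feature_shapes = json.load(f)  # {video_id: number of feature vectors}

video_id = next(iter(feature_shapes))
features = np.load(os.path.join(save_dir, "{}.npy".format(video_id)))
print(video_id, features.shape, feature_shapes[video_id])  # shape[0] should match the index
```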
44 | 45 | Extract visual features for TACoS: 46 | ```shell script 47 | python3 extract_tacos.py --load_model /rgb_imagenet.pt \ 48 | --video_dir \ 49 | --dataset_dir \ 50 | --images_dir \ # if images not exist, decompose video into images 51 | --save_dir \ 52 | --strides 16 --remove_images # whether remove extracted images to release space 53 | ``` 54 | 55 | (Optional) Convert the pre-trained C3D visual features from [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL) 56 | ([Interval64_128_256_512_overlap0.8_c3d_fc6.tar](https://drive.google.com/file/d/1zQp0aYGFCm8PqqHOh4UtXfy2U3pJMBeu/view), 57 | [Interval128_256_overlap0.8_c3d_fc6.tar](https://drive.google.com/file/d/1zC-UrspRf42Qiu5prQw4fQrbgLQfJN-P/view)): 58 | ```shell script 59 | python3 extract_tacos_org.py --data_path \ 60 | --feature_path \ 61 | --save_dir \ 62 | --sample_rate 64 # sliding windows 63 | ``` 64 | 65 | ## ActivityNet Captions 66 | The train/test sets of ActivityNet Caption are available at [here]( 67 | https://cs.stanford.edu/people/ranjaykrishna/densevid/). The videos can be downloaded using: 68 | ```shell script 69 | python3 download_activitynet_video.py --video_dir \ 70 | --dataset_dir \ 71 | --bash_file 72 | ``` 73 | It will generate a bash file which contains the commands to download all the videos. Suppose the generated bash file is 74 | `video_downloader.sh`, then simply run `bash video_downloader.sh`, it will download the videos and save them into 75 | `video_dir` automatically. 76 | 77 | Extract visual features for ActivityNet Captions: 78 | ```shell script 79 | python3 extract_activitynet.py --load_model /rgb_imagenet.pt \ 80 | --video_dir \ 81 | --dataset_dir \ 82 | --images_dir \ # if images not exist, decompose video into images 83 | --save_dir \ 84 | --strides 16 --remove_images # whether remove extracted images to release space 85 | ``` 86 | 87 | (Optional) We also have the codes to convert the C3D visual features provided in [ActivityNet official website]( 88 | http://activity-net.org/challenges/2016/download.html): 89 | 90 | - download the C3D visual features 91 | ```shell script 92 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-00 93 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-01 94 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-02 95 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-03 96 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-04 97 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-05 98 | cat activitynet_v1-3.part-* > features.zip && unzip features.zip 99 | rm features.zip 100 | rm activitynet_v1-3.part-* 101 | ``` 102 | - convert the features as 103 | ```shell script 104 | python3 extract_activitynet_org.py --dataset_dir \ 105 | --hdf5_file \ 106 | --save_dir 107 | ``` 108 | -------------------------------------------------------------------------------- /prepare/convert_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | from tqdm import tqdm 6 | from argparse import ArgumentParser 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--dataset_dir", type=str, 
required=True, help="dataset path") 10 | parser.add_argument("--hdf5_file", type=str, required=True, help="downloaded activitynet features") 11 | parser.add_argument("--save_dir", type=str, required=True, help="save dir") 12 | args = parser.parse_args() 13 | 14 | with open(os.path.join(args.dataset_dir, "charades.json"), mode="r", encoding="utf-8") as f: 15 | all_data = json.load(f) 16 | # with open(os.path.join(args.dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 17 | # val_data = json.load(f) 18 | # with open(os.path.join(args.dataset_dir, "val_2.json"), mode="r", encoding="utf-8") as f: 19 | # test_data = json.load(f) 20 | 21 | video_ids = list(set(list(all_data.keys()))) 22 | # print(video_ids) 23 | # print(len(video_ids)) #9948 24 | 25 | with h5py.File(args.hdf5_file, mode="r") as f: 26 | print(type(f)) # 27 | print(len(f)) #9846 28 | # print(f.keys()) 29 | print(type(f['001YG'])) 30 | print(f['001YG'].shape) 31 | print(f['ZZ3HT'].shape) 32 | if not os.path.exists(args.save_dir): 33 | os.makedirs(args.save_dir) 34 | 35 | feature_shapes = dict() 36 | with h5py.File(args.hdf5_file, mode="r") as f: 37 | group_key = list(f.keys()) 38 | for key in tqdm(group_key, total=len(group_key), desc="extract features"): 39 | video_id = key 40 | if video_id not in video_ids: 41 | continue 42 | data = f[key] 43 | feature_shapes[video_id] = data.shape[0] 44 | np.save(os.path.join(args.save_dir, video_id), arr=data) 45 | # with h5py.File(args.hdf5_file, mode="r") as f: 46 | # group_key = list(f.keys()) 47 | # for key in tqdm(group_key, total=len(group_key), desc="extract features"): 48 | # video_id = key 49 | # if video_id not in video_ids: 50 | # continue 51 | # data = f[key]["c3d_features"][()] 52 | # feature_shapes[video_id] = data.shape[0] 53 | # np.save(os.path.join(args.save_dir, video_id), arr=data) 54 | 55 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 56 | json.dump(feature_shapes, f) 57 | -------------------------------------------------------------------------------- /prepare/download_activitynet_video.py: -------------------------------------------------------------------------------- 1 | """ 2 | Codes are modified from https://github.com/waybarrios/Anet_tools2.0 3 | """ 4 | import os 5 | import glob 6 | import json 7 | from argparse import ArgumentParser 8 | 9 | 10 | def crosscheck_videos(video_path, all_video_ids): 11 | # Get existing videos 12 | existing_videos = glob.glob("%s/*.mp4" % video_path) 13 | for idx, vid in enumerate(existing_videos): 14 | basename = os.path.basename(vid).split(".mp4")[0] 15 | if len(basename) == 13: 16 | existing_videos[idx] = basename[2:] 17 | elif len(basename) == 11: 18 | existing_videos[idx] = basename 19 | else: 20 | raise RuntimeError("Unknown filename format: %s", vid) 21 | 22 | non_existing_videos = [] 23 | for vid in all_video_ids: 24 | if vid in existing_videos: 25 | continue 26 | else: 27 | non_existing_videos.append(vid) 28 | 29 | return non_existing_videos 30 | 31 | 32 | def main(video_dir, dataset_dir, bash_file): 33 | with open(os.path.join(dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 34 | train_ids = list(json.load(f).keys()) 35 | train_ids = [vid[2:] if len(vid) == 13 else vid for vid in train_ids] 36 | 37 | with open(os.path.join(dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 38 | val_ids = list(json.load(f).keys()) 39 | val_ids = [vid[2:] if len(vid) == 13 else vid for vid in val_ids] 40 | 41 | with open(os.path.join(dataset_dir, "val_2.json"), 
mode="r", encoding="utf-8") as f: 42 | test_ids = list(json.load(f).keys()) 43 | test_ids = [vid[2:] if len(vid) == 13 else vid for vid in test_ids] 44 | 45 | all_video_ids = list(set(train_ids + val_ids + test_ids)) 46 | print("train_video_ids", len(train_ids)) 47 | print("val_1_video_ids", len(val_ids)) 48 | print("val_2_video_ids", len(test_ids)) 49 | print("all_video_ids", len(all_video_ids)) 50 | 51 | non_existing_videos = crosscheck_videos(video_dir, all_video_ids) 52 | 53 | # save command to bash file 54 | with open(bash_file + '.sh', mode="w", encoding="utf-8") as f: 55 | f.write("#!/usr/bin/env bash\n\n") # write bash file header 56 | filename = os.path.join(video_dir, "v_%s.mp4") 57 | cmd_base = "youtube-dl -f best -f mp4 " 58 | cmd_base += '"https://www.youtube.com/watch?v=%s" ' 59 | cmd_base += '-o "%s"' % filename 60 | 61 | for vid in non_existing_videos: 62 | cmd = cmd_base % (vid, vid) 63 | f.write("%s\n" % cmd) 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = ArgumentParser(description="Script to double check video content.") 68 | parser.add_argument("--video_dir", type=str, required=True, help="where to save the downloaded videos") 69 | parser.add_argument("--dataset_dir", type=str, required=True, help="where are the annotation files") 70 | parser.add_argument("--bash_file", type=str, required=True, help="where to save command list script") 71 | 72 | args = vars(parser.parse_args()) 73 | main(**args) 74 | """ 75 | After running this python file, it will generate an script file. Using the terminal to run this script, it will 76 | automatically download all the required videos from YouTube. 77 | """ 78 | -------------------------------------------------------------------------------- /prepare/extract_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import json 5 | import torch 6 | import argparse 7 | import subprocess 8 | import numpy as np 9 | from . 
import videotransforms 10 | from .feature_extractor import InceptionI3d 11 | from torchvision import transforms 12 | from torch.autograd import Variable 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 19 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 20 | parser.add_argument("--fps", type=int, default=None, help="frames per second") 21 | parser.add_argument("--video_format", type=str, default="mp4", help="video format") 22 | parser.add_argument("--strides", type=int, default=16, help="window size") 23 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 24 | args = parser.parse_args() 25 | 26 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 27 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 28 | 29 | 30 | def load_images(img_dir, vid, start_frame, lengths): 31 | img_frames, raw_height, raw_width = [], None, None 32 | for x in range(start_frame, start_frame + lengths): 33 | image = cv2.imread(os.path.join(img_dir, "{}-{}.jpg".format(vid, str(x).zfill(6))))[:, :, [2, 1, 0]] 34 | width, height, channel = image.shape 35 | raw_width, raw_height = width, height 36 | # resize image 37 | scale = 1 + (224.0 - min(width, height)) / min(width, height) 38 | image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale) 39 | # normalize image to [0, 1] 40 | image = (image / 255.0) * 2 - 1 41 | img_frames.append(image) 42 | return img_frames, raw_width, raw_height 43 | 44 | 45 | def extract_features(image_tensor, model, strides): 46 | b, c, t, h, w = image_tensor.shape 47 | extracted_features = [] 48 | for start in range(0, t, strides): 49 | end = min(t - 1, start + strides) 50 | if end - start < strides: 51 | start = max(0, end - strides) 52 | ip = Variable(torch.from_numpy(image_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 53 | feature = model.extract_features(ip).data.cpu().numpy() 54 | extracted_features.append(feature) 55 | extracted_features = np.concatenate(extracted_features, axis=0) 56 | return extracted_features 57 | 58 | 59 | if not os.path.exists(args.video_dir): 60 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 61 | 62 | if not os.path.exists(args.images_dir): 63 | os.makedirs(args.images_dir) 64 | 65 | if not os.path.exists(args.save_dir): 66 | os.makedirs(args.save_dir) 67 | 68 | # create I3D model and load pre-trained model 69 | i3d_model = InceptionI3d(400, in_channels=3) 70 | i3d_model.load_state_dict(torch.load(args.load_model)) 71 | i3d_model.cuda() 72 | i3d_model.train(False) 73 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 74 | 75 | # extract images and features 76 | feature_shapes = dict() 77 | video_paths = glob.glob(os.path.join(args.video_dir, "*.{}".format(args.video_format))) 78 | for idx, video_path in enumerate(video_paths): 79 | video_id = os.path.basename(video_path)[0:-4] # remove suffix 80 | image_dir = os.path.join(args.images_dir, video_id) 81 | 82 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_paths), video_id), flush=True) 83 | 84 | if 
os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 85 | print("the visual features for video {} are exist in {}...\n".format(video_id, args.save_dir), flush=True) 86 | continue 87 | 88 | # extract images 89 | if os.path.exists(image_dir): 90 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 91 | else: 92 | os.makedirs(image_dir) 93 | print("extract images with fps={}...".format(args.fps), flush=True) 94 | if args.fps is None or args.fps <= 0: 95 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format( 96 | video_path, image_dir, video_id), shell=True) 97 | else: 98 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps={} {}/{}-%6d.jpg".format( 99 | video_path, args.fps, image_dir, video_id), shell=True) 100 | 101 | # process extracted images 102 | print("load RGB frames...", flush=True) 103 | num_frames = len(os.listdir(image_dir)) 104 | 105 | if num_frames < 10000: 106 | frames, raw_w, raw_h = load_images(image_dir, video_id, 1, num_frames) 107 | frames = np.asarray(frames, dtype=np.float32) 108 | imgs = video_transforms(frames) 109 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 110 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 111 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 112 | 113 | print("extract visual features...", flush=True) 114 | features = extract_features(img_tensor, i3d_model, args.strides) 115 | np.save(os.path.join(args.save_dir, video_id), arr=features) 116 | print("extracted features shape: {}".format(features.shape), flush=True) 117 | feature_shapes[video_id] = features.shape[0] 118 | 119 | else: 120 | all_features = [] 121 | for start_idx in range(1, num_frames, 10000): 122 | end_idx = min(start_idx + 10000, num_frames + 1) 123 | cur_num_frames = end_idx - start_idx 124 | if cur_num_frames < args.strides: 125 | cur_num_frames = args.strides 126 | start_idx = end_idx - cur_num_frames 127 | frames, raw_w, raw_h = load_images(image_dir, video_id, start_idx, cur_num_frames) 128 | frames = np.asarray(frames, dtype=np.float32) 129 | imgs = video_transforms(frames) 130 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 131 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 132 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 133 | print("extract visual features...", flush=True) 134 | features = extract_features(img_tensor, i3d_model, args.strides) 135 | all_features.append(features) 136 | all_features = np.concatenate(all_features, axis=0) 137 | np.save(os.path.join(args.save_dir, video_id), arr=all_features) 138 | print("extracted features shape: {}".format(all_features.shape), flush=True) 139 | feature_shapes[video_id] = all_features.shape[0] 140 | 141 | if args.remove_images: 142 | # remove extract images to release memory space 143 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 144 | 145 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 146 | json.dump(feature_shapes, f) 147 | -------------------------------------------------------------------------------- /prepare/extract_activitynet_org.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | from tqdm import tqdm 6 | from argparse 
import ArgumentParser 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--dataset_dir", type=str, required=True, help="dataset path") 10 | parser.add_argument("--hdf5_file", type=str, required=True, help="downloaded activitynet features") 11 | parser.add_argument("--save_dir", type=str, required=True, help="save dir") 12 | args = parser.parse_args() 13 | 14 | with open(os.path.join(args.dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 15 | train_data = json.load(f) 16 | with open(os.path.join(args.dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 17 | val_data = json.load(f) 18 | with open(os.path.join(args.dataset_dir, "val_2.json"), mode="r", encoding="utf-8") as f: 19 | test_data = json.load(f) 20 | 21 | video_ids = list(set(list(train_data.keys()) + list(val_data.keys()) + list(test_data.keys()))) 22 | print(video_ids) 23 | print(len(video_ids)) 24 | 25 | if not os.path.exists(args.save_dir): 26 | os.makedirs(args.save_dir) 27 | 28 | feature_shapes = dict() 29 | with h5py.File(args.hdf5_file, mode="r") as f: 30 | group_key = list(f.keys()) 31 | for key in tqdm(group_key, total=len(group_key), desc="extract features"): 32 | video_id = key 33 | if video_id not in video_ids: 34 | continue 35 | data = f[key]["c3d_features"][()] 36 | feature_shapes[video_id] = data.shape[0] 37 | np.save(os.path.join(args.save_dir, video_id), arr=data) 38 | 39 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 40 | json.dump(feature_shapes, f) 41 | -------------------------------------------------------------------------------- /prepare/extract_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | import torch 5 | import argparse 6 | import subprocess 7 | import numpy as np 8 | from . 
import videotransforms 9 | from .feature_extractor import InceptionI3d 10 | from torchvision import transforms 11 | from torch.autograd import Variable 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 15 | parser.add_argument("--use_finetuned", action="store_true", help="whether use fine-tuned feature extractor") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--dataset_dir", type=str, required=True, help="where are located the dataset files") 19 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 20 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 21 | parser.add_argument("--fps", type=int, default=24, help="frames per second") 22 | parser.add_argument("--video_format", type=str, default="mp4", help="video format") 23 | parser.add_argument("--strides", type=int, default=24, help="window size") 24 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 25 | args = parser.parse_args() 26 | 27 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 28 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 29 | 30 | 31 | if not os.path.exists(args.video_dir): 32 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 33 | 34 | if not os.path.exists(args.images_dir): 35 | os.makedirs(args.images_dir) 36 | 37 | if not os.path.exists(args.save_dir): 38 | os.makedirs(args.save_dir) 39 | 40 | # create I3D model and load pre-trained model 41 | i3d_model = InceptionI3d(400, in_channels=3) 42 | if args.use_fine_tuned: 43 | i3d_model.replace_logits(157) # charades has 157 activity types 44 | i3d_model.load_state_dict(torch.load(args.load_model)) 45 | i3d_model.cuda() 46 | i3d_model.train(False) 47 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 48 | 49 | # load video ids 50 | video_ids = [] 51 | for filename in ["charades_sta_train.txt", "charades_sta_test.txt"]: 52 | with open(os.path.join(args.dataset_dir, filename), mode="r", encoding="utf-8") as f: 53 | for line in f: 54 | line = line.lstrip().rstrip() 55 | if len(line) == 0: 56 | continue 57 | vid = line.split("##")[0].split(" ")[0] 58 | video_ids.append(vid) 59 | video_ids = list(set(video_ids)) 60 | 61 | # extract images and features 62 | feature_shapes = dict() 63 | for idx, video_id in enumerate(video_ids): 64 | video_path = os.path.join(args.video_dir, "{}.mp4".format(video_id)) 65 | image_dir = os.path.join(args.images_dir, video_id) 66 | 67 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_ids), video_id), flush=True) 68 | 69 | if os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 70 | print("the visual features for video {} are exist in {}...\n".format(video_id, args.save_dir), flush=True) 71 | continue 72 | 73 | # extract images 74 | if os.path.exists(image_dir): 75 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 76 | else: 77 | os.makedirs(image_dir) 78 | print("extract images with fps={}...".format(args.fps), flush=True) 79 | if args.fps is None or args.fps <= 0: 80 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format( 81 | video_path, image_dir, video_id), 
shell=True) 82 | else: 83 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps={} {}/{}-%6d.jpg".format( 84 | video_path, args.fps, image_dir, video_id), shell=True) 85 | 86 | # process extracted images 87 | print("load RGB frames...", flush=True) 88 | num_frames = len(os.listdir(image_dir)) 89 | frames, raw_w, raw_h = [], None, None 90 | for i in range(1, num_frames + 1): 91 | # cv2.imread() read image with BGR format by default, so we convert it to RGB format 92 | img = cv2.imread(os.path.join(image_dir, "{}-{}.jpg".format(video_id, str(i).zfill(6))))[:, :, [2, 1, 0]] 93 | w, h, c = img.shape 94 | raw_w, raw_h = w, h 95 | if w < 226 or h < 226: 96 | d = 226. - min(w, h) 97 | sc = 1 + d / min(w, h) 98 | img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc) 99 | img = (img / 255.) * 2 - 1 100 | frames.append(img) 101 | frames = np.asarray(frames, dtype=np.float32) 102 | imgs = video_transforms(frames) 103 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 104 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 105 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 106 | 107 | if args.remove_images: 108 | # remove extract images to release memory space 109 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 110 | 111 | print("extract visual visual features...", flush=True) 112 | b, c, t, h, w = img_tensor.shape 113 | features = [] 114 | for start in range(0, t, args.strides): 115 | end = min(t - 1, start + args.strides) 116 | if end - start < args.strides: 117 | start = max(0, end - args.strides) 118 | ip = Variable(torch.from_numpy(img_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 119 | feature = i3d_model.extract_features(ip).data.cpu().numpy() 120 | features.append(feature) 121 | features = np.concatenate(features, axis=0) 122 | np.save(os.path.join(args.save_dir, video_id), arr=features) 123 | print("extracted feature shape: {}\n".format(features.shape), flush=True) 124 | feature_shapes[video_id] = features.shape[0] 125 | 126 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 127 | json.dump(feature_shapes, f) 128 | -------------------------------------------------------------------------------- /prepare/extract_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import json 5 | import torch 6 | import argparse 7 | import subprocess 8 | import numpy as np 9 | from . 
import videotransforms 10 | from .feature_extractor import InceptionI3d 11 | from torchvision import transforms 12 | from torch.autograd import Variable 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 19 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 20 | parser.add_argument("--fps", type=float, default=None, help="frames per second") # TACoS's default fps is 29.4 21 | parser.add_argument("--video_format", type=str, default="avi", help="video format") 22 | parser.add_argument("--strides", type=int, default=16, help="window size") 23 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 24 | args = parser.parse_args() 25 | 26 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 27 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 28 | 29 | 30 | def load_images(img_dir, vid, start_frame, lengths): 31 | img_frames, raw_height, raw_width = [], None, None 32 | for x in range(start_frame, start_frame + lengths): 33 | image = cv2.imread(os.path.join(img_dir, "{}-{}.jpg".format(vid, str(x).zfill(6))))[:, :, [2, 1, 0]] 34 | width, height, channel = image.shape 35 | raw_width, raw_height = width, height 36 | # resize image 37 | scale = 1 + (224.0 - min(width, height)) / min(width, height) 38 | image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale) 39 | # normalize image to [0, 1] 40 | image = (image / 255.0) * 2 - 1 41 | img_frames.append(image) 42 | return img_frames, raw_width, raw_height 43 | 44 | 45 | def extract_features(image_tensor, model, strides): 46 | b, c, t, h, w = image_tensor.shape 47 | extracted_features = [] 48 | for start in range(0, t, strides): 49 | end = min(t - 1, start + strides) 50 | if end - start < strides: 51 | start = max(0, end - strides) 52 | ip = Variable(torch.from_numpy(image_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 53 | feature = model.extract_features(ip).data.cpu().numpy() 54 | extracted_features.append(feature) 55 | extracted_features = np.concatenate(extracted_features, axis=0) 56 | return extracted_features 57 | 58 | 59 | if not os.path.exists(args.video_dir): 60 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 61 | 62 | if not os.path.exists(args.images_dir): 63 | os.makedirs(args.images_dir) 64 | 65 | if not os.path.exists(args.save_dir): 66 | os.makedirs(args.save_dir) 67 | 68 | # create I3D model and load pre-trained model 69 | i3d_model = InceptionI3d(400, in_channels=3) 70 | i3d_model.load_state_dict(torch.load(args.load_model)) 71 | i3d_model.cuda() 72 | i3d_model.train(False) 73 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 74 | 75 | # extract images and features 76 | feature_shapes = dict() 77 | video_paths = glob.glob(os.path.join(args.video_dir, "*.{}".format(args.video_format))) 78 | for idx, video_path in enumerate(video_paths): 79 | video_id = os.path.basename(video_path)[0:-4] # remove suffix 80 | image_dir = os.path.join(args.images_dir, video_id) 81 | 82 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_paths), video_id), flush=True) 83 | 84 | if 
os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 85 | print("the visual features for video {} are exist in {}...".format(video_id, args.save_dir), flush=True) 86 | continue 87 | 88 | # extract images 89 | if os.path.exists(image_dir): 90 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 91 | else: 92 | os.makedirs(image_dir) 93 | print("extract images with fps={}...".format(args.fps), flush=True) 94 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format(video_path, image_dir, 95 | video_id), shell=True) 96 | 97 | # process extracted images 98 | print("load RGB frames...", flush=True) 99 | num_frames = len(os.listdir(image_dir)) 100 | 101 | if num_frames < 10000: 102 | frames, raw_w, raw_h = load_images(image_dir, video_id, 1, num_frames) 103 | frames = np.asarray(frames, dtype=np.float32) 104 | imgs = video_transforms(frames) 105 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 106 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 107 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 108 | 109 | print("extract visual features...", flush=True) 110 | features = extract_features(img_tensor, i3d_model, args.strides) 111 | np.save(os.path.join(args.save_dir, video_id), arr=features) 112 | print("extracted features shape: {}".format(features.shape), flush=True) 113 | feature_shapes[video_id] = features.shape[0] 114 | 115 | else: 116 | all_features = [] 117 | for start_idx in range(1, num_frames, 10000): 118 | end_idx = min(start_idx + 10000, num_frames + 1) 119 | cur_num_frames = end_idx - start_idx 120 | if cur_num_frames < args.strides: 121 | cur_num_frames = args.strides 122 | start_idx = end_idx - cur_num_frames 123 | frames, raw_w, raw_h = load_images(image_dir, video_id, start_idx, cur_num_frames) 124 | frames = np.asarray(frames, dtype=np.float32) 125 | imgs = video_transforms(frames) 126 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 127 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 128 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 129 | print("extract visual features...", flush=True) 130 | features = extract_features(img_tensor, i3d_model, args.strides) 131 | all_features.append(features) 132 | all_features = np.concatenate(all_features, axis=0) 133 | np.save(os.path.join(args.save_dir, video_id), arr=all_features) 134 | print("extracted features shape: {}".format(all_features.shape), flush=True) 135 | feature_shapes[video_id] = all_features.shape[0] 136 | 137 | if args.remove_images: 138 | # remove extract images to release memory space 139 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 140 | 141 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 142 | json.dump(feature_shapes, f) 143 | -------------------------------------------------------------------------------- /prepare/extract_tacos_org.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | # 1. step download pre-trained C3D features from https://github.com/jiyanggao/TALL 8 | # 2. 
convert the features 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--data_path", type=str, required=True, default="tacos dataset") 12 | parser.add_argument("--feature_path", type=str, required=True, help="pre-trained C3D features") 13 | parser.add_argument("--save_dir", type=str, required=True, help="extracted feature save path") 14 | parser.add_argument("--sample_rate", type=int, default=64, help="sample rate [64 | 128 | 256 | 512]") 15 | args = parser.parse_args() 16 | 17 | stride = args.sample_rate // 5 # due to 0.8 overlap of the pre-trained C3D features 18 | 19 | if not os.path.exists(args.save_dir): 20 | os.makedirs(args.save_dir) 21 | 22 | with open(os.path.join(args.data_path, "train.json"), mode="r", encoding="utf-8") as f: 23 | dataset = json.load(f) 24 | with open(os.path.join(args.data_path, "val.json"), mode="r", encoding="utf-8") as f: 25 | dataset.update(json.load(f)) 26 | with open(os.path.join(args.data_path, "test.json"), mode="r", encoding="utf-8") as f: 27 | dataset.update(json.load(f)) 28 | 29 | feature_shapes = dict() 30 | for video_id, annotations in tqdm(dataset.items(), total=len(dataset), desc=""): 31 | video_features = [] 32 | num_frames = annotations["num_frames"] - 16 # trick from 2D-TAN 33 | for idx in range(0, (num_frames - args.sample_rate) // stride + 1): 34 | s_idx = idx * stride + 1 35 | e_idx = s_idx + args.sample_rate 36 | feature_path = os.path.join(args.feature_path, "{}_{}_{}.npy".format(video_id, s_idx, e_idx)) 37 | feature = np.load(feature_path) 38 | video_features.append(feature) 39 | video_features = np.stack(video_features, axis=0) 40 | np.save(os.path.join(args.save_dir, video_id), arr=video_features) 41 | feature_shapes[video_id] = video_features.shape[0] 42 | 43 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 44 | json.dump(feature_shapes, f) 45 | -------------------------------------------------------------------------------- /prepare/feature_extractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloaded from https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py 3 | Minor modification are applied to fit our requirements 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class MaxPool3dSamePadding(nn.MaxPool3d): 11 | 12 | def compute_pad(self, dim, s): 13 | if s % self.stride[dim] == 0: 14 | return max(self.kernel_size[dim] - self.stride[dim], 0) 15 | else: 16 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 17 | 18 | def forward(self, x): 19 | # compute 'same' padding 20 | (batch, channel, t, h, w) = x.size() 21 | pad_t = self.compute_pad(0, t) 22 | pad_h = self.compute_pad(1, h) 23 | pad_w = self.compute_pad(2, w) 24 | 25 | pad_t_f = pad_t // 2 26 | pad_t_b = pad_t - pad_t_f 27 | pad_h_f = pad_h // 2 28 | pad_h_b = pad_h - pad_h_f 29 | pad_w_f = pad_w // 2 30 | pad_w_b = pad_w - pad_w_f 31 | 32 | pad = [pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b] 33 | x = F.pad(x, pad) 34 | return super(MaxPool3dSamePadding, self).forward(x) 35 | 36 | 37 | class Unit3D(nn.Module): 38 | 39 | def __init__(self, in_channels, 40 | output_channels, 41 | kernel_shape=(1, 1, 1), 42 | stride=(1, 1, 1), 43 | padding=0, 44 | activation_fn=None, 45 | use_batch_norm=True, 46 | use_bias=False, 47 | name='unit_3d'): 48 | 49 | """Initializes Unit3D module.""" 50 | super(Unit3D, self).__init__() 51 | 52 | self._output_channels = output_channels 53 | 
self._kernel_shape = kernel_shape 54 | self._stride = stride 55 | self._use_batch_norm = use_batch_norm 56 | self._activation_fn = activation_fn 57 | self._use_bias = use_bias 58 | self.name = name 59 | self.padding = padding 60 | 61 | self.conv3d = nn.Conv3d(in_channels=in_channels, 62 | out_channels=self._output_channels, 63 | kernel_size=self._kernel_shape, 64 | stride=self._stride, 65 | padding=0, 66 | # we always want padding to be 0 here. We will dynamically pad based on input size 67 | # in forward function 68 | bias=self._use_bias) 69 | 70 | if self._use_batch_norm: 71 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 72 | 73 | def compute_pad(self, dim, s): 74 | if s % self._stride[dim] == 0: 75 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 76 | else: 77 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 78 | 79 | def forward(self, x): 80 | # compute 'same' padding 81 | (batch, channel, t, h, w) = x.size() 82 | pad_t = self.compute_pad(0, t) 83 | pad_h = self.compute_pad(1, h) 84 | pad_w = self.compute_pad(2, w) 85 | 86 | pad_t_f = pad_t // 2 87 | pad_t_b = pad_t - pad_t_f 88 | pad_h_f = pad_h // 2 89 | pad_h_b = pad_h - pad_h_f 90 | pad_w_f = pad_w // 2 91 | pad_w_b = pad_w - pad_w_f 92 | 93 | pad = [pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b] 94 | x = F.pad(x, pad) 95 | 96 | x = self.conv3d(x) 97 | if self._use_batch_norm: 98 | x = self.bn(x) 99 | if self._activation_fn is not None: 100 | x = self._activation_fn(x) 101 | return x 102 | 103 | 104 | class InceptionModule(nn.Module): 105 | def __init__(self, in_channels, out_channels, name): 106 | super(InceptionModule, self).__init__() 107 | 108 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 109 | activation_fn=F.relu, name=name + '/Branch_0/Conv3d_0a_1x1') 110 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 111 | activation_fn=F.relu, name=name + '/Branch_1/Conv3d_0a_1x1') 112 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], 113 | activation_fn=F.relu, name=name + '/Branch_1/Conv3d_0b_3x3') 114 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 115 | activation_fn=F.relu, name=name + '/Branch_2/Conv3d_0a_1x1') 116 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], 117 | activation_fn=F.relu, name=name + '/Branch_2/Conv3d_0b_3x3') 118 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], 119 | stride=(1, 1, 1), padding=0) 120 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, 121 | activation_fn=F.relu, name=name + '/Branch_3/Conv3d_0b_1x1') 122 | self.name = name 123 | 124 | def forward(self, x): 125 | b0 = self.b0(x) 126 | b1 = self.b1b(self.b1a(x)) 127 | b2 = self.b2b(self.b2a(x)) 128 | b3 = self.b3b(self.b3a(x)) 129 | return torch.cat([b0, b1, b2, b3], dim=1) 130 | 131 | 132 | class InceptionI3d(nn.Module): 133 | """Inception-v1 I3D architecture. 134 | The model is introduced in: 135 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 136 | Joao Carreira, Andrew Zisserman 137 | https://arxiv.org/pdf/1705.07750v1.pdf. 
138 | See also the Inception architecture, introduced in: 139 | Going deeper with convolutions 140 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 141 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 142 | http://arxiv.org/pdf/1409.4842v1.pdf. 143 | """ 144 | 145 | # Endpoints of the model in order. During construction, all the endpoints up 146 | # to a designated `final_endpoint` are returned in a dictionary as the 147 | # second return value. 148 | VALID_ENDPOINTS = ( 149 | 'Conv3d_1a_7x7', 150 | 'MaxPool3d_2a_3x3', 151 | 'Conv3d_2b_1x1', 152 | 'Conv3d_2c_3x3', 153 | 'MaxPool3d_3a_3x3', 154 | 'Mixed_3b', 155 | 'Mixed_3c', 156 | 'MaxPool3d_4a_3x3', 157 | 'Mixed_4b', 158 | 'Mixed_4c', 159 | 'Mixed_4d', 160 | 'Mixed_4e', 161 | 'Mixed_4f', 162 | 'MaxPool3d_5a_2x2', 163 | 'Mixed_5b', 164 | 'Mixed_5c', 165 | 'Logits', 166 | 'Predictions', 167 | ) 168 | 169 | def __init__(self, num_classes=400, spatial_squeeze=True, 170 | final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5): 171 | """Initializes I3D model instance. 172 | Args: 173 | num_classes: The number of outputs in the logit layer (default 400, which 174 | matches the Kinetics dataset). 175 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 176 | before returning (default True). 177 | final_endpoint: The model contains many possible endpoints. 178 | `final_endpoint` specifies the last endpoint for the model to be built 179 | up to. In addition to the output at `final_endpoint`, all the outputs 180 | at endpoints up to `final_endpoint` will also be returned, in a 181 | dictionary. `final_endpoint` must be one of 182 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 183 | name: A string (optional). The name of this module. 184 | Raises: 185 | ValueError: if `final_endpoint` is not recognized. 
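        Example (illustrative; assumes a single 64-frame RGB clip):
            >>> import torch
            >>> model = InceptionI3d(num_classes=400, in_channels=3)
            >>> clip = torch.randn(1, 3, 64, 224, 224)  # [batch, channels, time, height, width]
            >>> feats = model.extract_features(clip)  # -> [time', 1024] pooled features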
186 | """ 187 | 188 | if final_endpoint not in self.VALID_ENDPOINTS: 189 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 190 | 191 | super(InceptionI3d, self).__init__() 192 | self._num_classes = num_classes 193 | self._spatial_squeeze = spatial_squeeze 194 | self._final_endpoint = final_endpoint 195 | self.logits = None 196 | 197 | if self._final_endpoint not in self.VALID_ENDPOINTS: 198 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 199 | 200 | self.end_points = {} 201 | end_point = 'Conv3d_1a_7x7' 202 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], 203 | activation_fn=F.relu, stride=(2, 2, 2), padding=3, # padding=(3, 3, 3), 204 | name=name + end_point) 205 | if self._final_endpoint == end_point: 206 | return 207 | 208 | end_point = 'MaxPool3d_2a_3x3' 209 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 210 | padding=0) 211 | if self._final_endpoint == end_point: 212 | return 213 | 214 | end_point = 'Conv3d_2b_1x1' 215 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, 216 | activation_fn=F.relu, name=name + end_point) 217 | if self._final_endpoint == end_point: 218 | return 219 | 220 | end_point = 'Conv3d_2c_3x3' 221 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, 222 | activation_fn=F.relu, name=name + end_point) 223 | if self._final_endpoint == end_point: 224 | return 225 | 226 | end_point = 'MaxPool3d_3a_3x3' 227 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 228 | padding=0) 229 | if self._final_endpoint == end_point: 230 | return 231 | 232 | end_point = 'Mixed_3b' 233 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 234 | if self._final_endpoint == end_point: 235 | return 236 | 237 | end_point = 'Mixed_3c' 238 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 239 | if self._final_endpoint == end_point: 240 | return 241 | 242 | end_point = 'MaxPool3d_4a_3x3' 243 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), 244 | padding=0) 245 | if self._final_endpoint == end_point: 246 | return 247 | 248 | end_point = 'Mixed_4b' 249 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 250 | if self._final_endpoint == end_point: 251 | return 252 | 253 | end_point = 'Mixed_4c' 254 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 255 | if self._final_endpoint == end_point: 256 | return 257 | 258 | end_point = 'Mixed_4d' 259 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 260 | if self._final_endpoint == end_point: 261 | return 262 | 263 | end_point = 'Mixed_4e' 264 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 265 | if self._final_endpoint == end_point: 266 | return 267 | 268 | end_point = 'Mixed_4f' 269 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], 270 | name + end_point) 271 | if self._final_endpoint == end_point: 272 | return 273 | 274 | end_point = 'MaxPool3d_5a_2x2' 275 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 
2], stride=(2, 2, 2), 276 | padding=0) 277 | if self._final_endpoint == end_point: 278 | return 279 | 280 | end_point = 'Mixed_5b' 281 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], 282 | name + end_point) 283 | if self._final_endpoint == end_point: 284 | return 285 | 286 | end_point = 'Mixed_5c' 287 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], 288 | name + end_point) 289 | if self._final_endpoint == end_point: 290 | return 291 | 292 | # end_point = 'Logits' 293 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) 294 | self.dropout = nn.Dropout(dropout_keep_prob) 295 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 296 | kernel_shape=[1, 1, 1], 297 | padding=0, 298 | use_batch_norm=False, 299 | use_bias=True, 300 | name='logits') 301 | 302 | self.build() 303 | 304 | def replace_logits(self, num_classes): 305 | self._num_classes = num_classes 306 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 307 | kernel_shape=[1, 1, 1], 308 | padding=0, 309 | use_batch_norm=False, 310 | use_bias=True, 311 | name='logits') 312 | 313 | def build(self): 314 | for k in self.end_points.keys(): 315 | self.add_module(k, self.end_points[k]) 316 | 317 | def forward(self, x): 318 | for end_point in self.VALID_ENDPOINTS: 319 | if end_point in self.end_points: 320 | x = self._modules[end_point](x) # use _modules to work with data parallel 321 | x = self.avg_pool(x) 322 | logits = self.logits(self.dropout(x)) 323 | if self._spatial_squeeze: 324 | logits = logits.squeeze(3).squeeze(3) # [batch, classes, time, 1, 1] -> [batch, classes, time] 325 | # logits is batch X time X classes, which is what we want to work with 326 | return logits 327 | 328 | def extract_features(self, x): 329 | for end_point in self.VALID_ENDPOINTS: 330 | if end_point in self.end_points: 331 | x = self._modules[end_point](x) 332 | # x = [batch_size, channels, time, height, width] 333 | x = self.avg_pool(x) # 384 + 384 + 128 + 128 = 1024 334 | x = x.squeeze(0).permute(1, 2, 3, 0) # x = [time, height, width, channels] 335 | x = x.squeeze(1).squeeze(1) # x = [time, channels] 336 | return x 337 | -------------------------------------------------------------------------------- /prepare/videotransforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numbers 3 | import random 4 | 5 | 6 | class RandomCrop(object): 7 | """Crop the given video sequences (t x h x w) at a random location. 8 | Args: 9 | size (sequence or int): Desired output size of the crop. If size is an 10 | int instead of sequence like (h, w), a square crop (size, size) is 11 | made. 12 | """ 13 | 14 | def __init__(self, size): 15 | if isinstance(size, numbers.Number): 16 | self.size = (size, size) 17 | else: 18 | self.size = size 19 | 20 | @staticmethod 21 | def get_params(img, output_size): 22 | """Get parameters for ``crop`` for a random crop. 23 | Args: 24 | img (numpy array): Video clip of shape (t, h, w, c) to be cropped. 25 | output_size (tuple): Expected output size of the crop. 26 | Returns: 27 | tuple: params (i, j, h, w) to be passed to ``crop`` for random crop.
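        Example (illustrative): for a clip of shape (t, 240, 320, 3) and output_size (224, 224),
        this returns (i, j, 224, 224) with 0 <= i <= 16 and 0 <= j <= 96.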
28 | """ 29 | t, h, w, c = img.shape 30 | th, tw = output_size 31 | if w == tw and h == th: 32 | return 0, 0, h, w 33 | 34 | i = random.randint(0, h - th) if h != th else 0 35 | j = random.randint(0, w - tw) if w != tw else 0 36 | return i, j, th, tw 37 | 38 | def __call__(self, imgs): 39 | 40 | i, j, h, w = self.get_params(imgs, self.size) 41 | 42 | imgs = imgs[:, i:i + h, j:j + w, :] 43 | return imgs 44 | 45 | def __repr__(self): 46 | return self.__class__.__name__ + '(size={0})'.format(self.size) 47 | 48 | 49 | class CenterCrop(object): 50 | """Crops the given seq Images at the center. 51 | Args: 52 | size (sequence or int): Desired output size of the crop. If size is an 53 | int instead of sequence like (h, w), a square crop (size, size) is 54 | made. 55 | """ 56 | 57 | def __init__(self, size): 58 | if isinstance(size, numbers.Number): 59 | self.size = (size, size) 60 | else: 61 | self.size = size 62 | 63 | def __call__(self, imgs): 64 | """ 65 | Args: 66 | imgs (PIL Image): Image to be cropped. 67 | Returns: 68 | PIL Image: Cropped image. 69 | """ 70 | t, h, w, c = imgs.shape 71 | th, tw = self.size 72 | i = int(np.round((h - th) / 2.)) 73 | j = int(np.round((w - tw) / 2.)) 74 | 75 | return imgs[:, i:i + th, j:j + tw, :] 76 | 77 | def __repr__(self): 78 | return self.__class__.__name__ + '(size={0})'.format(self.size) 79 | 80 | 81 | class RandomHorizontalFlip(object): 82 | """Horizontally flip the given seq Images randomly with a given probability. 83 | Args: 84 | p (float): probability of the image being flipped. Default value is 0.5 85 | """ 86 | 87 | def __init__(self, p=0.5): 88 | self.p = p 89 | 90 | def __call__(self, imgs): 91 | """ 92 | Args: 93 | imgs (seq Images): seq Images to be flipped. 94 | Returns: 95 | seq Images: Randomly flipped seq images. 
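    Note: the flip in __call__ below is applied along the width axis (axis=2 of the t x h x w x c clip array).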
96 | """ 97 | if random.random() < self.p: 98 | # t x h x w 99 | return np.flip(imgs, axis=2).copy() 100 | return imgs 101 | 102 | def __repr__(self): 103 | return self.__class__.__name__ + '(p={})'.format(self.p) 104 | -------------------------------------------------------------------------------- /run_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | from argparse import ArgumentParser 7 | from models.LPNet import LPNet 8 | from utils.prepro_activitynet import prepro_activitynet 9 | from utils.data_utils import load_video_features, load_json, write_json, batch_iter 10 | from utils.runner_utils import write_tf_summary, eval_test, get_feed_dict 11 | import json 12 | 13 | parser = ArgumentParser() 14 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 15 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 16 | parser.add_argument("--mode", type=str, default="train", help="prepro | train | test") 17 | parser.add_argument("--feature", type=str, default='new', help="[new | org]") 18 | parser.add_argument("--root", type=str, default='data/ActivityNet', help="root directory for store raw data") 19 | parser.add_argument("--wordvec_path", type=str, default='data/glove.840B.300d.txt', help="glove word embedding path") 20 | parser.add_argument("--home_dir", type=str, default=None, help="home directory for saving models") 21 | parser.add_argument("--save_dir", type=str, default=None, help="directory for saving processed dataset") 22 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 23 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 24 | parser.add_argument("--epochs", type=int, default=100, help="number of epochs") 25 | parser.add_argument("--batch_size", type=int, default=16, help="batch size") 26 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 27 | parser.add_argument("--video_feature_dim", type=int, default=1024, help="video feature input dimension") 28 | parser.add_argument("--char_dim", type=int, default=100, help="character dimension") 29 | parser.add_argument("--hidden_size", type=int, default=256, help="hidden size") 30 | parser.add_argument("--max_position_length", type=int, default=512, help="max position length") 31 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 32 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 33 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 34 | parser.add_argument("--drop_rate", type=float, default=0.1, help="dropout rate") 35 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 36 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 37 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 38 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 39 | parser.add_argument("--eval_period", type=int, default=37421, help="evaluation period") 40 | configs = parser.parse_args() 41 | 42 | # os environment 43 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 44 | os.environ["CUDA_VISIBLE_DEVICES"] = configs.gpu_idx 45 | 46 | np.random.seed(configs.seed) 47 
| tf.set_random_seed(configs.seed) 48 | tf.random.set_random_seed(configs.seed) 49 | 50 | class MyEncoder(json.JSONEncoder): 51 | def default(self, obj): 52 | if isinstance(obj, np.integer): 53 | return int(obj) 54 | elif isinstance(obj, np.floating): 55 | return float(obj) 56 | elif isinstance(obj, np.ndarray): 57 | return obj.tolist() 58 | if isinstance(obj, time): 59 | return obj.__str__() 60 | else: 61 | return super(NpEncoder, self).default(obj) 62 | 63 | # specify the dataset directory 64 | if configs.home_dir is None: 65 | configs.home_dir = "ckpt/activitynet_{}_{}".format(configs.feature, configs.max_position_length) 66 | configs.save_dir = "datasets/activitynet_{}/{}".format(configs.feature, configs.max_position_length) 67 | configs.video_feature_dim = 1024 if configs.feature == "new" else 500 68 | 69 | if configs.mode.lower() == "prepro": 70 | prepro_activitynet(configs) 71 | 72 | elif configs.mode.lower() == "train": 73 | video_feature_path = os.path.join(configs.root, "activitynet_features_{}".format(configs.feature)) 74 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 75 | 76 | train_set = load_json(os.path.join(configs.save_dir, "train_set.json")) 77 | test_set = load_json(os.path.join(configs.save_dir, "test2_set.json")) 78 | num_train_batches = math.ceil(len(train_set) / configs.batch_size) 79 | 80 | if configs.num_train_steps is None: 81 | configs.num_train_steps = num_train_batches * configs.epochs 82 | if configs.char_size is None: 83 | configs.char_size = len(load_json(os.path.join(configs.save_dir, "char_dict.json"))) 84 | 85 | log_dir = os.path.join(configs.home_dir, "event") 86 | model_dir = os.path.join(configs.home_dir, "model") 87 | if not os.path.exists(model_dir): 88 | os.makedirs(model_dir) 89 | if not os.path.exists(log_dir): 90 | os.makedirs(log_dir) 91 | 92 | # write configs to json file 93 | write_json(vars(configs), save_path=os.path.join(model_dir, "configs.json"), pretty=True) 94 | 95 | with tf.Graph().as_default() as graph: 96 | model = LPNet(configs, graph=graph) 97 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 98 | sess_config.gpu_options.allow_growth = True 99 | 100 | with tf.Session(config=sess_config) as sess: 101 | learning_rate = tf.train.exponential_decay(learning_rate=configs.init_lr, global_step=model.global_step, decay_steps=100000, decay_rate=0.9,staircase=True) 102 | 103 | optimizer = tf.train.AdamOptimizer(learning_rate, 104 | beta1=0.9, 105 | beta2=0.999, 106 | name='AdamOptimizer') 107 | # train_op = optimizer.minimize(model.my_loss, global_step=model.global_step) 108 | trainable_vars = tf.trainable_variables() 109 | freeze_bbox_var_list = [ 110 | t for t in trainable_vars 111 | if not t.name.startswith(u'proposal_box') 112 | ] 113 | bbox_var_list = [ 114 | t for t in trainable_vars if t.name.startswith(u'proposal_box') 115 | ] 116 | train_op1 = optimizer.minimize(model.reg_loss, 117 | global_step=model.global_step, 118 | var_list=freeze_bbox_var_list) 119 | train_op2 = optimizer.minimize(model.my_loss, 120 | global_step=model.global_step, 121 | var_list=bbox_var_list) 122 | 123 | saver = tf.train.Saver(max_to_keep=5) 124 | writer = tf.summary.FileWriter(log_dir) 125 | sess.run(tf.global_variables_initializer()) 126 | 127 | best_r1i7 = -1.0 128 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 129 | l = 0 130 | r = 0 131 | o = 0 132 | for epoch in range(configs.epochs): 133 | for data in 
tqdm(batch_iter(train_set, video_features, configs.batch_size, configs.extend, True, True), 134 | total=num_train_batches, desc="Epoch %d / %d" % (epoch + 1, configs.epochs)): 135 | 136 | # run the model 137 | feed_dict = get_feed_dict(data, model, configs.drop_rate) 138 | 139 | _, _, loss, rloss, iloss, lloss, global_step = sess.run([train_op1, train_op2, model.my_loss, model.reg_loss, model.iou_loss, model.l1_loss, model.global_step], feed_dict=feed_dict) 140 | 141 | if global_step % configs.period == 0: 142 | # write_tf_summary(writer, [("train/my_loss", loss)], global_step) 143 | write_tf_summary(writer, [("train/my_loss", loss), 144 | ("train/reg_loss", rloss), 145 | ("train/iou_loss", iloss), 146 | ("train/l1_loss", lloss)], 147 | global_step) 148 | # evaluate 149 | # if global_step % configs.eval_period == 0 or global_step % num_train_batches == 0: 150 | if (global_step/2+1) % num_train_batches == 0: 151 | 152 | r1i3, r1i5, r1i7, mi, value_pairs, score_str = eval_test( 153 | sess=sess, model=model, dataset=test_set, video_features=video_features, 154 | configs=configs, epoch=epoch + 1, global_step=global_step, name="test") 155 | 156 | write_tf_summary(writer, value_pairs, global_step) 157 | score_writer.write(score_str) 158 | score_writer.flush() 159 | 160 | # save the model according to the result of Rank@1, IoU=0.7 161 | if r1i7 > best_r1i7: 162 | best_r1i7 = r1i7 163 | filename = os.path.join(model_dir, "model_{}.ckpt".format(global_step)) 164 | saver.save(sess, filename) 165 | 166 | score_writer.close() 167 | 168 | elif configs.mode.lower() == "test": 169 | 170 | # load previous configs 171 | model_dir = os.path.join(configs.home_dir, "model") 172 | pre_configs = load_json(os.path.join(model_dir, "configs.json")) 173 | parser.set_defaults(**pre_configs) 174 | configs = parser.parse_args() 175 | 176 | # load video features 177 | video_feature_path = os.path.join(configs.root, "activitynet_features_{}".format(configs.feature)) 178 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 179 | 180 | # load test dataset 181 | test_set = load_json(os.path.join(configs.save_dir, "test2_set.json")) 182 | 183 | # restore model and evaluate 184 | with tf.Graph().as_default() as graph: 185 | model = LPNet(configs, graph=graph) 186 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 187 | sess_config.gpu_options.allow_growth = True 188 | 189 | with tf.Session(config=sess_config) as sess: 190 | saver = tf.train.Saver() 191 | sess.run(tf.global_variables_initializer()) 192 | saver.restore(sess, tf.train.latest_checkpoint(model_dir)) 193 | 194 | r1i3, r1i5, r1i7, mi, *_ = eval_test(sess, model, dataset=test_set, video_features=video_features, 195 | configs=configs, name="test") 196 | 197 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 198 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 199 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 200 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi[0]) + "\x1b[0m", flush=True) 201 | 202 | else: 203 | raise ValueError("Unknown mode {}!!!".format(configs.mode)) 204 | -------------------------------------------------------------------------------- /run_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import tensorflow as tf 5 | 
from tensorflow.python.keras.backend import learning_phase 6 | from tqdm import tqdm 7 | from argparse import ArgumentParser 8 | from models.LPNet import LPNet 9 | from utils.prepro_charades import prepro_charades 10 | from utils.data_utils import load_video_features, load_json, write_json, batch_iter 11 | from utils.runner_utils import write_tf_summary, eval_test, get_feed_dict 12 | import json 13 | import scipy.signal as signal 14 | 15 | parser = ArgumentParser() 16 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 17 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 18 | parser.add_argument("--mode", type=str, default="train", help="prepro | train | test") 19 | parser.add_argument("--feature", type=str, default='c3d', help="[finetune | raw]") 20 | parser.add_argument("--root", type=str, default='data/Charades', help="root directory for store raw data") 21 | parser.add_argument("--wordvec_path", type=str, default='data/glove.840B.300d.txt', help="glove word embedding path") 22 | parser.add_argument("--home_dir", type=str, default=None, help="home directory for saving models") 23 | parser.add_argument("--save_dir", type=str, default=None, help="directory for saving processed dataset") 24 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 25 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 26 | parser.add_argument("--epochs", type=int, default=200, help="number of epochs") 27 | parser.add_argument("--batch_size", type=int, default=32, help="batch size") 28 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 29 | parser.add_argument("--video_feature_dim", type=int, default=500, help="video feature input dimension") 30 | parser.add_argument("--char_dim", type=int, default=50, help="character dimension") 31 | parser.add_argument("--hidden_size", type=int, default=256, help="hidden size") 32 | parser.add_argument("--max_position_length", type=int, default=256, help="max position length") 33 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 34 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 35 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 36 | parser.add_argument("--drop_rate", type=float, default=0.1, help="dropout rate") 37 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 38 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 39 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 40 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 41 | parser.add_argument("--eval_period", type=int, default=1000, help="evaluation period") 42 | configs = parser.parse_args() 43 | 44 | # os environment 45 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 46 | os.environ["CUDA_VISIBLE_DEVICES"] = configs.gpu_idx 47 | 48 | np.random.seed(configs.seed) 49 | tf.set_random_seed(configs.seed) 50 | tf.random.set_random_seed(configs.seed) 51 | 52 | # specify the dataset and model directory 53 | if configs.home_dir is None: 54 | configs.home_dir = "ckpt/charades_{}_{}".format(configs.feature, configs.max_position_length) 55 | configs.save_dir = "datasets/charades_{}/{}".format(configs.feature, configs.max_position_length) 56 | 57 | class 
MyEncoder(json.JSONEncoder): 58 | def default(self, obj): 59 | if isinstance(obj, np.integer): 60 | return int(obj) 61 | elif isinstance(obj, np.floating): 62 | return float(obj) 63 | elif isinstance(obj, np.ndarray): 64 | return obj.tolist() 65 | if isinstance(obj, time): 66 | return obj.__str__() 67 | else: 68 | return super(NpEncoder, self).default(obj) 69 | 70 | if configs.mode.lower() == "prepro": 71 | prepro_charades(configs) 72 | 73 | elif configs.mode.lower() == "train": 74 | video_feature_path = os.path.join(configs.root, "charades_features_{}".format(configs.feature)) 75 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 76 | train_set = load_json(os.path.join(configs.save_dir, "train_set.json")) 77 | test_set = load_json(os.path.join(configs.save_dir, "test_set.json")) 78 | # demo_set =load_json(os.path.join(configs.save_dir, "demo_set.json")) 79 | num_train_batches = math.ceil(len(train_set) / configs.batch_size) 80 | 81 | if configs.num_train_steps is None: 82 | configs.num_train_steps = num_train_batches * configs.epochs 83 | if configs.char_size is None: 84 | configs.char_size = len(load_json(os.path.join(configs.save_dir, "char_dict.json"))) 85 | 86 | log_dir = os.path.join(configs.home_dir, "event") 87 | model_dir = os.path.join(configs.home_dir, "model") 88 | if not os.path.exists(model_dir): 89 | os.makedirs(model_dir) 90 | if not os.path.exists(log_dir): 91 | os.makedirs(log_dir) 92 | 93 | # write configs to json file 94 | write_json(vars(configs), save_path=os.path.join(model_dir, "configs.json"), pretty=True) 95 | 96 | with tf.Graph().as_default() as graph: 97 | model = LPNet(configs, graph=graph) 98 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 99 | sess_config.gpu_options.allow_growth = True 100 | 101 | 102 | 103 | with tf.Session(config=sess_config) as sess: 104 | learning_rate = tf.train.exponential_decay(learning_rate=configs.init_lr, global_step=model.global_step, decay_steps=100000, decay_rate=0.9, staircase=True) 105 | optimizer = tf.train.AdamOptimizer(learning_rate, 106 | beta1=0.9, 107 | beta2=0.999, 108 | name='AdamOptimizer') 109 | 110 | trainable_vars = tf.trainable_variables() 111 | freeze_bbox_var_list = [t for t in trainable_vars if not t.name.startswith(u'proposal_box')] 112 | bbox_var_list = [t for t in trainable_vars if t.name.startswith(u'proposal_box')] 113 | # print(freeze_bbox_var_list) 114 | train_op1 = optimizer.minimize(model.reg_loss, global_step=model.global_step, var_list=freeze_bbox_var_list) 115 | train_op2 = optimizer.minimize(model.my_loss, var_list=bbox_var_list) 116 | 117 | writer = tf.summary.FileWriter(log_dir) 118 | sess.run(tf.global_variables_initializer()) 119 | saver_all = tf.train.Saver(max_to_keep=5) 120 | 121 | best_r1i7 = -1.0 122 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 123 | l = 0 124 | r = 0 125 | o = 0 126 | for epoch in range(configs.epochs): 127 | for data in tqdm(batch_iter(train_set, video_features, configs.batch_size, configs.extend, train=True, shuffle=True), 128 | total=num_train_batches, desc="Epoch %d / %d" % (epoch + 1, configs.epochs)): 129 | 130 | # run the model 131 | feed_dict = get_feed_dict(data, model, configs.drop_rate) 132 | 133 | _, _, train, hloss, lloss, loss, rloss, iloss, l1loss, global_step = sess.run([train_op1, train_op2, model.train, model.highlight_loss, model.loss, model.my_loss, model.reg_loss, model.iou_loss, model.l1_loss, 
model.global_step], feed_dict=feed_dict) 134 | # print(train) 135 | if global_step % configs.period == 0: 136 | write_tf_summary(writer, [("train/my_loss", loss),("train/reg_loss", rloss),("train/iou_loss", iloss),("train/l1_loss", l1loss),("train/loss", lloss),("train/highlight_loss", hloss)], global_step) 137 | 138 | if (global_step + 1)% num_train_batches == 0: 139 | # if global_step % 800 == 0: 140 | lr = sess.run(learning_rate) 141 | print(lr) 142 | r1i3, r1i5, r1i7, mi, value_pairs, score_str = eval_test( 143 | sess=sess, model=model, dataset=test_set, video_features=video_features, 144 | configs=configs, epoch=epoch + 1, global_step=global_step, name="test") 145 | 146 | write_tf_summary(writer, value_pairs, global_step) 147 | score_writer.write(score_str) 148 | score_writer.flush() 149 | 150 | #save the model according to the result of Rank@1, IoU=0.7 151 | if r1i7 > best_r1i7: 152 | best_r1i7 = r1i7 153 | filename = os.path.join(model_dir, "model_{}.ckpt".format(global_step)) 154 | saver_all.save(sess, filename) 155 | 156 | print(iloss) 157 | score_writer.close() 158 | 159 | elif configs.mode.lower() == "test": 160 | 161 | # load previous configs 162 | model_dir = os.path.join(configs.home_dir, "model") 163 | pre_configs = load_json(os.path.join(model_dir, "configs.json")) 164 | parser.set_defaults(**pre_configs) 165 | configs = parser.parse_args() 166 | 167 | # load video features 168 | video_feature_path = os.path.join(configs.root, "charades_features_{}".format(configs.feature)) 169 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 170 | 171 | # load test dataset 172 | test_set = load_json(os.path.join(configs.save_dir, "test_set.json")) 173 | # restore model and evaluate 174 | with tf.Graph().as_default() as graph: 175 | model = LPNet(configs, graph=graph) 176 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 177 | sess_config.gpu_options.allow_growth = True 178 | 179 | with tf.Session(config=sess_config) as sess: 180 | saver = tf.train.Saver() 181 | sess.run(tf.global_variables_initializer()) 182 | # saver.restore(sess, tf.train.latest_checkpoint(model_dir)) 183 | ## print(tf.train.latest_checkpoint(model_dir)) 184 | saver.restore( 185 | sess, tf.train.latest_checkpoint(model_dir)) 186 | 187 | r1i3, r1i5, r1i7, mi, *_ = eval_test( 188 | sess, 189 | model, 190 | dataset=test_set, 191 | video_features=video_features, 192 | configs=configs, 193 | name="test") 194 | 195 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 196 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 197 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 198 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi[0]) + "\x1b[0m", flush=True) 199 | 200 | else: 201 | raise ValueError("Unknown mode {}!!!".format(configs.mode)) 202 | -------------------------------------------------------------------------------- /run_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import tensorflow as tf 5 | from tqdm import tqdm 6 | from argparse import ArgumentParser 7 | from models.LPNet import LPNet 8 | from utils.prepro_tacos import prepro_tacos 9 | from utils.data_utils import load_video_features, load_json, write_json, batch_iter 10 | from utils.runner_utils import write_tf_summary, eval_test, 
get_feed_dict 11 | 12 | parser = ArgumentParser() 13 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 14 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 15 | parser.add_argument("--mode", type=str, default="train", help="prepro | train | test") 16 | parser.add_argument("--feature", type=str, default="org", help="[new | org], org: the visual feature from Gao et al.") 17 | parser.add_argument("--root", type=str, default='data/TACoS', help="root directory for store raw data") 18 | parser.add_argument("--wordvec_path", type=str, default="data/glove.840B.300d.txt", help="glove word embedding path") 19 | parser.add_argument("--home_dir", type=str, default=None, help="home directory for saving models") 20 | parser.add_argument("--save_dir", type=str, default=None, help="directory for saving processed dataset") 21 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 22 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 23 | parser.add_argument("--epochs", type=int, default=100, help="number of epochs") 24 | parser.add_argument("--batch_size", type=int, default=16, help="batch size") 25 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 26 | parser.add_argument("--video_feature_dim", type=int, default=1024, help="video feature input dimension") 27 | parser.add_argument("--char_dim", type=int, default=50, help="character dimension") 28 | parser.add_argument("--hidden_size", type=int, default=256, help="hidden size") 29 | parser.add_argument("--max_position_length", type=int, default=512, help="max position length") 30 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 31 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 32 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 33 | parser.add_argument("--drop_rate", type=float, default=0.1, help="dropout rate") 34 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 35 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 36 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 37 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 38 | parser.add_argument("--eval_period", type=int, default=None, help="evaluation period") 39 | configs = parser.parse_args() 40 | 41 | # os environment 42 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 43 | os.environ["CUDA_VISIBLE_DEVICES"] = configs.gpu_idx 44 | 45 | np.random.seed(configs.seed) 46 | tf.set_random_seed(configs.seed) 47 | tf.random.set_random_seed(configs.seed) 48 | 49 | # specify the dataset directory 50 | if configs.home_dir is None: 51 | configs.home_dir = "ckpt/tacos_{}_{}".format(configs.feature, configs.max_position_length) 52 | configs.save_dir = "datasets/tacos_{}/{}".format(configs.feature, configs.max_position_length) 53 | configs.video_feature_dim = 1024 if configs.feature == "new" else 4096 54 | 55 | if configs.mode.lower() == "prepro": 56 | prepro_tacos(configs) 57 | 58 | elif configs.mode.lower() == "train": 59 | video_feature_path = os.path.join(configs.root, "tacos_features_{}".format(configs.feature)) 60 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 61 | 62 | train_set = 
load_json(os.path.join(configs.save_dir, "train_set.json")) 63 | test_set = load_json(os.path.join(configs.save_dir, "test_set.json")) 64 | num_train_batches = math.ceil(len(train_set) / configs.batch_size) 65 | 66 | if configs.eval_period is None: 67 | configs.eval_period = num_train_batches 68 | if configs.num_train_steps is None: 69 | configs.num_train_steps = num_train_batches * configs.epochs 70 | if configs.char_size is None: 71 | configs.char_size = len(load_json(os.path.join(configs.save_dir, "char_dict.json"))) 72 | 73 | log_dir = os.path.join(configs.home_dir, "event") 74 | model_dir = os.path.join(configs.home_dir, "model") 75 | if not os.path.exists(model_dir): 76 | os.makedirs(model_dir) 77 | if not os.path.exists(log_dir): 78 | os.makedirs(log_dir) 79 | 80 | # write configs to json file 81 | write_json(vars(configs), save_path=os.path.join(model_dir, "configs.json"), pretty=True) 82 | 83 | with tf.Graph().as_default() as graph: 84 | model = LPNet(configs, graph=graph) 85 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 86 | sess_config.gpu_options.allow_growth = True 87 | 88 | with tf.Session(config=sess_config) as sess: 89 | optimizer = tf.train.AdamOptimizer(configs.init_lr, 90 | beta1=0.9, 91 | beta2=0.999, 92 | name='AdamOptimizer') 93 | # train_op = optimizer.minimize(model.my_loss, global_step=model.global_step) 94 | trainable_vars = tf.trainable_variables() 95 | freeze_bbox_var_list = [t for t in trainable_vars if not t.name.startswith(u'proposal_box')] 96 | bbox_var_list = [t for t in trainable_vars if t.name.startswith(u'proposal_box')] 97 | train_op1 = optimizer.minimize(model.reg_loss, global_step=model.global_step, var_list=freeze_bbox_var_list) 98 | train_op2 = optimizer.minimize(model.my_loss, var_list=bbox_var_list) 99 | saver = tf.train.Saver(max_to_keep=5) 100 | writer = tf.summary.FileWriter(log_dir) 101 | sess.run(tf.global_variables_initializer()) 102 | 103 | best_r1i7 = -1.0 104 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 105 | 106 | for epoch in range(configs.epochs): 107 | for data in tqdm(batch_iter(train_set, video_features, configs.batch_size, configs.extend, True, True), 108 | total=num_train_batches, desc="Epoch %d / %d" % (epoch + 1, configs.epochs)): 109 | 110 | # run the model 111 | feed_dict = get_feed_dict(data, model, configs.drop_rate) 112 | # _, loss, h_loss,lloss, rloss, global_step = sess.run([train_op, model.my_loss, model.highlight_loss, model.loss, model.reg_loss, 113 | # model.global_step], feed_dict=feed_dict) 114 | # if global_step % configs.period == 0: 115 | # write_tf_summary(writer, [("train/my_loss", loss), ("train/highlight_loss", h_loss),("train/reg_loss", rloss), ("train/cls_loss", lloss)], global_step) 116 | 117 | _, _, loss, rloss, iloss, lloss, kloss, hloss, global_step = sess.run( 118 | [ 119 | train_op1, train_op2, 120 | model.my_loss, 121 | model.reg_loss, model.iou_loss, model.l1_loss, model.loss, model.highlight_loss, 122 | model.global_step 123 | ], 124 | feed_dict=feed_dict) 125 | if global_step % configs.period == 0: 126 | # write_tf_summary(writer, [("train/my_loss", loss)], global_step) 127 | write_tf_summary(writer, [("train/my_loss", loss), 128 | ("train/reg_loss", rloss), 129 | ("train/iou_loss", iloss), 130 | ("train/l1_loss", lloss), 131 | ("train/kl_loss", kloss), 132 | ("train/hl_loss", hloss)], 133 | global_step) 134 | 135 | # evaluate 136 | if global_step % num_train_batches == 0: 137 | 138 | r1i3, r1i5, r1i7, mi, 
value_pairs, score_str = eval_test( 139 | sess=sess, model=model, dataset=test_set, video_features=video_features, 140 | configs=configs, epoch=epoch + 1, global_step=global_step, name="test") 141 | 142 | write_tf_summary(writer, value_pairs, global_step) 143 | score_writer.write(score_str) 144 | score_writer.flush() 145 | 146 | # save the model according to the result of Rank@1, IoU=0.7 147 | if r1i7 > best_r1i7: 148 | best_r1i7 = r1i7 149 | filename = os.path.join(model_dir, "model_{}.ckpt".format(global_step)) 150 | saver.save(sess, filename) 151 | 152 | score_writer.close() 153 | 154 | elif configs.mode.lower() == "test": 155 | 156 | # load previous configs 157 | model_dir = os.path.join(configs.home_dir, "model") 158 | pre_configs = load_json(os.path.join(model_dir, "configs.json")) 159 | parser.set_defaults(**pre_configs) 160 | configs = parser.parse_args() 161 | 162 | # load video features 163 | video_feature_path = os.path.join(configs.root, "tacos_features_{}".format(configs.feature)) 164 | video_features = load_video_features(video_feature_path, max_position_length=configs.max_position_length) 165 | 166 | # load test dataset 167 | test_set = load_json(os.path.join(configs.save_dir, "test_set.json")) 168 | 169 | # restore model and evaluate 170 | with tf.Graph().as_default() as graph: 171 | model = LPNet(configs, graph=graph) 172 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 173 | sess_config.gpu_options.allow_growth = True 174 | 175 | with tf.Session(config=sess_config) as sess: 176 | saver = tf.train.Saver() 177 | sess.run(tf.global_variables_initializer()) 178 | saver.restore(sess, tf.train.latest_checkpoint(model_dir)) 179 | 180 | r1i3, r1i5, r1i7, mi, *_ = eval_test(sess, model, dataset=test_set, video_features=video_features, 181 | configs=configs, name="test") 182 | 183 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 184 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 185 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 186 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi[0]) + "\x1b[0m", flush=True) 187 | 188 | else: 189 | raise ValueError("Unknown mode {}!!!".format(configs.mode)) 190 | -------------------------------------------------------------------------------- /statistic/convert_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | import argparse 5 | import subprocess 6 | import numpy as np 7 | from tqdm import tqdm 8 | from nltk.tokenize import word_tokenize 9 | from moviepy.editor import VideoFileClip 10 | 11 | 12 | def extract_video_to_images(video_dir, video_names, save_dir): 13 | if not os.path.exists(video_dir): 14 | raise ValueError("The video directory '{}' does not exist!!!".format(video_dir)) 15 | 16 | if not os.path.exists(save_dir): 17 | os.makedirs(save_dir) 18 | 19 | for video_name in tqdm(video_names, total=len(video_names), desc="extract video to images"): 20 | video_path = os.path.join(video_dir, video_name) 21 | video_id = video_name[0:-4] 22 | image_dir = os.path.join(save_dir, video_id) 23 | 24 | if os.path.exists(image_dir): 25 | continue 26 | else: 27 | os.makedirs(image_dir) 28 | 29 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps=29.4 {}/{}-%6d.jpg".format( 30 | video_path, image_dir, video_id), shell=True) 31 | 32 | 33 | def 
load_frames_and_times(image_dir, video_dir, video_names): 34 | dirs = glob.glob(os.path.join(image_dir, "*/")) 35 | video_frames = dict() 36 | 37 | for directory in dirs: 38 | vid = os.path.basename(directory[0:-1]) 39 | num_frames = len(glob.glob(os.path.join(directory, "*.jpg"))) 40 | video_frames[vid] = num_frames 41 | 42 | video_times = dict() 43 | fps = None 44 | 45 | for video_name in video_names: 46 | video_id = video_name[0:-4] 47 | clip = VideoFileClip(os.path.join(video_dir, video_name)) 48 | fps = clip.fps # all the videos with the same fps 49 | duration = clip.duration 50 | video_times[video_id] = duration 51 | 52 | return video_frames, video_times, fps 53 | 54 | 55 | def load_video_names(dataset_dir): 56 | video_names = [] 57 | video_files = ["TACoS_train_videos.txt", "TACoS_val_videos.txt", "TACoS_test_videos.txt"] 58 | 59 | for video_file in video_files: 60 | with open(os.path.join(dataset_dir, video_file), mode="r", encoding="utf-8") as f: 61 | for line in f: 62 | line = line.strip() 63 | 64 | if len(line) == 0: 65 | continue 66 | 67 | video_names.append(line) 68 | 69 | return video_names 70 | 71 | 72 | def read_data(filename): 73 | results = [] 74 | with open(filename, mode="r", encoding="utf-8") as f: 75 | for line in f: 76 | line = line.strip() 77 | 78 | if len(line) == 0: 79 | continue 80 | 81 | video, text = line.split(":") 82 | 83 | if text.endswith("#"): 84 | text = text[0:-1] 85 | 86 | sentences = [sentence.strip().lower() for sentence in text.split("#")] 87 | vid, start_frame, end_frame = video.split("_") 88 | vid = vid[0:-4] 89 | start_frame = int(start_frame) 90 | end_frame = int(end_frame) 91 | 92 | result = (vid, start_frame, end_frame, sentences) 93 | results.append(result) 94 | 95 | return results 96 | 97 | 98 | def reconstruct_tacos_dataset(dataset, video_frames, fps): 99 | temp_data = dict() 100 | for data in dataset: 101 | vid, start_frame, end_frame, sentences = data 102 | temp_data[vid] = temp_data.get(vid, []) + [(start_frame, end_frame, sentences)] 103 | 104 | new_dataset = dict() 105 | for vid, records in temp_data.items(): 106 | num_frames = video_frames[vid] 107 | timestamps, sentences = [], [] 108 | 109 | for record in records: 110 | start_frame, end_frame, sents = record 111 | 112 | for sent in sents: 113 | timestamps.append([start_frame, end_frame]) 114 | sentences.append(sent) 115 | 116 | new_dataset[vid] = {"timestamps": timestamps, "sentences": sentences, "fps": fps, "num_frames": num_frames} 117 | return new_dataset 118 | 119 | 120 | def stat_data_info(data, fps): 121 | num_samples, query_lengths, num_words, moment_lengths = 0, [], [], [] 122 | for record in data: 123 | moment_length = float(record[2] - record[1]) / fps 124 | num_samples += len(record[-1]) 125 | 126 | for sentence in record[-1]: 127 | words = word_tokenize(sentence) 128 | query_lengths.append(len(words)) 129 | num_words.extend(words) 130 | 131 | moment_lengths.append(moment_length) 132 | return num_samples, query_lengths, num_words, moment_lengths 133 | 134 | 135 | def main(): 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument("--video_dir", type=str, required=True, help="TACoS video directory") 138 | parser.add_argument("--dataset_dir", type=str, required=True, help="TACoS dataset directory") 139 | parser.add_argument("--save_dir", type=str, required=True, help="directory to save extracted images") 140 | args = parser.parse_args() 141 | 142 | # load video ids 143 | video_names = load_video_names(args.dataset_dir) 144 | 145 | # extract video information 146 | 
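# The extraction below runs in two passes: extract_video_to_images first dumps frames at 29.4 fps
# into save_dir/<video_id>/ via ffmpeg, then load_frames_and_times counts the dumped .jpg files
# per video and reads fps/duration with moviepy.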
extract_video_to_images(args.video_dir, video_names, args.save_dir) 147 | video_frames, video_times, fps = load_frames_and_times(args.save_dir, args.video_dir, video_names) 148 | 149 | # load TACoS datasets 150 | train_data = read_data(os.path.join(args.dataset_dir, "TACoS_train_samples.txt")) 151 | val_data = read_data(os.path.join(args.dataset_dir, "TACoS_val_samples.txt")) 152 | test_data = read_data(os.path.join(args.dataset_dir, "TACoS_test_samples.txt")) 153 | 154 | train_set = reconstruct_tacos_dataset(train_data, video_frames, fps) 155 | val_set = reconstruct_tacos_dataset(val_data, video_frames, fps) 156 | test_set = reconstruct_tacos_dataset(test_data, video_frames, fps) 157 | 158 | with open(os.path.join(args.dataset_dir, "train.json"), mode="w", encoding="utf-8") as f: 159 | json.dump(train_set, f) 160 | 161 | with open(os.path.join(args.dataset_dir, "val.json"), mode="w", encoding="utf-8") as f: 162 | json.dump(val_set, f) 163 | 164 | with open(os.path.join(args.dataset_dir, "test.json"), mode="w", encoding="utf-8") as f: 165 | json.dump(test_set, f) 166 | 167 | # statistics 168 | train_samples, train_query_lengths, train_num_words, train_moment_lengths = stat_data_info(train_data, fps) 169 | val_samples, val_query_lengths, val_num_words, val_moment_lengths = stat_data_info(val_data, fps) 170 | test_samples, test_query_lengths, test_num_words, test_moment_lengths = stat_data_info(test_data, fps) 171 | query_lengths = train_query_lengths + val_query_lengths + test_query_lengths 172 | num_words = train_num_words + val_num_words + test_num_words 173 | moment_lengths = train_moment_lengths + val_moment_lengths + test_moment_lengths 174 | durations = list(video_times.values()) 175 | 176 | # print 177 | print("Training samples:", train_samples) 178 | print("Validation samples:", val_samples) 179 | print("Test samples:", test_samples) 180 | print("Vocabulary size:", len(set(num_words))) 181 | print("Average video length:", np.mean(durations)) 182 | print("Average query length:", np.mean(query_lengths)) 183 | print("Average moment length:", np.mean(moment_lengths)) 184 | print("Std. 
of moment length:", np.std(moment_lengths)) 185 | 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /statistic/stat_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | from nltk.tokenize import word_tokenize 6 | 7 | 8 | def stat_data_info(data): 9 | num_videos, num_anns, video_lengths, query_lengths, moment_lengths, num_words = 0, 0, list(), list(), list(), list() 10 | for key, value in data.items(): 11 | num_videos += 1 12 | num_anns += len(value["timestamps"]) 13 | video_lengths.append(float(value["duration"])) 14 | 15 | for val in value["timestamps"]: 16 | moment_lengths.append(val[1] - val[0]) 17 | 18 | for sentence in value["sentences"]: 19 | words = word_tokenize(sentence.strip().lower()) 20 | num_words.extend(words) 21 | query_lengths.append(len(words)) 22 | 23 | return num_videos, num_anns, video_lengths, query_lengths, moment_lengths, num_words 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument("--dataset_dir", type=str, required=True, help="ActivityNet Caption dataset directory") 29 | args = parser.parse_args() 30 | 31 | with open(os.path.join(args.dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 32 | train_data = json.load(f) 33 | 34 | with open(os.path.join(args.dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 35 | test_data = json.load(f) 36 | 37 | with open(os.path.join(args.dataset_dir, "val_2.json"), mode="r", encoding="utf-8") as f: 38 | test2_data = json.load(f) 39 | 40 | (train_num_videos, train_num_anns, train_video_lengths, train_query_lengths, train_moment_lengths, 41 | train_num_words) = stat_data_info(train_data) 42 | 43 | (test_num_videos, test_num_anns, test_video_lengths, test_query_lengths, test_moment_lengths, 44 | test_num_words) = stat_data_info(test_data) 45 | 46 | (test2_num_videos, test2_num_anns, test2_video_lengths, test2_query_lengths, test2_moment_lengths, 47 | test2_num_words) = stat_data_info(test2_data) 48 | 49 | video_lengths = train_video_lengths + test_video_lengths + test2_video_lengths 50 | query_lengths = train_query_lengths + test_query_lengths + test2_query_lengths 51 | moment_lengths = train_moment_lengths + test_moment_lengths + test2_moment_lengths 52 | num_words = train_num_words + test_num_words + test2_num_words 53 | 54 | print("Training videos:", train_num_videos) 55 | print("Test videos:", test_num_videos) 56 | print("Training samples:", train_num_anns) 57 | print("Test samples:", test_num_anns) 58 | print("Vocabulary size:", len(set(num_words))) 59 | print("Average video length:", np.mean(video_lengths)) 60 | print("Average query length:", np.mean(query_lengths)) 61 | print("Average moment length:", np.mean(moment_lengths)) 62 | print("Std. 
of moment length:", np.std(moment_lengths)) 63 | 64 | 65 | if __name__ == "__main__": 66 | main() 67 | -------------------------------------------------------------------------------- /statistic/stat_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | from nltk.tokenize import word_tokenize 6 | 7 | 8 | def load_charades_sta_data(charades_sta_file, charades): 9 | with open(charades_sta_file, mode="r", encoding="utf-8") as f_sta: 10 | vids, data = [], [] 11 | for line in f_sta: 12 | line = line.lstrip().rstrip() 13 | 14 | if len(line) == 0: 15 | continue 16 | 17 | video_info, sentence = line.split("##") 18 | vid, start_time, end_time = video_info.split(" ") 19 | words = word_tokenize(sentence.lower(), language="english") 20 | start_time, end_time = float(start_time), float(end_time) 21 | duration = float(charades[vid]["duration"]) 22 | 23 | vids.append(vid) 24 | data.append((vid, start_time, end_time, duration, words)) 25 | 26 | return vids, data 27 | 28 | 29 | def stat_data_info(data): 30 | query_lengths, moment_lengths, num_words = [], [], [] 31 | 32 | for record in data: 33 | moment_length = record[2] - record[1] 34 | moment_lengths.append(moment_length) 35 | query_lengths.append(len(record[-1])) 36 | num_words.extend(record[-1]) 37 | 38 | return query_lengths, moment_lengths, num_words 39 | 40 | 41 | def main(): 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--dataset_dir", type=str, required=True, help="Charades-STA dataset directory") 44 | args = parser.parse_args() 45 | 46 | with open(os.path.join(args.dataset_dir, "charades.json"), mode="r", encoding="utf-8") as f: 47 | charades = json.load(f) 48 | 49 | train_vids, train_data = load_charades_sta_data(os.path.join(args.dataset_dir, "charades_sta_train.txt"), charades) 50 | test_vids, test_data = load_charades_sta_data(os.path.join(args.dataset_dir, "charades_sta_test.txt"), charades) 51 | 52 | num_train_videos = len(set(train_vids)) 53 | num_test_videos = len(set(test_vids)) 54 | num_train_anns = len(train_data) 55 | num_test_anns = len(test_data) 56 | 57 | vids = list(set(train_vids + test_vids)) 58 | video_lengths = [] 59 | for vid in vids: 60 | duration = charades[vid]["duration"] 61 | video_lengths.append(float(duration)) 62 | 63 | train_query_lengths, train_moment_lengths, train_num_words = stat_data_info(train_data) 64 | test_query_lengths, test_moment_lengths, test_num_words = stat_data_info(test_data) 65 | query_lengths = train_query_lengths + test_query_lengths 66 | moment_lengths = train_moment_lengths + test_moment_lengths 67 | num_words = train_num_words + test_num_words 68 | 69 | print("Training videos:", num_train_videos) 70 | print("Test videos:", num_test_videos) 71 | print("Training samples:", num_train_anns) 72 | print("Test samples:", num_test_anns) 73 | print("Vocabulary size:", len(set(num_words))) 74 | print("Average video length:", np.mean(video_lengths)) 75 | print("min video length:", np.min(video_lengths)) 76 | print("max video length:", np.max(video_lengths)) 77 | print("min video length:", np.min(video_lengths)) 78 | print("median video length:", np.median(video_lengths)) 79 | print("Average query length:", np.mean(query_lengths)) 80 | print("Average moment length:", np.mean(moment_lengths)) 81 | print("Std. 
of moment length:", np.std(moment_lengths)) 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /statistic/stat_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | import argparse 5 | import subprocess 6 | import numpy as np 7 | from tqdm import tqdm 8 | from nltk.tokenize import word_tokenize 9 | from moviepy.editor import VideoFileClip 10 | 11 | 12 | def extract_video_to_images(video_dir, video_names, save_dir): 13 | if not os.path.exists(video_dir): 14 | raise ValueError("The video directory '{}' does not exist!!!".format(video_dir)) 15 | 16 | if not os.path.exists(save_dir): 17 | os.makedirs(save_dir) 18 | 19 | for video_name in tqdm(video_names, total=len(video_names), desc="extract video to images"): 20 | video_path = os.path.join(video_dir, video_name) 21 | video_id = video_name[0:-4] 22 | image_dir = os.path.join(save_dir, video_id) 23 | 24 | if os.path.exists(image_dir): 25 | continue 26 | else: 27 | os.makedirs(image_dir) 28 | 29 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps=29.4 {}/{}-%6d.jpg".format( 30 | video_path, image_dir, video_id), shell=True) 31 | 32 | 33 | def load_frames_and_times(image_dir, video_dir, video_names): 34 | dirs = glob.glob(os.path.join(image_dir, "*/")) 35 | video_frames = dict() 36 | 37 | for directory in dirs: 38 | vid = os.path.basename(directory[0:-1]) 39 | num_frames = len(glob.glob(os.path.join(directory, "*.jpg"))) 40 | video_frames[vid] = num_frames 41 | 42 | video_times = dict() 43 | fps = None 44 | 45 | for video_name in video_names: 46 | video_id = video_name[0:-4] 47 | clip = VideoFileClip(os.path.join(video_dir, video_name)) 48 | fps = clip.fps # all the videos with the same fps 49 | duration = clip.duration 50 | video_times[video_id] = duration 51 | 52 | return video_frames, video_times, fps 53 | 54 | 55 | def load_video_names(dataset_dir): 56 | video_names = [] 57 | video_files = ["TACoS_train_videos.txt", "TACoS_val_videos.txt", "TACoS_test_videos.txt"] 58 | 59 | for video_file in video_files: 60 | with open(os.path.join(dataset_dir, video_file), mode="r", encoding="utf-8") as f: 61 | for line in f: 62 | line = line.strip() 63 | 64 | if len(line) == 0: 65 | continue 66 | 67 | video_names.append(line) 68 | 69 | return video_names 70 | 71 | 72 | def read_data(filename): 73 | results = [] 74 | with open(filename, mode="r", encoding="utf-8") as f: 75 | for line in f: 76 | line = line.strip() 77 | 78 | if len(line) == 0: 79 | continue 80 | 81 | video, text = line.split(":") 82 | 83 | if text.endswith("#"): 84 | text = text[0:-1] 85 | 86 | sentences = [sentence.strip().lower() for sentence in text.split("#")] 87 | vid, start_frame, end_frame = video.split("_") 88 | vid = vid[0:-4] 89 | start_frame = int(start_frame) 90 | end_frame = int(end_frame) 91 | 92 | result = (vid, start_frame, end_frame, sentences) 93 | results.append(result) 94 | 95 | return results 96 | 97 | 98 | def reconstruct_tacos_dataset(dataset, video_frames, fps): 99 | temp_data = dict() 100 | for data in dataset: 101 | vid, start_frame, end_frame, sentences = data 102 | temp_data[vid] = temp_data.get(vid, []) + [(start_frame, end_frame, sentences)] 103 | 104 | new_dataset = dict() 105 | for vid, records in temp_data.items(): 106 | num_frames = video_frames[vid] 107 | timestamps, sentences = [], [] 108 | 109 | for record in records: 110 | start_frame, end_frame, sents = 
record 111 | 112 | for sent in sents: 113 | timestamps.append([start_frame, end_frame]) 114 | sentences.append(sent) 115 | 116 | new_dataset[vid] = {"timestamps": timestamps, "sentences": sentences, "fps": fps, "num_frames": num_frames} 117 | return new_dataset 118 | 119 | 120 | def stat_data_info(data, fps): 121 | num_samples, query_lengths, num_words, moment_lengths = 0, [], [], [] 122 | for record in data: 123 | moment_length = float(record[2] - record[1]) / fps 124 | num_samples += len(record[-1]) 125 | 126 | for sentence in record[-1]: 127 | words = word_tokenize(sentence) 128 | query_lengths.append(len(words)) 129 | num_words.extend(words) 130 | 131 | moment_lengths.append(moment_length) 132 | return num_samples, query_lengths, num_words, moment_lengths 133 | 134 | 135 | def main(): 136 | parser = argparse.ArgumentParser() 137 | parser.add_argument("--video_dir", type=str, required=True, help="TACoS video directory") 138 | parser.add_argument("--dataset_dir", type=str, required=True, help="TACoS dataset directory") 139 | parser.add_argument("--save_dir", type=str, required=True, help="directory to save extracted images") 140 | args = parser.parse_args() 141 | 142 | # load video ids 143 | video_names = load_video_names(args.dataset_dir) 144 | 145 | # extract video information 146 | extract_video_to_images(args.video_dir, video_names, args.save_dir) 147 | video_frames, video_times, fps = load_frames_and_times(args.save_dir, args.video_dir, video_names) 148 | 149 | # load TACoS datasets 150 | train_data = read_data(os.path.join(args.dataset_dir, "TACoS_train_samples.txt")) 151 | val_data = read_data(os.path.join(args.dataset_dir, "TACoS_val_samples.txt")) 152 | test_data = read_data(os.path.join(args.dataset_dir, "TACoS_test_samples.txt")) 153 | 154 | train_set = reconstruct_tacos_dataset(train_data, video_frames, fps) 155 | val_set = reconstruct_tacos_dataset(val_data, video_frames, fps) 156 | test_set = reconstruct_tacos_dataset(test_data, video_frames, fps) 157 | 158 | with open(os.path.join(args.dataset_dir, "train.json"), mode="w", encoding="utf-8") as f: 159 | json.dump(train_set, f) 160 | 161 | with open(os.path.join(args.dataset_dir, "val.json"), mode="w", encoding="utf-8") as f: 162 | json.dump(val_set, f) 163 | 164 | with open(os.path.join(args.dataset_dir, "test.json"), mode="w", encoding="utf-8") as f: 165 | json.dump(test_set, f) 166 | 167 | # statistics 168 | train_samples, train_query_lengths, train_num_words, train_moment_lengths = stat_data_info(train_data, fps) 169 | val_samples, val_query_lengths, val_num_words, val_moment_lengths = stat_data_info(val_data, fps) 170 | test_samples, test_query_lengths, test_num_words, test_moment_lengths = stat_data_info(test_data, fps) 171 | query_lengths = train_query_lengths + val_query_lengths + test_query_lengths 172 | num_words = train_num_words + val_num_words + test_num_words 173 | moment_lengths = train_moment_lengths + val_moment_lengths + test_moment_lengths 174 | durations = list(video_times.values()) 175 | 176 | # print 177 | print("Training samples:", train_samples) 178 | print("Validation samples:", val_samples) 179 | print("Test samples:", test_samples) 180 | print("Vocabulary size:", len(set(num_words))) 181 | print("Average video length:", np.mean(durations)) 182 | print("Average query length:", np.mean(query_lengths)) 183 | print("Average moment length:", np.mean(moment_lengths)) 184 | print("Std. 
of moment length:", np.std(moment_lengths)) 185 | 186 | print("Max moment length:", np.max(moment_lengths)) 187 | print("Min moment length:", np.mean(moment_lengths)) 188 | 189 | if __name__ == "__main__": 190 | main() 191 | -------------------------------------------------------------------------------- /utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import random 5 | import codecs 6 | import numpy as np 7 | from tqdm import tqdm 8 | import tensorflow as tf 9 | 10 | glove_sizes = {'6B': int(4e5), '42B': int(1.9e6), '840B': int(2.2e6), '2B': int(1.2e6)} 11 | PAD, UNK = "", "" 12 | 13 | 14 | def load_glove(glove_path, dim): 15 | vocab = list() 16 | with codecs.open(glove_path, mode="r", encoding="utf-8") as f: 17 | total = glove_sizes[glove_path.split(".")[-3]] 18 | 19 | for line in tqdm(f, total=total, desc="load glove vocabulary"): 20 | line = line.lstrip().rstrip().split(" ") 21 | 22 | if len(line) == 2 or len(line) != dim + 1: 23 | continue 24 | 25 | word = line[0] 26 | vocab.append(word) 27 | 28 | return set(vocab) 29 | 30 | 31 | def filter_glove_embedding(word_dict, glove_path, dim): 32 | vectors = np.zeros(shape=[len(word_dict), dim], dtype=np.float32) 33 | 34 | with codecs.open(glove_path, mode="r", encoding="utf-8") as f: 35 | total = glove_sizes[glove_path.split(".")[-3]] 36 | 37 | for line in tqdm(f, total=total, desc="load glove embeddings"): 38 | line = line.lstrip().rstrip().split(" ") 39 | 40 | if len(line) == 2 or len(line) != dim + 1: 41 | continue 42 | 43 | word = line[0] 44 | 45 | if word in word_dict: 46 | vector = [float(x) for x in line[1:]] 47 | word_index = word_dict[word] 48 | vectors[word_index] = np.asarray(vector) 49 | 50 | return np.asarray(vectors) 51 | 52 | 53 | def load_video_features(root, max_position_length): 54 | video_features = dict() 55 | filenames = glob.glob(os.path.join(root, "*.npy")) 56 | 57 | for filename in tqdm(filenames, total=len(filenames), desc="load video features"): 58 | video_id = filename.split("/")[-1].split(".")[0] 59 | feature = np.load(filename) 60 | 61 | if max_position_length is None: 62 | video_features[video_id] = feature 63 | 64 | else: 65 | new_feature = visual_feature_sampling(feature, max_num_clips=max_position_length) 66 | video_features[video_id] = new_feature 67 | 68 | return video_features 69 | 70 | 71 | def visual_feature_sampling(visual_feature, max_num_clips): 72 | num_clips = visual_feature.shape[0] 73 | 74 | if num_clips <= max_num_clips: 75 | return visual_feature 76 | 77 | idxs = np.arange(0, max_num_clips + 1, 1.0) / max_num_clips * num_clips 78 | idxs = np.round(idxs).astype(np.int32) 79 | idxs[idxs > num_clips - 1] = num_clips - 1 80 | 81 | new_visual_feature = [] 82 | for i in range(max_num_clips): 83 | s_idx, e_idx = idxs[i], idxs[i + 1] 84 | 85 | if s_idx < e_idx: 86 | new_visual_feature.append(np.mean(visual_feature[s_idx:e_idx], axis=0)) 87 | 88 | else: 89 | new_visual_feature.append(visual_feature[s_idx]) 90 | 91 | new_visual_feature = np.asarray(new_visual_feature) 92 | 93 | return new_visual_feature 94 | 95 | 96 | def iou(pred, gt): # require pred and gt is numpy 97 | assert isinstance(pred, list) and isinstance(gt, list) 98 | 99 | pred_is_list = isinstance(pred[0], list) 100 | gt_is_list = isinstance(gt[0], list) 101 | 102 | if not pred_is_list: 103 | pred = [pred] 104 | 105 | if not gt_is_list: 106 | gt = [gt] 107 | 108 | pred, gt = np.array(pred), np.array(gt) 109 | 110 | inter_left = 
np.maximum(pred[:, 0, None], gt[None, :, 0]) 111 | inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1]) 112 | inter = np.maximum(0.0, inter_right - inter_left) 113 | 114 | union_left = np.minimum(pred[:, 0, None], gt[None, :, 0]) 115 | union_right = np.maximum(pred[:, 1, None], gt[None, :, 1]) 116 | union = np.maximum(1e-12, union_right - union_left) 117 | 118 | overlap = 1.0 * inter / union 119 | 120 | if not gt_is_list: 121 | overlap = overlap[:, 0] 122 | 123 | if not pred_is_list: 124 | overlap = overlap[0] 125 | 126 | return overlap 127 | 128 | 129 | def time_to_index(start_time, end_time, feature_shape, duration): 130 | s_times = np.arange(0, feature_shape).astype(np.float32) * duration / float(feature_shape) 131 | e_times = np.arange(1, feature_shape + 1).astype(np.float32) * duration / float(feature_shape) 132 | 133 | candidates = np.stack([np.repeat(s_times[:, None], repeats=feature_shape, axis=1), 134 | np.repeat(e_times[None, :], repeats=feature_shape, axis=0)], axis=2).reshape((-1, 2)) 135 | 136 | overlaps = iou(candidates.tolist(), [start_time, end_time]).reshape(feature_shape, feature_shape) 137 | start_index = np.argmax(overlaps) // feature_shape 138 | end_index = np.argmax(overlaps) % feature_shape 139 | 140 | return start_index, end_index 141 | 142 | 143 | def load_video_ids(root): 144 | video_ids = [] 145 | filenames = glob.glob(os.path.join(root, "*.npy")) 146 | 147 | for filename in filenames: 148 | basename = os.path.basename(filename) 149 | vid = basename[0:-4] 150 | video_ids.append(vid) 151 | 152 | return video_ids 153 | 154 | 155 | def write_json(dataset, save_path, pretty=False): 156 | with codecs.open(filename=save_path, mode="w", encoding="utf-8") as f: 157 | if pretty: 158 | json.dump(dataset, f, indent=4, sort_keys=True) 159 | else: 160 | json.dump(dataset, f) 161 | 162 | 163 | def load_json(filename): 164 | with codecs.open(filename=filename, mode="r", encoding="utf-8") as f: 165 | data = json.load(f) 166 | return data 167 | 168 | 169 | def word_convert(word, word_lower=True, char_lower=True): 170 | if char_lower: 171 | chars = [c for c in word.lower()] 172 | else: 173 | chars = [c for c in word] 174 | 175 | if word_lower: 176 | word = word.lower() 177 | 178 | return word, chars 179 | 180 | 181 | def create_vocabularies(configs, word_counter, char_counter): 182 | # generate word dict and vectors 183 | emb_vocab = load_glove(configs.wordvec_path, configs.word_dim) 184 | 185 | word_vocab = list() 186 | for word, _ in word_counter.most_common(): 187 | if word in emb_vocab: 188 | word_vocab.append(word) 189 | 190 | tmp_word_dict = dict([(word, index) for index, word in enumerate(word_vocab)]) 191 | vectors = filter_glove_embedding(tmp_word_dict, configs.wordvec_path, configs.word_dim) 192 | 193 | word_vocab = [PAD, UNK] + word_vocab 194 | word_dict = dict([(word, idx) for idx, word in enumerate(word_vocab)]) 195 | 196 | # generate character dict 197 | char_vocab = [PAD, UNK] + [char for char, count in char_counter.most_common() if count >= 5] 198 | char_dict = dict([(char, idx) for idx, char in enumerate(char_vocab)]) 199 | 200 | return word_dict, char_dict, vectors 201 | 202 | 203 | def boolean_string(bool_str): 204 | bool_str = bool_str.lower() 205 | 206 | if bool_str not in {"false", "true"}: 207 | raise ValueError("Not a valid boolean string!!!") 208 | 209 | return bool_str == "true" 210 | 211 | 212 | def pad_sequences(sequences, pad_tok=None, max_length=None): 213 | if pad_tok is None: 214 | pad_tok = 0 # 0: "PAD" for words and chars, "PAD" for tags 215 
| 216 | if max_length is None: 217 | max_length = max([len(seq) for seq in sequences]) 218 | 219 | sequence_padded, sequence_length = [], [] 220 | 221 | for seq in sequences: 222 | seq_ = seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0) 223 | sequence_padded.append(seq_) 224 | sequence_length.append(min(len(seq), max_length)) 225 | 226 | return sequence_padded, sequence_length 227 | 228 | 229 | def pad_char_sequences(sequences, max_length=None, max_length_2=None): 230 | sequence_padded, sequence_length = [], [] 231 | 232 | if max_length is None: 233 | max_length = max(map(lambda x: len(x), sequences)) 234 | 235 | if max_length_2 is None: 236 | max_length_2 = max([max(map(lambda x: len(x), seq)) for seq in sequences]) 237 | 238 | for seq in sequences: 239 | sp, sl = pad_sequences(seq, max_length=max_length_2) 240 | sequence_padded.append(sp) 241 | sequence_length.append(sl) 242 | 243 | sequence_padded, _ = pad_sequences(sequence_padded, pad_tok=[0] * max_length_2, max_length=max_length) 244 | sequence_length, _ = pad_sequences(sequence_length, max_length=max_length) 245 | 246 | return sequence_padded, sequence_length 247 | 248 | 249 | def pad_video_sequence(sequences, max_length=None): 250 | if max_length is None: 251 | max_length = max([vfeat.shape[0] for vfeat in sequences]) 252 | 253 | feature_length = sequences[0].shape[1] 254 | sequence_padded, sequence_length = [], [] 255 | 256 | for seq in sequences: 257 | add_length = max_length - seq.shape[0] 258 | sequence_length.append(seq.shape[0]) 259 | 260 | if add_length > 0: 261 | add_feature = np.zeros(shape=[add_length, feature_length], dtype=np.float32) 262 | seq_ = np.concatenate([seq, add_feature], axis=0) 263 | 264 | else: 265 | seq_ = seq 266 | 267 | sequence_padded.append(seq_) 268 | 269 | return sequence_padded, sequence_length 270 | 271 | def pad_mask_sequence(seq, max_length=None): 272 | 273 | feature_length = len(seq) 274 | 275 | add_length = max_length - feature_length 276 | # sequence_length.append(seq.shape[0]) 277 | 278 | if add_length > 0: 279 | add_feature = np.zeros(shape=[add_length], dtype=np.int32) 280 | seq_ = np.concatenate([seq, add_feature], axis=0) 281 | 282 | else: 283 | seq_ = seq 284 | 285 | return seq_, feature_length 286 | def sliding_window(length): 287 | dx_ = [] 288 | dy_ = [] 289 | x5 = 0 290 | x0 = 0 291 | x1 = 0 292 | x2 = 0 293 | x3 = 0 294 | x4 = 0 295 | # print(5 > length) 296 | # for i in range(int((length - 3) / 1)): 297 | # y5 = x5 + 3 298 | # dx_.append(x5) 299 | # dy_.append(y5) 300 | # x5 = x5 + 1 301 | # # for i in range(int((length - 32) / 12)): 302 | # y0 = x0 + 47 303 | # dx_.append(x0) 304 | # dy_.append(y0) 305 | # x0 = x0 + 12 306 | # for i in range(int((length - 64) / 24)): 307 | # y1 = x1 + 95 308 | # dx_.append(x1) 309 | # dy_.append(y1) 310 | # x1 = x1 + 24 311 | 312 | 313 | for i in range(int((length - 6) / 2)): 314 | y0 = x0 + 7 315 | dx_.append(x0) 316 | dy_.append(y0) 317 | x0 = x0 + 2 318 | for i in range(int((length - 12) / 4)): 319 | y1 = x1 + 15 320 | dx_.append(x1) 321 | dy_.append(y1) 322 | x1 = x1 + 4 323 | for i in range(int((length - 24) / 8)): 324 | y2 = x2 + 31 325 | dx_.append(x2) 326 | dy_.append(y2) 327 | x2 = x2 + 8 328 | for i in range(int((length - 48) / 16)): 329 | y3 = x3 + 63 330 | dx_.append(x3) 331 | dy_.append(y3) 332 | x3 = x3 + 16 333 | for i in range(int((length - 96) / 32)): 334 | y4 = x4 + 127 335 | dx_.append(x4) 336 | dy_.append(y4) 337 | x4 = x4 + 32 338 | # dx_ = np.reshape(dx_ * batch_size, [batch_size, -1]) 339 | # dy_ = np.reshape(dy_ 
* batch_size, [batch_size, -1]) 340 | # dx = tf.cast(tf.convert_to_tensor(dx_), tf.int32) 341 | # dy = tf.cast(tf.convert_to_tensor(dy_), tf.int32) 342 | # mask_dx = tf.sequence_mask(lengths=dx, maxlen=length, dtype=tf.float32) 343 | # mask_dy = tf.sequence_mask(lengths=dy + 1, maxlen=length, dtype=tf.float32) 344 | # mask = mask_dy - mask_dx 345 | # dx = np.concatenate(dx_, np.zeros(batch_max_length-len(dx)), axis=0) 346 | # dy = np.concatenate(dy_, np.zeros(batch_max_length-len(dy)), axis=0) 347 | # print(len(dy_)) 348 | if len(dx_)==0: 349 | dx_.append(0) 350 | dy_.append(length-1) 351 | 352 | return dx_, dy_ 353 | 354 | def proposal_mask(dx, dy, length): 355 | 356 | mask_dx = np.concatenate((np.ones(dx), np.zeros(length-dx)), axis=0) 357 | mask_dy = np.concatenate((np.ones(dy+1), np.zeros(length-dy-1)), axis=0) 358 | mask = mask_dy - mask_dx 359 | return mask 360 | 361 | 362 | 363 | def batch_iter(dataset, all_video_features, batch_size, extend=0.2, train=True, shuffle=False): 364 | if shuffle: 365 | random.shuffle(dataset) 366 | 367 | for index in range(0, len(dataset), batch_size): 368 | batch_data = dataset[index:(index + batch_size)] 369 | video_ids, word_ids, char_ids, start_indexes, end_indexes = [], [], [], [], [] 370 | 371 | for data in batch_data: 372 | video_ids.append(data["video_id"].split('.')[0]) 373 | word_ids.append(data["word_ids"]) 374 | char_ids.append(data["char_ids"]) 375 | start_indexes.append(data["start_index"]) 376 | end_indexes.append(data["end_index"]) 377 | 378 | true_batch_size = len(batch_data) 379 | 380 | # add by xsn 381 | if true_batch_size < batch_size: 382 | break 383 | 384 | # process word ids 385 | word_ids, _ = pad_sequences(word_ids) 386 | word_ids = np.asarray(word_ids, dtype=np.int32) 387 | 388 | # process char ids 389 | char_ids, _ = pad_char_sequences(char_ids) 390 | char_ids = np.asarray(char_ids, dtype=np.int32) 391 | 392 | # process video features 393 | video_features = [all_video_features[video_id] for video_id in video_ids] 394 | max_length = max([vfeat.shape[0] for vfeat in video_features]) 395 | vfeat_lens = [vfeat.shape[0] for vfeat in video_features] 396 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) 397 | # for bbox proposals 398 | # batch_mask = [] 399 | # dx = [] 400 | # dy = [] 401 | # for vfeat in video_features: 402 | # length = vfeat.shape[0] 403 | # # print(length) 404 | # dx_, dy_ = sliding_window(length) 405 | # dx_, _ = pad_mask_sequence(dx_, max_length=233) 406 | # dy_, _ = pad_mask_sequence(dy_, max_length=233) 407 | # dx.append(dx_) 408 | # dy.append(dy_) 409 | # dx_new = np.reshape(dx_, [len(dx_),1]) 410 | # dy_new = np.reshape(dy_, [len(dy_),1]) 411 | # dxy = np.concatenate((dx_new, dy_new), -1) 412 | # masks = [np.reshape(proposal_mask(x, y, length),[length,1]) for x,y in dxy] 413 | # masks, video_seq_length = pad_video_sequence(masks, max_length=max_length) 414 | # batch_mask.append(masks) 415 | # dx = np.asarray(dx, dtype=np.int32) 416 | # dy = np.asarray(dy, dtype=np.int32) 417 | # batch_mask = np.asarray(batch_mask, dtype=np.float32) 418 | # print(np.shape(dy)) 419 | video_features, video_seq_length = pad_video_sequence(video_features, max_length=max_length) 420 | video_features = np.asarray(video_features, dtype=np.float32) 421 | video_seq_length = np.asarray(video_seq_length, dtype=np.int32) 422 | 423 | epsilon = 1E-8 424 | 425 | # soft label 426 | y = (1 - (max_length-3) * epsilon - 0.5)/ 2 427 | start_label = np.ones(shape=[true_batch_size, max_length], dtype=np.int32) * epsilon 428 | end_label = 
np.ones(shape=[true_batch_size, max_length], dtype=np.int32) * epsilon 429 | 430 | # generate labels 431 | # start_label = np.zeros(shape=[true_batch_size, max_length], dtype=np.int32) 432 | # end_label = np.zeros(shape=[true_batch_size, max_length], dtype=np.int32) 433 | highlight_labels = np.zeros(shape=[true_batch_size, max_length], dtype=np.int32) 434 | 435 | 436 | for idx in range(true_batch_size): 437 | st, et = start_indexes[idx], end_indexes[idx] 438 | if st > 0: 439 | start_label[idx][st - 1] = y 440 | if st < max_length-1: 441 | start_label[idx][st + 1] = y 442 | start_label[idx][st] = 0.5 443 | 444 | if et > 0: 445 | end_label[idx][et - 1] = y 446 | if et < max_length-1: 447 | end_label[idx][et + 1] = y 448 | end_label[idx][et] = 0.5 449 | 450 | # start_label[idx][st] = 1 451 | # end_label[idx][et] = 1 452 | cur_max_len = vfeat_lens[idx] 453 | extend_len = round(extend * float(et - st + 1)) 454 | if extend_len > 0: 455 | st_ = max(0, st - extend_len) 456 | et_ = min(et + extend_len, cur_max_len - 1) 457 | highlight_labels[idx][st_:(et_ + 1)] = 1 458 | else: 459 | highlight_labels[idx][st:(et + 1)] = 1 460 | 461 | # yield (batch_data, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, 462 | # highlight_labels, dx, dy, batch_mask) 463 | if train is True: 464 | is_training = True 465 | else: 466 | is_training = False 467 | yield (batch_data, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, 468 | highlight_labels, is_training) -------------------------------------------------------------------------------- /utils/prepro_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from nltk.tokenize import word_tokenize 7 | from utils.data_utils import load_video_ids, create_vocabularies, load_json, write_json, UNK, time_to_index 8 | 9 | 10 | def read_activitynet_data(activitynet_dir, feature_name, max_sentence_length): 11 | with open(os.path.join(activitynet_dir, "captions", "train.json"), mode="r", encoding="utf-8") as f: 12 | train_data = json.load(f) 13 | 14 | with open(os.path.join(activitynet_dir, "captions", "val_1.json"), mode="r", encoding="utf-8") as f: 15 | test_data = json.load(f) # used as test set follow Yuan et al. 
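# ActivityNet Captions provides two validation annotation files; val_1.json is used as the test split above, and val_2.json below is kept as a second evaluation split ("test2").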
16 | 17 | with open(os.path.join(activitynet_dir, "captions", "val_2.json"), mode="r", encoding="utf-8") as f: 18 | test2_data = json.load(f) 19 | 20 | video_ids = load_video_ids(os.path.join(activitynet_dir, "activitynet_features_{}".format(feature_name))) 21 | 22 | def load_information(data, valid_vids): 23 | vids, results = [], [] 24 | 25 | for vid, records in data.items(): 26 | if vid not in valid_vids: 27 | continue # handle missing video records 28 | 29 | vids.append(vid) 30 | 31 | for timestamp, sentence in zip(records["timestamps"], records["sentences"]): 32 | duration = float(records["duration"]) 33 | start_time = max(0.0, float(timestamp[0])) 34 | end_time = min(float(timestamp[1]), duration) 35 | words = word_tokenize(sentence.strip().lower(), language="english") 36 | 37 | if max_sentence_length is not None: 38 | words = words[0:max_sentence_length] 39 | 40 | results.append((vid, start_time, end_time, duration, words)) 41 | 42 | return vids, results 43 | 44 | train_vids, train_data = load_information(train_data, video_ids) 45 | test_vids, test_data = load_information(test_data, video_ids) 46 | test2_vids, test2_data = load_information(test2_data, video_ids) 47 | filtered_video_ids = list(set(train_vids + test_vids + test2_vids)) 48 | return train_data, test_data, test2_data, filtered_video_ids 49 | 50 | 51 | def generate_dataset(data, feature_shapes, word_dict, char_dict, scope): 52 | dataset = list() 53 | 54 | for record in tqdm(data, total=len(data), desc="process {} data".format(scope)): 55 | video_id, start_time, end_time, duration, words = record 56 | feature_shape = feature_shapes[video_id] 57 | 58 | # compute best start and end indices 59 | start_index, end_index = time_to_index(start_time, end_time, feature_shape, duration) 60 | 61 | # convert words and characters 62 | word_indices, char_indices = list(), list() 63 | for word in words: 64 | word_index = word_dict[word] if word in word_dict else word_dict[UNK] 65 | char_index = [char_dict[char] if char in char_dict else char_dict[UNK] for char in word] 66 | word_indices.append(word_index) 67 | char_indices.append(char_index) 68 | 69 | example = {"video_id": str(video_id), "start_time": float(start_time), "end_time": float(end_time), 70 | "duration": float(duration), "start_index": int(start_index), "end_index": int(end_index), 71 | "feature_shape": int(feature_shape), "word_ids": word_indices, "char_ids": char_indices} 72 | dataset.append(example) 73 | 74 | return dataset 75 | 76 | 77 | def prepro_activitynet(configs): 78 | 79 | if not os.path.exists(configs.save_dir): 80 | os.makedirs(configs.save_dir) 81 | 82 | # train/test data format: (video_id, start_time, end_time, duration, words) 83 | train_data, test_data, test2_data, _ = read_activitynet_data(configs.root, configs.feature, 84 | configs.max_position_length) 85 | 86 | # load features and sample feature shapes if possible 87 | features_path = os.path.join(configs.root, "activitynet_features_{}/feature_shapes.json".format(configs.feature)) 88 | feature_shapes = dict() 89 | for vid, length in load_json(features_path).items(): 90 | if configs.max_position_length is not None and length > configs.max_position_length: 91 | length = configs.max_position_length 92 | feature_shapes[vid] = length 93 | 94 | # generate token dicts and load pre-trained vectors 95 | word_counter, char_counter = Counter(), Counter() 96 | for data in [train_data, test_data, test2_data]: 97 | for record in data: 98 | words = record[-1] 99 | for word in words: 100 | word_counter[word] += 1 101 | for 
char in list(word): 102 | char_counter[char] += 1 103 | word_dict, char_dict, word_vectors = create_vocabularies(configs, word_counter, char_counter) 104 | 105 | # generate datasets 106 | train_set = generate_dataset(train_data, feature_shapes, word_dict, char_dict, "train") 107 | test_set = generate_dataset(test_data, feature_shapes, word_dict, char_dict, "test") 108 | test2_set = generate_dataset(test2_data, feature_shapes, word_dict, char_dict, "test2") 109 | 110 | # save to directory 111 | write_json(word_dict, save_path=os.path.join(configs.save_dir, "word_dict.json")) 112 | write_json(char_dict, save_path=os.path.join(configs.save_dir, "char_dict.json")) 113 | np.savez_compressed(os.path.join(configs.save_dir, "word_vectors.npz"), vectors=word_vectors) 114 | write_json(train_set, save_path=os.path.join(configs.save_dir, "train_set.json")) 115 | write_json(test_set, save_path=os.path.join(configs.save_dir, "test_set.json")) 116 | write_json(test2_set, save_path=os.path.join(configs.save_dir, "test2_set.json")) 117 | -------------------------------------------------------------------------------- /utils/prepro_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from nltk.tokenize import word_tokenize 7 | from utils.data_utils import create_vocabularies, load_json, write_json, UNK, time_to_index 8 | 9 | 10 | def read_charades_data(charades_dir, max_sentence_length): 11 | # load charades json file 12 | with open(os.path.join(charades_dir, "charades.json"), mode="r", encoding="utf-8") as f: 13 | charades = json.load(f) 14 | 15 | def load_information(charades_sta_dir): 16 | with open(charades_sta_dir, mode="r", encoding="utf-8") as f_sta: 17 | vids, data = [], [] 18 | 19 | for line in f_sta: 20 | line = line.lstrip().rstrip() 21 | 22 | if len(line) == 0: 23 | continue 24 | 25 | video_info, sentence = line.split("##") 26 | vid, start_time, end_time = video_info.split(" ") 27 | words = word_tokenize(sentence.lower(), language="english") 28 | 29 | if max_sentence_length is not None: 30 | words = words[0:max_sentence_length] 31 | 32 | duration = float(charades[vid]["duration"]) 33 | start_time = max(0.0, float(start_time)) 34 | end_time = min(float(end_time), duration) 35 | 36 | vids.append(vid) 37 | data.append((vid, start_time, end_time, duration, words)) 38 | 39 | return vids, data 40 | 41 | # load train and test dataset 42 | train_vids, train_data = load_information(os.path.join(charades_dir, "charades_sta_train.txt")) 43 | test_vids, test_data = load_information(os.path.join(charades_dir, "charades_sta_test.txt")) 44 | 45 | video_ids = list(set(train_vids + test_vids)) 46 | 47 | return train_data, test_data, video_ids 48 | 49 | 50 | def generate_dataset(data, feature_shapes, word_dict, char_dict, scope): 51 | dataset = list() 52 | for record in tqdm(data, total=len(data), desc="process {} data".format(scope)): 53 | 54 | video_id, start_time, end_time, duration, words = record 55 | if video_id not in list(feature_shapes.keys()): 56 | continue 57 | feature_shape = feature_shapes[video_id] 58 | 59 | # compute best start and end indices 60 | start_index, end_index = time_to_index(start_time, end_time, feature_shape, duration) 61 | 62 | # convert words and characters 63 | word_indices, char_indices = list(), list() 64 | for word in words: 65 | word_index = word_dict[word] if word in word_dict else word_dict[UNK] 66 | char_index = 
[char_dict[char] if char in char_dict else char_dict[UNK] for char in word] 67 | word_indices.append(word_index) 68 | char_indices.append(char_index) 69 | 70 | example = {"video_id": str(video_id), "start_time": float(start_time), "end_time": float(end_time), 71 | "duration": float(duration), "start_index": int(start_index), "end_index": int(end_index), 72 | "feature_shape": int(feature_shape), "word_ids": word_indices, "char_ids": char_indices} 73 | dataset.append(example) 74 | 75 | return dataset 76 | 77 | 78 | def prepro_charades(configs): 79 | 80 | if not os.path.exists(configs.save_dir): 81 | os.makedirs(configs.save_dir) 82 | 83 | # train/test data format: (video_id, start_time, end_time, duration, words) 84 | train_data, test_data, _ = read_charades_data(configs.root, configs.max_position_length) 85 | 86 | # load features and sample feature shapes if possible 87 | features_path = os.path.join(configs.root, "charades_features_{}/feature_shapes.json".format(configs.feature)) 88 | feature_shapes = dict() 89 | for vid, length in load_json(features_path).items(): 90 | if configs.max_position_length is not None and length > configs.max_position_length: 91 | length = configs.max_position_length 92 | feature_shapes[vid] = length 93 | 94 | # generate token dicts and load pre-trained vectors 95 | word_counter, char_counter = Counter(), Counter() 96 | for data in [train_data, test_data]: 97 | for record in data: 98 | words = record[-1] 99 | for word in words: 100 | word_counter[word] += 1 101 | for char in list(word): 102 | char_counter[char] += 1 103 | word_dict, char_dict, word_vectors = create_vocabularies(configs, word_counter, char_counter) 104 | 105 | # generate datasets 106 | train_set = generate_dataset(train_data, feature_shapes, word_dict, char_dict, "train") 107 | test_set = generate_dataset(test_data, feature_shapes, word_dict, char_dict, "test") 108 | 109 | # save to directory 110 | write_json(word_dict, save_path=os.path.join(configs.save_dir, "word_dict.json")) 111 | write_json(char_dict, save_path=os.path.join(configs.save_dir, "char_dict.json")) 112 | np.savez_compressed(os.path.join(configs.save_dir, "word_vectors.npz"), vectors=word_vectors) 113 | write_json(train_set, save_path=os.path.join(configs.save_dir, "train_set.json")) 114 | write_json(test_set, save_path=os.path.join(configs.save_dir, "test_set.json")) 115 | -------------------------------------------------------------------------------- /utils/prepro_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from nltk.tokenize import word_tokenize 7 | from utils.data_utils import create_vocabularies, load_json, write_json, UNK, time_to_index 8 | 9 | 10 | def read_tacos_data(tacos_dir, max_sentence_length): 11 | with open(os.path.join(tacos_dir, "train.json"), mode="r", encoding="utf-8") as f: 12 | train_data = json.load(f) 13 | 14 | with open(os.path.join(tacos_dir, "val.json"), mode="r", encoding="utf-8") as f: 15 | val_data = json.load(f) 16 | 17 | with open(os.path.join(tacos_dir, "test.json"), mode="r", encoding="utf-8") as f: 18 | test_data = json.load(f) 19 | 20 | def load_information(data): 21 | results = [] 22 | for vid, records in data.items(): 23 | if vid.endswith(".avi"): 24 | vid = vid[0:-4] 25 | 26 | duration = float(records["num_frames"]) / float(records["fps"]) 27 | 28 | for timestamp, sentence in zip(records["timestamps"], records["sentences"]): 29 | 
start_time = max(0.0, float(timestamp[0]) / float(records["fps"])) 30 | end_time = min(float(timestamp[1]) / float(records["fps"]), duration) 31 | words = word_tokenize(sentence.strip().lower(), language="english") 32 | 33 | if max_sentence_length is not None: 34 | words = words[0:max_sentence_length] 35 | 36 | results.append((vid, start_time, end_time, duration, words)) 37 | 38 | return results 39 | 40 | train_data = load_information(train_data) 41 | val_data = load_information(val_data) 42 | test_data = load_information(test_data) 43 | return train_data, val_data, test_data 44 | 45 | 46 | def generate_dataset(data, feature_shapes, word_dict, char_dict, scope): 47 | dataset = list() 48 | for record in tqdm(data, total=len(data), desc="process {} data".format(scope)): 49 | video_id, start_time, end_time, duration, words = record 50 | # video_id = video_id + '.avi' 51 | video_id = video_id 52 | feature_shape = feature_shapes[video_id] 53 | 54 | # compute best start and end indices 55 | start_index, end_index = time_to_index(start_time, end_time, feature_shape, duration) 56 | 57 | # convert words and characters 58 | word_indices, char_indices = list(), list() 59 | for word in words: 60 | word_index = word_dict[word] if word in word_dict else word_dict[UNK] 61 | char_index = [char_dict[char] if char in char_dict else char_dict[UNK] for char in word] 62 | word_indices.append(word_index) 63 | char_indices.append(char_index) 64 | 65 | example = {"video_id": str(video_id), "start_time": float(start_time), "end_time": float(end_time), 66 | "duration": float(duration), "start_index": int(start_index), "end_index": int(end_index), 67 | "feature_shape": int(feature_shape), "word_ids": word_indices, "char_ids": char_indices} 68 | dataset.append(example) 69 | 70 | return dataset 71 | 72 | 73 | def prepro_tacos(configs): 74 | 75 | if not os.path.exists(configs.save_dir): 76 | os.makedirs(configs.save_dir) 77 | 78 | # train/test data format: (video_id, start_time, end_time, duration, words) 79 | train_data, val_data, test_data = read_tacos_data(configs.root, configs.max_position_length) 80 | 81 | # load features and sample feature shapes if possible 82 | features_path = os.path.join(configs.root, "tacos_features_{}/feature_shapes.json".format(configs.feature)) 83 | feature_shapes = dict() 84 | for vid, length in load_json(features_path).items(): 85 | if configs.max_position_length is not None and length > configs.max_position_length: 86 | length = configs.max_position_length 87 | feature_shapes[vid] = length 88 | 89 | # generate token dicts and load pre-trained vectors 90 | word_counter, char_counter = Counter(), Counter() 91 | for data in [train_data, val_data, test_data]: 92 | for record in data: 93 | words = record[-1] 94 | for word in words: 95 | word_counter[word] += 1 96 | for char in list(word): 97 | char_counter[char] += 1 98 | word_dict, char_dict, word_vectors = create_vocabularies(configs, word_counter, char_counter) 99 | 100 | # generate datasets 101 | train_set = generate_dataset(train_data, feature_shapes, word_dict, char_dict, "train") 102 | val_set = generate_dataset(val_data, feature_shapes, word_dict, char_dict, "val") 103 | test_set = generate_dataset(test_data, feature_shapes, word_dict, char_dict, "test") 104 | 105 | # save to directory 106 | write_json(word_dict, save_path=os.path.join(configs.save_dir, "word_dict.json")) 107 | write_json(char_dict, save_path=os.path.join(configs.save_dir, "char_dict.json")) 108 | np.savez_compressed(os.path.join(configs.save_dir, 
"word_vectors.npz"), vectors=word_vectors) 109 | write_json(train_set, save_path=os.path.join(configs.save_dir, "train_set.json")) 110 | write_json(val_set, save_path=os.path.join(configs.save_dir, "val_set.json")) 111 | write_json(test_set, save_path=os.path.join(configs.save_dir, "test_set.json")) 112 | -------------------------------------------------------------------------------- /utils/runner_utils.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import tensorflow as tf 4 | from tqdm import tqdm 5 | from utils.data_utils import batch_iter 6 | import pickle 7 | import os 8 | 9 | def write_tf_summary(writer, value_pairs, global_step): 10 | for tag, value in value_pairs: 11 | summ = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 12 | writer.add_summary(summ, global_step=global_step) 13 | writer.flush() 14 | 15 | 16 | def calculate_iou_accuracy(ious, threshold): 17 | total_size = float(len(ious)) 18 | count = 0 19 | 20 | for iou in ious: 21 | if iou >= threshold: 22 | count += 1 23 | 24 | return float(count) / total_size * 100.0 25 | 26 | 27 | def calculate_iou(i0, i1): 28 | union = (min(i0[0], i1[0]), max(i0[1], i1[1])) 29 | inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) 30 | 31 | iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0]) 32 | 33 | return max(0.0, iou) 34 | 35 | 36 | def convert_to_time(start_index, end_index, num_features, duration): 37 | s_times = np.arange(0, num_features).astype(np.float32) * duration / float(num_features) 38 | e_times = np.arange(1, num_features + 1).astype(np.float32) * duration / float(num_features) 39 | if start_index >= num_features: 40 | start_index = num_features - 1 41 | if end_index >= num_features: 42 | end_index = num_features - 1 43 | if start_index < 0: 44 | start_index = 0 45 | if end_index <0: 46 | end_index = 0 47 | start_time = s_times[start_index] 48 | end_time = e_times[end_index] 49 | 50 | return start_time, end_time 51 | 52 | 53 | def get_feed_dict(batch_data, model, drop_rate=None, mode='train'): 54 | if mode == 'train': # training 55 | #(_, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, highlight_labels, dx, dy, batch_mask) = batch_data 56 | 57 | # feed_dict = {model.video_inputs: video_features, model.video_seq_length: video_seq_length, 58 | # model.word_ids: word_ids, model.char_ids: char_ids, model.y1: start_label, model.y2: end_label, 59 | # model.drop_rate: drop_rate, model.highlight_labels: highlight_labels, 60 | # model.dx1 : dx, model.dy1 : dy, model.mask1 : batch_mask} 61 | (_, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, highlight_labels, is_training) = batch_data 62 | 63 | feed_dict = {model.video_inputs: video_features, model.video_seq_length: video_seq_length, 64 | model.word_ids: word_ids, model.char_ids: char_ids, model.y1: start_label, model.y2: end_label, 65 | model.drop_rate: drop_rate, model.highlight_labels: highlight_labels, model.is_training:is_training} 66 | 67 | return feed_dict 68 | 69 | else: # eval 70 | # raw_data, video_features, word_ids, char_ids, video_seq_length, _, _, _, dx, dy, batch_mask = batch_data 71 | 72 | # feed_dict = {model.video_inputs: video_features, model.video_seq_length: video_seq_length, 73 | # model.word_ids: word_ids, model.char_ids: char_ids, 74 | # model.dx1 : dx, model.dy1 : dy, model.mask1 : batch_mask} 75 | 76 | # raw_data, video_features, word_ids, char_ids, video_seq_length, *_ = batch_data 77 | # feed_dict = 
{model.video_inputs: video_features, model.video_seq_length: video_seq_length, 78 | # model.word_ids: word_ids, model.char_ids: char_ids} 79 | 80 | raw_data, video_features, word_ids, char_ids, video_seq_length, start_label, end_label, highlight_labels, is_training = batch_data 81 | feed_dict = {model.video_inputs: video_features, model.video_seq_length: video_seq_length, 82 | model.word_ids: word_ids, model.char_ids: char_ids, model.y1: start_label, model.y2: end_label, model.is_training:is_training} 83 | return raw_data, feed_dict 84 | 85 | 86 | # def eval_test(sess, model, dataset, video_features, configs, epoch=None, global_step=None, name="test"): 87 | # num_test_batches = math.ceil(len(dataset) / configs.batch_size) 88 | # ious = list() 89 | # extent = list() 90 | # prob = list() 91 | 92 | # for data in tqdm(batch_iter(dataset, video_features, configs.batch_size, configs.extend, False), 93 | # total=num_test_batches, desc="evaluate {}".format(name)): 94 | 95 | # raw_data, feed_dict = get_feed_dict(data, model, mode=name) 96 | # # start_indexes, end_indexes = sess.run([model.start_index, model.end_index], feed_dict=feed_dict) 97 | # # iou_loss = sess.run([model.iou_loss], feed_dict=feed_dict) 98 | # start_indexes, end_indexes, start_prob, end_prob, iou_loss = sess.run([model.dx1, model.dy1, model.start_prob, model.end_prob, model.iou_loss], feed_dict=feed_dict) 99 | 100 | # # print(y1) 101 | # prob.append(iou_loss) 102 | # for record, start_index_, end_index_ in zip(raw_data, start_indexes, end_indexes): 103 | # for start_index, end_index in zip(start_index_, end_index_): 104 | # # print(record["feature_shape"]) 62 105 | # start_time, end_time = convert_to_time(start_index, end_index, record["feature_shape"], record["duration"]) 106 | # iou = calculate_iou(i0=[start_time, end_time], i1=[record["start_time"], record["end_time"]]) 107 | # ious.append(iou) 108 | # s = start_time - record["start_time"] 109 | # e = end_time - record["end_time"] 110 | # seg = record["end_time"] - record["start_time"] 111 | # d = record["duration"] 112 | # item = [s,e,seg,d] 113 | # extent.append(item) 114 | 115 | 116 | # # r1i3 = calculate_iou_accuracy(ious, threshold=0.1) 117 | # r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 118 | # r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 119 | # r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 120 | # mi = np.mean(ious) * 100.0 121 | 122 | # value_pairs = 0 123 | 124 | # # write the scores 125 | # score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 126 | # score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 127 | # score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 128 | # score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 129 | # print("在这里", mi, type(mi), np.shape(ious)) 130 | # score_str += "mean IoU: {:.2f}\n".format(mi) 131 | # # return extent, r1i3, r1i5, r1i7, mi, value_pairs, score_str 132 | # return r1i3, r1i5, r1i7, mi, value_pairs, score_str 133 | 134 | 135 | def eval_test(sess, model, dataset, video_features, configs, epoch=None, global_step=None, name="test"): 136 | num_test_batches = math.ceil(len(dataset) / configs.batch_size) 137 | ious = list() 138 | extent = list() 139 | prob = list() 140 | pse = list() 141 | 142 | # query_txts = ["person reading a book.", "person opens the door."] 143 | # fps_list = [30.00, 19.75] 144 | for data in tqdm(batch_iter(dataset, video_features, configs.batch_size, configs.extend, train=False, shuffle=False), 145 | total=num_test_batches, desc="evaluate {}".format(name)): 146 | 147 | 
raw_data, feed_dict = get_feed_dict(data, model, mode=name) 148 | # start_indexes, end_indexes = sess.run([model.start_index, model.end_index], feed_dict=feed_dict) 149 | # start_indexes, end_indexes, dx, dy, length= sess.run([model.px, model.py, model.dx, model.dy, model.video_seq_length], feed_dict=feed_dict) 150 | start_indexes, end_indexes, proposal_box = sess.run([model.px, model.py, model.proposal_box], feed_dict=feed_dict) 151 | 152 | # iou_loss = sess.run([model.iou_loss], feed_dict=feed_dict) 153 | # start_indexes, end_indexes, start_prob, end_prob, iou_loss = sess.run([model.dx, model.dy, model.start_prob, model.end_prob, model.iou_loss], feed_dict=feed_dict) 154 | 155 | # print(proposal_box) 156 | # np.savetxt('tocos_pre6.out', proposal_box) 157 | 158 | # print(np.shape(start_indexes)) 159 | # prob.append(iou_loss) 160 | i = 0 161 | for record, start_index, end_index in zip(raw_data, start_indexes, end_indexes): 162 | # print(record["feature_shape"]) 62 163 | start_time, end_time = convert_to_time(start_index, end_index, record["feature_shape"], record["duration"]) 164 | # print(start_time, end_time) 165 | # prediction_result = {'video_path':"/home/xsn/VSLNet/nlvl/charaders/videos/" + record["video_id"] + ".mp4", 166 | # 'fps':fps_list[i], 167 | # 'query_txt':query_txts[i], 168 | # 'prediction':[start_time[0], end_time[0]], 169 | # 'ground_truth':[record["start_time"], record["end_time"]]} 170 | 171 | # with open("prediction_result_"+str(i)+".pkl",'wb') as f: 172 | # pickle.dump(prediction_result, f) 173 | # i = i + 1 174 | 175 | iou = calculate_iou(i0=[start_time, end_time], i1=[record["start_time"], record["end_time"]]) 176 | ious.append(iou) 177 | 178 | # print(record.keys()) #dict_keys(['video_id', 'start_time', 'end_time', 'duration', 'start_index', 'end_index', 'feature_shape', 'word_ids', 'char_ids']) 179 | s = record["start_time"]/record["duration"] 180 | e = record["end_time"]/record["duration"] 181 | p = (e+s)/2 182 | l = (e-s)/2 183 | item = [p, l] 184 | extent.append(item) 185 | 186 | # print(record.keys()) #dict_keys(['video_id', 'start_time', 'end_time', 'duration', 'start_index', 'end_index', 'feature_shape', 'word_ids', 'char_ids']) 187 | # s = start_time - record["start_time"] 188 | # e = end_time - record["end_time"] 189 | # seg = record["end_time"] - record["start_time"] 190 | # d = record["duration"] 191 | # item = [s,e,seg,d] 192 | # extent.append(item) 193 | if iou > 0.8: 194 | s = record["start_time"] 195 | e = record["end_time"] 196 | ps = float(start_time[0]) 197 | # print(type(end_time)) 198 | if isinstance(end_time, np.ndarray): 199 | pe = float(end_time[0]) 200 | else: 201 | pe = float(end_time) 202 | vid = record["video_id"] 203 | d = record["duration"] 204 | item = [s,e,ps,pe,vid, d] 205 | # print(type(s), type(e), type(ps), type(pe), type(vid)) 206 | if s > 3.0 and e < record['duration']-3.0: 207 | pse.append(item) 208 | 209 | # np.savetxt('gth0.8.out', pse) 210 | # np.savetxt('t_real_proposal.out', extent) 211 | # r1i3 = calculate_iou_accuracy(ious, threshold=0.1) 212 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 213 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 214 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 215 | mi = np.mean(ious) * 100.0 216 | 217 | # value_pairs = [("{}/Rank@1, IoU=0.3".format(name), r1i3), ("{}/Rank@1, IoU=0.5".format(name), r1i5), 218 | # ("{}/Rank@1, IoU=0.7".format(name), r1i7), ("{}/mean IoU".format(name), mi[0])] 219 | value_pairs = [("{}/Rank@1, IoU=0.3".format(name), r1i3), 220 | ("{}/Rank@1, 
IoU=0.5".format(name), r1i5), 221 | ("{}/Rank@1, IoU=0.7".format(name), r1i7), 222 | ("{}/mean IoU".format(name), mi)] 223 | # write the scores 224 | score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 225 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 226 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 227 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 228 | # print("在这里", mi, type(mi), np.shape(ious)) 229 | # score_str += "mean IoU: {:.2f}\n".format(mi[0]) 230 | score_str += "mean IoU: {}\n".format(mi) 231 | # return extent, r1i3, r1i5, r1i7, mi, value_pairs, score_str 232 | # return pse, r1i3, r1i5, r1i7, mi, value_pairs, score_str 233 | return r1i3, r1i5, r1i7, mi, value_pairs, score_str 234 | --------------------------------------------------------------------------------