├── model ├── __init__.py ├── VSLNet_t7.py ├── VSLNet.py ├── ops.py ├── layers.py └── layers_t7.py ├── util ├── __init__.py ├── runner_utils.py ├── data_loader_t7.py ├── runner_utils_t7.py ├── data_loader.py ├── data_util.py └── data_gen.py ├── prepare ├── __init__.py ├── extract_activitynet_org.py ├── extract_tacos_org.py ├── download_activitynet_video.py ├── videotransforms.py ├── extract_charades.py ├── README.md ├── extract_tacos.py ├── extract_activitynet.py └── feature_extractor.py ├── figures └── overview.jpg ├── LICENSE ├── .gitignore ├── README.md ├── main.py └── main_t7.py /model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /prepare/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /figures/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/VSLNet/HEAD/figures/overview.jpg -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 ZHANG HAO 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /prepare/extract_activitynet_org.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import json 4 | import numpy as np 5 | from tqdm import tqdm 6 | from argparse import ArgumentParser 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--dataset_dir", type=str, required=True, help="dataset path") 10 | parser.add_argument("--hdf5_file", type=str, required=True, help="downloaded activitynet features") 11 | parser.add_argument("--save_dir", type=str, required=True, help="save dir") 12 | args = parser.parse_args() 13 | 14 | with open(os.path.join(args.dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 15 | train_data = json.load(f) 16 | with open(os.path.join(args.dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 17 | val_data = json.load(f) 18 | with open(os.path.join(args.dataset_dir, "val_2.json"), mode="r", encoding="utf-8") as f: 19 | test_data = json.load(f) 20 | 21 | video_ids = list(set(list(train_data.keys()) + list(val_data.keys()) + list(test_data.keys()))) 22 | print(video_ids) 23 | print(len(video_ids)) 24 | 25 | if not os.path.exists(args.save_dir): 26 | os.makedirs(args.save_dir) 27 | 28 | feature_shapes = dict() 29 | with h5py.File(args.hdf5_file, mode="r") as f: 30 | group_key = list(f.keys()) 31 | for key in tqdm(group_key, total=len(group_key), desc="extract features"): 32 | video_id = key 33 | if video_id not in video_ids: 34 | continue 35 | data = f[key]["c3d_features"][()] 36 | feature_shapes[video_id] = data.shape[0] 37 | np.save(os.path.join(args.save_dir, video_id), arr=data) 38 | 39 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 40 | json.dump(feature_shapes, f) 41 | -------------------------------------------------------------------------------- /prepare/extract_tacos_org.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import numpy as np 5 | from tqdm import tqdm 6 | 7 | # 1. step download pre-trained C3D features from https://github.com/jiyanggao/TALL 8 | # 2. 
convert the features 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument("--data_path", type=str, required=True, help="tacos dataset path") 12 | parser.add_argument("--feature_path", type=str, required=True, help="pre-trained C3D features") 13 | parser.add_argument("--save_dir", type=str, required=True, help="extracted feature save path") 14 | parser.add_argument("--sample_rate", type=int, default=64, help="sample rate [64 | 128 | 256 | 512]") 15 | args = parser.parse_args() 16 | 17 | stride = args.sample_rate // 5 # due to 0.8 overlap of the pre-trained C3D features 18 | 19 | if not os.path.exists(args.save_dir): 20 | os.makedirs(args.save_dir) 21 | 22 | with open(os.path.join(args.data_path, "train.json"), mode="r", encoding="utf-8") as f: 23 | dataset = json.load(f) 24 | with open(os.path.join(args.data_path, "val.json"), mode="r", encoding="utf-8") as f: 25 | dataset.update(json.load(f)) 26 | with open(os.path.join(args.data_path, "test.json"), mode="r", encoding="utf-8") as f: 27 | dataset.update(json.load(f)) 28 | 29 | feature_shapes = dict() 30 | for video_id, annotations in tqdm(dataset.items(), total=len(dataset), desc=""): 31 | video_features = [] 32 | num_frames = annotations["num_frames"] - 16 # trick from 2D-TAN 33 | for idx in range(0, (num_frames - args.sample_rate) // stride + 1): 34 | s_idx = idx * stride + 1 35 | e_idx = s_idx + args.sample_rate 36 | feature_path = os.path.join(args.feature_path, "{}.avi_{}_{}.npy".format(video_id, s_idx, e_idx)) 37 | feature = np.load(feature_path) 38 | video_features.append(feature) 39 | video_features = np.stack(video_features, axis=0) 40 | np.save(os.path.join(args.save_dir, video_id), arr=video_features) 41 | feature_shapes[video_id] = video_features.shape[0] 42 | 43 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 44 | json.dump(feature_shapes, f) 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # add 132 | .idea/ 133 | .vscode/ 134 | ckpt/ 135 | ckpt_t7/ 136 | ckpt*/ 137 | data/features/ 138 | datasets/ 139 | .DS_Store 140 | *.DS_Store 141 | -------------------------------------------------------------------------------- /prepare/download_activitynet_video.py: -------------------------------------------------------------------------------- 1 | """ 2 | Codes are modified from https://github.com/waybarrios/Anet_tools2.0 3 | """ 4 | import os 5 | import glob 6 | import json 7 | from argparse import ArgumentParser 8 | 9 | 10 | def crosscheck_videos(video_path, all_video_ids): 11 | # Get existing videos 12 | existing_videos = glob.glob("%s/*.mp4" % video_path) 13 | for idx, vid in enumerate(existing_videos): 14 | basename = os.path.basename(vid).split(".mp4")[0] 15 | if len(basename) == 13: 16 | existing_videos[idx] = basename[2:] 17 | elif len(basename) == 11: 18 | existing_videos[idx] = basename 19 | else: 20 | raise RuntimeError("Unknown filename format: %s", vid) 21 | 22 | non_existing_videos = [] 23 | for vid in all_video_ids: 24 | if vid in existing_videos: 25 | continue 26 | else: 27 | non_existing_videos.append(vid) 28 | 29 | return non_existing_videos 30 | 31 | 32 | def main(video_dir, dataset_dir, bash_file): 33 | with open(os.path.join(dataset_dir, "train.json"), mode="r", encoding="utf-8") as f: 34 | train_ids = list(json.load(f).keys()) 35 | train_ids = [vid[2:] if len(vid) == 13 else vid for vid in train_ids] 36 | 37 | with open(os.path.join(dataset_dir, "val_1.json"), mode="r", encoding="utf-8") as f: 38 | val_ids = list(json.load(f).keys()) 39 | val_ids = [vid[2:] if len(vid) == 13 else vid for vid in val_ids] 40 | 41 | with open(os.path.join(dataset_dir, "val_2.json"), 
mode="r", encoding="utf-8") as f: 42 | test_ids = list(json.load(f).keys()) 43 | test_ids = [vid[2:] if len(vid) == 13 else vid for vid in test_ids] 44 | 45 | all_video_ids = list(set(train_ids + val_ids + test_ids)) 46 | print("train_video_ids", len(train_ids)) 47 | print("val_1_video_ids", len(val_ids)) 48 | print("val_2_video_ids", len(test_ids)) 49 | print("all_video_ids", len(all_video_ids)) 50 | 51 | non_existing_videos = crosscheck_videos(video_dir, all_video_ids) 52 | 53 | # save command to bash file 54 | with open(bash_file + '.sh', mode="w", encoding="utf-8") as f: 55 | f.write("#!/usr/bin/env bash\n\n") # write bash file header 56 | filename = os.path.join(video_dir, "v_%s.mp4") 57 | cmd_base = "youtube-dl -f best -f mp4 " 58 | cmd_base += '"https://www.youtube.com/watch?v=%s" ' 59 | cmd_base += '-o "%s"' % filename 60 | 61 | for vid in non_existing_videos: 62 | cmd = cmd_base % (vid, vid) 63 | f.write("%s\n" % cmd) 64 | 65 | 66 | if __name__ == "__main__": 67 | parser = ArgumentParser(description="Script to double check video content.") 68 | parser.add_argument("--video_dir", type=str, required=True, help="where to save the downloaded videos") 69 | parser.add_argument("--dataset_dir", type=str, required=True, help="where are the annotation files") 70 | parser.add_argument("--bash_file", type=str, required=True, help="where to save command list script") 71 | 72 | args = vars(parser.parse_args()) 73 | main(**args) 74 | """ 75 | After running this python file, it will generate an script file. Using the terminal to run this script, it will 76 | automatically download all the required videos from YouTube. 77 | """ 78 | -------------------------------------------------------------------------------- /prepare/videotransforms.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numbers 3 | import random 4 | 5 | 6 | class RandomCrop(object): 7 | """Crop the given video sequences (t x h x w) at a random location. 8 | Args: 9 | size (sequence or int): Desired output size of the crop. If size is an 10 | int instead of sequence like (h, w), a square crop (size, size) is 11 | made. 12 | """ 13 | 14 | def __init__(self, size): 15 | if isinstance(size, numbers.Number): 16 | self.size = (size, size) 17 | else: 18 | self.size = size 19 | 20 | @staticmethod 21 | def get_params(img, output_size): 22 | """Get parameters for ``crop`` for a random crop. 23 | Args: 24 | img (PIL Image): Image to be cropped. 25 | output_size (tuple): Expected output size of the crop. 26 | Returns: 27 | tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. 28 | """ 29 | t, h, w, c = img.shape 30 | th, tw = output_size 31 | if w == tw and h == th: 32 | return 0, 0, h, w 33 | 34 | i = random.randint(0, h - th) if h != th else 0 35 | j = random.randint(0, w - tw) if w != tw else 0 36 | return i, j, th, tw 37 | 38 | def __call__(self, imgs): 39 | 40 | i, j, h, w = self.get_params(imgs, self.size) 41 | 42 | imgs = imgs[:, i:i + h, j:j + w, :] 43 | return imgs 44 | 45 | def __repr__(self): 46 | return self.__class__.__name__ + '(size={0})'.format(self.size) 47 | 48 | 49 | class CenterCrop(object): 50 | """Crops the given seq Images at the center. 51 | Args: 52 | size (sequence or int): Desired output size of the crop. If size is an 53 | int instead of sequence like (h, w), a square crop (size, size) is 54 | made. 
55 | """ 56 | 57 | def __init__(self, size): 58 | if isinstance(size, numbers.Number): 59 | self.size = (size, size) 60 | else: 61 | self.size = size 62 | 63 | def __call__(self, imgs): 64 | """ 65 | Args: 66 | imgs (PIL Image): Image to be cropped. 67 | Returns: 68 | PIL Image: Cropped image. 69 | """ 70 | t, h, w, c = imgs.shape 71 | th, tw = self.size 72 | i = int(np.round((h - th) / 2.)) 73 | j = int(np.round((w - tw) / 2.)) 74 | 75 | return imgs[:, i:i + th, j:j + tw, :] 76 | 77 | def __repr__(self): 78 | return self.__class__.__name__ + '(size={0})'.format(self.size) 79 | 80 | 81 | class RandomHorizontalFlip(object): 82 | """Horizontally flip the given seq Images randomly with a given probability. 83 | Args: 84 | p (float): probability of the image being flipped. Default value is 0.5 85 | """ 86 | 87 | def __init__(self, p=0.5): 88 | self.p = p 89 | 90 | def __call__(self, imgs): 91 | """ 92 | Args: 93 | imgs (seq Images): seq Images to be flipped. 94 | Returns: 95 | seq Images: Randomly flipped seq images. 96 | """ 97 | if random.random() < self.p: 98 | # t x h x w 99 | return np.flip(imgs, axis=2).copy() 100 | return imgs 101 | 102 | def __repr__(self): 103 | return self.__class__.__name__ + '(p={})'.format(self.p) 104 | -------------------------------------------------------------------------------- /util/runner_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import tensorflow as tf 4 | from tqdm import tqdm 5 | from util.data_util import index_to_time 6 | 7 | if tf.__version__.startswith('2'): 8 | tf = tf.compat.v1 9 | tf.disable_v2_behavior() 10 | tf.disable_eager_execution() 11 | 12 | 13 | def set_tf_config(seed, gpu_idx): 14 | # os environment 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 16 | os.environ["CUDA_VISIBLE_DEVICES"] = gpu_idx 17 | # random seed 18 | np.random.seed(seed) 19 | tf.set_random_seed(seed) 20 | tf.random.set_random_seed(seed) 21 | 22 | 23 | def write_tf_summary(writer, value_pairs, global_step): 24 | for tag, value in value_pairs: 25 | summ = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) 26 | writer.add_summary(summ, global_step=global_step) 27 | writer.flush() 28 | 29 | 30 | def calculate_iou_accuracy(ious, threshold): 31 | total_size = float(len(ious)) 32 | count = 0 33 | for iou in ious: 34 | if iou >= threshold: 35 | count += 1 36 | return float(count) / total_size * 100.0 37 | 38 | 39 | def calculate_iou(i0, i1): 40 | union = (min(i0[0], i1[0]), max(i0[1], i1[1])) 41 | inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) 42 | iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0]) 43 | return max(0.0, iou) 44 | 45 | 46 | def get_feed_dict(batch_data, model, drop_rate=None, mode='train'): 47 | if mode == 'train': # training 48 | (_, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels) = batch_data 49 | feed_dict = {model.video_inputs: vfeats, model.video_seq_length: vfeat_lens, model.word_ids: word_ids, 50 | model.char_ids: char_ids, model.y1: s_labels, model.y2: e_labels, model.drop_rate: drop_rate, 51 | model.highlight_labels: h_labels} 52 | return feed_dict 53 | else: # eval 54 | raw_data, vfeats, vfeat_lens, word_ids, char_ids = batch_data 55 | feed_dict = {model.video_inputs: vfeats, model.video_seq_length: vfeat_lens, model.word_ids: word_ids, 56 | model.char_ids: char_ids} 57 | return raw_data, feed_dict 58 | 59 | 60 | def eval_test(sess, model, data_loader, epoch=None, global_step=None, mode="test"): 61 | ious = list() 62 | for 
data in tqdm(data_loader.test_iter(mode), total=data_loader.num_batches(mode), desc="evaluate {}".format(mode)): 63 | raw_data, feed_dict = get_feed_dict(data, model, mode=mode) 64 | start_indexes, end_indexes = sess.run([model.start_index, model.end_index], feed_dict=feed_dict) 65 | for record, start_index, end_index in zip(raw_data, start_indexes, end_indexes): 66 | start_time, end_time = index_to_time(start_index, end_index, record["v_len"], record["duration"]) 67 | iou = calculate_iou(i0=[start_time, end_time], i1=[record["s_time"], record["e_time"]]) 68 | ious.append(iou) 69 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 70 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 71 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 72 | mi = np.mean(ious) * 100.0 73 | value_pairs = [("{}/Rank@1, IoU=0.3".format(mode), r1i3), ("{}/Rank@1, IoU=0.5".format(mode), r1i5), 74 | ("{}/Rank@1, IoU=0.7".format(mode), r1i7), ("{}/mean IoU".format(mode), mi)] 75 | # write the scores 76 | score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 77 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 78 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 79 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 80 | score_str += "mean IoU: {:.2f}\n".format(mi) 81 | return r1i3, r1i5, r1i7, mi, value_pairs, score_str 82 | -------------------------------------------------------------------------------- /model/VSLNet_t7.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from model.layers_t7 import Embedding, VisualProjection, FeatureEncoder, CQAttention, CQConcatenate, \ 4 | ConditionedPredictor, HighLightLayer 5 | from transformers import AdamW, get_linear_schedule_with_warmup 6 | 7 | 8 | def build_optimizer_and_scheduler(model, configs): 9 | no_decay = ['bias', 'layer_norm', 'LayerNorm'] # no decay for parameters of layer norm and bias 10 | optimizer_grouped_parameters = [ 11 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 12 | 'weight_decay': 0.01}, 13 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] 14 | optimizer = AdamW(optimizer_grouped_parameters, lr=configs.init_lr) 15 | scheduler = get_linear_schedule_with_warmup(optimizer, configs.num_train_steps * configs.warmup_proportion, 16 | configs.num_train_steps) 17 | return optimizer, scheduler 18 | 19 | 20 | class VSLNet(nn.Module): 21 | def __init__(self, configs, word_vectors): 22 | super(VSLNet, self).__init__() 23 | self.configs = configs 24 | self.embedding_net = Embedding(num_words=configs.word_size, num_chars=configs.char_size, out_dim=configs.dim, 25 | word_dim=configs.word_dim, char_dim=configs.char_dim, word_vectors=word_vectors, 26 | drop_rate=configs.drop_rate) 27 | self.video_affine = VisualProjection(visual_dim=configs.video_feature_dim, dim=configs.dim, 28 | drop_rate=configs.drop_rate) 29 | self.feature_encoder = FeatureEncoder(dim=configs.dim, num_heads=configs.num_heads, kernel_size=7, num_layers=4, 30 | max_pos_len=configs.max_pos_len, drop_rate=configs.drop_rate) 31 | # video and query fusion 32 | self.cq_attention = CQAttention(dim=configs.dim, drop_rate=configs.drop_rate) 33 | self.cq_concat = CQConcatenate(dim=configs.dim) 34 | # query-guided highlighting 35 | self.highlight_layer = HighLightLayer(dim=configs.dim) 36 | # conditioned predictor 37 | self.predictor = ConditionedPredictor(dim=configs.dim, 
num_heads=configs.num_heads, drop_rate=configs.drop_rate, 38 | max_pos_len=configs.max_pos_len, predictor=configs.predictor) 39 | # init parameters 40 | self.init_parameters() 41 | 42 | def init_parameters(self): 43 | def init_weights(m): 44 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Linear): 45 | torch.nn.init.xavier_uniform_(m.weight) 46 | if m.bias is not None: 47 | torch.nn.init.zeros_(m.bias) 48 | elif isinstance(m, nn.LSTM): 49 | m.reset_parameters() 50 | self.apply(init_weights) 51 | 52 | def forward(self, word_ids, char_ids, video_features, v_mask, q_mask): 53 | video_features = self.video_affine(video_features) 54 | query_features = self.embedding_net(word_ids, char_ids) 55 | video_features = self.feature_encoder(video_features, mask=v_mask) 56 | query_features = self.feature_encoder(query_features, mask=q_mask) 57 | features = self.cq_attention(video_features, query_features, v_mask, q_mask) 58 | features = self.cq_concat(features, query_features, q_mask) 59 | h_score = self.highlight_layer(features, v_mask) 60 | features = features * h_score.unsqueeze(2) 61 | start_logits, end_logits = self.predictor(features, mask=v_mask) 62 | return h_score, start_logits, end_logits 63 | 64 | def extract_index(self, start_logits, end_logits): 65 | return self.predictor.extract_index(start_logits=start_logits, end_logits=end_logits) 66 | 67 | def compute_highlight_loss(self, scores, labels, mask): 68 | return self.highlight_layer.compute_loss(scores=scores, labels=labels, mask=mask) 69 | 70 | def compute_loss(self, start_logits, end_logits, start_labels, end_labels): 71 | return self.predictor.compute_cross_entropy_loss(start_logits=start_logits, end_logits=end_logits, 72 | start_labels=start_labels, end_labels=end_labels) 73 | -------------------------------------------------------------------------------- /util/data_loader_t7.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.utils.data 4 | from util.data_util import pad_seq, pad_char_seq, pad_video_seq 5 | 6 | 7 | class Dataset(torch.utils.data.Dataset): 8 | def __init__(self, dataset, video_features): 9 | super(Dataset, self).__init__() 10 | self.dataset = dataset 11 | self.video_features = video_features 12 | 13 | def __getitem__(self, index): 14 | record = self.dataset[index] 15 | video_feature = self.video_features[record['vid']] 16 | s_ind, e_ind = int(record['s_ind']), int(record['e_ind']) 17 | word_ids, char_ids = record['w_ids'], record['c_ids'] 18 | return record, video_feature, word_ids, char_ids, s_ind, e_ind 19 | 20 | def __len__(self): 21 | return len(self.dataset) 22 | 23 | 24 | def train_collate_fn(data): 25 | records, video_features, word_ids, char_ids, s_inds, e_inds = zip(*data) 26 | # process word ids 27 | word_ids, _ = pad_seq(word_ids) 28 | word_ids = np.asarray(word_ids, dtype=np.int32) # (batch_size, w_seq_len) 29 | # process char ids 30 | char_ids, _ = pad_char_seq(char_ids) 31 | char_ids = np.asarray(char_ids, dtype=np.int32) # (batch_size, w_seq_len, c_seq_len) 32 | # process video features 33 | vfeats, vfeat_lens = pad_video_seq(video_features) 34 | vfeats = np.asarray(vfeats, dtype=np.float32) # (batch_size, v_seq_len, v_dim) 35 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) # (batch_size, ) 36 | # process labels 37 | max_len = np.max(vfeat_lens) 38 | batch_size = vfeat_lens.shape[0] 39 | s_labels = np.asarray(s_inds, dtype=np.int64) 40 | e_labels = np.asarray(e_inds, dtype=np.int64) 
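    # Build the query-guided highlighting labels: frames inside the ground-truth [start, end]
    # span, extended by 10% of the span length on each side (clipped to the valid feature
    # length), are labeled 1; all remaining frames are labeled 0.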
41 | h_labels = np.zeros(shape=[batch_size, max_len], dtype=np.int32) 42 | extend = 0.1 43 | for idx in range(batch_size): 44 | st, et = s_inds[idx], e_inds[idx] 45 | cur_max_len = vfeat_lens[idx] 46 | extend_len = round(extend * float(et - st + 1)) 47 | if extend_len > 0: 48 | st_ = max(0, st - extend_len) 49 | et_ = min(et + extend_len, cur_max_len - 1) 50 | h_labels[idx][st_:(et_ + 1)] = 1 51 | else: 52 | h_labels[idx][st:(et + 1)] = 1 53 | # convert to torch tensor 54 | vfeats = torch.tensor(vfeats, dtype=torch.float32) 55 | vfeat_lens = torch.tensor(vfeat_lens, dtype=torch.int64) 56 | word_ids = torch.tensor(word_ids, dtype=torch.int64) 57 | char_ids = torch.tensor(char_ids, dtype=torch.int64) 58 | s_labels = torch.tensor(s_labels, dtype=torch.int64) 59 | e_labels = torch.tensor(e_labels, dtype=torch.int64) 60 | h_labels = torch.tensor(h_labels, dtype=torch.int64) 61 | return records, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels 62 | 63 | 64 | def test_collate_fn(data): 65 | records, video_features, word_ids, char_ids, *_ = zip(*data) 66 | # process word ids 67 | word_ids, _ = pad_seq(word_ids) 68 | word_ids = np.asarray(word_ids, dtype=np.int32) # (batch_size, w_seq_len) 69 | # process char ids 70 | char_ids, _ = pad_char_seq(char_ids) 71 | char_ids = np.asarray(char_ids, dtype=np.int32) # (batch_size, w_seq_len, c_seq_len) 72 | # process video features 73 | vfeats, vfeat_lens = pad_video_seq(video_features) 74 | vfeats = np.asarray(vfeats, dtype=np.float32) # (batch_size, v_seq_len, v_dim) 75 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) # (batch_size, ) 76 | # convert to torch tensor 77 | vfeats = torch.tensor(vfeats, dtype=torch.float32) 78 | vfeat_lens = torch.tensor(vfeat_lens, dtype=torch.int64) 79 | word_ids = torch.tensor(word_ids, dtype=torch.int64) 80 | char_ids = torch.tensor(char_ids, dtype=torch.int64) 81 | return records, vfeats, vfeat_lens, word_ids, char_ids 82 | 83 | 84 | def get_train_loader(dataset, video_features, configs): 85 | train_set = Dataset(dataset=dataset, video_features=video_features) 86 | train_loader = torch.utils.data.DataLoader(dataset=train_set, batch_size=configs.batch_size, shuffle=True, 87 | collate_fn=train_collate_fn) 88 | return train_loader 89 | 90 | 91 | def get_test_loader(dataset, video_features, configs): 92 | test_set = Dataset(dataset=dataset, video_features=video_features) 93 | test_loader = torch.utils.data.DataLoader(dataset=test_set, batch_size=configs.batch_size, shuffle=False, 94 | collate_fn=test_collate_fn) 95 | return test_loader 96 | -------------------------------------------------------------------------------- /util/runner_utils_t7.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import random 4 | import numpy as np 5 | import torch 6 | import torch.utils.data 7 | import torch.backends.cudnn 8 | from tqdm import tqdm 9 | from util.data_util import index_to_time 10 | 11 | 12 | def set_th_config(seed): 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | torch.cuda.manual_seed(seed) 17 | torch.cuda.manual_seed_all(seed) 18 | torch.backends.cudnn.benchmark = False 19 | torch.backends.cudnn.deterministic = True 20 | 21 | 22 | def filter_checkpoints(model_dir, suffix='t7', max_to_keep=5): 23 | model_paths = glob.glob(os.path.join(model_dir, '*.{}'.format(suffix))) 24 | if len(model_paths) > max_to_keep: 25 | model_file_dict = dict() 26 | suffix_len = len(suffix) + 1 27 | for model_path in 
model_paths: 28 | step = int(os.path.basename(model_path).split('_')[1][0:-suffix_len]) 29 | model_file_dict[step] = model_path 30 | sorted_tuples = sorted(model_file_dict.items()) 31 | unused_tuples = sorted_tuples[0:-max_to_keep] 32 | for _, model_path in unused_tuples: 33 | os.remove(model_path) 34 | 35 | 36 | def get_last_checkpoint(model_dir, suffix='t7'): 37 | model_filenames = glob.glob(os.path.join(model_dir, '*.{}'.format(suffix))) 38 | model_file_dict = dict() 39 | suffix_len = len(suffix) + 1 40 | for model_filename in model_filenames: 41 | step = int(os.path.basename(model_filename).split('_')[1][0:-suffix_len]) 42 | model_file_dict[step] = model_filename 43 | sorted_tuples = sorted(model_file_dict.items()) 44 | last_checkpoint = sorted_tuples[-1] 45 | return last_checkpoint[1] 46 | 47 | 48 | def convert_length_to_mask(lengths): 49 | max_len = lengths.max().item() 50 | mask = torch.arange(max_len, device=lengths.device).expand(lengths.size()[0], max_len) < lengths.unsqueeze(1) 51 | mask = mask.float() 52 | return mask 53 | 54 | 55 | def calculate_iou_accuracy(ious, threshold): 56 | total_size = float(len(ious)) 57 | count = 0 58 | for iou in ious: 59 | if iou >= threshold: 60 | count += 1 61 | return float(count) / total_size * 100.0 62 | 63 | 64 | def calculate_iou(i0, i1): 65 | union = (min(i0[0], i1[0]), max(i0[1], i1[1])) 66 | inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) 67 | iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0]) 68 | return max(0.0, iou) 69 | 70 | 71 | def eval_test(model, data_loader, device, mode='test', epoch=None, global_step=None): 72 | ious = [] 73 | with torch.no_grad(): 74 | for idx, (records, vfeats, vfeat_lens, word_ids, char_ids) in tqdm( 75 | enumerate(data_loader), total=len(data_loader), desc='evaluate {}'.format(mode)): 76 | # prepare features 77 | vfeats, vfeat_lens = vfeats.to(device), vfeat_lens.to(device) 78 | word_ids, char_ids = word_ids.to(device), char_ids.to(device) 79 | # generate mask 80 | query_mask = (torch.zeros_like(word_ids) != word_ids).float().to(device) 81 | video_mask = convert_length_to_mask(vfeat_lens).to(device) 82 | # compute predicted results 83 | _, start_logits, end_logits = model(word_ids, char_ids, vfeats, video_mask, query_mask) 84 | start_indices, end_indices = model.extract_index(start_logits, end_logits) 85 | start_indices = start_indices.cpu().numpy() 86 | end_indices = end_indices.cpu().numpy() 87 | for record, start_index, end_index in zip(records, start_indices, end_indices): 88 | start_time, end_time = index_to_time(start_index, end_index, record["v_len"], record["duration"]) 89 | iou = calculate_iou(i0=[start_time, end_time], i1=[record["s_time"], record["e_time"]]) 90 | ious.append(iou) 91 | r1i3 = calculate_iou_accuracy(ious, threshold=0.3) 92 | r1i5 = calculate_iou_accuracy(ious, threshold=0.5) 93 | r1i7 = calculate_iou_accuracy(ious, threshold=0.7) 94 | mi = np.mean(ious) * 100.0 95 | # write the scores 96 | score_str = "Epoch {}, Step {}:\n".format(epoch, global_step) 97 | score_str += "Rank@1, IoU=0.3: {:.2f}\t".format(r1i3) 98 | score_str += "Rank@1, IoU=0.5: {:.2f}\t".format(r1i5) 99 | score_str += "Rank@1, IoU=0.7: {:.2f}\t".format(r1i7) 100 | score_str += "mean IoU: {:.2f}\n".format(mi) 101 | return r1i3, r1i5, r1i7, mi, score_str 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Span-based Localizing Network for Natural Language Video Localization 2 | 3 
| This is the implementation of the paper "Span-based Localizing Network for Natural Language Video 4 | Localization" (**ACL 2020**, long paper): [ACL version](https://www.aclweb.org/anthology/2020.acl-main.585.pdf), 5 | [ArXiv version](https://arxiv.org/abs/2004.13931). 6 | 7 | ![overview](/figures/overview.jpg) 8 | 9 | ## Updates 10 | - 2021/06/06: rewrote and optimized the code, and uploaded the complete visual features to the Box drive. Added the stacked 11 | transformer predictor head (in general, VSLNet with the transformer head performs better than with the RNN head). 12 | - 2021/07/21: added support for TensorFlow 2.x (tested on TensorFlow `2.5.0` with CUDA `11.2` and cuDNN `8.2`). 13 | ```shell 14 | # preparing environment for TensorFlow 2.5.0 15 | conda create --name vslnet_tf2 python=3.9 16 | conda activate vslnet_tf2 17 | conda install -c conda-forge cudnn # will install cuda 11.2 automatically 18 | pip install tensorflow-gpu==2.5.0 19 | pip install nltk 20 | pip install torch torchvision torchaudio 21 | python3.9 -m nltk.downloader punkt 22 | ``` 23 | 24 | ## Prerequisites 25 | - Python 3.x with TensorFlow (`1.13.1`), PyTorch (`1.1.0`), torchvision, opencv-python, moviepy, tqdm, nltk, 26 | transformers 27 | - youtube-dl 28 | - CUDA 10, cuDNN 29 | 30 | If you have [Anaconda](https://www.anaconda.com/distribution/) installed, the conda environment of VSLNet can be built 31 | as follows (taking Python 3.7 as an example): 32 | ```shell script 33 | # preparing environment 34 | conda create --name vslnet python=3.7 35 | conda activate vslnet 36 | conda install -c anaconda cudatoolkit=10.0 cudnn 37 | conda install tensorflow-gpu==1.13.1 38 | conda install -c anaconda nltk pillow=6.2.1 39 | conda install pytorch==1.1.0 torchvision==0.3.0 cudatoolkit=10.0 -c pytorch 40 | conda install -c conda-forge transformers opencv moviepy tqdm youtube-dl 41 | # download punkt for the word tokenizer 42 | python3.7 -m nltk.downloader punkt 43 | ``` 44 | 45 | ## Preparation 46 | The details of how to prepare the `Charades-STA`, `ActivityNet Captions` and `TACoS` features are summarized 47 | here: [[data preparation]](/prepare). Alternatively, you can download the prepared visual features from 48 | [Box Drive](https://app.box.com/s/h0sxa5klco6qve5ahnz50ly2nksmuedw) and place them in the `./data/` directory. 49 | Download the word embeddings from [here](http://nlp.stanford.edu/data/glove.840B.300d.zip) and place them in the 50 | `./data/features/` directory. 51 | 52 | ## Quick Start 53 | ### TensorFlow version 54 | **Train** and **Test** 55 | ```shell script 56 | # the processed dataset will be generated automatically, or loaded if it already exists 57 | # set `--mode test` for evaluation 58 | # set `--predictor transformer` to change the answer predictor from stacked LSTMs to stacked transformers 59 | # train VSLNet on the Charades-STA dataset 60 | python main.py --task charades --predictor rnn --mode train 61 | # train VSLNet on the ActivityNet Captions dataset 62 | python main.py --task activitynet --predictor rnn --mode train 63 | # train VSLNet on the TACoS dataset 64 | python main.py --task tacos --predictor rnn --mode train 65 | ``` 66 | Please refer to each Python file for more parameter settings. You can also download the checkpoints for each task 67 | from [here](https://app.box.com/s/f20aeutwp2wg8c5laaqtbfdg864g8mj0) and the corresponding processed dataset from 68 | [here](https://app.box.com/s/065efky2sjjgc2xxzyelast15y7tsehs), and save them to the `./ckpt/` and `./datasets/` 69 | directories, respectively.
More hyper-parameter settings are in `main.py`. 70 | 71 | ### PyTorch version 72 | **Train** and **Test** 73 | ```shell script 74 | # same usage as the TensorFlow version 75 | # train VSLNet on the Charades-STA dataset 76 | python main.py --task charades --predictor rnn --mode train 77 | # train VSLNet on the ActivityNet Captions dataset 78 | python main.py --task activitynet --predictor rnn --mode train 79 | # train VSLNet on the TACoS dataset 80 | python main.py --task tacos --predictor rnn --mode train 81 | ``` 82 | > For unknown reasons, the performance of the PyTorch code is inferior to that of the TensorFlow code on some datasets. 83 | 84 | ## Citation 85 | If you find this project helpful to your research, please cite our work. 86 | ``` 87 | @inproceedings{zhang2020span, 88 | title = "Span-based Localizing Network for Natural Language Video Localization", 89 | author = "Zhang, Hao and Sun, Aixin and Jing, Wei and Zhou, Joey Tianyi", 90 | booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", 91 | month = jul, 92 | year = "2020", 93 | address = "Online", 94 | publisher = "Association for Computational Linguistics", 95 | url = "https://www.aclweb.org/anthology/2020.acl-main.585", 96 | pages = "6543--6554" 97 | } 98 | ``` 99 | and 100 | ``` 101 | @article{zhang2021natural, 102 | author={H. {Zhang} and A. {Sun} and W. {Jing} and L. {Zhen} and J. T. {Zhou} and R. S. M. {Goh}}, 103 | journal={IEEE Transactions on Pattern Analysis and Machine Intelligence}, 104 | title={Natural Language Video Localization: A Revisit in Span-based Question Answering Framework}, 105 | year={2021}, 106 | doi={10.1109/TPAMI.2021.3060449} 107 | } 108 | ``` 109 | -------------------------------------------------------------------------------- /prepare/extract_charades.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | import torch 5 | import argparse 6 | import subprocess 7 | import numpy as np 8 | from . 
import videotransforms 9 | from .feature_extractor import InceptionI3d 10 | from torchvision import transforms 11 | from torch.autograd import Variable 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 15 | parser.add_argument("--use_finetuned", action="store_true", help="whether to use the fine-tuned feature extractor") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where the videos are located") 18 | parser.add_argument("--dataset_dir", type=str, required=True, help="where the dataset files are located") 19 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 20 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 21 | parser.add_argument("--fps", type=int, default=24, help="frames per second") 22 | parser.add_argument("--video_format", type=str, default="mp4", help="video format") 23 | parser.add_argument("--strides", type=int, default=24, help="window size") 24 | parser.add_argument("--remove_images", action="store_true", help="whether to remove extracted images to release space") 25 | args = parser.parse_args() 26 | 27 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 28 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 29 | 30 | 31 | if not os.path.exists(args.video_dir): 32 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 33 | 34 | if not os.path.exists(args.images_dir): 35 | os.makedirs(args.images_dir) 36 | 37 | if not os.path.exists(args.save_dir): 38 | os.makedirs(args.save_dir) 39 | 40 | # create I3D model and load pre-trained model 41 | i3d_model = InceptionI3d(400, in_channels=3) 42 | if args.use_finetuned: 43 | i3d_model.replace_logits(157) # charades has 157 activity types 44 | i3d_model.load_state_dict(torch.load(args.load_model)) 45 | i3d_model.cuda() 46 | i3d_model.train(False) 47 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 48 | 49 | # load video ids 50 | video_ids = [] 51 | for filename in ["charades_sta_train.txt", "charades_sta_test.txt"]: 52 | with open(os.path.join(args.dataset_dir, filename), mode="r", encoding="utf-8") as f: 53 | for line in f: 54 | line = line.lstrip().rstrip() 55 | if len(line) == 0: 56 | continue 57 | vid = line.split("##")[0].split(" ")[0] 58 | video_ids.append(vid) 59 | video_ids = list(set(video_ids)) 60 | 61 | # extract images and features 62 | feature_shapes = dict() 63 | for idx, video_id in enumerate(video_ids): 64 | video_path = os.path.join(args.video_dir, "{}.mp4".format(video_id)) 65 | image_dir = os.path.join(args.images_dir, video_id) 66 | 67 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_ids), video_id), flush=True) 68 | 69 | if os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 70 | print("the visual features for video {} already exist in {}...\n".format(video_id, args.save_dir), flush=True) 71 | continue 72 | 73 | # extract images 74 | if os.path.exists(image_dir): 75 | print("the images for video {} already exist in {}...".format(video_id, args.images_dir)) 76 | else: 77 | os.makedirs(image_dir) 78 | print("extract images with fps={}...".format(args.fps), flush=True) 79 | if args.fps is None or args.fps <= 0: 80 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format( 81 | video_path, image_dir, video_id), 
shell=True) 82 | else: 83 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps={} {}/{}-%6d.jpg".format( 84 | video_path, args.fps, image_dir, video_id), shell=True) 85 | 86 | # process extracted images 87 | print("load RGB frames...", flush=True) 88 | num_frames = len(os.listdir(image_dir)) 89 | frames, raw_w, raw_h = [], None, None 90 | for i in range(1, num_frames + 1): 91 | # cv2.imread() read image with BGR format by default, so we convert it to RGB format 92 | img = cv2.imread(os.path.join(image_dir, "{}-{}.jpg".format(video_id, str(i).zfill(6))))[:, :, [2, 1, 0]] 93 | w, h, c = img.shape 94 | raw_w, raw_h = w, h 95 | if w < 226 or h < 226: 96 | d = 226. - min(w, h) 97 | sc = 1 + d / min(w, h) 98 | img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc) 99 | img = (img / 255.) * 2 - 1 100 | frames.append(img) 101 | frames = np.asarray(frames, dtype=np.float32) 102 | imgs = video_transforms(frames) 103 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 104 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 105 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 106 | 107 | if args.remove_images: 108 | # remove extract images to release memory space 109 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 110 | 111 | print("extract visual visual features...", flush=True) 112 | b, c, t, h, w = img_tensor.shape 113 | features = [] 114 | for start in range(0, t, args.strides): 115 | end = min(t - 1, start + args.strides) 116 | if end - start < args.strides: 117 | start = max(0, end - args.strides) 118 | ip = Variable(torch.from_numpy(img_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 119 | feature = i3d_model.extract_features(ip).data.cpu().numpy() 120 | features.append(feature) 121 | features = np.concatenate(features, axis=0) 122 | np.save(os.path.join(args.save_dir, video_id), arr=features) 123 | print("extracted feature shape: {}\n".format(features.shape), flush=True) 124 | feature_shapes[video_id] = features.shape[0] 125 | 126 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 127 | json.dump(feature_shapes, f) 128 | -------------------------------------------------------------------------------- /util/data_loader.py: -------------------------------------------------------------------------------- 1 | import math 2 | import random 3 | import numpy as np 4 | from util.data_util import pad_seq, pad_char_seq, pad_video_seq 5 | 6 | 7 | class TrainLoader: 8 | def __init__(self, dataset, visual_features, configs): 9 | super(TrainLoader, self).__init__() 10 | self.dataset = dataset 11 | self.visual_feats = visual_features 12 | self.extend = configs.extend 13 | self.batch_size = configs.batch_size 14 | 15 | def set_extend(self, extend): 16 | self.extend = extend 17 | 18 | def set_batch_size(self, batch_size): 19 | self.batch_size = batch_size 20 | 21 | def num_samples(self): 22 | return len(self.dataset) 23 | 24 | def num_batches(self): 25 | return math.ceil(len(self.dataset) / self.batch_size) 26 | 27 | def batch_iter(self): 28 | random.shuffle(self.dataset) # shuffle the train set first 29 | for index in range(0, len(self.dataset), self.batch_size): 30 | batch_data = self.dataset[index:(index + self.batch_size)] 31 | vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels = self.process_batch(batch_data) 32 | yield batch_data, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels 33 | 
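    # process_batch pads the word/char id sequences and the video features to common lengths and
    # builds three per-frame label arrays: one-hot start (s_labels) and end (e_labels) indicators,
    # plus highlight labels (h_labels) covering the ground-truth span extended by a fraction
    # `self.extend` of its length on each side (the same scheme as the PyTorch collate function above).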
34 | def process_batch(self, batch_data): 35 | vfeats, word_ids, char_ids, s_inds, e_inds = [], [], [], [], [] 36 | for data in batch_data: 37 | vfeat = self.visual_feats[data['vid']] 38 | vfeats.append(vfeat) 39 | word_ids.append(data['w_ids']) 40 | char_ids.append(data['c_ids']) 41 | s_inds.append(data['s_ind']) 42 | e_inds.append(data['e_ind']) 43 | batch_size = len(batch_data) 44 | # process word ids 45 | word_ids, _ = pad_seq(word_ids) 46 | word_ids = np.asarray(word_ids, dtype=np.int32) # (batch_size, w_seq_len) 47 | # process char ids 48 | char_ids, _ = pad_char_seq(char_ids) 49 | char_ids = np.asarray(char_ids, dtype=np.int32) # (batch_size, w_seq_len, c_seq_len) 50 | # process video features 51 | vfeats, vfeat_lens = pad_video_seq(vfeats) 52 | vfeats = np.asarray(vfeats, dtype=np.float32) # (batch_size, v_seq_len, v_dim) 53 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) # (batch_size, ) 54 | # process labels 55 | max_len = np.max(vfeat_lens) 56 | s_labels = np.zeros(shape=[batch_size, max_len], dtype=np.int32) 57 | e_labels = np.zeros(shape=[batch_size, max_len], dtype=np.int32) 58 | h_labels = np.zeros(shape=[batch_size, max_len], dtype=np.int32) 59 | for idx in range(batch_size): 60 | st, et = s_inds[idx], e_inds[idx] 61 | s_labels[idx][st] = 1 62 | e_labels[idx][et] = 1 63 | cur_max_len = vfeat_lens[idx] 64 | extend_len = round(self.extend * float(et - st + 1)) 65 | if extend_len > 0: 66 | st_ = max(0, st - extend_len) 67 | et_ = min(et + extend_len, cur_max_len - 1) 68 | h_labels[idx][st_:(et_ + 1)] = 1 69 | else: 70 | h_labels[idx][st:(et + 1)] = 1 71 | return vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, h_labels 72 | 73 | 74 | class TestLoader: 75 | def __init__(self, datasets, visual_features, configs): 76 | self.visual_feats = visual_features 77 | self.val_set = None if datasets['val_set'] is None else datasets['val_set'] 78 | self.test_set = datasets['test_set'] 79 | self.batch_size = configs.batch_size 80 | 81 | def set_batch_size(self, batch_size): 82 | self.batch_size = batch_size 83 | 84 | def num_samples(self, mode='test'): 85 | if mode == 'val': 86 | if self.val_set is None: 87 | return 0 88 | return len(self.val_set) 89 | elif mode == 'test': 90 | return len(self.test_set) 91 | else: 92 | raise ValueError('Unknown mode!!! Only support [val | test | test_iid | test_ood].') 93 | 94 | def num_batches(self, mode='test'): 95 | if mode == 'val': 96 | if self.val_set is None: 97 | return 0 98 | return math.ceil(len(self.val_set) / self.batch_size) 99 | elif mode == 'test': 100 | return math.ceil(len(self.test_set) / self.batch_size) 101 | else: 102 | raise ValueError('Unknown mode!!! Only support [val | test].') 103 | 104 | def test_iter(self, mode='test'): 105 | if mode not in ['val', 'test']: 106 | raise ValueError('Unknown mode!!! 
Only support [val | test].') 107 | test_sets = {'val': self.val_set, 'test': self.test_set} 108 | dataset = test_sets[mode] 109 | if mode == 'val' and dataset is None: 110 | raise ValueError('val set is not available!!!') 111 | for index in range(0, len(dataset), self.batch_size): 112 | batch_data = dataset[index:(index + self.batch_size)] 113 | vfeats, vfeat_lens, word_ids, char_ids = self.process_batch(batch_data) 114 | yield batch_data, vfeats, vfeat_lens, word_ids, char_ids 115 | 116 | def process_batch(self, batch_data): 117 | vfeats, word_ids, char_ids, s_inds, e_inds = [], [], [], [], [] 118 | for data in batch_data: 119 | vfeats.append(self.visual_feats[data['vid']]) 120 | word_ids.append(data['w_ids']) 121 | char_ids.append(data['c_ids']) 122 | s_inds.append(data['s_ind']) 123 | e_inds.append(data['e_ind']) 124 | # process word ids 125 | word_ids, _ = pad_seq(word_ids) 126 | word_ids = np.asarray(word_ids, dtype=np.int32) # (batch_size, w_seq_len) 127 | # process char ids 128 | char_ids, _ = pad_char_seq(char_ids) 129 | char_ids = np.asarray(char_ids, dtype=np.int32) # (batch_size, w_seq_len, c_seq_len) 130 | # process video features 131 | vfeats, vfeat_lens = pad_video_seq(vfeats) 132 | vfeats = np.asarray(vfeats, dtype=np.float32) # (batch_size, v_seq_len, v_dim) 133 | vfeat_lens = np.asarray(vfeat_lens, dtype=np.int32) # (batch_size, ) 134 | return vfeats, vfeat_lens, word_ids, char_ids 135 | -------------------------------------------------------------------------------- /model/VSLNet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from model.ops import create_optimizer, count_params 3 | from model.layers import word_embedding_lookup, char_embedding_lookup, conv1d, video_query_attention, highlight_layer 4 | from model.layers import context_query_concat, feature_encoder, conditioned_predictor, localization_loss 5 | 6 | if tf.__version__.startswith('2'): 7 | tf = tf.compat.v1 8 | tf.disable_v2_behavior() 9 | tf.disable_eager_execution() 10 | 11 | 12 | class VSLNet: 13 | def __init__(self, configs, graph, vectors): 14 | self.configs = configs 15 | graph = graph if graph is not None else tf.Graph() 16 | with graph.as_default(): 17 | self.global_step = tf.train.create_global_step() 18 | self._add_placeholders() 19 | self._build_model(vectors) 20 | if configs.mode == 'train': 21 | print('\x1b[1;33m' + 'Total trainable parameters: {}'.format(count_params()) + '\x1b[0m', flush=True) 22 | else: 23 | print('\x1b[1;33m' + 'Total parameters: {}'.format(count_params()) + '\x1b[0m', flush=True) 24 | 25 | def _add_placeholders(self): 26 | self.video_inputs = tf.placeholder(dtype=tf.float32, shape=[None, None, self.configs.video_feature_dim], 27 | name='video_inputs') 28 | self.video_seq_length = tf.placeholder(dtype=tf.int32, shape=[None], name='video_sequence_length') 29 | self.word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name='word_ids') 30 | self.char_ids = tf.placeholder(dtype=tf.int32, shape=[None, None, None], name='char_ids') 31 | self.highlight_labels = tf.placeholder(dtype=tf.int32, shape=[None, None], name='highlight_labels') 32 | self.y1 = tf.placeholder(dtype=tf.int32, shape=[None, None], name='start_indexes') 33 | self.y2 = tf.placeholder(dtype=tf.int32, shape=[None, None], name='end_indexes') 34 | # hyper-parameters 35 | self.drop_rate = tf.placeholder_with_default(input=0.0, shape=[], name='dropout_rate') 36 | # create mask 37 | self.v_mask = tf.sequence_mask(lengths=self.video_seq_length, 
maxlen=tf.reduce_max(self.video_seq_length), 38 | dtype=tf.int32) 39 | self.q_mask = tf.cast(tf.cast(self.word_ids, dtype=tf.bool), dtype=tf.int32) 40 | 41 | def _build_model(self, vectors): 42 | # word embedding & visual features 43 | word_emb = word_embedding_lookup(self.word_ids, dim=self.configs.word_dim, drop_rate=self.drop_rate, 44 | vectors=vectors, finetune=False, reuse=False, name='word_embeddings') 45 | char_emb = char_embedding_lookup(self.char_ids, char_size=self.configs.char_size, dim=self.configs.char_dim, 46 | kernels=[1, 2, 3, 4], filters=[10, 20, 30, 40], drop_rate=self.drop_rate, 47 | activation=tf.nn.relu, reuse=False, name='char_embeddings') 48 | word_emb = tf.concat([word_emb, char_emb], axis=-1) 49 | video_features = tf.nn.dropout(self.video_inputs, rate=self.drop_rate) 50 | # feature projection (map both word and video feature to the same dimension) 51 | vfeats = conv1d(video_features, dim=self.configs.hidden_size, use_bias=True, reuse=False, name='video_conv1d') 52 | qfeats = conv1d(word_emb, dim=self.configs.hidden_size, use_bias=True, reuse=False, name='query_conv1d') 53 | # feature encoder 54 | vfeats = feature_encoder(vfeats, hidden_size=self.configs.hidden_size, num_heads=self.configs.num_heads, 55 | max_position_length=self.configs.max_pos_len, drop_rate=self.drop_rate, 56 | mask=self.v_mask, reuse=False, name='feature_encoder') 57 | qfeats = feature_encoder(qfeats, hidden_size=self.configs.hidden_size, num_heads=self.configs.num_heads, 58 | max_position_length=self.configs.max_pos_len, drop_rate=self.drop_rate, 59 | mask=self.q_mask, reuse=True, name='feature_encoder') 60 | # video query attention 61 | outputs, self.vq_score = video_query_attention(vfeats, qfeats, self.v_mask, self.q_mask, reuse=False, 62 | drop_rate=self.drop_rate, name='video_query_attention') 63 | # weighted pooling and concatenation 64 | outputs = context_query_concat(outputs, qfeats, q_mask=self.q_mask, reuse=False, name='context_query_concat') 65 | # highlighting layer 66 | self.highlight_loss, self.highlight_scores = highlight_layer(outputs, self.highlight_labels, mask=self.v_mask, 67 | reuse=False, name='highlighting_layer') 68 | outputs = tf.multiply(outputs, tf.expand_dims(self.highlight_scores, axis=-1)) 69 | # prediction layer 70 | start_logits, end_logits = conditioned_predictor(outputs, hidden_size=self.configs.hidden_size, 71 | seq_len=self.video_seq_length, mask=self.v_mask, 72 | num_heads=self.configs.num_heads, drop_rate=self.drop_rate, 73 | max_position_length=self.configs.max_pos_len, reuse=False, 74 | mode=self.configs.predictor, name='conditioned_predictor') 75 | # compute localization loss 76 | self.start_prob, self.end_prob, self.start_index, self.end_index, self.loss = localization_loss( 77 | start_logits, end_logits, self.y1, self.y2) 78 | # add l2 regularizer loss (uncomment if required) 79 | l2_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) 80 | self.loss += tf.reduce_sum(l2_losses) 81 | # collect regularization losses 82 | self.total_loss = self.loss + self.configs.highlight_lambda * self.highlight_loss 83 | # create optimizer 84 | if self.configs.warmup_proportion > 1.0: 85 | num_warmup_steps = int(self.configs.warmup_proportion) 86 | else: 87 | num_warmup_steps = int(self.configs.num_train_steps * self.configs.warmup_proportion) 88 | self.train_op = create_optimizer(self.total_loss, self.configs.init_lr, self.configs.num_train_steps, 89 | num_warmup_steps, clip_norm=self.configs.clip_norm) 90 | 
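To make the training entry point easier to follow, below is a minimal sketch of how this TensorFlow graph is typically driven with the `TrainLoader`, `get_feed_dict`, and `set_tf_config` helpers from `util/`. It is only a sketch, not the repository's `main.py`: the config fields used here (`seed`, `gpu_idx`, `epochs`, `drop_rate`) and the dataset/feature/embedding inputs are assumptions.
```python
# A minimal training-loop sketch (not the repository's main.py). The config fields
# (seed, gpu_idx, epochs, drop_rate) and the inputs are assumptions for illustration.
import tensorflow as tf
from model.VSLNet import VSLNet
from util.data_loader import TrainLoader
from util.runner_utils import get_feed_dict, set_tf_config

if tf.__version__.startswith('2'):  # same TF1-compat shim used in model/ and util/
    tf = tf.compat.v1
    tf.disable_v2_behavior()
    tf.disable_eager_execution()


def train(configs, train_set, visual_features, vectors):
    # fix the random seeds and select the GPU, then wrap the training split into a batch iterator
    set_tf_config(seed=configs.seed, gpu_idx=configs.gpu_idx)
    train_loader = TrainLoader(dataset=train_set, visual_features=visual_features, configs=configs)
    # build the VSLNet graph defined above into its own tf.Graph
    graph = tf.Graph()
    model = VSLNet(configs, graph=graph, vectors=vectors)
    with graph.as_default(), tf.Session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(configs.epochs):
            for batch_data in train_loader.batch_iter():
                # get_feed_dict maps the padded batch onto the model placeholders
                feed_dict = get_feed_dict(batch_data, model, drop_rate=configs.drop_rate, mode='train')
                _, loss, step = sess.run([model.train_op, model.total_loss, model.global_step],
                                         feed_dict=feed_dict)
```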
-------------------------------------------------------------------------------- /prepare/README.md: -------------------------------------------------------------------------------- 1 | # Extract Features 2 | 3 | - We use the pre-trained 3D ConvNets ([here](https://github.com/piergiaj/pytorch-i3d)) to prepare the visual features; the 4 | extraction code is placed in this folder. Please download the pre-trained weights [`rgb_charades.pt`]( 5 | https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_charades.pt) and [`rgb_imagenet.pt`]( 6 | https://github.com/piergiaj/pytorch-i3d/blob/master/models/rgb_imagenet.pt). 7 | - The pre-trained GloVe embeddings are available [here](https://nlp.stanford.edu/projects/glove/); please download 8 | `glove.840B.300d.zip`, unzip it, and put it under the `data/` folder. 9 | 10 | ## Charades STA 11 | The train/test datasets of Charades-STA are available at [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL) 12 | ([`charades_sta_train.txt`](https://drive.google.com/file/d/1ZjG7wJpPSMIBYnW7BAG2u9VVEoNvFm5c/view) and 13 | [`charades_sta_test.txt`](https://drive.google.com/file/d/1QG4MXFkoj6JFU0YK5olTY75xTARKSW5e/view)). 14 | 15 | The `charades.json` file is required ([here](https://github.com/piergiaj/super-events-cvpr18/blob/master/data/charades.json)), 16 | which contains the video length information. Download it and place it in the same directory as the train/test datasets. 17 | 18 | The videos/images for the Charades-STA dataset are available [here](https://allenai.org/plato/charades/); please download 19 | either `RGB frames at 24fps (76 GB)` (image frames) or `Data (original size) (55 GB)` (videos). For the latter, the 20 | extractor will automatically decompose the videos into images. 21 | ```shell script 22 | # download RGB frames 23 | wget http://ai2-website.s3.amazonaws.com/data/Charades_v1_rgb.tar 24 | # or, download videos 25 | wget http://ai2-website.s3.amazonaws.com/data/Charades_v1.zip 26 | ``` 27 | 28 | Extract visual features for Charades-STA: 29 | ```shell script 30 | # use the weights fine-tuned on Charades or the weights pre-trained on ImageNet 31 | python3 extract_charades.py --use_finetuned --load_model <model_dir>/rgb_charades.pt \ # or rgb_imagenet.pt 32 | --video_dir <video_dir> \ 33 | --dataset_dir <dataset_dir> \ 34 | --images_dir <images_dir> \ # if images do not exist, decompose videos into images 35 | --save_dir <save_dir> \ 36 | --fps 24 --strides 24 --remove_images # optionally remove the extracted images to release space 37 | ``` 38 | 39 | ## TACoS 40 | The TACoS dataset is from [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL), while the videos of TACoS are from the MPII 41 | Cooking Composite Activities dataset, which can be downloaded [here]( 42 | https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/research/human-activity-recognition/mpii-cooking-composite-activities/). 43 | Note that we also use the processed TACoS dataset in [[microsoft/2D-TAN]](https://github.com/microsoft/2D-TAN). 
44 | 45 | Extract visual features for TACoS: 46 | ```shell script 47 | python3 extract_tacos.py --load_model /rgb_imagenet.pt \ 48 | --video_dir \ 49 | --dataset_dir \ 50 | --images_dir \ # if images do not exist, decompose videos into images 51 | --save_dir \ 52 | --strides 16 --remove_images # whether to remove extracted images to release space 53 | ``` 54 | 55 | (Optional) Convert the pre-trained C3D visual features from [[jiyanggao/TALL]](https://github.com/jiyanggao/TALL) 56 | ([Interval64_128_256_512_overlap0.8_c3d_fc6.tar](https://drive.google.com/file/d/1zQp0aYGFCm8PqqHOh4UtXfy2U3pJMBeu/view), 57 | [Interval128_256_overlap0.8_c3d_fc6.tar](https://drive.google.com/file/d/1zC-UrspRf42Qiu5prQw4fQrbgLQfJN-P/view)): 58 | ```shell script 59 | python3 extract_tacos_org.py --data_path \ 60 | --feature_path \ 61 | --save_dir \ 62 | --sample_rate 64 # sliding windows 63 | ``` 64 | 65 | ## ActivityNet Captions 66 | The train/test sets of ActivityNet Captions are available [here]( 67 | https://cs.stanford.edu/people/ranjaykrishna/densevid/). The videos can be downloaded using: 68 | ```shell script 69 | python3 download_activitynet_video.py --video_dir \ 70 | --dataset_dir \ 71 | --bash_file 72 | ``` 73 | This generates a bash file containing the commands to download all the videos. Suppose the generated bash file is 74 | `video_downloader.sh`; simply run `bash video_downloader.sh`, and the videos will be downloaded and saved into 75 | `video_dir` automatically. 76 | 77 | Extract visual features for ActivityNet Captions: 78 | ```shell script 79 | python3 extract_activitynet.py --load_model /rgb_imagenet.pt \ 80 | --video_dir \ 81 | --dataset_dir \ 82 | --images_dir \ # if images do not exist, decompose videos into images 83 | --save_dir \ 84 | --strides 16 --remove_images # whether to remove extracted images to release space 85 | ``` 86 | 87 | (Optional) We also provide code to convert the C3D visual features from the [ActivityNet official website]( 88 | http://activity-net.org/challenges/2016/download.html): 89 | 90 | - download the C3D visual features 91 | ```shell script 92 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-00 93 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-01 94 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-02 95 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-03 96 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-04 97 | wget http://ec2-52-25-205-214.us-west-2.compute.amazonaws.com/data/challenge16/features/c3d/activitynet_v1-3.part-05 98 | cat activitynet_v1-3.part-* > features.zip && unzip features.zip 99 | rm features.zip 100 | rm activitynet_v1-3.part-* 101 | ``` 102 | - convert the features: 103 | ```shell script 104 | python3 extract_activitynet_org.py --dataset_dir \ 105 | --hdf5_file \ 106 | --save_dir 107 | ``` 108 | -------------------------------------------------------------------------------- /util/data_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import json 4 | import pickle 5 | import numpy as np 6 | from tqdm import tqdm 7 | 8 | 9 | def load_json(filename): 10 | with open(filename, mode='r', encoding='utf-8') as f: 11 | data = json.load(f)
12 | return data 13 | 14 | 15 | def save_json(data, filename, save_pretty=False, sort_keys=False): 16 | with open(filename, mode='w', encoding='utf-8') as f: 17 | if save_pretty: 18 | f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) 19 | else: 20 | json.dump(data, f) 21 | 22 | 23 | def load_lines(filename): 24 | with open(filename, mode='r', encoding='utf-8') as f: 25 | return [e.strip("\n") for e in f.readlines()] 26 | 27 | 28 | def save_lines(data, filename): 29 | with open(filename, mode='w', encoding='utf-8') as f: 30 | f.write("\n".join(data)) 31 | 32 | 33 | def load_pickle(filename): 34 | with open(filename, mode='rb') as handle: 35 | data = pickle.load(handle) 36 | return data 37 | 38 | 39 | def save_pickle(data, filename): 40 | with open(filename, mode='wb') as handle: 41 | pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL) 42 | 43 | 44 | def load_video_features(root, max_position_length): 45 | video_features = dict() 46 | filenames = glob.glob(os.path.join(root, "*.npy")) 47 | for filename in tqdm(filenames, total=len(filenames), desc="load video features"): 48 | video_id = filename.split("/")[-1].split(".")[0] 49 | feature = np.load(filename) 50 | if max_position_length is None: 51 | video_features[video_id] = feature 52 | else: 53 | new_feature = visual_feature_sampling(feature, max_num_clips=max_position_length) 54 | video_features[video_id] = new_feature 55 | return video_features 56 | 57 | 58 | def visual_feature_sampling(visual_feature, max_num_clips): 59 | num_clips = visual_feature.shape[0] 60 | if num_clips <= max_num_clips: 61 | return visual_feature 62 | idxs = np.arange(0, max_num_clips + 1, 1.0) / max_num_clips * num_clips 63 | idxs = np.round(idxs).astype(np.int32) 64 | idxs[idxs > num_clips - 1] = num_clips - 1 65 | new_visual_feature = [] 66 | for i in range(max_num_clips): 67 | s_idx, e_idx = idxs[i], idxs[i + 1] 68 | if s_idx < e_idx: 69 | new_visual_feature.append(np.mean(visual_feature[s_idx:e_idx], axis=0)) 70 | else: 71 | new_visual_feature.append(visual_feature[s_idx]) 72 | new_visual_feature = np.asarray(new_visual_feature) 73 | return new_visual_feature 74 | 75 | 76 | def compute_overlap(pred, gt): 77 | # check format 78 | assert isinstance(pred, list) and isinstance(gt, list) 79 | pred_is_list = isinstance(pred[0], list) 80 | gt_is_list = isinstance(gt[0], list) 81 | pred = pred if pred_is_list else [pred] 82 | gt = gt if gt_is_list else [gt] 83 | # compute overlap 84 | pred, gt = np.array(pred), np.array(gt) 85 | inter_left = np.maximum(pred[:, 0, None], gt[None, :, 0]) 86 | inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1]) 87 | inter = np.maximum(0.0, inter_right - inter_left) 88 | union_left = np.minimum(pred[:, 0, None], gt[None, :, 0]) 89 | union_right = np.maximum(pred[:, 1, None], gt[None, :, 1]) 90 | union = np.maximum(1e-12, union_right - union_left) 91 | overlap = 1.0 * inter / union 92 | # reformat output 93 | overlap = overlap if gt_is_list else overlap[:, 0] 94 | overlap = overlap if pred_is_list else overlap[0] 95 | return overlap 96 | 97 | 98 | def time_to_index(start_time, end_time, num_units, duration): 99 | s_times = np.arange(0, num_units).astype(np.float32) / float(num_units) * duration 100 | e_times = np.arange(1, num_units + 1).astype(np.float32) / float(num_units) * duration 101 | candidates = np.stack([np.repeat(s_times[:, None], repeats=num_units, axis=1), 102 | np.repeat(e_times[None, :], repeats=num_units, axis=0)], axis=2).reshape((-1, 2)) 103 | overlaps = compute_overlap(candidates.tolist(), 
[start_time, end_time]).reshape(num_units, num_units) 104 | start_index = np.argmax(overlaps) // num_units 105 | end_index = np.argmax(overlaps) % num_units 106 | return start_index, end_index, overlaps 107 | 108 | 109 | def index_to_time(start_index, end_index, num_units, duration): 110 | s_times = np.arange(0, num_units).astype(np.float32) * duration / float(num_units) 111 | e_times = np.arange(1, num_units + 1).astype(np.float32) * duration / float(num_units) 112 | start_time = s_times[start_index] 113 | end_time = e_times[end_index] 114 | return start_time, end_time 115 | 116 | 117 | def pad_seq(sequences, pad_tok=None, max_length=None): 118 | if pad_tok is None: 119 | pad_tok = 0 # 0: "PAD" for words and chars, "PAD" for tags 120 | if max_length is None: 121 | max_length = max([len(seq) for seq in sequences]) 122 | sequence_padded, sequence_length = [], [] 123 | for seq in sequences: 124 | seq_ = seq[:max_length] + [pad_tok] * max(max_length - len(seq), 0) 125 | sequence_padded.append(seq_) 126 | sequence_length.append(min(len(seq), max_length)) 127 | return sequence_padded, sequence_length 128 | 129 | 130 | def pad_char_seq(sequences, max_length=None, max_length_2=None): 131 | sequence_padded, sequence_length = [], [] 132 | if max_length is None: 133 | max_length = max(map(lambda x: len(x), sequences)) 134 | if max_length_2 is None: 135 | max_length_2 = max([max(map(lambda x: len(x), seq)) for seq in sequences]) 136 | for seq in sequences: 137 | sp, sl = pad_seq(seq, max_length=max_length_2) 138 | sequence_padded.append(sp) 139 | sequence_length.append(sl) 140 | sequence_padded, _ = pad_seq(sequence_padded, pad_tok=[0] * max_length_2, max_length=max_length) 141 | sequence_length, _ = pad_seq(sequence_length, max_length=max_length) 142 | return sequence_padded, sequence_length 143 | 144 | 145 | def pad_video_seq(sequences, max_length=None): 146 | if max_length is None: 147 | max_length = max([vfeat.shape[0] for vfeat in sequences]) 148 | feature_length = sequences[0].shape[1] 149 | sequence_padded, sequence_length = [], [] 150 | for seq in sequences: 151 | add_length = max_length - seq.shape[0] 152 | sequence_length.append(seq.shape[0]) 153 | if add_length > 0: 154 | add_feature = np.zeros(shape=[add_length, feature_length], dtype=np.float32) 155 | seq_ = np.concatenate([seq, add_feature], axis=0) 156 | else: 157 | seq_ = seq 158 | sequence_padded.append(seq_) 159 | return sequence_padded, sequence_length 160 | -------------------------------------------------------------------------------- /prepare/extract_tacos.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import json 5 | import torch 6 | import argparse 7 | import subprocess 8 | import numpy as np 9 | from . 
import videotransforms 10 | from .feature_extractor import InceptionI3d 11 | from torchvision import transforms 12 | from torch.autograd import Variable 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 19 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 20 | parser.add_argument("--fps", type=float, default=None, help="frames per second") # TACoS's default fps is 29.4 21 | parser.add_argument("--video_format", type=str, default="avi", help="video format") 22 | parser.add_argument("--strides", type=int, default=16, help="window size") 23 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 24 | args = parser.parse_args() 25 | 26 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 27 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 28 | 29 | 30 | def load_images(img_dir, vid, start_frame, lengths): 31 | img_frames, raw_height, raw_width = [], None, None 32 | for x in range(start_frame, start_frame + lengths): 33 | image = cv2.imread(os.path.join(img_dir, "{}-{}.jpg".format(vid, str(x).zfill(6))))[:, :, [2, 1, 0]] 34 | width, height, channel = image.shape 35 | raw_width, raw_height = width, height 36 | # resize image 37 | scale = 1 + (224.0 - min(width, height)) / min(width, height) 38 | image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale) 39 | # normalize image to [0, 1] 40 | image = (image / 255.0) * 2 - 1 41 | img_frames.append(image) 42 | return img_frames, raw_width, raw_height 43 | 44 | 45 | def extract_features(image_tensor, model, strides): 46 | b, c, t, h, w = image_tensor.shape 47 | extracted_features = [] 48 | for start in range(0, t, strides): 49 | end = min(t - 1, start + strides) 50 | if end - start < strides: 51 | start = max(0, end - strides) 52 | ip = Variable(torch.from_numpy(image_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 53 | feature = model.extract_features(ip).data.cpu().numpy() 54 | extracted_features.append(feature) 55 | extracted_features = np.concatenate(extracted_features, axis=0) 56 | return extracted_features 57 | 58 | 59 | if not os.path.exists(args.video_dir): 60 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 61 | 62 | if not os.path.exists(args.images_dir): 63 | os.makedirs(args.images_dir) 64 | 65 | if not os.path.exists(args.save_dir): 66 | os.makedirs(args.save_dir) 67 | 68 | # create I3D model and load pre-trained model 69 | i3d_model = InceptionI3d(400, in_channels=3) 70 | i3d_model.load_state_dict(torch.load(args.load_model)) 71 | i3d_model.cuda() 72 | i3d_model.train(False) 73 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 74 | 75 | # extract images and features 76 | feature_shapes = dict() 77 | video_paths = glob.glob(os.path.join(args.video_dir, "*.{}".format(args.video_format))) 78 | for idx, video_path in enumerate(video_paths): 79 | video_id = os.path.basename(video_path)[0:-4] # remove suffix 80 | image_dir = os.path.join(args.images_dir, video_id) 81 | 82 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_paths), video_id), flush=True) 83 | 84 | if 
os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 85 | print("the visual features for video {} are exist in {}...".format(video_id, args.save_dir), flush=True) 86 | continue 87 | 88 | # extract images 89 | if os.path.exists(image_dir): 90 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 91 | else: 92 | os.makedirs(image_dir) 93 | print("extract images with fps={}...".format(args.fps), flush=True) 94 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format(video_path, image_dir, 95 | video_id), shell=True) 96 | 97 | # process extracted images 98 | print("load RGB frames...", flush=True) 99 | num_frames = len(os.listdir(image_dir)) 100 | 101 | if num_frames < 10000: 102 | frames, raw_w, raw_h = load_images(image_dir, video_id, 1, num_frames) 103 | frames = np.asarray(frames, dtype=np.float32) 104 | imgs = video_transforms(frames) 105 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 106 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 107 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 108 | 109 | print("extract visual features...", flush=True) 110 | features = extract_features(img_tensor, i3d_model, args.strides) 111 | np.save(os.path.join(args.save_dir, video_id), arr=features) 112 | print("extracted features shape: {}".format(features.shape), flush=True) 113 | feature_shapes[video_id] = features.shape[0] 114 | 115 | else: 116 | all_features = [] 117 | for start_idx in range(1, num_frames, 10000): 118 | end_idx = min(start_idx + 10000, num_frames + 1) 119 | cur_num_frames = end_idx - start_idx 120 | if cur_num_frames < args.strides: 121 | cur_num_frames = args.strides 122 | start_idx = end_idx - cur_num_frames 123 | frames, raw_w, raw_h = load_images(image_dir, video_id, start_idx, cur_num_frames) 124 | frames = np.asarray(frames, dtype=np.float32) 125 | imgs = video_transforms(frames) 126 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 127 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 128 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 129 | print("extract visual features...", flush=True) 130 | features = extract_features(img_tensor, i3d_model, args.strides) 131 | all_features.append(features) 132 | all_features = np.concatenate(all_features, axis=0) 133 | np.save(os.path.join(args.save_dir, video_id), arr=all_features) 134 | print("extracted features shape: {}".format(all_features.shape), flush=True) 135 | feature_shapes[video_id] = all_features.shape[0] 136 | 137 | if args.remove_images: 138 | # remove extract images to release memory space 139 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 140 | 141 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 142 | json.dump(feature_shapes, f) 143 | -------------------------------------------------------------------------------- /prepare/extract_activitynet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import glob 4 | import json 5 | import torch 6 | import argparse 7 | import subprocess 8 | import numpy as np 9 | from . 
import videotransforms 10 | from .feature_extractor import InceptionI3d 11 | from torchvision import transforms 12 | from torch.autograd import Variable 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--gpu_idx", type=str, default="0", help="gpu index") 16 | parser.add_argument("--load_model", type=str, required=True, help="pre-trained model") 17 | parser.add_argument("--video_dir", type=str, required=True, help="where are located the videos") 18 | parser.add_argument("--images_dir", type=str, required=True, help="where to save extracted images") 19 | parser.add_argument("--save_dir", type=str, required=True, help="where to save extracted features") 20 | parser.add_argument("--fps", type=int, default=None, help="frames per second") 21 | parser.add_argument("--video_format", type=str, default="mp4", help="video format") 22 | parser.add_argument("--strides", type=int, default=16, help="window size") 23 | parser.add_argument("--remove_images", action="store_true", help="whether remove extract images to release space") 24 | args = parser.parse_args() 25 | 26 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 27 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_idx 28 | 29 | 30 | def load_images(img_dir, vid, start_frame, lengths): 31 | img_frames, raw_height, raw_width = [], None, None 32 | for x in range(start_frame, start_frame + lengths): 33 | image = cv2.imread(os.path.join(img_dir, "{}-{}.jpg".format(vid, str(x).zfill(6))))[:, :, [2, 1, 0]] 34 | width, height, channel = image.shape 35 | raw_width, raw_height = width, height 36 | # resize image 37 | scale = 1 + (224.0 - min(width, height)) / min(width, height) 38 | image = cv2.resize(image, dsize=(0, 0), fx=scale, fy=scale) 39 | # normalize image to [0, 1] 40 | image = (image / 255.0) * 2 - 1 41 | img_frames.append(image) 42 | return img_frames, raw_width, raw_height 43 | 44 | 45 | def extract_features(image_tensor, model, strides): 46 | b, c, t, h, w = image_tensor.shape 47 | extracted_features = [] 48 | for start in range(0, t, strides): 49 | end = min(t - 1, start + strides) 50 | if end - start < strides: 51 | start = max(0, end - strides) 52 | ip = Variable(torch.from_numpy(image_tensor.numpy()[:, :, start:end]).cuda(), volatile=True) 53 | feature = model.extract_features(ip).data.cpu().numpy() 54 | extracted_features.append(feature) 55 | extracted_features = np.concatenate(extracted_features, axis=0) 56 | return extracted_features 57 | 58 | 59 | if not os.path.exists(args.video_dir): 60 | raise ValueError("The video directory '{}' does not exist!!!".format(args.video_dir)) 61 | 62 | if not os.path.exists(args.images_dir): 63 | os.makedirs(args.images_dir) 64 | 65 | if not os.path.exists(args.save_dir): 66 | os.makedirs(args.save_dir) 67 | 68 | # create I3D model and load pre-trained model 69 | i3d_model = InceptionI3d(400, in_channels=3) 70 | i3d_model.load_state_dict(torch.load(args.load_model)) 71 | i3d_model.cuda() 72 | i3d_model.train(False) 73 | video_transforms = transforms.Compose([videotransforms.CenterCrop(224)]) 74 | 75 | # extract images and features 76 | feature_shapes = dict() 77 | video_paths = glob.glob(os.path.join(args.video_dir, "*.{}".format(args.video_format))) 78 | for idx, video_path in enumerate(video_paths): 79 | video_id = os.path.basename(video_path)[0:-4] # remove suffix 80 | image_dir = os.path.join(args.images_dir, video_id) 81 | 82 | print("{} / {}: extract features for video {}".format(idx + 1, len(video_paths), video_id), flush=True) 83 | 84 | if 
os.path.exists(os.path.join(args.save_dir, "{}.npy".format(video_id))): 85 | print("the visual features for video {} are exist in {}...\n".format(video_id, args.save_dir), flush=True) 86 | continue 87 | 88 | # extract images 89 | if os.path.exists(image_dir): 90 | print("the images for video {} already are exist in {}...".format(video_id, args.images_dir)) 91 | else: 92 | os.makedirs(image_dir) 93 | print("extract images with fps={}...".format(args.fps), flush=True) 94 | if args.fps is None or args.fps <= 0: 95 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} {}/{}-%6d.jpg".format( 96 | video_path, image_dir, video_id), shell=True) 97 | else: 98 | subprocess.call("ffmpeg -hide_banner -loglevel panic -i {} -filter:v fps=fps={} {}/{}-%6d.jpg".format( 99 | video_path, args.fps, image_dir, video_id), shell=True) 100 | 101 | # process extracted images 102 | print("load RGB frames...", flush=True) 103 | num_frames = len(os.listdir(image_dir)) 104 | 105 | if num_frames < 10000: 106 | frames, raw_w, raw_h = load_images(image_dir, video_id, 1, num_frames) 107 | frames = np.asarray(frames, dtype=np.float32) 108 | imgs = video_transforms(frames) 109 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 110 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 111 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 112 | 113 | print("extract visual features...", flush=True) 114 | features = extract_features(img_tensor, i3d_model, args.strides) 115 | np.save(os.path.join(args.save_dir, video_id), arr=features) 116 | print("extracted features shape: {}".format(features.shape), flush=True) 117 | feature_shapes[video_id] = features.shape[0] 118 | 119 | else: 120 | all_features = [] 121 | for start_idx in range(1, num_frames, 10000): 122 | end_idx = min(start_idx + 10000, num_frames + 1) 123 | cur_num_frames = end_idx - start_idx 124 | if cur_num_frames < args.strides: 125 | cur_num_frames = args.strides 126 | start_idx = end_idx - cur_num_frames 127 | frames, raw_w, raw_h = load_images(image_dir, video_id, start_idx, cur_num_frames) 128 | frames = np.asarray(frames, dtype=np.float32) 129 | imgs = video_transforms(frames) 130 | img_tensor = torch.from_numpy(np.expand_dims(imgs.transpose([3, 0, 1, 2]), axis=0)) 131 | print("process images:", (frames.shape[0], raw_w, raw_h, frames.shape[-1]), "-->", frames.shape, "-->", 132 | imgs.shape, "-->", tuple(img_tensor.size()), flush=True) 133 | print("extract visual features...", flush=True) 134 | features = extract_features(img_tensor, i3d_model, args.strides) 135 | all_features.append(features) 136 | all_features = np.concatenate(all_features, axis=0) 137 | np.save(os.path.join(args.save_dir, video_id), arr=all_features) 138 | print("extracted features shape: {}".format(all_features.shape), flush=True) 139 | feature_shapes[video_id] = all_features.shape[0] 140 | 141 | if args.remove_images: 142 | # remove extract images to release memory space 143 | subprocess.call("rm -rf {}".format(image_dir), shell=True) 144 | 145 | with open(os.path.join(args.save_dir, "feature_shapes.json"), mode="w", encoding="utf-8") as f: 146 | json.dump(feature_shapes, f) 147 | -------------------------------------------------------------------------------- /model/ops.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | if tf.__version__.startswith('2'): 6 | tf = tf.compat.v1 7 | 
tf.disable_v2_behavior() 8 | tf.disable_eager_execution() 9 | regularizer = tf.keras.regularizers.l2(l2=3e-7) 10 | else: 11 | regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7) 12 | 13 | 14 | def count_params(scope=None): 15 | if scope is None: 16 | return int(np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()])) 17 | else: 18 | return int(np.sum([np.prod(v.get_shape().as_list()) for v in tf.trainable_variables(scope)])) 19 | 20 | 21 | def get_shape_list(tensor): 22 | shape = tensor.shape.as_list() 23 | non_static_indexes = [] 24 | for (index, dim) in enumerate(shape): 25 | if dim is None: 26 | non_static_indexes.append(index) 27 | if not non_static_indexes: 28 | return shape 29 | dyn_shape = tf.shape(tensor) 30 | for index in non_static_indexes: 31 | shape[index] = dyn_shape[index] 32 | return shape 33 | 34 | 35 | def mask_logits(inputs, mask, mask_value=-1e30): 36 | mask = tf.cast(mask, tf.float32) 37 | return inputs * mask + mask_value * (1.0 - mask) 38 | 39 | 40 | def trilinear_attention(args, v_maxlen, q_maxlen, drop_rate=0.0, reuse=None, name='efficient_trilinear'): 41 | assert len(args) == 2, 'just use for computing attention with two input' 42 | arg0_shape = args[0].get_shape().as_list() 43 | arg1_shape = args[1].get_shape().as_list() 44 | if len(arg0_shape) != 3 or len(arg1_shape) != 3: 45 | raise ValueError('`args` must be 3 dims (batch_size, len, dimension)') 46 | if arg0_shape[2] != arg1_shape[2]: 47 | raise ValueError('the last dimension of `args` must equal') 48 | arg_size = arg0_shape[2] 49 | dtype = args[0].dtype 50 | drop_args = [tf.nn.dropout(arg, rate=drop_rate) for arg in args] 51 | with tf.variable_scope(name, reuse=reuse): 52 | weights4arg0 = tf.get_variable('linear_kernel4arg0', [arg_size, 1], dtype=dtype, regularizer=regularizer) 53 | weights4arg1 = tf.get_variable('linear_kernel4arg1', [arg_size, 1], dtype=dtype, regularizer=regularizer) 54 | weights4mlu = tf.get_variable('linear_kernel4mul', [1, 1, arg_size], dtype=dtype, regularizer=regularizer) 55 | # compute results 56 | weights4arg0 = tf.tile(tf.expand_dims(weights4arg0, axis=0), multiples=[tf.shape(args[0])[0], 1, 1]) 57 | subres0 = tf.tile(tf.matmul(drop_args[0], weights4arg0), [1, 1, q_maxlen]) 58 | weights4arg1 = tf.tile(tf.expand_dims(weights4arg1, axis=0), multiples=[tf.shape(args[1])[0], 1, 1]) 59 | subres1 = tf.tile(tf.transpose(tf.matmul(drop_args[1], weights4arg1), perm=(0, 2, 1)), [1, v_maxlen, 1]) 60 | subres2 = tf.matmul(drop_args[0] * weights4mlu, tf.transpose(drop_args[1], perm=(0, 2, 1))) 61 | res = subres0 + subres1 + subres2 62 | return res 63 | 64 | 65 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, clip_norm=1.0): 66 | """Creates an optimizer training op.""" 67 | global_step = tf.train.get_or_create_global_step() 68 | 69 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 70 | learning_rate = tf.train.polynomial_decay(learning_rate, global_step, num_train_steps, end_learning_rate=0.0, 71 | power=1.0, cycle=False) 72 | if num_warmup_steps: 73 | global_steps_int = tf.cast(global_step, tf.int32) 74 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 75 | global_steps_float = tf.cast(global_steps_int, tf.float32) 76 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 77 | warmup_percent_done = global_steps_float / warmup_steps_float 78 | warmup_learning_rate = init_lr * warmup_percent_done 79 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 80 | learning_rate = ((1.0 - 
is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 81 | optimizer = AdamWeightDecayOptimizer(learning_rate=learning_rate, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, 82 | epsilon=1e-6, exclude_from_weight_decay=['LayerNorm', 'layer_norm', 'bias']) 83 | tvars = tf.trainable_variables() 84 | grads = tf.gradients(loss, tvars) 85 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=clip_norm) 86 | train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step) 87 | # Normally the global step update is done inside of `apply_gradients`. However, `AdamWeightDecayOptimizer` doesn't 88 | # do this. But if you use a different optimizer, you should probably take this line out. 89 | new_global_step = global_step + 1 90 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 91 | return train_op 92 | 93 | 94 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 95 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 96 | 97 | def __init__(self, learning_rate, weight_decay_rate=0.0, beta_1=0.9, beta_2=0.999, epsilon=1e-6, 98 | exclude_from_weight_decay=None, name='AdamWeightDecayOptimizer'): 99 | """Constructs a AdamWeightDecayOptimizer.""" 100 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | param_name = self._get_variable_name(param.name) 115 | m = tf.get_variable(name=param_name + '/adam_m', shape=param.shape.as_list(), dtype=tf.float32, 116 | trainable=False, initializer=tf.zeros_initializer()) 117 | v = tf.get_variable(name=param_name + '/adam_v', shape=param.shape.as_list(), dtype=tf.float32, 118 | trainable=False, initializer=tf.zeros_initializer()) 119 | next_m = (tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 120 | next_v = (tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, tf.square(grad))) 121 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 122 | if self._do_use_weight_decay(param_name): 123 | update += self.weight_decay_rate * param 124 | update_with_lr = self.learning_rate * update 125 | next_param = param - update_with_lr 126 | assignments.extend([param.assign(next_param), m.assign(next_m), v.assign(next_v)]) 127 | return tf.group(*assignments, name=name) 128 | 129 | def _do_use_weight_decay(self, param_name): 130 | """Whether to use L2 weight decay for `param_name`.""" 131 | if not self.weight_decay_rate: 132 | return False 133 | if self.exclude_from_weight_decay: 134 | for r in self.exclude_from_weight_decay: 135 | if re.search(r, param_name) is not None: 136 | return False 137 | return True 138 | 139 | @staticmethod 140 | def _get_variable_name(param_name): 141 | """Get the variable name from the tensor name.""" 142 | m = re.match("^(.*):\\d+$", param_name) 143 | if m is not None: 144 | param_name = m.group(1) 145 | return param_name 146 | 147 | def _apply_dense(self, grad, var): 148 | pass 149 | 150 | def _resource_apply_dense(self, grad, handle): 151 | pass 152 | 153 | def _resource_apply_sparse(self, grad, handle, indices): 154 | pass 155 | 156 | def _apply_sparse(self, grad, var): 157 | pass 158 
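As a quick reference for the optimizer code above: `create_optimizer` ramps the learning rate linearly from 0 to `init_lr` over the warmup steps, then decays it linearly (polynomial decay with power 1.0) to 0 over `num_train_steps`, and `AdamWeightDecayOptimizer` skips weight decay for any variable whose name matches `LayerNorm`, `layer_norm`, or `bias`. A small pure-Python sketch of that schedule (the concrete numbers are hypothetical, chosen only to illustrate its shape):

```python
def lr_at_step(step, init_lr, num_train_steps, num_warmup_steps):
    # linear warmup towards init_lr, then linear decay to zero
    if num_warmup_steps and step < num_warmup_steps:
        return init_lr * step / num_warmup_steps
    progress = min(step, num_train_steps) / num_train_steps
    return init_lr * (1.0 - progress)


init_lr, num_train_steps, num_warmup_steps = 1e-4, 10000, 1000  # illustrative values
for step in (0, 500, 1000, 5000, 10000):
    print(step, lr_at_step(step, init_lr, num_train_steps, num_warmup_steps))
# 0 -> 0.0, 500 -> 5e-05, 1000 -> 9e-05, 5000 -> 5e-05, 10000 -> 0.0
```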
| -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import tensorflow as tf 4 | from tqdm import tqdm 5 | from model.VSLNet import VSLNet 6 | from util.data_gen import gen_or_load_dataset 7 | from util.data_util import load_video_features, save_json, load_json 8 | from util.data_loader import TrainLoader, TestLoader 9 | from util.runner_utils import set_tf_config, get_feed_dict, write_tf_summary, eval_test 10 | 11 | if tf.__version__.startswith('2'): 12 | tf = tf.compat.v1 13 | tf.disable_v2_behavior() 14 | tf.disable_eager_execution() 15 | 16 | parser = argparse.ArgumentParser() 17 | # data parameters 18 | parser.add_argument('--save_dir', type=str, default='datasets', help='path to save processed dataset') 19 | parser.add_argument('--task', type=str, default='charades', help='target task') 20 | parser.add_argument('--fv', type=str, default='new', help='[new | org] for visual features') 21 | parser.add_argument('--max_pos_len', type=int, default=128, help='maximal position sequence length allowed') 22 | # model parameters 23 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 24 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 25 | parser.add_argument("--video_feature_dim", type=int, default=1024, help="video feature input dimension") 26 | parser.add_argument("--char_dim", type=int, default=50, help="character dimension, set to 100 for activitynet") 27 | parser.add_argument("--hidden_size", type=int, default=128, help="hidden size") 28 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 29 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 30 | parser.add_argument("--drop_rate", type=float, default=0.2, help="dropout rate") 31 | parser.add_argument('--predictor', type=str, default='rnn', help='[rnn | transformer]') 32 | # training/evaluation parameters 33 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 34 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 35 | parser.add_argument("--mode", type=str, default="train", help="[train | test]") 36 | parser.add_argument("--epochs", type=int, default=100, help="number of epochs") 37 | parser.add_argument("--batch_size", type=int, default=16, help="batch size") 38 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 39 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 40 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 41 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 42 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 43 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 44 | parser.add_argument('--model_dir', type=str, default='ckpt', help='path to save trained model weights') 45 | parser.add_argument('--model_name', type=str, default='vslnet', help='model name') 46 | parser.add_argument('--suffix', type=str, default=None, help='set to the last `_xxx` in ckpt repo to eval results') 47 | configs = parser.parse_args() 48 | 49 | # set tensorflow configs 50 | set_tf_config(configs.seed, configs.gpu_idx) 51 | 52 | # 
prepare or load dataset 53 | if tf.__version__.startswith('2'): 54 | configs.save_dir = 'datasets_tf2' # avoid `ValueError: unsupported pickle protocol: 5` 55 | configs.model_dir = 'ckpt_tf2' 56 | dataset = gen_or_load_dataset(configs) 57 | configs.char_size = dataset['n_chars'] 58 | 59 | # get train and test loader 60 | visual_features = load_video_features(os.path.join('data', 'features', configs.task, configs.fv), configs.max_pos_len) 61 | train_loader = TrainLoader(dataset=dataset['train_set'], visual_features=visual_features, configs=configs) 62 | test_loader = TestLoader(datasets=dataset, visual_features=visual_features, configs=configs) 63 | configs.num_train_steps = train_loader.num_batches() * configs.epochs 64 | num_train_batches = train_loader.num_batches() 65 | 66 | # create model dir 67 | home_dir = os.path.join(configs.model_dir, '_'.join([configs.model_name, configs.task, configs.fv, 68 | str(configs.max_pos_len), configs.predictor])) 69 | if configs.suffix is not None: 70 | home_dir = home_dir + '_' + configs.suffix 71 | log_dir = os.path.join(home_dir, "event") 72 | model_dir = os.path.join(home_dir, "model") 73 | 74 | # train and test 75 | if configs.mode.lower() == 'train': 76 | if not os.path.exists(model_dir): 77 | os.makedirs(model_dir) 78 | if not os.path.exists(log_dir): 79 | os.makedirs(log_dir) 80 | eval_period = num_train_batches // 2 81 | save_json(vars(configs), os.path.join(model_dir, 'configs.json'), sort_keys=True, save_pretty=True) 82 | with tf.Graph().as_default() as graph: 83 | model = VSLNet(configs, graph=graph, vectors=dataset['word_vector']) 84 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 85 | sess_config.gpu_options.allow_growth = True 86 | with tf.Session(config=sess_config) as sess: 87 | saver = tf.train.Saver(max_to_keep=3) 88 | writer = tf.summary.FileWriter(log_dir) 89 | sess.run(tf.global_variables_initializer()) 90 | best_r1i7 = -1.0 91 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 92 | for epoch in range(configs.epochs): 93 | for data in tqdm(train_loader.batch_iter(), total=num_train_batches, desc='Epoch %3d / 3%d' % ( 94 | epoch + 1, configs.epochs)): 95 | # run the model 96 | feed_dict = get_feed_dict(data, model, drop_rate=configs.drop_rate) 97 | _, loss, h_loss, global_step = sess.run([model.train_op, model.loss, model.highlight_loss, 98 | model.global_step], feed_dict=feed_dict) 99 | if global_step % configs.period == 0: 100 | write_tf_summary(writer, [("train/loss", loss), ("train/highlight_loss", h_loss)], global_step) 101 | # evaluate 102 | if global_step % eval_period == 0 or global_step % num_train_batches == 0: 103 | r1i3, r1i5, r1i7, mi, value_pairs, score_str = eval_test( 104 | sess=sess, model=model, data_loader=test_loader, epoch=epoch + 1, global_step=global_step, 105 | mode="test") 106 | print('\nEpoch: %2d | Step: %5d | r1i3: %.2f | r1i5: %.2f | r1i7: %.2f | mIoU: %.2f' % ( 107 | epoch + 1, global_step, r1i3, r1i5, r1i7, mi), flush=True) 108 | write_tf_summary(writer, value_pairs, global_step) 109 | score_writer.write(score_str) 110 | score_writer.flush() 111 | if r1i7 > best_r1i7: 112 | best_r1i7 = r1i7 113 | filename = os.path.join(model_dir, "{}_{}.ckpt".format(configs.model_name, global_step)) 114 | saver.save(sess, filename) 115 | score_writer.close() 116 | 117 | elif configs.mode.lower() == 'test': 118 | if not os.path.exists(model_dir): 119 | raise ValueError('No pre-trained weights exist') 120 | # load previous configs 121 | 
pre_configs = load_json(os.path.join(model_dir, "configs.json")) 122 | parser.set_defaults(**pre_configs) 123 | configs = parser.parse_args() 124 | with tf.Graph().as_default() as graph: 125 | model = VSLNet(configs, graph=graph, vectors=dataset['word_vector']) 126 | sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 127 | sess_config.gpu_options.allow_growth = True 128 | with tf.Session(config=sess_config) as sess: 129 | saver = tf.train.Saver() 130 | sess.run(tf.global_variables_initializer()) 131 | saver.restore(sess, tf.train.latest_checkpoint(model_dir)) 132 | r1i3, r1i5, r1i7, mi, *_ = eval_test(sess, model, data_loader=test_loader, mode="test") 133 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 134 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 135 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 136 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi) + "\x1b[0m", flush=True) 137 | 138 | else: 139 | raise ValueError("Unknown mode {}!!!".format(configs.mode)) 140 | -------------------------------------------------------------------------------- /main_t7.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | from tqdm import tqdm 6 | from model.VSLNet_t7 import VSLNet, build_optimizer_and_scheduler 7 | from util.data_util import load_video_features, save_json, load_json 8 | from util.data_gen import gen_or_load_dataset 9 | from util.data_loader_t7 import get_train_loader, get_test_loader 10 | from util.runner_utils_t7 import set_th_config, convert_length_to_mask, eval_test, filter_checkpoints, \ 11 | get_last_checkpoint 12 | 13 | parser = argparse.ArgumentParser() 14 | # data parameters 15 | parser.add_argument('--save_dir', type=str, default='datasets_t7', help='path to save processed dataset') 16 | parser.add_argument('--task', type=str, default='charades', help='target task') 17 | parser.add_argument('--fv', type=str, default='new', help='[new | org] for visual features') 18 | parser.add_argument('--max_pos_len', type=int, default=128, help='maximal position sequence length allowed') 19 | # model parameters 20 | parser.add_argument("--word_size", type=int, default=None, help="number of words") 21 | parser.add_argument("--char_size", type=int, default=None, help="number of characters") 22 | parser.add_argument("--word_dim", type=int, default=300, help="word embedding dimension") 23 | parser.add_argument("--video_feature_dim", type=int, default=1024, help="video feature input dimension") 24 | parser.add_argument("--char_dim", type=int, default=50, help="character dimension, set to 100 for activitynet") 25 | parser.add_argument("--dim", type=int, default=128, help="hidden size") 26 | parser.add_argument("--highlight_lambda", type=float, default=5.0, help="lambda for highlight region") 27 | parser.add_argument("--num_heads", type=int, default=8, help="number of heads") 28 | parser.add_argument("--drop_rate", type=float, default=0.2, help="dropout rate") 29 | parser.add_argument('--predictor', type=str, default='rnn', help='[rnn | transformer]') 30 | # training/evaluation parameters 31 | parser.add_argument("--gpu_idx", type=str, default="0", help="GPU index") 32 | parser.add_argument("--seed", type=int, default=12345, help="random seed") 33 | parser.add_argument("--mode", type=str, default="train", 
help="[train | test]") 34 | parser.add_argument("--epochs", type=int, default=100, help="number of epochs") 35 | parser.add_argument("--batch_size", type=int, default=16, help="batch size") 36 | parser.add_argument("--num_train_steps", type=int, default=None, help="number of training steps") 37 | parser.add_argument("--init_lr", type=float, default=0.0001, help="initial learning rate") 38 | parser.add_argument("--clip_norm", type=float, default=1.0, help="gradient clip norm") 39 | parser.add_argument("--warmup_proportion", type=float, default=0.0, help="warmup proportion") 40 | parser.add_argument("--extend", type=float, default=0.1, help="highlight region extension") 41 | parser.add_argument("--period", type=int, default=100, help="training loss print period") 42 | parser.add_argument('--model_dir', type=str, default='ckpt_t7', help='path to save trained model weights') 43 | parser.add_argument('--model_name', type=str, default='vslnet', help='model name') 44 | parser.add_argument('--suffix', type=str, default=None, help='set to the last `_xxx` in ckpt repo to eval results') 45 | configs = parser.parse_args() 46 | 47 | # set tensorflow configs 48 | set_th_config(configs.seed) 49 | 50 | # prepare or load dataset 51 | dataset = gen_or_load_dataset(configs) 52 | configs.char_size = dataset['n_chars'] 53 | configs.word_size = dataset['n_words'] 54 | 55 | # get train and test loader 56 | visual_features = load_video_features(os.path.join('data', 'features', configs.task, configs.fv), configs.max_pos_len) 57 | train_loader = get_train_loader(dataset=dataset['train_set'], video_features=visual_features, configs=configs) 58 | val_loader = None if dataset['val_set'] is None else get_test_loader(dataset['val_set'], visual_features, configs) 59 | test_loader = get_test_loader(dataset=dataset['test_set'], video_features=visual_features, configs=configs) 60 | configs.num_train_steps = len(train_loader) * configs.epochs 61 | num_train_batches = len(train_loader) 62 | num_val_batches = 0 if val_loader is None else len(val_loader) 63 | num_test_batches = len(test_loader) 64 | 65 | # Device configuration 66 | cuda_str = 'cuda' if configs.gpu_idx is None else 'cuda:{}'.format(configs.gpu_idx) 67 | device = torch.device(cuda_str if torch.cuda.is_available() else 'cpu') 68 | 69 | # create model dir 70 | home_dir = os.path.join(configs.model_dir, '_'.join([configs.model_name, configs.task, configs.fv, 71 | str(configs.max_pos_len), configs.predictor])) 72 | if configs.suffix is not None: 73 | home_dir = home_dir + '_' + configs.suffix 74 | model_dir = os.path.join(home_dir, "model") 75 | 76 | # train and test 77 | if configs.mode.lower() == 'train': 78 | if not os.path.exists(model_dir): 79 | os.makedirs(model_dir) 80 | eval_period = num_train_batches // 2 81 | save_json(vars(configs), os.path.join(model_dir, 'configs.json'), sort_keys=True, save_pretty=True) 82 | # build model 83 | model = VSLNet(configs=configs, word_vectors=dataset['word_vector']).to(device) 84 | optimizer, scheduler = build_optimizer_and_scheduler(model, configs=configs) 85 | # start training 86 | best_r1i7 = -1.0 87 | score_writer = open(os.path.join(model_dir, "eval_results.txt"), mode="w", encoding="utf-8") 88 | print('start training...', flush=True) 89 | global_step = 0 90 | for epoch in range(configs.epochs): 91 | model.train() 92 | for data in tqdm(train_loader, total=num_train_batches, desc='Epoch %3d / %3d' % (epoch + 1, configs.epochs)): 93 | global_step += 1 94 | _, vfeats, vfeat_lens, word_ids, char_ids, s_labels, e_labels, 
h_labels = data 95 | # prepare features 96 | vfeats, vfeat_lens = vfeats.to(device), vfeat_lens.to(device) 97 | word_ids, char_ids = word_ids.to(device), char_ids.to(device) 98 | s_labels, e_labels, h_labels = s_labels.to(device), e_labels.to(device), h_labels.to(device) 99 | # generate mask 100 | query_mask = (torch.zeros_like(word_ids) != word_ids).float().to(device) 101 | video_mask = convert_length_to_mask(vfeat_lens).to(device) 102 | # compute logits 103 | h_score, start_logits, end_logits = model(word_ids, char_ids, vfeats, video_mask, query_mask) 104 | # compute loss 105 | highlight_loss = model.compute_highlight_loss(h_score, h_labels, video_mask) 106 | loc_loss = model.compute_loss(start_logits, end_logits, s_labels, e_labels) 107 | total_loss = loc_loss + configs.highlight_lambda * highlight_loss 108 | # compute and apply gradients 109 | optimizer.zero_grad() 110 | total_loss.backward() 111 | nn.utils.clip_grad_norm_(model.parameters(), configs.clip_norm) # clip gradient 112 | optimizer.step() 113 | scheduler.step() 114 | # evaluate 115 | if global_step % eval_period == 0 or global_step % num_train_batches == 0: 116 | model.eval() 117 | r1i3, r1i5, r1i7, mi, score_str = eval_test(model=model, data_loader=test_loader, device=device, 118 | mode='test', epoch=epoch + 1, global_step=global_step) 119 | print('\nEpoch: %2d | Step: %5d | r1i3: %.2f | r1i5: %.2f | r1i7: %.2f | mIoU: %.2f' % ( 120 | epoch + 1, global_step, r1i3, r1i5, r1i7, mi), flush=True) 121 | score_writer.write(score_str) 122 | score_writer.flush() 123 | if r1i7 >= best_r1i7: 124 | best_r1i7 = r1i7 125 | torch.save(model.state_dict(), os.path.join(model_dir, '{}_{}.t7'.format(configs.model_name, 126 | global_step))) 127 | # only keep the top-3 model checkpoints 128 | filter_checkpoints(model_dir, suffix='t7', max_to_keep=3) 129 | model.train() 130 | score_writer.close() 131 | 132 | elif configs.mode.lower() == 'test': 133 | if not os.path.exists(model_dir): 134 | raise ValueError('No pre-trained weights exist') 135 | # load previous configs 136 | pre_configs = load_json(os.path.join(model_dir, "configs.json")) 137 | parser.set_defaults(**pre_configs) 138 | configs = parser.parse_args() 139 | # build model 140 | model = VSLNet(configs=configs, word_vectors=dataset['word_vector']).to(device) 141 | # get last checkpoint file 142 | filename = get_last_checkpoint(model_dir, suffix='t7') 143 | model.load_state_dict(torch.load(filename)) 144 | model.eval() 145 | r1i3, r1i5, r1i7, mi, _ = eval_test(model=model, data_loader=test_loader, device=device, mode='test') 146 | print("\n" + "\x1b[1;31m" + "Rank@1, IoU=0.3:\t{:.2f}".format(r1i3) + "\x1b[0m", flush=True) 147 | print("\x1b[1;31m" + "Rank@1, IoU=0.5:\t{:.2f}".format(r1i5) + "\x1b[0m", flush=True) 148 | print("\x1b[1;31m" + "Rank@1, IoU=0.7:\t{:.2f}".format(r1i7) + "\x1b[0m", flush=True) 149 | print("\x1b[1;31m" + "{}:\t{:.2f}".format("mean IoU".ljust(15), mi) + "\x1b[0m", flush=True) 150 | -------------------------------------------------------------------------------- /util/data_gen.py: -------------------------------------------------------------------------------- 1 | import os 2 | import codecs 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from nltk.tokenize import word_tokenize 7 | from util.data_util import load_json, load_lines, load_pickle, save_pickle, time_to_index 8 | 9 | PAD, UNK = "", "" 10 | 11 | 12 | class CharadesProcessor: 13 | def __init__(self): 14 | super(CharadesProcessor, self).__init__() 15 | 
self.idx_counter = 0 16 | 17 | def reset_idx_counter(self): 18 | self.idx_counter = 0 19 | 20 | def process_data(self, data, charades, scope): 21 | results = [] 22 | for line in tqdm(data, total=len(data), desc='process charades-sta {}'.format(scope)): 23 | line = line.lstrip().rstrip() 24 | if len(line) == 0: 25 | continue 26 | video_info, sentence = line.split('##') 27 | vid, start_time, end_time = video_info.split(' ') 28 | duration = float(charades[vid]['duration']) 29 | start_time = max(0.0, float(start_time)) 30 | end_time = min(float(end_time), duration) 31 | words = word_tokenize(sentence.strip().lower(), language="english") 32 | record = {'sample_id': self.idx_counter, 'vid': str(vid), 's_time': start_time, 'e_time': end_time, 33 | 'duration': duration, 'words': words} 34 | results.append(record) 35 | self.idx_counter += 1 36 | return results 37 | 38 | def convert(self, data_dir): 39 | self.reset_idx_counter() 40 | if not os.path.exists(data_dir): 41 | raise ValueError('data dir {} does not exist'.format(data_dir)) 42 | # load raw data 43 | charades = load_json(os.path.join(data_dir, 'charades.json')) 44 | train_data = load_lines(os.path.join(data_dir, 'charades_sta_train.txt')) 45 | test_data = load_lines(os.path.join(data_dir, 'charades_sta_test.txt')) 46 | # process data 47 | train_set = self.process_data(train_data, charades, scope='train') 48 | test_set = self.process_data(test_data, charades, scope='test') 49 | return train_set, None, test_set # train/val/test 50 | 51 | 52 | class ActivityNetProcessor: 53 | def __init__(self): 54 | super(ActivityNetProcessor, self).__init__() 55 | self.idx_counter = 0 56 | 57 | def reset_idx_counter(self): 58 | self.idx_counter = 0 59 | 60 | def process_data(self, data, scope): 61 | results = [] 62 | for vid, data_item in tqdm(data.items(), total=len(data), desc='process activitynet {}'.format(scope)): 63 | duration = float(data_item['duration']) 64 | for timestamp, sentence in zip(data_item["timestamps"], data_item["sentences"]): 65 | start_time = max(0.0, float(timestamp[0])) 66 | end_time = min(float(timestamp[1]), duration) 67 | words = word_tokenize(sentence.strip().lower(), language="english") 68 | record = {'sample_id': self.idx_counter, 'vid': str(vid), 's_time': start_time, 'e_time': end_time, 69 | 'duration': duration, 'words': words} 70 | results.append(record) 71 | self.idx_counter += 1 72 | return results 73 | 74 | def convert(self, data_dir): 75 | self.reset_idx_counter() 76 | if not os.path.exists(data_dir): 77 | raise ValueError('data dir {} does not exist'.format(data_dir)) 78 | # load raw data 79 | train_data = load_json(os.path.join(data_dir, 'train.json')) 80 | val_data = load_json(os.path.join(data_dir, 'val_2.json')) 81 | test_data = load_json(os.path.join(data_dir, 'val_1.json')) 82 | # process data 83 | train_set = self.process_data(train_data, scope='train') 84 | val_set = self.process_data(val_data, scope='val') 85 | test_set = self.process_data(test_data, scope='test') 86 | return train_set, val_set, test_set 87 | 88 | 89 | class TACoSProcessor: 90 | def __init__(self): 91 | super(TACoSProcessor, self).__init__() 92 | self.idx_counter = 0 93 | 94 | def reset_idx_counter(self): 95 | self.idx_counter = 0 96 | 97 | def process_data_tan(self, data, scope): 98 | results = [] 99 | for vid, data_item in tqdm(data.items(), total=len(data), desc='process tacos {}'.format(scope)): 100 | if vid.endswith('.avi'): 101 | vid = vid[0:-4] 102 | fps = float(data_item['fps']) 103 | duration = float(data_item['num_frames']) / fps 104 
| for timestamp, sentence in zip(data_item['timestamps'], data_item['sentences']): 105 | start_time = max(0.0, float(timestamp[0]) / fps) 106 | end_time = min(float(timestamp[1]) / fps, duration) 107 | words = word_tokenize(sentence.strip().lower(), language="english") 108 | record = {'sample_id': self.idx_counter, 'vid': str(vid), 's_time': start_time, 'e_time': end_time, 109 | 'duration': duration, 'words': words} 110 | results.append(record) 111 | self.idx_counter += 1 112 | return results 113 | 114 | def convert(self, data_dir): 115 | self.reset_idx_counter() 116 | if not os.path.exists(data_dir): 117 | raise ValueError('data dir {} does not exist'.format(data_dir)) 118 | # load raw data 119 | train_data = load_json(os.path.join(data_dir, 'train.json')) 120 | val_data = load_json(os.path.join(data_dir, 'val.json')) 121 | test_data = load_json(os.path.join(data_dir, 'test.json')) 122 | # process data 123 | train_set = self.process_data_tan(train_data, scope='train') 124 | val_set = self.process_data_tan(val_data, scope='val') 125 | test_set = self.process_data_tan(test_data, scope='test') 126 | return train_set, val_set, test_set 127 | 128 | 129 | def load_glove(glove_path): 130 | vocab = list() 131 | with codecs.open(glove_path, mode="r", encoding="utf-8") as f: 132 | for line in tqdm(f, total=2196018, desc="load glove vocabulary"): 133 | line = line.lstrip().rstrip().split(" ") 134 | if len(line) == 2 or len(line) != 301: 135 | continue 136 | word = line[0] 137 | vocab.append(word) 138 | return set(vocab) 139 | 140 | 141 | def filter_glove_embedding(word_dict, glove_path): 142 | vectors = np.zeros(shape=[len(word_dict), 300], dtype=np.float32) 143 | with codecs.open(glove_path, mode="r", encoding="utf-8") as f: 144 | for line in tqdm(f, total=2196018, desc="load glove embeddings"): 145 | line = line.lstrip().rstrip().split(" ") 146 | if len(line) == 2 or len(line) != 301: 147 | continue 148 | word = line[0] 149 | if word in word_dict: 150 | vector = [float(x) for x in line[1:]] 151 | word_index = word_dict[word] 152 | vectors[word_index] = np.asarray(vector) 153 | return np.asarray(vectors) 154 | 155 | 156 | def vocab_emb_gen(datasets, emb_path): 157 | # generate word dict and vectors 158 | emb_vocab = load_glove(emb_path) 159 | word_counter, char_counter = Counter(), Counter() 160 | for data in datasets: 161 | for record in data: 162 | for word in record['words']: 163 | word_counter[word] += 1 164 | for char in list(word): 165 | char_counter[char] += 1 166 | word_vocab = list() 167 | for word, _ in word_counter.most_common(): 168 | if word in emb_vocab: 169 | word_vocab.append(word) 170 | tmp_word_dict = dict([(word, index) for index, word in enumerate(word_vocab)]) 171 | vectors = filter_glove_embedding(tmp_word_dict, emb_path) 172 | word_vocab = [PAD, UNK] + word_vocab 173 | word_dict = dict([(word, idx) for idx, word in enumerate(word_vocab)]) 174 | # generate character dict 175 | char_vocab = [PAD, UNK] + [char for char, count in char_counter.most_common() if count >= 5] 176 | char_dict = dict([(char, idx) for idx, char in enumerate(char_vocab)]) 177 | return word_dict, char_dict, vectors 178 | 179 | 180 | def dataset_gen(data, vfeat_lens, word_dict, char_dict, max_pos_len, scope): 181 | dataset = list() 182 | for record in tqdm(data, total=len(data), desc='process {} data'.format(scope)): 183 | vid = record['vid'] 184 | if vid not in vfeat_lens: 185 | continue 186 | s_ind, e_ind, _ = time_to_index(record['s_time'], record['e_time'], vfeat_lens[vid], record['duration']) 187 | 
word_ids, char_ids = [], [] 188 | for word in record['words'][0:max_pos_len]: 189 | word_id = word_dict[word] if word in word_dict else word_dict[UNK] 190 | char_id = [char_dict[char] if char in char_dict else char_dict[UNK] for char in word] 191 | word_ids.append(word_id) 192 | char_ids.append(char_id) 193 | result = {'sample_id': record['sample_id'], 'vid': record['vid'], 's_time': record['s_time'], 194 | 'e_time': record['e_time'], 'duration': record['duration'], 'words': record['words'], 195 | 's_ind': int(s_ind), 'e_ind': int(e_ind), 'v_len': vfeat_lens[vid], 'w_ids': word_ids, 196 | 'c_ids': char_ids} 197 | dataset.append(result) 198 | return dataset 199 | 200 | 201 | def gen_or_load_dataset(configs): 202 | if not os.path.exists(configs.save_dir): 203 | os.makedirs(configs.save_dir) 204 | data_dir = os.path.join('data', 'dataset', configs.task) 205 | feature_dir = os.path.join('data', 'features', configs.task, configs.fv) 206 | if configs.suffix is None: 207 | save_path = os.path.join(configs.save_dir, '_'.join([configs.task, configs.fv, str(configs.max_pos_len)]) + 208 | '.pkl') 209 | else: 210 | save_path = os.path.join(configs.save_dir, '_'.join([configs.task, configs.fv, str(configs.max_pos_len), 211 | configs.suffix]) + '.pkl') 212 | if os.path.exists(save_path): 213 | dataset = load_pickle(save_path) 214 | return dataset 215 | feat_len_path = os.path.join(feature_dir, 'feature_shapes.json') 216 | emb_path = os.path.join('data', 'features', 'glove.840B.300d.txt') 217 | # load video feature length 218 | vfeat_lens = load_json(feat_len_path) 219 | for vid, vfeat_len in vfeat_lens.items(): 220 | vfeat_lens[vid] = min(configs.max_pos_len, vfeat_len) 221 | # load data 222 | if configs.task == 'charades': 223 | processor = CharadesProcessor() 224 | elif configs.task == 'activitynet': 225 | processor = ActivityNetProcessor() 226 | elif configs.task == 'tacos': 227 | processor = TACoSProcessor() 228 | else: 229 | raise ValueError('Unknown task {}!!!'.format(configs.task)) 230 | train_data, val_data, test_data = processor.convert(data_dir) 231 | # generate dataset 232 | data_list = [train_data, test_data] if val_data is None else [train_data, val_data, test_data] 233 | word_dict, char_dict, vectors = vocab_emb_gen(data_list, emb_path) 234 | train_set = dataset_gen(train_data, vfeat_lens, word_dict, char_dict, configs.max_pos_len, 'train') 235 | val_set = None if val_data is None else dataset_gen(val_data, vfeat_lens, word_dict, char_dict, 236 | configs.max_pos_len, 'val') 237 | test_set = dataset_gen(test_data, vfeat_lens, word_dict, char_dict, configs.max_pos_len, 'test') 238 | # save dataset 239 | n_val = 0 if val_set is None else len(val_set) 240 | dataset = {'train_set': train_set, 'val_set': val_set, 'test_set': test_set, 'word_dict': word_dict, 241 | 'char_dict': char_dict, 'word_vector': vectors, 'n_train': len(train_set), 'n_val': n_val, 242 | 'n_test': len(test_set), 'n_words': len(word_dict), 'n_chars': len(char_dict)} 243 | save_pickle(dataset, save_path) 244 | return dataset 245 | -------------------------------------------------------------------------------- /prepare/feature_extractor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Downloaded from https://github.com/piergiaj/pytorch-i3d/blob/master/pytorch_i3d.py 3 | Minor modification are applied to fit our requirements 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | 10 | class MaxPool3dSamePadding(nn.MaxPool3d): 11 | 12 | def 
compute_pad(self, dim, s): 13 | if s % self.stride[dim] == 0: 14 | return max(self.kernel_size[dim] - self.stride[dim], 0) 15 | else: 16 | return max(self.kernel_size[dim] - (s % self.stride[dim]), 0) 17 | 18 | def forward(self, x): 19 | # compute 'same' padding 20 | (batch, channel, t, h, w) = x.size() 21 | pad_t = self.compute_pad(0, t) 22 | pad_h = self.compute_pad(1, h) 23 | pad_w = self.compute_pad(2, w) 24 | 25 | pad_t_f = pad_t // 2 26 | pad_t_b = pad_t - pad_t_f 27 | pad_h_f = pad_h // 2 28 | pad_h_b = pad_h - pad_h_f 29 | pad_w_f = pad_w // 2 30 | pad_w_b = pad_w - pad_w_f 31 | 32 | pad = [pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b] 33 | x = F.pad(x, pad) 34 | return super(MaxPool3dSamePadding, self).forward(x) 35 | 36 | 37 | class Unit3D(nn.Module): 38 | 39 | def __init__(self, in_channels, 40 | output_channels, 41 | kernel_shape=(1, 1, 1), 42 | stride=(1, 1, 1), 43 | padding=0, 44 | activation_fn=None, 45 | use_batch_norm=True, 46 | use_bias=False, 47 | name='unit_3d'): 48 | 49 | """Initializes Unit3D module.""" 50 | super(Unit3D, self).__init__() 51 | 52 | self._output_channels = output_channels 53 | self._kernel_shape = kernel_shape 54 | self._stride = stride 55 | self._use_batch_norm = use_batch_norm 56 | self._activation_fn = activation_fn 57 | self._use_bias = use_bias 58 | self.name = name 59 | self.padding = padding 60 | 61 | self.conv3d = nn.Conv3d(in_channels=in_channels, 62 | out_channels=self._output_channels, 63 | kernel_size=self._kernel_shape, 64 | stride=self._stride, 65 | padding=0, 66 | # we always want padding to be 0 here. We will dynamically pad based on input size 67 | # in forward function 68 | bias=self._use_bias) 69 | 70 | if self._use_batch_norm: 71 | self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01) 72 | 73 | def compute_pad(self, dim, s): 74 | if s % self._stride[dim] == 0: 75 | return max(self._kernel_shape[dim] - self._stride[dim], 0) 76 | else: 77 | return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0) 78 | 79 | def forward(self, x): 80 | # compute 'same' padding 81 | (batch, channel, t, h, w) = x.size() 82 | pad_t = self.compute_pad(0, t) 83 | pad_h = self.compute_pad(1, h) 84 | pad_w = self.compute_pad(2, w) 85 | 86 | pad_t_f = pad_t // 2 87 | pad_t_b = pad_t - pad_t_f 88 | pad_h_f = pad_h // 2 89 | pad_h_b = pad_h - pad_h_f 90 | pad_w_f = pad_w // 2 91 | pad_w_b = pad_w - pad_w_f 92 | 93 | pad = [pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b] 94 | x = F.pad(x, pad) 95 | 96 | x = self.conv3d(x) 97 | if self._use_batch_norm: 98 | x = self.bn(x) 99 | if self._activation_fn is not None: 100 | x = self._activation_fn(x) 101 | return x 102 | 103 | 104 | class InceptionModule(nn.Module): 105 | def __init__(self, in_channels, out_channels, name): 106 | super(InceptionModule, self).__init__() 107 | 108 | self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0, 109 | activation_fn=F.relu, name=name + '/Branch_0/Conv3d_0a_1x1') 110 | self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0, 111 | activation_fn=F.relu, name=name + '/Branch_1/Conv3d_0a_1x1') 112 | self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3], 113 | activation_fn=F.relu, name=name + '/Branch_1/Conv3d_0b_3x3') 114 | self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0, 115 | activation_fn=F.relu, name=name + 
'/Branch_2/Conv3d_0a_1x1') 116 | self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3], 117 | activation_fn=F.relu, name=name + '/Branch_2/Conv3d_0b_3x3') 118 | self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3], 119 | stride=(1, 1, 1), padding=0) 120 | self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0, 121 | activation_fn=F.relu, name=name + '/Branch_3/Conv3d_0b_1x1') 122 | self.name = name 123 | 124 | def forward(self, x): 125 | b0 = self.b0(x) 126 | b1 = self.b1b(self.b1a(x)) 127 | b2 = self.b2b(self.b2a(x)) 128 | b3 = self.b3b(self.b3a(x)) 129 | return torch.cat([b0, b1, b2, b3], dim=1) 130 | 131 | 132 | class InceptionI3d(nn.Module): 133 | """Inception-v1 I3D architecture. 134 | The model is introduced in: 135 | Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset 136 | Joao Carreira, Andrew Zisserman 137 | https://arxiv.org/pdf/1705.07750v1.pdf. 138 | See also the Inception architecture, introduced in: 139 | Going deeper with convolutions 140 | Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, 141 | Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. 142 | http://arxiv.org/pdf/1409.4842v1.pdf. 143 | """ 144 | 145 | # Endpoints of the model in order. During construction, all the endpoints up 146 | # to a designated `final_endpoint` are returned in a dictionary as the 147 | # second return value. 148 | VALID_ENDPOINTS = ( 149 | 'Conv3d_1a_7x7', 150 | 'MaxPool3d_2a_3x3', 151 | 'Conv3d_2b_1x1', 152 | 'Conv3d_2c_3x3', 153 | 'MaxPool3d_3a_3x3', 154 | 'Mixed_3b', 155 | 'Mixed_3c', 156 | 'MaxPool3d_4a_3x3', 157 | 'Mixed_4b', 158 | 'Mixed_4c', 159 | 'Mixed_4d', 160 | 'Mixed_4e', 161 | 'Mixed_4f', 162 | 'MaxPool3d_5a_2x2', 163 | 'Mixed_5b', 164 | 'Mixed_5c', 165 | 'Logits', 166 | 'Predictions', 167 | ) 168 | 169 | def __init__(self, num_classes=400, spatial_squeeze=True, 170 | final_endpoint='Logits', name='inception_i3d', in_channels=3, dropout_keep_prob=0.5): 171 | """Initializes I3D model instance. 172 | Args: 173 | num_classes: The number of outputs in the logit layer (default 400, which 174 | matches the Kinetics dataset). 175 | spatial_squeeze: Whether to squeeze the spatial dimensions for the logits 176 | before returning (default True). 177 | final_endpoint: The model contains many possible endpoints. 178 | `final_endpoint` specifies the last endpoint for the model to be built 179 | up to. In addition to the output at `final_endpoint`, all the outputs 180 | at endpoints up to `final_endpoint` will also be returned, in a 181 | dictionary. `final_endpoint` must be one of 182 | InceptionI3d.VALID_ENDPOINTS (default 'Logits'). 183 | name: A string (optional). The name of this module. 184 | Raises: 185 | ValueError: if `final_endpoint` is not recognized. 
186 | """ 187 | 188 | if final_endpoint not in self.VALID_ENDPOINTS: 189 | raise ValueError('Unknown final endpoint %s' % final_endpoint) 190 | 191 | super(InceptionI3d, self).__init__() 192 | self._num_classes = num_classes 193 | self._spatial_squeeze = spatial_squeeze 194 | self._final_endpoint = final_endpoint 195 | self.logits = None 196 | 197 | if self._final_endpoint not in self.VALID_ENDPOINTS: 198 | raise ValueError('Unknown final endpoint %s' % self._final_endpoint) 199 | 200 | self.end_points = {} 201 | end_point = 'Conv3d_1a_7x7' 202 | self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7], 203 | activation_fn=F.relu, stride=(2, 2, 2), padding=3, # padding=(3, 3, 3), 204 | name=name + end_point) 205 | if self._final_endpoint == end_point: 206 | return 207 | 208 | end_point = 'MaxPool3d_2a_3x3' 209 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 210 | padding=0) 211 | if self._final_endpoint == end_point: 212 | return 213 | 214 | end_point = 'Conv3d_2b_1x1' 215 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0, 216 | activation_fn=F.relu, name=name + end_point) 217 | if self._final_endpoint == end_point: 218 | return 219 | 220 | end_point = 'Conv3d_2c_3x3' 221 | self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1, 222 | activation_fn=F.relu, name=name + end_point) 223 | if self._final_endpoint == end_point: 224 | return 225 | 226 | end_point = 'MaxPool3d_3a_3x3' 227 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2), 228 | padding=0) 229 | if self._final_endpoint == end_point: 230 | return 231 | 232 | end_point = 'Mixed_3b' 233 | self.end_points[end_point] = InceptionModule(192, [64, 96, 128, 16, 32, 32], name + end_point) 234 | if self._final_endpoint == end_point: 235 | return 236 | 237 | end_point = 'Mixed_3c' 238 | self.end_points[end_point] = InceptionModule(256, [128, 128, 192, 32, 96, 64], name + end_point) 239 | if self._final_endpoint == end_point: 240 | return 241 | 242 | end_point = 'MaxPool3d_4a_3x3' 243 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2), 244 | padding=0) 245 | if self._final_endpoint == end_point: 246 | return 247 | 248 | end_point = 'Mixed_4b' 249 | self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point) 250 | if self._final_endpoint == end_point: 251 | return 252 | 253 | end_point = 'Mixed_4c' 254 | self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point) 255 | if self._final_endpoint == end_point: 256 | return 257 | 258 | end_point = 'Mixed_4d' 259 | self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point) 260 | if self._final_endpoint == end_point: 261 | return 262 | 263 | end_point = 'Mixed_4e' 264 | self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point) 265 | if self._final_endpoint == end_point: 266 | return 267 | 268 | end_point = 'Mixed_4f' 269 | self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], 270 | name + end_point) 271 | if self._final_endpoint == end_point: 272 | return 273 | 274 | end_point = 'MaxPool3d_5a_2x2' 275 | self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 
2], stride=(2, 2, 2), 276 | padding=0) 277 | if self._final_endpoint == end_point: 278 | return 279 | 280 | end_point = 'Mixed_5b' 281 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], 282 | name + end_point) 283 | if self._final_endpoint == end_point: 284 | return 285 | 286 | end_point = 'Mixed_5c' 287 | self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], 288 | name + end_point) 289 | if self._final_endpoint == end_point: 290 | return 291 | 292 | # end_point = 'Logits' 293 | self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7], stride=(1, 1, 1)) 294 | self.dropout = nn.Dropout(dropout_keep_prob) 295 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 296 | kernel_shape=[1, 1, 1], 297 | padding=0, 298 | use_batch_norm=False, 299 | use_bias=True, 300 | name='logits') 301 | 302 | self.build() 303 | 304 | def replace_logits(self, num_classes): 305 | self._num_classes = num_classes 306 | self.logits = Unit3D(in_channels=384 + 384 + 128 + 128, output_channels=self._num_classes, 307 | kernel_shape=[1, 1, 1], 308 | padding=0, 309 | use_batch_norm=False, 310 | use_bias=True, 311 | name='logits') 312 | 313 | def build(self): 314 | for k in self.end_points.keys(): 315 | self.add_module(k, self.end_points[k]) 316 | 317 | def forward(self, x): 318 | for end_point in self.VALID_ENDPOINTS: 319 | if end_point in self.end_points: 320 | x = self._modules[end_point](x) # use _modules to work with data parallel 321 | x = self.avg_pool(x) 322 | logits = self.logits(self.dropout(x)) 323 | if self._spatial_squeeze: 324 | logits = x.squeeze(3).squeeze(3) 325 | # logits is batch X time X classes, which is what we want to work with 326 | return logits 327 | 328 | def extract_features(self, x): 329 | for end_point in self.VALID_ENDPOINTS: 330 | if end_point in self.end_points: 331 | x = self._modules[end_point](x) 332 | # x = [batch_size, channels, time, height, width] 333 | x = self.avg_pool(x) # 384 + 384 + 128 + 128 = 1024 334 | x = x.squeeze(0).permute(1, 2, 3, 0) # x = [time, height, width, channels] 335 | x = x.squeeze(1).squeeze(1) # x = [time, channels] 336 | return x 337 | -------------------------------------------------------------------------------- /model/layers.py: -------------------------------------------------------------------------------- 1 | import math 2 | import tensorflow as tf 3 | from model.ops import get_shape_list, mask_logits, trilinear_attention, regularizer 4 | 5 | if tf.__version__.startswith('2'): 6 | tf = tf.compat.v1 7 | tf.disable_v2_behavior() 8 | tf.disable_eager_execution() 9 | 10 | 11 | def layer_norm(inputs, epsilon=1e-6, reuse=None, name='layer_norm'): 12 | """Layer normalize the tensor x, averaging over the last dimension.""" 13 | with tf.variable_scope(name, default_name="layer_norm", values=[inputs], reuse=reuse): 14 | dim = get_shape_list(inputs)[-1] 15 | scale = tf.get_variable("layer_norm_scale", [dim], initializer=tf.ones_initializer(), regularizer=regularizer) 16 | bias = tf.get_variable("layer_norm_bias", [dim], initializer=tf.zeros_initializer(), regularizer=regularizer) 17 | mean = tf.reduce_mean(inputs, axis=[-1], keep_dims=True) 18 | variance = tf.reduce_mean(tf.square(inputs - mean), axis=[-1], keep_dims=True) 19 | norm_inputs = (inputs - mean) * tf.rsqrt(variance + epsilon) 20 | result = norm_inputs * scale + bias 21 | return result 22 | 23 | 24 | def word_embedding_lookup(word_ids, dim, vectors, drop_rate=0.0, 
finetune=False, reuse=None, name='word_embeddings'): 25 | with tf.variable_scope(name, reuse=reuse): 26 | table = tf.Variable(vectors, name='word_table', dtype=tf.float32, trainable=finetune) 27 | unk = tf.get_variable(name='unk', shape=[1, dim], dtype=tf.float32, trainable=True) 28 | zero = tf.zeros(shape=[1, dim], dtype=tf.float32) 29 | word_table = tf.concat([zero, unk, table], axis=0) 30 | word_emb = tf.nn.embedding_lookup(word_table, word_ids) 31 | word_emb = tf.nn.dropout(word_emb, rate=drop_rate) 32 | return word_emb 33 | 34 | 35 | def char_embedding_lookup(char_ids, char_size, dim, kernels, filters, drop_rate=0.0, activation=tf.nn.relu, 36 | padding='VALID', reuse=None, name='char_embeddings'): 37 | with tf.variable_scope(name, reuse=reuse): 38 | # char embeddings lookup 39 | table = tf.get_variable(name='char_table', shape=[char_size - 1, dim], dtype=tf.float32, trainable=True) 40 | zero = tf.zeros(shape=[1, dim], dtype=tf.float32) 41 | char_table = tf.concat([zero, table], axis=0) 42 | char_emb = tf.nn.embedding_lookup(char_table, char_ids) 43 | char_emb = tf.nn.dropout(char_emb, rate=drop_rate) 44 | # char-level cnn 45 | outputs = [] 46 | for i, (kernel, channel) in enumerate(zip(kernels, filters)): 47 | weight = tf.get_variable('filter_%d' % i, shape=[1, kernel, dim, channel], dtype=tf.float32, 48 | regularizer=regularizer) 49 | bias = tf.get_variable('bias_%d' % i, shape=[channel], dtype=tf.float32, initializer=tf.zeros_initializer(), 50 | regularizer=regularizer) 51 | output = tf.nn.conv2d(char_emb, weight, strides=[1, 1, 1, 1], padding=padding, name='conv_%d' % i) 52 | output = tf.nn.bias_add(output, bias=bias) 53 | output = tf.reduce_max(activation(output), axis=2) 54 | outputs.append(output) 55 | outputs = tf.concat(values=outputs, axis=-1) 56 | return outputs 57 | 58 | 59 | def conv1d(inputs, dim, kernel_size=1, use_bias=False, activation=None, padding='VALID', reuse=None, name='conv1d'): 60 | with tf.variable_scope(name, reuse=reuse): 61 | shapes = get_shape_list(inputs) 62 | kernel = tf.get_variable(name='kernel', shape=[kernel_size, shapes[-1], dim], dtype=tf.float32, 63 | regularizer=regularizer) 64 | outputs = tf.nn.conv1d(inputs, filters=kernel, stride=1, padding=padding) 65 | if use_bias: 66 | bias = tf.get_variable(name='bias', shape=[1, 1, dim], dtype=tf.float32, initializer=tf.zeros_initializer(), 67 | regularizer=regularizer) 68 | outputs += bias 69 | if activation is not None: 70 | return activation(outputs) 71 | else: 72 | return outputs 73 | 74 | 75 | def depthwise_separable_conv(inputs, kernel_size, dim, use_bias=True, reuse=None, activation=tf.nn.relu, 76 | name='depthwise_separable_conv'): 77 | with tf.variable_scope(name, reuse=reuse): 78 | shapes = get_shape_list(inputs) 79 | depthwise_filter = tf.get_variable(name='depthwise_filter', dtype=tf.float32, regularizer=regularizer, 80 | shape=[kernel_size[0], kernel_size[1], shapes[-1], 1]) 81 | pointwise_filter = tf.get_variable(name='pointwise_filter', shape=[1, 1, shapes[-1], dim], dtype=tf.float32, 82 | regularizer=regularizer) 83 | outputs = tf.nn.separable_conv2d(inputs, depthwise_filter, pointwise_filter, strides=[1, 1, 1, 1], 84 | padding='SAME') 85 | if use_bias: 86 | b = tf.get_variable('bias', outputs.shape[-1], initializer=tf.zeros_initializer(), regularizer=regularizer) 87 | outputs += b 88 | outputs = activation(outputs) 89 | return outputs 90 | 91 | 92 | def add_positional_embedding(inputs, max_position_length, reuse=None, name='positional_embedding'): 93 | with tf.variable_scope(name, 
reuse=reuse): 94 | batch_size, seq_length, dim = get_shape_list(inputs) 95 | assert_op = tf.assert_less_equal(seq_length, max_position_length) 96 | with tf.control_dependencies([assert_op]): 97 | full_position_embeddings = tf.get_variable(name='position_embeddings', shape=[max_position_length, dim], 98 | dtype=tf.float32) 99 | position_embeddings = tf.slice(full_position_embeddings, [0, 0], [seq_length, -1]) 100 | num_dims = len(inputs.shape.as_list()) 101 | position_broadcast_shape = [] 102 | for _ in range(num_dims - 2): 103 | position_broadcast_shape.append(1) 104 | position_broadcast_shape.extend([seq_length, dim]) 105 | position_embeddings = tf.reshape(position_embeddings, shape=position_broadcast_shape) 106 | outputs = inputs + position_embeddings 107 | return outputs 108 | 109 | 110 | def conv_block(inputs, kernel_size, dim, num_layers, drop_rate=0.0, reuse=None, name='conv_block'): 111 | with tf.variable_scope(name, reuse=reuse): 112 | outputs = tf.expand_dims(inputs, axis=2) 113 | for layer_idx in range(num_layers): 114 | residual = outputs 115 | outputs = layer_norm(outputs, reuse=reuse, name='layer_norm_%d' % layer_idx) 116 | outputs = depthwise_separable_conv(outputs, kernel_size=(kernel_size, 1), dim=dim, use_bias=True, 117 | activation=tf.nn.relu, name='depthwise_conv_layers_%d' % layer_idx, 118 | reuse=reuse) 119 | outputs = tf.nn.dropout(outputs, rate=drop_rate) + residual 120 | return tf.squeeze(outputs, 2) 121 | 122 | 123 | def multihead_attention(inputs, dim, num_heads, mask=None, drop_rate=0.0, reuse=None, name='multihead_attention'): 124 | with tf.variable_scope(name, reuse=reuse): 125 | if dim % num_heads != 0: 126 | raise ValueError('The hidden size (%d) is not a multiple of the attention heads (%d)' % (dim, num_heads)) 127 | batch_size, seq_length, _ = get_shape_list(inputs) 128 | head_size = dim // num_heads 129 | 130 | def transpose_for_scores(input_tensor, batch_size_, seq_length_, num_heads_, head_size_): 131 | output_tensor = tf.reshape(input_tensor, shape=[batch_size_, seq_length_, num_heads_, head_size_]) 132 | output_tensor = tf.transpose(output_tensor, perm=[0, 2, 1, 3]) 133 | return output_tensor 134 | 135 | # projection 136 | query = conv1d(inputs, dim=dim, use_bias=True, reuse=reuse, name='query') 137 | key = conv1d(inputs, dim=dim, use_bias=True, reuse=reuse, name='key') 138 | value = conv1d(inputs, dim=dim, use_bias=True, reuse=reuse, name='value') 139 | # reshape & transpose: (batch_size, seq_length, dim) --> (batch_size, num_heads, seq_length, head_size) 140 | query = transpose_for_scores(query, batch_size, seq_length, num_heads, head_size) 141 | key = transpose_for_scores(key, batch_size, seq_length, num_heads, head_size) 142 | value = transpose_for_scores(value, batch_size, seq_length, num_heads, head_size) 143 | # compute attention score 144 | query = tf.multiply(query, 1.0 / math.sqrt(float(head_size))) 145 | attention_score = tf.matmul(query, key, transpose_b=True) 146 | if mask is not None: 147 | shapes = get_shape_list(attention_score) 148 | mask = tf.cast(tf.reshape(mask, shape=[shapes[0], 1, 1, shapes[-1]]), dtype=tf.float32) 149 | attention_score += (1.0 - mask) * -1e30 150 | attention_score = tf.nn.softmax(attention_score) # shape = (batch_size, num_heads, seq_length, seq_length) 151 | attention_score = tf.nn.dropout(attention_score, rate=drop_rate) 152 | # compute value 153 | value = tf.matmul(attention_score, value) # shape = (batch_size, num_heads, seq_length, head_size) 154 | value = tf.transpose(value, perm=[0, 2, 1, 3]) 155 | value = 
tf.reshape(value, shape=[batch_size, seq_length, num_heads * head_size]) 156 | return value 157 | 158 | 159 | def multihead_attention_block(inputs, dim, num_heads, mask=None, use_bias=True, drop_rate=0.0, reuse=None, 160 | name='multihead_attention_block'): 161 | with tf.variable_scope(name, reuse=reuse): 162 | # multihead attention layer 163 | outputs = layer_norm(inputs, reuse=reuse, name='layer_norm_1') 164 | outputs = tf.nn.dropout(outputs, rate=drop_rate) 165 | outputs = multihead_attention(outputs, dim=dim, num_heads=num_heads, mask=mask, drop_rate=drop_rate, 166 | name='multihead_attention') 167 | outputs = tf.nn.dropout(outputs, rate=drop_rate) 168 | residual = outputs + inputs 169 | # feed forward layer 170 | outputs = layer_norm(residual, reuse=reuse, name='layer_norm_2') 171 | outputs = tf.nn.dropout(outputs, rate=drop_rate) 172 | outputs = conv1d(outputs, dim=dim, use_bias=use_bias, activation=None, reuse=reuse, name='dense') 173 | outputs = tf.nn.dropout(outputs, rate=drop_rate) 174 | outputs = outputs + residual 175 | return outputs 176 | 177 | 178 | def feature_encoder(inputs, hidden_size, num_heads, max_position_length, drop_rate, mask, reuse=None, 179 | name='feature_encoder'): 180 | with tf.variable_scope(name, reuse=reuse): 181 | features = add_positional_embedding(inputs, max_position_length=max_position_length, reuse=reuse, 182 | name='positional_embedding') 183 | features = conv_block(features, kernel_size=7, dim=hidden_size, num_layers=4, reuse=reuse, drop_rate=drop_rate, 184 | name='conv_block') 185 | features = multihead_attention_block(features, dim=hidden_size, num_heads=num_heads, mask=mask, use_bias=True, 186 | drop_rate=drop_rate, reuse=False, name='multihead_attention_block') 187 | return features 188 | 189 | 190 | def video_query_attention(video_features, query_features, v_mask, q_mask, drop_rate=0.0, reuse=None, 191 | name='video_query_attention'): 192 | with tf.variable_scope(name, reuse=reuse): 193 | dim = get_shape_list(video_features)[-1] 194 | v_maxlen = tf.reduce_max(tf.reduce_sum(v_mask, axis=1)) 195 | q_maxlen = tf.reduce_max(tf.reduce_sum(q_mask, axis=1)) 196 | score = trilinear_attention([video_features, query_features], v_maxlen=v_maxlen, q_maxlen=q_maxlen, 197 | drop_rate=drop_rate, reuse=reuse, name='efficient_trilinear') 198 | mask_q = tf.expand_dims(q_mask, 1) 199 | score_ = tf.nn.softmax(mask_logits(score, mask=mask_q)) 200 | mask_v = tf.expand_dims(v_mask, 2) 201 | score_t = tf.transpose(tf.nn.softmax(mask_logits(score, mask=mask_v), dim=1), perm=[0, 2, 1]) 202 | v2q = tf.matmul(score_, query_features) 203 | q2v = tf.matmul(tf.matmul(score_, score_t), video_features) 204 | attention_outputs = tf.concat([video_features, v2q, video_features * v2q, video_features * q2v], axis=-1) 205 | outputs = conv1d(attention_outputs, dim=dim, use_bias=False, activation=None, reuse=reuse, name='dense') 206 | return outputs, score 207 | 208 | 209 | def context_query_concat(inputs, qfeats, q_mask, reuse=None, name='context_query_concat'): 210 | with tf.variable_scope(name, reuse=reuse): 211 | dim = get_shape_list(qfeats)[-1] 212 | # compute pooled query feature 213 | weight = tf.get_variable(name='weight', shape=[dim, 1], dtype=tf.float32, regularizer=regularizer) 214 | x = tf.tensordot(qfeats, weight, axes=1) # shape = (batch_size, seq_length, 1) 215 | q_mask = tf.expand_dims(q_mask, axis=-1) # shape = (batch_size, seq_length, 1) 216 | x = mask_logits(x, mask=q_mask) 217 | alphas = tf.nn.softmax(x, axis=1) 218 | q_pooled = tf.matmul(tf.transpose(qfeats, 
perm=[0, 2, 1]), alphas) 219 | q_pooled = tf.squeeze(q_pooled, axis=-1) # shape = (batch_size, dim) 220 | # concatenation 221 | q_pooled = tf.tile(tf.expand_dims(q_pooled, axis=1), multiples=[1, tf.shape(inputs)[1], 1]) 222 | outputs = tf.concat([inputs, q_pooled], axis=-1) 223 | outputs = conv1d(outputs, dim=dim, use_bias=True, reuse=False, name='dense') 224 | return outputs 225 | 226 | 227 | def highlight_layer(inputs, labels, mask, epsilon=1e-12, reuse=None, name='highlight_layer'): 228 | with tf.variable_scope(name, reuse=reuse): 229 | logits = conv1d(inputs, dim=1, use_bias=True, padding='VALID', reuse=reuse, name='dense') 230 | logits = tf.squeeze(logits, axis=-1) # (batch_size, seq_length) 231 | logits = mask_logits(logits, mask=mask) 232 | # prepare labels and weights 233 | labels = tf.cast(labels, dtype=logits.dtype) 234 | weights = tf.where(tf.equal(labels, 0.0), x=labels + 1.0, y=labels * 2.0) 235 | # binary cross entropy with sigmoid activation 236 | loss_per_location = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) 237 | loss_per_location = loss_per_location * weights 238 | mask = tf.cast(mask, dtype=logits.dtype) 239 | loss = tf.reduce_sum(loss_per_location * mask) / (tf.reduce_sum(mask) + epsilon) 240 | # compute scores 241 | scores = tf.sigmoid(logits) 242 | return loss, scores 243 | 244 | 245 | def dynamic_rnn(inputs, seq_len, dim, reuse=None, name='dynamic_rnn'): 246 | with tf.variable_scope(name, reuse=reuse): 247 | cell = tf.nn.rnn_cell.LSTMCell(num_units=dim, use_peepholes=False, name='lstm_cell') 248 | outputs, _ = tf.nn.dynamic_rnn(cell, inputs, sequence_length=seq_len, dtype=tf.float32) 249 | return outputs 250 | 251 | 252 | def conditioned_predictor(inputs, hidden_size, seq_len, mask, num_heads, max_position_length, drop_rate, mode='rnn', 253 | reuse=None, name='conditioned_predictor'): 254 | with tf.variable_scope(name, reuse=reuse): 255 | if mode == 'rnn': 256 | start_features = dynamic_rnn(inputs, seq_len, dim=hidden_size, reuse=False, name='start_rnn') 257 | end_features = dynamic_rnn(start_features, seq_len, dim=hidden_size, reuse=False, name='end_rnn') 258 | else: 259 | start_features = feature_encoder(inputs, hidden_size=hidden_size, num_heads=num_heads, mask=mask, 260 | max_position_length=max_position_length, drop_rate=drop_rate, reuse=False, 261 | name='feature_encoder') 262 | end_features = feature_encoder(start_features, hidden_size=hidden_size, num_heads=num_heads, mask=mask, 263 | max_position_length=max_position_length, drop_rate=drop_rate, reuse=True, 264 | name='feature_encoder') 265 | start_features = layer_norm(start_features, reuse=False, name='s_layer_norm') 266 | end_features = layer_norm(end_features, reuse=False, name='e_layer_norm') 267 | start_features = conv1d(tf.concat([start_features, inputs], axis=-1), dim=hidden_size, use_bias=True, 268 | reuse=False, activation=tf.nn.relu, name='start_hidden') 269 | end_features = conv1d(tf.concat([end_features, inputs], axis=-1), dim=hidden_size, use_bias=True, reuse=False, 270 | activation=tf.nn.relu, name='end_hidden') 271 | start_logits = conv1d(start_features, dim=1, use_bias=True, reuse=reuse, name='start_dense') 272 | end_logits = conv1d(end_features, dim=1, use_bias=True, reuse=reuse, name='end_dense') 273 | start_logits = mask_logits(tf.squeeze(start_logits, axis=-1), mask=mask) # shape = (batch_size, seq_length) 274 | end_logits = mask_logits(tf.squeeze(end_logits, axis=-1), mask=mask) # shape = (batch_size, seq_length) 275 | return start_logits, end_logits 276 | 277 
| 278 | def localization_loss(start_logits, end_logits, y1, y2): 279 | start_prob = tf.nn.softmax(start_logits, axis=1) 280 | end_prob = tf.nn.softmax(end_logits, axis=1) 281 | outer = tf.matmul(tf.expand_dims(start_prob, axis=2), tf.expand_dims(end_prob, axis=1)) 282 | outer = tf.matrix_band_part(outer, num_lower=0, num_upper=-1) 283 | start_index = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) 284 | end_index = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) 285 | start_losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=start_logits, labels=y1) 286 | end_losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=end_logits, labels=y2) 287 | loss = tf.reduce_mean(start_losses + end_losses) 288 | return start_prob, end_prob, start_index, end_index, loss 289 | -------------------------------------------------------------------------------- /model/layers_t7.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def mask_logits(inputs, mask, mask_value=-1e30): 8 | mask = mask.type(torch.float32) 9 | return inputs + (1.0 - mask) * mask_value 10 | 11 | 12 | class Conv1D(nn.Module): 13 | def __init__(self, in_dim, out_dim, kernel_size=1, stride=1, padding=0, bias=True): 14 | super(Conv1D, self).__init__() 15 | self.conv1d = nn.Conv1d(in_channels=in_dim, out_channels=out_dim, kernel_size=kernel_size, padding=padding, 16 | stride=stride, bias=bias) 17 | 18 | def forward(self, x): 19 | # suppose all the input with shape (batch_size, seq_len, dim) 20 | x = x.transpose(1, 2) # (batch_size, dim, seq_len) 21 | x = self.conv1d(x) 22 | return x.transpose(1, 2) # (batch_size, seq_len, dim) 23 | 24 | 25 | class WordEmbedding(nn.Module): 26 | def __init__(self, num_words, word_dim, drop_rate, word_vectors=None): 27 | super(WordEmbedding, self).__init__() 28 | self.is_pretrained = False if word_vectors is None else True 29 | if self.is_pretrained: 30 | self.pad_vec = nn.Parameter(torch.zeros(size=(1, word_dim), dtype=torch.float32), requires_grad=False) 31 | unk_vec = torch.empty(size=(1, word_dim), requires_grad=True, dtype=torch.float32) 32 | nn.init.xavier_uniform_(unk_vec) 33 | self.unk_vec = nn.Parameter(unk_vec, requires_grad=True) 34 | self.glove_vec = nn.Parameter(torch.tensor(word_vectors, dtype=torch.float32), requires_grad=False) 35 | else: 36 | self.word_emb = nn.Embedding(num_words, word_dim, padding_idx=0) 37 | self.dropout = nn.Dropout(p=drop_rate) 38 | 39 | def forward(self, word_ids): 40 | if self.is_pretrained: 41 | word_emb = F.embedding(word_ids, torch.cat([self.pad_vec, self.unk_vec, self.glove_vec], dim=0), 42 | padding_idx=0) 43 | else: 44 | word_emb = self.word_emb(word_ids) 45 | return self.dropout(word_emb) 46 | 47 | 48 | class CharacterEmbedding(nn.Module): 49 | def __init__(self, num_chars, char_dim, drop_rate): 50 | super(CharacterEmbedding, self).__init__() 51 | self.char_emb = nn.Embedding(num_chars, char_dim, padding_idx=0) 52 | kernels, channels = [1, 2, 3, 4], [10, 20, 30, 40] 53 | self.char_convs = nn.ModuleList([ 54 | nn.Sequential( 55 | nn.Conv2d(in_channels=char_dim, out_channels=channel, kernel_size=(1, kernel), stride=(1, 1), padding=0, 56 | bias=True), 57 | nn.ReLU() 58 | ) for kernel, channel in zip(kernels, channels) 59 | ]) 60 | self.dropout = nn.Dropout(p=drop_rate) 61 | 62 | def forward(self, char_ids): 63 | char_emb = self.char_emb(char_ids) # (batch_size, w_seq_len, c_seq_len, char_dim) 64 | char_emb = 
self.dropout(char_emb) 65 | char_emb = char_emb.permute(0, 3, 1, 2) # (batch_size, char_dim, w_seq_len, c_seq_len) 66 | char_outputs = [] 67 | for conv_layer in self.char_convs: 68 | output = conv_layer(char_emb) 69 | output, _ = torch.max(output, dim=3, keepdim=False) # reduce max (batch_size, channel, w_seq_len) 70 | char_outputs.append(output) 71 | char_output = torch.cat(char_outputs, dim=1) # (batch_size, sum(channels), w_seq_len) 72 | return char_output.permute(0, 2, 1) # (batch_size, w_seq_len, sum(channels)) 73 | 74 | 75 | class Embedding(nn.Module): 76 | def __init__(self, num_words, num_chars, word_dim, char_dim, drop_rate, out_dim, word_vectors=None): 77 | super(Embedding, self).__init__() 78 | self.word_emb = WordEmbedding(num_words, word_dim, drop_rate, word_vectors=word_vectors) 79 | self.char_emb = CharacterEmbedding(num_chars, char_dim, drop_rate) 80 | # output linear layer 81 | self.linear = Conv1D(in_dim=word_dim + 100, out_dim=out_dim, kernel_size=1, stride=1, padding=0, bias=True) 82 | 83 | def forward(self, word_ids, char_ids): 84 | word_emb = self.word_emb(word_ids) # (batch_size, w_seq_len, word_dim) 85 | char_emb = self.char_emb(char_ids) # (batch_size, w_seq_len, 100) 86 | emb = torch.cat([word_emb, char_emb], dim=2) # (batch_size, w_seq_len, word_dim + 100) 87 | emb = self.linear(emb) # (batch_size, w_seq_len, dim) 88 | return emb 89 | 90 | 91 | class PositionalEmbedding(nn.Module): 92 | """Construct the embeddings from word, position and token_type embeddings.""" 93 | def __init__(self, num_embeddings, embedding_dim): 94 | super(PositionalEmbedding, self).__init__() 95 | self.position_embeddings = nn.Embedding(num_embeddings, embedding_dim) 96 | 97 | def forward(self, inputs): 98 | bsz, seq_length = inputs.shape[:2] 99 | position_ids = torch.arange(seq_length, dtype=torch.long, device=inputs.device) 100 | position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) 101 | position_embeddings = self.position_embeddings(position_ids) 102 | return position_embeddings 103 | 104 | 105 | class VisualProjection(nn.Module): 106 | def __init__(self, visual_dim, dim, drop_rate=0.0): 107 | super(VisualProjection, self).__init__() 108 | self.drop = nn.Dropout(p=drop_rate) 109 | self.linear = Conv1D(in_dim=visual_dim, out_dim=dim, kernel_size=1, stride=1, bias=True, padding=0) 110 | 111 | def forward(self, visual_features): 112 | # the input visual feature with shape (batch_size, seq_len, visual_dim) 113 | visual_features = self.drop(visual_features) 114 | output = self.linear(visual_features) # (batch_size, seq_len, dim) 115 | return output 116 | 117 | 118 | class DepthwiseSeparableConvBlock(nn.Module): 119 | def __init__(self, dim, kernel_size, drop_rate, num_layers=4): 120 | super(DepthwiseSeparableConvBlock, self).__init__() 121 | self.depthwise_separable_conv = nn.ModuleList([ 122 | nn.Sequential( 123 | nn.Conv1d(in_channels=dim, out_channels=dim, kernel_size=kernel_size, groups=dim, 124 | padding=kernel_size // 2, bias=False), 125 | nn.Conv1d(in_channels=dim, out_channels=dim, kernel_size=1, padding=0, bias=True), 126 | nn.ReLU(), 127 | ) for _ in range(num_layers)]) 128 | self.layer_norms = nn.ModuleList([nn.LayerNorm(dim, eps=1e-6) for _ in range(num_layers)]) 129 | self.dropout = nn.Dropout(p=drop_rate) 130 | 131 | def forward(self, x): 132 | output = x # (batch_size, seq_len, dim) 133 | for idx, conv_layer in enumerate(self.depthwise_separable_conv): 134 | residual = output 135 | output = self.layer_norms[idx](output) # (batch_size, seq_len, dim) 136 | output = 
output.transpose(1, 2) # (batch_size, dim, seq_len) 137 | output = conv_layer(output) 138 | output = self.dropout(output) 139 | output = output.transpose(1, 2) + residual # (batch_size, seq_len, dim) 140 | return output 141 | 142 | 143 | class MultiHeadAttentionBlock(nn.Module): 144 | def __init__(self, dim, num_heads, drop_rate): 145 | super(MultiHeadAttentionBlock, self).__init__() 146 | assert dim % num_heads == 0, 'The channels (%d) is not a multiple of attention heads (%d)' % (dim, num_heads) 147 | self.head_size, self.num_heads, self.dim = int(dim / num_heads), num_heads, dim 148 | self.dropout = nn.Dropout(p=drop_rate) 149 | self.query = Conv1D(in_dim=dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 150 | self.key = Conv1D(in_dim=dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 151 | self.value = Conv1D(in_dim=dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 152 | self.layer_norm1 = nn.LayerNorm(dim, eps=1e-6) 153 | self.layer_norm2 = nn.LayerNorm(dim, eps=1e-6) 154 | self.out_layer = Conv1D(in_dim=dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 155 | 156 | def transpose_for_scores(self, x): 157 | new_x_shape = x.size()[:-1] + (self.num_heads, self.head_size) 158 | x = x.view(*new_x_shape) 159 | return x.permute(0, 2, 1, 3) # (batch_size, num_heads, w_seq_len, head_size) 160 | 161 | @staticmethod 162 | def combine_last_two_dim(x): 163 | old_shape = list(x.size()) 164 | new_shape = old_shape[:-2] + [old_shape[-2] * old_shape[-1]] 165 | return x.reshape(shape=new_shape) 166 | 167 | def forward(self, x, mask=None): 168 | output = self.layer_norm1(x) # (batch_size, seq_len, dim) 169 | output = self.dropout(output) 170 | # multi-head attention layer 171 | query = self.transpose_for_scores(self.query(output)) # (batch_size, num_heads, seq_len, head_size) 172 | key = self.transpose_for_scores(self.key(output)) 173 | value = self.transpose_for_scores(self.value(output)) 174 | attention_scores = torch.matmul(query, key.transpose(-1, -2)) # (batch_size, num_heads, seq_len, seq_len) 175 | attention_scores = attention_scores / math.sqrt(self.head_size) 176 | if mask is not None: # masking 177 | mask = mask.unsqueeze(1).unsqueeze(2) # (batch_size, 1, 1, seq_len) 178 | attention_scores = mask_logits(attention_scores, mask) 179 | attention_probs = nn.Softmax(dim=-1)(attention_scores) # (batch_size, num_heads, seq_len, seq_len) 180 | attention_probs = self.dropout(attention_probs) 181 | value = torch.matmul(attention_probs, value) # (batch_size, num_heads, seq_len, head_size) 182 | value = self.combine_last_two_dim(value.permute(0, 2, 1, 3)) # (batch_size, seq_len, dim) 183 | # intermediate layer 184 | output = self.dropout(value) 185 | residual = output + x 186 | output = self.layer_norm2(residual) 187 | output = self.dropout(output) 188 | output = self.out_layer(output) 189 | output = self.dropout(output) + residual 190 | return output 191 | 192 | 193 | class FeatureEncoder(nn.Module): 194 | def __init__(self, dim, num_heads, max_pos_len, kernel_size=7, num_layers=4, drop_rate=0.0): 195 | super(FeatureEncoder, self).__init__() 196 | self.pos_embedding = PositionalEmbedding(num_embeddings=max_pos_len, embedding_dim=dim) 197 | self.conv_block = DepthwiseSeparableConvBlock(dim=dim, kernel_size=kernel_size, drop_rate=drop_rate, 198 | num_layers=num_layers) 199 | self.attention_block = MultiHeadAttentionBlock(dim=dim, num_heads=num_heads, drop_rate=drop_rate) 200 | 201 | def forward(self, x, mask=None): 202 | features = x + 
self.pos_embedding(x) # (batch_size, seq_len, dim) 203 | features = self.conv_block(features) # (batch_size, seq_len, dim) 204 | features = self.attention_block(features, mask=mask) # (batch_size, seq_len, dim) 205 | return features 206 | 207 | 208 | class CQAttention(nn.Module): 209 | def __init__(self, dim, drop_rate=0.0): 210 | super(CQAttention, self).__init__() 211 | w4C = torch.empty(dim, 1) 212 | w4Q = torch.empty(dim, 1) 213 | w4mlu = torch.empty(1, 1, dim) 214 | nn.init.xavier_uniform_(w4C) 215 | nn.init.xavier_uniform_(w4Q) 216 | nn.init.xavier_uniform_(w4mlu) 217 | self.w4C = nn.Parameter(w4C, requires_grad=True) 218 | self.w4Q = nn.Parameter(w4Q, requires_grad=True) 219 | self.w4mlu = nn.Parameter(w4mlu, requires_grad=True) 220 | self.dropout = nn.Dropout(p=drop_rate) 221 | self.cqa_linear = Conv1D(in_dim=4 * dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 222 | 223 | def forward(self, context, query, c_mask, q_mask): 224 | score = self.trilinear_attention(context, query) # (batch_size, c_seq_len, q_seq_len) 225 | score_ = nn.Softmax(dim=2)(mask_logits(score, q_mask.unsqueeze(1))) # (batch_size, c_seq_len, q_seq_len) 226 | score_t = nn.Softmax(dim=1)(mask_logits(score, c_mask.unsqueeze(2))) # (batch_size, c_seq_len, q_seq_len) 227 | score_t = score_t.transpose(1, 2) # (batch_size, q_seq_len, c_seq_len) 228 | c2q = torch.matmul(score_, query) # (batch_size, c_seq_len, dim) 229 | q2c = torch.matmul(torch.matmul(score_, score_t), context) # (batch_size, c_seq_len, dim) 230 | output = torch.cat([context, c2q, torch.mul(context, c2q), torch.mul(context, q2c)], dim=2) 231 | output = self.cqa_linear(output) # (batch_size, c_seq_len, dim) 232 | return output 233 | 234 | def trilinear_attention(self, context, query): 235 | batch_size, c_seq_len, dim = context.shape 236 | batch_size, q_seq_len, dim = query.shape 237 | context = self.dropout(context) 238 | query = self.dropout(query) 239 | subres0 = torch.matmul(context, self.w4C).expand([-1, -1, q_seq_len]) # (batch_size, c_seq_len, q_seq_len) 240 | subres1 = torch.matmul(query, self.w4Q).transpose(1, 2).expand([-1, c_seq_len, -1]) 241 | subres2 = torch.matmul(context * self.w4mlu, query.transpose(1, 2)) 242 | res = subres0 + subres1 + subres2 # (batch_size, c_seq_len, q_seq_len) 243 | return res 244 | 245 | 246 | class WeightedPool(nn.Module): 247 | def __init__(self, dim): 248 | super(WeightedPool, self).__init__() 249 | weight = torch.empty(dim, 1) 250 | nn.init.xavier_uniform_(weight) 251 | self.weight = nn.Parameter(weight, requires_grad=True) 252 | 253 | def forward(self, x, mask): 254 | alpha = torch.tensordot(x, self.weight, dims=1) # shape = (batch_size, seq_length, 1) 255 | alpha = mask_logits(alpha, mask=mask.unsqueeze(2)) 256 | alphas = nn.Softmax(dim=1)(alpha) 257 | pooled_x = torch.matmul(x.transpose(1, 2), alphas) # (batch_size, dim, 1) 258 | pooled_x = pooled_x.squeeze(2) 259 | return pooled_x 260 | 261 | 262 | class CQConcatenate(nn.Module): 263 | def __init__(self, dim): 264 | super(CQConcatenate, self).__init__() 265 | self.weighted_pool = WeightedPool(dim=dim) 266 | self.conv1d = Conv1D(in_dim=2 * dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True) 267 | 268 | def forward(self, context, query, q_mask): 269 | pooled_query = self.weighted_pool(query, q_mask) # (batch_size, dim) 270 | _, c_seq_len, _ = context.shape 271 | pooled_query = pooled_query.unsqueeze(1).repeat(1, c_seq_len, 1) # (batch_size, c_seq_len, dim) 272 | output = torch.cat([context, pooled_query], dim=2) # (batch_size, 
c_seq_len, 2*dim) 273 | output = self.conv1d(output) 274 | return output 275 | 276 | 277 | class HighLightLayer(nn.Module): 278 | def __init__(self, dim): 279 | super(HighLightLayer, self).__init__() 280 | self.conv1d = Conv1D(in_dim=dim, out_dim=1, kernel_size=1, stride=1, padding=0, bias=True) 281 | 282 | def forward(self, x, mask): 283 | # compute logits 284 | logits = self.conv1d(x) 285 | logits = logits.squeeze(2) 286 | logits = mask_logits(logits, mask) 287 | # compute score 288 | scores = nn.Sigmoid()(logits) 289 | return scores 290 | 291 | @staticmethod 292 | def compute_loss(scores, labels, mask, epsilon=1e-12): 293 | labels = labels.type(torch.float32) 294 | weights = torch.where(labels == 0.0, labels + 1.0, 2.0 * labels) 295 | loss_per_location = nn.BCELoss(reduction='none')(scores, labels) 296 | loss_per_location = loss_per_location * weights 297 | mask = mask.type(torch.float32) 298 | loss = torch.sum(loss_per_location * mask) / (torch.sum(mask) + epsilon) 299 | return loss 300 | 301 | 302 | class DynamicRNN(nn.Module): 303 | def __init__(self, dim): 304 | super(DynamicRNN, self).__init__() 305 | self.lstm = nn.LSTM(input_size=dim, hidden_size=dim, num_layers=1, bias=True, batch_first=True, 306 | bidirectional=False) 307 | 308 | def forward(self, x, mask): 309 | out, _ = self.lstm(x) # (bsz, seq_len, dim) 310 | mask = mask.type(torch.float32) 311 | mask = mask.unsqueeze(2) 312 | out = out * mask 313 | return out 314 | 315 | 316 | class ConditionedPredictor(nn.Module): 317 | def __init__(self, dim, num_heads, max_pos_len, drop_rate=0.0, predictor='rnn'): 318 | super(ConditionedPredictor, self).__init__() 319 | self.predictor = predictor 320 | if predictor == 'rnn': 321 | self.start_encoder = DynamicRNN(dim=dim) 322 | self.end_encoder = DynamicRNN(dim=dim) 323 | else: 324 | self.encoder = FeatureEncoder(dim=dim, num_heads=num_heads, kernel_size=7, num_layers=4, 325 | max_pos_len=max_pos_len, drop_rate=drop_rate) 326 | self.start_layer_norm = nn.LayerNorm(dim, eps=1e-6) 327 | self.end_layer_norm = nn.LayerNorm(dim, eps=1e-6) 328 | 329 | self.start_block = nn.Sequential( 330 | Conv1D(in_dim=2 * dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True), 331 | nn.ReLU(), 332 | Conv1D(in_dim=dim, out_dim=1, kernel_size=1, stride=1, padding=0, bias=True) 333 | ) 334 | self.end_block = nn.Sequential( 335 | Conv1D(in_dim=2 * dim, out_dim=dim, kernel_size=1, stride=1, padding=0, bias=True), 336 | nn.ReLU(), 337 | Conv1D(in_dim=dim, out_dim=1, kernel_size=1, stride=1, padding=0, bias=True) 338 | ) 339 | 340 | def forward(self, x, mask): 341 | if self.predictor == 'rnn': 342 | start_features = self.start_encoder(x, mask) # (batch_size, seq_len, dim) 343 | end_features = self.end_encoder(start_features, mask) 344 | else: 345 | start_features = self.encoder(x, mask) 346 | end_features = self.encoder(start_features, mask) 347 | start_features = self.start_layer_norm(start_features) 348 | end_features = self.end_layer_norm(end_features) 349 | start_features = self.start_block(torch.cat([start_features, x], dim=2)) # (batch_size, seq_len, 1) 350 | end_features = self.end_block(torch.cat([end_features, x], dim=2)) 351 | start_logits = mask_logits(start_features.squeeze(2), mask=mask) 352 | end_logits = mask_logits(end_features.squeeze(2), mask=mask) 353 | return start_logits, end_logits 354 | 355 | @staticmethod 356 | def extract_index(start_logits, end_logits): 357 | start_prob = nn.Softmax(dim=1)(start_logits) 358 | end_prob = nn.Softmax(dim=1)(end_logits) 359 | outer = 
torch.matmul(start_prob.unsqueeze(dim=2), end_prob.unsqueeze(dim=1)) 360 | outer = torch.triu(outer, diagonal=0) 361 | _, start_index = torch.max(torch.max(outer, dim=2)[0], dim=1) # (batch_size, ) 362 | _, end_index = torch.max(torch.max(outer, dim=1)[0], dim=1) # (batch_size, ) 363 | return start_index, end_index 364 | 365 | @staticmethod 366 | def compute_cross_entropy_loss(start_logits, end_logits, start_labels, end_labels): 367 | start_loss = nn.CrossEntropyLoss(reduction='mean')(start_logits, start_labels) 368 | end_loss = nn.CrossEntropyLoss(reduction='mean')(end_logits, end_labels) 369 | return start_loss + end_loss 370 | --------------------------------------------------------------------------------
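The PyTorch modules defined in model/layers_t7.py are building blocks that main_t7.py and model/VSLNet_t7.py assemble into the full network. Below is a minimal composition sketch showing the expected call order and tensor shapes of those blocks. It is an illustration only, not the repository's VSLNet_t7.py: the class name VSLNetSketch, the choice to share one FeatureEncoder across both modalities, the gating of the fused features by the highlight scores, and every hyperparameter value are assumptions made for the sketch.

import torch
import torch.nn as nn
from model.layers_t7 import (Embedding, VisualProjection, FeatureEncoder, CQAttention,
                             CQConcatenate, HighLightLayer, ConditionedPredictor)


class VSLNetSketch(nn.Module):
    """Illustrative wiring of the layers_t7 building blocks (not the official model)."""
    def __init__(self, num_words, num_chars, word_vectors=None, visual_dim=1024, dim=128,
                 word_dim=300, char_dim=50, num_heads=8, max_pos_len=128, drop_rate=0.2):
        super(VSLNetSketch, self).__init__()
        self.embedding = Embedding(num_words, num_chars, word_dim, char_dim, drop_rate,
                                   out_dim=dim, word_vectors=word_vectors)
        self.video_proj = VisualProjection(visual_dim=visual_dim, dim=dim, drop_rate=drop_rate)
        self.encoder = FeatureEncoder(dim=dim, num_heads=num_heads, max_pos_len=max_pos_len,
                                      kernel_size=7, num_layers=4, drop_rate=drop_rate)
        self.cq_attention = CQAttention(dim=dim, drop_rate=drop_rate)
        self.cq_concat = CQConcatenate(dim=dim)
        self.highlight = HighLightLayer(dim=dim)
        self.predictor = ConditionedPredictor(dim=dim, num_heads=num_heads,
                                              max_pos_len=max_pos_len, drop_rate=drop_rate,
                                              predictor='rnn')

    def forward(self, word_ids, char_ids, video_features, v_mask, q_mask):
        query = self.embedding(word_ids, char_ids)   # (batch_size, q_seq_len, dim)
        video = self.video_proj(video_features)      # (batch_size, v_seq_len, dim)
        # a single FeatureEncoder (shared weights) encodes both modalities (assumption)
        query = self.encoder(query, mask=q_mask)
        video = self.encoder(video, mask=v_mask)
        # query-guided video representation
        features = self.cq_attention(video, query, v_mask, q_mask)  # (batch_size, v_seq_len, dim)
        features = self.cq_concat(features, query, q_mask)          # (batch_size, v_seq_len, dim)
        # highlight scores gate the fused features (illustrative choice)
        h_scores = self.highlight(features, v_mask)                 # (batch_size, v_seq_len)
        features = features * h_scores.unsqueeze(2)
        # conditioned start/end span prediction
        start_logits, end_logits = self.predictor(features, mask=v_mask)
        return h_scores, start_logits, end_logits


if __name__ == '__main__':
    # smoke test with random inputs; all sizes are arbitrary
    batch_size, q_len, v_len = 2, 12, 64
    model = VSLNetSketch(num_words=8000, num_chars=60)
    word_ids = torch.randint(0, 8000, (batch_size, q_len))
    char_ids = torch.randint(0, 60, (batch_size, q_len, 10))
    video_feats = torch.rand(batch_size, v_len, 1024)
    v_mask = torch.ones(batch_size, v_len)
    q_mask = torch.ones(batch_size, q_len)
    h_scores, s_logits, e_logits = model(word_ids, char_ids, video_feats, v_mask, q_mask)
    s_idx, e_idx = ConditionedPredictor.extract_index(s_logits, e_logits)  # each (batch_size,)

Training would additionally combine HighLightLayer.compute_loss on the highlight scores with ConditionedPredictor.compute_cross_entropy_loss on the span logits, as the static methods defined above suggest.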