├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── data ├── tvqa_preprocessed_subtitles.jsonl ├── tvr_test_public_release.jsonl ├── tvr_train_release.jsonl ├── tvr_val_release.jsonl └── tvr_video2dur_idx.json ├── figures └── model.png ├── method_tvr ├── __init__.py ├── config.py ├── contrastive.py ├── inference.py ├── model.py ├── model_components.py ├── optimization.py ├── proposal.py ├── scripts │ ├── eval.sh │ ├── inference.sh │ └── train.sh ├── start_end_dataset.py └── train.py ├── setup.sh ├── standalone_eval ├── README.md ├── __init__.py ├── eval.py └── eval_sample.sh └── utils ├── __init__.py ├── basic_utils.py ├── mk_video_split_with_duration.py ├── model_utils.py ├── temporal_nms.py ├── tensor_utils.py ├── text_feature ├── README.md ├── convert_sub_feature_word_to_clip.py ├── convert_sub_feature_word_to_clip.sh ├── extract_single_sentence_embeddings.sh ├── extract_single_sentence_tokens.sh ├── lm_finetuning_on_single_sentences.py ├── preprocess_subtitles.py └── train_lm_finetuning_single_sentence.sh └── video_feature ├── README.md ├── convert_feature_frm_to_clip.py ├── convert_feature_frm_to_clip.sh ├── extract_i3d_features.py ├── extract_i3d_features.sh ├── extract_image_features.py ├── extract_resnet152_2048_features.sh ├── i3d.py ├── merge_align_i3d.py ├── merge_align_i3d.sh ├── normalize_and_concat.py └── normalize_and_concat.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # custom 132 | .idea/ 133 | .vscode/ 134 | .DS_Store 135 | *.DS_Store 136 | data/tvr_feature_release/ 137 | method_tvr/results/ 138 | method_act/results/ 139 | 140 | 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ZHANG HAO 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Video Corpus Moment Retrieval with Contrastive Learning 2 | 3 | PyTorch implementation for the paper "Video Corpus Moment Retrieval with Contrastive Learning" (**SIGIR 2021**, 4 | long paper): [SIGIR version](https://dl.acm.org/doi/10.1145/3404835.3462874), [ArXiv version]( 5 | https://arxiv.org/pdf/2105.06247.pdf). 6 | 7 | ![model_overview](./figures/model.png) 8 | 9 | > The codes are modified from [TVRetrieval](https://github.com/jayleicn/TVRetrieval). 
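As a quick sanity check of the model interface defined in `method_tvr/model.py`, the following minimal sketch builds a `ReLoCLNet` with a toy configuration and runs one forward pass on random tensors. The configuration values, feature dimensions, and tensor shapes below are illustrative placeholders (not the settings used in the paper); the real hyper-parameters are defined in `method_tvr/config.py` and filled in by the training script. The sketch assumes the project root is on `PYTHONPATH` (see the `source setup.sh` step under Getting started) and that the dependencies listed under Prerequisites are installed.
```python
import torch
from easydict import EasyDict as edict
from method_tvr.model import ReLoCLNet

# Toy configuration: illustrative values only; real options come from method_tvr/config.py.
config = edict(
    visual_input_size=1024, sub_input_size=768, query_input_size=768,       # raw feature dims
    hidden_size=384, n_heads=8, input_drop=0.1, drop=0.1,                   # encoder settings
    max_desc_l=30, max_ctx_l=128,                                           # max query / context lengths
    conv_kernel_size=5, conv_stride=1, initializer_range=0.02,
    margin=0.1, ranking_loss_type="hinge",
    lw_st_ed=0.01, lw_fcl=0.03, lw_vcl=0.03, lw_neg_q=1.0, lw_neg_ctx=1.0,  # loss weights
    use_hard_negative=False, hard_pool_size=20)
model = ReLoCLNet(config)

N, Lq, Lv = 4, 30, 128  # batch size, query length, number of video clips
query_feat, query_mask = torch.randn(N, Lq, config.query_input_size), torch.ones(N, Lq)
video_feat, video_mask = torch.randn(N, Lv, config.visual_input_size), torch.ones(N, Lv)
sub_feat, sub_mask = torch.randn(N, Lv, config.sub_input_size), torch.ones(N, Lv)
st_ed_indices = torch.tensor([[10, 20]] * N)          # ground-truth start/end clip indices
match_labels = torch.zeros(N, Lv, dtype=torch.long)   # 1 marks clips inside the target moment
match_labels[:, 10:21] = 1

loss, loss_dict = model(query_feat, query_mask, video_feat, video_mask,
                        sub_feat, sub_mask, st_ed_indices, match_labels)
print(loss_dict)  # overall loss plus st/ed, frame-CL, video-CL and video-retrieval terms
```
In actual training the inputs are produced by the data loading code (e.g. `method_tvr/start_end_dataset.py`) driven by `method_tvr/train.py`; the sketch above is only meant to show the expected input shapes and the loss terms returned by the model.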
10 | 11 | ## Prerequisites 12 | - python 3.x with pytorch (`1.7.0`), torchvision, transformers, tensorboard, tqdm, h5py, easydict 13 | - cuda, cudnn 14 | 15 | If you have [Anaconda](https://www.anaconda.com/distribution/) installed, the conda environment of ReLoCLNet can be 16 | built as follows (taking python 3.7 as an example): 17 | ```shell 18 | conda create --name reloclnet python=3.7 19 | conda activate reloclnet 20 | conda install -c anaconda cudatoolkit cudnn # ignore this if you already have cuda installed 21 | conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=11.0 -c pytorch 22 | conda install -c anaconda h5py=2.9.0 23 | conda install -c conda-forge transformers tensorboard tqdm easydict 24 | ``` 25 | > The conda environment of [TVRetrieval](https://github.com/jayleicn/TVRetrieval) also works. 26 | 27 | 28 | ## Getting started 29 | 1. Clone this repository 30 | ```shell 31 | $ git clone git@github.com:IsaacChanghau/ReLoCLNet.git 32 | $ cd ReLoCLNet 33 | ``` 34 | 35 | 2. Download features 36 | 37 | For the features of the TVR dataset, please download [tvr_feature_release.tar.gz]( 38 | https://drive.google.com/file/d/1j4mVkXjKCgafW3ReNjZ2Rk6CKx0Fk_n5/view?usp=sharing) (the link is copied from 39 | [TVRetrieval#prerequisites](https://github.com/jayleicn/TVRetrieval#prerequisites)) and extract it to the `data` 40 | directory: 41 | ```shell 42 | $ tar -xf path/to/tvr_feature_release.tar.gz -C data 43 | ``` 44 | This [link](https://medium.com/@acpanjan/download-google-drive-files-using-wget-3c2c025a8b99) may be useful if you want to 45 | download Google Drive files directly using `wget`. Please refer to [TVRetrieval#prerequisites]( 46 | https://github.com/jayleicn/TVRetrieval#prerequisites) for more details about how the features are extracted, if you are 47 | interested. 48 | 49 | 3. Add the project root to `PYTHONPATH` (**Note that you need to do this each time you start a new session.**) 50 | ```shell 51 | $ source setup.sh 52 | ``` 53 | 54 | ## Training and Inference 55 | 56 | **TVR dataset** 57 | ```shell 58 | # train, refer to `method_tvr/scripts/train.sh` and `method_tvr/config.py` for more details about hyper-parameters 59 | $ bash method_tvr/scripts/train.sh tvr video_sub_tef resnet_i3d --exp_id reloclnet 60 | # inference 61 | # the trained model directory is placed in method_tvr/results/tvr-video_sub_tef-reloclnet-* 62 | # set MODEL_DIR_NAME to tvr-video_sub_tef-reloclnet-* 63 | # SPLIT_NAME: [val | test] 64 | $ bash method_tvr/scripts/inference.sh MODEL_DIR_NAME SPLIT_NAME 65 | ``` 66 | 67 | For more details about evaluation and submission, please refer to [TVRetrieval#training-and-inference]( 68 | https://github.com/jayleicn/TVRetrieval#training-and-inference). 69 | 70 | ## Citation 71 | If you find this project helpful to your research, please cite our work.
72 | ``` 73 | @inproceedings{zhang2021video, 74 | author = {Zhang, Hao and Sun, Aixin and Jing, Wei and Nan, Guoshun and Zhen, Liangli and Zhou, Joey Tianyi and Goh, Rick Siow Mong}, 75 | title = {Video Corpus Moment Retrieval with Contrastive Learning}, 76 | year = {2021}, 77 | isbn = {9781450380379}, 78 | publisher = {Association for Computing Machinery}, 79 | address = {New York, NY, USA}, 80 | url = {https://doi.org/10.1145/3404835.3462874}, 81 | doi = {10.1145/3404835.3462874}, 82 | booktitle = {Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval}, 83 | pages = {685–695}, 84 | numpages = {11}, 85 | location = {Virtual Event, Canada}, 86 | series = {SIGIR '21} 87 | } 88 | ``` 89 | 90 | ## TODO 91 | - Upload codes for ActivityNet Captions dataset 92 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/__init__.py -------------------------------------------------------------------------------- /figures/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/figures/model.png -------------------------------------------------------------------------------- /method_tvr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/method_tvr/__init__.py -------------------------------------------------------------------------------- /method_tvr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import argparse 5 | from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile 6 | from method_tvr.proposal import ProposalConfigs 7 | 8 | 9 | class BaseOptions(object): 10 | saved_option_filename = "opt.json" 11 | ckpt_filename = "model.ckpt" 12 | tensorboard_log_dir = "tensorboard_log" 13 | train_log_filename = "train.log.txt" 14 | eval_log_filename = "eval.log.txt" 15 | 16 | def __init__(self): 17 | self.parser = argparse.ArgumentParser() 18 | self.initialized = False 19 | self.opt = None 20 | 21 | def initialize(self): 22 | self.initialized = True 23 | self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) 24 | self.parser.add_argument("--eval_split_name", type=str, default="val", 25 | help="should match keys in video_duration_idx_path, must set for VCMR") 26 | self.parser.add_argument("--debug", action="store_true", 27 | help="debug (fast) mode, break all loops, do not load all data into memory.") 28 | self.parser.add_argument("--data_ratio", type=float, default=1.0, 29 | help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." 30 | "Use small portion for debug purposes. 
Note this is different from --debug, " 31 | "which works by breaking the loops, typically they are not used together.") 32 | self.parser.add_argument("--results_root", type=str, default="results") 33 | self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training") 34 | self.parser.add_argument("--seed", type=int, default=2018, help="random seed") 35 | self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") 36 | self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") 37 | self.parser.add_argument("--num_workers", type=int, default=8, 38 | help="num subprocesses used to load the data, 0: use main process") 39 | self.parser.add_argument("--no_core_driver", action="store_true", 40 | help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") 41 | self.parser.add_argument("--no_pin_memory", action="store_true", help="No use pin_memory=True for dataloader") 42 | # training config 43 | self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") 44 | self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, 45 | help="Proportion of training to perform linear learning rate warmup.") 46 | self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay") 47 | self.parser.add_argument("--n_epoch", type=int, default=100, help="number of epochs to run") 48 | self.parser.add_argument("--max_es_cnt", type=int, default=10, 49 | help="number of epochs to early stop, use -1 to disable early stop") 50 | self.parser.add_argument("--stop_task", type=str, default="VCMR", choices=["VCMR", "SVMR", "VR"], 51 | help="Use metric associated with stop_task for early stop") 52 | self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+", default=["VCMR", "SVMR", "VR"], 53 | choices=["VCMR", "SVMR", "VR"], help="evaluate and report numbers for tasks.") 54 | self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") 55 | self.parser.add_argument("--eval_query_bsz", type=int, default=50, help="minibatch size at inference for query") 56 | self.parser.add_argument("--eval_context_bsz", type=int, default=200, 57 | help="mini-batch size at inference, for video/sub") 58 | self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") 59 | self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") 60 | self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss") 61 | self.parser.add_argument("--lw_neg_q", type=float, default=1, 62 | help="weight for ranking loss with negative query and positive context") 63 | self.parser.add_argument("--lw_neg_ctx", type=float, default=1, 64 | help="weight for ranking loss with positive query and negative context") 65 | self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss") 66 | self.parser.add_argument("--lw_fcl", type=float, default=0.03, help="weight for frame CL loss") 67 | self.parser.add_argument("--lw_vcl", type=float, default=0.03, help="weight for video CL loss") 68 | self.parser.add_argument("--train_span_start_epoch", type=int, default=0, 69 | help="which epoch to start training span prediction, -1 to disable") 70 | self.parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"], 71 | help="att loss type, can be hinge loss or its smooth approximation 
LogSumExp") 72 | self.parser.add_argument("--hard_negative_start_epoch", type=int, default=20, 73 | help="which epoch to start hard negative sampling for video-level ranking loss," 74 | "use -1 to disable") 75 | self.parser.add_argument("--hard_pool_size", type=int, default=20, 76 | help="hard negatives are still sampled, but from a harder pool.") 77 | # Model and Data config 78 | self.parser.add_argument("--max_sub_l", type=int, default=50, 79 | help="max length of all sub sentence 97.71 under 50 for 3 sentences") 80 | self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") 81 | self.parser.add_argument("--max_ctx_l", type=int, default=128, 82 | help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100") 83 | self.parser.add_argument("--train_path", type=str, default=None) 84 | self.parser.add_argument("--eval_path", type=str, default=None, 85 | help="Evaluating during training, for Dev set. If None, will only do training, " 86 | "anet_cap and charades_sta has no dev set, so None") 87 | self.parser.add_argument("--desc_bert_path", type=str, default=None) 88 | self.parser.add_argument("--sub_bert_path", type=str, default=None) 89 | self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") 90 | self.parser.add_argument("--q_feat_size", type=int, default=768, help="feature dim for sub feature") 91 | self.parser.add_argument("--ctx_mode", type=str, help="which context to use a combination of [video, sub, tef]", 92 | choices=["video", "sub", "video_sub", "tef", "video_tef", "sub_tef", "video_sub_tef"]) 93 | self.parser.add_argument("--video_duration_idx_path", type=str, default=None) 94 | self.parser.add_argument("--vid_feat_path", type=str, default="") 95 | self.parser.add_argument("--no_norm_vfeat", action="store_true", 96 | help="Do not do normalization on video feat, use it only when using resnet_i3d feat") 97 | self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") 98 | self.parser.add_argument("--clip_length", type=float, default=None, 99 | help="each video will be uniformly segmented into small clips, " 100 | "will automatically loaded from ProposalConfigs if None") 101 | self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") 102 | self.parser.add_argument("--max_position_embeddings", type=int, default=300) 103 | self.parser.add_argument("--hidden_size", type=int, default=384) 104 | self.parser.add_argument("--n_heads", type=int, default=8) 105 | self.parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs") 106 | self.parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers") 107 | self.parser.add_argument("--conv_kernel_size", type=int, default=5) 108 | self.parser.add_argument("--conv_stride", type=int, default=1) 109 | self.parser.add_argument("--initializer_range", type=float, default=0.02, help="initializer range for layers") 110 | # post processing 111 | self.parser.add_argument("--min_pred_l", type=int, default=2, 112 | help="constrain the [st, ed] with ed - st >= 2 (2 clips with length 1.5 each, 3 secs " 113 | "in total this is the min length for proposal-based backup_method)") 114 | self.parser.add_argument("--max_pred_l", type=int, default=16, 115 | help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total (16 clips " 116 | "with length 1.5 each, this is the max length for proposal-based 
backup_method)") 117 | self.parser.add_argument("--q2c_alpha", type=float, default=30, 118 | help="give more importance to top scored videos' spans, " 119 | "the new score will be: s_new = exp(alpha * s), " 120 | "higher alpha indicates more importance. Note s in [-1, 1]") 121 | self.parser.add_argument("--max_before_nms", type=int, default=200) 122 | self.parser.add_argument("--max_vcmr_video", type=int, default=100, help="re-ranking in top-max_vcmr_video") 123 | self.parser.add_argument("--nms_thd", type=float, default=-1, 124 | help="additionally use non-maximum suppression (or non-minimum suppression for " 125 | "distance) to post-processing the predictions. -1: do not use nms. 0.6 for " 126 | "charades_sta, 0.5 for anet_cap") 127 | 128 | def display_save(self, opt): 129 | args = vars(opt) 130 | # Display settings 131 | print("------------ Options -------------\n{}\n-------------------".format({str(k): str(v) for k, v in 132 | sorted(args.items())})) 133 | # Save settings 134 | if not isinstance(self, TestOptions): 135 | option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed 136 | save_json(args, option_file_path, save_pretty=True) 137 | 138 | def parse(self): 139 | if not self.initialized: 140 | self.initialize() 141 | opt = self.parser.parse_args() 142 | if opt.debug: 143 | opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) 144 | opt.no_core_driver = True 145 | opt.num_workers = 0 146 | opt.eval_query_bsz = 100 147 | if isinstance(self, TestOptions): 148 | # modify model_dir to absolute path 149 | opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) 150 | saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) 151 | for arg in saved_options: # use saved options to overwrite all BaseOptions args. 
152 | if arg not in ["results_root", "num_workers", "nms_thd", "debug", 153 | "eval_split_name", "eval_path", "eval_query_bsz", "eval_context_bsz", 154 | "max_pred_l", "min_pred_l", "external_inference_vr_res_path"]: 155 | setattr(opt, arg, saved_options[arg]) 156 | else: 157 | if opt.exp_id is None: 158 | raise ValueError("--exp_id is required for at a training option!") 159 | if opt.clip_length is None: 160 | opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] 161 | print("Loaded clip_length {} from proposal config file".format(opt.clip_length)) 162 | opt.results_dir = os.path.join(opt.results_root, "-".join([opt.dset_name, opt.ctx_mode, opt.exp_id, 163 | time.strftime("%Y_%m_%d_%H_%M_%S")])) 164 | mkdirp(opt.results_dir) 165 | # save a copy of current code 166 | code_dir = os.path.dirname(os.path.realpath(__file__)) 167 | code_zip_filename = os.path.join(opt.results_dir, "code.zip") 168 | make_zipfile(code_dir, code_zip_filename, enclosing_dir="code", exclude_dirs_substring="results", 169 | exclude_dirs=["results", "debug_results", "__pycache__"], 170 | exclude_extensions=[".pyc", ".ipynb", ".swap"],) 171 | self.display_save(opt) 172 | if "sub" in opt.ctx_mode: 173 | assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" 174 | if opt.hard_negative_start_epoch != -1: 175 | if opt.hard_pool_size > opt.bsz: 176 | print("[WARNING] hard_pool_size is larger than bsz") 177 | assert opt.stop_task in opt.eval_tasks_at_training 178 | opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) 179 | opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) 180 | opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) 181 | opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) 182 | opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") 183 | opt.h5driver = None if opt.no_core_driver else "core" 184 | # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 185 | opt.num_workers = 1 if opt.no_core_driver else opt.num_workers 186 | opt.pin_memory = not opt.no_pin_memory 187 | if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d 188 | assert opt.no_norm_vfeat 189 | if "tef" in opt.ctx_mode and "video" in opt.ctx_mode: 190 | opt.vid_feat_size += 2 191 | if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode: 192 | opt.sub_feat_size += 2 193 | self.opt = opt 194 | return opt 195 | 196 | 197 | class TestOptions(BaseOptions): 198 | """add additional options for evaluating""" 199 | def initialize(self): 200 | BaseOptions.initialize(self) 201 | # also need to specify --eval_split_name 202 | self.parser.add_argument("--eval_id", type=str, help="evaluation id") 203 | self.parser.add_argument("--model_dir", type=str, 204 | help="dir contains the model file, will be converted to absolute path afterwards") 205 | self.parser.add_argument("--tasks", type=str, nargs="+", 206 | choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"], 207 | help="Which tasks to run." 208 | "VCMR: Video Corpus Moment Retrieval;" 209 | "SVMR: Single Video Moment Retrieval;" 210 | "VR: regular Video Retrieval. 
(will be performed automatically with VCMR)") 211 | -------------------------------------------------------------------------------- /method_tvr/contrastive.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import torch.nn.functional as F 4 | 5 | 6 | def log_sum_exp(x, axis=None): 7 | """ 8 | Log sum exp function 9 | Args: 10 | x: Input. 11 | axis: Axis over which to perform sum. 12 | Returns: 13 | torch.Tensor: log sum exp 14 | """ 15 | x_max = torch.max(x, axis)[0] 16 | y = torch.log((torch.exp(x - x_max)).sum(axis)) + x_max 17 | return y 18 | 19 | 20 | def get_positive_expectation(p_samples, measure='JSD', average=True): 21 | """ 22 | Computes the positive part of a divergence / difference. 23 | Args: 24 | p_samples: Positive samples. 25 | measure: Measure to compute for. 26 | average: Average the result over samples. 27 | Returns: 28 | torch.Tensor 29 | """ 30 | log_2 = math.log(2.) 31 | if measure == 'GAN': 32 | Ep = - F.softplus(-p_samples) 33 | elif measure == 'JSD': 34 | Ep = log_2 - F.softplus(-p_samples) 35 | elif measure == 'X2': 36 | Ep = p_samples ** 2 37 | elif measure == 'KL': 38 | Ep = p_samples + 1. 39 | elif measure == 'RKL': 40 | Ep = -torch.exp(-p_samples) 41 | elif measure == 'DV': 42 | Ep = p_samples 43 | elif measure == 'H2': 44 | Ep = torch.ones_like(p_samples) - torch.exp(-p_samples) 45 | elif measure == 'W1': 46 | Ep = p_samples 47 | else: 48 | raise ValueError('Unknown measurement {}'.format(measure)) 49 | if average: 50 | return Ep.mean() 51 | else: 52 | return Ep 53 | 54 | 55 | def get_negative_expectation(q_samples, measure='JSD', average=True): 56 | """ 57 | Computes the negative part of a divergence / difference. 58 | Args: 59 | q_samples: Negative samples. 60 | measure: Measure to compute for. 61 | average: Average the result over samples. 62 | Returns: 63 | torch.Tensor 64 | """ 65 | log_2 = math.log(2.) 66 | if measure == 'GAN': 67 | Eq = F.softplus(-q_samples) + q_samples 68 | elif measure == 'JSD': 69 | Eq = F.softplus(-q_samples) + q_samples - log_2 70 | elif measure == 'X2': 71 | Eq = -0.5 * ((torch.sqrt(q_samples ** 2) + 1.) ** 2) 72 | elif measure == 'KL': 73 | Eq = torch.exp(q_samples) 74 | elif measure == 'RKL': 75 | Eq = q_samples - 1. 76 | elif measure == 'DV': 77 | Eq = log_sum_exp(q_samples, 0) - math.log(q_samples.size(0)) 78 | elif measure == 'H2': 79 | Eq = torch.exp(q_samples) - 1. 80 | elif measure == 'W1': 81 | Eq = q_samples 82 | else: 83 | raise ValueError('Unknown measurement {}'.format(measure)) 84 | if average: 85 | return Eq.mean() 86 | else: 87 | return Eq 88 | 89 | 90 | def batch_video_query_loss(video, query, match_labels, mask, measure='JSD'): 91 | """ 92 | QV-CL module 93 | Computing the Contrastive Loss between the video and query. 94 | :param video: video rep (bsz, Lv, dim) 95 | :param query: query rep (bsz, dim) 96 | :param match_labels: match labels (bsz, Lv) 97 | :param mask: mask (bsz, Lv) 98 | :param measure: estimator of the mutual information 99 | :return: L_{qv} 100 | """ 101 | # generate mask 102 | pos_mask = match_labels.type(torch.float32) # (bsz, Lv) 103 | neg_mask = (torch.ones_like(pos_mask) - pos_mask) * mask # (bsz, Lv) 104 | 105 | # compute scores 106 | query = query.unsqueeze(2) # (bsz, dim, 1) 107 | res = torch.matmul(video, query).squeeze(2) # (bsz, Lv) 108 | 109 | # computing expectation for the MI between the target moment (positive samples) and query. 
110 | E_pos = get_positive_expectation(res * pos_mask, measure, average=False) 111 | E_pos = torch.sum(E_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12) # (bsz, ) 112 | 113 | # computing expectation for the MI between clips except target moment (negative samples) and query. 114 | E_neg = get_negative_expectation(res * neg_mask, measure, average=False) 115 | E_neg = torch.sum(E_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12) # (bsz, ) 116 | 117 | E = E_neg - E_pos # (bsz, ) 118 | return torch.mean(E) 119 | 120 | 121 | def batch_video_video_loss(video, st_ed_indices, match_labels, mask, measure='JSD'): 122 | """ 123 | VV-CL module 124 | Computing the Contrastive loss between the start/end clips and the video 125 | :param video: video rep (bsz, Lv, dim) 126 | :param st_ed_indices: (bsz, 2) 127 | :param match_labels: match labels (bsz, Lv) 128 | :param mask: mask (bsz, Lv) 129 | :param measure: estimator of the mutual information 130 | :return: L_{vv} 131 | """ 132 | # generate mask 133 | pos_mask = match_labels.type(torch.float32) # (bsz, Lv) 134 | neg_mask = (torch.ones_like(pos_mask) - pos_mask) * mask # (bsz, Lv) 135 | 136 | # select start and end indices features 137 | st_indices, ed_indices = st_ed_indices[:, 0], st_ed_indices[:, 1] # (bsz, ) 138 | batch_indices = torch.arange(0, video.shape[0]).long() # (bsz, ) 139 | video_s = video[batch_indices, st_indices, :] # (bsz, dim) 140 | video_e = video[batch_indices, ed_indices, :] # (bsz, dim) 141 | 142 | # compute scores 143 | video_s = video_s.unsqueeze(2) # (bsz, dim, 1) 144 | res_s = torch.matmul(video, video_s).squeeze(2) # (bsz, Lv), fusion between the start clips and the video 145 | video_e = video_e.unsqueeze(2) # (bsz, dim, 1) 146 | res_e = torch.matmul(video, video_e).squeeze(2) # (bsz, Lv), fusion between the end clips and the video 147 | 148 | # start clips: MI expectation for all positive samples 149 | E_s_pos = get_positive_expectation(res_s * pos_mask, measure, average=False) 150 | E_s_pos = torch.sum(E_s_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12) # (bsz, ) 151 | # end clips: MI expectation for all positive samples 152 | E_e_pos = get_positive_expectation(res_e * pos_mask, measure, average=False) 153 | E_e_pos = torch.sum(E_e_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12) 154 | E_pos = E_s_pos + E_e_pos 155 | 156 | # start clips: MI expectation for all negative samples 157 | E_s_neg = get_negative_expectation(res_s * neg_mask, measure, average=False) 158 | E_s_neg = torch.sum(E_s_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12) 159 | 160 | # end clips: MI expectation for all negative samples 161 | E_e_neg = get_negative_expectation(res_e * neg_mask, measure, average=False) 162 | E_e_neg = torch.sum(E_e_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12) 163 | E_neg = E_s_neg + E_e_neg 164 | 165 | E = E_neg - E_pos # (bsz, ) 166 | return torch.mean(E) 167 | -------------------------------------------------------------------------------- /method_tvr/model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from easydict import EasyDict as edict 6 | from method_tvr.model_components import BertAttention, LinearLayer, BertSelfAttention, TrainablePositionalEncoding 7 | from method_tvr.model_components import MILNCELoss 8 | from method_tvr.contrastive import batch_video_query_loss 9 | 10 | 11 | class 
ReLoCLNet(nn.Module): 12 | def __init__(self, config): 13 | super(ReLoCLNet, self).__init__() 14 | self.config = config 15 | 16 | self.query_pos_embed = TrainablePositionalEncoding(max_position_embeddings=config.max_desc_l, 17 | hidden_size=config.hidden_size, dropout=config.input_drop) 18 | self.ctx_pos_embed = TrainablePositionalEncoding(max_position_embeddings=config.max_ctx_l, 19 | hidden_size=config.hidden_size, dropout=config.input_drop) 20 | 21 | self.query_input_proj = LinearLayer(config.query_input_size, config.hidden_size, layer_norm=True, 22 | dropout=config.input_drop, relu=True) 23 | 24 | self.query_encoder = BertAttention(edict(hidden_size=config.hidden_size, intermediate_size=config.hidden_size, 25 | hidden_dropout_prob=config.drop, num_attention_heads=config.n_heads, 26 | attention_probs_dropout_prob=config.drop)) 27 | self.query_encoder1 = copy.deepcopy(self.query_encoder) 28 | 29 | cross_att_cfg = edict(hidden_size=config.hidden_size, num_attention_heads=config.n_heads, 30 | attention_probs_dropout_prob=config.drop) 31 | # use_video 32 | self.video_input_proj = LinearLayer(config.visual_input_size, config.hidden_size, layer_norm=True, 33 | dropout=config.input_drop, relu=True) 34 | self.video_encoder1 = copy.deepcopy(self.query_encoder) 35 | self.video_encoder2 = copy.deepcopy(self.query_encoder) 36 | self.video_encoder3 = copy.deepcopy(self.query_encoder) 37 | self.video_cross_att = BertSelfAttention(cross_att_cfg) 38 | self.video_cross_layernorm = nn.LayerNorm(config.hidden_size) 39 | self.video_query_linear = nn.Linear(config.hidden_size, config.hidden_size) 40 | 41 | # use_sub 42 | self.sub_input_proj = LinearLayer(config.sub_input_size, config.hidden_size, layer_norm=True, 43 | dropout=config.input_drop, relu=True) 44 | self.sub_encoder1 = copy.deepcopy(self.query_encoder) 45 | self.sub_encoder2 = copy.deepcopy(self.query_encoder) 46 | self.sub_encoder3 = copy.deepcopy(self.query_encoder) 47 | self.sub_cross_att = BertSelfAttention(cross_att_cfg) 48 | self.sub_cross_layernorm = nn.LayerNorm(config.hidden_size) 49 | self.sub_query_linear = nn.Linear(config.hidden_size, config.hidden_size) 50 | 51 | self.modular_vector_mapping = nn.Linear(in_features=config.hidden_size, out_features=2, bias=False) 52 | 53 | conv_cfg = dict(in_channels=1, out_channels=1, kernel_size=config.conv_kernel_size, 54 | stride=config.conv_stride, padding=config.conv_kernel_size // 2, bias=False) 55 | self.merged_st_predictor = nn.Conv1d(**conv_cfg) 56 | self.merged_ed_predictor = nn.Conv1d(**conv_cfg) 57 | 58 | self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean") 59 | self.nce_criterion = MILNCELoss(reduction='mean') 60 | 61 | self.reset_parameters() 62 | 63 | def reset_parameters(self): 64 | """ Initialize the weights.""" 65 | def re_init(module): 66 | if isinstance(module, (nn.Linear, nn.Embedding)): 67 | # Slightly different from the TF version which uses truncated_normal for initialization 68 | # cf https://github.com/pytorch/pytorch/pull/5617 69 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 70 | elif isinstance(module, nn.LayerNorm): 71 | module.bias.data.zero_() 72 | module.weight.data.fill_(1.0) 73 | elif isinstance(module, nn.Conv1d): 74 | module.reset_parameters() 75 | if isinstance(module, nn.Linear) and module.bias is not None: 76 | module.bias.data.zero_() 77 | 78 | self.apply(re_init) 79 | 80 | def set_hard_negative(self, use_hard_negative, hard_pool_size): 81 | """use_hard_negative: bool; hard_pool_size: int, """ 82 | 
self.config.use_hard_negative = use_hard_negative 83 | self.config.hard_pool_size = hard_pool_size 84 | 85 | def set_train_st_ed(self, lw_st_ed): 86 | """pre-train video retrieval then span prediction""" 87 | self.config.lw_st_ed = lw_st_ed 88 | 89 | def forward(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, st_ed_indices, match_labels): 90 | """ 91 | Args: 92 | query_feat: (N, Lq, Dq) 93 | query_mask: (N, Lq) 94 | video_feat: (N, Lv, Dv) or None 95 | video_mask: (N, Lv) or None 96 | sub_feat: (N, Lv, Ds) or None 97 | sub_mask: (N, Lv) or None 98 | st_ed_indices: (N, 2), torch.LongTensor, 1st, 2nd columns are st, ed labels respectively. 99 | match_labels: (N, Lv), torch.LongTensor, matching labels for detecting foreground and background (not used) 100 | """ 101 | video_feat, sub_feat, mid_x_video_feat, mid_x_sub_feat, x_video_feat, x_sub_feat = self.encode_context( 102 | video_feat, video_mask, sub_feat, sub_mask, return_mid_output=True) 103 | video_query, sub_query, query_context_scores, st_prob, ed_prob = self.get_pred_from_raw_query( 104 | query_feat, query_mask, x_video_feat, video_mask, x_sub_feat, sub_mask, cross=False, 105 | return_query_feats=True) 106 | # frame level contrastive learning loss (FrameCL) 107 | loss_fcl = 0 108 | if self.config.lw_fcl != 0: 109 | loss_fcl_vq = batch_video_query_loss(mid_x_video_feat, video_query, match_labels, video_mask, measure='JSD') 110 | loss_fcl_sq = batch_video_query_loss(mid_x_sub_feat, sub_query, match_labels, sub_mask, measure='JSD') 111 | loss_fcl = (loss_fcl_vq + loss_fcl_sq) / 2.0 112 | loss_fcl = self.config.lw_fcl * loss_fcl 113 | # video level contrastive learning loss (VideoCL) 114 | loss_vcl = 0 115 | if self.config.lw_vcl != 0: 116 | mid_video_q2ctx_scores = self.get_unnormalized_video_level_scores(video_query, mid_x_video_feat, video_mask) 117 | mid_sub_q2ctx_scores = self.get_unnormalized_video_level_scores(sub_query, mid_x_sub_feat, sub_mask) 118 | mid_video_q2ctx_scores, _ = torch.max(mid_video_q2ctx_scores, dim=1) 119 | mid_sub_q2ctx_scores, _ = torch.max(mid_sub_q2ctx_scores, dim=1) 120 | mid_q2ctx_scores = (mid_video_q2ctx_scores + mid_sub_q2ctx_scores) / 2.0 121 | loss_vcl = self.nce_criterion(mid_q2ctx_scores) 122 | loss_vcl = self.config.lw_vcl * loss_vcl 123 | # moment localization loss 124 | loss_st_ed = 0 125 | if self.config.lw_st_ed != 0: 126 | loss_st = self.temporal_criterion(st_prob, st_ed_indices[:, 0]) 127 | loss_ed = self.temporal_criterion(ed_prob, st_ed_indices[:, 1]) 128 | loss_st_ed = loss_st + loss_ed 129 | loss_st_ed = self.config.lw_st_ed * loss_st_ed 130 | # video level retrieval loss 131 | loss_neg_ctx, loss_neg_q = 0, 0 132 | if self.config.lw_neg_ctx != 0 or self.config.lw_neg_q != 0: 133 | loss_neg_ctx, loss_neg_q = self.get_video_level_loss(query_context_scores) 134 | loss_neg_ctx = self.config.lw_neg_ctx * loss_neg_ctx 135 | loss_neg_q = self.config.lw_neg_q * loss_neg_q 136 | # sum loss 137 | loss = loss_fcl + loss_vcl + loss_st_ed + loss_neg_ctx + loss_neg_q 138 | return loss, {"loss_st_ed": float(loss_st_ed), "loss_fcl": float(loss_fcl), "loss_vcl": loss_vcl, 139 | "loss_neg_ctx": float(loss_neg_ctx), "loss_neg_q": float(loss_neg_q), "loss_overall": float(loss)} 140 | 141 | def encode_query(self, query_feat, query_mask): 142 | encoded_query = self.encode_input(query_feat, query_mask, self.query_input_proj, self.query_encoder, 143 | self.query_pos_embed) # (N, Lq, D) 144 | encoded_query = self.query_encoder1(encoded_query, query_mask.unsqueeze(1)) 145 | video_query, 
sub_query = self.get_modularized_queries(encoded_query, query_mask) # (N, D) * 2 146 | return video_query, sub_query 147 | 148 | def encode_context(self, video_feat, video_mask, sub_feat, sub_mask, return_mid_output=False): 149 | # encoding video and subtitle features, respectively 150 | encoded_video_feat = self.encode_input(video_feat, video_mask, self.video_input_proj, self.video_encoder1, 151 | self.ctx_pos_embed) 152 | encoded_sub_feat = self.encode_input(sub_feat, sub_mask, self.sub_input_proj, self.sub_encoder1, 153 | self.ctx_pos_embed) 154 | # cross encoding subtitle features 155 | x_encoded_video_feat = self.cross_context_encoder(encoded_video_feat, video_mask, encoded_sub_feat, sub_mask, 156 | self.video_cross_att, self.video_cross_layernorm) # (N, L, D) 157 | x_encoded_video_feat_ = self.video_encoder2(x_encoded_video_feat, video_mask.unsqueeze(1)) 158 | # cross encoding video features 159 | x_encoded_sub_feat = self.cross_context_encoder(encoded_sub_feat, sub_mask, encoded_video_feat, video_mask, 160 | self.sub_cross_att, self.sub_cross_layernorm) # (N, L, D) 161 | x_encoded_sub_feat_ = self.sub_encoder2(x_encoded_sub_feat, sub_mask.unsqueeze(1)) 162 | # additional self encoding process 163 | x_encoded_video_feat = self.video_encoder3(x_encoded_video_feat_, video_mask.unsqueeze(1)) 164 | x_encoded_sub_feat = self.sub_encoder3(x_encoded_sub_feat_, sub_mask.unsqueeze(1)) 165 | if return_mid_output: 166 | return (encoded_video_feat, encoded_sub_feat, x_encoded_video_feat_, x_encoded_sub_feat_, 167 | x_encoded_video_feat, x_encoded_sub_feat) 168 | else: 169 | return x_encoded_video_feat, x_encoded_sub_feat 170 | 171 | @staticmethod 172 | def cross_context_encoder(main_context_feat, main_context_mask, side_context_feat, side_context_mask, 173 | cross_att_layer, norm_layer): 174 | """ 175 | Args: 176 | main_context_feat: (N, Lq, D) 177 | main_context_mask: (N, Lq) 178 | side_context_feat: (N, Lk, D) 179 | side_context_mask: (N, Lk) 180 | cross_att_layer: cross attention layer 181 | norm_layer: layer norm layer 182 | """ 183 | cross_mask = torch.einsum("bm,bn->bmn", main_context_mask, side_context_mask) # (N, Lq, Lk) 184 | cross_out = cross_att_layer(main_context_feat, side_context_feat, side_context_feat, cross_mask) # (N, Lq, D) 185 | residual_out = norm_layer(cross_out + main_context_feat) 186 | return residual_out 187 | 188 | @staticmethod 189 | def encode_input(feat, mask, input_proj_layer, encoder_layer, pos_embed_layer): 190 | """ 191 | Args: 192 | feat: (N, L, D_input), torch.float32 193 | mask: (N, L), torch.float32, with 1 indicates valid query, 0 indicates mask 194 | input_proj_layer: down project input 195 | encoder_layer: encoder layer 196 | pos_embed_layer: positional embedding layer 197 | """ 198 | feat = input_proj_layer(feat) 199 | feat = pos_embed_layer(feat) 200 | mask = mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor 201 | return encoder_layer(feat, mask) # (N, L, D_hidden) 202 | 203 | def get_modularized_queries(self, encoded_query, query_mask, return_modular_att=False): 204 | """ 205 | Args: 206 | encoded_query: (N, L, D) 207 | query_mask: (N, L) 208 | return_modular_att: bool 209 | """ 210 | modular_attention_scores = self.modular_vector_mapping(encoded_query) # (N, L, 2 or 1) 211 | modular_attention_scores = F.softmax(mask_logits(modular_attention_scores, query_mask.unsqueeze(2)), dim=1) 212 | modular_queries = torch.einsum("blm,bld->bmd", modular_attention_scores, encoded_query) # (N, 2 or 1, D) 213 | if return_modular_att: 214 | assert 
modular_queries.shape[1] == 2 215 | return modular_queries[:, 0], modular_queries[:, 1], modular_attention_scores 216 | else: 217 | assert modular_queries.shape[1] == 2 218 | return modular_queries[:, 0], modular_queries[:, 1] # (N, D) * 2 219 | 220 | @staticmethod 221 | def get_video_level_scores(modularied_query, context_feat, context_mask): 222 | """ Calculate video2query scores for each pair of video and query inside the batch. 223 | Args: 224 | modularied_query: (N, D) 225 | context_feat: (N, L, D), output of the first transformer encoder layer 226 | context_mask: (N, L) 227 | Returns: 228 | context_query_scores: (N, N) score of each query w.r.t. each video inside the batch, 229 | diagonal positions are positive. used to get negative samples. 230 | """ 231 | modularied_query = F.normalize(modularied_query, dim=-1) 232 | context_feat = F.normalize(context_feat, dim=-1) 233 | query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat) # (N, L, N) 234 | context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N) 235 | query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N) 236 | query_context_scores, _ = torch.max(query_context_scores, dim=1) # (N, N) diagonal positions are positive pairs 237 | return query_context_scores 238 | 239 | @staticmethod 240 | def get_unnormalized_video_level_scores(modularied_query, context_feat, context_mask): 241 | """ Calculate video2query scores for each pair of video and query inside the batch. 242 | Args: 243 | modularied_query: (N, D) 244 | context_feat: (N, L, D), output of the first transformer encoder layer 245 | context_mask: (N, L) 246 | Returns: 247 | context_query_scores: (N, N) score of each query w.r.t. each video inside the batch, 248 | diagonal positions are positive. used to get negative samples. 249 | """ 250 | query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat) # (N, L, N) 251 | context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N) 252 | query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N) 253 | return query_context_scores 254 | 255 | def get_merged_score(self, video_query, video_feat, sub_query, sub_feat, cross=False): 256 | video_query = self.video_query_linear(video_query) 257 | sub_query = self.sub_query_linear(sub_query) 258 | if cross: 259 | video_similarity = torch.einsum("md,nld->mnl", video_query, video_feat) 260 | sub_similarity = torch.einsum("md,nld->mnl", sub_query, sub_feat) 261 | similarity = (video_similarity + sub_similarity) / 2 # (Nq, Nv, L) from query to all videos. 
262 | else: 263 | video_similarity = torch.einsum("bd,bld->bl", video_query, video_feat) # (N, L) 264 | sub_similarity = torch.einsum("bd,bld->bl", sub_query, sub_feat) # (N, L) 265 | similarity = (video_similarity + sub_similarity) / 2 266 | return similarity 267 | 268 | def get_merged_st_ed_prob(self, similarity, context_mask, cross=False): 269 | if cross: 270 | n_q, n_c, length = similarity.shape 271 | similarity = similarity.view(n_q * n_c, 1, length) 272 | st_prob = self.merged_st_predictor(similarity).view(n_q, n_c, length) # (Nq, Nv, L) 273 | ed_prob = self.merged_ed_predictor(similarity).view(n_q, n_c, length) # (Nq, Nv, L) 274 | else: 275 | st_prob = self.merged_st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) 276 | ed_prob = self.merged_ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) 277 | st_prob = mask_logits(st_prob, context_mask) # (N, L) 278 | ed_prob = mask_logits(ed_prob, context_mask) 279 | return st_prob, ed_prob 280 | 281 | def get_pred_from_raw_query(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, cross=False, 282 | return_query_feats=False): 283 | """ 284 | Args: 285 | query_feat: (N, Lq, Dq) 286 | query_mask: (N, Lq) 287 | video_feat: (N, Lv, D) or None 288 | video_mask: (N, Lv) 289 | sub_feat: (N, Lv, D) or None 290 | sub_mask: (N, Lv) 291 | cross: 292 | return_query_feats: 293 | """ 294 | video_query, sub_query = self.encode_query(query_feat, query_mask) 295 | # get video-level retrieval scores 296 | video_q2ctx_scores = self.get_video_level_scores(video_query, video_feat, video_mask) 297 | sub_q2ctx_scores = self.get_video_level_scores(sub_query, sub_feat, sub_mask) 298 | q2ctx_scores = (video_q2ctx_scores + sub_q2ctx_scores) / 2 # (N, N) 299 | # compute start and end probs 300 | similarity = self.get_merged_score(video_query, video_feat, sub_query, sub_feat, cross=cross) 301 | st_prob, ed_prob = self.get_merged_st_ed_prob(similarity, video_mask, cross=cross) 302 | if return_query_feats: 303 | return video_query, sub_query, q2ctx_scores, st_prob, ed_prob 304 | else: 305 | return q2ctx_scores, st_prob, ed_prob # un-normalized masked probabilities!!!!! 306 | 307 | def get_video_level_loss(self, query_context_scores): 308 | """ ranking loss between (pos. query + pos. video) and (pos. query + neg. video) or (neg. query + pos. video) 309 | Args: 310 | query_context_scores: (N, N), cosine similarity [-1, 1], 311 | Each row contains the scores between the query to each of the videos inside the batch. 
312 | """ 313 | bsz = len(query_context_scores) 314 | diagonal_indices = torch.arange(bsz).to(query_context_scores.device) 315 | pos_scores = query_context_scores[diagonal_indices, diagonal_indices] # (N, ) 316 | query_context_scores_masked = copy.deepcopy(query_context_scores.data) 317 | # impossibly large for cosine similarity, the copy is created as modifying the original will cause error 318 | query_context_scores_masked[diagonal_indices, diagonal_indices] = 999 319 | pos_query_neg_context_scores = self.get_neg_scores(query_context_scores, query_context_scores_masked) 320 | neg_query_pos_context_scores = self.get_neg_scores(query_context_scores.transpose(0, 1), 321 | query_context_scores_masked.transpose(0, 1)) 322 | loss_neg_ctx = self.get_ranking_loss(pos_scores, pos_query_neg_context_scores) 323 | loss_neg_q = self.get_ranking_loss(pos_scores, neg_query_pos_context_scores) 324 | return loss_neg_ctx, loss_neg_q 325 | 326 | def get_neg_scores(self, scores, scores_masked): 327 | """ 328 | scores: (N, N), cosine similarity [-1, 1], 329 | Each row are scores: query --> all videos. Transposed version: video --> all queries. 330 | scores_masked: (N, N) the same as scores, except that the diagonal (positive) positions 331 | are masked with a large value. 332 | """ 333 | bsz = len(scores) 334 | batch_indices = torch.arange(bsz).to(scores.device) 335 | _, sorted_scores_indices = torch.sort(scores_masked, descending=True, dim=1) 336 | sample_min_idx = 1 # skip the masked positive 337 | sample_max_idx = min(sample_min_idx + self.config.hard_pool_size, bsz) if self.config.use_hard_negative else bsz 338 | # (N, ) 339 | sampled_neg_score_indices = sorted_scores_indices[batch_indices, torch.randint(sample_min_idx, sample_max_idx, 340 | size=(bsz,)).to(scores.device)] 341 | sampled_neg_scores = scores[batch_indices, sampled_neg_score_indices] # (N, ) 342 | return sampled_neg_scores 343 | 344 | def get_ranking_loss(self, pos_score, neg_score): 345 | """ Note here we encourage positive scores to be larger than negative scores. 346 | Args: 347 | pos_score: (N, ), torch.float32 348 | neg_score: (N, ), torch.float32 349 | """ 350 | if self.config.ranking_loss_type == "hinge": # max(0, m + S_neg - S_pos) 351 | return torch.clamp(self.config.margin + neg_score - pos_score, min=0).sum() / len(pos_score) 352 | elif self.config.ranking_loss_type == "lse": # log[1 + exp(S_neg - S_pos)] 353 | return torch.log1p(torch.exp(neg_score - pos_score)).sum() / len(pos_score) 354 | else: 355 | raise NotImplementedError("Only support 'hinge' and 'lse'") 356 | 357 | 358 | def mask_logits(target, mask): 359 | return target * mask + (1 - mask) * (-1e10) 360 | -------------------------------------------------------------------------------- /method_tvr/model_components.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def onehot(indexes, N=None): 8 | """ 9 | Creates a one-representation of indexes with N possible entries 10 | if N is not specified, it will suit the maximum index appearing. 
11 | indexes is a long-tensor of indexes 12 | """ 13 | if N is None: 14 | N = indexes.max() + 1 15 | sz = list(indexes.size()) 16 | output = indexes.new().long().resize_(*sz, N).zero_() 17 | output.scatter_(-1, indexes.unsqueeze(-1), 1) 18 | return output 19 | 20 | 21 | class SmoothedCrossEntropyLoss(nn.Module): 22 | def __init__(self, reduction='mean'): 23 | super(SmoothedCrossEntropyLoss, self).__init__() 24 | self.reduction = reduction 25 | 26 | def forward(self, logits, labels, smooth_eps=0.1, mask=None, from_logits=True): 27 | """ 28 | Args: 29 | logits: (N, Lv), unnormalized probabilities, torch.float32 30 | labels: (N, Lv) or (N, ), one hot labels or indices labels, torch.float32 or torch.int64 31 | smooth_eps: float 32 | mask: (N, Lv) 33 | from_logits: bool 34 | """ 35 | if from_logits: 36 | probs = F.log_softmax(logits, dim=-1) 37 | else: 38 | probs = logits 39 | num_classes = probs.size()[-1] 40 | if len(probs.size()) > len(labels.size()): 41 | labels = onehot(labels, num_classes).type(probs.dtype) 42 | if mask is None: 43 | labels = labels * (1 - smooth_eps) + smooth_eps / num_classes 44 | else: 45 | mask = mask.type(probs.dtype) 46 | valid_samples = torch.sum(mask, dim=-1, keepdim=True, dtype=probs.dtype) # (N, 1) 47 | eps_per_sample = smooth_eps / valid_samples 48 | labels = (labels * (1 - smooth_eps) + eps_per_sample) * mask 49 | loss = -torch.sum(labels * probs, dim=-1) 50 | if self.reduction == 'sum': 51 | return torch.sum(loss) 52 | elif self.reduction == 'mean': 53 | return torch.mean(loss) 54 | else: 55 | return loss # (N, ) 56 | 57 | 58 | class MILNCELoss(nn.Module): 59 | def __init__(self, reduction='mean'): 60 | super(MILNCELoss, self).__init__() 61 | self.reduction = reduction 62 | 63 | def forward(self, q2ctx_scores=None, contexts=None, queries=None): 64 | if q2ctx_scores is None: 65 | assert contexts is not None and queries is not None 66 | x = torch.matmul(contexts, queries.t()) 67 | device = contexts.device 68 | bsz = contexts.shape[0] 69 | else: 70 | x = q2ctx_scores 71 | device = q2ctx_scores.device 72 | bsz = q2ctx_scores.shape[0] 73 | x = x.view(bsz, bsz, -1) 74 | nominator = x * torch.eye(x.shape[0], dtype=torch.float32, device=device)[:, :, None] 75 | nominator = nominator.sum(dim=1) 76 | nominator = torch.logsumexp(nominator, dim=1) 77 | denominator = torch.cat((x, x.permute(1, 0, 2)), dim=1).view(x.shape[0], -1) 78 | denominator = torch.logsumexp(denominator, dim=1) 79 | if self.reduction: 80 | return torch.mean(denominator - nominator) 81 | else: 82 | return denominator - nominator 83 | 84 | 85 | class DepthwiseSeparableConv(nn.Module): 86 | """ 87 | Depth-wise separable convolution uses less parameters to generate output by convolution. 88 | :Examples: 89 | >>> m = DepthwiseSeparableConv(300, 200, 5, dim=1) 90 | >>> input_tensor = torch.randn(32, 300, 20) 91 | >>> output = m(input_tensor) 92 | """ 93 | def __init__(self, in_ch, out_ch, k, dim=1, relu=True): 94 | """ 95 | :param in_ch: input hidden dimension size 96 | :param out_ch: output hidden dimension size 97 | :param k: kernel size 98 | :param dim: default 1. 
1D conv or 2D conv 99 | """ 100 | super(DepthwiseSeparableConv, self).__init__() 101 | self.relu = relu 102 | if dim == 1: 103 | self.depthwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=in_ch, kernel_size=k, groups=in_ch, 104 | padding=k // 2) 105 | self.pointwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, kernel_size=1, padding=0) 106 | elif dim == 2: 107 | self.depthwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=in_ch, kernel_size=k, groups=in_ch, 108 | padding=k // 2) 109 | self.pointwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=1, padding=0) 110 | else: 111 | raise Exception("Incorrect dimension!") 112 | 113 | def forward(self, x): 114 | """ 115 | :Input: (N, L_in, D) 116 | :Output: (N, L_out, D) 117 | """ 118 | x = x.transpose(1, 2) 119 | if self.relu: 120 | out = F.relu(self.pointwise_conv(self.depthwise_conv(x)), inplace=True) 121 | else: 122 | out = self.pointwise_conv(self.depthwise_conv(x)) 123 | return out.transpose(1, 2) # (N, L, D) 124 | 125 | 126 | class ConvEncoder(nn.Module): 127 | def __init__(self, kernel_size=7, n_filters=128, dropout=0.1): 128 | super(ConvEncoder, self).__init__() 129 | self.dropout = nn.Dropout(dropout) 130 | self.layer_norm = nn.LayerNorm(n_filters) 131 | self.conv = DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True) 132 | 133 | def forward(self, x): 134 | """ 135 | :param x: (N, L, D) 136 | :return: (N, L, D) 137 | """ 138 | return self.layer_norm(self.dropout(self.conv(x)) + x) # (N, L, D) 139 | 140 | 141 | class TrainablePositionalEncoding(nn.Module): 142 | """Construct the embeddings from word, position and token_type embeddings.""" 143 | def __init__(self, max_position_embeddings, hidden_size, dropout=0.1): 144 | super(TrainablePositionalEncoding, self).__init__() 145 | self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) 146 | self.LayerNorm = nn.LayerNorm(hidden_size) 147 | self.dropout = nn.Dropout(dropout) 148 | 149 | def forward(self, input_feat): 150 | bsz, seq_length = input_feat.shape[:2] 151 | position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device) 152 | position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) 153 | position_embeddings = self.position_embeddings(position_ids) 154 | embeddings = self.LayerNorm(input_feat + position_embeddings) 155 | embeddings = self.dropout(embeddings) 156 | return embeddings 157 | 158 | def add_position_emb(self, input_feat): 159 | bsz, seq_length = input_feat.shape[:2] 160 | position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device) 161 | position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) 162 | position_embeddings = self.position_embeddings(position_ids) 163 | return input_feat + position_embeddings 164 | 165 | 166 | class LinearLayer(nn.Module): 167 | """linear layer configurable with layer normalization, dropout, ReLU.""" 168 | def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True): 169 | super(LinearLayer, self).__init__() 170 | self.relu = relu 171 | self.layer_norm = layer_norm 172 | if layer_norm: 173 | self.LayerNorm = nn.LayerNorm(in_hsz) 174 | layers = [nn.Dropout(dropout), nn.Linear(in_hsz, out_hsz)] 175 | self.net = nn.Sequential(*layers) 176 | 177 | def forward(self, x): 178 | """(N, L, D)""" 179 | if self.layer_norm: 180 | x = self.LayerNorm(x) 181 | x = self.net(x) 182 | if self.relu: 183 | x = F.relu(x, inplace=True) 184 | return x # (N, L, D) 185 | 186 | 187 | class 
BertLayer(nn.Module): 188 | def __init__(self, config, use_self_attention=True): 189 | super(BertLayer, self).__init__() 190 | self.use_self_attention = use_self_attention 191 | if use_self_attention: 192 | self.attention = BertAttention(config) 193 | self.intermediate = BertIntermediate(config) 194 | self.output = BertOutput(config) 195 | 196 | def forward(self, hidden_states, attention_mask): 197 | """ 198 | Args: 199 | hidden_states: (N, L, D) 200 | attention_mask: (N, L) with 1 indicate valid, 0 indicates invalid 201 | """ 202 | if self.use_self_attention: 203 | attention_output = self.attention(hidden_states, attention_mask) 204 | else: 205 | attention_output = hidden_states 206 | intermediate_output = self.intermediate(attention_output) 207 | layer_output = self.output(intermediate_output, attention_output) 208 | return layer_output 209 | 210 | 211 | class BertAttention(nn.Module): 212 | def __init__(self, config): 213 | super(BertAttention, self).__init__() 214 | self.self = BertSelfAttention(config) 215 | self.output = BertSelfOutput(config) 216 | 217 | def forward(self, input_tensor, attention_mask): 218 | """ 219 | Args: 220 | input_tensor: (N, L, D) 221 | attention_mask: (N, L) 222 | """ 223 | self_output = self.self(input_tensor, input_tensor, input_tensor, attention_mask) 224 | attention_output = self.output(self_output, input_tensor) 225 | return attention_output 226 | 227 | 228 | class BertIntermediate(nn.Module): 229 | def __init__(self, config): 230 | super(BertIntermediate, self).__init__() 231 | self.dense = nn.Sequential(nn.Linear(config.hidden_size, config.intermediate_size), nn.ReLU(True)) 232 | 233 | def forward(self, hidden_states): 234 | return self.dense(hidden_states) 235 | 236 | 237 | class BertOutput(nn.Module): 238 | def __init__(self, config): 239 | super(BertOutput, self).__init__() 240 | self.dense = nn.Linear(config.intermediate_size, config.hidden_size) 241 | self.LayerNorm = nn.LayerNorm(config.hidden_size) 242 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 243 | 244 | def forward(self, hidden_states, input_tensor): 245 | hidden_states = self.dense(hidden_states) 246 | hidden_states = self.dropout(hidden_states) 247 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 248 | return hidden_states 249 | 250 | 251 | class BertSelfAttention(nn.Module): 252 | def __init__(self, config): 253 | super(BertSelfAttention, self).__init__() 254 | if config.hidden_size % config.num_attention_heads != 0: 255 | raise ValueError("The hidden size (%d) is not a multiple of the number of attention heads (%d)" % ( 256 | config.hidden_size, config.num_attention_heads)) 257 | self.num_attention_heads = config.num_attention_heads 258 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 259 | self.all_head_size = self.num_attention_heads * self.attention_head_size 260 | self.query = nn.Linear(config.hidden_size, self.all_head_size) 261 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 262 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 263 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 264 | 265 | def transpose_for_scores(self, x): 266 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) # (N, L, nh, dh) 267 | x = x.view(*new_x_shape) 268 | return x.permute(0, 2, 1, 3) # (N, nh, L, dh) 269 | 270 | def forward(self, query_states, key_states, value_states, attention_mask): 271 | """ 272 | Args: 273 | query_states: (N, Lq, D) 274 | key_states: (N, L, D) 
275 | value_states: (N, L, D) 276 | attention_mask: (N, Lq, L) 277 | """ 278 | # only need to mask the dimension where the softmax (last dim) is applied, as another dim (second last) 279 | # will be ignored in future computation anyway 280 | attention_mask = (1 - attention_mask.unsqueeze(1)) * -10000. # (N, 1, Lq, L) 281 | mixed_query_layer = self.query(query_states) 282 | mixed_key_layer = self.key(key_states) 283 | mixed_value_layer = self.value(value_states) 284 | # transpose 285 | query_layer = self.transpose_for_scores(mixed_query_layer) # (N, nh, Lq, dh) 286 | key_layer = self.transpose_for_scores(mixed_key_layer) # (N, nh, L, dh) 287 | value_layer = self.transpose_for_scores(mixed_value_layer) # (N, nh, L, dh) 288 | # Take the dot product between "query" and "key" to get the raw attention scores. 289 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # (N, nh, Lq, L) 290 | attention_scores = attention_scores / math.sqrt(self.attention_head_size) 291 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function) 292 | attention_scores = attention_scores + attention_mask 293 | # Normalize the attention scores to probabilities. 294 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 295 | # This is actually dropping out entire tokens to attend to, which might 296 | # seem a bit unusual, but is taken from the original Transformer paper. 297 | attention_probs = self.dropout(attention_probs) 298 | # compute output context 299 | context_layer = torch.matmul(attention_probs, value_layer) 300 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 301 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 302 | context_layer = context_layer.view(*new_context_layer_shape) 303 | return context_layer 304 | 305 | 306 | class BertSelfOutput(nn.Module): 307 | def __init__(self, config): 308 | super(BertSelfOutput, self).__init__() 309 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 310 | self.LayerNorm = nn.LayerNorm(config.hidden_size) 311 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 312 | 313 | def forward(self, hidden_states, input_tensor): 314 | hidden_states = self.dense(hidden_states) 315 | hidden_states = self.dropout(hidden_states) 316 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 317 | return hidden_states 318 | -------------------------------------------------------------------------------- /method_tvr/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """PyTorch optimization for BERT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | import logging 23 | import abc 24 | import sys 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | if sys.version_info >= (3, 4): 30 | ABC = abc.ABC 31 | else: 32 | ABC = abc.ABCMeta('ABC', (), {}) 33 | 34 | 35 | class _LRSchedule(ABC): 36 | """ Parent of all LRSchedules here. """ 37 | warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense 38 | 39 | def __init__(self, warmup=0.002, t_total=-1, **kw): 40 | """ 41 | :param warmup: what fraction of t_total steps will be used for linear warmup 42 | :param t_total: how many training steps (updates) are planned 43 | :param kw: 44 | """ 45 | super(_LRSchedule, self).__init__(**kw) 46 | if t_total < 0: 47 | logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) 48 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 49 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 50 | warmup = max(warmup, 0.) 51 | self.warmup, self.t_total = float(warmup), float(t_total) 52 | self.warned_for_t_total_at_progress = -1 53 | 54 | def get_lr(self, step, nowarn=False): 55 | """ 56 | :param step: which of t_total steps we're on 57 | :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps 58 | :return: learning rate multiplier for current update 59 | """ 60 | if self.t_total < 0: 61 | return 1. 62 | progress = float(step) / self.t_total 63 | ret = self.get_lr_(progress) 64 | # warning for exceeding t_total (only active with warmup_linear 65 | if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: 66 | logger.warning("Training beyond specified 't_total'. Learning rate multiplier set to {}. Please " 67 | "set 't_total' of {} correctly.".format(ret, self.__class__.__name__)) 68 | self.warned_for_t_total_at_progress = progress 69 | # end warning 70 | return ret 71 | 72 | @abc.abstractmethod 73 | def get_lr_(self, progress): 74 | """ 75 | :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress 76 | :return: learning rate multiplier for current update 77 | """ 78 | return 1. 79 | 80 | 81 | class ConstantLR(_LRSchedule): 82 | def get_lr_(self, progress): 83 | return 1. 84 | 85 | 86 | class WarmupCosineSchedule(_LRSchedule): 87 | """ 88 | Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. 89 | Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. 90 | If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. 91 | """ 92 | warn_t_total = True 93 | 94 | def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): 95 | """ 96 | :param warmup: see LRSchedule 97 | :param t_total: see LRSchedule 98 | :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. 99 | at progress==warmup and 0 at progress==1. 
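For example, with the default cycles=0.5 and warmup=0.1, the multiplier ramps linearly from 0 to 1 over the first 10% of updates and then decays as 0.5 * (1 + cos(pi * p)), where p is the fraction of progress after warmup (this simply restates get_lr_ below).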
100 | :param kw: 101 | """ 102 | super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) 103 | self.cycles = cycles 104 | 105 | def get_lr_(self, progress): 106 | if progress < self.warmup: 107 | return progress / self.warmup 108 | else: 109 | progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup 110 | return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) 111 | 112 | 113 | class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): 114 | """ 115 | Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. 116 | If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying 117 | learning rate (with hard restarts). 118 | """ 119 | def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): 120 | super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) 121 | assert(cycles >= 1.) 122 | 123 | def get_lr_(self, progress): 124 | if progress < self.warmup: 125 | return progress / self.warmup 126 | else: 127 | progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup 128 | ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) 129 | return ret 130 | 131 | 132 | class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): 133 | """ 134 | All training progress is divided in `cycles` (default=1.) parts of equal length. 135 | Every part follows a schedule with the first `warmup` fraction of training steps linearly increasing from 0. to 1., 136 | followed by a learning rate decreasing from 1. to 0. following a cosine curve. 137 | """ 138 | def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): 139 | assert(warmup * cycles < 1.) 140 | warmup = warmup * cycles if warmup >= 0 else warmup 141 | super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, 142 | **kw) 143 | 144 | def get_lr_(self, progress): 145 | progress = progress * self.cycles % 1. 146 | if progress < self.warmup: 147 | return progress / self.warmup 148 | else: 149 | progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup 150 | ret = 0.5 * (1. + math.cos(math.pi * progress)) 151 | return ret 152 | 153 | 154 | class WarmupConstantSchedule(_LRSchedule): 155 | """ 156 | Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. 157 | Keeps learning rate equal to 1. after warmup. 158 | """ 159 | def get_lr_(self, progress): 160 | if progress < self.warmup: 161 | return progress / self.warmup 162 | return 1. 163 | 164 | 165 | class WarmupLinearSchedule(_LRSchedule): 166 | """ 167 | Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. 168 | Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. 169 | """ 170 | warn_t_total = True 171 | 172 | def get_lr_(self, progress): 173 | if progress < self.warmup: 174 | return progress / self.warmup 175 | return max((progress - 1.) / (self.warmup - 1.), 0.) 176 | 177 | 178 | SCHEDULES = { 179 | None: ConstantLR, 180 | "none": ConstantLR, 181 | "warmup_cosine": WarmupCosineSchedule, 182 | "warmup_constant": WarmupConstantSchedule, 183 | "warmup_linear": WarmupLinearSchedule 184 | } 185 | 186 | 187 | class EMA(object): 188 | """ Exponential Moving Average for model parameters. 
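The shadow values are updated as shadow = decay * shadow + (1 - decay) * param, where the effective decay is min(self.decay, (1 + step) / (10 + step)), so early updates lean more heavily on the raw parameters.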
189 | references: 190 | [1] https://github.com/BangLiu/QANet-PyTorch/blob/master/model/modules/ema.py 191 | [2] https://github.com/hengruo/QANet-pytorch/blob/e2de07cd2c711d525f5ffee35c3764335d4b501d/main.py""" 192 | def __init__(self, decay): 193 | self.decay = decay 194 | self.shadow = {} 195 | self.original = {} 196 | 197 | def register(self, name, val): 198 | self.shadow[name] = val.clone() 199 | 200 | def __call__(self, model, step): 201 | decay = min(self.decay, (1 + step) / (10.0 + step)) 202 | for name, param in model.named_parameters(): 203 | if param.requires_grad: 204 | assert name in self.shadow 205 | new_average = \ 206 | (1.0 - decay) * param.data + decay * self.shadow[name] 207 | self.shadow[name] = new_average.clone() 208 | 209 | def assign(self, model): 210 | for name, param in model.named_parameters(): 211 | if param.requires_grad: 212 | assert name in self.shadow 213 | self.original[name] = param.data.clone() 214 | param.data = self.shadow[name] 215 | 216 | def resume(self, model): 217 | for name, param in model.named_parameters(): 218 | if param.requires_grad: 219 | assert name in self.shadow 220 | param.data = self.original[name] 221 | 222 | 223 | class BertAdam(Optimizer): 224 | """Implements BERT version of Adam algorithm with weight decay fix. 225 | Params: 226 | lr: learning rate 227 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 228 | t_total: total number of training steps for the learning 229 | rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 230 | schedule: schedule to use for the warmup (see above). 231 | Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object 232 | (see below). 233 | If `None` or `'none'`, learning rate is always kept constant. 234 | Default : `'warmup_linear'` 235 | b1: Adams b1. Default: 0.9 236 | b2: Adams b2. Default: 0.999 237 | e: Adams epsilon. Default: 1e-6 238 | weight_decay: Weight decay. Default: 0.01 239 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 240 | """ 241 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 242 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): 243 | if lr is not required and lr < 0.0: 244 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 245 | if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: 246 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 247 | if not 0.0 <= b1 < 1.0: 248 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 249 | if not 0.0 <= b2 < 1.0: 250 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 251 | if not e >= 0.0: 252 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 253 | # initialize schedule object 254 | if not isinstance(schedule, _LRSchedule): 255 | schedule_type = SCHEDULES[schedule] 256 | schedule = schedule_type(warmup=warmup, t_total=t_total) 257 | else: 258 | if warmup != -1 or t_total != -1: 259 | logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is " 260 | "provided as schedule. 
Please specify custom warmup and t_total in _LRSchedule object.") 261 | defaults = dict(lr=lr, schedule=schedule, 262 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 263 | max_grad_norm=max_grad_norm) 264 | super(BertAdam, self).__init__(params, defaults) 265 | 266 | def get_lr(self): 267 | lr = [] 268 | for group in self.param_groups: 269 | for p in group['params']: 270 | state = self.state[p] 271 | if len(state) == 0: 272 | return [0] 273 | lr_scheduled = group['lr'] 274 | lr_scheduled *= group['schedule'].get_lr(state['step']) 275 | lr.append(lr_scheduled) 276 | return lr 277 | 278 | def step(self, closure=None): 279 | """Performs a single optimization step. 280 | 281 | Arguments: 282 | closure (callable, optional): A closure that reevaluates the model 283 | and returns the loss. 284 | """ 285 | loss = None 286 | if closure is not None: 287 | loss = closure() 288 | 289 | for group in self.param_groups: 290 | for p in group['params']: 291 | if p.grad is None: 292 | continue 293 | grad = p.grad.data 294 | if grad.is_sparse: 295 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 296 | 297 | state = self.state[p] 298 | 299 | # State initialization 300 | if len(state) == 0: 301 | state['step'] = 0 302 | # Exponential moving average of gradient values 303 | state['next_m'] = torch.zeros_like(p.data) 304 | # Exponential moving average of squared gradient values 305 | state['next_v'] = torch.zeros_like(p.data) 306 | 307 | next_m, next_v = state['next_m'], state['next_v'] 308 | beta1, beta2 = group['b1'], group['b2'] 309 | 310 | # Add grad clipping 311 | if group['max_grad_norm'] > 0: 312 | clip_grad_norm_(p, group['max_grad_norm']) 313 | 314 | # Decay the first and second moment running average coefficient 315 | # In-place operations to update the averages at the same time 316 | next_m.mul_(beta1).add_(1 - beta1, grad) 317 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 318 | update = next_m / (next_v.sqrt() + group['e']) 319 | 320 | # Just adding the square of the weights to the loss function is *not* 321 | # the correct way of using L2 regularization/weight decay with Adam, 322 | # since that will interact with the m and v parameters in strange ways. 323 | # 324 | # Instead we want to decay the weights in a manner that doesn't interact 325 | # with the m/v parameters. This is equivalent to adding the square 326 | # of the weights to the loss with plain (non-momentum) SGD. 
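# In effect, the decoupled update applied below is: update = m_t / (sqrt(v_t) + eps) + weight_decay * w, followed by w <- w - lr_scheduled * update, i.e. the decay term acts directly on the weights rather than being folded into the gradient.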
327 | if group['weight_decay'] > 0.0: 328 | update += group['weight_decay'] * p.data 329 | 330 | lr_scheduled = group['lr'] 331 | lr_scheduled *= group['schedule'].get_lr(state['step']) 332 | 333 | update_with_lr = lr_scheduled * update 334 | p.data.add_(-update_with_lr) 335 | 336 | state['step'] += 1 337 | 338 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 339 | # No bias correction 340 | # bias_correction1 = 1 - beta1 ** state['step'] 341 | # bias_correction2 = 1 - beta2 ** state['step'] 342 | 343 | return loss 344 | -------------------------------------------------------------------------------- /method_tvr/proposal.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2018 Victor Escorcia Castillo 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | # ============================================================================== 23 | """ 24 | Group multiple methods to generate salient temporal windows in a video""" 25 | import itertools 26 | import numpy as np 27 | 28 | PROPOSAL_SCHEMES = ['DidemoICCV17SS', 'SlidingWindowMSRSS'] 29 | 30 | 31 | class TemporalProposalsBase: 32 | """Base class (signature) to generate temporal candidate in video""" 33 | def __call__(self, video_id, metadata=None, feature_collection=None): 34 | raise NotImplementedError('Implement with the signature above') 35 | 36 | 37 | class DidemoICCV17SS(TemporalProposalsBase): 38 | """Original search space of moments proposed in ICCV-2017 39 | 40 | Attributes: 41 | clip_length_min (float) : minimum length, in seconds, of a video clip. 42 | proposals (numpy array) : of shape [21, 2] representing all the 43 | possible temporal segments of valid annotations of DiDeMo dataset. 44 | It represents the search space of a temporal localization 45 | algorithm. 46 | 47 | Reference: Hendricks et al. Localizing Moments in Video with Natural 48 | Language. ICCV 2017. 
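In total there are 6 single-clip segments plus C(6, 2) = 15 multi-clip segments, i.e. 21 proposals; a clip-index pair (i, j) maps to the window [5 * i, 5 * (j + 1)] seconds.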
49 | """ 50 | clip_length_min = 5.0 51 | 52 | def __init__(self, *args, dtype=np.float32, **kwargs): 53 | clips_indices = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)] 54 | for i in itertools.combinations(range(len(clips_indices)), 2): 55 | clips_indices.append(i) 56 | self.proposals = np.array(clips_indices, dtype=dtype) 57 | self.proposals *= self.clip_length_min 58 | self.proposals[:, 1] += self.clip_length_min 59 | 60 | def __call__(self, *args, **kwargs): 61 | return self.proposals 62 | 63 | 64 | class SlidingWindowMSRSS(TemporalProposalsBase): 65 | """Multi-scale sliding window with relative stride within the same scale 66 | 67 | Attributes: 68 | length (float) : length of smallest window. 69 | scales (sequence of int) : duration of moments relative to 70 | `length`. 71 | stride (float) : relative stride between two windows with the same 72 | duration. We used different strides for each scale rounding it 73 | towards a multiple of `length`. Note that the minimum stride is 74 | `length` for any window will be the `length` itself. 75 | dtype (numpy.dtype) : 76 | """ 77 | 78 | def __init__(self, length, scales, stride=0.5, round_base=0.5, dtype=np.float32): 79 | self.length = length 80 | self.scales = scales 81 | self.round_base = round_base 82 | self.relative_stride = stride 83 | # pick strides per scale that are multiples of length 84 | self.strides = [max(round(s * stride / round_base) * round_base, round_base) 85 | * length for s in scales] 86 | self.dtype = dtype 87 | assert len(scales) > 0 88 | 89 | def sliding_windows(self, t_end, t_start=0): 90 | """sliding canonical windows over a given time interval""" 91 | windows_ = [] 92 | for i, stride in enumerate(self.strides): 93 | num_i = np.ceil((t_end - t_start) / stride) 94 | windows_i = np.empty((int(num_i), 2), dtype=np.float32) 95 | windows_i[:, 0] = np.arange(t_start, t_end, stride) 96 | windows_i[:, 1] = windows_i[:, 0] + self.length * self.scales[i] 97 | windows_i[windows_i[:, 1] > t_end, 1] = t_end 98 | windows_.append(windows_i) 99 | # print("--------------------------------{}".format(i)) 100 | # print(windows_i) 101 | # import sys 102 | # sys.exit(1) 103 | windows = np.concatenate(windows_, axis=0) 104 | # Hacky way to make windows fit inside video 105 | # It implies windows at the end may not belong to the set spanned by 106 | # length and scales. 
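# For reference, with the TVR configuration defined below (length=3, scales=[1, 2, 4, 8], stride=0.3, round_base=1), the window widths are 3/6/12/24 seconds and the per-scale strides work out to roughly 3/3/3/6 seconds.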
107 | return np.unique(windows, axis=0) 108 | 109 | def __call__(self, video_id, metadata=None, feature_collection=None): 110 | """return: (N_window, 2), each row contains (start, end)""" 111 | duration = metadata.get('duration') 112 | assert duration is not None 113 | return self.sliding_windows(duration) 114 | 115 | 116 | ProposalConfigs = { 117 | "didemo": { 118 | "proposal_interface": "DidemoICCV17SS", 119 | "clip_length": 2.5, 120 | }, 121 | "tvr": { 122 | "length": 3, # min proposal length 123 | "scales": [1, 2, 4, 8], 124 | "stride": 0.3, 125 | "round_base": 1, 126 | "min_proposal_length": 3, # length * min(scales) 127 | "clip_length": 1.5, # length should be divisible by clip_length 128 | "proposal_interface": "SlidingWindowMSRSS", 129 | }, 130 | "anet_cap": { 131 | "length": 5, 132 | "scales": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26], 133 | "stride": 0.3, 134 | "round_base": 1, 135 | "min_proposal_length": 10, # length * min(scales) 136 | "clip_length": 5, # length * min(scales) / 2 137 | "proposal_interface": "SlidingWindowMSRSS", 138 | }, 139 | "charades_sta": { 140 | "length": 3, 141 | "scales": [2, 3, 4, 5, 6, 7, 8], 142 | "stride": 0.3, 143 | "round_base": 1, 144 | "min_proposal_length": 6, # length * min(scales) 145 | "clip_length": 3, # length * min(scales) / 2 146 | "proposal_interface": "SlidingWindowMSRSS", 147 | }, 148 | "profiling": { 149 | "length": 5, 150 | "scales": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 151 | "stride": 0.3, 152 | "round_base": 1, 153 | "clip_length": 5, # length * min(scales) / 2 154 | "proposal_interface": "SlidingWindowMSRSS", 155 | }, 156 | } 157 | """ 158 | 'min_clip_length' is used to uniformly segment the video into smaller clips, it is a half of 159 | the 'min_proposal_length'. Thus we can enforce each moment has at least 2 clips. 
160 | """ 161 | 162 | 163 | def get_proposal_interface(dset_name): 164 | """ dset_name (str): one of ["tvr"] """ 165 | assert dset_name in ProposalConfigs 166 | if dset_name == "didemo": 167 | return DidemoICCV17SS() 168 | else: 169 | arg_names = ["length", "scales", "stride", "round_base"] 170 | func_args = {k: ProposalConfigs[dset_name][k] for k in arg_names} 171 | return SlidingWindowMSRSS(**func_args) 172 | 173 | 174 | if __name__ == '__main__': 175 | test_fns_args = [(DidemoICCV17SS, (),), 176 | (SlidingWindowMSRSS, (1.5, [2, 4, 6, 12]))] 177 | for fn_i, args_i in test_fns_args: 178 | proposal_fn = fn_i(*args_i) 179 | x = proposal_fn('hola', {'duration': 15}) 180 | if fn_i == DidemoICCV17SS: 181 | assert len(x) == 21 182 | -------------------------------------------------------------------------------- /method_tvr/scripts/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # run at project root dir 3 | # Usage: 4 | # bash method/scripts/eval.sh ANY_OTHER_PYTHON_ARGS 5 | eval_split_name=$1 6 | submission_path=$2 7 | save_path=$3 8 | gt_path=data/tvr_${eval_split_name}_release.jsonl 9 | 10 | python standalone_eval/eval.py \ 11 | --gt_path ${gt_path} \ 12 | --submission_path ${submission_path} \ 13 | --save_path ${save_path} \ 14 | ${@:4} 15 | -------------------------------------------------------------------------------- /method_tvr/scripts/inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # run at project root dir 3 | # Usage: 4 | # bash method/scripts/inference.sh ANY_OTHER_PYTHON_ARGS 5 | model_dir=$1 6 | eval_split_name=$2 # [val] 7 | eval_path=data/tvr_${eval_split_name}_release.jsonl 8 | tasks=() 9 | tasks+=(VCMR) 10 | tasks+=(SVMR) 11 | tasks+=(VR) 12 | echo "tasks ${tasks[@]}" 13 | python method_tvr/inference.py \ 14 | --model_dir ${model_dir} \ 15 | --tasks ${tasks[@]} \ 16 | --eval_split_name ${eval_split_name} \ 17 | --eval_path ${eval_path} \ 18 | ${@:3} 19 | -------------------------------------------------------------------------------- /method_tvr/scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # run at project root dir 3 | # Usage: 4 | # bash method/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS 5 | # use --eval_tasks_at_training ["VR", "SVMR", "VCMR"] --stop_task ["VR", "SVMR", "VCMR"] for 6 | # use --lw_neg_q 0 --lw_neg_ctx 0 for training SVMR/SVMR only 7 | # use --lw_st_ed 0 for training with VR only 8 | dset_name=$1 # see case below 9 | ctx_mode=$2 # [video, sub, tef, video_sub, video_tef, sub_tef, video_sub_tef] 10 | vid_feat_type=$3 # [resnet, i3d, resnet_i3d] 11 | feature_root=data/tvr_feature_release 12 | results_root=method_tvr/results 13 | vid_feat_size=2048 14 | extra_args=() 15 | 16 | if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then 17 | if [[ ${dset_name} != "tvr" ]]; then 18 | echo "The use of subtitles is only supported in tvr." 
19 | exit 1 20 | fi 21 | fi 22 | 23 | 24 | case ${dset_name} in 25 | tvr) 26 | train_path=data/tvr_train_release.jsonl 27 | video_duration_idx_path=data/tvr_video2dur_idx.json 28 | desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 29 | if [[ ${vid_feat_type} == "i3d" ]]; then 30 | echo "Using I3D feature with shape 1024" 31 | vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 32 | vid_feat_size=1024 33 | elif [[ ${vid_feat_type} == "resnet" ]]; then 34 | echo "Using ResNet feature with shape 2048" 35 | vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 36 | vid_feat_size=2048 37 | elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then 38 | echo "Using concatenated ResNet and I3D feature with shape 2048+1024" 39 | vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 40 | vid_feat_size=3072 41 | extra_args+=(--no_norm_vfeat) # since they are already normalized. 42 | fi 43 | eval_split_name=val 44 | nms_thd=-1 45 | extra_args+=(--eval_path) 46 | extra_args+=(data/tvr_val_release.jsonl) 47 | clip_length=1.5 48 | # extra_args+=(--max_ctx_l) 49 | # extra_args+=(100) # max_ctx_l = 100 for clip_length = 1.5, only ~109/21825 has more than 100. 50 | extra_args+=(--max_pred_l) 51 | extra_args+=(16) 52 | if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then 53 | echo "Running with sub." 54 | desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite 55 | sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 56 | sub_feat_size=768 57 | extra_args+=(--sub_feat_size) 58 | extra_args+=(${sub_feat_size}) 59 | extra_args+=(--sub_bert_path) 60 | extra_args+=(${sub_bert_path}) 61 | fi 62 | ;; 63 | *) 64 | echo -n "Unknown argument" 65 | ;; 66 | esac 67 | 68 | echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" 69 | echo "Extra args ${extra_args[@]}" 70 | python method_tvr/train.py \ 71 | --dset_name=${dset_name} \ 72 | --eval_split_name=${eval_split_name} \ 73 | --nms_thd=${nms_thd} \ 74 | --results_root=${results_root} \ 75 | --train_path=${train_path} \ 76 | --desc_bert_path=${desc_bert_path} \ 77 | --video_duration_idx_path=${video_duration_idx_path} \ 78 | --vid_feat_path=${vid_feat_path} \ 79 | --clip_length=${clip_length} \ 80 | --vid_feat_size=${vid_feat_size} \ 81 | --ctx_mode=${ctx_mode} \ 82 | ${extra_args[@]} \ 83 | ${@:4} 84 | -------------------------------------------------------------------------------- /method_tvr/start_end_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import h5py 3 | import math 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import Dataset 7 | from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array, uniform_feature_sampling 8 | from utils.tensor_utils import pad_sequences_1d 9 | from method_tvr.config import BaseOptions 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class StartEndDataset(Dataset): 15 | """ 16 | Args: 17 | dset_name, str, ["tvr"] 18 | ctx_mode: str, 19 | Return: 20 | a dict: { 21 | "meta": { 22 | "desc_id": int, 23 | "desc": str, 24 | "vid_name": str, 25 | "duration": float, 26 | "ts": [st (float), ed (float)], seconds, ground_truth timestamps 27 | } 28 | "model_inputs": { 29 | "query_feat": torch.tensor, (L, D_q) 30 | "video_feat": torch.tensor, (n_clip_in_moment, D_video) 31 | "sub_feat": 
torch.tensor, (n_clip_in_moment, D_sub) 32 | "st_ed_indices": torch.LongTensor, (2, ) 33 | } 34 | } 35 | """ 36 | def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler, max_desc_len, 37 | max_ctx_len, vid_feat_path_or_handler, clip_length, ctx_mode="video", normalize_vfeat=True, 38 | normalize_tfeat=True, h5driver=None, data_ratio=1.0): 39 | self.dset_name = dset_name 40 | self.data_path = data_path 41 | self.data_ratio = data_ratio 42 | 43 | self.desc_bert_path_or_handler = desc_bert_path_or_handler 44 | self.max_desc_len = max_desc_len 45 | 46 | self.sub_bert_path_or_handler = sub_bert_path_or_handler 47 | self.max_ctx_len = max_ctx_len 48 | self.vid_feat_path_or_handler = vid_feat_path_or_handler 49 | self.clip_length = clip_length 50 | self.ctx_mode = ctx_mode 51 | 52 | # prepare desc data 53 | self.data = load_jsonl(data_path) 54 | if self.data_ratio != 1: 55 | n_examples = int(len(self.data) * data_ratio) 56 | self.data = self.data[:n_examples] 57 | logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) 58 | 59 | self.use_video = "video" in self.ctx_mode 60 | self.use_sub = "sub" in self.ctx_mode 61 | self.use_tef = "tef" in self.ctx_mode 62 | 63 | if self.use_video: 64 | if isinstance(vid_feat_path_or_handler, h5py.File): 65 | self.vid_feat_h5 = vid_feat_path_or_handler 66 | else: # str path 67 | self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) 68 | 69 | if isinstance(desc_bert_path_or_handler, h5py.File): 70 | self.desc_bert_h5 = desc_bert_path_or_handler 71 | else: 72 | self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) 73 | 74 | if self.use_sub: 75 | if isinstance(sub_bert_path_or_handler, h5py.File): 76 | self.sub_bert_h5 = sub_bert_path_or_handler 77 | else: # str path 78 | self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) 79 | 80 | self.normalize_vfeat = normalize_vfeat 81 | self.normalize_tfeat = normalize_tfeat 82 | 83 | def __len__(self): 84 | return len(self.data) 85 | 86 | def __getitem__(self, index): 87 | raw_data = self.data[index] 88 | # initialize with basic data 89 | meta = dict(desc_id=raw_data["desc_id"], desc=raw_data["desc"], vid_name=raw_data["vid_name"], 90 | duration=raw_data["duration"], ts=raw_data["ts"]) 91 | model_inputs = dict() 92 | model_inputs["query_feat"] = self.get_query_feat_by_desc_id(meta["desc_id"]) 93 | 94 | ctx_l = 0 95 | if self.use_video: 96 | video_feat = uniform_feature_sampling(self.vid_feat_h5[meta['vid_name']][:], self.max_ctx_len) 97 | if self.normalize_vfeat: 98 | video_feat = l2_normalize_np_array(video_feat) 99 | model_inputs["video_feat"] = torch.from_numpy(video_feat) 100 | ctx_l = len(video_feat) 101 | else: 102 | model_inputs["video_feat"] = torch.zeros((2, 2)) 103 | 104 | if self.use_sub: # no need for ctx feature, as the features are already contextualized 105 | sub_feat = uniform_feature_sampling(self.sub_bert_h5[meta["vid_name"]][:], self.max_ctx_len) 106 | if self.normalize_tfeat: 107 | sub_feat = l2_normalize_np_array(sub_feat) 108 | model_inputs["sub_feat"] = torch.from_numpy(sub_feat) 109 | ctx_l = len(sub_feat) 110 | else: 111 | model_inputs["sub_feat"] = torch.zeros((2, 2)) 112 | 113 | if self.use_tef: 114 | # note the tef features here are normalized clip indices (1.5 secs), instead of the original time (1 sec) 115 | ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l 116 | tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l 117 | tef_ed = 
torch.arange(1, ctx_l + 1, 1.0) / ctx_l 118 | tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) 119 | tef_feat = tef 120 | else: 121 | tef_feat = torch.zeros((2, 2)) 122 | 123 | if self.use_video and self.use_tef: # (N_clips, D + 2) 124 | model_inputs["video_feat"] = torch.cat([model_inputs["video_feat"], tef_feat], dim=1) 125 | if self.use_sub and self.use_tef: # (N_clips, D_t + 2) 126 | model_inputs["sub_feat"] = torch.cat([model_inputs["sub_feat"], tef_feat], dim=1) 127 | 128 | model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], max_idx=ctx_l - 1) 129 | return dict(meta=meta, model_inputs=model_inputs) 130 | 131 | def get_st_ed_label(self, ts, max_idx): 132 | """ 133 | Args: 134 | ts: [st (float), ed (float)] in seconds, ed > st 135 | max_idx: length of the video 136 | Returns: 137 | [st_idx, ed_idx]: int, 138 | Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, 139 | clips should be indexed as [2: 6), the translated back ts should be [3:9]. 140 | """ 141 | st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) 142 | ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) # -1 143 | return torch.tensor([st_idx, ed_idx], dtype=torch.long) 144 | 145 | def get_query_feat_by_desc_id(self, desc_id): 146 | query_feat = self.desc_bert_h5[str(desc_id)][:self.max_desc_len] 147 | if self.normalize_tfeat: 148 | query_feat = l2_normalize_np_array(query_feat) 149 | return torch.from_numpy(query_feat) 150 | 151 | 152 | class StartEndEvalDataset(Dataset): 153 | """ 154 | init_data_mode: `video_query` or `video_only` or `query_only`, 155 | it indicates which data to load when initialize the Dataset object. 156 | data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() 157 | desc_bert_path_or_handler: h5py.File object or str path 158 | vid_feat_path_or_handler: h5py.File object or str path 159 | eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with 160 | max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. 161 | load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. 162 | data_ratio: percentage of query data to use. 
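Note: corpus-level evaluation typically uses this dataset twice, first with data_mode='context' to encode every video in the corpus, then with data_mode='query' to encode the queries (see set_data_mode below).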
163 | """ 164 | def __init__(self, dset_name, eval_split_name, data_path=None, desc_bert_path_or_handler=None, max_desc_len=None, 165 | max_ctx_len=None, sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, 166 | video_duration_idx_path=None, clip_length=None, ctx_mode="video", data_mode="context", h5driver=None, 167 | data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True): 168 | self.dset_name = dset_name 169 | self.eval_split_name = eval_split_name 170 | self.ctx_mode = ctx_mode 171 | self.load_gt_video = False 172 | self.data_ratio = data_ratio # only affect query data 173 | self.normalize_vfeat = normalize_vfeat 174 | self.normalize_tfeat = normalize_tfeat 175 | 176 | self.data_mode = None 177 | self.set_data_mode(data_mode) 178 | 179 | self.max_desc_len = max_desc_len 180 | self.max_ctx_len = max_ctx_len 181 | self.data_path = data_path 182 | if isinstance(desc_bert_path_or_handler, h5py.File): 183 | self.desc_bert_h5 = desc_bert_path_or_handler 184 | else: 185 | self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) 186 | 187 | video_data = load_json(video_duration_idx_path)[self.eval_split_name] 188 | self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] 189 | self.video2idx = {k: v[1] for k, v in video_data.items()} 190 | self.clip_length = clip_length 191 | 192 | self.use_video = "video" in self.ctx_mode 193 | self.use_sub = "sub" in self.ctx_mode 194 | self.use_tef = "tef" in self.ctx_mode 195 | 196 | if self.use_video: 197 | if isinstance(vid_feat_path_or_handler, h5py.File): 198 | self.vid_feat_h5 = vid_feat_path_or_handler 199 | else: # str path 200 | self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) 201 | 202 | if self.use_sub: 203 | if isinstance(sub_bert_path_or_handler, h5py.File): 204 | self.sub_bert_h5 = sub_bert_path_or_handler 205 | else: # str path 206 | self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) 207 | 208 | self.query_data = load_jsonl(data_path) 209 | if data_ratio != 1: 210 | n_examples = int(len(self.query_data) * data_ratio) 211 | self.query_data = self.query_data[:n_examples] 212 | logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) 213 | 214 | def set_data_mode(self, data_mode): 215 | """context or query""" 216 | assert data_mode in ["context", "query"] 217 | self.data_mode = data_mode 218 | 219 | def load_gt_vid_name_for_query(self, load_gt_video): 220 | """load_gt_video: bool, affect the returned value of self._get_item_query""" 221 | if load_gt_video: 222 | assert "vid_name" in self.query_data[0] 223 | self.load_gt_video = load_gt_video 224 | 225 | def __len__(self): 226 | if self.data_mode == "context": 227 | return len(self.video_data) 228 | else: 229 | return len(self.query_data) 230 | 231 | def __getitem__(self, index): 232 | if self.data_mode == "context": 233 | return self._get_item_context(index) 234 | else: 235 | return self._get_item_query(index) 236 | 237 | def get_query_feat_by_desc_id(self, desc_id): 238 | query_feat = self.desc_bert_h5[str(desc_id)][:self.max_desc_len] 239 | if self.normalize_tfeat: 240 | query_feat = l2_normalize_np_array(query_feat) 241 | return torch.from_numpy(query_feat) 242 | 243 | def _get_item_query(self, index): 244 | """Need to batch""" 245 | raw_data = self.query_data[index] 246 | meta = dict(desc_id=raw_data["desc_id"], desc=raw_data["desc"], 247 | vid_name=raw_data["vid_name"] if self.load_gt_video else None) 248 | model_inputs = dict() 249 | 
model_inputs["query_feat"] = self.get_query_feat_by_desc_id(meta["desc_id"]) 250 | return dict(meta=meta, model_inputs=model_inputs) 251 | 252 | def get_st_ed_label(self, ts, max_idx): 253 | st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) 254 | ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) 255 | return torch.tensor([st_idx, ed_idx], dtype=torch.long) 256 | 257 | def _get_item_context(self, index): 258 | """No need to batch, since it has already been batched here""" 259 | raw_data = self.video_data[index] 260 | # initialize with basic data 261 | meta = dict(vid_name=raw_data["vid_name"], duration=raw_data["duration"]) 262 | model_inputs = dict() 263 | ctx_l = 0 264 | 265 | if self.use_video: 266 | video_feat = uniform_feature_sampling(self.vid_feat_h5[meta["vid_name"]][:], self.max_ctx_len) 267 | if self.normalize_vfeat: 268 | video_feat = l2_normalize_np_array(video_feat) 269 | model_inputs["video_feat"] = torch.from_numpy(video_feat) 270 | ctx_l = len(video_feat) 271 | else: 272 | model_inputs["video_feat"] = torch.zeros((2, 2)) 273 | 274 | if self.use_sub: # no need for ctx feature, as the features are already contextualized 275 | sub_feat = uniform_feature_sampling(self.sub_bert_h5[meta["vid_name"]][:], self.max_ctx_len) 276 | if self.normalize_tfeat: 277 | sub_feat = l2_normalize_np_array(sub_feat) 278 | model_inputs["sub_feat"] = torch.from_numpy(sub_feat) 279 | ctx_l = len(sub_feat) 280 | else: 281 | model_inputs["sub_feat"] = torch.zeros((2, 2)) 282 | 283 | if self.use_tef: 284 | ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l 285 | tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l 286 | tef_ed = tef_st + 1.0 / ctx_l 287 | tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) 288 | tef_feat = tef 289 | else: 290 | tef_feat = torch.zeros((2, 2)) 291 | 292 | if self.use_video and self.use_tef: # (N_clips, D+2) 293 | model_inputs["video_feat"] = torch.cat([model_inputs["video_feat"], tef_feat], dim=1) 294 | if self.use_sub and self.use_tef: # (N_clips, D_t+2) 295 | model_inputs["sub_feat"] = torch.cat([model_inputs["sub_feat"], tef_feat], dim=1) 296 | return dict(meta=meta, model_inputs=model_inputs) 297 | 298 | 299 | def start_end_collate(batch): 300 | batch_meta = [e["meta"] for e in batch] 301 | model_inputs_keys = batch[0]["model_inputs"].keys() 302 | batched_data = dict() 303 | for k in model_inputs_keys: 304 | if "feat" in k: 305 | if k in ['video_feat', 'sub_feat', 'tef_feat']: 306 | fixed_length = 128 307 | else: 308 | fixed_length = None 309 | batched_data[k] = pad_sequences_1d([e["model_inputs"][k] for e in batch], dtype=torch.float32, 310 | fixed_length=fixed_length) 311 | fixed_length = 128 312 | if "st_ed_indices" in model_inputs_keys: 313 | st_ed_indices = [e["model_inputs"]["st_ed_indices"] for e in batch] 314 | # construct moment localization labels 315 | batched_data["st_ed_indices"] = torch.stack(st_ed_indices, dim=0) 316 | # construct moment localization foreground and background labels 317 | match_labels = np.zeros(shape=(len(st_ed_indices), fixed_length), dtype=np.int32) 318 | for idx, st_ed_index in enumerate(st_ed_indices): 319 | st_ed = st_ed_index.cpu().numpy() 320 | st, ed = st_ed[0], st_ed[1] 321 | match_labels[idx][st:(ed + 1)] = 1 322 | batched_data['match_labels'] = torch.tensor(match_labels, dtype=torch.long) 323 | return batch_meta, batched_data 324 | 325 | 326 | def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): 327 | model_inputs = {} 328 | for k, v in 
batched_model_inputs.items(): 329 | if "feat" in k: 330 | model_inputs[k] = v[0].to(device, non_blocking=non_blocking) 331 | model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) 332 | else: 333 | model_inputs[k] = v.to(device, non_blocking=non_blocking) 334 | return model_inputs 335 | 336 | 337 | if __name__ == '__main__': 338 | options = BaseOptions().parse() 339 | -------------------------------------------------------------------------------- /method_tvr/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import json 5 | import pprint 6 | import random 7 | import numpy as np 8 | from easydict import EasyDict as EDict 9 | from tqdm import tqdm, trange 10 | from collections import OrderedDict 11 | import torch 12 | import torch.nn as nn 13 | import torch.backends.cudnn as cudnn 14 | from torch.utils.data import DataLoader 15 | from torch.utils.tensorboard import SummaryWriter 16 | from method_tvr.config import BaseOptions 17 | from method_tvr.model import ReLoCLNet 18 | from method_tvr.start_end_dataset import StartEndDataset, start_end_collate, StartEndEvalDataset, prepare_batch_inputs 19 | from method_tvr.inference import eval_epoch, start_inference 20 | from method_tvr.optimization import BertAdam 21 | from utils.basic_utils import AverageMeter 22 | from utils.model_utils import count_parameters 23 | 24 | 25 | import logging 26 | logger = logging.getLogger(__name__) 27 | logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", 28 | datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) 29 | 30 | 31 | def set_seed(seed, use_cuda=True): 32 | random.seed(seed) 33 | np.random.seed(seed) 34 | torch.manual_seed(seed) 35 | if use_cuda: 36 | torch.cuda.manual_seed_all(seed) 37 | 38 | 39 | def train_epoch(model, train_loader, optimizer, opt, epoch_i, training=True): 40 | logger.info("use train_epoch func for training: {}".format(training)) 41 | model.train(mode=training) 42 | if opt.hard_negative_start_epoch != -1 and epoch_i >= opt.hard_negative_start_epoch: 43 | model.set_hard_negative(True, opt.hard_pool_size) 44 | if opt.train_span_start_epoch != -1 and epoch_i >= opt.train_span_start_epoch: 45 | model.set_train_st_ed(opt.lw_st_ed) 46 | 47 | # init meters 48 | dataloading_time = AverageMeter() 49 | prepare_inputs_time = AverageMeter() 50 | model_forward_time = AverageMeter() 51 | model_backward_time = AverageMeter() 52 | loss_meters = OrderedDict(loss_st_ed=AverageMeter(), loss_fcl=AverageMeter(), loss_vcl=AverageMeter(), 53 | loss_neg_ctx=AverageMeter(), loss_neg_q=AverageMeter(), 54 | loss_overall=AverageMeter()) 55 | 56 | num_training_examples = len(train_loader) 57 | timer_dataloading = time.time() 58 | for batch_idx, batch in tqdm(enumerate(train_loader), desc="Training Iteration", total=num_training_examples): 59 | global_step = epoch_i * num_training_examples + batch_idx 60 | dataloading_time.update(time.time() - timer_dataloading) 61 | 62 | # continue 63 | timer_start = time.time() 64 | model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) 65 | prepare_inputs_time.update(time.time() - timer_start) 66 | timer_start = time.time() 67 | loss, loss_dict = model(**model_inputs) 68 | model_forward_time.update(time.time() - timer_start) 69 | timer_start = time.time() 70 | if training: 71 | optimizer.zero_grad() 72 | loss.backward() 73 | if opt.grad_clip != -1: 74 | nn.utils.clip_grad_norm_(model.parameters(), 
opt.grad_clip) 75 | optimizer.step() 76 | model_backward_time.update(time.time() - timer_start) 77 | 78 | opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step) 79 | for k, v in loss_dict.items(): 80 | opt.writer.add_scalar("Train/{}".format(k), v, global_step) 81 | 82 | for k, v in loss_dict.items(): 83 | loss_meters[k].update(float(v)) 84 | 85 | timer_dataloading = time.time() 86 | if opt.debug and batch_idx == 3: 87 | break 88 | 89 | if training: 90 | to_write = opt.train_log_txt_formatter.format(time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), epoch=epoch_i, 91 | loss_str=" ".join(["{} {:.4f}".format(k, v.avg) 92 | for k, v in loss_meters.items()])) 93 | with open(opt.train_log_filepath, "a") as f: 94 | f.write(to_write) 95 | print("Epoch time stats:") 96 | print("dataloading_time: max {dataloading_time.max} min {dataloading_time.min} avg {dataloading_time.avg}\n" 97 | "prepare_inputs_time: max {prepare_inputs_time.max} " 98 | "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n" 99 | "model_forward_time: max {model_forward_time.max} " 100 | "min {model_forward_time.min} avg {model_forward_time.avg}\n" 101 | "model_backward_time: max {model_backward_time.max} " 102 | "min {model_backward_time.min} avg {model_backward_time.avg}\n".format( 103 | dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time, 104 | model_forward_time=model_forward_time, model_backward_time=model_backward_time)) 105 | else: 106 | for k, v in loss_meters.items(): 107 | opt.writer.add_scalar("Eval_Loss/{}".format(k), v.avg, epoch_i) 108 | 109 | 110 | def rm_key_from_odict(odict_obj, rm_suffix): 111 | """remove key entry from the OrderedDict""" 112 | return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) 113 | 114 | 115 | def train(model, train_dataset, train_eval_dataset, val_dataset, opt): 116 | # Prepare optimizer 117 | if opt.device.type == "cuda": 118 | logger.info("CUDA enabled.") 119 | model.to(opt.device) 120 | if len(opt.device_ids) > 1: 121 | logger.info("Use multi GPU", opt.device_ids) 122 | model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU 123 | 124 | train_loader = DataLoader(train_dataset, collate_fn=start_end_collate, batch_size=opt.bsz, 125 | num_workers=opt.num_workers, shuffle=True, pin_memory=opt.pin_memory) 126 | train_eval_loader = DataLoader(train_eval_dataset, collate_fn=start_end_collate, batch_size=opt.bsz, 127 | num_workers=opt.num_workers, shuffle=False, pin_memory=opt.pin_memory) 128 | # Prepare optimizer 129 | param_optimizer = list(model.named_parameters()) 130 | no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] 131 | optimizer_grouped_parameters = [ 132 | {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01}, 133 | {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}] 134 | 135 | num_train_optimization_steps = len(train_loader) * opt.n_epoch 136 | optimizer = BertAdam(optimizer_grouped_parameters, lr=opt.lr, weight_decay=opt.wd, warmup=opt.lr_warmup_proportion, 137 | t_total=num_train_optimization_steps, schedule="warmup_linear") 138 | prev_best_score = 0. 
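# Early-stopping bookkeeping for the loop below: prev_best_score tracks the best stop metric seen so far (R@1 for VR, or the sum of R@1 at IoU 0.5 and 0.7 for SVMR/VCMR), while es_cnt counts epochs without improvement; training stops once es_cnt exceeds opt.max_es_cnt (unless max_es_cnt is -1).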
139 | es_cnt = 0 140 | start_epoch = -1 if opt.eval_untrained else 0 141 | eval_tasks_at_training = opt.eval_tasks_at_training # VR is computed along with VCMR 142 | save_submission_filename = "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, 143 | "_".join(eval_tasks_at_training)) 144 | for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"): 145 | if epoch_i > -1: 146 | with torch.autograd.detect_anomaly(): 147 | train_epoch(model, train_loader, optimizer, opt, epoch_i, training=True) 148 | global_step = (epoch_i + 1) * len(train_loader) 149 | if opt.eval_path is not None: 150 | with torch.no_grad(): 151 | train_epoch(model, train_eval_loader, optimizer, opt, epoch_i, training=False) 152 | metrics_no_nms, metrics_nms, latest_file_paths = eval_epoch( 153 | model, val_dataset, opt, save_submission_filename, tasks=eval_tasks_at_training, max_after_nms=100) 154 | to_write = opt.eval_log_txt_formatter.format(time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), epoch=epoch_i, 155 | eval_metrics_str=json.dumps(metrics_no_nms)) 156 | with open(opt.eval_log_filepath, "a") as f: 157 | f.write(to_write) 158 | logger.info("metrics_no_nms {}".format(pprint.pformat( 159 | rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4))) 160 | logger.info("metrics_nms {}".format(pprint.pformat(metrics_nms, indent=4))) 161 | # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms 162 | metrics = metrics_no_nms 163 | # early stop/ log / save model 164 | for task_type in ["SVMR", "VCMR"]: 165 | if task_type in metrics: 166 | task_metrics = metrics[task_type] 167 | for iou_thd in [0.5, 0.7]: 168 | opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd), 169 | {k: v for k, v in task_metrics.items() if str(iou_thd) in k}, 170 | global_step) 171 | task_type = "VR" 172 | if task_type in metrics: 173 | task_metrics = metrics[task_type] 174 | opt.writer.add_scalars("Eval/{}".format(task_type), {k: v for k, v in task_metrics.items()}, 175 | global_step) 176 | # use the most strict metric available 177 | stop_metric_names = ["r1"] if opt.stop_task == "VR" else ["0.5-r1", "0.7-r1"] 178 | stop_score = sum([metrics[opt.stop_task][e] for e in stop_metric_names]) 179 | if stop_score > prev_best_score: 180 | es_cnt = 0 181 | prev_best_score = stop_score 182 | checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i} 183 | torch.save(checkpoint, opt.ckpt_filepath) 184 | best_file_paths = [e.replace("latest", "best") for e in latest_file_paths] 185 | for src, tgt in zip(latest_file_paths, best_file_paths): 186 | os.renames(src, tgt) 187 | logger.info("The checkpoint file has been updated.") 188 | else: 189 | es_cnt += 1 190 | if opt.max_es_cnt != -1 and es_cnt > opt.max_es_cnt: # early stop 191 | with open(opt.train_log_filepath, "a") as f: 192 | f.write("Early Stop at epoch {}".format(epoch_i)) 193 | logger.info("Early stop at {} with {} {}".format( 194 | epoch_i, " ".join([opt.stop_task] + stop_metric_names), prev_best_score)) 195 | break 196 | else: 197 | checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i} 198 | torch.save(checkpoint, opt.ckpt_filepath) 199 | 200 | if opt.debug: 201 | break 202 | 203 | opt.writer.close() 204 | 205 | 206 | def start_training(): 207 | logger.info("Setup config, data and model...") 208 | opt = BaseOptions().parse() 209 | set_seed(opt.seed) 210 | if opt.debug: # keep the model run deterministically 211 | # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a 
specific input/net config. 212 | # Enable this only when input size is fixed. 213 | cudnn.benchmark = False 214 | cudnn.deterministic = True 215 | 216 | opt.writer = SummaryWriter(opt.tensorboard_log_dir) 217 | opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" 218 | opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" 219 | 220 | train_dataset = StartEndDataset( 221 | dset_name=opt.dset_name, 222 | data_path=opt.train_path, 223 | desc_bert_path_or_handler=opt.desc_bert_path, 224 | sub_bert_path_or_handler=opt.sub_bert_path, 225 | max_desc_len=opt.max_desc_l, 226 | max_ctx_len=opt.max_ctx_l, 227 | vid_feat_path_or_handler=opt.vid_feat_path, 228 | clip_length=opt.clip_length, 229 | ctx_mode=opt.ctx_mode, 230 | h5driver=opt.h5driver, 231 | data_ratio=opt.data_ratio, 232 | normalize_vfeat=not opt.no_norm_vfeat, 233 | normalize_tfeat=not opt.no_norm_tfeat) 234 | 235 | if opt.eval_path is not None: 236 | # val dataset, used to get eval loss 237 | train_eval_dataset = StartEndDataset( 238 | dset_name=opt.dset_name, 239 | data_path=opt.eval_path, 240 | desc_bert_path_or_handler=train_dataset.desc_bert_h5, 241 | sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, 242 | max_desc_len=opt.max_desc_l, 243 | max_ctx_len=opt.max_ctx_l, 244 | vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, 245 | clip_length=opt.clip_length, 246 | ctx_mode=opt.ctx_mode, 247 | h5driver=opt.h5driver, 248 | data_ratio=opt.data_ratio, 249 | normalize_vfeat=not opt.no_norm_vfeat, 250 | normalize_tfeat=not opt.no_norm_tfeat) 251 | 252 | eval_dataset = StartEndEvalDataset( 253 | dset_name=opt.dset_name, 254 | eval_split_name=opt.eval_split_name, # should only be val set 255 | data_path=opt.eval_path, 256 | desc_bert_path_or_handler=train_dataset.desc_bert_h5, 257 | sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, 258 | max_desc_len=opt.max_desc_l, 259 | max_ctx_len=opt.max_ctx_l, 260 | video_duration_idx_path=opt.video_duration_idx_path, 261 | vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, 262 | clip_length=opt.clip_length, 263 | ctx_mode=opt.ctx_mode, 264 | data_mode="query", 265 | h5driver=opt.h5driver, 266 | data_ratio=opt.data_ratio, 267 | normalize_vfeat=not opt.no_norm_vfeat, 268 | normalize_tfeat=not opt.no_norm_tfeat) 269 | else: 270 | train_eval_dataset, eval_dataset = None, None 271 | 272 | model_config = EDict( 273 | visual_input_size=opt.vid_feat_size, 274 | sub_input_size=opt.sub_feat_size, # for both desc and subtitles 275 | query_input_size=opt.q_feat_size, # for both desc and subtitles 276 | hidden_size=opt.hidden_size, # hidden dimension 277 | conv_kernel_size=opt.conv_kernel_size, 278 | conv_stride=opt.conv_stride, 279 | max_ctx_l=opt.max_ctx_l, 280 | max_desc_l=opt.max_desc_l, 281 | input_drop=opt.input_drop, 282 | drop=opt.drop, 283 | n_heads=opt.n_heads, # self-att heads 284 | initializer_range=opt.initializer_range, # for linear layer 285 | ctx_mode=opt.ctx_mode, # video, sub or video_sub 286 | margin=opt.margin, # margin for ranking loss 287 | ranking_loss_type=opt.ranking_loss_type, # loss type, 'hinge' or 'lse' 288 | lw_neg_q=opt.lw_neg_q, # loss weight for neg. query and pos. context 289 | lw_neg_ctx=opt.lw_neg_ctx, # loss weight for pos. query and neg. 
context 290 | lw_fcl=opt.lw_fcl, # loss weight for frame level contrastive learning 291 | lw_vcl=opt.lw_vcl, # loss weight for video level contrastive learning 292 | lw_st_ed=0, # will be assigned dynamically at training time 293 | use_hard_negative=False, # reset at each epoch 294 | hard_pool_size=opt.hard_pool_size) 295 | logger.info("model_config {}".format(model_config)) 296 | model = ReLoCLNet(model_config) 297 | count_parameters(model) 298 | logger.info("Start Training...") 299 | train(model, train_dataset, train_eval_dataset, eval_dataset, opt) 300 | return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug 301 | 302 | 303 | if __name__ == '__main__': 304 | model_dir, eval_split_name, eval_path, debug = start_training() 305 | if not debug: 306 | model_dir = model_dir.split(os.sep)[-1] 307 | tasks = ["SVMR", "VCMR", "VR"] 308 | input_args = ["--model_dir", model_dir, "--nms_thd", "0.5", "--eval_split_name", eval_split_name, 309 | "--eval_path", eval_path, "--tasks"] + tasks 310 | sys.argv[1:] = input_args 311 | logger.info("\n\n\nFINISHED TRAINING!!!") 312 | logger.info("Evaluating model in {}".format(model_dir)) 313 | logger.info("Input args {}".format(sys.argv[1:])) 314 | start_inference() 315 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # source setup.sh 4 | export DIR_PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | export PYTHONPATH="$PYTHONPATH:$DIR_PWD" 6 | 7 | echo $PYTHONPATH 8 | -------------------------------------------------------------------------------- /standalone_eval/README.md: -------------------------------------------------------------------------------- 1 | TVR Evaluation 2 | ================================================================ 3 | 4 | ### Task Definition 5 | Given a natural language query and a large pool of videos (with subtitles), 6 | the TVR (VCMR) task requires a system to retrieve a relevant moment from the videos. 7 | The table below shows a comparison of the TVR task and the subtasks: 8 | 9 | | Task | Description | 10 | | --- | --- | 11 | | VCMR | or VSCMR, *Video (-Subtitle) Corpus Moment Retrieval*. Localize a moment from a large video corpus. | 12 | | SVMR | or SVSMR, *Single Video (-Subtitle) Moment Retrieval*. Localize a moment from a given video. | 13 | | VR | or VSR, *Video (-Subtitle) Retrieval*. Retrieve a video from a large video corpus. | 14 | 15 | VCMR and VR only require a query and a video corpus; SVMR additionally requires knowing the ground-truth video. 16 | Thus it is not possible to perform SVMR on our `test-public` set, where the ground-truth video is hidden. 17 | 18 | 19 | ### How to construct a prediction file? 20 | 21 | An example of such a file is [sample_val_predictions.json](sample_val_predictions.json); it is formatted as: 22 | ``` 23 | { 24 | "video2idx": { 25 | "castle_s01e02_seg02_clip_09": 19614, 26 | ... 27 | }, 28 | "VCMR": [{ 29 | "desc_id": 90200, 30 | "desc": "Phoebe puts one of her ponytails in her mouth.", 31 | "predictions": [ 32 | [19614, 9.0, 12.0, 1.7275], 33 | [20384, 12.0, 18.0, 1.7315], 34 | [20384, 15.0, 21.0, 1.7351], 35 | ... 36 | ] 37 | }, 38 | ... 39 | ], 40 | "SVMR": [{ 41 | "desc_id": 90200, 42 | "desc": "Phoebe puts one of her ponytails in her mouth.", 43 | "predictions": [ 44 | [20092, 36.0, 42.0, -1.9082], 45 | [20092, 18.0, 24.0, -1.9145], 46 | [20092, 51.0, 54.0, -1.922], 47 | ... 48 | ] 49 | }, 50 | ... 
51 | ], 52 | "VR": [{ 53 | "desc_id": 90200, 54 | "desc": "Phoebe puts one of her ponytails in her mouth.", 55 | "predictions": [ 56 | [19614, 0, 0, 1.7275], 57 | [20384, 0, 0, 1.7315], 58 | [20384, 0, 0, 1.7351], 59 | ... 60 | ] 61 | }, 62 | ... 63 | ] 64 | } 65 | ``` 66 | 67 | | entry | description | 68 | | --- | ----| 69 | | video2idx | `dict`, `{vid_name: vid_idx}`. A mapping of video names to unique video IDs for current set. From [tvr_video2dur_idx.json](../data/tvr_video2dur_idx.json). | 70 | | VCMR | `list(dicts)`, stores predictions for the task `VCMR`. | 71 | | SVMR | `list(dicts)`, stores predictions for the task `SVMR`. Not required for `test-public` submission. | 72 | | VR | `list(dicts)`, stores predictions for the task `VR`. | 73 | 74 | The evaluation script will evaluate the predictions for tasks `[VCMR, SVMR, VR]` independently. 75 | Each dict in VCMR/SVMR/VR list is: 76 | ``` 77 | { 78 | "desc": str, 79 | "desc_id": int, 80 | "predictions": [[vid_id (int), st (float), ed (float), score (float)], ...] 81 | } 82 | ``` 83 | 84 | `predictions` is a `list` containing 100 `sublist`, each `sublist` has exactly 4 items: 85 | `[vid_id (int), st (float), ed (float), score (float)]`, 86 | which are `vid_id` (video id), `st` and `ed` (moment start and end time, in seconds.), 87 | `score` (score of the prediction). 88 | The `score` item will not be used in the evaluation script, it is left here for record. 89 | 90 | 91 | ### Run Evaluation 92 | At project root, run 93 | ``` 94 | bash standalone_eval/eval_sample.sh 95 | ``` 96 | This command will use [eval.py](eval.py) to evaluate the provided `sample_val_predictions.json` file, 97 | the output will be written into `sample_val_predictions_metrics.json`. 98 | Its content should be similar if not the same as `sample_val_predictions_metrics_raw.json` file. 99 | 100 | 101 | ### Codalab Submission 102 | To test your model's performance on `test-public` set, 103 | please submit both `val` and `test-public` predictions to our 104 | [Codalab evaluation server](https://competitions.codalab.org/competitions/22780). 105 | The submission file should be a single `.zip ` file (no enclosing folder) 106 | that contains the two prediction files 107 | `tvr_test_public_submission.json` and `tvr_val_submission.json`, each of the `*submission.json` file 108 | should be formatted as instructed above. 109 | Note that `tvr_val_submission.json` will have all the 4 entries, while 110 | `tvr_test_public_submission.json` will have only 3 entries, without `SVMR`. 
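For reference, the sketch below shows one way to assemble and package such a submission from already-ranked predictions. It is not part of this repo; `video2idx` and the `*_preds` lists are placeholders you would build from your own model outputs, following the format documented above.
```
# Sketch only (not part of this repo): assemble and package a Codalab submission.
# `video2idx` and the *_preds lists are placeholders to be filled with your
# model's ranked outputs, following the prediction-file format described above.
import os
import json
import zipfile


def save_submission(path, video2idx, vcmr_preds, vr_preds, svmr_preds=None):
    """Each *_preds entry: {"desc_id": int, "desc": str,
    "predictions": [[vid_id, st, ed, score], ...]} with <= 100 predictions per query."""
    submission = {"video2idx": video2idx, "VCMR": vcmr_preds, "VR": vr_preds}
    if svmr_preds is not None:  # SVMR is omitted for test-public (GT videos are hidden)
        submission["SVMR"] = svmr_preds
    with open(path, "w") as f:
        json.dump(submission, f)


def zip_submission(zip_path, json_paths):
    # no enclosing folder: store each prediction file at the archive root
    with zipfile.ZipFile(zip_path, "w") as zf:
        for p in json_paths:
            zf.write(p, arcname=os.path.basename(p))


# usage (hypothetical variables):
# save_submission("tvr_val_submission.json", video2idx, vcmr_preds, vr_preds, svmr_preds)
# save_submission("tvr_test_public_submission.json", video2idx, vcmr_preds, vr_preds)
# zip_submission("tvr_submission.zip",
#                ["tvr_val_submission.json", "tvr_test_public_submission.json"])
```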
111 | 112 | 113 | -------------------------------------------------------------------------------- /standalone_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/standalone_eval/__init__.py -------------------------------------------------------------------------------- /standalone_eval/eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load prediction file and GT file to calculate TVR metrics: 3 | - recall at top K (R@K), for a specified IoU, where K in [1, 5, 10, 100], IoU in [0.5, 0.7] 4 | """ 5 | import json 6 | import numpy as np 7 | from tqdm import tqdm 8 | from collections import OrderedDict, defaultdict 9 | 10 | 11 | def load_json(filename): 12 | with open(filename, "r") as f: 13 | return json.load(f) 14 | 15 | 16 | def load_jsonl(filename): 17 | with open(filename, "r") as f: 18 | return [json.loads(l.strip("\n")) for l in f.readlines()] 19 | 20 | 21 | def pad_sequences_1d_np(sequences, dtype=np.float32): 22 | 23 | """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) 24 | into a (n+1)-d array, only allow the first dim has variable lengths. 25 | Args: 26 | sequences: list(n-d tensor or list) 27 | dtype: np.dtype or torch.dtype 28 | Returns: 29 | padded_seqs: ((n+1)-d tensor) padded with zeros 30 | mask: (2d tensor) of the same shape as the first two dims of padded_seqs, 31 | 1 indicate valid, 0 otherwise 32 | Examples: 33 | >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] 34 | >>> pad_sequences_1d(test_data_list, dtype=np.float32) 35 | >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] 36 | >>> pad_sequences_1d(test_data_3d, dtype=np.float32) 37 | """ 38 | if isinstance(sequences[0], list): 39 | sequences = [np.asarray(s, dtype=dtype) for s in sequences] 40 | 41 | extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements 42 | lengths = [len(seq) for seq in sequences] 43 | assert "numpy" in str(dtype), "dtype and input type does not match" 44 | padded_seqs = np.zeros((len(sequences), max(lengths)) + extra_dims, dtype=dtype) 45 | mask = np.zeros((len(sequences), max(lengths)), dtype=np.float32) 46 | 47 | for idx, seq in enumerate(sequences): 48 | end = lengths[idx] 49 | padded_seqs[idx, :end] = seq 50 | mask[idx, :end] = 1 51 | return padded_seqs, mask 52 | 53 | 54 | def compute_temporal_iou_batch(preds, gt): 55 | """ compute intersection-over-union along temporal axis 56 | This function is significantly faster than `compute_temporal_iou`, 57 | the result should be the same. 
58 | Args: 59 | preds: np.ndarray, (N, 2), [st (float), ed (float)] * N 60 | gt: [st (float), ed (float)] 61 | Returns: 62 | iou (float): np.ndarray, (N, ) 63 | 64 | References: 65 | for np.divide with zeros, see https://stackoverflow.com/a/37977222 66 | """ 67 | intersection = np.maximum(0, np.minimum(preds[:, 1], gt[1]) - np.maximum(preds[:, 0], gt[0])) 68 | union = np.maximum(preds[:, 1], gt[1]) - np.minimum(preds[:, 0], gt[0]) # not the correct union though 69 | return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0) 70 | 71 | 72 | def get_rounded_percentage(float_number, n_floats=2): 73 | return round(float_number * 100, n_floats) 74 | 75 | 76 | TASK_TYPES = OrderedDict([ 77 | ("VCMR", "Video Corpus Moment Retrieval"), 78 | ("SVMR", "Single Video Moment Retrieval"), 79 | ("VR", "regular Video Retrieval") 80 | ]) 81 | 82 | 83 | def eval_by_task_type(moment_predictions, video2idx, ground_truth, 84 | iou_thds=(0.5, 0.7), recall_topks=(1, 5, 10, 100), 85 | task_type="SVMR", max_pred_per_query=100, match_number=True, verbose=True, use_desc_type=True): 86 | """ a predicted triplet is positive only if: 87 | 1) its vid_name matches the GT vid_name 88 | 2) IoU between its timestamp and GT timestamp is higher than the given threshold 89 | 90 | moment_predictions w.r.t. different task_type: 91 | For each query, evaluated on top max_pred_per_query [vid_name, st, ed] triplets. (score entry ignored) 92 | VCMR: vid_name might be repeating. 93 | SVMR: vid_name is fixed to be the GT vid_name. 94 | VR: vid_name is not repeating, st and ed will not be used. 95 | 96 | Args: 97 | video2idx: {vid_name (str): index (int), ...} 98 | moment_predictions: list(dict), each dict is { 99 | "desc": str, 100 | "desc_id": int, 101 | "predictions": [vid_name_idx (int), st (float), ed (float), score (float)] * n_pred, 102 | sorted predictions, n_pred could be different for all dicts. For each prediction, 103 | only the first 3 elements [vid_name (str), st (float), ed (float),] are used, 104 | any other following elements are ignored. We leave score here for record. 105 | } 106 | ground_truth: list(dict), each dict is { 107 | "desc": str, 108 | "desc_id": int, 109 | "type": str, one of [v, t, vt] 110 | "vid_name": str 111 | "ts": [st (float), ed (float)], or list([st (float), ed (float)]), len == 4. 112 | ... 113 | } 114 | iou_thds: temporal IoU thresholds 115 | recall_topks: recall at different top k 116 | task_type: str, could be: ["VCMR", "SVMR", "VR"], see TASK_TYPES for definition. 117 | max_pred_per_query: int, only top max_pred_per_query predictions for each query are used. 118 | match_number: bool, must set to True if when do evaluation, False is only used for debug. 
119 | verbose: 120 | use_desc_type: only TVR has desc type 121 | Returns: 122 | 123 | """ 124 | assert task_type in TASK_TYPES, "task_type must be one of {}".format(list(TASK_TYPES.keys())) 125 | if verbose: 126 | print("Running evaluation with task_type {}, n results {}; n gt {}" 127 | .format(task_type, len(moment_predictions), len(ground_truth))) 128 | 129 | predictions_by_desc_id = {e["desc_id"]: e for e in moment_predictions} 130 | gt_by_desc_id = {e["desc_id"]: e for e in ground_truth} 131 | desc_type2idx = {"v": 0, "t": 1, "vt": 2} 132 | desc_types = [] # n_desc 133 | 134 | if match_number: 135 | assert set(gt_by_desc_id.keys()) == set(predictions_by_desc_id.keys()), \ 136 | "desc_ids in predictions and ground_truth must match" 137 | # assert len(set([len(e["predictions"]) for e in predictions_by_desc_id.values()])) == 1, \ 138 | # "all queries must have the same number of predictions" 139 | 140 | pred_info_matrix_collection = [] 141 | for k, gt_item in tqdm(gt_by_desc_id.items(), desc="Loop over moments", leave=False): 142 | if not match_number and k not in predictions_by_desc_id: 143 | continue 144 | pred_info_matrix = np.array( 145 | [e[:3] for e in predictions_by_desc_id[k]["predictions"]][:max_pred_per_query], 146 | dtype=np.float32) # (n_pred, 3) 147 | if use_desc_type: 148 | desc_types.append(desc_type2idx[gt_item["type"]]) 149 | vid_name_matched_pred = pred_info_matrix[:, 0] == video2idx[gt_item["vid_name"]] # bool, (n_pred, ) 150 | pred_info_matrix = np.concatenate([pred_info_matrix, vid_name_matched_pred[:, None]], axis=1) # (n_pred, 4) 151 | 152 | # add 1 + len(iou_thds) columns, iou_scores, iou_corrects for each iou_thd. 153 | iou_thd_corrects_columns = [] 154 | if len(gt_item["ts"]) >= 4: # didemo, fro all 3 splits, at least 4 ts for each, < 0.5% has more than 4. 155 | least_n_overlap = 2 # True if overlapped with at least least_n_overlap GT ts. 156 | iou_corrects_dict = defaultdict(list) 157 | for single_gt_ts in gt_item["ts"]: 158 | single_gt_ts = np.array(single_gt_ts, dtype=np.float32) # (2, ) 159 | # iou scores of the predictions that have wrong vid_name are set to 0. 160 | iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred 161 | for iou_thd in iou_thds: 162 | iou_corrects_dict[iou_thd].append(iou_scores >= iou_thd) 163 | for iou_thd in iou_thds: 164 | iou_corrects = sum(iou_corrects_dict[iou_thd]) >= least_n_overlap # bool, (n_pred, ) 165 | iou_thd_corrects_columns.append(iou_corrects[:, None]) 166 | 167 | else: # should be 2, len([st, ed]) == 2 168 | single_gt_ts = np.array(gt_item["ts"], dtype=np.float32) # (2, ) 169 | # iou scores of the predictions that have wrong vid_name are set to 0. 
170 | iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred 171 | 172 | for iou_thd in iou_thds: 173 | iou_corrects = iou_scores >= iou_thd # bool, (n_pred, ) 174 | iou_thd_corrects_columns.append(iou_corrects[:, None]) 175 | 176 | pred_info_matrix = np.concatenate([pred_info_matrix, ] + iou_thd_corrects_columns, axis=1) # (n_pred, 6) 177 | pred_info_matrix_collection.append(pred_info_matrix) 178 | 179 | # column header [vid_name_idx (int), st (float), ed (float), is_vid_name_match (bool), 180 | # iou_scores>=iou_thd0 (bool), iou_scores>=iou_thd1 (bool)] 181 | pred_info_matrix_collection = pad_sequences_1d_np(pred_info_matrix_collection)[0] # (n_desc, n_pred, 6) 182 | if use_desc_type: 183 | desc_types = np.array(desc_types) # (n_desc) 184 | 185 | # results wrapper 186 | metrics = OrderedDict() 187 | metrics_by_type = OrderedDict() 188 | 189 | iou_c_offset = 4 # iou_corrects column index starts here 190 | if task_type == "VCMR": 191 | for iou_idx, iou_thd in enumerate(iou_thds): 192 | iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(np.bool) # (n_desc, n_pred) 193 | # 1) there might be more than one positive clip, so use `>= 1` 194 | for k in recall_topks: 195 | metrics["{}-r{}".format(iou_thd, k)] = \ 196 | get_rounded_percentage(np.mean(np.sum(iou_corrects[:, :k], axis=1) >= 1)) 197 | if use_desc_type: 198 | for desc_type in desc_type2idx: 199 | type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) 200 | n_desc_in_type = np.sum(type_corrects) # (n_desc) 201 | for iou_idx, iou_thd in enumerate(iou_thds): 202 | # (n_desc, n_pred) 203 | iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(np.bool) 204 | for k in recall_topks: 205 | metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( 206 | 1.0 * np.sum(np.logical_and(np.sum(iou_corrects[:, :k], axis=1) >= 1, type_corrects)) 207 | / n_desc_in_type 208 | ) 209 | elif task_type == "SVMR": 210 | vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(np.bool) # (n_desc, n_pred) 211 | n_desc = len(vid_name_matched) 212 | for iou_idx, iou_thd in enumerate(iou_thds): 213 | iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(np.bool) # (n_desc, n_pred) 214 | # 1) there might be more than one positive clip, so use `>= 1` 215 | for k in recall_topks: 216 | metrics["{}-r{}".format(iou_thd, k)] = get_rounded_percentage(np.mean( 217 | [np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 for idx in range(n_desc)] 218 | )) 219 | if use_desc_type: 220 | for desc_type in desc_type2idx: 221 | type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) 222 | n_desc_in_type = np.sum(type_corrects) # (n_desc) 223 | for iou_idx, iou_thd in enumerate(iou_thds): 224 | # (n_desc, n_pred) 225 | iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(np.bool) 226 | # 1) there might be more than one positive clip, so use `>= 1` 227 | for k in recall_topks: 228 | metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( 229 | 1.0 * np.sum([np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 and type_corrects[idx] 230 | for idx in range(n_desc)]) 231 | / n_desc_in_type) 232 | 233 | elif task_type == "VR": 234 | vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(np.bool) # (n_desc, n_pred) 235 | for k in recall_topks: 236 | metrics["r{}".format(k)] = \ 237 | 
get_rounded_percentage(np.mean(np.sum(vid_name_matched[:, :k], axis=1) >= 1)) 238 | if use_desc_type: 239 | for desc_type in desc_type2idx: 240 | type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) 241 | n_desc_in_type = np.sum(type_corrects) # (n_desc) 242 | for k in recall_topks: 243 | metrics_by_type["{}-r{}".format(desc_type, k)] = get_rounded_percentage( 244 | 1.0 * np.sum(np.logical_and(np.sum(vid_name_matched[:, :k], axis=1) >= 1, type_corrects)) 245 | / n_desc_in_type) 246 | else: 247 | raise ValueError("task_type wrong.") 248 | if use_desc_type: 249 | metrics_by_type["desc_type_ratio"] = "v {} t {} vt {}"\ 250 | .format(*[get_rounded_percentage(1.0 * np.sum(desc_types == desc_type2idx[k]) / len(desc_types)) 251 | for k in ["v", "t", "vt"]]) 252 | return metrics, metrics_by_type 253 | 254 | 255 | def eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True, match_number=True, use_desc_type=True): 256 | video2idx = submission["video2idx"] 257 | submitted_task_types = [k for k in TASK_TYPES if k in submission] 258 | if verbose: 259 | print("Evaluating for task {}".format(submitted_task_types)) 260 | eval_metrics = OrderedDict() 261 | metrics_raw_dict = {} 262 | for task_type in submitted_task_types: 263 | metrics, metrics_by_type = eval_by_task_type( 264 | submission[task_type], video2idx, ground_truth, 265 | iou_thds=iou_thds, recall_topks=(1, 10, 100), # (1, 5, 10, 20, 50, 100), 266 | task_type=task_type, max_pred_per_query=100, 267 | match_number=match_number, verbose=verbose, use_desc_type=use_desc_type) 268 | metrics_raw_dict[task_type] = metrics 269 | metrics_raw_dict[task_type+"_by_type"] = metrics_by_type 270 | 271 | for task_type in submitted_task_types: 272 | eval_metrics[task_type] = metrics_raw_dict[task_type] 273 | if use_desc_type: 274 | for task_type in submitted_task_types: 275 | eval_metrics[task_type+"_by_type"] = metrics_raw_dict[task_type+"_by_type"] 276 | return eval_metrics 277 | 278 | 279 | def eval_main(): 280 | import argparse 281 | parser = argparse.ArgumentParser(description="TVR Evaluation Script") 282 | parser.add_argument("--submission_path", type=str, help="path to generated prediction file") 283 | parser.add_argument("--gt_path", type=str, help="path to GT file") 284 | parser.add_argument("--save_path", type=str, help="path to save the results") 285 | parser.add_argument("--not_verbose", action="store_true") 286 | args = parser.parse_args() 287 | 288 | verbose = not args.not_verbose 289 | submission = load_json(args.submission_path) 290 | gt = load_jsonl(args.gt_path) 291 | results = eval_retrieval(submission, gt, iou_thds=(0.5, 0.7), verbose=verbose) 292 | if verbose: 293 | print(json.dumps(results, indent=4)) 294 | 295 | with open(args.save_path, "w") as f: 296 | f.write(json.dumps(results, indent=4)) 297 | 298 | 299 | if __name__ == '__main__': 300 | eval_main() 301 | -------------------------------------------------------------------------------- /standalone_eval/eval_sample.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: bash standalone_eval/eval_sample.sh 3 | submission_path=standalone_eval/sample_val_predictions.json 4 | gt_path=data/tvr_val_release.jsonl 5 | save_path=standalone_eval/sample_val_predictions_metrics.json 6 | 7 | python standalone_eval/eval.py \ 8 | --submission_path ${submission_path} \ 9 | --gt_path ${gt_path} \ 10 | --save_path ${save_path} 11 | 
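The same metrics can also be computed programmatically, which is convenient inside a notebook; a minimal sketch using the functions from `eval.py` above (it assumes the project root is on `PYTHONPATH`, e.g. via `source setup.sh`, and uses the same paths as `eval_sample.sh`):
```
# Sketch: run the standalone evaluator from Python instead of the shell script.
import json
from standalone_eval.eval import load_json, load_jsonl, eval_retrieval

submission = load_json("standalone_eval/sample_val_predictions.json")
ground_truth = load_jsonl("data/tvr_val_release.jsonl")
metrics = eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True)
print(json.dumps(metrics, indent=4))  # e.g., metrics["VCMR"]["0.7-r1"] is R@1 at IoU>=0.7
```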
-------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/utils/__init__.py -------------------------------------------------------------------------------- /utils/basic_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import zipfile 4 | import numpy as np 5 | import pickle 6 | 7 | 8 | def uniform_feature_sampling(features, max_len): 9 | num_clips = features.shape[0] 10 | if max_len is None or num_clips <= max_len: 11 | return features 12 | idxs = np.arange(0, max_len + 1, 1.0) / max_len * num_clips 13 | idxs = np.round(idxs).astype(np.int32) 14 | idxs[idxs > num_clips - 1] = num_clips - 1 15 | new_features = [] 16 | for i in range(max_len): 17 | s_idx, e_idx = idxs[i], idxs[i + 1] 18 | if s_idx < e_idx: 19 | new_features.append(np.mean(features[s_idx:e_idx], axis=0)) 20 | else: 21 | new_features.append(features[s_idx]) 22 | new_features = np.asarray(new_features) 23 | return new_features 24 | 25 | 26 | def compute_overlap(pred, gt): 27 | # check format 28 | assert isinstance(pred, list) and isinstance(gt, list) 29 | pred_is_list = isinstance(pred[0], list) 30 | gt_is_list = isinstance(gt[0], list) 31 | pred = pred if pred_is_list else [pred] 32 | gt = gt if gt_is_list else [gt] 33 | # compute overlap 34 | pred, gt = np.array(pred), np.array(gt) 35 | inter_left = np.maximum(pred[:, 0, None], gt[None, :, 0]) 36 | inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1]) 37 | inter = np.maximum(0.0, inter_right - inter_left) 38 | union_left = np.minimum(pred[:, 0, None], gt[None, :, 0]) 39 | union_right = np.maximum(pred[:, 1, None], gt[None, :, 1]) 40 | union = np.maximum(1e-12, union_right - union_left) 41 | overlap = 1.0 * inter / union 42 | # reformat output 43 | overlap = overlap if gt_is_list else overlap[:, 0] 44 | overlap = overlap if pred_is_list else overlap[0] 45 | return overlap 46 | 47 | 48 | def time_to_index(start_time, end_time, num_units, duration): 49 | s_times = np.arange(0, num_units).astype(np.float32) / float(num_units) * duration 50 | e_times = np.arange(1, num_units + 1).astype(np.float32) / float(num_units) * duration 51 | candidates = np.stack([np.repeat(s_times[:, None], repeats=num_units, axis=1), 52 | np.repeat(e_times[None, :], repeats=num_units, axis=0)], axis=2).reshape((-1, 2)) 53 | overlaps = compute_overlap(candidates.tolist(), [start_time, end_time]).reshape(num_units, num_units) 54 | start_index = np.argmax(overlaps) // num_units 55 | end_index = np.argmax(overlaps) % num_units 56 | return start_index, end_index 57 | 58 | 59 | def load_pickle(filename): 60 | with open(filename, "rb") as f: 61 | return pickle.load(f) 62 | 63 | 64 | def save_pickle(data, filename): 65 | with open(filename, "wb") as f: 66 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) 67 | 68 | 69 | def load_json(filename): 70 | with open(filename, "r") as f: 71 | return json.load(f) 72 | 73 | 74 | def save_json(data, filename, save_pretty=False, sort_keys=False): 75 | with open(filename, "w") as f: 76 | if save_pretty: 77 | f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) 78 | else: 79 | json.dump(data, f) 80 | 81 | 82 | def load_jsonl(filename): 83 | with open(filename, "r") as f: 84 | return [json.loads(l.strip("\n")) for l in f.readlines()] 85 | 86 | 87 | def save_jsonl(data, 
filename): 88 | """data is a list""" 89 | with open(filename, "w") as f: 90 | f.write("\n".join([json.dumps(e) for e in data])) 91 | 92 | 93 | def save_lines(list_of_str, filepath): 94 | with open(filepath, "w") as f: 95 | f.write("\n".join(list_of_str)) 96 | 97 | 98 | def read_lines(filepath): 99 | with open(filepath, "r") as f: 100 | return [e.strip("\n") for e in f.readlines()] 101 | 102 | 103 | def mkdirp(p): 104 | if not os.path.exists(p): 105 | os.makedirs(p) 106 | 107 | 108 | def flat_list_of_lists(l): 109 | """flatten a list of lists [[1,2], [3,4]] to [1,2,3,4]""" 110 | return [item for sublist in l for item in sublist] 111 | 112 | 113 | def convert_to_seconds(hms_time): 114 | """ convert '00:01:12' to 72 seconds. 115 | :hms_time (str): time in comma separated string, e.g. '00:01:12' 116 | :return (int): time in seconds, e.g. 72 117 | """ 118 | times = [float(t) for t in hms_time.split(":")] 119 | return times[0] * 3600 + times[1] * 60 + times[2] 120 | 121 | 122 | def get_video_name_from_url(url): 123 | return url.split("/")[-1][:-4] 124 | 125 | 126 | def merge_dicts(list_dicts): 127 | merged_dict = list_dicts[0].copy() 128 | for i in range(1, len(list_dicts)): 129 | merged_dict.update(list_dicts[i]) 130 | return merged_dict 131 | 132 | 133 | def l2_normalize_np_array(np_array, eps=1e-5): 134 | """np_array: np.ndarray, (*, D), where the last dim will be normalized""" 135 | return np_array / (np.linalg.norm(np_array, axis=-1, keepdims=True) + eps) 136 | 137 | 138 | def make_zipfile(src_dir, save_path, enclosing_dir="", exclude_dirs=None, exclude_extensions=None, 139 | exclude_dirs_substring=None): 140 | """make a zip file of root_dir, save it to save_path. 141 | exclude_paths will be excluded if it is a subdir of root_dir. 142 | An enclosing_dir is added is specified. 
143 | """ 144 | abs_src = os.path.abspath(src_dir) 145 | with zipfile.ZipFile(save_path, "w") as zf: 146 | for dirname, subdirs, files in os.walk(src_dir): 147 | if exclude_dirs is not None: 148 | for e_p in exclude_dirs: 149 | if e_p in subdirs: 150 | subdirs.remove(e_p) 151 | if exclude_dirs_substring is not None: 152 | to_rm = [] 153 | for d in subdirs: 154 | if exclude_dirs_substring in d: 155 | to_rm.append(d) 156 | for e in to_rm: 157 | subdirs.remove(e) 158 | arcname = os.path.join(enclosing_dir, dirname[len(abs_src) + 1:]) 159 | zf.write(dirname, arcname) 160 | for filename in files: 161 | if exclude_extensions is not None: 162 | if os.path.splitext(filename)[1] in exclude_extensions: 163 | continue # do not zip it 164 | absname = os.path.join(dirname, filename) 165 | arcname = os.path.join(enclosing_dir, absname[len(abs_src) + 1:]) 166 | zf.write(absname, arcname) 167 | 168 | 169 | class AverageMeter(object): 170 | """Computes and stores the average and current/max/min value""" 171 | def __init__(self): 172 | self.val = 0 173 | self.avg = 0 174 | self.sum = 0 175 | self.count = 0 176 | self.max = -1e10 177 | self.min = 1e10 178 | self.reset() 179 | 180 | def reset(self): 181 | self.val = 0 182 | self.avg = 0 183 | self.sum = 0 184 | self.count = 0 185 | self.max = -1e10 186 | self.min = 1e10 187 | 188 | def update(self, val, n=1): 189 | self.max = max(val, self.max) 190 | self.min = min(val, self.min) 191 | self.val = val 192 | self.sum += val * n 193 | self.count += n 194 | self.avg = self.sum / self.count 195 | 196 | 197 | def dissect_by_lengths(np_array, lengths, dim=0, assert_equal=True): 198 | """Dissect an array (N, D) into a list a sub-array, 199 | np_array.shape[0] == sum(lengths), Output is a list of nd arrays, singlton dimention is kept""" 200 | if assert_equal: 201 | assert len(np_array) == sum(lengths) 202 | length_indices = [0, ] 203 | for i in range(len(lengths)): 204 | length_indices.append(length_indices[i] + lengths[i]) 205 | if dim == 0: 206 | array_list = [np_array[length_indices[i]:length_indices[i+1]] for i in range(len(lengths))] 207 | elif dim == 1: 208 | array_list = [np_array[:, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] 209 | elif dim == 2: 210 | array_list = [np_array[:, :, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] 211 | else: 212 | raise NotImplementedError 213 | return array_list 214 | 215 | 216 | def get_ratio_from_counter(counter_obj, threshold=200): 217 | keys = counter_obj.keys() 218 | values = counter_obj.values() 219 | filtered_values = [counter_obj[k] for k in keys if k > threshold] 220 | return float(sum(filtered_values)) / sum(values) 221 | 222 | 223 | def get_show_name(vid_name): 224 | """ 225 | get tvshow name from vid_name 226 | :param vid_name: video clip name 227 | :return: tvshow name 228 | """ 229 | show_list = ["friends", "met", "castle", "house", "grey"] 230 | vid_name_prefix = vid_name.split("_")[0] 231 | show_name = vid_name_prefix if vid_name_prefix in show_list else "bbt" 232 | return show_name 233 | -------------------------------------------------------------------------------- /utils/mk_video_split_with_duration.py: -------------------------------------------------------------------------------- 1 | from utils.basic_utils import load_json, save_json 2 | 3 | 4 | def combine(video_name_split_path, video_duration_path, save_path): 5 | video_name_split = load_json(video_name_split_path) 6 | video_duration_dict = load_json(video_duration_path) 7 | 8 | combined_dict = {} 9 | 
for split_name, split_video_names in video_name_split.items(): 10 | combined_dict[split_name] = {vid_name: video_duration_dict[vid_name] 11 | for vid_name in split_video_names} 12 | save_json(combined_dict, save_path) 13 | 14 | 15 | if __name__ == '__main__': 16 | import sys 17 | combine(*sys.argv[1:]) 18 | 19 | -------------------------------------------------------------------------------- /utils/model_utils.py: -------------------------------------------------------------------------------- 1 | __author__ = "Jie Lei" 2 | 3 | # ref: https://github.com/lichengunc/MAttNet/blob/master/lib/layers/lang_encoder.py#L11 4 | # ref: https://github.com/easonnie/flint/blob/master/torch_util.py#L272 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 8 | 9 | 10 | class RNNEncoder(nn.Module): 11 | """A RNN wrapper handles variable length inputs, always set batch_first=True. 12 | Supports LSTM, GRU and RNN. Tested with PyTorch 0.3 and 0.4 13 | """ 14 | def __init__(self, word_embedding_size, hidden_size, bidirectional=True, 15 | dropout_p=0, n_layers=1, rnn_type="lstm", 16 | return_hidden=True, return_outputs=True, 17 | allow_zero=False): 18 | super(RNNEncoder, self).__init__() 19 | """ 20 | :param word_embedding_size: rnn input size 21 | :param hidden_size: rnn output size 22 | :param dropout_p: between rnn layers, only useful when n_layer >= 2 23 | """ 24 | self.allow_zero = allow_zero 25 | self.rnn_type = rnn_type 26 | self.n_dirs = 2 if bidirectional else 1 27 | # - add return_hidden keyword arg to reduce computation if hidden is not needed. 28 | self.return_hidden = return_hidden 29 | self.return_outputs = return_outputs 30 | self.rnn = getattr(nn, rnn_type.upper())(word_embedding_size, hidden_size, n_layers, 31 | batch_first=True, 32 | bidirectional=bidirectional, 33 | dropout=dropout_p) 34 | 35 | def sort_batch(self, seq, lengths): 36 | sorted_lengths, perm_idx = lengths.sort(0, descending=True) 37 | if self.allow_zero: # deal with zero by change it to one. 
38 | sorted_lengths[sorted_lengths == 0] = 1 39 | reverse_indices = [0] * len(perm_idx) 40 | for i in range(len(perm_idx)): 41 | reverse_indices[perm_idx[i]] = i 42 | sorted_seq = seq[perm_idx] 43 | return sorted_seq, list(sorted_lengths), reverse_indices 44 | 45 | def forward(self, inputs, lengths): 46 | """ 47 | inputs, sorted_inputs -> (B, T, D) 48 | lengths -> (B, ) 49 | outputs -> (B, T, n_dirs * D) 50 | hidden -> (n_layers * n_dirs, B, D) -> (B, n_dirs * D) keep the last layer 51 | - add total_length in pad_packed_sequence for compatiblity with nn.DataParallel, --remove it 52 | """ 53 | assert len(inputs) == len(lengths) 54 | sorted_inputs, sorted_lengths, reverse_indices = self.sort_batch(inputs, lengths) 55 | packed_inputs = pack_padded_sequence(sorted_inputs, sorted_lengths, batch_first=True) 56 | outputs, hidden = self.rnn(packed_inputs) 57 | if self.return_outputs: 58 | # outputs, lengths = pad_packed_sequence(outputs, batch_first=True, total_length=int(max(lengths))) 59 | outputs, lengths = pad_packed_sequence(outputs, batch_first=True) 60 | outputs = outputs[reverse_indices] 61 | else: 62 | outputs = None 63 | if self.return_hidden: # 64 | if self.rnn_type.lower() == "lstm": 65 | hidden = hidden[0] 66 | hidden = hidden[-self.n_dirs:, :, :] 67 | hidden = hidden.transpose(0, 1).contiguous() 68 | hidden = hidden.view(hidden.size(0), -1) 69 | hidden = hidden[reverse_indices] 70 | else: 71 | hidden = None 72 | return outputs, hidden 73 | 74 | 75 | def pool_across_time(outputs, lengths, pool_type="max"): 76 | """ Get maximum responses from RNN outputs along time axis 77 | :param outputs: (B, T, D) 78 | :param lengths: (B, ) 79 | :param pool_type: str, 'max' or 'mean' 80 | :return: (B, D) 81 | """ 82 | if pool_type == "max": 83 | outputs = [outputs[i, :int(lengths[i]), :].max(dim=0)[0] for i in range(len(lengths))] 84 | elif pool_type == "mean": 85 | outputs = [outputs[i, :int(lengths[i]), :].mean(dim=0) for i in range(len(lengths))] 86 | else: 87 | raise NotImplementedError("Only support mean and max pooling") 88 | return torch.stack(outputs, dim=0) 89 | 90 | 91 | def count_parameters(model, verbose=True): 92 | """Count number of parameters in PyTorch model, 93 | References: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7. 94 | 95 | from utils.utils import count_parameters 96 | count_parameters(model) 97 | import sys 98 | sys.exit(1) 99 | """ 100 | n_all = sum(p.numel() for p in model.parameters()) 101 | n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) 102 | if verbose: 103 | print("Parameter Count: all {:,d}; trainable {:,d}".format(n_all, n_trainable)) 104 | return n_all, n_trainable 105 | 106 | -------------------------------------------------------------------------------- /utils/temporal_nms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Non-Maximum Suppression for video proposals. 
3 | """ 4 | 5 | 6 | def compute_temporal_iou(pred, gt): 7 | """ deprecated due to performance concerns 8 | compute intersection-over-union along temporal axis 9 | Args: 10 | pred: [st (float), ed (float)] 11 | gt: [st (float), ed (float)] 12 | Returns: 13 | iou (float): 14 | 15 | Ref: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py 16 | """ 17 | intersection = max(0, min(pred[1], gt[1]) - max(pred[0], gt[0])) 18 | union = max(pred[1], gt[1]) - min(pred[0], gt[0]) # not the correct union though 19 | if union == 0: 20 | return 0 21 | else: 22 | return 1.0 * intersection / union 23 | 24 | 25 | def temporal_non_maximum_suppression(predictions, nms_threshold, max_after_nms=100): 26 | """ 27 | Args: 28 | predictions: list(sublist), each sublist is [st (float), ed(float), score (float)], 29 | note larger scores are better and are preserved. For metrics that are better when smaller, 30 | please convert to its negative, e.g., convert distance to negative distance. 31 | nms_threshold: float in [0, 1] 32 | max_after_nms: 33 | Returns: 34 | predictions_after_nms: list(sublist), each sublist is [st (float), ed(float), score (float)] 35 | References: 36 | https://github.com/wzmsltw/BSN-boundary-sensitive-network/blob/7b101fc5978802aa3c95ba5779eb54151c6173c6/Post_processing.py#L42 37 | """ 38 | if len(predictions) == 1: # only has one prediction, no need for nms 39 | return predictions 40 | 41 | predictions = sorted(predictions, key=lambda x: x[2], reverse=True) # descending order 42 | 43 | tstart = [e[0] for e in predictions] 44 | tend = [e[1] for e in predictions] 45 | tscore = [e[2] for e in predictions] 46 | rstart = [] 47 | rend = [] 48 | rscore = [] 49 | while len(tstart) > 1 and len(rscore) < max_after_nms: # max 100 after nms 50 | idx = 1 51 | while idx < len(tstart): # compare with every prediction in the list. 52 | if compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]]) > nms_threshold: 53 | # rm highly overlapped lower score entries. 54 | tstart.pop(idx) 55 | tend.pop(idx) 56 | tscore.pop(idx) 57 | # print("--------------------------------") 58 | # print(compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]])) 59 | # print([tstart[0], tend[0]], [tstart[idx], tend[idx]]) 60 | # print(tstart.pop(idx), tend.pop(idx), tscore.pop(idx)) 61 | else: 62 | # move to next 63 | idx += 1 64 | rstart.append(tstart.pop(0)) 65 | rend.append(tend.pop(0)) 66 | rscore.append(tscore.pop(0)) 67 | 68 | if len(rscore) < max_after_nms and len(tstart) >= 1: # add the last, possibly empty. 69 | rstart.append(tstart.pop(0)) 70 | rend.append(tend.pop(0)) 71 | rscore.append(tscore.pop(0)) 72 | 73 | predictions_after_nms = [[st, ed, s] for s, st, ed in zip(rscore, rstart, rend)] 74 | return predictions_after_nms 75 | -------------------------------------------------------------------------------- /utils/tensor_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def pad_sequences_1d(sequences, dtype=torch.long, device=torch.device("cpu"), fixed_length=None): 6 | """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) 7 | into a (n+1)-d array, only allow the first dim has variable lengths. 8 | Args: 9 | sequences: list(n-d tensor or list) 10 | dtype: np.dtype or torch.dtype 11 | device: 12 | fixed_length: pad all seq in sequences to fixed length. All seq should have a length <= fixed_length. 13 | return will be of shape [len(sequences), fixed_length, ...] 
14 | Returns: 15 | padded_seqs: ((n+1)-d tensor) padded with zeros 16 | mask: (2d tensor) of the same shape as the first two dims of padded_seqs, 17 | 1 indicate valid, 0 otherwise 18 | Examples: 19 | >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] 20 | >>> pad_sequences_1d(test_data_list, dtype=torch.long) 21 | >>> test_data_3d = [torch.randn(2,3,4), torch.randn(4,3,4), torch.randn(1,3,4)] 22 | >>> pad_sequences_1d(test_data_3d, dtype=torch.float) 23 | >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] 24 | >>> pad_sequences_1d(test_data_list, dtype=np.float32) 25 | >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] 26 | >>> pad_sequences_1d(test_data_3d, dtype=np.float32) 27 | """ 28 | if isinstance(sequences[0], list): 29 | if "torch" in str(dtype): 30 | sequences = [torch.tensor(s, dtype=dtype, device=device) for s in sequences] 31 | else: 32 | sequences = [np.asarray(s, dtype=dtype) for s in sequences] 33 | 34 | extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements 35 | lengths = [len(seq) for seq in sequences] 36 | if fixed_length is not None: 37 | max_length = fixed_length 38 | else: 39 | max_length = max(lengths) 40 | if isinstance(sequences[0], torch.Tensor): 41 | assert "torch" in str(dtype), "dtype and input type does not match" 42 | padded_seqs = torch.zeros((len(sequences), max_length) + extra_dims, dtype=dtype, device=device) 43 | mask = torch.zeros((len(sequences), max_length), dtype=torch.float32, device=device) 44 | else: # np 45 | assert "numpy" in str(dtype), "dtype and input type does not match" 46 | padded_seqs = np.zeros((len(sequences), max_length) + extra_dims, dtype=dtype) 47 | mask = np.zeros((len(sequences), max_length), dtype=np.float32) 48 | 49 | for idx, seq in enumerate(sequences): 50 | end = lengths[idx] 51 | padded_seqs[idx, :end] = seq 52 | mask[idx, :end] = 1 53 | return padded_seqs, mask # , lengths 54 | 55 | 56 | def pad_sequences_2d(sequences, dtype=torch.long): 57 | """ Pad a double-nested list or a sequence of n-d torch tensor into a (n+1)-d tensor, 58 | only allow the first two dims has variable lengths 59 | Args: 60 | sequences: list(n-d tensor or list) 61 | dtype: torch.long for word indices / torch.float (float32) for other cases 62 | Returns: 63 | Examples: 64 | >>> test_data_list = [[[1, 3, 5], [3, 7, 4, 1]], [[98, 34, 11, 89, 90], [22], [34, 56]],] 65 | >>> pad_sequences_2d(test_data_list, dtype=torch.long) # torch.Size([2, 3, 5]) 66 | >>> test_data_3d = [torch.randn(2,2,4), torch.randn(4,3,4), torch.randn(1,5,4)] 67 | >>> pad_sequences_2d(test_data_3d, dtype=torch.float) # torch.Size([2, 3, 5]) 68 | >>> test_data_3d2 = [[torch.randn(2,4), ], [torch.randn(3,4), torch.randn(5,4)]] 69 | >>> pad_sequences_2d(test_data_3d2, dtype=torch.float) # torch.Size([2, 3, 5]) 70 | # TODO add support for numpy array 71 | """ 72 | bsz = len(sequences) 73 | para_lengths = [len(seq) for seq in sequences] 74 | max_para_len = max(para_lengths) 75 | sen_lengths = [[len(word_seq) for word_seq in seq] for seq in sequences] 76 | max_sen_len = max([max(e) for e in sen_lengths]) 77 | 78 | if isinstance(sequences[0], torch.Tensor): 79 | extra_dims = sequences[0].shape[2:] 80 | elif isinstance(sequences[0][0], torch.Tensor): 81 | extra_dims = sequences[0][0].shape[1:] 82 | else: 83 | sequences = [[torch.Tensor(word_seq, dtype=dtype) for word_seq in seq] for seq in sequences] 84 | extra_dims = () 85 | 86 | padded_seqs = torch.zeros((bsz, max_para_len, max_sen_len) + extra_dims, dtype=dtype) 87 | 
mask = torch.zeros(bsz, max_para_len, max_sen_len).float() 88 | 89 | for b_i in range(bsz): 90 | for sen_i, sen_l in enumerate(sen_lengths[b_i]): 91 | padded_seqs[b_i, sen_i, :sen_l] = sequences[b_i][sen_i] 92 | mask[b_i, sen_i, :sen_l] = 1 93 | return padded_seqs, mask # , sen_lengths 94 | 95 | 96 | def find_max_triples(st_prob, ed_prob, top_n=5, prob_thd=None, tensor_type="torch"): 97 | """ Find a list of (k1, k2) where k1 < k2 with the maximum values of st_prob[k1] * ed_prob[k2] 98 | Args: 99 | st_prob (torch.Tensor or np.ndarray): (N, L) batched start_idx probabilities 100 | ed_prob (torch.Tensor or np.ndarray): (N, L) batched end_idx probabilities 101 | top_n (int): return topN pairs with highest values 102 | prob_thd (float): 103 | tensor_type: str, np or torch 104 | Returns: 105 | batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...] 106 | """ 107 | if tensor_type == "torch": 108 | st_prob, ed_prob = st_prob.data.numpy(), ed_prob.data.numpy() 109 | product = np.einsum("bm,bn->bmn", st_prob, ed_prob) 110 | # (N, L, L) the lower part becomes zeros, start_idx < ed_idx 111 | upper_product = np.triu(product, k=1) 112 | return find_max_triples_from_upper_triangle_product(upper_product, top_n=top_n, prob_thd=prob_thd) 113 | 114 | 115 | def find_max_triples_from_upper_triangle_product(upper_product, top_n=5, prob_thd=None): 116 | """ Find a list of (k1, k2) where k1 < k2 with the maximum values of p1[k1] * p2[k2] 117 | Args: 118 | upper_product (torch.Tensor or np.ndarray): (N, L, L), the lower part becomes zeros, end_idx > start_idx 119 | top_n (int): return topN pairs with highest values 120 | prob_thd (float or None): 121 | Returns: 122 | batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...] 123 | """ 124 | batched_sorted_triple = [] 125 | for idx, e in enumerate(upper_product): 126 | sorted_triple = top_n_array_2d(e, top_n=top_n) 127 | if prob_thd is not None: 128 | sorted_triple = sorted_triple[sorted_triple[2] >= prob_thd] 129 | batched_sorted_triple.append(sorted_triple) 130 | return batched_sorted_triple 131 | 132 | 133 | def top_n_array_2d(array_2d, top_n): 134 | """ Get topN indices and values of a 2d array, return a tuple of indices and their values, 135 | ranked by the value 136 | """ 137 | row_indices, column_indices = np.unravel_index(np.argsort(array_2d, axis=None), array_2d.shape) 138 | row_indices = row_indices[::-1][:top_n] 139 | column_indices = column_indices[::-1][:top_n] 140 | sorted_values = array_2d[row_indices, column_indices] 141 | return np.stack([row_indices, column_indices, sorted_values], axis=1) # (N, 3) 142 | -------------------------------------------------------------------------------- /utils/text_feature/README.md: -------------------------------------------------------------------------------- 1 | Language Model Fine-tuning and Feature Extraction 2 | ==== 3 | 4 | ### Install Dependencies 5 | 6 | The code requires installing [transformers](https://github.com/huggingface/transformers) package as well as [tensorboardX](https://github.com/lanpa/tensorboardX): 7 | ``` 8 | # install transformers 9 | git clone https://github.com/huggingface/transformers.git 10 | cd transformers 11 | git checkout e1b2949ae6cb34cc39e3934ca87423474f8c8d02 12 | pip install . 
13 | 14 | # install tensorboardX 15 | pip install tensorboardX 16 | ``` 17 | 18 | ### Language Model Fine-tuning 19 | 20 | We fine-tune the pre-trained [RoBERTa](https://arxiv.org/abs/1907.11692) base model on TVR text with the Masked Language Model (MLM) objective for 1 epoch: 21 | ``` 22 | bash utils/text_feature/train_lm_finetuning_single_sentence.sh FINETUNE_MODE OUTPUT_ROOT 23 | ``` 24 | `FINETUNE_MODE` can be `query_only`, where only query text (from the train set) is used to fine-tune the pre-trained model; 25 | this mode is used when we want to test model performance without subtitles. It can also be `sub_query`, where 26 | both subtitle and query text are used in the fine-tuning process. `OUTPUT_ROOT` is a directory used to store the 27 | fine-tuned model and extracted features. You can append an additional `--debug` flag after the command to do 28 | a fast run of the code to test your configuration before actually running fine-tuning. 29 | 30 | During fine-tuning, each query is treated as a single sequence, and each subtitle is split into max-length=256 segments, 31 | where each of the resulting segments will be treated as a single sequence. 32 | 33 | ### Feature Extraction 34 | After fine-tuning, you will get the fine-tuned model at `OUTPUT_ROOT/FINETUNE_MODE/roberta-base_tuned_model`. 35 | 36 | Extract features at token-level: 37 | ``` 38 | bash utils/text_feature/extract_single_sentence_embeddings.sh \ 39 | OUTPUT_ROOT FINETUNE_MODE EXTRACTION_MODE SAVE_FILEPATH 40 | ``` 41 | `EXTRACTION_MODE` could be `sub` or `query`, 42 | `SAVE_FILEPATH` is a `.h5` filepath that will save the extracted features. 43 | 44 | To get the tokens that correspond to these feature vectors, run 45 | ``` 46 | bash utils/text_feature/extract_single_sentence_tokens.sh \ 47 | OUTPUT_ROOT FINETUNE_MODE EXTRACTION_MODE SAVE_FILEPATH 48 | ``` 49 | `SAVE_FILEPATH` is a `.jsonl` filepath that stores the extracted tokens. 50 | This is useful if you want to visualize attentions from the attended feature vectors back to the word tokens. 51 | 52 | The extracted query features can be directly used for training our model, 53 | while subtitle features need one additional step: converting token-level features to clip-level features. 54 | Specifically, we max-pool/avg-pool the subtitle token embeddings every 1.5 seconds to get the clip-level 55 | embeddings: 56 | ``` 57 | bash utils/text_feature/convert_sub_feature_word_to_clip.sh \ 58 | POOL_TYPE CLIP_LENGTH SUB_TOKEN_H5 SUB_CLIP_H5 VID_CLIP_H5 59 | ``` 60 | `POOL_TYPE` could be `max` or `avg`, which defines how to aggregate token-level features to clip-level features. 61 | `CLIP_LENGTH` is set to 1.5 (seconds). `SUB_TOKEN_H5` is the path to the extracted subtitle token-level features. 62 | `SUB_CLIP_H5` is the path to save the aggregated subtitle clip-level features. 63 | `VID_CLIP_H5` is the path to the extracted video clip-level features, 64 | which is used to make sure each subtitle's clip-level features 65 | have the same length as its corresponding video clip-level features.
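At a high level, the conversion just buckets each subtitle sentence into the 1.5-second clips it overlaps and pools the corresponding token vectors; the toy sketch below illustrates that idea (the actual implementation is `convert_sub_feature_word_to_clip.py` below, and all array names here are made up).
```
# Toy sketch of the word-to-clip aggregation idea (see
# convert_sub_feature_word_to_clip.py for the real implementation).
import numpy as np


def tokens_to_clip_features(token_feats, sen_token_lengths, sen_times, n_clips,
                            clip_length=1.5, pool_type="max"):
    """token_feats: (n_tokens, d) token features of all sentences, concatenated in order.
    sen_token_lengths: list of token counts per sentence (sums to n_tokens).
    sen_times: list of (start_sec, end_sec) per sentence.
    n_clips: taken from the video clip-level features so the lengths match."""
    np_pool_func = np.max if pool_type == "max" else np.mean
    offsets = np.concatenate([[0], np.cumsum(sen_token_lengths)])
    clip_features = np.zeros((n_clips, token_feats.shape[1]), dtype=np.float32)
    for clip_idx in range(n_clips):
        clip_st, clip_ed = clip_idx * clip_length, (clip_idx + 1) * clip_length
        # sentences overlapping [clip_st, clip_ed) contribute their tokens
        sen_indices = [i for i, (st, ed) in enumerate(sen_times) if st < clip_ed and ed > clip_st]
        if sen_indices:
            words = np.concatenate([token_feats[offsets[i]:offsets[i + 1]] for i in sen_indices])
            clip_features[clip_idx] = np_pool_func(words, axis=0)
    return clip_features
```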
66 | 67 | 68 | -------------------------------------------------------------------------------- /utils/text_feature/convert_sub_feature_word_to_clip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from utils.basic_utils import flat_list_of_lists, load_jsonl, save_json, load_json 7 | 8 | 9 | def process_single_vid_sub(sub_listdicts, clip_length): 10 | """ 11 | Args: 12 | sub_listdicts: list(dicts), each dict is, e.g., 13 | {'text': " Chase : That's all this is?", 'start': 0.862, 'end': 1.862} 14 | clip_length: float 15 | Returns: 16 | clip_idx2sentence_indices: dict, {clip_idx: [sen_idx1, sen_idx2, ...]}, which sentences are 17 | associated with which clips. The indices are in ascending order, i.e., sen_idx1 < sen_idx2 < ... 18 | """ 19 | timestamps = np.array([[e["start"], e["end"]] for e in sub_listdicts], dtype=np.float32) # (n_sub_sen, 2) 20 | timestamps = timestamps / clip_length 21 | # r-th row of clip_indices is [st_idx, ed_idx), where [st_idx, st_idx+1, ..., ed_idx-1] 22 | # should be with r-th clip, which is [r*clip_length, (r+1)*clip_length] 23 | sentence2clip_st_ed = np.empty_like(timestamps, dtype=np.int) 24 | sentence2clip_st_ed[:, 0] = np.floor(timestamps[:, 0]) 25 | sentence2clip_st_ed[:, 1] = np.ceil(timestamps[:, 1]) 26 | sentence_idx2clip_indices = {sen_idx: set(range(clip_st_idx, clip_ed_idx)) 27 | for sen_idx, (clip_st_idx, clip_ed_idx) in enumerate(sentence2clip_st_ed)} 28 | all_clip_indices = set(flat_list_of_lists(list(sentence_idx2clip_indices.values()))) 29 | clip_idx2sentence_indices = \ 30 | {str(clip_idx): sorted([k for k, v in sentence_idx2clip_indices.items() if clip_idx in v]) 31 | for clip_idx in all_clip_indices} 32 | return clip_idx2sentence_indices 33 | 34 | 35 | def load_process_sub_meta(sub_meta_path, clip_length): 36 | """ which subtitle sentences should be assigned to which clips 37 | Args: 38 | sub_meta_path: contains a jsonl file, each line is a dict {"vid_name": str, "sub": list(dicts)}, 39 | each dict under "sub" is, e.g., {'text': " Chase : That's all this is?", 'start': 0.862, 'end': 1.862}. 40 | The dicts under "sub" are ordered the same as the original .srt files. 
41 | clip_length: float, assign each subtitle sentence to a clip segment 42 | Returns: 43 | """ 44 | video2sub = {e["vid_name"]: e for e in load_jsonl(sub_meta_path)} 45 | for vid_name, sub_info in tqdm(video2sub.items(), desc="processing subtitles"): 46 | sub_info["clip2sen"] = process_single_vid_sub(sub_info["sub"], clip_length) 47 | video2sub[vid_name] = sub_info 48 | return video2sub 49 | 50 | 51 | def convert_h5(sub_words_h5, vid_clip_h5, sub_clip_h5, video2sub_info, pool_type="max", debug=False): 52 | assert pool_type in ["max", "avg"] 53 | np_pool_func = np.max if pool_type == "max" else np.mean 54 | debug_cnt = 0 55 | not_equal_cnt = [] 56 | skip_cnt = 0 57 | for k in tqdm(sub_words_h5.keys(), desc="Converting to clip features"): 58 | if "-lengths" in k: 59 | continue 60 | sub_words_features = sub_words_h5[k] 61 | sub_sen_lengths = sub_words_h5[k + "-lengths"] 62 | num_sens = len(sub_sen_lengths) 63 | clip2sen = video2sub_info[k]["clip2sen"] 64 | 65 | if len(sub_sen_lengths) != len(video2sub_info[k]["sub"]): 66 | not_equal_cnt.append(len(video2sub_info[k]["sub"]) - len(sub_sen_lengths)) 67 | 68 | length_indices = [0, ] 69 | for i in range(len(sub_sen_lengths)): 70 | length_indices.append(length_indices[i] + sub_sen_lengths[i]) 71 | 72 | n_clips = len(vid_clip_h5[k]) 73 | clip_features = np.zeros((n_clips, sub_words_features.shape[-1]), dtype=np.float32) 74 | clip_mask = np.zeros(n_clips, dtype=np.float32) 75 | for clip_idx in range(n_clips): 76 | if str(clip_idx) in clip2sen: 77 | # the sen_indices tells which sentences belong to this clip, 78 | # e.g., [1, 2, 3] mean we should get [1, 4) to include all the indicated sentences 79 | sen_indices = [min(e, num_sens-1) for e in clip2sen[str(clip_idx)]] 80 | word_st_idx = length_indices[sen_indices[0]] 81 | word_ed_idx = length_indices[sen_indices[-1] + 1] 82 | if word_st_idx == word_ed_idx: 83 | skip_cnt += 1 84 | continue 85 | clip_features[clip_idx] = np_pool_func(sub_words_features[word_st_idx:word_ed_idx], axis=0) 86 | clip_mask[clip_idx] = 1 87 | sub_clip_h5.create_dataset(k, data=clip_features, dtype=np.float32) 88 | sub_clip_h5.create_dataset(k + "-mask", data=clip_mask, dtype=np.float32) 89 | debug_cnt += 1 90 | if debug and debug_cnt == 5: 91 | break 92 | print("skip_cnt {}".format(skip_cnt)) 93 | print("Counter not_equal_cnt {}".format(Counter(not_equal_cnt).most_common())) 94 | # Counter not_equal_cnt [(1, 150), (2, 7), (4, 1)] for clip_length==1.5 95 | 96 | 97 | def main_convert(): 98 | import argparse 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument("--src_h5_file", type=str, help="subtitle words level feature .h5 file") 101 | parser.add_argument("--vid_clip_h5_file", type=str, help="video clip level feature .h5 file") 102 | parser.add_argument("--sub_meta_path", type=str, help="processed subtitle .jsonl path") 103 | parser.add_argument("--tgt_h5_file", type=str, help=".h5 path to stores the converted data") 104 | parser.add_argument("--pool_type", type=str, default="max", 105 | choices=["max", "avg"], help="how to aggreate frame features") 106 | parser.add_argument("--clip_length", type=float, default=1.5) 107 | parser.add_argument("--debug", action="store_true") 108 | args = parser.parse_args() 109 | 110 | sub_info_cache_path = args.tgt_h5_file.replace(".h5", "_sub_info.json") 111 | if not os.path.exists(sub_info_cache_path): 112 | video2sub_info = load_process_sub_meta(args.sub_meta_path, clip_length=args.clip_length) 113 | save_json(video2sub_info, sub_info_cache_path) 114 | else: 115 | video2sub_info 
= load_json(sub_info_cache_path) 116 | with h5py.File(args.src_h5_file, "r") as src_h5: 117 | with h5py.File(args.vid_clip_h5_file, "r") as vid_clip_h5: 118 | with h5py.File(args.tgt_h5_file, "w") as tgt_h5: 119 | convert_h5(src_h5, vid_clip_h5, tgt_h5, video2sub_info, 120 | pool_type=args.pool_type, debug=args.debug) 121 | 122 | 123 | if __name__ == '__main__': 124 | main_convert() 125 | -------------------------------------------------------------------------------- /utils/text_feature/convert_sub_feature_word_to_clip.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/text_feature/convert_sub_feature_word_to_clip.sh POOL_TYPE CLIP_LENGTH [--debug] 4 | 5 | pool_type=$1 # [max, avg] 6 | clip_length=$2 7 | sub_token_h5_file=$3 8 | sub_clip_h5_file=$4 9 | vid_clip_h5_file=$5 # .h5 file stores the clip-level video features, to make sure subtitle clip-level features have the same length as the video features. 10 | sub_meta_path=data/tvqa_preprocessed_subtitles.jsonl 11 | 12 | python utils/text_feature/convert_sub_feature_word_to_clip.py \ 13 | --pool_type ${pool_type} \ 14 | --clip_length ${clip_length} \ 15 | --src_h5_file ${sub_token_h5_file} \ 16 | --tgt_h5_file ${sub_clip_h5_file} \ 17 | --sub_meta_path ${sub_meta_path} \ 18 | --vid_clip_h5_file ${vid_clip_h5_file} \ 19 | ${@:3} 20 | -------------------------------------------------------------------------------- /utils/text_feature/extract_single_sentence_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/text_feature/extract_single_sentence_embeddings.sh \ 4 | # OUTPUT_ROOT FINETUNE_MODE EXTRACTION_MODE SAVE_FILEPATH 5 | # Examples: 6 | # bash utils/text_feature/extract_single_sentence_embeddings.sh ${output_root} sub_query sub tvr_sub_pretrained_w_sub_query.h5 --debug 7 | # bash utils/text_feature/extract_single_sentence_embeddings.sh ${output_root} sub_query query tvr_query_pretrained_w_sub_query.h5 --debug 8 | output_root=$1 9 | finetune_mode=$2 # sub_query or query_only 10 | extraction_mode=$3 # sub or query 11 | extracted_file_name=$4 # tvr_query_pretrained_w_sub_query.h5, will be saved at output_dir 12 | 13 | data_root="data" 14 | train_data_file="${data_root}/tvr_train_release.jsonl" 15 | val_data_file="${data_root}/tvr_val_release.jsonl" 16 | test_data_file1="${data_root}/tvr_test_public_release.jsonl" 17 | sub_data_file="${data_root}/tvqa_preprocessed_subtitles.jsonl" 18 | 19 | ="/net/bvisionserver14/playpen-ssd/jielei/data/tvr/bert_feature" 20 | output_dir="${output_root}/${finetune_mode}" 21 | model_type="roberta" 22 | model_name_or_path="${output_dir}/roberta-base_tuned_model" 23 | 24 | 25 | if [[ ${extraction_mode} == query ]]; then 26 | max_length=30 27 | extra_args=(--train_data_file) 28 | extra_args+=(${train_data_file}) 29 | extra_args+=(${val_data_file}) 30 | extra_args+=(${test_data_file1}) 31 | elif [[ ${extraction_mode} == sub ]]; then 32 | max_length=256 33 | extra_args=(--use_sub) 34 | extra_args+=(--sub_data_file) 35 | extra_args+=(${sub_data_file}) 36 | fi 37 | 38 | python utils/text_feature/lm_finetuning_on_single_sentences.py \ 39 | --output_dir ${output_dir} \ 40 | --model_type ${model_type} \ 41 | --model_name_or_path ${model_name_or_path} \ 42 | --do_extract \ 43 | --extracted_file_name ${extracted_file_name} \ 44 | --block_size ${max_length} \ 45 | ${extra_args[@]} \ 46 | ${@:5} 47 | 
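As a quick sanity check on an extracted `.h5` feature file, the keys and feature shapes can be listed with `h5py`; a small sketch (for subtitle features, the `<vid_name>` / `<vid_name>-lengths` layout is the one consumed by `convert_sub_feature_word_to_clip.py`):
```
# Sketch: inspect an extracted feature .h5 file. For subtitle features each video
# has a "<vid_name>" dataset of shape (n_tokens, hidden_dim) plus a
# "<vid_name>-lengths" dataset holding per-sentence token counts.
import h5py


def inspect_h5(h5_path, max_items=5):
    with h5py.File(h5_path, "r") as f:
        for i, key in enumerate(f.keys()):
            print(key, f[key].shape, f[key].dtype)
            if i + 1 >= max_items:
                break


# e.g., inspect_h5("tvr_sub_pretrained_w_sub_query.h5")
```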
-------------------------------------------------------------------------------- /utils/text_feature/extract_single_sentence_tokens.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/text_feature/extract_single_sentence_tokens.sh \ 4 | # OUTPUT_ROOT FINETUNE_MODE EXTRACTION_MODE SAVE_FILEPATH 5 | # Examples: 6 | # bash utils/text_feature/extract_single_sentence_tokens.sh ${output_root} query query tvr_query_roberta_tokenized.jsonl --debug 7 | output_root=$1 8 | finetune_mode=$2 # sub_query or query_only 9 | extraction_mode=$3 # sub or query 10 | extracted_file_name=$4 # "*jsonl" file 11 | 12 | 13 | 14 | data_root="data" 15 | train_data_file="${data_root}/tvr_train_release.jsonl" 16 | val_data_file="${data_root}/tvr_val_release.jsonl" 17 | test_data_file1="${data_root}/tvr_test_public_release.jsonl" 18 | test_data_file2="${data_root}/tvr_test_challenge_release.jsonl" 19 | sub_data_file="${data_root}/tvqa_preprocessed_subtitles.jsonl" 20 | 21 | output_dir="${output_root}/${finetune_mode}" 22 | model_type="roberta" 23 | model_name_or_path="${output_dir}/roberta-base_tuned_model" 24 | 25 | 26 | if [[ ${extraction_mode} == query ]]; then 27 | max_length=30 28 | extra_args=(--train_data_file) 29 | extra_args+=(${train_data_file}) 30 | extra_args+=(${val_data_file}) 31 | extra_args+=(${test_data_file1}) 32 | extra_args+=(${test_data_file2}) 33 | #elif [[ ${extraction_mode} == sub ]]; then 34 | # max_length=256 35 | # extra_args=(--use_sub) 36 | # extra_args+=(--sub_data_file) 37 | # extra_args+=(${sub_data_file}) 38 | fi 39 | 40 | python utils/text_feature/lm_finetuning_on_single_sentences.py \ 41 | --output_dir ${output_dir} \ 42 | --model_type ${model_type} \ 43 | --model_name_or_path ${model_name_or_path} \ 44 | --do_tokenize \ 45 | --extracted_file_name ${extracted_file_name} \ 46 | ${extra_args[@]} \ 47 | ${@:5} 48 | -------------------------------------------------------------------------------- /utils/text_feature/preprocess_subtitles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running basic pre-processing for the .srt subtitle files from 3 | http://tvqa.cs.unc.edu/download_tvqa.html#tvqa-download-2. 4 | """ 5 | import re 6 | import os 7 | import pysrt 8 | import glob 9 | from tqdm import tqdm 10 | from utils.basic_utils import save_jsonl 11 | 12 | 13 | def convert_sub_time_to_seconds(sub_time): 14 | """sub_time is a SubRipTime object defined by pysrt""" 15 | return 60 * sub_time.minutes + sub_time.seconds + 0.001 * sub_time.milliseconds 16 | 17 | 18 | def clean_single_sub_sentence(sub_sentence): 19 | """sub_sentence: str, """ 20 | sub_sentence = sub_sentence.replace("\n", " ") 21 | sub_sentence = sub_sentence.replace("(", " ") 22 | sub_sentence = sub_sentence.replace(")", " ") 23 | sub_sentence = sub_sentence.replace(":", " : ") 24 | sub_sentence = re.sub(r"\s{2,}", " ", sub_sentence) 25 | return sub_sentence 26 | 27 | 28 | def preprocess_subtitles_from_dir(srt_dir, save_path): 29 | """ 30 | return: A python dict, the keys are the video names, the entries are lists, 31 | each contains all the text from a .srt file 32 | sub_times are the start time of the sentences. 33 | """ 34 | assert not os.path.exists(save_path), "File {} already exists".format(save_path) 35 | 36 | print("Loading srt files from %s ..." 
% srt_dir)
37 |     srt_paths = glob.glob(os.path.join(srt_dir, "*.srt"))
38 |     srt_datalist = []
39 |     for sub_path in tqdm(srt_paths, desc="Loop over subtitle files"):
40 |         subs = pysrt.open(sub_path, encoding="iso-8859-1")
41 |         if len(subs) == 0:
42 |             subs = pysrt.open(sub_path)
43 | 
44 |         sub_data = []
45 |         for cur_sub in subs:
46 |             sub_data.append(dict(
47 |                 text=clean_single_sub_sentence(cur_sub.text),
48 |                 start=convert_sub_time_to_seconds(cur_sub.start),
49 |                 end=convert_sub_time_to_seconds(cur_sub.end)
50 |             ))
51 | 
52 |         srt_datalist.append(dict(
53 |             vid_name=os.path.splitext(os.path.basename(sub_path))[0],
54 |             sub=sub_data
55 |         ))
56 |     save_jsonl(srt_datalist, save_path)
57 | 
58 | 
59 | if __name__ == '__main__':
60 |     import argparse
61 |     parser = argparse.ArgumentParser()
62 |     parser.add_argument("-srt_dir", type=str,
63 |                         help="path to the dir containing all the TVQA subtitle .srt files")
64 |     parser.add_argument("-save_path", type=str, help="path to save the preprocessed subtitles")
65 |     args = parser.parse_args()
66 | 
67 |     preprocess_subtitles_from_dir(args.srt_dir, args.save_path)
68 | 
--------------------------------------------------------------------------------
/utils/text_feature/train_lm_finetuning_single_sentence.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Usage: bash utils/text_feature/train_lm_finetuning_single_sentence.sh FINETUNE_MODE OUTPUT_ROOT
3 | finetune_mode=$1 # [query_only, sub_query]
4 | output_root=$2 # path to store the generated output
5 | data_root="data"
6 | train_data_file="${data_root}/tvr_train_release.jsonl"
7 | sub_data_file="${data_root}/tvqa_preprocessed_subtitles.jsonl"
8 | model_type="roberta"
9 | model_name_or_path="roberta-base"
10 | 
11 | num_train_epochs=1
12 | output_dir="${output_root}/${finetune_mode}/roberta-base_tuned_model"
13 | 
14 | if [[ ${finetune_mode} == query_only ]]; then
15 |     max_length=32
16 |     gradient_accumulation_steps=1
17 | 
18 |     extra_args=()
19 | elif [[ ${finetune_mode} == sub_query ]]; then
20 |     max_length=256 # since sub is longer
21 |     gradient_accumulation_steps=4
22 | 
23 |     extra_args=(--use_sub)
24 |     extra_args+=(--sub_data_file)
25 |     extra_args+=(${sub_data_file})
26 | fi
27 | 
28 | python utils/text_feature/lm_finetuning_on_single_sentences.py \
29 | --output_dir ${output_dir} \
30 | --model_type ${model_type} \
31 | --model_name_or_path ${model_name_or_path} \
32 | --do_train \
33 | --train_data_file ${train_data_file} \
34 | --gradient_accumulation_steps ${gradient_accumulation_steps} \
35 | --block_size ${max_length} \
36 | --mlm \
37 | --num_train_epochs ${num_train_epochs} \
38 | ${extra_args[@]} \
39 | ${@:3}
40 | 
--------------------------------------------------------------------------------
/utils/video_feature/README.md:
--------------------------------------------------------------------------------
1 | ### video feature extraction
2 | 
3 | #### I3D feature extraction requirements:
4 | - tensorflow-gpu==1.14
5 | - dm-sonnet-gpu==1.32
6 | - opencv-python
7 | 
8 | #### ResNet-152 feature extraction requirements:
9 | - PyTorch
10 | - Torchvision
11 | 
12 | Note that the video features released in
13 | [tvr_feature_release.tar.gz](https://drive.google.com/file/d/1j4mVkXjKCgafW3ReNjZ2Rk6CKx0Fk_n5/view?usp=sharing)
14 | are extracted from 15 FPS frames, which are not publicly available
15 | (we only released [3 FPS frames](http://tvqa.cs.unc.edu/download_tvqa.html#tvqa-download-4)).
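
#### Typical extraction pipeline

The scripts in this folder hard-code `feature_root` / `image_root` paths on the authors' machines and expect some
intermediate files to be placed in sub-folders such as `frm_level_resnet152/` and `i3d_featrues_by_show/`; adapt these
before running. The sketch below shows the order in which the scripts are intended to be chained, inferred from the
scripts themselves rather than an official recipe (`1.5` is the default `clip_length` of the python extractors):

```bash
# 1) per-show frame-level ResNet-152 and clip-level I3D features
for show_name in bbt friends grey house met castle; do
    bash utils/video_feature/extract_resnet152_2048_features.sh ${show_name}
    bash utils/video_feature/extract_i3d_features.sh ${show_name} 1.5
done
# 2) max-pool the frame-level ResNet-152 features into clip-level features
bash utils/video_feature/convert_feature_frm_to_clip.sh 1.5
# 3) merge the per-show I3D features and align their lengths with the ResNet-152 clips
bash utils/video_feature/merge_align_i3d.sh 1.5
# 4) L2-normalize both feature sets and concatenate them into a single .h5 file
bash utils/video_feature/normalize_and_concat.sh 1.5
```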
16 | -------------------------------------------------------------------------------- /utils/video_feature/convert_feature_frm_to_clip.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert frame level (FPS1) features for videos to clip level (FPS2) features, by pooling across multiple frames. 3 | 4 | FeaturePerSecond (FPS): FPS1 > FPS2. 5 | """ 6 | import os 7 | import h5py 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | 12 | def convert_for_single_h5(frm_h5, clip_h5, clip_boundaries_in_frm_idx, pool_type="max", debug=False): 13 | """ 14 | Args: 15 | frm_h5: h5py.File object, containing the frame level features 16 | clip_h5: h5py.File object, containing the clip level features 17 | clip_boundaries_in_frm_idx: list, features belong to clip `clip_idx` should be indexed as 18 | features[clip_boundaries_in_frm_idx[clip_idx]:clip_boundaries_in_frm_idx[clip_idx+1]] 19 | pool_type: max or avg 20 | debug: 21 | Returns: 22 | 23 | """ 24 | assert pool_type in ["max", "avg"] 25 | np_pool_func = np.max if pool_type == "max" else np.mean 26 | for k in tqdm(frm_h5.keys()): 27 | frm_features = frm_h5[k] 28 | clip_features = [] 29 | for idx in range(len(clip_boundaries_in_frm_idx)): 30 | cur_clip_feat = frm_features[clip_boundaries_in_frm_idx[idx]:clip_boundaries_in_frm_idx[idx+1]] 31 | if len(cur_clip_feat) == 0: 32 | break 33 | cur_clip_feat = np_pool_func(cur_clip_feat, axis=0, keepdims=True) 34 | clip_features.append(cur_clip_feat) 35 | clip_h5.create_dataset(k, data=np.concatenate(clip_features, axis=0), dtype=np.float32) 36 | if debug: 37 | break 38 | 39 | 40 | def get_clip2frm_idx_mapping(clip_length=1.5, max_video_length=300): 41 | """ This function depends on how the features are extracted. 42 | original features are extract from frames (video fps=30): 43 | [3, 13, 23] frame in a second. 44 | Args: 45 | clip_length: float, 46 | max_video_length: int, 47 | 48 | Returns: 49 | {clip_idx1 (int): [frm_idx0, frm_idx1, ...], 50 | ... 51 | } 52 | """ 53 | # frame 0 in the feature is actually the frame 3 in the original video, so its 54 | # corresponding time is 3 / 30 = 0.1s. More generally ==> [0.1, 0.43, 0.77] + n. 55 | frm2seconds = np.concatenate([ 56 | np.array([3, 13, 23]) / 30. + offset for offset in np.arange(0, max_video_length)], axis=0) 57 | 58 | clip_boundaries = np.arange(0, max_video_length, clip_length) 59 | # no need to worry about search boundary. 
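    # Worked example (clip_length=1.5): frm2seconds = [0.1, 0.43, 0.77, 1.1, 1.43, 1.77, ...] and
    # clip_boundaries = [0.0, 1.5, 3.0, ...], so np.searchsorted returns [0, 5, 9, ...], i.e. clip 0
    # covers feature rows 0:5 and clip 1 covers rows 5:9.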
60 |     # indexed as clip_boundaries_in_frm_idx[idx]:clip_boundaries_in_frm_idx[idx+1]
61 |     clip_boundaries_in_frm_idx = np.searchsorted(frm2seconds, clip_boundaries)
62 |     return clip_boundaries_in_frm_idx
63 | 
64 | 
65 | def main_convert():
66 |     import argparse
67 |     parser = argparse.ArgumentParser()
68 |     parser.add_argument("--src_h5_files", type=str, nargs='+', help="frm .h5 file paths")
69 |     parser.add_argument("--tgt_h5_file", type=str, help=".h5 path to store the converted data")
70 |     parser.add_argument("--pool_type", type=str, default="max",
71 |                         choices=["max", "avg"], help="how to aggregate frame features")
72 |     parser.add_argument("--clip_length", type=float, default=1.5)
73 |     parser.add_argument("--debug", action="store_true")
74 |     args = parser.parse_args()
75 | 
76 |     clip_boundaries_in_frm_idx = get_clip2frm_idx_mapping(clip_length=args.clip_length)
77 |     assert not os.path.exists(args.tgt_h5_file)
78 |     with h5py.File(args.tgt_h5_file, "a") as tgt_h5:
79 |         for src_f in args.src_h5_files:
80 |             with h5py.File(src_f, "r") as src_h5:
81 |                 convert_for_single_h5(src_h5, tgt_h5, clip_boundaries_in_frm_idx,
82 |                                       pool_type=args.pool_type, debug=args.debug)
83 | 
84 | 
85 | if __name__ == '__main__':
86 |     main_convert()
87 | 
--------------------------------------------------------------------------------
/utils/video_feature/convert_feature_frm_to_clip.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Usage:
3 | # bash utils/video_feature/convert_feature_frm_to_clip.sh [clip_length] ANY_OTHER_PYTHON_ARGS
4 | clip_length=${1}
5 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature
6 | src_h5_files=()
7 | for show_name in bbt friends grey house met castle
8 | do
9 |     cur_src_h5_file=${feature_root}/frm_level_resnet152/tvr_${show_name}_resnet152_3fps.h5
10 |     src_h5_files+=(${cur_src_h5_file})
11 | done
12 | echo "Running with src_h5_files ${src_h5_files[@]}"
13 | 
14 | pool_type=max
15 | tgt_h5_file=${feature_root}/tvr_resnet152_rgb_${pool_type}_cl-${clip_length}.h5
16 | 
17 | python utils/video_feature/convert_feature_frm_to_clip.py \
18 | --src_h5_files ${src_h5_files[@]} \
19 | --tgt_h5_file ${tgt_h5_file} \
20 | --pool_type ${pool_type} \
21 | --clip_length ${clip_length} \
22 | ${@:2}
23 | 
--------------------------------------------------------------------------------
/utils/video_feature/extract_i3d_features.py:
--------------------------------------------------------------------------------
1 | """Extract I3D RGB/Flow features (ActivityNet-style).
2 | Modified from [1] and [2]
3 | [1] https://github.com/deepmind/kinetics-i3d/blob/master/evaluate_sample.py
4 | [2] https://github.com/tensorflow/hub/blob/master/examples/colab/action_recognition_with_tf_hub.ipynb
5 | 
6 | Model Notes:
7 | For model performance on Kinetics-400, please see the repository. In a nutshell,
8 | 1) imagenet_pretrained models are better than scratch models
9 | 2) RGB models are better than Flow models
10 | 
11 | Dataset Notes:
12 | 1) Kinetics-400 has 400 classes, each with at least 400 video clips,
13 | 2) Kinetics-600 has 600 classes, each with at least 600 video clips.
14 | 
15 | Please find any missing files/resources/info from https://github.com/deepmind/kinetics-i3d.
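
Example invocation (mirroring utils/video_feature/extract_i3d_features.sh; the paths below are placeholders):
    python utils/video_feature/extract_i3d_features.py \
        --eval_type rgb600 \
        --batch_size 60 \
        --base_dir /path/to/frames \
        --feature_file /path/to/tvr_SHOW_i3d_rgb600_avg_cl-1.5.h5 \
        --cache_file cache/tvr_SHOW_vid_all_frm_pairs.pkl \
        --clip_length 1.5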
16 | """ 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import cv2 23 | import json 24 | import math 25 | import argparse 26 | import numpy as np 27 | import tensorflow as tf 28 | from tqdm import tqdm 29 | import time 30 | import utils.video_feature.i3d as i3d 31 | import h5py 32 | from multiprocessing import Pool 33 | 34 | from utils.basic_utils import save_lines, read_lines, load_pickle, save_pickle 35 | 36 | _IMAGE_SIZE = 224 37 | MIN_N_FRAMES = 9 38 | CLIP2N_FRAMES = { # fps ==15 39 | 1: 15, 40 | 1.5: 23 # evenly separated every 3 seconds, 41 | } 42 | 43 | 44 | _KINETICS_HOME = "/net/bvisionserver4/playpen10/jielei/tools/VideoFeatureExtraction/kinetics-i3d" 45 | _CHECKPOINT_PATHS = { 46 | "rgb": os.path.join(_KINETICS_HOME, "data/checkpoints/rgb_scratch/model.ckpt"), 47 | "rgb600": os.path.join(_KINETICS_HOME, "data/checkpoints/rgb_scratch_kin600/model.ckpt"), 48 | "flow": os.path.join(_KINETICS_HOME, "data/checkpoints/flow_scratch/model.ckpt"), 49 | "rgb_imagenet": os.path.join(_KINETICS_HOME, "data/checkpoints/rgb_imagenet/model.ckpt"), 50 | "flow_imagenet": os.path.join(_KINETICS_HOME, "data/checkpoints/flow_imagenet/model.ckpt"), 51 | } 52 | 53 | _LABEL_MAP_PATH = os.path.join(_KINETICS_HOME, "data/label_map.txt") 54 | _LABEL_MAP_PATH_600 = os.path.join(_KINETICS_HOME, "data/label_map_600.txt") 55 | 56 | 57 | def crop_center_square(frame): 58 | y, x = frame.shape[0:2] 59 | min_dim = min(y, x) 60 | start_x = (x // 2) - (min_dim // 2) 61 | start_y = (y // 2) - (min_dim // 2) 62 | return frame[start_y:start_y + min_dim, start_x:start_x + min_dim] 63 | 64 | 65 | def process_single_image(image_path, resize=(224, 224)): 66 | img = cv2.imread(image_path) # BGR image 67 | img = crop_center_square(img) 68 | img = cv2.resize(img, resize) 69 | return int(image_path.split("/")[-1].split(".")[0][-5:]), img[:, :, [2, 1, 0]] 70 | 71 | 72 | def process_images(multi_pool, image_paths): 73 | pairs = multi_pool.imap_unordered(process_single_image, image_paths) 74 | pairs = sorted(pairs, key=lambda x: x[0]) 75 | imgs = [e[1] for e in pairs] 76 | return np.array(imgs) / 255.0 77 | 78 | 79 | def mk_divisible(array, divisor): 80 | """array: (N x _IMAGE_SIZE x _IMAGE_SIZE x 3) 81 | append N to make it divisible by 82 | """ 83 | raw_length = len(array) 84 | residual = raw_length % divisor 85 | if residual != 0: 86 | if raw_length < divisor - residual: 87 | array = np.concatenate([array] + [array] * (int((divisor - residual) / raw_length) + 1))[-divisor:] 88 | else: 89 | array = np.concatenate([array, array[-int(divisor-residual):]], axis=0) 90 | return array 91 | 92 | 93 | def mk_batch(images_array, batch_size, clip_length=1.5): 94 | """images_array: N x _IMAGE_SIZE x _IMAGE_SIZE x 3 95 | return [B x _N_FRAMES x _IMAGE_SIZE x _IMAGE_SIZE x 3, ] (B <= batch_size) 96 | """ 97 | assert clip_length in CLIP2N_FRAMES 98 | n_frm = CLIP2N_FRAMES[clip_length] 99 | 100 | if clip_length == 1: 101 | n_frm = 15 102 | images_array = mk_divisible(images_array, n_frm) 103 | elif clip_length == 1.5: 104 | n_frm = 23 # math.ceil(45 / 2) 105 | n_frm_3_secs = 45 106 | clipwise_image_array = [] 107 | for idx in range(math.ceil(len(images_array)/n_frm_3_secs)): 108 | clipwise_image_array.append(images_array[idx * n_frm_3_secs: idx * n_frm_3_secs + n_frm]) 109 | clipwise_image_array.append(images_array[(idx+1) * n_frm_3_secs - n_frm: (idx+1) * n_frm_3_secs]) 110 | images_array = np.concatenate( 111 | [mk_divisible(e, n_frm) for 
e in clipwise_image_array if len(e) > 0], axis=0) 112 | 113 | images_array = images_array.reshape(-1, n_frm, _IMAGE_SIZE, _IMAGE_SIZE, 3) 114 | n_clips = len(images_array) 115 | if n_clips > batch_size: 116 | batches = [images_array[idx * batch_size:(idx + 1) * batch_size] for idx in 117 | range(int(n_clips / batch_size) + 1)] 118 | if len(batches[-1]) == 0: # when n_clips / batch_size is an integer 119 | del batches[-1] 120 | return batches 121 | else: 122 | return [images_array] 123 | 124 | 125 | def get_image_paths(dir_path, image_filename_pattern="img_{:05d}.jpg"): 126 | """each dir contains the same number of flow_x_{:05d}.jpg, flow_y_{:05d}.jpg, img_{:05d}.jpg. 127 | Index starts at 1, not 0, thus there is no img_00000.jpg, etc. 128 | """ 129 | num_rgb_images = int(len(os.listdir(dir_path)) / 3) # must be divisible by 3 130 | # original frames are extracted for the following frames, (video fps=30): [1-5], [11-15], [21-25] + 30*n 131 | selected_img_indices = np.arange(num_rgb_images) + 1 # index starting from 1 132 | return [image_filename_pattern.format(e) for e in selected_img_indices] 133 | 134 | 135 | def get_img_info_by_dir(base_dir, cache_file): 136 | """frm_info_list: list(sublist), 137 | each sublist[0] is vid_name, sublist[1] is an ordered list of image full paths, """ 138 | if os.path.exists(cache_file): 139 | tf.logging.info("Found cache file, loading at {}".format(cache_file)) 140 | return load_pickle(cache_file) 141 | tf.logging.info("Cache file not found, building from scratch") 142 | frm_info_list = [] 143 | sub_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] 144 | for k in tqdm(sub_dirs, desc="Get image info from directory"): 145 | frm_info_list.append([k, get_image_paths(os.path.join(base_dir, k))]) 146 | save_pickle(frm_info_list, cache_file) 147 | return frm_info_list 148 | 149 | 150 | def get_args(): 151 | parser = argparse.ArgumentParser("i3d feature extractor") 152 | parser.add_argument("--eval_type", type=str, default="rgb600", choices=["rgb", "rgb600"]) 153 | parser.add_argument("--imagenet_pretrained", default=False, action="store_true") 154 | parser.add_argument("--batch_size", type=int, default=100, help="batch_size * clips") 155 | parser.add_argument("--base_dir", type=str, help="frame_dir/*/*jpg") 156 | parser.add_argument("--feature_file", type=str, help="path to save the features") 157 | parser.add_argument("--cache_file", type=str, help="path to store all the videos") 158 | parser.add_argument("--clip_length", type=float, default=1.5, 159 | help="clip length in seconds, each clip will have its own feature") 160 | parser.add_argument("--debug", action="store_true") 161 | args = parser.parse_args() 162 | tf.logging.info("Args: %s", json.dumps(vars(args), indent=4, sort_keys=True)) 163 | return args 164 | 165 | 166 | def main(unused_argv): 167 | tf.logging.set_verbosity(tf.logging.INFO) 168 | args = get_args() 169 | eval_type = args.eval_type 170 | imagenet_pretrained = args.imagenet_pretrained 171 | 172 | NUM_CLASSES = 600 if eval_type == "rgb600" else 400 173 | 174 | if eval_type not in ["rgb", "rgb600", "flow", "joint"]: 175 | raise ValueError("Bad `eval_type`, must be one of rgb, rgb600, flow, joint") 176 | 177 | frame_infos = get_img_info_by_dir(args.base_dir, cache_file=args.cache_file) 178 | 179 | n_frm = CLIP2N_FRAMES[args.clip_length] 180 | assert n_frm >= MIN_N_FRAMES, "Number of input frames must be larger than or equal to 9" 181 | 182 | # RGB input has 3 channels. 
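    # Each batch element is one clip of n_frm consecutive 224x224 RGB frames (n_frm = 15 for
    # clip_length=1, 23 for clip_length=1.5, see CLIP2N_FRAMES); features are taken from the
    # "avg_pool3d" endpoint below and squeezed to shape (num_clips, feature_dim).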
183 | rgb_input = tf.placeholder(tf.float32, shape=(None, n_frm, _IMAGE_SIZE, _IMAGE_SIZE, 3)) 184 | 185 | with tf.variable_scope("RGB"): 186 | rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True, final_endpoint="Logits") 187 | rgb_logits, end_points = rgb_model(rgb_input, is_training=False, dropout_keep_prob=1.0) 188 | 189 | rgb_variable_map = {} 190 | for variable in tf.global_variables(): 191 | if eval_type == "rgb600": 192 | rgb_variable_map[variable.name.replace(":0", "")[len("RGB/inception_i3d/"):]] = variable 193 | else: 194 | rgb_variable_map[variable.name.replace(":0", "")] = variable 195 | 196 | rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True) 197 | 198 | with tf.Session() as sess: 199 | feed_dict = {} 200 | if imagenet_pretrained: 201 | rgb_saver.restore(sess, _CHECKPOINT_PATHS["rgb_imagenet"]) 202 | else: 203 | rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type]) 204 | tf.logging.info("RGB checkpoint restored") 205 | 206 | feed_dict[rgb_input] = np.random.randn(args.batch_size, n_frm, _IMAGE_SIZE, _IMAGE_SIZE, 3) 207 | avg_pool3d_feature = sess.run([end_points["avg_pool3d"]], feed_dict=feed_dict)[0] 208 | avg_pool3d_feature = np.squeeze(avg_pool3d_feature, axis=(1, 2, 3)) 209 | tf.logging.info("Test input size {}, output feature size {}" 210 | .format(feed_dict[rgb_input].shape, avg_pool3d_feature.shape)) 211 | 212 | pool = Pool(24) 213 | feat_h5 = h5py.File(args.feature_file, "a") 214 | exist_keys = list(feat_h5.keys()) 215 | debug_loop_cnt = 10 216 | frame_infos = [e for e in frame_infos if e[0] not in exist_keys] 217 | for videoname, frame_paths in tqdm(frame_infos, desc="Extracting"): 218 | frame_paths = [os.path.join(args.base_dir, videoname, e) for e in frame_paths] 219 | debug_loop_cnt -= 1 220 | if args.debug and debug_loop_cnt == 0: 221 | break 222 | try: 223 | images = process_images(pool, frame_paths) 224 | if len(images) == 0: 225 | continue 226 | 227 | batches = mk_batch(images, args.batch_size, clip_length=args.clip_length) 228 | features = [] 229 | for batch in batches: 230 | feed_dict[rgb_input] = batch 231 | avg_pool3d_feature = sess.run([end_points["avg_pool3d"]], feed_dict=feed_dict)[0] 232 | avg_pool3d_feature = np.squeeze(avg_pool3d_feature, axis=(1, 2, 3)) 233 | features.append(avg_pool3d_feature) 234 | 235 | # write to file 236 | feat_h5.create_dataset(videoname, data=np.concatenate(features, axis=0), dtype=np.float32) 237 | except Exception as e: 238 | print("Exception ", e) 239 | continue 240 | 241 | feat_h5.close() 242 | pool.close() 243 | 244 | 245 | if __name__ == "__main__": 246 | tf.app.run(main) 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /utils/video_feature/extract_i3d_features.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | show_name=$1 3 | clip_length=$2 4 | eval_type=rgb600 5 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature 6 | image_root=/net/bvisionserver4/playpen1/jielei/data/preprocessed_video_data/dense_flow_frames_step1_new 7 | feature_file=${feature_root}/tvr_${show_name}_i3d_${eval_type}_avg_cl-${clip_length}.h5 # !!!!! 
TODO 8 | cache_file=cache/tvr_${show_name}_vid_all_frm_pairs.pkl 9 | 10 | 11 | echo "Running with show ${show_name}" 12 | case ${show_name} in 13 | bbt) 14 | base_dir=${image_root}/new_bbt 15 | ;; 16 | friends | grey | house | met | castle) 17 | base_dir=${image_root}/${show_name} 18 | ;; 19 | *) 20 | echo -n "Unknown argument" 21 | ;; 22 | esac 23 | 24 | 25 | python utils/video_feature/extract_i3d_features.py \ 26 | --eval_type=${eval_type} \ 27 | --batch_size=60 \ 28 | --base_dir=${base_dir} \ 29 | --feature_file=${feature_file} \ 30 | --cache_file=${cache_file} \ 31 | --clip_length=${clip_length} \ 32 | ${@:3} 33 | -------------------------------------------------------------------------------- /utils/video_feature/extract_image_features.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import torch 3 | import torch.nn as nn 4 | import torch.backends.cudnn as cudnn 5 | import numpy as np 6 | import sys 7 | import six 8 | import os 9 | 10 | from torchvision import models, transforms 11 | from tqdm import tqdm 12 | from PIL import Image 13 | 14 | import logging 15 | logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 16 | datefmt='%m/%d/%Y %H:%M:%S', 17 | level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | IMAGENET_NORMALIZATION_PARAMS = dict( 22 | mean=[0.485, 0.456, 0.406], 23 | std=[0.229, 0.224, 0.225] 24 | ) 25 | 26 | 27 | class ImageNetResNetFeature(nn.Module): 28 | def __init__(self, output_dim="2048"): 29 | super(ImageNetResNetFeature, self).__init__() 30 | resnet = models.resnet152(pretrained=True) 31 | if output_dim == "2048": 32 | n_layers_to_rm = 1 # remove last fc layer 33 | elif output_dim == "2048x7x7": 34 | n_layers_to_rm = 2 # remove last fc layer and its precedent 7x7 avg pooling layer 35 | else: 36 | raise ValueError("Wrong value for argument output_dim") 37 | self.feature = nn.Sequential(*list(resnet.children())[:-n_layers_to_rm]) 38 | 39 | def forward(self, x): 40 | """return: B x 2048 or B x 2048x7x7""" 41 | return self.feature(x).squeeze() 42 | 43 | 44 | class ResNetC3FeatureExtractor(nn.Module): 45 | def __init__(self): 46 | super(ResNetC3FeatureExtractor, self).__init__() 47 | resnet = models.resnet152(pretrained=True) 48 | component_list = list(resnet.children())[:-3] 49 | component_list.extend(list(resnet.layer4.children())[:2]) 50 | self.resnet_base = nn.Sequential(*component_list) 51 | layer4_children = list(resnet.layer4.children())[2] 52 | 53 | # resnet.layer4[2].downsample is None 54 | self.layer4_head = nn.Sequential( 55 | layer4_children.conv1, 56 | layer4_children.bn1, 57 | layer4_children.relu, 58 | layer4_children.conv2, 59 | layer4_children.bn2, 60 | layer4_children.relu, 61 | ) 62 | 63 | def forward(self, x): 64 | base_out = self.resnet_base(x) 65 | c3_feature = self.layer4_head(base_out) 66 | return c3_feature 67 | 68 | 69 | def make_image_tensor(image_paths, zoom_out=1): 70 | tensors = [] 71 | for ele in image_paths: 72 | image = Image.open(ele).convert('RGB') 73 | image = imagenet_transform(image) 74 | image = image.view(1, 3, 224*zoom_out, 224*zoom_out) 75 | tensors.append(image) 76 | return torch.cat(tensors, 0) 77 | 78 | 79 | def get_image_batch_features(image_paths, net, batch_size, zoom_out=1): 80 | """ 81 | input: 82 | path to the frames for a single video 83 | return: 84 | image features for the frames 85 | """ 86 | num_batches = int(np.ceil(float(len(image_paths)) / batch_size)) 87 | feature_list = [] 88 | for i in 
range(num_batches):
89 |         inputs = make_image_tensor(image_paths[i*batch_size:(i+1)*batch_size], zoom_out=zoom_out)
90 |         inputs = inputs.cuda()
91 |         cur_features = net(inputs)
92 |         feature_list.append(cur_features)
93 |     features = torch.cat(feature_list, 0)
94 |     return features.data.cpu().numpy()
95 | 
96 | 
97 | def extract_all(feature_path, base_dir, video_name2image_filenames, video_names, net, batch_size,
98 |                 zoom_out=1, debug=False):
99 |     """
100 |     Args:
101 |         feature_path: h5py file path to save the features
102 |         base_dir: os.path.join(base_dir, vid_name, image_filename) is the absolute path to the image
103 |         video_name2image_filenames: dict(), with video names as keys, list of image filenames as values
104 |         video_names:
105 |         net:
106 |         batch_size:
107 |         zoom_out:
108 |         debug:
109 | 
110 |     Returns:
111 | 
112 |     """
113 |     feature_h5 = h5py.File(feature_path, "w")
114 | 
115 |     for i in tqdm(range(len(video_names)), desc="Extracting for videos"):
116 |         cur_vname = video_names[i]
117 |         image_paths = [os.path.join(base_dir, cur_vname, e) for e in video_name2image_filenames[cur_vname]]
118 |         try:
119 |             data_features = get_image_batch_features(image_paths, net, batch_size, zoom_out=zoom_out)
120 |         except Exception as e:
121 |             logger.debug(e)
122 |             continue
123 |         feature_h5.create_dataset(cur_vname, data=data_features, dtype=np.float32)
124 | 
125 |         if debug:
126 |             logger.info("subdir (key name) {}, feature shape {}".format(cur_vname, data_features.shape))
127 |             break
128 |     feature_h5.close()
129 | 
130 | 
131 | def get_image_paths(dir_path, image_filename_pattern="img_{:05d}.jpg", fps=15):
132 |     """each dir contains the same number of flow_x_{:05d}.jpg, flow_y_{:05d}.jpg, img_{:05d}.jpg.
133 |     Index starts at 1, not 0, thus there is no img_00000.jpg, etc.
134 |     """
135 |     num_rgb_images = int(len(os.listdir(dir_path)) / 3)  # must be divisible by 3
136 |     offsets_per_second = np.arange(0, num_rgb_images, fps)  # (0, 30, 15) => [0, 15]
137 |     # original frames are extracted for the following frames, (video fps=30): [1-5], [11-15], [21-25] + 30*n
138 |     offsets_inside_second = [3, 8, 13]  # the middle of every 5 frames; note this is not used for indexing.
139 |     selected_img_indices = np.concatenate(
140 |         [offsets_per_second + e for e in offsets_inside_second]
141 |         , axis=0)
142 |     selected_img_indices = selected_img_indices[selected_img_indices <= num_rgb_images]
143 |     return [image_filename_pattern.format(e) for e in selected_img_indices]
144 | 
145 | 
146 | if __name__ == "__main__":
147 |     # settings
148 |     import argparse
149 |     parser = argparse.ArgumentParser()
150 |     parser.add_argument("--feature_file", type=str, default=None)
151 |     parser.add_argument("--base_dir", type=str, default=None)
152 |     parser.add_argument("--feature_type", type=str, default="2048",
153 |                         choices=["2048", "2048x7x7", "c3"])
154 |     parser.add_argument("--zoom_out", type=int, default=1, help="224 * zoom_out is the input spatial size")
155 |     parser.add_argument("--batch_size", type=int, default=300)
156 |     parser.add_argument("--cache_dir", type=str, default="")
157 |     parser.add_argument("--bypass_user_input", action="store_true")
158 |     parser.add_argument("--debug", action="store_true")
159 |     args = parser.parse_args()
160 |     logging.info(vars(args))
161 | 
162 |     logger.info("[Phase 1] Setup feature extractor.")
163 |     # https://github.com/KaimingHe/deep-residual-networks/blob/master/prototxt/ResNet-152-deploy.prototxt
164 |     # see the link above for resnet architecture, layer_name, etc.
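    # feature_type "2048": globally pooled ResNet-152 features of shape (B, 2048);
    # "2048x7x7": the feature map before the final 7x7 average pooling; "c3": features from the
    # truncated layer4 head implemented by ResNetC3FeatureExtractor above.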
165 | feature_type = args.feature_type 166 | if feature_type == "2048": 167 | extractor = ImageNetResNetFeature(output_dim="2048") 168 | elif feature_type == "2048x7x7": 169 | extractor = ImageNetResNetFeature(output_dim="2048x7x7") 170 | elif feature_type == "c3": 171 | extractor = ResNetC3FeatureExtractor() 172 | else: 173 | raise NotImplementedError("Not supported feature type") 174 | 175 | # Step 2, set experiment settings 176 | logger.info("[Phase 2] Config settings.") 177 | 178 | if os.path.exists(args.feature_file): 179 | logger.info("feature_file {} already exists".format(args.feature_file)) 180 | sys.exit(1) 181 | 182 | USE_CUDA = torch.cuda.is_available() 183 | if not USE_CUDA: 184 | logger.info("no GPU available") 185 | sys.exit(1) 186 | cudnn.benchmark = True 187 | 188 | extractor.cuda() 189 | extractor.eval() 190 | 191 | zoom_out = args.zoom_out 192 | # testing 193 | with torch.no_grad(): 194 | sample_input = torch.randn(args.batch_size, 3, 224 * zoom_out, 224 * zoom_out) 195 | if USE_CUDA: 196 | sample_input = sample_input.cuda() 197 | logger.info(" Extraction on GPU.") 198 | sample_output1 = extractor(sample_input) 199 | 200 | logger.info(" Input Size is: {}".format(sample_input.shape)) 201 | logger.info(" Feature Size is: {}".format(sample_output1.shape)) 202 | if args.bypass_user_input: 203 | s = "y" 204 | else: 205 | s = six.moves.input("Do you want to proceed (Y/N): ") 206 | if s.lower() == "y": 207 | imagenet_transform = transforms.Compose([ 208 | transforms.Resize((224 * zoom_out, 224 * zoom_out)), 209 | transforms.ToTensor(), 210 | transforms.Normalize(**IMAGENET_NORMALIZATION_PARAMS), 211 | ]) 212 | 213 | logger.info("[Phase 3] : Feature Extraction") 214 | sub_dirs = [d for d in os.listdir(args.base_dir) if os.path.isdir(os.path.join(args.base_dir, d))] 215 | cache_video_name2image_filenames_path = \ 216 | os.path.join(args.cache_dir, "{}_video_name2image_filenames.cache.pt" 217 | .format(os.path.split(args.feature_file)[-1])) 218 | if os.path.exists(cache_video_name2image_filenames_path): 219 | logger.info("Loading from cache {}".format(cache_video_name2image_filenames_path)) 220 | video_name2image_filenames = torch.load(cache_video_name2image_filenames_path) 221 | else: 222 | logger.info("Cache not found, creating and saving at {}" 223 | .format(cache_video_name2image_filenames_path)) 224 | video_name2image_filenames = { 225 | k: get_image_paths(os.path.join(args.base_dir, k)) 226 | for k in tqdm(sub_dirs, desc="Gathering image paths for each video") 227 | } 228 | torch.save(video_name2image_filenames, cache_video_name2image_filenames_path) 229 | logger.info("video_name2image_filenames len {} keys[:3] {} values [0][:10] {}" 230 | .format(len(video_name2image_filenames), 231 | list(video_name2image_filenames.keys())[:3], 232 | list(video_name2image_filenames.values())[0][:10])) 233 | with torch.no_grad(): 234 | extract_all(args.feature_file, args.base_dir, video_name2image_filenames, sub_dirs, extractor, 235 | args.batch_size, zoom_out=zoom_out, debug=args.debug) 236 | else: 237 | logging.info("Aborting") 238 | -------------------------------------------------------------------------------- /utils/video_feature/extract_resnet152_2048_features.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | show_name=$1 3 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature 4 | image_root=/net/bvisionserver4/playpen1/jielei/data/preprocessed_video_data/dense_flow_frames_step1_new 5 | 
feature_file=${feature_root}/tvr_${show_name}_resnet152_3fps.h5 6 | cache_dir=cache 7 | 8 | 9 | echo "Running with show ${show_name}" 10 | case ${show_name} in 11 | bbt) 12 | base_dir=${image_root}/new_bbt 13 | ;; 14 | friends | grey | house | met | castle) 15 | base_dir=${image_root}/${show_name} 16 | ;; 17 | *) 18 | echo -n "Unknown argument" 19 | ;; 20 | esac 21 | 22 | 23 | python utils/video_feature/extract_image_features.py \ 24 | --feature_file=${feature_file} \ 25 | --base_dir=${base_dir} \ 26 | --feature_type=2048 \ 27 | --batch_size=300 \ 28 | --cache_dir=${cache_dir} \ 29 | ${@:2} 30 | -------------------------------------------------------------------------------- /utils/video_feature/merge_align_i3d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Merge i3d features from all shows. Meanwhile, align it with the imagenet feature 3 | so that they have the same number of feature vectors. 4 | """ 5 | import os 6 | import h5py 7 | import numpy as np 8 | from tqdm import tqdm 9 | from collections import Counter 10 | 11 | 12 | def convert_for_single_h5(src_h5, tgt_h5, align_h5_key2len, debug=False): 13 | """ 14 | Args: 15 | src_h5: h5py.File object, containing the frame level features 16 | tgt_h5: h5py.File object, containing the clip level features 17 | align_h5_key2len: dict, {key: len}, each value indicates the length (L) of the array (L, D) 18 | debug: 19 | Returns: 20 | 21 | """ 22 | for k, feat in tqdm(src_h5.items()): 23 | if k in align_h5_key2len: 24 | if len(feat) != align_h5_key2len[k]: 25 | align_len = align_h5_key2len[k] 26 | aligned_feat = np.zeros((align_h5_key2len[k], feat.shape[1]), dtype=np.float32) 27 | aligned_feat[:len(feat)] = feat[:align_len] 28 | feat = aligned_feat 29 | tgt_h5.create_dataset(k, data=feat, dtype=np.float32) 30 | else: 31 | print("Skipping {}".format(k)) 32 | if debug: 33 | break 34 | 35 | 36 | def get_clip2frm_idx_mapping(clip_length=1.5, max_video_length=300): 37 | """ This function depends on how the features are extracted. 38 | original features are extract from frames (video fps=30): 39 | [3, 13, 23] frame in a second. 40 | Args: 41 | clip_length: float, 42 | max_video_length: int, 43 | 44 | Returns: 45 | {clip_idx1 (int): [frm_idx0, frm_idx1, ...], 46 | ... 47 | } 48 | """ 49 | # frame 0 in the feature is actually the frame 3 in the original video, so its 50 | # corresponding time is 3 / 30 = 0.1s. More generally ==> [0.1, 0.43, 0.77] + n. 51 | frm2seconds = np.concatenate([ 52 | np.array([3, 13, 23]) / 30. + offset for offset in np.arange(0, max_video_length)], axis=0) 53 | 54 | clip_boundaries = np.arange(0, max_video_length, clip_length) 55 | # no need to worry about search boundary. 
56 | # indexed as clip_boundaries_in_frm_idx[idx]:clip_boundaries_in_frm_idx[idx+1] 57 | clip_boundaries_in_frm_idx = np.searchsorted(frm2seconds, clip_boundaries) 58 | return clip_boundaries_in_frm_idx 59 | 60 | 61 | def main_convert(): 62 | import argparse 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("--src_h5_files", type=str, nargs='+', help="frm .h5 file paths") 65 | parser.add_argument("--tgt_h5_file", type=str, help=".h5 path to stores the converted data") 66 | parser.add_argument("--align_h5_file", type=str, help=".h5 path to the file to align at length dim") 67 | parser.add_argument("--check_alignment_only", action="store_true", help="Check alignment only") 68 | parser.add_argument("--debug", action="store_true") 69 | args = parser.parse_args() 70 | 71 | with h5py.File(args.align_h5_file, "r") as align_h5: 72 | align_h5_key2len = {k: len(v) for k, v in tqdm(align_h5.items(), desc="[Get Length] Loop over align h5")} 73 | 74 | src_h5_key2len = {} 75 | for src_f in args.src_h5_files: 76 | with h5py.File(src_f, "r") as src_h5: 77 | for k, v in tqdm(src_h5.items(), desc="[Get length] Loop over one of the src h5"): 78 | src_h5_key2len[k] = len(v) 79 | 80 | not_found_keys = list(set(align_h5_key2len.keys()) - set(src_h5_key2len.keys())) 81 | diff_key2len = {k: align_h5_key2len[k] - src_h5_key2len[k] for k in align_h5_key2len if k in src_h5_key2len} 82 | diff_counter = Counter(list(diff_key2len.values())) 83 | print("Not found keys total {}, examples: {}".format(len(not_found_keys), not_found_keys[:3])) 84 | print("diff_counter {}".format(diff_counter.most_common())) 85 | 86 | if not args.check_alignment_only: 87 | assert not os.path.exists(args.tgt_h5_file) 88 | with h5py.File(args.tgt_h5_file, "a") as tgt_h5: 89 | for src_f in args.src_h5_files: 90 | with h5py.File(src_f, "r") as src_h5: 91 | convert_for_single_h5(src_h5, tgt_h5, align_h5_key2len, debug=args.debug) 92 | 93 | 94 | if __name__ == '__main__': 95 | main_convert() 96 | -------------------------------------------------------------------------------- /utils/video_feature/merge_align_i3d.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/video_feature/merge_align_i3d.sh [clip_length] ANY_OTHER_PYTHON_ARGS 4 | clip_length=${1} 5 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature 6 | src_h5_files=() 7 | for show_name in bbt friends grey house met castle 8 | do 9 | cur_src_h5_file=${feature_root}/i3d_featrues_by_show/tvr_${show_name}_i3d_rgb600_avg_cl-${clip_length}.h5 10 | src_h5_files+=(${cur_src_h5_file}) 11 | done 12 | echo "Running with src_h5_files ${src_h5_files}" 13 | 14 | pool_type=max 15 | tgt_h5_file=${feature_root}/tvr_i3d_rgb600_avg_cl-${clip_length}.h5 16 | align_h5_file=${feature_root}/tvr_resnet152_rgb_max_cl-${clip_length}.h5 17 | 18 | python utils/video_feature/merge_align_i3d.py \ 19 | --src_h5_files ${src_h5_files[@]} \ 20 | --tgt_h5_file ${tgt_h5_file} \ 21 | --align_h5_file ${align_h5_file} \ 22 | ${@:2} 23 | -------------------------------------------------------------------------------- /utils/video_feature/normalize_and_concat.py: -------------------------------------------------------------------------------- 1 | """ 2 | L2 Normalize then concat I3D and ResNet features 3 | """ 4 | import os 5 | import h5py 6 | import numpy as np 7 | from tqdm import tqdm 8 | from utils.basic_utils import l2_normalize_np_array 9 | 10 | 11 | def main_norm_cat(): 12 | import argparse 13 | 
parser = argparse.ArgumentParser() 14 | parser.add_argument("--resnet_h5_file", type=str, help="ResNet .h5 file paths") 15 | parser.add_argument("--i3d_h5_file", type=str, help="I3D .h5 file paths") 16 | parser.add_argument("--tgt_h5_file", type=str, help=".h5 path to stores the converted data") 17 | parser.add_argument("--debug", action="store_true") 18 | args = parser.parse_args() 19 | 20 | assert not os.path.exists(args.tgt_h5_file) 21 | with h5py.File(args.resnet_h5_file, "r") as resnet_h5: 22 | with h5py.File(args.i3d_h5_file, "r") as i3d_h5: 23 | with h5py.File(args.tgt_h5_file, "w") as tgt_h5: 24 | for k in tqdm(resnet_h5.keys()): 25 | resnet_feat = l2_normalize_np_array(resnet_h5[k][:]) 26 | i3d_feat = l2_normalize_np_array(i3d_h5[k][:]) 27 | tgt_h5.create_dataset(k, 28 | data=np.concatenate([resnet_feat, i3d_feat], axis=-1), 29 | dtype=np.float32) 30 | 31 | 32 | if __name__ == '__main__': 33 | main_norm_cat() 34 | -------------------------------------------------------------------------------- /utils/video_feature/normalize_and_concat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/video_feature/normalize_and_concat.sh [clip_length] ANY_OTHER_PYTHON_ARGS 4 | clip_length=${1} 5 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature 6 | resnet_h5_file=${feature_root}/tvr_resnet152_rgb_max_cl-${clip_length}.h5 7 | i3d_h5_file=${feature_root}/tvr_i3d_rgb600_avg_cl-${clip_length}.h5 8 | tgt_h5_file=${feature_root}/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-${clip_length}.h5 9 | 10 | python utils/video_feature/normalize_and_concat.py \ 11 | --resnet_h5_file ${resnet_h5_file} \ 12 | --i3d_h5_file ${i3d_h5_file} \ 13 | --tgt_h5_file ${tgt_h5_file} \ 14 | ${@:2} 15 | --------------------------------------------------------------------------------