├── .gitignore ├── LICENSE ├── README.md ├── __init__.py ├── data ├── tvqa_preprocessed_subtitles.jsonl ├── tvr_test_public_release.jsonl ├── tvr_train_release.jsonl ├── tvr_val_release.jsonl └── tvr_video2dur_idx.json ├── figures └── model.png ├── method_tvr ├── __init__.py ├── config.py ├── contrastive.py ├── inference.py ├── model.py ├── model_components.py ├── optimization.py ├── proposal.py ├── scripts │ ├── eval.sh │ ├── inference.sh │ └── train.sh ├── start_end_dataset.py └── train.py ├── setup.sh ├── standalone_eval ├── README.md ├── __init__.py ├── eval.py └── eval_sample.sh └── utils ├── __init__.py ├── basic_utils.py ├── mk_video_split_with_duration.py ├── model_utils.py ├── temporal_nms.py ├── tensor_utils.py ├── text_feature ├── README.md ├── convert_sub_feature_word_to_clip.py ├── convert_sub_feature_word_to_clip.sh ├── extract_single_sentence_embeddings.sh ├── extract_single_sentence_tokens.sh ├── lm_finetuning_on_single_sentences.py ├── preprocess_subtitles.py └── train_lm_finetuning_single_sentence.sh └── video_feature ├── README.md ├── convert_feature_frm_to_clip.py ├── convert_feature_frm_to_clip.sh ├── extract_i3d_features.py ├── extract_i3d_features.sh ├── extract_image_features.py ├── extract_resnet152_2048_features.sh ├── i3d.py ├── merge_align_i3d.py ├── merge_align_i3d.sh ├── normalize_and_concat.py └── normalize_and_concat.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # custom 132 | .idea/ 133 | .vscode/ 134 | .DS_Store 135 | *.DS_Store 136 | data/tvr_feature_release/ 137 | method_tvr/results/ 138 | method_act/results/ 139 | 140 | 141 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ZHANG HAO 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Video Corpus Moment Retrieval with Contrastive Learning 2 | 3 | PyTorch implementation for the paper "Video Corpus Moment Retrieval with Contrastive Learning" (**SIGIR 2021**, 4 | long paper): [SIGIR version](https://dl.acm.org/doi/10.1145/3404835.3462874), [ArXiv version]( 5 | https://arxiv.org/pdf/2105.06247.pdf). 6 | 7 | ![model_overview](./figures/model.png) 8 | 9 | > The codes are modified from [TVRetrieval](https://github.com/jayleicn/TVRetrieval). 
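As a quick sanity check of the model interface defined in `method_tvr/model.py`, the following minimal sketch builds a `ReLoCLNet` with a toy configuration and runs one forward pass on random tensors. The configuration values, feature dimensions, and tensor shapes below are illustrative placeholders (not the settings used in the paper); the real hyper-parameters are defined in `method_tvr/config.py` and filled in by the training script. The sketch assumes the project root is on `PYTHONPATH` (see the `source setup.sh` step under Getting started) and that the dependencies listed under Prerequisites are installed.
```python
import torch
from easydict import EasyDict as edict
from method_tvr.model import ReLoCLNet

# Toy configuration: illustrative values only; real options come from method_tvr/config.py.
config = edict(
    visual_input_size=1024, sub_input_size=768, query_input_size=768,       # raw feature dims
    hidden_size=384, n_heads=8, input_drop=0.1, drop=0.1,                   # encoder settings
    max_desc_l=30, max_ctx_l=128,                                           # max query / context lengths
    conv_kernel_size=5, conv_stride=1, initializer_range=0.02,
    margin=0.1, ranking_loss_type="hinge",
    lw_st_ed=0.01, lw_fcl=0.03, lw_vcl=0.03, lw_neg_q=1.0, lw_neg_ctx=1.0,  # loss weights
    use_hard_negative=False, hard_pool_size=20)
model = ReLoCLNet(config)

N, Lq, Lv = 4, 30, 128  # batch size, query length, number of video clips
query_feat, query_mask = torch.randn(N, Lq, config.query_input_size), torch.ones(N, Lq)
video_feat, video_mask = torch.randn(N, Lv, config.visual_input_size), torch.ones(N, Lv)
sub_feat, sub_mask = torch.randn(N, Lv, config.sub_input_size), torch.ones(N, Lv)
st_ed_indices = torch.tensor([[10, 20]] * N)          # ground-truth start/end clip indices
match_labels = torch.zeros(N, Lv, dtype=torch.long)   # 1 marks clips inside the target moment
match_labels[:, 10:21] = 1

loss, loss_dict = model(query_feat, query_mask, video_feat, video_mask,
                        sub_feat, sub_mask, st_ed_indices, match_labels)
print(loss_dict)  # overall loss plus st/ed, frame-CL, video-CL and video-retrieval terms
```
In actual training the inputs are produced by the data loading code (e.g. `method_tvr/start_end_dataset.py`) driven by `method_tvr/train.py`; the sketch above is only meant to show the expected input shapes and the loss terms returned by the model.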
10 | 11 | ## Prerequisites 12 | - python 3.x with pytorch (`1.7.0`), torchvision, transformers, tensorboard, tqdm, h5py, easydict 13 | - cuda, cudnn 14 | 15 | If you have [Anaconda](https://www.anaconda.com/distribution/) installed, the conda environment of ReLoCLNet can be 16 | built as follows (taking python 3.7 as an example): 17 | ```shell 18 | conda create --name reloclnet python=3.7 19 | conda activate reloclnet 20 | conda install -c anaconda cudatoolkit cudnn # ignore this if you already have cuda installed 21 | conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=11.0 -c pytorch 22 | conda install -c anaconda h5py=2.9.0 23 | conda install -c conda-forge transformers tensorboard tqdm easydict 24 | ``` 25 | > The conda environment of [TVRetrieval](https://github.com/jayleicn/TVRetrieval) also works. 26 | 27 | 28 | ## Getting started 29 | 1. Clone this repository 30 | ```shell 31 | $ git clone git@github.com:IsaacChanghau/ReLoCLNet.git 32 | $ cd ReLoCLNet 33 | ``` 34 | 35 | 2. Download features 36 | 37 | For the features of the TVR dataset, please download [tvr_feature_release.tar.gz]( 38 | https://drive.google.com/file/d/1j4mVkXjKCgafW3ReNjZ2Rk6CKx0Fk_n5/view?usp=sharing) (the link is copied from 39 | [TVRetrieval#prerequisites](https://github.com/jayleicn/TVRetrieval#prerequisites)) and extract it to the `data` 40 | directory: 41 | ```shell 42 | $ tar -xf path/to/tvr_feature_release.tar.gz -C data 43 | ``` 44 | This [link](https://medium.com/@acpanjan/download-google-drive-files-using-wget-3c2c025a8b99) may be useful if you want to 45 | download Google Drive files directly using `wget`. Please refer to [TVRetrieval#prerequisites]( 46 | https://github.com/jayleicn/TVRetrieval#prerequisites) for more details about how the features are extracted, if you are 47 | interested. 48 | 49 | 3. Add the project root to `PYTHONPATH` (**Note that you need to do this each time you start a new session.**) 50 | ```shell 51 | $ source setup.sh 52 | ``` 53 | 54 | ## Training and Inference 55 | 56 | **TVR dataset** 57 | ```shell 58 | # train, refer to `method_tvr/scripts/train.sh` and `method_tvr/config.py` for more details about hyper-parameters 59 | $ bash method_tvr/scripts/train.sh tvr video_sub_tef resnet_i3d --exp_id reloclnet 60 | # inference 61 | # the trained model directory is placed in method_tvr/results/tvr-video_sub_tef-reloclnet-* 62 | # set MODEL_DIR_NAME to tvr-video_sub_tef-reloclnet-* 63 | # SPLIT_NAME: [val | test] 64 | $ bash method_tvr/scripts/inference.sh MODEL_DIR_NAME SPLIT_NAME 65 | ``` 66 | 67 | For more details about evaluation and submission, please refer to [TVRetrieval#training-and-inference]( 68 | https://github.com/jayleicn/TVRetrieval#training-and-inference). 69 | 70 | ## Citation 71 | If you find this project helpful to your research, please cite our work.
72 | ``` 73 | @inproceedings{zhang2021video, 74 | author = {Zhang, Hao and Sun, Aixin and Jing, Wei and Nan, Guoshun and Zhen, Liangli and Zhou, Joey Tianyi and Goh, Rick Siow Mong}, 75 | title = {Video Corpus Moment Retrieval with Contrastive Learning}, 76 | year = {2021}, 77 | isbn = {9781450380379}, 78 | publisher = {Association for Computing Machinery}, 79 | address = {New York, NY, USA}, 80 | url = {https://doi.org/10.1145/3404835.3462874}, 81 | doi = {10.1145/3404835.3462874}, 82 | booktitle = {Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval}, 83 | pages = {685–695}, 84 | numpages = {11}, 85 | location = {Virtual Event, Canada}, 86 | series = {SIGIR '21} 87 | } 88 | ``` 89 | 90 | ## TODO 91 | - Upload codes for ActivityNet Captions dataset 92 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/__init__.py -------------------------------------------------------------------------------- /figures/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/figures/model.png -------------------------------------------------------------------------------- /method_tvr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/method_tvr/__init__.py -------------------------------------------------------------------------------- /method_tvr/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import argparse 5 | from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile 6 | from method_tvr.proposal import ProposalConfigs 7 | 8 | 9 | class BaseOptions(object): 10 | saved_option_filename = "opt.json" 11 | ckpt_filename = "model.ckpt" 12 | tensorboard_log_dir = "tensorboard_log" 13 | train_log_filename = "train.log.txt" 14 | eval_log_filename = "eval.log.txt" 15 | 16 | def __init__(self): 17 | self.parser = argparse.ArgumentParser() 18 | self.initialized = False 19 | self.opt = None 20 | 21 | def initialize(self): 22 | self.initialized = True 23 | self.parser.add_argument("--dset_name", type=str, choices=["tvr"]) 24 | self.parser.add_argument("--eval_split_name", type=str, default="val", 25 | help="should match keys in video_duration_idx_path, must set for VCMR") 26 | self.parser.add_argument("--debug", action="store_true", 27 | help="debug (fast) mode, break all loops, do not load all data into memory.") 28 | self.parser.add_argument("--data_ratio", type=float, default=1.0, 29 | help="how many training and eval data to use. 1.0: use all, 0.1: use 10%." 30 | "Use small portion for debug purposes. 
Note this is different from --debug, " 31 | "which works by breaking the loops, typically they are not used together.") 32 | self.parser.add_argument("--results_root", type=str, default="results") 33 | self.parser.add_argument("--exp_id", type=str, default=None, help="id of this run, required at training") 34 | self.parser.add_argument("--seed", type=int, default=2018, help="random seed") 35 | self.parser.add_argument("--device", type=int, default=0, help="0 cuda, -1 cpu") 36 | self.parser.add_argument("--device_ids", type=int, nargs="+", default=[0], help="GPU ids to run the job") 37 | self.parser.add_argument("--num_workers", type=int, default=8, 38 | help="num subprocesses used to load the data, 0: use main process") 39 | self.parser.add_argument("--no_core_driver", action="store_true", 40 | help="hdf5 driver, default use `core` (load into RAM), if specified, use `None`") 41 | self.parser.add_argument("--no_pin_memory", action="store_true", help="No use pin_memory=True for dataloader") 42 | # training config 43 | self.parser.add_argument("--lr", type=float, default=1e-4, help="learning rate") 44 | self.parser.add_argument("--lr_warmup_proportion", type=float, default=0.01, 45 | help="Proportion of training to perform linear learning rate warmup.") 46 | self.parser.add_argument("--wd", type=float, default=0.01, help="weight decay") 47 | self.parser.add_argument("--n_epoch", type=int, default=100, help="number of epochs to run") 48 | self.parser.add_argument("--max_es_cnt", type=int, default=10, 49 | help="number of epochs to early stop, use -1 to disable early stop") 50 | self.parser.add_argument("--stop_task", type=str, default="VCMR", choices=["VCMR", "SVMR", "VR"], 51 | help="Use metric associated with stop_task for early stop") 52 | self.parser.add_argument("--eval_tasks_at_training", type=str, nargs="+", default=["VCMR", "SVMR", "VR"], 53 | choices=["VCMR", "SVMR", "VR"], help="evaluate and report numbers for tasks.") 54 | self.parser.add_argument("--bsz", type=int, default=128, help="mini-batch size") 55 | self.parser.add_argument("--eval_query_bsz", type=int, default=50, help="minibatch size at inference for query") 56 | self.parser.add_argument("--eval_context_bsz", type=int, default=200, 57 | help="mini-batch size at inference, for video/sub") 58 | self.parser.add_argument("--eval_untrained", action="store_true", help="Evaluate on un-trained model") 59 | self.parser.add_argument("--grad_clip", type=float, default=-1, help="perform gradient clip, -1: disable") 60 | self.parser.add_argument("--margin", type=float, default=0.1, help="margin for hinge loss") 61 | self.parser.add_argument("--lw_neg_q", type=float, default=1, 62 | help="weight for ranking loss with negative query and positive context") 63 | self.parser.add_argument("--lw_neg_ctx", type=float, default=1, 64 | help="weight for ranking loss with positive query and negative context") 65 | self.parser.add_argument("--lw_st_ed", type=float, default=0.01, help="weight for st ed prediction loss") 66 | self.parser.add_argument("--lw_fcl", type=float, default=0.03, help="weight for frame CL loss") 67 | self.parser.add_argument("--lw_vcl", type=float, default=0.03, help="weight for video CL loss") 68 | self.parser.add_argument("--train_span_start_epoch", type=int, default=0, 69 | help="which epoch to start training span prediction, -1 to disable") 70 | self.parser.add_argument("--ranking_loss_type", type=str, default="hinge", choices=["hinge", "lse"], 71 | help="att loss type, can be hinge loss or its smooth approximation 
LogSumExp") 72 | self.parser.add_argument("--hard_negative_start_epoch", type=int, default=20, 73 | help="which epoch to start hard negative sampling for video-level ranking loss," 74 | "use -1 to disable") 75 | self.parser.add_argument("--hard_pool_size", type=int, default=20, 76 | help="hard negatives are still sampled, but from a harder pool.") 77 | # Model and Data config 78 | self.parser.add_argument("--max_sub_l", type=int, default=50, 79 | help="max length of all sub sentence 97.71 under 50 for 3 sentences") 80 | self.parser.add_argument("--max_desc_l", type=int, default=30, help="max length of descriptions") 81 | self.parser.add_argument("--max_ctx_l", type=int, default=128, 82 | help="max number of snippets, 100 for tvr clip_length=1.5, oly 109/21825 > 100") 83 | self.parser.add_argument("--train_path", type=str, default=None) 84 | self.parser.add_argument("--eval_path", type=str, default=None, 85 | help="Evaluating during training, for Dev set. If None, will only do training, " 86 | "anet_cap and charades_sta has no dev set, so None") 87 | self.parser.add_argument("--desc_bert_path", type=str, default=None) 88 | self.parser.add_argument("--sub_bert_path", type=str, default=None) 89 | self.parser.add_argument("--sub_feat_size", type=int, default=768, help="feature dim for sub feature") 90 | self.parser.add_argument("--q_feat_size", type=int, default=768, help="feature dim for sub feature") 91 | self.parser.add_argument("--ctx_mode", type=str, help="which context to use a combination of [video, sub, tef]", 92 | choices=["video", "sub", "video_sub", "tef", "video_tef", "sub_tef", "video_sub_tef"]) 93 | self.parser.add_argument("--video_duration_idx_path", type=str, default=None) 94 | self.parser.add_argument("--vid_feat_path", type=str, default="") 95 | self.parser.add_argument("--no_norm_vfeat", action="store_true", 96 | help="Do not do normalization on video feat, use it only when using resnet_i3d feat") 97 | self.parser.add_argument("--no_norm_tfeat", action="store_true", help="Do not do normalization on text feat") 98 | self.parser.add_argument("--clip_length", type=float, default=None, 99 | help="each video will be uniformly segmented into small clips, " 100 | "will automatically loaded from ProposalConfigs if None") 101 | self.parser.add_argument("--vid_feat_size", type=int, help="feature dim for video feature") 102 | self.parser.add_argument("--max_position_embeddings", type=int, default=300) 103 | self.parser.add_argument("--hidden_size", type=int, default=384) 104 | self.parser.add_argument("--n_heads", type=int, default=8) 105 | self.parser.add_argument("--input_drop", type=float, default=0.1, help="Applied to all inputs") 106 | self.parser.add_argument("--drop", type=float, default=0.1, help="Applied to all other layers") 107 | self.parser.add_argument("--conv_kernel_size", type=int, default=5) 108 | self.parser.add_argument("--conv_stride", type=int, default=1) 109 | self.parser.add_argument("--initializer_range", type=float, default=0.02, help="initializer range for layers") 110 | # post processing 111 | self.parser.add_argument("--min_pred_l", type=int, default=2, 112 | help="constrain the [st, ed] with ed - st >= 2 (2 clips with length 1.5 each, 3 secs " 113 | "in total this is the min length for proposal-based backup_method)") 114 | self.parser.add_argument("--max_pred_l", type=int, default=16, 115 | help="constrain the [st, ed] pairs with ed - st <= 16, 24 secs in total (16 clips " 116 | "with length 1.5 each, this is the max length for proposal-based 
backup_method)") 117 | self.parser.add_argument("--q2c_alpha", type=float, default=30, 118 | help="give more importance to top scored videos' spans, " 119 | "the new score will be: s_new = exp(alpha * s), " 120 | "higher alpha indicates more importance. Note s in [-1, 1]") 121 | self.parser.add_argument("--max_before_nms", type=int, default=200) 122 | self.parser.add_argument("--max_vcmr_video", type=int, default=100, help="re-ranking in top-max_vcmr_video") 123 | self.parser.add_argument("--nms_thd", type=float, default=-1, 124 | help="additionally use non-maximum suppression (or non-minimum suppression for " 125 | "distance) to post-processing the predictions. -1: do not use nms. 0.6 for " 126 | "charades_sta, 0.5 for anet_cap") 127 | 128 | def display_save(self, opt): 129 | args = vars(opt) 130 | # Display settings 131 | print("------------ Options -------------\n{}\n-------------------".format({str(k): str(v) for k, v in 132 | sorted(args.items())})) 133 | # Save settings 134 | if not isinstance(self, TestOptions): 135 | option_file_path = os.path.join(opt.results_dir, self.saved_option_filename) # not yaml file indeed 136 | save_json(args, option_file_path, save_pretty=True) 137 | 138 | def parse(self): 139 | if not self.initialized: 140 | self.initialize() 141 | opt = self.parser.parse_args() 142 | if opt.debug: 143 | opt.results_root = os.path.sep.join(opt.results_root.split(os.path.sep)[:-1] + ["debug_results", ]) 144 | opt.no_core_driver = True 145 | opt.num_workers = 0 146 | opt.eval_query_bsz = 100 147 | if isinstance(self, TestOptions): 148 | # modify model_dir to absolute path 149 | opt.model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "results", opt.model_dir) 150 | saved_options = load_json(os.path.join(opt.model_dir, self.saved_option_filename)) 151 | for arg in saved_options: # use saved options to overwrite all BaseOptions args. 
152 | if arg not in ["results_root", "num_workers", "nms_thd", "debug", 153 | "eval_split_name", "eval_path", "eval_query_bsz", "eval_context_bsz", 154 | "max_pred_l", "min_pred_l", "external_inference_vr_res_path"]: 155 | setattr(opt, arg, saved_options[arg]) 156 | else: 157 | if opt.exp_id is None: 158 | raise ValueError("--exp_id is required for at a training option!") 159 | if opt.clip_length is None: 160 | opt.clip_length = ProposalConfigs[opt.dset_name]["clip_length"] 161 | print("Loaded clip_length {} from proposal config file".format(opt.clip_length)) 162 | opt.results_dir = os.path.join(opt.results_root, "-".join([opt.dset_name, opt.ctx_mode, opt.exp_id, 163 | time.strftime("%Y_%m_%d_%H_%M_%S")])) 164 | mkdirp(opt.results_dir) 165 | # save a copy of current code 166 | code_dir = os.path.dirname(os.path.realpath(__file__)) 167 | code_zip_filename = os.path.join(opt.results_dir, "code.zip") 168 | make_zipfile(code_dir, code_zip_filename, enclosing_dir="code", exclude_dirs_substring="results", 169 | exclude_dirs=["results", "debug_results", "__pycache__"], 170 | exclude_extensions=[".pyc", ".ipynb", ".swap"],) 171 | self.display_save(opt) 172 | if "sub" in opt.ctx_mode: 173 | assert opt.dset_name == "tvr", "sub is only supported for tvr dataset" 174 | if opt.hard_negative_start_epoch != -1: 175 | if opt.hard_pool_size > opt.bsz: 176 | print("[WARNING] hard_pool_size is larger than bsz") 177 | assert opt.stop_task in opt.eval_tasks_at_training 178 | opt.ckpt_filepath = os.path.join(opt.results_dir, self.ckpt_filename) 179 | opt.train_log_filepath = os.path.join(opt.results_dir, self.train_log_filename) 180 | opt.eval_log_filepath = os.path.join(opt.results_dir, self.eval_log_filename) 181 | opt.tensorboard_log_dir = os.path.join(opt.results_dir, self.tensorboard_log_dir) 182 | opt.device = torch.device("cuda:%d" % opt.device_ids[0] if opt.device >= 0 else "cpu") 183 | opt.h5driver = None if opt.no_core_driver else "core" 184 | # num_workers > 1 will only work with "core" mode, i.e., memory-mapped hdf5 185 | opt.num_workers = 1 if opt.no_core_driver else opt.num_workers 186 | opt.pin_memory = not opt.no_pin_memory 187 | if "video" in opt.ctx_mode and opt.vid_feat_size > 3000: # 3072, the normalized concatenation of resnet+i3d 188 | assert opt.no_norm_vfeat 189 | if "tef" in opt.ctx_mode and "video" in opt.ctx_mode: 190 | opt.vid_feat_size += 2 191 | if "tef" in opt.ctx_mode and "sub" in opt.ctx_mode: 192 | opt.sub_feat_size += 2 193 | self.opt = opt 194 | return opt 195 | 196 | 197 | class TestOptions(BaseOptions): 198 | """add additional options for evaluating""" 199 | def initialize(self): 200 | BaseOptions.initialize(self) 201 | # also need to specify --eval_split_name 202 | self.parser.add_argument("--eval_id", type=str, help="evaluation id") 203 | self.parser.add_argument("--model_dir", type=str, 204 | help="dir contains the model file, will be converted to absolute path afterwards") 205 | self.parser.add_argument("--tasks", type=str, nargs="+", 206 | choices=["VCMR", "SVMR", "VR"], default=["VCMR", "SVMR", "VR"], 207 | help="Which tasks to run." 208 | "VCMR: Video Corpus Moment Retrieval;" 209 | "SVMR: Single Video Moment Retrieval;" 210 | "VR: regular Video Retrieval. 
(will be performed automatically with VCMR)") 211 | -------------------------------------------------------------------------------- /method_tvr/contrastive.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import torch.nn.functional as F 4 | 5 | 6 | def log_sum_exp(x, axis=None): 7 | """ 8 | Log sum exp function 9 | Args: 10 | x: Input. 11 | axis: Axis over which to perform sum. 12 | Returns: 13 | torch.Tensor: log sum exp 14 | """ 15 | x_max = torch.max(x, axis)[0] 16 | y = torch.log((torch.exp(x - x_max)).sum(axis)) + x_max 17 | return y 18 | 19 | 20 | def get_positive_expectation(p_samples, measure='JSD', average=True): 21 | """ 22 | Computes the positive part of a divergence / difference. 23 | Args: 24 | p_samples: Positive samples. 25 | measure: Measure to compute for. 26 | average: Average the result over samples. 27 | Returns: 28 | torch.Tensor 29 | """ 30 | log_2 = math.log(2.) 31 | if measure == 'GAN': 32 | Ep = - F.softplus(-p_samples) 33 | elif measure == 'JSD': 34 | Ep = log_2 - F.softplus(-p_samples) 35 | elif measure == 'X2': 36 | Ep = p_samples ** 2 37 | elif measure == 'KL': 38 | Ep = p_samples + 1. 39 | elif measure == 'RKL': 40 | Ep = -torch.exp(-p_samples) 41 | elif measure == 'DV': 42 | Ep = p_samples 43 | elif measure == 'H2': 44 | Ep = torch.ones_like(p_samples) - torch.exp(-p_samples) 45 | elif measure == 'W1': 46 | Ep = p_samples 47 | else: 48 | raise ValueError('Unknown measurement {}'.format(measure)) 49 | if average: 50 | return Ep.mean() 51 | else: 52 | return Ep 53 | 54 | 55 | def get_negative_expectation(q_samples, measure='JSD', average=True): 56 | """ 57 | Computes the negative part of a divergence / difference. 58 | Args: 59 | q_samples: Negative samples. 60 | measure: Measure to compute for. 61 | average: Average the result over samples. 62 | Returns: 63 | torch.Tensor 64 | """ 65 | log_2 = math.log(2.) 66 | if measure == 'GAN': 67 | Eq = F.softplus(-q_samples) + q_samples 68 | elif measure == 'JSD': 69 | Eq = F.softplus(-q_samples) + q_samples - log_2 70 | elif measure == 'X2': 71 | Eq = -0.5 * ((torch.sqrt(q_samples ** 2) + 1.) ** 2) 72 | elif measure == 'KL': 73 | Eq = torch.exp(q_samples) 74 | elif measure == 'RKL': 75 | Eq = q_samples - 1. 76 | elif measure == 'DV': 77 | Eq = log_sum_exp(q_samples, 0) - math.log(q_samples.size(0)) 78 | elif measure == 'H2': 79 | Eq = torch.exp(q_samples) - 1. 80 | elif measure == 'W1': 81 | Eq = q_samples 82 | else: 83 | raise ValueError('Unknown measurement {}'.format(measure)) 84 | if average: 85 | return Eq.mean() 86 | else: 87 | return Eq 88 | 89 | 90 | def batch_video_query_loss(video, query, match_labels, mask, measure='JSD'): 91 | """ 92 | QV-CL module 93 | Computing the Contrastive Loss between the video and query. 94 | :param video: video rep (bsz, Lv, dim) 95 | :param query: query rep (bsz, dim) 96 | :param match_labels: match labels (bsz, Lv) 97 | :param mask: mask (bsz, Lv) 98 | :param measure: estimator of the mutual information 99 | :return: L_{qv} 100 | """ 101 | # generate mask 102 | pos_mask = match_labels.type(torch.float32) # (bsz, Lv) 103 | neg_mask = (torch.ones_like(pos_mask) - pos_mask) * mask # (bsz, Lv) 104 | 105 | # compute scores 106 | query = query.unsqueeze(2) # (bsz, dim, 1) 107 | res = torch.matmul(video, query).squeeze(2) # (bsz, Lv) 108 | 109 | # computing expectation for the MI between the target moment (positive samples) and query. 
110 | E_pos = get_positive_expectation(res * pos_mask, measure, average=False) 111 | E_pos = torch.sum(E_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12) # (bsz, ) 112 | 113 | # computing expectation for the MI between clips except target moment (negative samples) and query. 114 | E_neg = get_negative_expectation(res * neg_mask, measure, average=False) 115 | E_neg = torch.sum(E_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12) # (bsz, ) 116 | 117 | E = E_neg - E_pos # (bsz, ) 118 | return torch.mean(E) 119 | 120 | 121 | def batch_video_video_loss(video, st_ed_indices, match_labels, mask, measure='JSD'): 122 | """ 123 | VV-CL module 124 | Computing the Contrastive loss between the start/end clips and the video 125 | :param video: video rep (bsz, Lv, dim) 126 | :param st_ed_indices: (bsz, 2) 127 | :param match_labels: match labels (bsz, Lv) 128 | :param mask: mask (bsz, Lv) 129 | :param measure: estimator of the mutual information 130 | :return: L_{vv} 131 | """ 132 | # generate mask 133 | pos_mask = match_labels.type(torch.float32) # (bsz, Lv) 134 | neg_mask = (torch.ones_like(pos_mask) - pos_mask) * mask # (bsz, Lv) 135 | 136 | # select start and end indices features 137 | st_indices, ed_indices = st_ed_indices[:, 0], st_ed_indices[:, 1] # (bsz, ) 138 | batch_indices = torch.arange(0, video.shape[0]).long() # (bsz, ) 139 | video_s = video[batch_indices, st_indices, :] # (bsz, dim) 140 | video_e = video[batch_indices, ed_indices, :] # (bsz, dim) 141 | 142 | # compute scores 143 | video_s = video_s.unsqueeze(2) # (bsz, dim, 1) 144 | res_s = torch.matmul(video, video_s).squeeze(2) # (bsz, Lv), fusion between the start clips and the video 145 | video_e = video_e.unsqueeze(2) # (bsz, dim, 1) 146 | res_e = torch.matmul(video, video_e).squeeze(2) # (bsz, Lv), fusion between the end clips and the video 147 | 148 | # start clips: MI expectation for all positive samples 149 | E_s_pos = get_positive_expectation(res_s * pos_mask, measure, average=False) 150 | E_s_pos = torch.sum(E_s_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12) # (bsz, ) 151 | # end clips: MI expectation for all positive samples 152 | E_e_pos = get_positive_expectation(res_e * pos_mask, measure, average=False) 153 | E_e_pos = torch.sum(E_e_pos * pos_mask, dim=1) / (torch.sum(pos_mask, dim=1) + 1e-12) 154 | E_pos = E_s_pos + E_e_pos 155 | 156 | # start clips: MI expectation for all negative samples 157 | E_s_neg = get_negative_expectation(res_s * neg_mask, measure, average=False) 158 | E_s_neg = torch.sum(E_s_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12) 159 | 160 | # end clips: MI expectation for all negative samples 161 | E_e_neg = get_negative_expectation(res_e * neg_mask, measure, average=False) 162 | E_e_neg = torch.sum(E_e_neg * neg_mask, dim=1) / (torch.sum(neg_mask, dim=1) + 1e-12) 163 | E_neg = E_s_neg + E_e_neg 164 | 165 | E = E_neg - E_pos # (bsz, ) 166 | return torch.mean(E) 167 | -------------------------------------------------------------------------------- /method_tvr/model.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from easydict import EasyDict as edict 6 | from method_tvr.model_components import BertAttention, LinearLayer, BertSelfAttention, TrainablePositionalEncoding 7 | from method_tvr.model_components import MILNCELoss 8 | from method_tvr.contrastive import batch_video_query_loss 9 | 10 | 11 | class 
ReLoCLNet(nn.Module): 12 | def __init__(self, config): 13 | super(ReLoCLNet, self).__init__() 14 | self.config = config 15 | 16 | self.query_pos_embed = TrainablePositionalEncoding(max_position_embeddings=config.max_desc_l, 17 | hidden_size=config.hidden_size, dropout=config.input_drop) 18 | self.ctx_pos_embed = TrainablePositionalEncoding(max_position_embeddings=config.max_ctx_l, 19 | hidden_size=config.hidden_size, dropout=config.input_drop) 20 | 21 | self.query_input_proj = LinearLayer(config.query_input_size, config.hidden_size, layer_norm=True, 22 | dropout=config.input_drop, relu=True) 23 | 24 | self.query_encoder = BertAttention(edict(hidden_size=config.hidden_size, intermediate_size=config.hidden_size, 25 | hidden_dropout_prob=config.drop, num_attention_heads=config.n_heads, 26 | attention_probs_dropout_prob=config.drop)) 27 | self.query_encoder1 = copy.deepcopy(self.query_encoder) 28 | 29 | cross_att_cfg = edict(hidden_size=config.hidden_size, num_attention_heads=config.n_heads, 30 | attention_probs_dropout_prob=config.drop) 31 | # use_video 32 | self.video_input_proj = LinearLayer(config.visual_input_size, config.hidden_size, layer_norm=True, 33 | dropout=config.input_drop, relu=True) 34 | self.video_encoder1 = copy.deepcopy(self.query_encoder) 35 | self.video_encoder2 = copy.deepcopy(self.query_encoder) 36 | self.video_encoder3 = copy.deepcopy(self.query_encoder) 37 | self.video_cross_att = BertSelfAttention(cross_att_cfg) 38 | self.video_cross_layernorm = nn.LayerNorm(config.hidden_size) 39 | self.video_query_linear = nn.Linear(config.hidden_size, config.hidden_size) 40 | 41 | # use_sub 42 | self.sub_input_proj = LinearLayer(config.sub_input_size, config.hidden_size, layer_norm=True, 43 | dropout=config.input_drop, relu=True) 44 | self.sub_encoder1 = copy.deepcopy(self.query_encoder) 45 | self.sub_encoder2 = copy.deepcopy(self.query_encoder) 46 | self.sub_encoder3 = copy.deepcopy(self.query_encoder) 47 | self.sub_cross_att = BertSelfAttention(cross_att_cfg) 48 | self.sub_cross_layernorm = nn.LayerNorm(config.hidden_size) 49 | self.sub_query_linear = nn.Linear(config.hidden_size, config.hidden_size) 50 | 51 | self.modular_vector_mapping = nn.Linear(in_features=config.hidden_size, out_features=2, bias=False) 52 | 53 | conv_cfg = dict(in_channels=1, out_channels=1, kernel_size=config.conv_kernel_size, 54 | stride=config.conv_stride, padding=config.conv_kernel_size // 2, bias=False) 55 | self.merged_st_predictor = nn.Conv1d(**conv_cfg) 56 | self.merged_ed_predictor = nn.Conv1d(**conv_cfg) 57 | 58 | self.temporal_criterion = nn.CrossEntropyLoss(reduction="mean") 59 | self.nce_criterion = MILNCELoss(reduction='mean') 60 | 61 | self.reset_parameters() 62 | 63 | def reset_parameters(self): 64 | """ Initialize the weights.""" 65 | def re_init(module): 66 | if isinstance(module, (nn.Linear, nn.Embedding)): 67 | # Slightly different from the TF version which uses truncated_normal for initialization 68 | # cf https://github.com/pytorch/pytorch/pull/5617 69 | module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) 70 | elif isinstance(module, nn.LayerNorm): 71 | module.bias.data.zero_() 72 | module.weight.data.fill_(1.0) 73 | elif isinstance(module, nn.Conv1d): 74 | module.reset_parameters() 75 | if isinstance(module, nn.Linear) and module.bias is not None: 76 | module.bias.data.zero_() 77 | 78 | self.apply(re_init) 79 | 80 | def set_hard_negative(self, use_hard_negative, hard_pool_size): 81 | """use_hard_negative: bool; hard_pool_size: int, """ 82 | 
self.config.use_hard_negative = use_hard_negative 83 | self.config.hard_pool_size = hard_pool_size 84 | 85 | def set_train_st_ed(self, lw_st_ed): 86 | """pre-train video retrieval then span prediction""" 87 | self.config.lw_st_ed = lw_st_ed 88 | 89 | def forward(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, st_ed_indices, match_labels): 90 | """ 91 | Args: 92 | query_feat: (N, Lq, Dq) 93 | query_mask: (N, Lq) 94 | video_feat: (N, Lv, Dv) or None 95 | video_mask: (N, Lv) or None 96 | sub_feat: (N, Lv, Ds) or None 97 | sub_mask: (N, Lv) or None 98 | st_ed_indices: (N, 2), torch.LongTensor, 1st, 2nd columns are st, ed labels respectively. 99 | match_labels: (N, Lv), torch.LongTensor, matching labels for detecting foreground and background (not used) 100 | """ 101 | video_feat, sub_feat, mid_x_video_feat, mid_x_sub_feat, x_video_feat, x_sub_feat = self.encode_context( 102 | video_feat, video_mask, sub_feat, sub_mask, return_mid_output=True) 103 | video_query, sub_query, query_context_scores, st_prob, ed_prob = self.get_pred_from_raw_query( 104 | query_feat, query_mask, x_video_feat, video_mask, x_sub_feat, sub_mask, cross=False, 105 | return_query_feats=True) 106 | # frame level contrastive learning loss (FrameCL) 107 | loss_fcl = 0 108 | if self.config.lw_fcl != 0: 109 | loss_fcl_vq = batch_video_query_loss(mid_x_video_feat, video_query, match_labels, video_mask, measure='JSD') 110 | loss_fcl_sq = batch_video_query_loss(mid_x_sub_feat, sub_query, match_labels, sub_mask, measure='JSD') 111 | loss_fcl = (loss_fcl_vq + loss_fcl_sq) / 2.0 112 | loss_fcl = self.config.lw_fcl * loss_fcl 113 | # video level contrastive learning loss (VideoCL) 114 | loss_vcl = 0 115 | if self.config.lw_vcl != 0: 116 | mid_video_q2ctx_scores = self.get_unnormalized_video_level_scores(video_query, mid_x_video_feat, video_mask) 117 | mid_sub_q2ctx_scores = self.get_unnormalized_video_level_scores(sub_query, mid_x_sub_feat, sub_mask) 118 | mid_video_q2ctx_scores, _ = torch.max(mid_video_q2ctx_scores, dim=1) 119 | mid_sub_q2ctx_scores, _ = torch.max(mid_sub_q2ctx_scores, dim=1) 120 | mid_q2ctx_scores = (mid_video_q2ctx_scores + mid_sub_q2ctx_scores) / 2.0 121 | loss_vcl = self.nce_criterion(mid_q2ctx_scores) 122 | loss_vcl = self.config.lw_vcl * loss_vcl 123 | # moment localization loss 124 | loss_st_ed = 0 125 | if self.config.lw_st_ed != 0: 126 | loss_st = self.temporal_criterion(st_prob, st_ed_indices[:, 0]) 127 | loss_ed = self.temporal_criterion(ed_prob, st_ed_indices[:, 1]) 128 | loss_st_ed = loss_st + loss_ed 129 | loss_st_ed = self.config.lw_st_ed * loss_st_ed 130 | # video level retrieval loss 131 | loss_neg_ctx, loss_neg_q = 0, 0 132 | if self.config.lw_neg_ctx != 0 or self.config.lw_neg_q != 0: 133 | loss_neg_ctx, loss_neg_q = self.get_video_level_loss(query_context_scores) 134 | loss_neg_ctx = self.config.lw_neg_ctx * loss_neg_ctx 135 | loss_neg_q = self.config.lw_neg_q * loss_neg_q 136 | # sum loss 137 | loss = loss_fcl + loss_vcl + loss_st_ed + loss_neg_ctx + loss_neg_q 138 | return loss, {"loss_st_ed": float(loss_st_ed), "loss_fcl": float(loss_fcl), "loss_vcl": loss_vcl, 139 | "loss_neg_ctx": float(loss_neg_ctx), "loss_neg_q": float(loss_neg_q), "loss_overall": float(loss)} 140 | 141 | def encode_query(self, query_feat, query_mask): 142 | encoded_query = self.encode_input(query_feat, query_mask, self.query_input_proj, self.query_encoder, 143 | self.query_pos_embed) # (N, Lq, D) 144 | encoded_query = self.query_encoder1(encoded_query, query_mask.unsqueeze(1)) 145 | video_query, 
sub_query = self.get_modularized_queries(encoded_query, query_mask) # (N, D) * 2 146 | return video_query, sub_query 147 | 148 | def encode_context(self, video_feat, video_mask, sub_feat, sub_mask, return_mid_output=False): 149 | # encoding video and subtitle features, respectively 150 | encoded_video_feat = self.encode_input(video_feat, video_mask, self.video_input_proj, self.video_encoder1, 151 | self.ctx_pos_embed) 152 | encoded_sub_feat = self.encode_input(sub_feat, sub_mask, self.sub_input_proj, self.sub_encoder1, 153 | self.ctx_pos_embed) 154 | # cross encoding subtitle features 155 | x_encoded_video_feat = self.cross_context_encoder(encoded_video_feat, video_mask, encoded_sub_feat, sub_mask, 156 | self.video_cross_att, self.video_cross_layernorm) # (N, L, D) 157 | x_encoded_video_feat_ = self.video_encoder2(x_encoded_video_feat, video_mask.unsqueeze(1)) 158 | # cross encoding video features 159 | x_encoded_sub_feat = self.cross_context_encoder(encoded_sub_feat, sub_mask, encoded_video_feat, video_mask, 160 | self.sub_cross_att, self.sub_cross_layernorm) # (N, L, D) 161 | x_encoded_sub_feat_ = self.sub_encoder2(x_encoded_sub_feat, sub_mask.unsqueeze(1)) 162 | # additional self encoding process 163 | x_encoded_video_feat = self.video_encoder3(x_encoded_video_feat_, video_mask.unsqueeze(1)) 164 | x_encoded_sub_feat = self.sub_encoder3(x_encoded_sub_feat_, sub_mask.unsqueeze(1)) 165 | if return_mid_output: 166 | return (encoded_video_feat, encoded_sub_feat, x_encoded_video_feat_, x_encoded_sub_feat_, 167 | x_encoded_video_feat, x_encoded_sub_feat) 168 | else: 169 | return x_encoded_video_feat, x_encoded_sub_feat 170 | 171 | @staticmethod 172 | def cross_context_encoder(main_context_feat, main_context_mask, side_context_feat, side_context_mask, 173 | cross_att_layer, norm_layer): 174 | """ 175 | Args: 176 | main_context_feat: (N, Lq, D) 177 | main_context_mask: (N, Lq) 178 | side_context_feat: (N, Lk, D) 179 | side_context_mask: (N, Lk) 180 | cross_att_layer: cross attention layer 181 | norm_layer: layer norm layer 182 | """ 183 | cross_mask = torch.einsum("bm,bn->bmn", main_context_mask, side_context_mask) # (N, Lq, Lk) 184 | cross_out = cross_att_layer(main_context_feat, side_context_feat, side_context_feat, cross_mask) # (N, Lq, D) 185 | residual_out = norm_layer(cross_out + main_context_feat) 186 | return residual_out 187 | 188 | @staticmethod 189 | def encode_input(feat, mask, input_proj_layer, encoder_layer, pos_embed_layer): 190 | """ 191 | Args: 192 | feat: (N, L, D_input), torch.float32 193 | mask: (N, L), torch.float32, with 1 indicates valid query, 0 indicates mask 194 | input_proj_layer: down project input 195 | encoder_layer: encoder layer 196 | pos_embed_layer: positional embedding layer 197 | """ 198 | feat = input_proj_layer(feat) 199 | feat = pos_embed_layer(feat) 200 | mask = mask.unsqueeze(1) # (N, 1, L), torch.FloatTensor 201 | return encoder_layer(feat, mask) # (N, L, D_hidden) 202 | 203 | def get_modularized_queries(self, encoded_query, query_mask, return_modular_att=False): 204 | """ 205 | Args: 206 | encoded_query: (N, L, D) 207 | query_mask: (N, L) 208 | return_modular_att: bool 209 | """ 210 | modular_attention_scores = self.modular_vector_mapping(encoded_query) # (N, L, 2 or 1) 211 | modular_attention_scores = F.softmax(mask_logits(modular_attention_scores, query_mask.unsqueeze(2)), dim=1) 212 | modular_queries = torch.einsum("blm,bld->bmd", modular_attention_scores, encoded_query) # (N, 2 or 1, D) 213 | if return_modular_att: 214 | assert 
modular_queries.shape[1] == 2 215 | return modular_queries[:, 0], modular_queries[:, 1], modular_attention_scores 216 | else: 217 | assert modular_queries.shape[1] == 2 218 | return modular_queries[:, 0], modular_queries[:, 1] # (N, D) * 2 219 | 220 | @staticmethod 221 | def get_video_level_scores(modularied_query, context_feat, context_mask): 222 | """ Calculate video2query scores for each pair of video and query inside the batch. 223 | Args: 224 | modularied_query: (N, D) 225 | context_feat: (N, L, D), output of the first transformer encoder layer 226 | context_mask: (N, L) 227 | Returns: 228 | context_query_scores: (N, N) score of each query w.r.t. each video inside the batch, 229 | diagonal positions are positive. used to get negative samples. 230 | """ 231 | modularied_query = F.normalize(modularied_query, dim=-1) 232 | context_feat = F.normalize(context_feat, dim=-1) 233 | query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat) # (N, L, N) 234 | context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N) 235 | query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N) 236 | query_context_scores, _ = torch.max(query_context_scores, dim=1) # (N, N) diagonal positions are positive pairs 237 | return query_context_scores 238 | 239 | @staticmethod 240 | def get_unnormalized_video_level_scores(modularied_query, context_feat, context_mask): 241 | """ Calculate video2query scores for each pair of video and query inside the batch. 242 | Args: 243 | modularied_query: (N, D) 244 | context_feat: (N, L, D), output of the first transformer encoder layer 245 | context_mask: (N, L) 246 | Returns: 247 | context_query_scores: (N, N) score of each query w.r.t. each video inside the batch, 248 | diagonal positions are positive. used to get negative samples. 249 | """ 250 | query_context_scores = torch.einsum("md,nld->mln", modularied_query, context_feat) # (N, L, N) 251 | context_mask = context_mask.transpose(0, 1).unsqueeze(0) # (1, L, N) 252 | query_context_scores = mask_logits(query_context_scores, context_mask) # (N, L, N) 253 | return query_context_scores 254 | 255 | def get_merged_score(self, video_query, video_feat, sub_query, sub_feat, cross=False): 256 | video_query = self.video_query_linear(video_query) 257 | sub_query = self.sub_query_linear(sub_query) 258 | if cross: 259 | video_similarity = torch.einsum("md,nld->mnl", video_query, video_feat) 260 | sub_similarity = torch.einsum("md,nld->mnl", sub_query, sub_feat) 261 | similarity = (video_similarity + sub_similarity) / 2 # (Nq, Nv, L) from query to all videos. 
262 | else: 263 | video_similarity = torch.einsum("bd,bld->bl", video_query, video_feat) # (N, L) 264 | sub_similarity = torch.einsum("bd,bld->bl", sub_query, sub_feat) # (N, L) 265 | similarity = (video_similarity + sub_similarity) / 2 266 | return similarity 267 | 268 | def get_merged_st_ed_prob(self, similarity, context_mask, cross=False): 269 | if cross: 270 | n_q, n_c, length = similarity.shape 271 | similarity = similarity.view(n_q * n_c, 1, length) 272 | st_prob = self.merged_st_predictor(similarity).view(n_q, n_c, length) # (Nq, Nv, L) 273 | ed_prob = self.merged_ed_predictor(similarity).view(n_q, n_c, length) # (Nq, Nv, L) 274 | else: 275 | st_prob = self.merged_st_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) 276 | ed_prob = self.merged_ed_predictor(similarity.unsqueeze(1)).squeeze() # (N, L) 277 | st_prob = mask_logits(st_prob, context_mask) # (N, L) 278 | ed_prob = mask_logits(ed_prob, context_mask) 279 | return st_prob, ed_prob 280 | 281 | def get_pred_from_raw_query(self, query_feat, query_mask, video_feat, video_mask, sub_feat, sub_mask, cross=False, 282 | return_query_feats=False): 283 | """ 284 | Args: 285 | query_feat: (N, Lq, Dq) 286 | query_mask: (N, Lq) 287 | video_feat: (N, Lv, D) or None 288 | video_mask: (N, Lv) 289 | sub_feat: (N, Lv, D) or None 290 | sub_mask: (N, Lv) 291 | cross: 292 | return_query_feats: 293 | """ 294 | video_query, sub_query = self.encode_query(query_feat, query_mask) 295 | # get video-level retrieval scores 296 | video_q2ctx_scores = self.get_video_level_scores(video_query, video_feat, video_mask) 297 | sub_q2ctx_scores = self.get_video_level_scores(sub_query, sub_feat, sub_mask) 298 | q2ctx_scores = (video_q2ctx_scores + sub_q2ctx_scores) / 2 # (N, N) 299 | # compute start and end probs 300 | similarity = self.get_merged_score(video_query, video_feat, sub_query, sub_feat, cross=cross) 301 | st_prob, ed_prob = self.get_merged_st_ed_prob(similarity, video_mask, cross=cross) 302 | if return_query_feats: 303 | return video_query, sub_query, q2ctx_scores, st_prob, ed_prob 304 | else: 305 | return q2ctx_scores, st_prob, ed_prob # un-normalized masked probabilities!!!!! 306 | 307 | def get_video_level_loss(self, query_context_scores): 308 | """ ranking loss between (pos. query + pos. video) and (pos. query + neg. video) or (neg. query + pos. video) 309 | Args: 310 | query_context_scores: (N, N), cosine similarity [-1, 1], 311 | Each row contains the scores between the query to each of the videos inside the batch. 
312 | """ 313 | bsz = len(query_context_scores) 314 | diagonal_indices = torch.arange(bsz).to(query_context_scores.device) 315 | pos_scores = query_context_scores[diagonal_indices, diagonal_indices] # (N, ) 316 | query_context_scores_masked = copy.deepcopy(query_context_scores.data) 317 | # impossibly large for cosine similarity, the copy is created as modifying the original will cause error 318 | query_context_scores_masked[diagonal_indices, diagonal_indices] = 999 319 | pos_query_neg_context_scores = self.get_neg_scores(query_context_scores, query_context_scores_masked) 320 | neg_query_pos_context_scores = self.get_neg_scores(query_context_scores.transpose(0, 1), 321 | query_context_scores_masked.transpose(0, 1)) 322 | loss_neg_ctx = self.get_ranking_loss(pos_scores, pos_query_neg_context_scores) 323 | loss_neg_q = self.get_ranking_loss(pos_scores, neg_query_pos_context_scores) 324 | return loss_neg_ctx, loss_neg_q 325 | 326 | def get_neg_scores(self, scores, scores_masked): 327 | """ 328 | scores: (N, N), cosine similarity [-1, 1], 329 | Each row are scores: query --> all videos. Transposed version: video --> all queries. 330 | scores_masked: (N, N) the same as scores, except that the diagonal (positive) positions 331 | are masked with a large value. 332 | """ 333 | bsz = len(scores) 334 | batch_indices = torch.arange(bsz).to(scores.device) 335 | _, sorted_scores_indices = torch.sort(scores_masked, descending=True, dim=1) 336 | sample_min_idx = 1 # skip the masked positive 337 | sample_max_idx = min(sample_min_idx + self.config.hard_pool_size, bsz) if self.config.use_hard_negative else bsz 338 | # (N, ) 339 | sampled_neg_score_indices = sorted_scores_indices[batch_indices, torch.randint(sample_min_idx, sample_max_idx, 340 | size=(bsz,)).to(scores.device)] 341 | sampled_neg_scores = scores[batch_indices, sampled_neg_score_indices] # (N, ) 342 | return sampled_neg_scores 343 | 344 | def get_ranking_loss(self, pos_score, neg_score): 345 | """ Note here we encourage positive scores to be larger than negative scores. 346 | Args: 347 | pos_score: (N, ), torch.float32 348 | neg_score: (N, ), torch.float32 349 | """ 350 | if self.config.ranking_loss_type == "hinge": # max(0, m + S_neg - S_pos) 351 | return torch.clamp(self.config.margin + neg_score - pos_score, min=0).sum() / len(pos_score) 352 | elif self.config.ranking_loss_type == "lse": # log[1 + exp(S_neg - S_pos)] 353 | return torch.log1p(torch.exp(neg_score - pos_score)).sum() / len(pos_score) 354 | else: 355 | raise NotImplementedError("Only support 'hinge' and 'lse'") 356 | 357 | 358 | def mask_logits(target, mask): 359 | return target * mask + (1 - mask) * (-1e10) 360 | -------------------------------------------------------------------------------- /method_tvr/model_components.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | def onehot(indexes, N=None): 8 | """ 9 | Creates a one-representation of indexes with N possible entries 10 | if N is not specified, it will suit the maximum index appearing. 
11 | indexes is a long-tensor of indexes 12 | """ 13 | if N is None: 14 | N = indexes.max() + 1 15 | sz = list(indexes.size()) 16 | output = indexes.new().long().resize_(*sz, N).zero_() 17 | output.scatter_(-1, indexes.unsqueeze(-1), 1) 18 | return output 19 | 20 | 21 | class SmoothedCrossEntropyLoss(nn.Module): 22 | def __init__(self, reduction='mean'): 23 | super(SmoothedCrossEntropyLoss, self).__init__() 24 | self.reduction = reduction 25 | 26 | def forward(self, logits, labels, smooth_eps=0.1, mask=None, from_logits=True): 27 | """ 28 | Args: 29 | logits: (N, Lv), unnormalized probabilities, torch.float32 30 | labels: (N, Lv) or (N, ), one hot labels or indices labels, torch.float32 or torch.int64 31 | smooth_eps: float 32 | mask: (N, Lv) 33 | from_logits: bool 34 | """ 35 | if from_logits: 36 | probs = F.log_softmax(logits, dim=-1) 37 | else: 38 | probs = logits 39 | num_classes = probs.size()[-1] 40 | if len(probs.size()) > len(labels.size()): 41 | labels = onehot(labels, num_classes).type(probs.dtype) 42 | if mask is None: 43 | labels = labels * (1 - smooth_eps) + smooth_eps / num_classes 44 | else: 45 | mask = mask.type(probs.dtype) 46 | valid_samples = torch.sum(mask, dim=-1, keepdim=True, dtype=probs.dtype) # (N, 1) 47 | eps_per_sample = smooth_eps / valid_samples 48 | labels = (labels * (1 - smooth_eps) + eps_per_sample) * mask 49 | loss = -torch.sum(labels * probs, dim=-1) 50 | if self.reduction == 'sum': 51 | return torch.sum(loss) 52 | elif self.reduction == 'mean': 53 | return torch.mean(loss) 54 | else: 55 | return loss # (N, ) 56 | 57 | 58 | class MILNCELoss(nn.Module): 59 | def __init__(self, reduction='mean'): 60 | super(MILNCELoss, self).__init__() 61 | self.reduction = reduction 62 | 63 | def forward(self, q2ctx_scores=None, contexts=None, queries=None): 64 | if q2ctx_scores is None: 65 | assert contexts is not None and queries is not None 66 | x = torch.matmul(contexts, queries.t()) 67 | device = contexts.device 68 | bsz = contexts.shape[0] 69 | else: 70 | x = q2ctx_scores 71 | device = q2ctx_scores.device 72 | bsz = q2ctx_scores.shape[0] 73 | x = x.view(bsz, bsz, -1) 74 | nominator = x * torch.eye(x.shape[0], dtype=torch.float32, device=device)[:, :, None] 75 | nominator = nominator.sum(dim=1) 76 | nominator = torch.logsumexp(nominator, dim=1) 77 | denominator = torch.cat((x, x.permute(1, 0, 2)), dim=1).view(x.shape[0], -1) 78 | denominator = torch.logsumexp(denominator, dim=1) 79 | if self.reduction: 80 | return torch.mean(denominator - nominator) 81 | else: 82 | return denominator - nominator 83 | 84 | 85 | class DepthwiseSeparableConv(nn.Module): 86 | """ 87 | Depth-wise separable convolution uses less parameters to generate output by convolution. 88 | :Examples: 89 | >>> m = DepthwiseSeparableConv(300, 200, 5, dim=1) 90 | >>> input_tensor = torch.randn(32, 300, 20) 91 | >>> output = m(input_tensor) 92 | """ 93 | def __init__(self, in_ch, out_ch, k, dim=1, relu=True): 94 | """ 95 | :param in_ch: input hidden dimension size 96 | :param out_ch: output hidden dimension size 97 | :param k: kernel size 98 | :param dim: default 1. 
1D conv or 2D conv 99 | """ 100 | super(DepthwiseSeparableConv, self).__init__() 101 | self.relu = relu 102 | if dim == 1: 103 | self.depthwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=in_ch, kernel_size=k, groups=in_ch, 104 | padding=k // 2) 105 | self.pointwise_conv = nn.Conv1d(in_channels=in_ch, out_channels=out_ch, kernel_size=1, padding=0) 106 | elif dim == 2: 107 | self.depthwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=in_ch, kernel_size=k, groups=in_ch, 108 | padding=k // 2) 109 | self.pointwise_conv = nn.Conv2d(in_channels=in_ch, out_channels=out_ch, kernel_size=1, padding=0) 110 | else: 111 | raise Exception("Incorrect dimension!") 112 | 113 | def forward(self, x): 114 | """ 115 | :Input: (N, L_in, D) 116 | :Output: (N, L_out, D) 117 | """ 118 | x = x.transpose(1, 2) 119 | if self.relu: 120 | out = F.relu(self.pointwise_conv(self.depthwise_conv(x)), inplace=True) 121 | else: 122 | out = self.pointwise_conv(self.depthwise_conv(x)) 123 | return out.transpose(1, 2) # (N, L, D) 124 | 125 | 126 | class ConvEncoder(nn.Module): 127 | def __init__(self, kernel_size=7, n_filters=128, dropout=0.1): 128 | super(ConvEncoder, self).__init__() 129 | self.dropout = nn.Dropout(dropout) 130 | self.layer_norm = nn.LayerNorm(n_filters) 131 | self.conv = DepthwiseSeparableConv(in_ch=n_filters, out_ch=n_filters, k=kernel_size, relu=True) 132 | 133 | def forward(self, x): 134 | """ 135 | :param x: (N, L, D) 136 | :return: (N, L, D) 137 | """ 138 | return self.layer_norm(self.dropout(self.conv(x)) + x) # (N, L, D) 139 | 140 | 141 | class TrainablePositionalEncoding(nn.Module): 142 | """Construct the embeddings from word, position and token_type embeddings.""" 143 | def __init__(self, max_position_embeddings, hidden_size, dropout=0.1): 144 | super(TrainablePositionalEncoding, self).__init__() 145 | self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) 146 | self.LayerNorm = nn.LayerNorm(hidden_size) 147 | self.dropout = nn.Dropout(dropout) 148 | 149 | def forward(self, input_feat): 150 | bsz, seq_length = input_feat.shape[:2] 151 | position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device) 152 | position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) 153 | position_embeddings = self.position_embeddings(position_ids) 154 | embeddings = self.LayerNorm(input_feat + position_embeddings) 155 | embeddings = self.dropout(embeddings) 156 | return embeddings 157 | 158 | def add_position_emb(self, input_feat): 159 | bsz, seq_length = input_feat.shape[:2] 160 | position_ids = torch.arange(seq_length, dtype=torch.long, device=input_feat.device) 161 | position_ids = position_ids.unsqueeze(0).repeat(bsz, 1) # (N, L) 162 | position_embeddings = self.position_embeddings(position_ids) 163 | return input_feat + position_embeddings 164 | 165 | 166 | class LinearLayer(nn.Module): 167 | """linear layer configurable with layer normalization, dropout, ReLU.""" 168 | def __init__(self, in_hsz, out_hsz, layer_norm=True, dropout=0.1, relu=True): 169 | super(LinearLayer, self).__init__() 170 | self.relu = relu 171 | self.layer_norm = layer_norm 172 | if layer_norm: 173 | self.LayerNorm = nn.LayerNorm(in_hsz) 174 | layers = [nn.Dropout(dropout), nn.Linear(in_hsz, out_hsz)] 175 | self.net = nn.Sequential(*layers) 176 | 177 | def forward(self, x): 178 | """(N, L, D)""" 179 | if self.layer_norm: 180 | x = self.LayerNorm(x) 181 | x = self.net(x) 182 | if self.relu: 183 | x = F.relu(x, inplace=True) 184 | return x # (N, L, D) 185 | 186 | 187 | class 
BertLayer(nn.Module): 188 | def __init__(self, config, use_self_attention=True): 189 | super(BertLayer, self).__init__() 190 | self.use_self_attention = use_self_attention 191 | if use_self_attention: 192 | self.attention = BertAttention(config) 193 | self.intermediate = BertIntermediate(config) 194 | self.output = BertOutput(config) 195 | 196 | def forward(self, hidden_states, attention_mask): 197 | """ 198 | Args: 199 | hidden_states: (N, L, D) 200 | attention_mask: (N, L) with 1 indicate valid, 0 indicates invalid 201 | """ 202 | if self.use_self_attention: 203 | attention_output = self.attention(hidden_states, attention_mask) 204 | else: 205 | attention_output = hidden_states 206 | intermediate_output = self.intermediate(attention_output) 207 | layer_output = self.output(intermediate_output, attention_output) 208 | return layer_output 209 | 210 | 211 | class BertAttention(nn.Module): 212 | def __init__(self, config): 213 | super(BertAttention, self).__init__() 214 | self.self = BertSelfAttention(config) 215 | self.output = BertSelfOutput(config) 216 | 217 | def forward(self, input_tensor, attention_mask): 218 | """ 219 | Args: 220 | input_tensor: (N, L, D) 221 | attention_mask: (N, L) 222 | """ 223 | self_output = self.self(input_tensor, input_tensor, input_tensor, attention_mask) 224 | attention_output = self.output(self_output, input_tensor) 225 | return attention_output 226 | 227 | 228 | class BertIntermediate(nn.Module): 229 | def __init__(self, config): 230 | super(BertIntermediate, self).__init__() 231 | self.dense = nn.Sequential(nn.Linear(config.hidden_size, config.intermediate_size), nn.ReLU(True)) 232 | 233 | def forward(self, hidden_states): 234 | return self.dense(hidden_states) 235 | 236 | 237 | class BertOutput(nn.Module): 238 | def __init__(self, config): 239 | super(BertOutput, self).__init__() 240 | self.dense = nn.Linear(config.intermediate_size, config.hidden_size) 241 | self.LayerNorm = nn.LayerNorm(config.hidden_size) 242 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 243 | 244 | def forward(self, hidden_states, input_tensor): 245 | hidden_states = self.dense(hidden_states) 246 | hidden_states = self.dropout(hidden_states) 247 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 248 | return hidden_states 249 | 250 | 251 | class BertSelfAttention(nn.Module): 252 | def __init__(self, config): 253 | super(BertSelfAttention, self).__init__() 254 | if config.hidden_size % config.num_attention_heads != 0: 255 | raise ValueError("The hidden size (%d) is not a multiple of the number of attention heads (%d)" % ( 256 | config.hidden_size, config.num_attention_heads)) 257 | self.num_attention_heads = config.num_attention_heads 258 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 259 | self.all_head_size = self.num_attention_heads * self.attention_head_size 260 | self.query = nn.Linear(config.hidden_size, self.all_head_size) 261 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 262 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 263 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 264 | 265 | def transpose_for_scores(self, x): 266 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) # (N, L, nh, dh) 267 | x = x.view(*new_x_shape) 268 | return x.permute(0, 2, 1, 3) # (N, nh, L, dh) 269 | 270 | def forward(self, query_states, key_states, value_states, attention_mask): 271 | """ 272 | Args: 273 | query_states: (N, Lq, D) 274 | key_states: (N, L, D) 
275 | value_states: (N, L, D) 276 | attention_mask: (N, Lq, L) 277 | """ 278 | # only need to mask the dimension where the softmax (last dim) is applied, as another dim (second last) 279 | # will be ignored in future computation anyway 280 | attention_mask = (1 - attention_mask.unsqueeze(1)) * -10000. # (N, 1, Lq, L) 281 | mixed_query_layer = self.query(query_states) 282 | mixed_key_layer = self.key(key_states) 283 | mixed_value_layer = self.value(value_states) 284 | # transpose 285 | query_layer = self.transpose_for_scores(mixed_query_layer) # (N, nh, Lq, dh) 286 | key_layer = self.transpose_for_scores(mixed_key_layer) # (N, nh, L, dh) 287 | value_layer = self.transpose_for_scores(mixed_value_layer) # (N, nh, L, dh) 288 | # Take the dot product between "query" and "key" to get the raw attention scores. 289 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # (N, nh, Lq, L) 290 | attention_scores = attention_scores / math.sqrt(self.attention_head_size) 291 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function) 292 | attention_scores = attention_scores + attention_mask 293 | # Normalize the attention scores to probabilities. 294 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 295 | # This is actually dropping out entire tokens to attend to, which might 296 | # seem a bit unusual, but is taken from the original Transformer paper. 297 | attention_probs = self.dropout(attention_probs) 298 | # compute output context 299 | context_layer = torch.matmul(attention_probs, value_layer) 300 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() 301 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 302 | context_layer = context_layer.view(*new_context_layer_shape) 303 | return context_layer 304 | 305 | 306 | class BertSelfOutput(nn.Module): 307 | def __init__(self, config): 308 | super(BertSelfOutput, self).__init__() 309 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 310 | self.LayerNorm = nn.LayerNorm(config.hidden_size) 311 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 312 | 313 | def forward(self, hidden_states, input_tensor): 314 | hidden_states = self.dense(hidden_states) 315 | hidden_states = self.dropout(hidden_states) 316 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 317 | return hidden_states 318 | -------------------------------------------------------------------------------- /method_tvr/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """PyTorch optimization for BERT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.optim.optimizer import required 21 | from torch.nn.utils import clip_grad_norm_ 22 | import logging 23 | import abc 24 | import sys 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | if sys.version_info >= (3, 4): 30 | ABC = abc.ABC 31 | else: 32 | ABC = abc.ABCMeta('ABC', (), {}) 33 | 34 | 35 | class _LRSchedule(ABC): 36 | """ Parent of all LRSchedules here. """ 37 | warn_t_total = False # is set to True for schedules where progressing beyond t_total steps doesn't make sense 38 | 39 | def __init__(self, warmup=0.002, t_total=-1, **kw): 40 | """ 41 | :param warmup: what fraction of t_total steps will be used for linear warmup 42 | :param t_total: how many training steps (updates) are planned 43 | :param kw: 44 | """ 45 | super(_LRSchedule, self).__init__(**kw) 46 | if t_total < 0: 47 | logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) 48 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 49 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 50 | warmup = max(warmup, 0.) 51 | self.warmup, self.t_total = float(warmup), float(t_total) 52 | self.warned_for_t_total_at_progress = -1 53 | 54 | def get_lr(self, step, nowarn=False): 55 | """ 56 | :param step: which of t_total steps we're on 57 | :param nowarn: set to True to suppress warning regarding training beyond specified 't_total' steps 58 | :return: learning rate multiplier for current update 59 | """ 60 | if self.t_total < 0: 61 | return 1. 62 | progress = float(step) / self.t_total 63 | ret = self.get_lr_(progress) 64 | # warning for exceeding t_total (only active with warmup_linear 65 | if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: 66 | logger.warning("Training beyond specified 't_total'. Learning rate multiplier set to {}. Please " 67 | "set 't_total' of {} correctly.".format(ret, self.__class__.__name__)) 68 | self.warned_for_t_total_at_progress = progress 69 | # end warning 70 | return ret 71 | 72 | @abc.abstractmethod 73 | def get_lr_(self, progress): 74 | """ 75 | :param progress: value between 0 and 1 (unless going beyond t_total steps) specifying training progress 76 | :return: learning rate multiplier for current update 77 | """ 78 | return 1. 79 | 80 | 81 | class ConstantLR(_LRSchedule): 82 | def get_lr_(self, progress): 83 | return 1. 84 | 85 | 86 | class WarmupCosineSchedule(_LRSchedule): 87 | """ 88 | Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. 89 | Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. 90 | If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. 91 | """ 92 | warn_t_total = True 93 | 94 | def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): 95 | """ 96 | :param warmup: see LRSchedule 97 | :param t_total: see LRSchedule 98 | :param cycles: number of cycles. Default: 0.5, corresponding to cosine decay from 1. 99 | at progress==warmup and 0 at progress==1. 
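For example, with the default cycles=0.5 and warmup=0.1, the multiplier ramps linearly from 0 to 1 over the first 10% of updates and then decays as 0.5 * (1 + cos(pi * p)), where p is the fraction of progress after warmup (this simply restates get_lr_ below).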
100 | :param kw: 101 | """ 102 | super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) 103 | self.cycles = cycles 104 | 105 | def get_lr_(self, progress): 106 | if progress < self.warmup: 107 | return progress / self.warmup 108 | else: 109 | progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup 110 | return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) 111 | 112 | 113 | class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): 114 | """ 115 | Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. 116 | If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying 117 | learning rate (with hard restarts). 118 | """ 119 | def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): 120 | super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) 121 | assert(cycles >= 1.) 122 | 123 | def get_lr_(self, progress): 124 | if progress < self.warmup: 125 | return progress / self.warmup 126 | else: 127 | progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup 128 | ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) 129 | return ret 130 | 131 | 132 | class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): 133 | """ 134 | All training progress is divided in `cycles` (default=1.) parts of equal length. 135 | Every part follows a schedule with the first `warmup` fraction of training steps linearly increasing from 0. to 1., 136 | followed by a learning rate decreasing from 1. to 0. following a cosine curve. 137 | """ 138 | def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): 139 | assert(warmup * cycles < 1.) 140 | warmup = warmup * cycles if warmup >= 0 else warmup 141 | super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, 142 | **kw) 143 | 144 | def get_lr_(self, progress): 145 | progress = progress * self.cycles % 1. 146 | if progress < self.warmup: 147 | return progress / self.warmup 148 | else: 149 | progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup 150 | ret = 0.5 * (1. + math.cos(math.pi * progress)) 151 | return ret 152 | 153 | 154 | class WarmupConstantSchedule(_LRSchedule): 155 | """ 156 | Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. 157 | Keeps learning rate equal to 1. after warmup. 158 | """ 159 | def get_lr_(self, progress): 160 | if progress < self.warmup: 161 | return progress / self.warmup 162 | return 1. 163 | 164 | 165 | class WarmupLinearSchedule(_LRSchedule): 166 | """ 167 | Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. 168 | Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. 169 | """ 170 | warn_t_total = True 171 | 172 | def get_lr_(self, progress): 173 | if progress < self.warmup: 174 | return progress / self.warmup 175 | return max((progress - 1.) / (self.warmup - 1.), 0.) 176 | 177 | 178 | SCHEDULES = { 179 | None: ConstantLR, 180 | "none": ConstantLR, 181 | "warmup_cosine": WarmupCosineSchedule, 182 | "warmup_constant": WarmupConstantSchedule, 183 | "warmup_linear": WarmupLinearSchedule 184 | } 185 | 186 | 187 | class EMA(object): 188 | """ Exponential Moving Average for model parameters. 
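The shadow values are updated as shadow = decay * shadow + (1 - decay) * param, where the effective decay is min(self.decay, (1 + step) / (10 + step)), so early updates lean more heavily on the raw parameters.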
189 | references: 190 | [1] https://github.com/BangLiu/QANet-PyTorch/blob/master/model/modules/ema.py 191 | [2] https://github.com/hengruo/QANet-pytorch/blob/e2de07cd2c711d525f5ffee35c3764335d4b501d/main.py""" 192 | def __init__(self, decay): 193 | self.decay = decay 194 | self.shadow = {} 195 | self.original = {} 196 | 197 | def register(self, name, val): 198 | self.shadow[name] = val.clone() 199 | 200 | def __call__(self, model, step): 201 | decay = min(self.decay, (1 + step) / (10.0 + step)) 202 | for name, param in model.named_parameters(): 203 | if param.requires_grad: 204 | assert name in self.shadow 205 | new_average = \ 206 | (1.0 - decay) * param.data + decay * self.shadow[name] 207 | self.shadow[name] = new_average.clone() 208 | 209 | def assign(self, model): 210 | for name, param in model.named_parameters(): 211 | if param.requires_grad: 212 | assert name in self.shadow 213 | self.original[name] = param.data.clone() 214 | param.data = self.shadow[name] 215 | 216 | def resume(self, model): 217 | for name, param in model.named_parameters(): 218 | if param.requires_grad: 219 | assert name in self.shadow 220 | param.data = self.original[name] 221 | 222 | 223 | class BertAdam(Optimizer): 224 | """Implements BERT version of Adam algorithm with weight decay fix. 225 | Params: 226 | lr: learning rate 227 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 228 | t_total: total number of training steps for the learning 229 | rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 230 | schedule: schedule to use for the warmup (see above). 231 | Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object 232 | (see below). 233 | If `None` or `'none'`, learning rate is always kept constant. 234 | Default : `'warmup_linear'` 235 | b1: Adams b1. Default: 0.9 236 | b2: Adams b2. Default: 0.999 237 | e: Adams epsilon. Default: 1e-6 238 | weight_decay: Weight decay. Default: 0.01 239 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 240 | """ 241 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 242 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs): 243 | if lr is not required and lr < 0.0: 244 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 245 | if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES: 246 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 247 | if not 0.0 <= b1 < 1.0: 248 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 249 | if not 0.0 <= b2 < 1.0: 250 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 251 | if not e >= 0.0: 252 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 253 | # initialize schedule object 254 | if not isinstance(schedule, _LRSchedule): 255 | schedule_type = SCHEDULES[schedule] 256 | schedule = schedule_type(warmup=warmup, t_total=t_total) 257 | else: 258 | if warmup != -1 or t_total != -1: 259 | logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is " 260 | "provided as schedule. 
Please specify custom warmup and t_total in _LRSchedule object.") 261 | defaults = dict(lr=lr, schedule=schedule, 262 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 263 | max_grad_norm=max_grad_norm) 264 | super(BertAdam, self).__init__(params, defaults) 265 | 266 | def get_lr(self): 267 | lr = [] 268 | for group in self.param_groups: 269 | for p in group['params']: 270 | state = self.state[p] 271 | if len(state) == 0: 272 | return [0] 273 | lr_scheduled = group['lr'] 274 | lr_scheduled *= group['schedule'].get_lr(state['step']) 275 | lr.append(lr_scheduled) 276 | return lr 277 | 278 | def step(self, closure=None): 279 | """Performs a single optimization step. 280 | 281 | Arguments: 282 | closure (callable, optional): A closure that reevaluates the model 283 | and returns the loss. 284 | """ 285 | loss = None 286 | if closure is not None: 287 | loss = closure() 288 | 289 | for group in self.param_groups: 290 | for p in group['params']: 291 | if p.grad is None: 292 | continue 293 | grad = p.grad.data 294 | if grad.is_sparse: 295 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 296 | 297 | state = self.state[p] 298 | 299 | # State initialization 300 | if len(state) == 0: 301 | state['step'] = 0 302 | # Exponential moving average of gradient values 303 | state['next_m'] = torch.zeros_like(p.data) 304 | # Exponential moving average of squared gradient values 305 | state['next_v'] = torch.zeros_like(p.data) 306 | 307 | next_m, next_v = state['next_m'], state['next_v'] 308 | beta1, beta2 = group['b1'], group['b2'] 309 | 310 | # Add grad clipping 311 | if group['max_grad_norm'] > 0: 312 | clip_grad_norm_(p, group['max_grad_norm']) 313 | 314 | # Decay the first and second moment running average coefficient 315 | # In-place operations to update the averages at the same time 316 | next_m.mul_(beta1).add_(1 - beta1, grad) 317 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 318 | update = next_m / (next_v.sqrt() + group['e']) 319 | 320 | # Just adding the square of the weights to the loss function is *not* 321 | # the correct way of using L2 regularization/weight decay with Adam, 322 | # since that will interact with the m and v parameters in strange ways. 323 | # 324 | # Instead we want to decay the weights in a manner that doesn't interact 325 | # with the m/v parameters. This is equivalent to adding the square 326 | # of the weights to the loss with plain (non-momentum) SGD. 
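# In effect, the decoupled update applied below is: update = m_t / (sqrt(v_t) + eps) + weight_decay * w, followed by w <- w - lr_scheduled * update, i.e. the decay term acts directly on the weights rather than being folded into the gradient.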
327 | if group['weight_decay'] > 0.0: 328 | update += group['weight_decay'] * p.data 329 | 330 | lr_scheduled = group['lr'] 331 | lr_scheduled *= group['schedule'].get_lr(state['step']) 332 | 333 | update_with_lr = lr_scheduled * update 334 | p.data.add_(-update_with_lr) 335 | 336 | state['step'] += 1 337 | 338 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 339 | # No bias correction 340 | # bias_correction1 = 1 - beta1 ** state['step'] 341 | # bias_correction2 = 1 - beta2 ** state['step'] 342 | 343 | return loss 344 | -------------------------------------------------------------------------------- /method_tvr/proposal.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2018 Victor Escorcia Castillo 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | # ============================================================================== 23 | """ 24 | Group multiple methods to generate salient temporal windows in a video""" 25 | import itertools 26 | import numpy as np 27 | 28 | PROPOSAL_SCHEMES = ['DidemoICCV17SS', 'SlidingWindowMSRSS'] 29 | 30 | 31 | class TemporalProposalsBase: 32 | """Base class (signature) to generate temporal candidate in video""" 33 | def __call__(self, video_id, metadata=None, feature_collection=None): 34 | raise NotImplementedError('Implement with the signature above') 35 | 36 | 37 | class DidemoICCV17SS(TemporalProposalsBase): 38 | """Original search space of moments proposed in ICCV-2017 39 | 40 | Attributes: 41 | clip_length_min (float) : minimum length, in seconds, of a video clip. 42 | proposals (numpy array) : of shape [21, 2] representing all the 43 | possible temporal segments of valid annotations of DiDeMo dataset. 44 | It represents the search space of a temporal localization 45 | algorithm. 46 | 47 | Reference: Hendricks et al. Localizing Moments in Video with Natural 48 | Language. ICCV 2017. 
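In total there are 6 single-clip segments plus C(6, 2) = 15 multi-clip segments, i.e. 21 proposals; a clip-index pair (i, j) maps to the window [5 * i, 5 * (j + 1)] seconds.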
49 | """ 50 | clip_length_min = 5.0 51 | 52 | def __init__(self, *args, dtype=np.float32, **kwargs): 53 | clips_indices = [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5)] 54 | for i in itertools.combinations(range(len(clips_indices)), 2): 55 | clips_indices.append(i) 56 | self.proposals = np.array(clips_indices, dtype=dtype) 57 | self.proposals *= self.clip_length_min 58 | self.proposals[:, 1] += self.clip_length_min 59 | 60 | def __call__(self, *args, **kwargs): 61 | return self.proposals 62 | 63 | 64 | class SlidingWindowMSRSS(TemporalProposalsBase): 65 | """Multi-scale sliding window with relative stride within the same scale 66 | 67 | Attributes: 68 | length (float) : length of smallest window. 69 | scales (sequence of int) : duration of moments relative to 70 | `length`. 71 | stride (float) : relative stride between two windows with the same 72 | duration. We used different strides for each scale rounding it 73 | towards a multiple of `length`. Note that the minimum stride is 74 | `length` for any window will be the `length` itself. 75 | dtype (numpy.dtype) : 76 | """ 77 | 78 | def __init__(self, length, scales, stride=0.5, round_base=0.5, dtype=np.float32): 79 | self.length = length 80 | self.scales = scales 81 | self.round_base = round_base 82 | self.relative_stride = stride 83 | # pick strides per scale that are multiples of length 84 | self.strides = [max(round(s * stride / round_base) * round_base, round_base) 85 | * length for s in scales] 86 | self.dtype = dtype 87 | assert len(scales) > 0 88 | 89 | def sliding_windows(self, t_end, t_start=0): 90 | """sliding canonical windows over a given time interval""" 91 | windows_ = [] 92 | for i, stride in enumerate(self.strides): 93 | num_i = np.ceil((t_end - t_start) / stride) 94 | windows_i = np.empty((int(num_i), 2), dtype=np.float32) 95 | windows_i[:, 0] = np.arange(t_start, t_end, stride) 96 | windows_i[:, 1] = windows_i[:, 0] + self.length * self.scales[i] 97 | windows_i[windows_i[:, 1] > t_end, 1] = t_end 98 | windows_.append(windows_i) 99 | # print("--------------------------------{}".format(i)) 100 | # print(windows_i) 101 | # import sys 102 | # sys.exit(1) 103 | windows = np.concatenate(windows_, axis=0) 104 | # Hacky way to make windows fit inside video 105 | # It implies windows at the end may not belong to the set spanned by 106 | # length and scales. 
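# For reference, with the TVR configuration defined below (length=3, scales=[1, 2, 4, 8], stride=0.3, round_base=1), the window widths are 3/6/12/24 seconds and the per-scale strides work out to roughly 3/3/3/6 seconds.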
107 | return np.unique(windows, axis=0) 108 | 109 | def __call__(self, video_id, metadata=None, feature_collection=None): 110 | """return: (N_window, 2), each row contains (start, end)""" 111 | duration = metadata.get('duration') 112 | assert duration is not None 113 | return self.sliding_windows(duration) 114 | 115 | 116 | ProposalConfigs = { 117 | "didemo": { 118 | "proposal_interface": "DidemoICCV17SS", 119 | "clip_length": 2.5, 120 | }, 121 | "tvr": { 122 | "length": 3, # min proposal length 123 | "scales": [1, 2, 4, 8], 124 | "stride": 0.3, 125 | "round_base": 1, 126 | "min_proposal_length": 3, # length * min(scales) 127 | "clip_length": 1.5, # length should be divisible by clip_length 128 | "proposal_interface": "SlidingWindowMSRSS", 129 | }, 130 | "anet_cap": { 131 | "length": 5, 132 | "scales": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26], 133 | "stride": 0.3, 134 | "round_base": 1, 135 | "min_proposal_length": 10, # length * min(scales) 136 | "clip_length": 5, # length * min(scales) / 2 137 | "proposal_interface": "SlidingWindowMSRSS", 138 | }, 139 | "charades_sta": { 140 | "length": 3, 141 | "scales": [2, 3, 4, 5, 6, 7, 8], 142 | "stride": 0.3, 143 | "round_base": 1, 144 | "min_proposal_length": 6, # length * min(scales) 145 | "clip_length": 3, # length * min(scales) / 2 146 | "proposal_interface": "SlidingWindowMSRSS", 147 | }, 148 | "profiling": { 149 | "length": 5, 150 | "scales": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 151 | "stride": 0.3, 152 | "round_base": 1, 153 | "clip_length": 5, # length * min(scales) / 2 154 | "proposal_interface": "SlidingWindowMSRSS", 155 | }, 156 | } 157 | """ 158 | 'min_clip_length' is used to uniformly segment the video into smaller clips, it is a half of 159 | the 'min_proposal_length'. Thus we can enforce each moment has at least 2 clips. 
160 | """ 161 | 162 | 163 | def get_proposal_interface(dset_name): 164 | """ dset_name (str): one of ["tvr"] """ 165 | assert dset_name in ProposalConfigs 166 | if dset_name == "didemo": 167 | return DidemoICCV17SS() 168 | else: 169 | arg_names = ["length", "scales", "stride", "round_base"] 170 | func_args = {k: ProposalConfigs[dset_name][k] for k in arg_names} 171 | return SlidingWindowMSRSS(**func_args) 172 | 173 | 174 | if __name__ == '__main__': 175 | test_fns_args = [(DidemoICCV17SS, (),), 176 | (SlidingWindowMSRSS, (1.5, [2, 4, 6, 12]))] 177 | for fn_i, args_i in test_fns_args: 178 | proposal_fn = fn_i(*args_i) 179 | x = proposal_fn('hola', {'duration': 15}) 180 | if fn_i == DidemoICCV17SS: 181 | assert len(x) == 21 182 | -------------------------------------------------------------------------------- /method_tvr/scripts/eval.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # run at project root dir 3 | # Usage: 4 | # bash method/scripts/eval.sh ANY_OTHER_PYTHON_ARGS 5 | eval_split_name=$1 6 | submission_path=$2 7 | save_path=$3 8 | gt_path=data/tvr_${eval_split_name}_release.jsonl 9 | 10 | python standalone_eval/eval.py \ 11 | --gt_path ${gt_path} \ 12 | --submission_path ${submission_path} \ 13 | --save_path ${save_path} \ 14 | ${@:4} 15 | -------------------------------------------------------------------------------- /method_tvr/scripts/inference.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # run at project root dir 3 | # Usage: 4 | # bash method/scripts/inference.sh ANY_OTHER_PYTHON_ARGS 5 | model_dir=$1 6 | eval_split_name=$2 # [val] 7 | eval_path=data/tvr_${eval_split_name}_release.jsonl 8 | tasks=() 9 | tasks+=(VCMR) 10 | tasks+=(SVMR) 11 | tasks+=(VR) 12 | echo "tasks ${tasks[@]}" 13 | python method_tvr/inference.py \ 14 | --model_dir ${model_dir} \ 15 | --tasks ${tasks[@]} \ 16 | --eval_split_name ${eval_split_name} \ 17 | --eval_path ${eval_path} \ 18 | ${@:3} 19 | -------------------------------------------------------------------------------- /method_tvr/scripts/train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # run at project root dir 3 | # Usage: 4 | # bash method/scripts/train.sh tvr all ANY_OTHER_PYTHON_ARGS 5 | # use --eval_tasks_at_training ["VR", "SVMR", "VCMR"] --stop_task ["VR", "SVMR", "VCMR"] for 6 | # use --lw_neg_q 0 --lw_neg_ctx 0 for training SVMR/SVMR only 7 | # use --lw_st_ed 0 for training with VR only 8 | dset_name=$1 # see case below 9 | ctx_mode=$2 # [video, sub, tef, video_sub, video_tef, sub_tef, video_sub_tef] 10 | vid_feat_type=$3 # [resnet, i3d, resnet_i3d] 11 | feature_root=data/tvr_feature_release 12 | results_root=method_tvr/results 13 | vid_feat_size=2048 14 | extra_args=() 15 | 16 | if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then 17 | if [[ ${dset_name} != "tvr" ]]; then 18 | echo "The use of subtitles is only supported in tvr." 
19 | exit 1 20 | fi 21 | fi 22 | 23 | 24 | case ${dset_name} in 25 | tvr) 26 | train_path=data/tvr_train_release.jsonl 27 | video_duration_idx_path=data/tvr_video2dur_idx.json 28 | desc_bert_path=${feature_root}/bert_feature/query_only/tvr_query_pretrained_w_query.h5 29 | if [[ ${vid_feat_type} == "i3d" ]]; then 30 | echo "Using I3D feature with shape 1024" 31 | vid_feat_path=${feature_root}/video_feature/tvr_i3d_rgb600_avg_cl-1.5.h5 32 | vid_feat_size=1024 33 | elif [[ ${vid_feat_type} == "resnet" ]]; then 34 | echo "Using ResNet feature with shape 2048" 35 | vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_cl-1.5.h5 36 | vid_feat_size=2048 37 | elif [[ ${vid_feat_type} == "resnet_i3d" ]]; then 38 | echo "Using concatenated ResNet and I3D feature with shape 2048+1024" 39 | vid_feat_path=${feature_root}/video_feature/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-1.5.h5 40 | vid_feat_size=3072 41 | extra_args+=(--no_norm_vfeat) # since they are already normalized. 42 | fi 43 | eval_split_name=val 44 | nms_thd=-1 45 | extra_args+=(--eval_path) 46 | extra_args+=(data/tvr_val_release.jsonl) 47 | clip_length=1.5 48 | # extra_args+=(--max_ctx_l) 49 | # extra_args+=(100) # max_ctx_l = 100 for clip_length = 1.5, only ~109/21825 has more than 100. 50 | extra_args+=(--max_pred_l) 51 | extra_args+=(16) 52 | if [[ ${ctx_mode} == *"sub"* ]] || [[ ${ctx_mode} == "sub" ]]; then 53 | echo "Running with sub." 54 | desc_bert_path=${feature_root}/bert_feature/sub_query/tvr_query_pretrained_w_sub_query.h5 # overwrite 55 | sub_bert_path=${feature_root}/bert_feature/sub_query/tvr_sub_pretrained_w_sub_query_max_cl-1.5.h5 56 | sub_feat_size=768 57 | extra_args+=(--sub_feat_size) 58 | extra_args+=(${sub_feat_size}) 59 | extra_args+=(--sub_bert_path) 60 | extra_args+=(${sub_bert_path}) 61 | fi 62 | ;; 63 | *) 64 | echo -n "Unknown argument" 65 | ;; 66 | esac 67 | 68 | echo "Start training with dataset [${dset_name}] in Context Mode [${ctx_mode}]" 69 | echo "Extra args ${extra_args[@]}" 70 | python method_tvr/train.py \ 71 | --dset_name=${dset_name} \ 72 | --eval_split_name=${eval_split_name} \ 73 | --nms_thd=${nms_thd} \ 74 | --results_root=${results_root} \ 75 | --train_path=${train_path} \ 76 | --desc_bert_path=${desc_bert_path} \ 77 | --video_duration_idx_path=${video_duration_idx_path} \ 78 | --vid_feat_path=${vid_feat_path} \ 79 | --clip_length=${clip_length} \ 80 | --vid_feat_size=${vid_feat_size} \ 81 | --ctx_mode=${ctx_mode} \ 82 | ${extra_args[@]} \ 83 | ${@:4} 84 | -------------------------------------------------------------------------------- /method_tvr/start_end_dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import h5py 3 | import math 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import Dataset 7 | from utils.basic_utils import load_jsonl, load_json, l2_normalize_np_array, uniform_feature_sampling 8 | from utils.tensor_utils import pad_sequences_1d 9 | from method_tvr.config import BaseOptions 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class StartEndDataset(Dataset): 15 | """ 16 | Args: 17 | dset_name, str, ["tvr"] 18 | ctx_mode: str, 19 | Return: 20 | a dict: { 21 | "meta": { 22 | "desc_id": int, 23 | "desc": str, 24 | "vid_name": str, 25 | "duration": float, 26 | "ts": [st (float), ed (float)], seconds, ground_truth timestamps 27 | } 28 | "model_inputs": { 29 | "query_feat": torch.tensor, (L, D_q) 30 | "video_feat": torch.tensor, (n_clip_in_moment, D_video) 31 | "sub_feat": 
torch.tensor, (n_clip_in_moment, D_sub) 32 | "st_ed_indices": torch.LongTensor, (2, ) 33 | } 34 | } 35 | """ 36 | def __init__(self, dset_name, data_path, desc_bert_path_or_handler, sub_bert_path_or_handler, max_desc_len, 37 | max_ctx_len, vid_feat_path_or_handler, clip_length, ctx_mode="video", normalize_vfeat=True, 38 | normalize_tfeat=True, h5driver=None, data_ratio=1.0): 39 | self.dset_name = dset_name 40 | self.data_path = data_path 41 | self.data_ratio = data_ratio 42 | 43 | self.desc_bert_path_or_handler = desc_bert_path_or_handler 44 | self.max_desc_len = max_desc_len 45 | 46 | self.sub_bert_path_or_handler = sub_bert_path_or_handler 47 | self.max_ctx_len = max_ctx_len 48 | self.vid_feat_path_or_handler = vid_feat_path_or_handler 49 | self.clip_length = clip_length 50 | self.ctx_mode = ctx_mode 51 | 52 | # prepare desc data 53 | self.data = load_jsonl(data_path) 54 | if self.data_ratio != 1: 55 | n_examples = int(len(self.data) * data_ratio) 56 | self.data = self.data[:n_examples] 57 | logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) 58 | 59 | self.use_video = "video" in self.ctx_mode 60 | self.use_sub = "sub" in self.ctx_mode 61 | self.use_tef = "tef" in self.ctx_mode 62 | 63 | if self.use_video: 64 | if isinstance(vid_feat_path_or_handler, h5py.File): 65 | self.vid_feat_h5 = vid_feat_path_or_handler 66 | else: # str path 67 | self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) 68 | 69 | if isinstance(desc_bert_path_or_handler, h5py.File): 70 | self.desc_bert_h5 = desc_bert_path_or_handler 71 | else: 72 | self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) 73 | 74 | if self.use_sub: 75 | if isinstance(sub_bert_path_or_handler, h5py.File): 76 | self.sub_bert_h5 = sub_bert_path_or_handler 77 | else: # str path 78 | self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) 79 | 80 | self.normalize_vfeat = normalize_vfeat 81 | self.normalize_tfeat = normalize_tfeat 82 | 83 | def __len__(self): 84 | return len(self.data) 85 | 86 | def __getitem__(self, index): 87 | raw_data = self.data[index] 88 | # initialize with basic data 89 | meta = dict(desc_id=raw_data["desc_id"], desc=raw_data["desc"], vid_name=raw_data["vid_name"], 90 | duration=raw_data["duration"], ts=raw_data["ts"]) 91 | model_inputs = dict() 92 | model_inputs["query_feat"] = self.get_query_feat_by_desc_id(meta["desc_id"]) 93 | 94 | ctx_l = 0 95 | if self.use_video: 96 | video_feat = uniform_feature_sampling(self.vid_feat_h5[meta['vid_name']][:], self.max_ctx_len) 97 | if self.normalize_vfeat: 98 | video_feat = l2_normalize_np_array(video_feat) 99 | model_inputs["video_feat"] = torch.from_numpy(video_feat) 100 | ctx_l = len(video_feat) 101 | else: 102 | model_inputs["video_feat"] = torch.zeros((2, 2)) 103 | 104 | if self.use_sub: # no need for ctx feature, as the features are already contextualized 105 | sub_feat = uniform_feature_sampling(self.sub_bert_h5[meta["vid_name"]][:], self.max_ctx_len) 106 | if self.normalize_tfeat: 107 | sub_feat = l2_normalize_np_array(sub_feat) 108 | model_inputs["sub_feat"] = torch.from_numpy(sub_feat) 109 | ctx_l = len(sub_feat) 110 | else: 111 | model_inputs["sub_feat"] = torch.zeros((2, 2)) 112 | 113 | if self.use_tef: 114 | # note the tef features here are normalized clip indices (1.5 secs), instead of the original time (1 sec) 115 | ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l 116 | tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l 117 | tef_ed = 
torch.arange(1, ctx_l + 1, 1.0) / ctx_l 118 | tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) 119 | tef_feat = tef 120 | else: 121 | tef_feat = torch.zeros((2, 2)) 122 | 123 | if self.use_video and self.use_tef: # (N_clips, D + 2) 124 | model_inputs["video_feat"] = torch.cat([model_inputs["video_feat"], tef_feat], dim=1) 125 | if self.use_sub and self.use_tef: # (N_clips, D_t + 2) 126 | model_inputs["sub_feat"] = torch.cat([model_inputs["sub_feat"], tef_feat], dim=1) 127 | 128 | model_inputs["st_ed_indices"] = self.get_st_ed_label(meta["ts"], max_idx=ctx_l - 1) 129 | return dict(meta=meta, model_inputs=model_inputs) 130 | 131 | def get_st_ed_label(self, ts, max_idx): 132 | """ 133 | Args: 134 | ts: [st (float), ed (float)] in seconds, ed > st 135 | max_idx: length of the video 136 | Returns: 137 | [st_idx, ed_idx]: int, 138 | Given ts = [3.2, 7.6], st_idx = 2, ed_idx = 6, 139 | clips should be indexed as [2: 6), the translated back ts should be [3:9]. 140 | """ 141 | st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) 142 | ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) # -1 143 | return torch.tensor([st_idx, ed_idx], dtype=torch.long) 144 | 145 | def get_query_feat_by_desc_id(self, desc_id): 146 | query_feat = self.desc_bert_h5[str(desc_id)][:self.max_desc_len] 147 | if self.normalize_tfeat: 148 | query_feat = l2_normalize_np_array(query_feat) 149 | return torch.from_numpy(query_feat) 150 | 151 | 152 | class StartEndEvalDataset(Dataset): 153 | """ 154 | init_data_mode: `video_query` or `video_only` or `query_only`, 155 | it indicates which data to load when initialize the Dataset object. 156 | data_mode: `context` or `query`, it indicates which data to return for self.__get_item__() 157 | desc_bert_path_or_handler: h5py.File object or str path 158 | vid_feat_path_or_handler: h5py.File object or str path 159 | eval_proposal_bsz: the proposals for a single video will be sorted in length and batched here with 160 | max batch size to be eval_proposal_bsz. A single video might have multiple batches of proposals. 161 | load_gt_video: load GroundTruth Video, useful when evaluating single video moment retrieval. 162 | data_ratio: percentage of query data to use. 
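Note: corpus-level evaluation typically uses this dataset twice, first with data_mode='context' to encode every video in the corpus, then with data_mode='query' to encode the queries (see set_data_mode below).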
163 | """ 164 | def __init__(self, dset_name, eval_split_name, data_path=None, desc_bert_path_or_handler=None, max_desc_len=None, 165 | max_ctx_len=None, sub_bert_path_or_handler=None, vid_feat_path_or_handler=None, 166 | video_duration_idx_path=None, clip_length=None, ctx_mode="video", data_mode="context", h5driver=None, 167 | data_ratio=1.0, normalize_vfeat=True, normalize_tfeat=True): 168 | self.dset_name = dset_name 169 | self.eval_split_name = eval_split_name 170 | self.ctx_mode = ctx_mode 171 | self.load_gt_video = False 172 | self.data_ratio = data_ratio # only affect query data 173 | self.normalize_vfeat = normalize_vfeat 174 | self.normalize_tfeat = normalize_tfeat 175 | 176 | self.data_mode = None 177 | self.set_data_mode(data_mode) 178 | 179 | self.max_desc_len = max_desc_len 180 | self.max_ctx_len = max_ctx_len 181 | self.data_path = data_path 182 | if isinstance(desc_bert_path_or_handler, h5py.File): 183 | self.desc_bert_h5 = desc_bert_path_or_handler 184 | else: 185 | self.desc_bert_h5 = h5py.File(desc_bert_path_or_handler, "r", driver=h5driver) 186 | 187 | video_data = load_json(video_duration_idx_path)[self.eval_split_name] 188 | self.video_data = [{"vid_name": k, "duration": v[0]} for k, v in video_data.items()] 189 | self.video2idx = {k: v[1] for k, v in video_data.items()} 190 | self.clip_length = clip_length 191 | 192 | self.use_video = "video" in self.ctx_mode 193 | self.use_sub = "sub" in self.ctx_mode 194 | self.use_tef = "tef" in self.ctx_mode 195 | 196 | if self.use_video: 197 | if isinstance(vid_feat_path_or_handler, h5py.File): 198 | self.vid_feat_h5 = vid_feat_path_or_handler 199 | else: # str path 200 | self.vid_feat_h5 = h5py.File(vid_feat_path_or_handler, "r", driver=h5driver) 201 | 202 | if self.use_sub: 203 | if isinstance(sub_bert_path_or_handler, h5py.File): 204 | self.sub_bert_h5 = sub_bert_path_or_handler 205 | else: # str path 206 | self.sub_bert_h5 = h5py.File(sub_bert_path_or_handler, "r", driver=h5driver) 207 | 208 | self.query_data = load_jsonl(data_path) 209 | if data_ratio != 1: 210 | n_examples = int(len(self.query_data) * data_ratio) 211 | self.query_data = self.query_data[:n_examples] 212 | logger.info("Using {}% of the data: {} examples".format(data_ratio * 100, n_examples)) 213 | 214 | def set_data_mode(self, data_mode): 215 | """context or query""" 216 | assert data_mode in ["context", "query"] 217 | self.data_mode = data_mode 218 | 219 | def load_gt_vid_name_for_query(self, load_gt_video): 220 | """load_gt_video: bool, affect the returned value of self._get_item_query""" 221 | if load_gt_video: 222 | assert "vid_name" in self.query_data[0] 223 | self.load_gt_video = load_gt_video 224 | 225 | def __len__(self): 226 | if self.data_mode == "context": 227 | return len(self.video_data) 228 | else: 229 | return len(self.query_data) 230 | 231 | def __getitem__(self, index): 232 | if self.data_mode == "context": 233 | return self._get_item_context(index) 234 | else: 235 | return self._get_item_query(index) 236 | 237 | def get_query_feat_by_desc_id(self, desc_id): 238 | query_feat = self.desc_bert_h5[str(desc_id)][:self.max_desc_len] 239 | if self.normalize_tfeat: 240 | query_feat = l2_normalize_np_array(query_feat) 241 | return torch.from_numpy(query_feat) 242 | 243 | def _get_item_query(self, index): 244 | """Need to batch""" 245 | raw_data = self.query_data[index] 246 | meta = dict(desc_id=raw_data["desc_id"], desc=raw_data["desc"], 247 | vid_name=raw_data["vid_name"] if self.load_gt_video else None) 248 | model_inputs = dict() 249 | 
model_inputs["query_feat"] = self.get_query_feat_by_desc_id(meta["desc_id"]) 250 | return dict(meta=meta, model_inputs=model_inputs) 251 | 252 | def get_st_ed_label(self, ts, max_idx): 253 | st_idx = min(math.floor(ts[0] / self.clip_length), max_idx) 254 | ed_idx = min(math.ceil(ts[1] / self.clip_length), max_idx) 255 | return torch.tensor([st_idx, ed_idx], dtype=torch.long) 256 | 257 | def _get_item_context(self, index): 258 | """No need to batch, since it has already been batched here""" 259 | raw_data = self.video_data[index] 260 | # initialize with basic data 261 | meta = dict(vid_name=raw_data["vid_name"], duration=raw_data["duration"]) 262 | model_inputs = dict() 263 | ctx_l = 0 264 | 265 | if self.use_video: 266 | video_feat = uniform_feature_sampling(self.vid_feat_h5[meta["vid_name"]][:], self.max_ctx_len) 267 | if self.normalize_vfeat: 268 | video_feat = l2_normalize_np_array(video_feat) 269 | model_inputs["video_feat"] = torch.from_numpy(video_feat) 270 | ctx_l = len(video_feat) 271 | else: 272 | model_inputs["video_feat"] = torch.zeros((2, 2)) 273 | 274 | if self.use_sub: # no need for ctx feature, as the features are already contextualized 275 | sub_feat = uniform_feature_sampling(self.sub_bert_h5[meta["vid_name"]][:], self.max_ctx_len) 276 | if self.normalize_tfeat: 277 | sub_feat = l2_normalize_np_array(sub_feat) 278 | model_inputs["sub_feat"] = torch.from_numpy(sub_feat) 279 | ctx_l = len(sub_feat) 280 | else: 281 | model_inputs["sub_feat"] = torch.zeros((2, 2)) 282 | 283 | if self.use_tef: 284 | ctx_l = meta["duration"] // self.clip_length + 1 if ctx_l == 0 else ctx_l 285 | tef_st = torch.arange(0, ctx_l, 1.0) / ctx_l 286 | tef_ed = tef_st + 1.0 / ctx_l 287 | tef = torch.stack([tef_st, tef_ed], dim=1) # (N_clips, 2) 288 | tef_feat = tef 289 | else: 290 | tef_feat = torch.zeros((2, 2)) 291 | 292 | if self.use_video and self.use_tef: # (N_clips, D+2) 293 | model_inputs["video_feat"] = torch.cat([model_inputs["video_feat"], tef_feat], dim=1) 294 | if self.use_sub and self.use_tef: # (N_clips, D_t+2) 295 | model_inputs["sub_feat"] = torch.cat([model_inputs["sub_feat"], tef_feat], dim=1) 296 | return dict(meta=meta, model_inputs=model_inputs) 297 | 298 | 299 | def start_end_collate(batch): 300 | batch_meta = [e["meta"] for e in batch] 301 | model_inputs_keys = batch[0]["model_inputs"].keys() 302 | batched_data = dict() 303 | for k in model_inputs_keys: 304 | if "feat" in k: 305 | if k in ['video_feat', 'sub_feat', 'tef_feat']: 306 | fixed_length = 128 307 | else: 308 | fixed_length = None 309 | batched_data[k] = pad_sequences_1d([e["model_inputs"][k] for e in batch], dtype=torch.float32, 310 | fixed_length=fixed_length) 311 | fixed_length = 128 312 | if "st_ed_indices" in model_inputs_keys: 313 | st_ed_indices = [e["model_inputs"]["st_ed_indices"] for e in batch] 314 | # construct moment localization labels 315 | batched_data["st_ed_indices"] = torch.stack(st_ed_indices, dim=0) 316 | # construct moment localization foreground and background labels 317 | match_labels = np.zeros(shape=(len(st_ed_indices), fixed_length), dtype=np.int32) 318 | for idx, st_ed_index in enumerate(st_ed_indices): 319 | st_ed = st_ed_index.cpu().numpy() 320 | st, ed = st_ed[0], st_ed[1] 321 | match_labels[idx][st:(ed + 1)] = 1 322 | batched_data['match_labels'] = torch.tensor(match_labels, dtype=torch.long) 323 | return batch_meta, batched_data 324 | 325 | 326 | def prepare_batch_inputs(batched_model_inputs, device, non_blocking=False): 327 | model_inputs = {} 328 | for k, v in 
batched_model_inputs.items(): 329 | if "feat" in k: 330 | model_inputs[k] = v[0].to(device, non_blocking=non_blocking) 331 | model_inputs[k.replace("feat", "mask")] = v[1].to(device, non_blocking=non_blocking) 332 | else: 333 | model_inputs[k] = v.to(device, non_blocking=non_blocking) 334 | return model_inputs 335 | 336 | 337 | if __name__ == '__main__': 338 | options = BaseOptions().parse() 339 | -------------------------------------------------------------------------------- /method_tvr/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import json 5 | import pprint 6 | import random 7 | import numpy as np 8 | from easydict import EasyDict as EDict 9 | from tqdm import tqdm, trange 10 | from collections import OrderedDict 11 | import torch 12 | import torch.nn as nn 13 | import torch.backends.cudnn as cudnn 14 | from torch.utils.data import DataLoader 15 | from torch.utils.tensorboard import SummaryWriter 16 | from method_tvr.config import BaseOptions 17 | from method_tvr.model import ReLoCLNet 18 | from method_tvr.start_end_dataset import StartEndDataset, start_end_collate, StartEndEvalDataset, prepare_batch_inputs 19 | from method_tvr.inference import eval_epoch, start_inference 20 | from method_tvr.optimization import BertAdam 21 | from utils.basic_utils import AverageMeter 22 | from utils.model_utils import count_parameters 23 | 24 | 25 | import logging 26 | logger = logging.getLogger(__name__) 27 | logging.basicConfig(format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s - %(message)s", 28 | datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) 29 | 30 | 31 | def set_seed(seed, use_cuda=True): 32 | random.seed(seed) 33 | np.random.seed(seed) 34 | torch.manual_seed(seed) 35 | if use_cuda: 36 | torch.cuda.manual_seed_all(seed) 37 | 38 | 39 | def train_epoch(model, train_loader, optimizer, opt, epoch_i, training=True): 40 | logger.info("use train_epoch func for training: {}".format(training)) 41 | model.train(mode=training) 42 | if opt.hard_negative_start_epoch != -1 and epoch_i >= opt.hard_negative_start_epoch: 43 | model.set_hard_negative(True, opt.hard_pool_size) 44 | if opt.train_span_start_epoch != -1 and epoch_i >= opt.train_span_start_epoch: 45 | model.set_train_st_ed(opt.lw_st_ed) 46 | 47 | # init meters 48 | dataloading_time = AverageMeter() 49 | prepare_inputs_time = AverageMeter() 50 | model_forward_time = AverageMeter() 51 | model_backward_time = AverageMeter() 52 | loss_meters = OrderedDict(loss_st_ed=AverageMeter(), loss_fcl=AverageMeter(), loss_vcl=AverageMeter(), 53 | loss_neg_ctx=AverageMeter(), loss_neg_q=AverageMeter(), 54 | loss_overall=AverageMeter()) 55 | 56 | num_training_examples = len(train_loader) 57 | timer_dataloading = time.time() 58 | for batch_idx, batch in tqdm(enumerate(train_loader), desc="Training Iteration", total=num_training_examples): 59 | global_step = epoch_i * num_training_examples + batch_idx 60 | dataloading_time.update(time.time() - timer_dataloading) 61 | 62 | # continue 63 | timer_start = time.time() 64 | model_inputs = prepare_batch_inputs(batch[1], opt.device, non_blocking=opt.pin_memory) 65 | prepare_inputs_time.update(time.time() - timer_start) 66 | timer_start = time.time() 67 | loss, loss_dict = model(**model_inputs) 68 | model_forward_time.update(time.time() - timer_start) 69 | timer_start = time.time() 70 | if training: 71 | optimizer.zero_grad() 72 | loss.backward() 73 | if opt.grad_clip != -1: 74 | nn.utils.clip_grad_norm_(model.parameters(), 
opt.grad_clip) 75 | optimizer.step() 76 | model_backward_time.update(time.time() - timer_start) 77 | 78 | opt.writer.add_scalar("Train/LR", float(optimizer.param_groups[0]["lr"]), global_step) 79 | for k, v in loss_dict.items(): 80 | opt.writer.add_scalar("Train/{}".format(k), v, global_step) 81 | 82 | for k, v in loss_dict.items(): 83 | loss_meters[k].update(float(v)) 84 | 85 | timer_dataloading = time.time() 86 | if opt.debug and batch_idx == 3: 87 | break 88 | 89 | if training: 90 | to_write = opt.train_log_txt_formatter.format(time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), epoch=epoch_i, 91 | loss_str=" ".join(["{} {:.4f}".format(k, v.avg) 92 | for k, v in loss_meters.items()])) 93 | with open(opt.train_log_filepath, "a") as f: 94 | f.write(to_write) 95 | print("Epoch time stats:") 96 | print("dataloading_time: max {dataloading_time.max} min {dataloading_time.min} avg {dataloading_time.avg}\n" 97 | "prepare_inputs_time: max {prepare_inputs_time.max} " 98 | "min {prepare_inputs_time.min} avg {prepare_inputs_time.avg}\n" 99 | "model_forward_time: max {model_forward_time.max} " 100 | "min {model_forward_time.min} avg {model_forward_time.avg}\n" 101 | "model_backward_time: max {model_backward_time.max} " 102 | "min {model_backward_time.min} avg {model_backward_time.avg}\n".format( 103 | dataloading_time=dataloading_time, prepare_inputs_time=prepare_inputs_time, 104 | model_forward_time=model_forward_time, model_backward_time=model_backward_time)) 105 | else: 106 | for k, v in loss_meters.items(): 107 | opt.writer.add_scalar("Eval_Loss/{}".format(k), v.avg, epoch_i) 108 | 109 | 110 | def rm_key_from_odict(odict_obj, rm_suffix): 111 | """remove key entry from the OrderedDict""" 112 | return OrderedDict([(k, v) for k, v in odict_obj.items() if rm_suffix not in k]) 113 | 114 | 115 | def train(model, train_dataset, train_eval_dataset, val_dataset, opt): 116 | # Prepare optimizer 117 | if opt.device.type == "cuda": 118 | logger.info("CUDA enabled.") 119 | model.to(opt.device) 120 | if len(opt.device_ids) > 1: 121 | logger.info("Use multi GPU", opt.device_ids) 122 | model = torch.nn.DataParallel(model, device_ids=opt.device_ids) # use multi GPU 123 | 124 | train_loader = DataLoader(train_dataset, collate_fn=start_end_collate, batch_size=opt.bsz, 125 | num_workers=opt.num_workers, shuffle=True, pin_memory=opt.pin_memory) 126 | train_eval_loader = DataLoader(train_eval_dataset, collate_fn=start_end_collate, batch_size=opt.bsz, 127 | num_workers=opt.num_workers, shuffle=False, pin_memory=opt.pin_memory) 128 | # Prepare optimizer 129 | param_optimizer = list(model.named_parameters()) 130 | no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] 131 | optimizer_grouped_parameters = [ 132 | {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], "weight_decay": 0.01}, 133 | {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0}] 134 | 135 | num_train_optimization_steps = len(train_loader) * opt.n_epoch 136 | optimizer = BertAdam(optimizer_grouped_parameters, lr=opt.lr, weight_decay=opt.wd, warmup=opt.lr_warmup_proportion, 137 | t_total=num_train_optimization_steps, schedule="warmup_linear") 138 | prev_best_score = 0. 
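# Early-stopping bookkeeping for the loop below: prev_best_score tracks the best stop metric seen so far (R@1 for VR, or the sum of R@1 at IoU 0.5 and 0.7 for SVMR/VCMR), while es_cnt counts epochs without improvement; training stops once es_cnt exceeds opt.max_es_cnt (unless max_es_cnt is -1).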
139 | es_cnt = 0 140 | start_epoch = -1 if opt.eval_untrained else 0 141 | eval_tasks_at_training = opt.eval_tasks_at_training # VR is computed along with VCMR 142 | save_submission_filename = "latest_{}_{}_predictions_{}.json".format(opt.dset_name, opt.eval_split_name, 143 | "_".join(eval_tasks_at_training)) 144 | for epoch_i in trange(start_epoch, opt.n_epoch, desc="Epoch"): 145 | if epoch_i > -1: 146 | with torch.autograd.detect_anomaly(): 147 | train_epoch(model, train_loader, optimizer, opt, epoch_i, training=True) 148 | global_step = (epoch_i + 1) * len(train_loader) 149 | if opt.eval_path is not None: 150 | with torch.no_grad(): 151 | train_epoch(model, train_eval_loader, optimizer, opt, epoch_i, training=False) 152 | metrics_no_nms, metrics_nms, latest_file_paths = eval_epoch( 153 | model, val_dataset, opt, save_submission_filename, tasks=eval_tasks_at_training, max_after_nms=100) 154 | to_write = opt.eval_log_txt_formatter.format(time_str=time.strftime("%Y_%m_%d_%H_%M_%S"), epoch=epoch_i, 155 | eval_metrics_str=json.dumps(metrics_no_nms)) 156 | with open(opt.eval_log_filepath, "a") as f: 157 | f.write(to_write) 158 | logger.info("metrics_no_nms {}".format(pprint.pformat( 159 | rm_key_from_odict(metrics_no_nms, rm_suffix="by_type"), indent=4))) 160 | logger.info("metrics_nms {}".format(pprint.pformat(metrics_nms, indent=4))) 161 | # metrics = metrics_nms if metrics_nms is not None else metrics_no_nms 162 | metrics = metrics_no_nms 163 | # early stop/ log / save model 164 | for task_type in ["SVMR", "VCMR"]: 165 | if task_type in metrics: 166 | task_metrics = metrics[task_type] 167 | for iou_thd in [0.5, 0.7]: 168 | opt.writer.add_scalars("Eval/{}-{}".format(task_type, iou_thd), 169 | {k: v for k, v in task_metrics.items() if str(iou_thd) in k}, 170 | global_step) 171 | task_type = "VR" 172 | if task_type in metrics: 173 | task_metrics = metrics[task_type] 174 | opt.writer.add_scalars("Eval/{}".format(task_type), {k: v for k, v in task_metrics.items()}, 175 | global_step) 176 | # use the most strict metric available 177 | stop_metric_names = ["r1"] if opt.stop_task == "VR" else ["0.5-r1", "0.7-r1"] 178 | stop_score = sum([metrics[opt.stop_task][e] for e in stop_metric_names]) 179 | if stop_score > prev_best_score: 180 | es_cnt = 0 181 | prev_best_score = stop_score 182 | checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i} 183 | torch.save(checkpoint, opt.ckpt_filepath) 184 | best_file_paths = [e.replace("latest", "best") for e in latest_file_paths] 185 | for src, tgt in zip(latest_file_paths, best_file_paths): 186 | os.renames(src, tgt) 187 | logger.info("The checkpoint file has been updated.") 188 | else: 189 | es_cnt += 1 190 | if opt.max_es_cnt != -1 and es_cnt > opt.max_es_cnt: # early stop 191 | with open(opt.train_log_filepath, "a") as f: 192 | f.write("Early Stop at epoch {}".format(epoch_i)) 193 | logger.info("Early stop at {} with {} {}".format( 194 | epoch_i, " ".join([opt.stop_task] + stop_metric_names), prev_best_score)) 195 | break 196 | else: 197 | checkpoint = {"model": model.state_dict(), "model_cfg": model.config, "epoch": epoch_i} 198 | torch.save(checkpoint, opt.ckpt_filepath) 199 | 200 | if opt.debug: 201 | break 202 | 203 | opt.writer.close() 204 | 205 | 206 | def start_training(): 207 | logger.info("Setup config, data and model...") 208 | opt = BaseOptions().parse() 209 | set_seed(opt.seed) 210 | if opt.debug: # keep the model run deterministically 211 | # 'cudnn.benchmark = True' enabled auto finding the best algorithm for a 
specific input/net config. 212 | # Enable this only when input size is fixed. 213 | cudnn.benchmark = False 214 | cudnn.deterministic = True 215 | 216 | opt.writer = SummaryWriter(opt.tensorboard_log_dir) 217 | opt.train_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Loss] {loss_str}\n" 218 | opt.eval_log_txt_formatter = "{time_str} [Epoch] {epoch:03d} [Metrics] {eval_metrics_str}\n" 219 | 220 | train_dataset = StartEndDataset( 221 | dset_name=opt.dset_name, 222 | data_path=opt.train_path, 223 | desc_bert_path_or_handler=opt.desc_bert_path, 224 | sub_bert_path_or_handler=opt.sub_bert_path, 225 | max_desc_len=opt.max_desc_l, 226 | max_ctx_len=opt.max_ctx_l, 227 | vid_feat_path_or_handler=opt.vid_feat_path, 228 | clip_length=opt.clip_length, 229 | ctx_mode=opt.ctx_mode, 230 | h5driver=opt.h5driver, 231 | data_ratio=opt.data_ratio, 232 | normalize_vfeat=not opt.no_norm_vfeat, 233 | normalize_tfeat=not opt.no_norm_tfeat) 234 | 235 | if opt.eval_path is not None: 236 | # val dataset, used to get eval loss 237 | train_eval_dataset = StartEndDataset( 238 | dset_name=opt.dset_name, 239 | data_path=opt.eval_path, 240 | desc_bert_path_or_handler=train_dataset.desc_bert_h5, 241 | sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, 242 | max_desc_len=opt.max_desc_l, 243 | max_ctx_len=opt.max_ctx_l, 244 | vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, 245 | clip_length=opt.clip_length, 246 | ctx_mode=opt.ctx_mode, 247 | h5driver=opt.h5driver, 248 | data_ratio=opt.data_ratio, 249 | normalize_vfeat=not opt.no_norm_vfeat, 250 | normalize_tfeat=not opt.no_norm_tfeat) 251 | 252 | eval_dataset = StartEndEvalDataset( 253 | dset_name=opt.dset_name, 254 | eval_split_name=opt.eval_split_name, # should only be val set 255 | data_path=opt.eval_path, 256 | desc_bert_path_or_handler=train_dataset.desc_bert_h5, 257 | sub_bert_path_or_handler=train_dataset.sub_bert_h5 if "sub" in opt.ctx_mode else None, 258 | max_desc_len=opt.max_desc_l, 259 | max_ctx_len=opt.max_ctx_l, 260 | video_duration_idx_path=opt.video_duration_idx_path, 261 | vid_feat_path_or_handler=train_dataset.vid_feat_h5 if "video" in opt.ctx_mode else None, 262 | clip_length=opt.clip_length, 263 | ctx_mode=opt.ctx_mode, 264 | data_mode="query", 265 | h5driver=opt.h5driver, 266 | data_ratio=opt.data_ratio, 267 | normalize_vfeat=not opt.no_norm_vfeat, 268 | normalize_tfeat=not opt.no_norm_tfeat) 269 | else: 270 | train_eval_dataset, eval_dataset = None, None 271 | 272 | model_config = EDict( 273 | visual_input_size=opt.vid_feat_size, 274 | sub_input_size=opt.sub_feat_size, # for both desc and subtitles 275 | query_input_size=opt.q_feat_size, # for both desc and subtitles 276 | hidden_size=opt.hidden_size, # hidden dimension 277 | conv_kernel_size=opt.conv_kernel_size, 278 | conv_stride=opt.conv_stride, 279 | max_ctx_l=opt.max_ctx_l, 280 | max_desc_l=opt.max_desc_l, 281 | input_drop=opt.input_drop, 282 | drop=opt.drop, 283 | n_heads=opt.n_heads, # self-att heads 284 | initializer_range=opt.initializer_range, # for linear layer 285 | ctx_mode=opt.ctx_mode, # video, sub or video_sub 286 | margin=opt.margin, # margin for ranking loss 287 | ranking_loss_type=opt.ranking_loss_type, # loss type, 'hinge' or 'lse' 288 | lw_neg_q=opt.lw_neg_q, # loss weight for neg. query and pos. context 289 | lw_neg_ctx=opt.lw_neg_ctx, # loss weight for pos. query and neg. 
context 290 | lw_fcl=opt.lw_fcl, # loss weight for frame level contrastive learning 291 | lw_vcl=opt.lw_vcl, # loss weight for video level contrastive learning 292 | lw_st_ed=0, # will be assigned dynamically at training time 293 | use_hard_negative=False, # reset at each epoch 294 | hard_pool_size=opt.hard_pool_size) 295 | logger.info("model_config {}".format(model_config)) 296 | model = ReLoCLNet(model_config) 297 | count_parameters(model) 298 | logger.info("Start Training...") 299 | train(model, train_dataset, train_eval_dataset, eval_dataset, opt) 300 | return opt.results_dir, opt.eval_split_name, opt.eval_path, opt.debug 301 | 302 | 303 | if __name__ == '__main__': 304 | model_dir, eval_split_name, eval_path, debug = start_training() 305 | if not debug: 306 | model_dir = model_dir.split(os.sep)[-1] 307 | tasks = ["SVMR", "VCMR", "VR"] 308 | input_args = ["--model_dir", model_dir, "--nms_thd", "0.5", "--eval_split_name", eval_split_name, 309 | "--eval_path", eval_path, "--tasks"] + tasks 310 | sys.argv[1:] = input_args 311 | logger.info("\n\n\nFINISHED TRAINING!!!") 312 | logger.info("Evaluating model in {}".format(model_dir)) 313 | logger.info("Input args {}".format(sys.argv[1:])) 314 | start_inference() 315 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # source setup.sh 4 | export DIR_PWD="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | export PYTHONPATH="$PYTHONPATH:$DIR_PWD" 6 | 7 | echo $PYTHONPATH 8 | -------------------------------------------------------------------------------- /standalone_eval/README.md: -------------------------------------------------------------------------------- 1 | TVR Evaluation 2 | ================================================================ 3 | 4 | ### Task Definition 5 | Given a natural language query and a large pool of videos (with subtitles), 6 | the TVR (VCMR) task requires a system to retrieve a relevant moment from the videos. 7 | The table below shows a comparison of the TVR task and the subtasks: 8 | 9 | | Task | Description | 10 | | --- | --- | 11 | | VCMR | or VSCMR, *Video (-Subtitle) Corpus Moment Retrieval*. Localize a moment from a large video corpus. | 12 | | SVMR | or SVSMR, *Single Video (-Subtitle) Moment Retrieval*. Localize a moment from a given video. | 13 | | VR | or VSR, *Video (-Subtitle) Retrieval*. Retrieve a video from a large video corpus. | 14 | 15 | VCMR and VR only require a query and a video corpus; SVMR additionally requires knowing the ground-truth video. 16 | Thus it is not possible to perform SVMR on our `test-public` set, where the ground-truth video is hidden. 17 | 18 | 19 | ### How to construct a prediction file? 20 | 21 | An example of such a file is [sample_val_predictions.json](sample_val_predictions.json); it is formatted as: 22 | ``` 23 | { 24 | "video2idx": { 25 | "castle_s01e02_seg02_clip_09": 19614, 26 | ... 27 | }, 28 | "VCMR": [{ 29 | "desc_id": 90200, 30 | "desc": "Phoebe puts one of her ponytails in her mouth.", 31 | "predictions": [ 32 | [19614, 9.0, 12.0, 1.7275], 33 | [20384, 12.0, 18.0, 1.7315], 34 | [20384, 15.0, 21.0, 1.7351], 35 | ... 36 | ] 37 | }, 38 | ... 39 | ], 40 | "SVMR": [{ 41 | "desc_id": 90200, 42 | "desc": "Phoebe puts one of her ponytails in her mouth.", 43 | "predictions": [ 44 | [20092, 36.0, 42.0, -1.9082], 45 | [20092, 18.0, 24.0, -1.9145], 46 | [20092, 51.0, 54.0, -1.922], 47 | ... 48 | ] 49 | }, 50 | ... 
51 | ], 52 | "VR": [{ 53 | "desc_id": 90200, 54 | "desc": "Phoebe puts one of her ponytails in her mouth.", 55 | "predictions": [ 56 | [19614, 0, 0, 1.7275], 57 | [20384, 0, 0, 1.7315], 58 | [20384, 0, 0, 1.7351], 59 | ... 60 | ] 61 | }, 62 | ... 63 | ] 64 | } 65 | ``` 66 | 67 | | entry | description | 68 | | --- | ----| 69 | | video2idx | `dict`, `{vid_name: vid_idx}`. A mapping of video names to unique video IDs for current set. From [tvr_video2dur_idx.json](../data/tvr_video2dur_idx.json). | 70 | | VCMR | `list(dicts)`, stores predictions for the task `VCMR`. | 71 | | SVMR | `list(dicts)`, stores predictions for the task `SVMR`. Not required for `test-public` submission. | 72 | | VR | `list(dicts)`, stores predictions for the task `VR`. | 73 | 74 | The evaluation script will evaluate the predictions for tasks `[VCMR, SVMR, VR]` independently. 75 | Each dict in VCMR/SVMR/VR list is: 76 | ``` 77 | { 78 | "desc": str, 79 | "desc_id": int, 80 | "predictions": [[vid_id (int), st (float), ed (float), score (float)], ...] 81 | } 82 | ``` 83 | 84 | `predictions` is a `list` containing 100 `sublist`, each `sublist` has exactly 4 items: 85 | `[vid_id (int), st (float), ed (float), score (float)]`, 86 | which are `vid_id` (video id), `st` and `ed` (moment start and end time, in seconds.), 87 | `score` (score of the prediction). 88 | The `score` item will not be used in the evaluation script, it is left here for record. 89 | 90 | 91 | ### Run Evaluation 92 | At project root, run 93 | ``` 94 | bash standalone_eval/eval_sample.sh 95 | ``` 96 | This command will use [eval.py](eval.py) to evaluate the provided `sample_val_predictions.json` file, 97 | the output will be written into `sample_val_predictions_metrics.json`. 98 | Its content should be similar if not the same as `sample_val_predictions_metrics_raw.json` file. 99 | 100 | 101 | ### Codalab Submission 102 | To test your model's performance on `test-public` set, 103 | please submit both `val` and `test-public` predictions to our 104 | [Codalab evaluation server](https://competitions.codalab.org/competitions/22780). 105 | The submission file should be a single `.zip ` file (no enclosing folder) 106 | that contains the two prediction files 107 | `tvr_test_public_submission.json` and `tvr_val_submission.json`, each of the `*submission.json` file 108 | should be formatted as instructed above. 109 | Note that `tvr_val_submission.json` will have all the 4 entries, while 110 | `tvr_test_public_submission.json` will have only 3 entries, without `SVMR`. 
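For reference, the sketch below shows one way to assemble and package such a submission from already-ranked predictions. It is not part of this repo; `video2idx` and the `*_preds` lists are placeholders you would build from your own model outputs, following the format documented above.
```
# Sketch only (not part of this repo): assemble and package a Codalab submission.
# `video2idx` and the *_preds lists are placeholders to be filled with your
# model's ranked outputs, following the prediction-file format described above.
import os
import json
import zipfile


def save_submission(path, video2idx, vcmr_preds, vr_preds, svmr_preds=None):
    """Each *_preds entry: {"desc_id": int, "desc": str,
    "predictions": [[vid_id, st, ed, score], ...]} with <= 100 predictions per query."""
    submission = {"video2idx": video2idx, "VCMR": vcmr_preds, "VR": vr_preds}
    if svmr_preds is not None:  # SVMR is omitted for test-public (GT videos are hidden)
        submission["SVMR"] = svmr_preds
    with open(path, "w") as f:
        json.dump(submission, f)


def zip_submission(zip_path, json_paths):
    # no enclosing folder: store each prediction file at the archive root
    with zipfile.ZipFile(zip_path, "w") as zf:
        for p in json_paths:
            zf.write(p, arcname=os.path.basename(p))


# usage (hypothetical variables):
# save_submission("tvr_val_submission.json", video2idx, vcmr_preds, vr_preds, svmr_preds)
# save_submission("tvr_test_public_submission.json", video2idx, vcmr_preds, vr_preds)
# zip_submission("tvr_submission.zip",
#                ["tvr_val_submission.json", "tvr_test_public_submission.json"])
```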
111 | 112 | 113 | -------------------------------------------------------------------------------- /standalone_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/standalone_eval/__init__.py -------------------------------------------------------------------------------- /standalone_eval/eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load prediction file and GT file to calculate TVR metrics: 3 | - recall at top K (R@K), for a specified IoU, where K in [1, 5, 10, 100], IoU in [0.5, 0.7] 4 | """ 5 | import json 6 | import numpy as np 7 | from tqdm import tqdm 8 | from collections import OrderedDict, defaultdict 9 | 10 | 11 | def load_json(filename): 12 | with open(filename, "r") as f: 13 | return json.load(f) 14 | 15 | 16 | def load_jsonl(filename): 17 | with open(filename, "r") as f: 18 | return [json.loads(l.strip("\n")) for l in f.readlines()] 19 | 20 | 21 | def pad_sequences_1d_np(sequences, dtype=np.float32): 22 | 23 | """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) 24 | into a (n+1)-d array, only allow the first dim has variable lengths. 25 | Args: 26 | sequences: list(n-d tensor or list) 27 | dtype: np.dtype or torch.dtype 28 | Returns: 29 | padded_seqs: ((n+1)-d tensor) padded with zeros 30 | mask: (2d tensor) of the same shape as the first two dims of padded_seqs, 31 | 1 indicate valid, 0 otherwise 32 | Examples: 33 | >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] 34 | >>> pad_sequences_1d(test_data_list, dtype=np.float32) 35 | >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] 36 | >>> pad_sequences_1d(test_data_3d, dtype=np.float32) 37 | """ 38 | if isinstance(sequences[0], list): 39 | sequences = [np.asarray(s, dtype=dtype) for s in sequences] 40 | 41 | extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements 42 | lengths = [len(seq) for seq in sequences] 43 | assert "numpy" in str(dtype), "dtype and input type does not match" 44 | padded_seqs = np.zeros((len(sequences), max(lengths)) + extra_dims, dtype=dtype) 45 | mask = np.zeros((len(sequences), max(lengths)), dtype=np.float32) 46 | 47 | for idx, seq in enumerate(sequences): 48 | end = lengths[idx] 49 | padded_seqs[idx, :end] = seq 50 | mask[idx, :end] = 1 51 | return padded_seqs, mask 52 | 53 | 54 | def compute_temporal_iou_batch(preds, gt): 55 | """ compute intersection-over-union along temporal axis 56 | This function is significantly faster than `compute_temporal_iou`, 57 | the result should be the same. 
58 | Args: 59 | preds: np.ndarray, (N, 2), [st (float), ed (float)] * N 60 | gt: [st (float), ed (float)] 61 | Returns: 62 | iou (float): np.ndarray, (N, ) 63 | 64 | References: 65 | for np.divide with zeros, see https://stackoverflow.com/a/37977222 66 | """ 67 | intersection = np.maximum(0, np.minimum(preds[:, 1], gt[1]) - np.maximum(preds[:, 0], gt[0])) 68 | union = np.maximum(preds[:, 1], gt[1]) - np.minimum(preds[:, 0], gt[0]) # not the correct union though 69 | return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0) 70 | 71 | 72 | def get_rounded_percentage(float_number, n_floats=2): 73 | return round(float_number * 100, n_floats) 74 | 75 | 76 | TASK_TYPES = OrderedDict([ 77 | ("VCMR", "Video Corpus Moment Retrieval"), 78 | ("SVMR", "Single Video Moment Retrieval"), 79 | ("VR", "regular Video Retrieval") 80 | ]) 81 | 82 | 83 | def eval_by_task_type(moment_predictions, video2idx, ground_truth, 84 | iou_thds=(0.5, 0.7), recall_topks=(1, 5, 10, 100), 85 | task_type="SVMR", max_pred_per_query=100, match_number=True, verbose=True, use_desc_type=True): 86 | """ a predicted triplet is positive only if: 87 | 1) its vid_name matches the GT vid_name 88 | 2) IoU between its timestamp and GT timestamp is higher than the given threshold 89 | 90 | moment_predictions w.r.t. different task_type: 91 | For each query, evaluated on top max_pred_per_query [vid_name, st, ed] triplets. (score entry ignored) 92 | VCMR: vid_name might be repeating. 93 | SVMR: vid_name is fixed to be the GT vid_name. 94 | VR: vid_name is not repeating, st and ed will not be used. 95 | 96 | Args: 97 | video2idx: {vid_name (str): index (int), ...} 98 | moment_predictions: list(dict), each dict is { 99 | "desc": str, 100 | "desc_id": int, 101 | "predictions": [vid_name_idx (int), st (float), ed (float), score (float)] * n_pred, 102 | sorted predictions, n_pred could be different for all dicts. For each prediction, 103 | only the first 3 elements [vid_name (str), st (float), ed (float),] are used, 104 | any other following elements are ignored. We leave score here for record. 105 | } 106 | ground_truth: list(dict), each dict is { 107 | "desc": str, 108 | "desc_id": int, 109 | "type": str, one of [v, t, vt] 110 | "vid_name": str 111 | "ts": [st (float), ed (float)], or list([st (float), ed (float)]), len == 4. 112 | ... 113 | } 114 | iou_thds: temporal IoU thresholds 115 | recall_topks: recall at different top k 116 | task_type: str, could be: ["VCMR", "SVMR", "VR"], see TASK_TYPES for definition. 117 | max_pred_per_query: int, only top max_pred_per_query predictions for each query are used. 118 | match_number: bool, must set to True if when do evaluation, False is only used for debug. 
119 | verbose: 120 | use_desc_type: only TVR has desc type 121 | Returns: 122 | 123 | """ 124 | assert task_type in TASK_TYPES, "task_type must be one of {}".format(list(TASK_TYPES.keys())) 125 | if verbose: 126 | print("Running evaluation with task_type {}, n results {}; n gt {}" 127 | .format(task_type, len(moment_predictions), len(ground_truth))) 128 | 129 | predictions_by_desc_id = {e["desc_id"]: e for e in moment_predictions} 130 | gt_by_desc_id = {e["desc_id"]: e for e in ground_truth} 131 | desc_type2idx = {"v": 0, "t": 1, "vt": 2} 132 | desc_types = [] # n_desc 133 | 134 | if match_number: 135 | assert set(gt_by_desc_id.keys()) == set(predictions_by_desc_id.keys()), \ 136 | "desc_ids in predictions and ground_truth must match" 137 | # assert len(set([len(e["predictions"]) for e in predictions_by_desc_id.values()])) == 1, \ 138 | # "all queries must have the same number of predictions" 139 | 140 | pred_info_matrix_collection = [] 141 | for k, gt_item in tqdm(gt_by_desc_id.items(), desc="Loop over moments", leave=False): 142 | if not match_number and k not in predictions_by_desc_id: 143 | continue 144 | pred_info_matrix = np.array( 145 | [e[:3] for e in predictions_by_desc_id[k]["predictions"]][:max_pred_per_query], 146 | dtype=np.float32) # (n_pred, 3) 147 | if use_desc_type: 148 | desc_types.append(desc_type2idx[gt_item["type"]]) 149 | vid_name_matched_pred = pred_info_matrix[:, 0] == video2idx[gt_item["vid_name"]] # bool, (n_pred, ) 150 | pred_info_matrix = np.concatenate([pred_info_matrix, vid_name_matched_pred[:, None]], axis=1) # (n_pred, 4) 151 | 152 | # add 1 + len(iou_thds) columns, iou_scores, iou_corrects for each iou_thd. 153 | iou_thd_corrects_columns = [] 154 | if len(gt_item["ts"]) >= 4: # didemo, fro all 3 splits, at least 4 ts for each, < 0.5% has more than 4. 155 | least_n_overlap = 2 # True if overlapped with at least least_n_overlap GT ts. 156 | iou_corrects_dict = defaultdict(list) 157 | for single_gt_ts in gt_item["ts"]: 158 | single_gt_ts = np.array(single_gt_ts, dtype=np.float32) # (2, ) 159 | # iou scores of the predictions that have wrong vid_name are set to 0. 160 | iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred 161 | for iou_thd in iou_thds: 162 | iou_corrects_dict[iou_thd].append(iou_scores >= iou_thd) 163 | for iou_thd in iou_thds: 164 | iou_corrects = sum(iou_corrects_dict[iou_thd]) >= least_n_overlap # bool, (n_pred, ) 165 | iou_thd_corrects_columns.append(iou_corrects[:, None]) 166 | 167 | else: # should be 2, len([st, ed]) == 2 168 | single_gt_ts = np.array(gt_item["ts"], dtype=np.float32) # (2, ) 169 | # iou scores of the predictions that have wrong vid_name are set to 0. 
170 | iou_scores = compute_temporal_iou_batch(pred_info_matrix[:, 1:3], single_gt_ts) * vid_name_matched_pred 171 | 172 | for iou_thd in iou_thds: 173 | iou_corrects = iou_scores >= iou_thd # bool, (n_pred, ) 174 | iou_thd_corrects_columns.append(iou_corrects[:, None]) 175 | 176 | pred_info_matrix = np.concatenate([pred_info_matrix, ] + iou_thd_corrects_columns, axis=1) # (n_pred, 6) 177 | pred_info_matrix_collection.append(pred_info_matrix) 178 | 179 | # column header [vid_name_idx (int), st (float), ed (float), is_vid_name_match (bool), 180 | # iou_scores>=iou_thd0 (bool), iou_scores>=iou_thd1 (bool)] 181 | pred_info_matrix_collection = pad_sequences_1d_np(pred_info_matrix_collection)[0] # (n_desc, n_pred, 6) 182 | if use_desc_type: 183 | desc_types = np.array(desc_types) # (n_desc) 184 | 185 | # results wrapper 186 | metrics = OrderedDict() 187 | metrics_by_type = OrderedDict() 188 | 189 | iou_c_offset = 4 # iou_corrects column index starts here 190 | if task_type == "VCMR": 191 | for iou_idx, iou_thd in enumerate(iou_thds): 192 | iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(np.bool) # (n_desc, n_pred) 193 | # 1) there might be more than one positive clip, so use `>= 1` 194 | for k in recall_topks: 195 | metrics["{}-r{}".format(iou_thd, k)] = \ 196 | get_rounded_percentage(np.mean(np.sum(iou_corrects[:, :k], axis=1) >= 1)) 197 | if use_desc_type: 198 | for desc_type in desc_type2idx: 199 | type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) 200 | n_desc_in_type = np.sum(type_corrects) # (n_desc) 201 | for iou_idx, iou_thd in enumerate(iou_thds): 202 | # (n_desc, n_pred) 203 | iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(np.bool) 204 | for k in recall_topks: 205 | metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( 206 | 1.0 * np.sum(np.logical_and(np.sum(iou_corrects[:, :k], axis=1) >= 1, type_corrects)) 207 | / n_desc_in_type 208 | ) 209 | elif task_type == "SVMR": 210 | vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(np.bool) # (n_desc, n_pred) 211 | n_desc = len(vid_name_matched) 212 | for iou_idx, iou_thd in enumerate(iou_thds): 213 | iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(np.bool) # (n_desc, n_pred) 214 | # 1) there might be more than one positive clip, so use `>= 1` 215 | for k in recall_topks: 216 | metrics["{}-r{}".format(iou_thd, k)] = get_rounded_percentage(np.mean( 217 | [np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 for idx in range(n_desc)] 218 | )) 219 | if use_desc_type: 220 | for desc_type in desc_type2idx: 221 | type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) 222 | n_desc_in_type = np.sum(type_corrects) # (n_desc) 223 | for iou_idx, iou_thd in enumerate(iou_thds): 224 | # (n_desc, n_pred) 225 | iou_corrects = pred_info_matrix_collection[:, :, iou_c_offset + iou_idx].astype(np.bool) 226 | # 1) there might be more than one positive clip, so use `>= 1` 227 | for k in recall_topks: 228 | metrics_by_type["{}-{}-r{}".format(desc_type, iou_thd, k)] = get_rounded_percentage( 229 | 1.0 * np.sum([np.sum(iou_corrects[idx][vid_name_matched[idx]][:k]) >= 1 and type_corrects[idx] 230 | for idx in range(n_desc)]) 231 | / n_desc_in_type) 232 | 233 | elif task_type == "VR": 234 | vid_name_matched = pred_info_matrix_collection[:, :, 3].astype(np.bool) # (n_desc, n_pred) 235 | for k in recall_topks: 236 | metrics["r{}".format(k)] = \ 237 | 
get_rounded_percentage(np.mean(np.sum(vid_name_matched[:, :k], axis=1) >= 1)) 238 | if use_desc_type: 239 | for desc_type in desc_type2idx: 240 | type_corrects = desc_types == desc_type2idx[desc_type] # (n_desc) 241 | n_desc_in_type = np.sum(type_corrects) # (n_desc) 242 | for k in recall_topks: 243 | metrics_by_type["{}-r{}".format(desc_type, k)] = get_rounded_percentage( 244 | 1.0 * np.sum(np.logical_and(np.sum(vid_name_matched[:, :k], axis=1) >= 1, type_corrects)) 245 | / n_desc_in_type) 246 | else: 247 | raise ValueError("task_type wrong.") 248 | if use_desc_type: 249 | metrics_by_type["desc_type_ratio"] = "v {} t {} vt {}"\ 250 | .format(*[get_rounded_percentage(1.0 * np.sum(desc_types == desc_type2idx[k]) / len(desc_types)) 251 | for k in ["v", "t", "vt"]]) 252 | return metrics, metrics_by_type 253 | 254 | 255 | def eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True, match_number=True, use_desc_type=True): 256 | video2idx = submission["video2idx"] 257 | submitted_task_types = [k for k in TASK_TYPES if k in submission] 258 | if verbose: 259 | print("Evaluating for task {}".format(submitted_task_types)) 260 | eval_metrics = OrderedDict() 261 | metrics_raw_dict = {} 262 | for task_type in submitted_task_types: 263 | metrics, metrics_by_type = eval_by_task_type( 264 | submission[task_type], video2idx, ground_truth, 265 | iou_thds=iou_thds, recall_topks=(1, 10, 100), # (1, 5, 10, 20, 50, 100), 266 | task_type=task_type, max_pred_per_query=100, 267 | match_number=match_number, verbose=verbose, use_desc_type=use_desc_type) 268 | metrics_raw_dict[task_type] = metrics 269 | metrics_raw_dict[task_type+"_by_type"] = metrics_by_type 270 | 271 | for task_type in submitted_task_types: 272 | eval_metrics[task_type] = metrics_raw_dict[task_type] 273 | if use_desc_type: 274 | for task_type in submitted_task_types: 275 | eval_metrics[task_type+"_by_type"] = metrics_raw_dict[task_type+"_by_type"] 276 | return eval_metrics 277 | 278 | 279 | def eval_main(): 280 | import argparse 281 | parser = argparse.ArgumentParser(description="TVR Evaluation Script") 282 | parser.add_argument("--submission_path", type=str, help="path to generated prediction file") 283 | parser.add_argument("--gt_path", type=str, help="path to GT file") 284 | parser.add_argument("--save_path", type=str, help="path to save the results") 285 | parser.add_argument("--not_verbose", action="store_true") 286 | args = parser.parse_args() 287 | 288 | verbose = not args.not_verbose 289 | submission = load_json(args.submission_path) 290 | gt = load_jsonl(args.gt_path) 291 | results = eval_retrieval(submission, gt, iou_thds=(0.5, 0.7), verbose=verbose) 292 | if verbose: 293 | print(json.dumps(results, indent=4)) 294 | 295 | with open(args.save_path, "w") as f: 296 | f.write(json.dumps(results, indent=4)) 297 | 298 | 299 | if __name__ == '__main__': 300 | eval_main() 301 | -------------------------------------------------------------------------------- /standalone_eval/eval_sample.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: bash standalone_eval/eval_sample.sh 3 | submission_path=standalone_eval/sample_val_predictions.json 4 | gt_path=data/tvr_val_release.jsonl 5 | save_path=standalone_eval/sample_val_predictions_metrics.json 6 | 7 | python standalone_eval/eval.py \ 8 | --submission_path ${submission_path} \ 9 | --gt_path ${gt_path} \ 10 | --save_path ${save_path} 11 | 
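The same metrics can also be computed programmatically, which is convenient inside a notebook; a minimal sketch using the functions from `eval.py` above (it assumes the project root is on `PYTHONPATH`, e.g. via `source setup.sh`, and uses the same paths as `eval_sample.sh`):
```
# Sketch: run the standalone evaluator from Python instead of the shell script.
import json
from standalone_eval.eval import load_json, load_jsonl, eval_retrieval

submission = load_json("standalone_eval/sample_val_predictions.json")
ground_truth = load_jsonl("data/tvr_val_release.jsonl")
metrics = eval_retrieval(submission, ground_truth, iou_thds=(0.5, 0.7), verbose=True)
print(json.dumps(metrics, indent=4))  # e.g., metrics["VCMR"]["0.7-r1"] is R@1 at IoU>=0.7
```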
-------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/26hzhang/ReLoCLNet/56cb666ce516cce9acbcfce78fb4e95d81e11e54/utils/__init__.py -------------------------------------------------------------------------------- /utils/basic_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import zipfile 4 | import numpy as np 5 | import pickle 6 | 7 | 8 | def uniform_feature_sampling(features, max_len): 9 | num_clips = features.shape[0] 10 | if max_len is None or num_clips <= max_len: 11 | return features 12 | idxs = np.arange(0, max_len + 1, 1.0) / max_len * num_clips 13 | idxs = np.round(idxs).astype(np.int32) 14 | idxs[idxs > num_clips - 1] = num_clips - 1 15 | new_features = [] 16 | for i in range(max_len): 17 | s_idx, e_idx = idxs[i], idxs[i + 1] 18 | if s_idx < e_idx: 19 | new_features.append(np.mean(features[s_idx:e_idx], axis=0)) 20 | else: 21 | new_features.append(features[s_idx]) 22 | new_features = np.asarray(new_features) 23 | return new_features 24 | 25 | 26 | def compute_overlap(pred, gt): 27 | # check format 28 | assert isinstance(pred, list) and isinstance(gt, list) 29 | pred_is_list = isinstance(pred[0], list) 30 | gt_is_list = isinstance(gt[0], list) 31 | pred = pred if pred_is_list else [pred] 32 | gt = gt if gt_is_list else [gt] 33 | # compute overlap 34 | pred, gt = np.array(pred), np.array(gt) 35 | inter_left = np.maximum(pred[:, 0, None], gt[None, :, 0]) 36 | inter_right = np.minimum(pred[:, 1, None], gt[None, :, 1]) 37 | inter = np.maximum(0.0, inter_right - inter_left) 38 | union_left = np.minimum(pred[:, 0, None], gt[None, :, 0]) 39 | union_right = np.maximum(pred[:, 1, None], gt[None, :, 1]) 40 | union = np.maximum(1e-12, union_right - union_left) 41 | overlap = 1.0 * inter / union 42 | # reformat output 43 | overlap = overlap if gt_is_list else overlap[:, 0] 44 | overlap = overlap if pred_is_list else overlap[0] 45 | return overlap 46 | 47 | 48 | def time_to_index(start_time, end_time, num_units, duration): 49 | s_times = np.arange(0, num_units).astype(np.float32) / float(num_units) * duration 50 | e_times = np.arange(1, num_units + 1).astype(np.float32) / float(num_units) * duration 51 | candidates = np.stack([np.repeat(s_times[:, None], repeats=num_units, axis=1), 52 | np.repeat(e_times[None, :], repeats=num_units, axis=0)], axis=2).reshape((-1, 2)) 53 | overlaps = compute_overlap(candidates.tolist(), [start_time, end_time]).reshape(num_units, num_units) 54 | start_index = np.argmax(overlaps) // num_units 55 | end_index = np.argmax(overlaps) % num_units 56 | return start_index, end_index 57 | 58 | 59 | def load_pickle(filename): 60 | with open(filename, "rb") as f: 61 | return pickle.load(f) 62 | 63 | 64 | def save_pickle(data, filename): 65 | with open(filename, "wb") as f: 66 | pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL) 67 | 68 | 69 | def load_json(filename): 70 | with open(filename, "r") as f: 71 | return json.load(f) 72 | 73 | 74 | def save_json(data, filename, save_pretty=False, sort_keys=False): 75 | with open(filename, "w") as f: 76 | if save_pretty: 77 | f.write(json.dumps(data, indent=4, sort_keys=sort_keys)) 78 | else: 79 | json.dump(data, f) 80 | 81 | 82 | def load_jsonl(filename): 83 | with open(filename, "r") as f: 84 | return [json.loads(l.strip("\n")) for l in f.readlines()] 85 | 86 | 87 | def save_jsonl(data, 
filename): 88 | """data is a list""" 89 | with open(filename, "w") as f: 90 | f.write("\n".join([json.dumps(e) for e in data])) 91 | 92 | 93 | def save_lines(list_of_str, filepath): 94 | with open(filepath, "w") as f: 95 | f.write("\n".join(list_of_str)) 96 | 97 | 98 | def read_lines(filepath): 99 | with open(filepath, "r") as f: 100 | return [e.strip("\n") for e in f.readlines()] 101 | 102 | 103 | def mkdirp(p): 104 | if not os.path.exists(p): 105 | os.makedirs(p) 106 | 107 | 108 | def flat_list_of_lists(l): 109 | """flatten a list of lists [[1,2], [3,4]] to [1,2,3,4]""" 110 | return [item for sublist in l for item in sublist] 111 | 112 | 113 | def convert_to_seconds(hms_time): 114 | """ convert '00:01:12' to 72 seconds. 115 | :hms_time (str): time in comma separated string, e.g. '00:01:12' 116 | :return (int): time in seconds, e.g. 72 117 | """ 118 | times = [float(t) for t in hms_time.split(":")] 119 | return times[0] * 3600 + times[1] * 60 + times[2] 120 | 121 | 122 | def get_video_name_from_url(url): 123 | return url.split("/")[-1][:-4] 124 | 125 | 126 | def merge_dicts(list_dicts): 127 | merged_dict = list_dicts[0].copy() 128 | for i in range(1, len(list_dicts)): 129 | merged_dict.update(list_dicts[i]) 130 | return merged_dict 131 | 132 | 133 | def l2_normalize_np_array(np_array, eps=1e-5): 134 | """np_array: np.ndarray, (*, D), where the last dim will be normalized""" 135 | return np_array / (np.linalg.norm(np_array, axis=-1, keepdims=True) + eps) 136 | 137 | 138 | def make_zipfile(src_dir, save_path, enclosing_dir="", exclude_dirs=None, exclude_extensions=None, 139 | exclude_dirs_substring=None): 140 | """make a zip file of root_dir, save it to save_path. 141 | exclude_paths will be excluded if it is a subdir of root_dir. 142 | An enclosing_dir is added is specified. 
143 | """ 144 | abs_src = os.path.abspath(src_dir) 145 | with zipfile.ZipFile(save_path, "w") as zf: 146 | for dirname, subdirs, files in os.walk(src_dir): 147 | if exclude_dirs is not None: 148 | for e_p in exclude_dirs: 149 | if e_p in subdirs: 150 | subdirs.remove(e_p) 151 | if exclude_dirs_substring is not None: 152 | to_rm = [] 153 | for d in subdirs: 154 | if exclude_dirs_substring in d: 155 | to_rm.append(d) 156 | for e in to_rm: 157 | subdirs.remove(e) 158 | arcname = os.path.join(enclosing_dir, dirname[len(abs_src) + 1:]) 159 | zf.write(dirname, arcname) 160 | for filename in files: 161 | if exclude_extensions is not None: 162 | if os.path.splitext(filename)[1] in exclude_extensions: 163 | continue # do not zip it 164 | absname = os.path.join(dirname, filename) 165 | arcname = os.path.join(enclosing_dir, absname[len(abs_src) + 1:]) 166 | zf.write(absname, arcname) 167 | 168 | 169 | class AverageMeter(object): 170 | """Computes and stores the average and current/max/min value""" 171 | def __init__(self): 172 | self.val = 0 173 | self.avg = 0 174 | self.sum = 0 175 | self.count = 0 176 | self.max = -1e10 177 | self.min = 1e10 178 | self.reset() 179 | 180 | def reset(self): 181 | self.val = 0 182 | self.avg = 0 183 | self.sum = 0 184 | self.count = 0 185 | self.max = -1e10 186 | self.min = 1e10 187 | 188 | def update(self, val, n=1): 189 | self.max = max(val, self.max) 190 | self.min = min(val, self.min) 191 | self.val = val 192 | self.sum += val * n 193 | self.count += n 194 | self.avg = self.sum / self.count 195 | 196 | 197 | def dissect_by_lengths(np_array, lengths, dim=0, assert_equal=True): 198 | """Dissect an array (N, D) into a list a sub-array, 199 | np_array.shape[0] == sum(lengths), Output is a list of nd arrays, singlton dimention is kept""" 200 | if assert_equal: 201 | assert len(np_array) == sum(lengths) 202 | length_indices = [0, ] 203 | for i in range(len(lengths)): 204 | length_indices.append(length_indices[i] + lengths[i]) 205 | if dim == 0: 206 | array_list = [np_array[length_indices[i]:length_indices[i+1]] for i in range(len(lengths))] 207 | elif dim == 1: 208 | array_list = [np_array[:, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] 209 | elif dim == 2: 210 | array_list = [np_array[:, :, length_indices[i]:length_indices[i + 1]] for i in range(len(lengths))] 211 | else: 212 | raise NotImplementedError 213 | return array_list 214 | 215 | 216 | def get_ratio_from_counter(counter_obj, threshold=200): 217 | keys = counter_obj.keys() 218 | values = counter_obj.values() 219 | filtered_values = [counter_obj[k] for k in keys if k > threshold] 220 | return float(sum(filtered_values)) / sum(values) 221 | 222 | 223 | def get_show_name(vid_name): 224 | """ 225 | get tvshow name from vid_name 226 | :param vid_name: video clip name 227 | :return: tvshow name 228 | """ 229 | show_list = ["friends", "met", "castle", "house", "grey"] 230 | vid_name_prefix = vid_name.split("_")[0] 231 | show_name = vid_name_prefix if vid_name_prefix in show_list else "bbt" 232 | return show_name 233 | -------------------------------------------------------------------------------- /utils/mk_video_split_with_duration.py: -------------------------------------------------------------------------------- 1 | from utils.basic_utils import load_json, save_json 2 | 3 | 4 | def combine(video_name_split_path, video_duration_path, save_path): 5 | video_name_split = load_json(video_name_split_path) 6 | video_duration_dict = load_json(video_duration_path) 7 | 8 | combined_dict = {} 9 | 
for split_name, split_video_names in video_name_split.items(): 10 | combined_dict[split_name] = {vid_name: video_duration_dict[vid_name] 11 | for vid_name in split_video_names} 12 | save_json(combined_dict, save_path) 13 | 14 | 15 | if __name__ == '__main__': 16 | import sys 17 | combine(*sys.argv[1:]) 18 | 19 | -------------------------------------------------------------------------------- /utils/model_utils.py: -------------------------------------------------------------------------------- 1 | __author__ = "Jie Lei" 2 | 3 | # ref: https://github.com/lichengunc/MAttNet/blob/master/lib/layers/lang_encoder.py#L11 4 | # ref: https://github.com/easonnie/flint/blob/master/torch_util.py#L272 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 8 | 9 | 10 | class RNNEncoder(nn.Module): 11 | """A RNN wrapper handles variable length inputs, always set batch_first=True. 12 | Supports LSTM, GRU and RNN. Tested with PyTorch 0.3 and 0.4 13 | """ 14 | def __init__(self, word_embedding_size, hidden_size, bidirectional=True, 15 | dropout_p=0, n_layers=1, rnn_type="lstm", 16 | return_hidden=True, return_outputs=True, 17 | allow_zero=False): 18 | super(RNNEncoder, self).__init__() 19 | """ 20 | :param word_embedding_size: rnn input size 21 | :param hidden_size: rnn output size 22 | :param dropout_p: between rnn layers, only useful when n_layer >= 2 23 | """ 24 | self.allow_zero = allow_zero 25 | self.rnn_type = rnn_type 26 | self.n_dirs = 2 if bidirectional else 1 27 | # - add return_hidden keyword arg to reduce computation if hidden is not needed. 28 | self.return_hidden = return_hidden 29 | self.return_outputs = return_outputs 30 | self.rnn = getattr(nn, rnn_type.upper())(word_embedding_size, hidden_size, n_layers, 31 | batch_first=True, 32 | bidirectional=bidirectional, 33 | dropout=dropout_p) 34 | 35 | def sort_batch(self, seq, lengths): 36 | sorted_lengths, perm_idx = lengths.sort(0, descending=True) 37 | if self.allow_zero: # deal with zero by change it to one. 
38 | sorted_lengths[sorted_lengths == 0] = 1 39 | reverse_indices = [0] * len(perm_idx) 40 | for i in range(len(perm_idx)): 41 | reverse_indices[perm_idx[i]] = i 42 | sorted_seq = seq[perm_idx] 43 | return sorted_seq, list(sorted_lengths), reverse_indices 44 | 45 | def forward(self, inputs, lengths): 46 | """ 47 | inputs, sorted_inputs -> (B, T, D) 48 | lengths -> (B, ) 49 | outputs -> (B, T, n_dirs * D) 50 | hidden -> (n_layers * n_dirs, B, D) -> (B, n_dirs * D) keep the last layer 51 | - add total_length in pad_packed_sequence for compatiblity with nn.DataParallel, --remove it 52 | """ 53 | assert len(inputs) == len(lengths) 54 | sorted_inputs, sorted_lengths, reverse_indices = self.sort_batch(inputs, lengths) 55 | packed_inputs = pack_padded_sequence(sorted_inputs, sorted_lengths, batch_first=True) 56 | outputs, hidden = self.rnn(packed_inputs) 57 | if self.return_outputs: 58 | # outputs, lengths = pad_packed_sequence(outputs, batch_first=True, total_length=int(max(lengths))) 59 | outputs, lengths = pad_packed_sequence(outputs, batch_first=True) 60 | outputs = outputs[reverse_indices] 61 | else: 62 | outputs = None 63 | if self.return_hidden: # 64 | if self.rnn_type.lower() == "lstm": 65 | hidden = hidden[0] 66 | hidden = hidden[-self.n_dirs:, :, :] 67 | hidden = hidden.transpose(0, 1).contiguous() 68 | hidden = hidden.view(hidden.size(0), -1) 69 | hidden = hidden[reverse_indices] 70 | else: 71 | hidden = None 72 | return outputs, hidden 73 | 74 | 75 | def pool_across_time(outputs, lengths, pool_type="max"): 76 | """ Get maximum responses from RNN outputs along time axis 77 | :param outputs: (B, T, D) 78 | :param lengths: (B, ) 79 | :param pool_type: str, 'max' or 'mean' 80 | :return: (B, D) 81 | """ 82 | if pool_type == "max": 83 | outputs = [outputs[i, :int(lengths[i]), :].max(dim=0)[0] for i in range(len(lengths))] 84 | elif pool_type == "mean": 85 | outputs = [outputs[i, :int(lengths[i]), :].mean(dim=0) for i in range(len(lengths))] 86 | else: 87 | raise NotImplementedError("Only support mean and max pooling") 88 | return torch.stack(outputs, dim=0) 89 | 90 | 91 | def count_parameters(model, verbose=True): 92 | """Count number of parameters in PyTorch model, 93 | References: https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/7. 94 | 95 | from utils.utils import count_parameters 96 | count_parameters(model) 97 | import sys 98 | sys.exit(1) 99 | """ 100 | n_all = sum(p.numel() for p in model.parameters()) 101 | n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) 102 | if verbose: 103 | print("Parameter Count: all {:,d}; trainable {:,d}".format(n_all, n_trainable)) 104 | return n_all, n_trainable 105 | 106 | -------------------------------------------------------------------------------- /utils/temporal_nms.py: -------------------------------------------------------------------------------- 1 | """ 2 | Non-Maximum Suppression for video proposals. 
3 | """ 4 | 5 | 6 | def compute_temporal_iou(pred, gt): 7 | """ deprecated due to performance concerns 8 | compute intersection-over-union along temporal axis 9 | Args: 10 | pred: [st (float), ed (float)] 11 | gt: [st (float), ed (float)] 12 | Returns: 13 | iou (float): 14 | 15 | Ref: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py 16 | """ 17 | intersection = max(0, min(pred[1], gt[1]) - max(pred[0], gt[0])) 18 | union = max(pred[1], gt[1]) - min(pred[0], gt[0]) # not the correct union though 19 | if union == 0: 20 | return 0 21 | else: 22 | return 1.0 * intersection / union 23 | 24 | 25 | def temporal_non_maximum_suppression(predictions, nms_threshold, max_after_nms=100): 26 | """ 27 | Args: 28 | predictions: list(sublist), each sublist is [st (float), ed(float), score (float)], 29 | note larger scores are better and are preserved. For metrics that are better when smaller, 30 | please convert to its negative, e.g., convert distance to negative distance. 31 | nms_threshold: float in [0, 1] 32 | max_after_nms: 33 | Returns: 34 | predictions_after_nms: list(sublist), each sublist is [st (float), ed(float), score (float)] 35 | References: 36 | https://github.com/wzmsltw/BSN-boundary-sensitive-network/blob/7b101fc5978802aa3c95ba5779eb54151c6173c6/Post_processing.py#L42 37 | """ 38 | if len(predictions) == 1: # only has one prediction, no need for nms 39 | return predictions 40 | 41 | predictions = sorted(predictions, key=lambda x: x[2], reverse=True) # descending order 42 | 43 | tstart = [e[0] for e in predictions] 44 | tend = [e[1] for e in predictions] 45 | tscore = [e[2] for e in predictions] 46 | rstart = [] 47 | rend = [] 48 | rscore = [] 49 | while len(tstart) > 1 and len(rscore) < max_after_nms: # max 100 after nms 50 | idx = 1 51 | while idx < len(tstart): # compare with every prediction in the list. 52 | if compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]]) > nms_threshold: 53 | # rm highly overlapped lower score entries. 54 | tstart.pop(idx) 55 | tend.pop(idx) 56 | tscore.pop(idx) 57 | # print("--------------------------------") 58 | # print(compute_temporal_iou([tstart[0], tend[0]], [tstart[idx], tend[idx]])) 59 | # print([tstart[0], tend[0]], [tstart[idx], tend[idx]]) 60 | # print(tstart.pop(idx), tend.pop(idx), tscore.pop(idx)) 61 | else: 62 | # move to next 63 | idx += 1 64 | rstart.append(tstart.pop(0)) 65 | rend.append(tend.pop(0)) 66 | rscore.append(tscore.pop(0)) 67 | 68 | if len(rscore) < max_after_nms and len(tstart) >= 1: # add the last, possibly empty. 69 | rstart.append(tstart.pop(0)) 70 | rend.append(tend.pop(0)) 71 | rscore.append(tscore.pop(0)) 72 | 73 | predictions_after_nms = [[st, ed, s] for s, st, ed in zip(rscore, rstart, rend)] 74 | return predictions_after_nms 75 | -------------------------------------------------------------------------------- /utils/tensor_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def pad_sequences_1d(sequences, dtype=torch.long, device=torch.device("cpu"), fixed_length=None): 6 | """ Pad a single-nested list or a sequence of n-d array (torch.tensor or np.ndarray) 7 | into a (n+1)-d array, only allow the first dim has variable lengths. 8 | Args: 9 | sequences: list(n-d tensor or list) 10 | dtype: np.dtype or torch.dtype 11 | device: 12 | fixed_length: pad all seq in sequences to fixed length. All seq should have a length <= fixed_length. 13 | return will be of shape [len(sequences), fixed_length, ...] 
14 | Returns: 15 | padded_seqs: ((n+1)-d tensor) padded with zeros 16 | mask: (2d tensor) of the same shape as the first two dims of padded_seqs, 17 | 1 indicate valid, 0 otherwise 18 | Examples: 19 | >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] 20 | >>> pad_sequences_1d(test_data_list, dtype=torch.long) 21 | >>> test_data_3d = [torch.randn(2,3,4), torch.randn(4,3,4), torch.randn(1,3,4)] 22 | >>> pad_sequences_1d(test_data_3d, dtype=torch.float) 23 | >>> test_data_list = [[1,2,3], [1,2], [3,4,7,9]] 24 | >>> pad_sequences_1d(test_data_list, dtype=np.float32) 25 | >>> test_data_3d = [np.random.randn(2,3,4), np.random.randn(4,3,4), np.random.randn(1,3,4)] 26 | >>> pad_sequences_1d(test_data_3d, dtype=np.float32) 27 | """ 28 | if isinstance(sequences[0], list): 29 | if "torch" in str(dtype): 30 | sequences = [torch.tensor(s, dtype=dtype, device=device) for s in sequences] 31 | else: 32 | sequences = [np.asarray(s, dtype=dtype) for s in sequences] 33 | 34 | extra_dims = sequences[0].shape[1:] # the extra dims should be the same for all elements 35 | lengths = [len(seq) for seq in sequences] 36 | if fixed_length is not None: 37 | max_length = fixed_length 38 | else: 39 | max_length = max(lengths) 40 | if isinstance(sequences[0], torch.Tensor): 41 | assert "torch" in str(dtype), "dtype and input type does not match" 42 | padded_seqs = torch.zeros((len(sequences), max_length) + extra_dims, dtype=dtype, device=device) 43 | mask = torch.zeros((len(sequences), max_length), dtype=torch.float32, device=device) 44 | else: # np 45 | assert "numpy" in str(dtype), "dtype and input type does not match" 46 | padded_seqs = np.zeros((len(sequences), max_length) + extra_dims, dtype=dtype) 47 | mask = np.zeros((len(sequences), max_length), dtype=np.float32) 48 | 49 | for idx, seq in enumerate(sequences): 50 | end = lengths[idx] 51 | padded_seqs[idx, :end] = seq 52 | mask[idx, :end] = 1 53 | return padded_seqs, mask # , lengths 54 | 55 | 56 | def pad_sequences_2d(sequences, dtype=torch.long): 57 | """ Pad a double-nested list or a sequence of n-d torch tensor into a (n+1)-d tensor, 58 | only allow the first two dims has variable lengths 59 | Args: 60 | sequences: list(n-d tensor or list) 61 | dtype: torch.long for word indices / torch.float (float32) for other cases 62 | Returns: 63 | Examples: 64 | >>> test_data_list = [[[1, 3, 5], [3, 7, 4, 1]], [[98, 34, 11, 89, 90], [22], [34, 56]],] 65 | >>> pad_sequences_2d(test_data_list, dtype=torch.long) # torch.Size([2, 3, 5]) 66 | >>> test_data_3d = [torch.randn(2,2,4), torch.randn(4,3,4), torch.randn(1,5,4)] 67 | >>> pad_sequences_2d(test_data_3d, dtype=torch.float) # torch.Size([2, 3, 5]) 68 | >>> test_data_3d2 = [[torch.randn(2,4), ], [torch.randn(3,4), torch.randn(5,4)]] 69 | >>> pad_sequences_2d(test_data_3d2, dtype=torch.float) # torch.Size([2, 3, 5]) 70 | # TODO add support for numpy array 71 | """ 72 | bsz = len(sequences) 73 | para_lengths = [len(seq) for seq in sequences] 74 | max_para_len = max(para_lengths) 75 | sen_lengths = [[len(word_seq) for word_seq in seq] for seq in sequences] 76 | max_sen_len = max([max(e) for e in sen_lengths]) 77 | 78 | if isinstance(sequences[0], torch.Tensor): 79 | extra_dims = sequences[0].shape[2:] 80 | elif isinstance(sequences[0][0], torch.Tensor): 81 | extra_dims = sequences[0][0].shape[1:] 82 | else: 83 | sequences = [[torch.Tensor(word_seq, dtype=dtype) for word_seq in seq] for seq in sequences] 84 | extra_dims = () 85 | 86 | padded_seqs = torch.zeros((bsz, max_para_len, max_sen_len) + extra_dims, dtype=dtype) 87 | 
mask = torch.zeros(bsz, max_para_len, max_sen_len).float() 88 | 89 | for b_i in range(bsz): 90 | for sen_i, sen_l in enumerate(sen_lengths[b_i]): 91 | padded_seqs[b_i, sen_i, :sen_l] = sequences[b_i][sen_i] 92 | mask[b_i, sen_i, :sen_l] = 1 93 | return padded_seqs, mask # , sen_lengths 94 | 95 | 96 | def find_max_triples(st_prob, ed_prob, top_n=5, prob_thd=None, tensor_type="torch"): 97 | """ Find a list of (k1, k2) where k1 < k2 with the maximum values of st_prob[k1] * ed_prob[k2] 98 | Args: 99 | st_prob (torch.Tensor or np.ndarray): (N, L) batched start_idx probabilities 100 | ed_prob (torch.Tensor or np.ndarray): (N, L) batched end_idx probabilities 101 | top_n (int): return topN pairs with highest values 102 | prob_thd (float): 103 | tensor_type: str, np or torch 104 | Returns: 105 | batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...] 106 | """ 107 | if tensor_type == "torch": 108 | st_prob, ed_prob = st_prob.data.numpy(), ed_prob.data.numpy() 109 | product = np.einsum("bm,bn->bmn", st_prob, ed_prob) 110 | # (N, L, L) the lower part becomes zeros, start_idx < ed_idx 111 | upper_product = np.triu(product, k=1) 112 | return find_max_triples_from_upper_triangle_product(upper_product, top_n=top_n, prob_thd=prob_thd) 113 | 114 | 115 | def find_max_triples_from_upper_triangle_product(upper_product, top_n=5, prob_thd=None): 116 | """ Find a list of (k1, k2) where k1 < k2 with the maximum values of p1[k1] * p2[k2] 117 | Args: 118 | upper_product (torch.Tensor or np.ndarray): (N, L, L), the lower part becomes zeros, end_idx > start_idx 119 | top_n (int): return topN pairs with highest values 120 | prob_thd (float or None): 121 | Returns: 122 | batched_sorted_triple: N * [(st_idx, ed_idx, confidence), ...] 123 | """ 124 | batched_sorted_triple = [] 125 | for idx, e in enumerate(upper_product): 126 | sorted_triple = top_n_array_2d(e, top_n=top_n) 127 | if prob_thd is not None: 128 | sorted_triple = sorted_triple[sorted_triple[2] >= prob_thd] 129 | batched_sorted_triple.append(sorted_triple) 130 | return batched_sorted_triple 131 | 132 | 133 | def top_n_array_2d(array_2d, top_n): 134 | """ Get topN indices and values of a 2d array, return a tuple of indices and their values, 135 | ranked by the value 136 | """ 137 | row_indices, column_indices = np.unravel_index(np.argsort(array_2d, axis=None), array_2d.shape) 138 | row_indices = row_indices[::-1][:top_n] 139 | column_indices = column_indices[::-1][:top_n] 140 | sorted_values = array_2d[row_indices, column_indices] 141 | return np.stack([row_indices, column_indices, sorted_values], axis=1) # (N, 3) 142 | -------------------------------------------------------------------------------- /utils/text_feature/README.md: -------------------------------------------------------------------------------- 1 | Language Model Fine-tuning and Feature Extraction 2 | ==== 3 | 4 | ### Install Dependencies 5 | 6 | The code requires installing [transformers](https://github.com/huggingface/transformers) package as well as [tensorboardX](https://github.com/lanpa/tensorboardX): 7 | ``` 8 | # install transformers 9 | git clone https://github.com/huggingface/transformers.git 10 | cd transformers 11 | git checkout e1b2949ae6cb34cc39e3934ca87423474f8c8d02 12 | pip install . 
13 | 14 | # install tensorboardX 15 | pip install tensorboardX 16 | ``` 17 | 18 | ### Language Model Fine-tuning 19 | 20 | We fine-tune the pre-trained [RoBERTa](https://arxiv.org/abs/1907.11692) base model on TVR text with the Masked Language Model (MLM) objective for 1 epoch: 21 | ``` 22 | bash utils/text_feature/train_lm_finetuning_single_sentence.sh FINETUNE_MODE OUTPUT_ROOT 23 | ``` 24 | `FINETUNE_MODE` can be `query_only`, where only query text (from the train set) is used to fine-tune the pre-trained model; 25 | this mode is used when we want to test model performance without subtitles. It can also be `sub_query`, where 26 | both subtitle and query text are used in the fine-tuning process. `OUTPUT_ROOT` is a directory used to store the 27 | fine-tuned model and extracted features. You can append an additional `--debug` flag after the command to do 28 | a fast run of the code to test your configuration before actually running fine-tuning. 29 | 30 | During fine-tuning, each query is treated as a single sequence, and each subtitle is split into max-length=256 segments, 31 | where each of the resulting segments will be treated as a single sequence. 32 | 33 | ### Feature Extraction 34 | After fine-tuning, you will get the fine-tuned model at `OUTPUT_ROOT/FINETUNE_MODE/roberta-base_tuned_model`. 35 | 36 | Extract features at token-level: 37 | ``` 38 | bash utils/text_feature/extract_single_sentence_embeddings.sh \ 39 | OUTPUT_ROOT FINETUNE_MODE EXTRACTION_MODE SAVE_FILEPATH 40 | ``` 41 | `EXTRACTION_MODE` could be `sub` or `query`, 42 | `SAVE_FILEPATH` is a `.h5` filepath that will save the extracted features. 43 | 44 | To get the tokens that correspond to these feature vectors, run 45 | ``` 46 | bash utils/text_feature/extract_single_sentence_tokens.sh \ 47 | OUTPUT_ROOT FINETUNE_MODE EXTRACTION_MODE SAVE_FILEPATH 48 | ``` 49 | `SAVE_FILEPATH` is a `.jsonl` filepath that stores the extracted tokens. 50 | This is useful if you want to visualize attentions from the attended feature vectors back to the word tokens. 51 | 52 | The extracted query features can be directly used for training our model, 53 | while subtitle features need one additional step: converting token-level features to clip-level features. 54 | Specifically, we max-pool/avg-pool the subtitle token embeddings every 1.5 seconds to get the clip-level 55 | embeddings: 56 | ``` 57 | bash utils/text_feature/convert_sub_feature_word_to_clip.sh \ 58 | POOL_TYPE CLIP_LENGTH SUB_TOKEN_H5 SUB_CLIP_H5 VID_CLIP_H5 59 | ``` 60 | `POOL_TYPE` could be `max` or `avg`, which defines how to aggregate token-level features to clip-level features. 61 | `CLIP_LENGTH` is set to 1.5 (seconds). `SUB_TOKEN_H5` is the path to the extracted subtitle token-level features. 62 | `SUB_CLIP_H5` is the path to save the aggregated subtitle clip-level features. 63 | `VID_CLIP_H5` is the path to the extracted video clip-level features, 64 | which is used to make sure each subtitle's clip-level features 65 | have the same length as its corresponding video clip-level features.
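At a high level, the conversion just buckets each subtitle sentence into the 1.5-second clips it overlaps and pools the corresponding token vectors; the toy sketch below illustrates that idea (the actual implementation is `convert_sub_feature_word_to_clip.py` below, and all array names here are made up).
```
# Toy sketch of the word-to-clip aggregation idea (see
# convert_sub_feature_word_to_clip.py for the real implementation).
import numpy as np


def tokens_to_clip_features(token_feats, sen_token_lengths, sen_times, n_clips,
                            clip_length=1.5, pool_type="max"):
    """token_feats: (n_tokens, d) token features of all sentences, concatenated in order.
    sen_token_lengths: list of token counts per sentence (sums to n_tokens).
    sen_times: list of (start_sec, end_sec) per sentence.
    n_clips: taken from the video clip-level features so the lengths match."""
    np_pool_func = np.max if pool_type == "max" else np.mean
    offsets = np.concatenate([[0], np.cumsum(sen_token_lengths)])
    clip_features = np.zeros((n_clips, token_feats.shape[1]), dtype=np.float32)
    for clip_idx in range(n_clips):
        clip_st, clip_ed = clip_idx * clip_length, (clip_idx + 1) * clip_length
        # sentences overlapping [clip_st, clip_ed) contribute their tokens
        sen_indices = [i for i, (st, ed) in enumerate(sen_times) if st < clip_ed and ed > clip_st]
        if sen_indices:
            words = np.concatenate([token_feats[offsets[i]:offsets[i + 1]] for i in sen_indices])
            clip_features[clip_idx] = np_pool_func(words, axis=0)
    return clip_features
```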
66 | 67 | 68 | -------------------------------------------------------------------------------- /utils/text_feature/convert_sub_feature_word_to_clip.py: -------------------------------------------------------------------------------- 1 | import os 2 | import h5py 3 | import numpy as np 4 | from tqdm import tqdm 5 | from collections import Counter 6 | from utils.basic_utils import flat_list_of_lists, load_jsonl, save_json, load_json 7 | 8 | 9 | def process_single_vid_sub(sub_listdicts, clip_length): 10 | """ 11 | Args: 12 | sub_listdicts: list(dicts), each dict is, e.g., 13 | {'text': " Chase : That's all this is?", 'start': 0.862, 'end': 1.862} 14 | clip_length: float 15 | Returns: 16 | clip_idx2sentence_indices: dict, {clip_idx: [sen_idx1, sen_idx2, ...]}, which sentences are 17 | associated with which clips. The indices are in ascending order, i.e., sen_idx1 < sen_idx2 < ... 18 | """ 19 | timestamps = np.array([[e["start"], e["end"]] for e in sub_listdicts], dtype=np.float32) # (n_sub_sen, 2) 20 | timestamps = timestamps / clip_length 21 | # r-th row of clip_indices is [st_idx, ed_idx), where [st_idx, st_idx+1, ..., ed_idx-1] 22 | # should be with r-th clip, which is [r*clip_length, (r+1)*clip_length] 23 | sentence2clip_st_ed = np.empty_like(timestamps, dtype=np.int) 24 | sentence2clip_st_ed[:, 0] = np.floor(timestamps[:, 0]) 25 | sentence2clip_st_ed[:, 1] = np.ceil(timestamps[:, 1]) 26 | sentence_idx2clip_indices = {sen_idx: set(range(clip_st_idx, clip_ed_idx)) 27 | for sen_idx, (clip_st_idx, clip_ed_idx) in enumerate(sentence2clip_st_ed)} 28 | all_clip_indices = set(flat_list_of_lists(list(sentence_idx2clip_indices.values()))) 29 | clip_idx2sentence_indices = \ 30 | {str(clip_idx): sorted([k for k, v in sentence_idx2clip_indices.items() if clip_idx in v]) 31 | for clip_idx in all_clip_indices} 32 | return clip_idx2sentence_indices 33 | 34 | 35 | def load_process_sub_meta(sub_meta_path, clip_length): 36 | """ which subtitle sentences should be assigned to which clips 37 | Args: 38 | sub_meta_path: contains a jsonl file, each line is a dict {"vid_name": str, "sub": list(dicts)}, 39 | each dict under "sub" is, e.g., {'text': " Chase : That's all this is?", 'start': 0.862, 'end': 1.862}. 40 | The dicts under "sub" are ordered the same as the original .srt files. 
41 | clip_length: float, assign each subtitle sentence to a clip segment 42 | Returns: 43 | """ 44 | video2sub = {e["vid_name"]: e for e in load_jsonl(sub_meta_path)} 45 | for vid_name, sub_info in tqdm(video2sub.items(), desc="processing subtitles"): 46 | sub_info["clip2sen"] = process_single_vid_sub(sub_info["sub"], clip_length) 47 | video2sub[vid_name] = sub_info 48 | return video2sub 49 | 50 | 51 | def convert_h5(sub_words_h5, vid_clip_h5, sub_clip_h5, video2sub_info, pool_type="max", debug=False): 52 | assert pool_type in ["max", "avg"] 53 | np_pool_func = np.max if pool_type == "max" else np.mean 54 | debug_cnt = 0 55 | not_equal_cnt = [] 56 | skip_cnt = 0 57 | for k in tqdm(sub_words_h5.keys(), desc="Converting to clip features"): 58 | if "-lengths" in k: 59 | continue 60 | sub_words_features = sub_words_h5[k] 61 | sub_sen_lengths = sub_words_h5[k + "-lengths"] 62 | num_sens = len(sub_sen_lengths) 63 | clip2sen = video2sub_info[k]["clip2sen"] 64 | 65 | if len(sub_sen_lengths) != len(video2sub_info[k]["sub"]): 66 | not_equal_cnt.append(len(video2sub_info[k]["sub"]) - len(sub_sen_lengths)) 67 | 68 | length_indices = [0, ] 69 | for i in range(len(sub_sen_lengths)): 70 | length_indices.append(length_indices[i] + sub_sen_lengths[i]) 71 | 72 | n_clips = len(vid_clip_h5[k]) 73 | clip_features = np.zeros((n_clips, sub_words_features.shape[-1]), dtype=np.float32) 74 | clip_mask = np.zeros(n_clips, dtype=np.float32) 75 | for clip_idx in range(n_clips): 76 | if str(clip_idx) in clip2sen: 77 | # the sen_indices tells which sentences belong to this clip, 78 | # e.g., [1, 2, 3] mean we should get [1, 4) to include all the indicated sentences 79 | sen_indices = [min(e, num_sens-1) for e in clip2sen[str(clip_idx)]] 80 | word_st_idx = length_indices[sen_indices[0]] 81 | word_ed_idx = length_indices[sen_indices[-1] + 1] 82 | if word_st_idx == word_ed_idx: 83 | skip_cnt += 1 84 | continue 85 | clip_features[clip_idx] = np_pool_func(sub_words_features[word_st_idx:word_ed_idx], axis=0) 86 | clip_mask[clip_idx] = 1 87 | sub_clip_h5.create_dataset(k, data=clip_features, dtype=np.float32) 88 | sub_clip_h5.create_dataset(k + "-mask", data=clip_mask, dtype=np.float32) 89 | debug_cnt += 1 90 | if debug and debug_cnt == 5: 91 | break 92 | print("skip_cnt {}".format(skip_cnt)) 93 | print("Counter not_equal_cnt {}".format(Counter(not_equal_cnt).most_common())) 94 | # Counter not_equal_cnt [(1, 150), (2, 7), (4, 1)] for clip_length==1.5 95 | 96 | 97 | def main_convert(): 98 | import argparse 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument("--src_h5_file", type=str, help="subtitle words level feature .h5 file") 101 | parser.add_argument("--vid_clip_h5_file", type=str, help="video clip level feature .h5 file") 102 | parser.add_argument("--sub_meta_path", type=str, help="processed subtitle .jsonl path") 103 | parser.add_argument("--tgt_h5_file", type=str, help=".h5 path to stores the converted data") 104 | parser.add_argument("--pool_type", type=str, default="max", 105 | choices=["max", "avg"], help="how to aggreate frame features") 106 | parser.add_argument("--clip_length", type=float, default=1.5) 107 | parser.add_argument("--debug", action="store_true") 108 | args = parser.parse_args() 109 | 110 | sub_info_cache_path = args.tgt_h5_file.replace(".h5", "_sub_info.json") 111 | if not os.path.exists(sub_info_cache_path): 112 | video2sub_info = load_process_sub_meta(args.sub_meta_path, clip_length=args.clip_length) 113 | save_json(video2sub_info, sub_info_cache_path) 114 | else: 115 | video2sub_info 
= load_json(sub_info_cache_path) 116 | with h5py.File(args.src_h5_file, "r") as src_h5: 117 | with h5py.File(args.vid_clip_h5_file, "r") as vid_clip_h5: 118 | with h5py.File(args.tgt_h5_file, "w") as tgt_h5: 119 | convert_h5(src_h5, vid_clip_h5, tgt_h5, video2sub_info, 120 | pool_type=args.pool_type, debug=args.debug) 121 | 122 | 123 | if __name__ == '__main__': 124 | main_convert() 125 | -------------------------------------------------------------------------------- /utils/text_feature/convert_sub_feature_word_to_clip.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/text_feature/convert_sub_feature_word_to_clip.sh POOL_TYPE CLIP_LENGTH [--debug] 4 | 5 | pool_type=$1 # [max, avg] 6 | clip_length=$2 7 | sub_token_h5_file=$3 8 | sub_clip_h5_file=$4 9 | vid_clip_h5_file=$5 # .h5 file stores the clip-level video features, to make sure subtitle clip-level features have the same length as the video features. 10 | sub_meta_path=data/tvqa_preprocessed_subtitles.jsonl 11 | 12 | python utils/text_feature/convert_sub_feature_word_to_clip.py \ 13 | --pool_type ${pool_type} \ 14 | --clip_length ${clip_length} \ 15 | --src_h5_file ${sub_token_h5_file} \ 16 | --tgt_h5_file ${sub_clip_h5_file} \ 17 | --sub_meta_path ${sub_meta_path} \ 18 | --vid_clip_h5_file ${vid_clip_h5_file} \ 19 | ${@:3} 20 | -------------------------------------------------------------------------------- /utils/text_feature/extract_single_sentence_embeddings.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/text_feature/extract_single_sentence_embeddings.sh \ 4 | # OUTPUT_ROOT FINETUNE_MODE EXTRACTION_MODE SAVE_FILEPATH 5 | # Examples: 6 | # bash utils/text_feature/extract_single_sentence_embeddings.sh ${output_root} sub_query sub tvr_sub_pretrained_w_sub_query.h5 --debug 7 | # bash utils/text_feature/extract_single_sentence_embeddings.sh ${output_root} sub_query query tvr_query_pretrained_w_sub_query.h5 --debug 8 | output_root=$1 9 | finetune_mode=$2 # sub_query or query_only 10 | extraction_mode=$3 # sub or query 11 | extracted_file_name=$4 # tvr_query_pretrained_w_sub_query.h5, will be saved at output_dir 12 | 13 | data_root="data" 14 | train_data_file="${data_root}/tvr_train_release.jsonl" 15 | val_data_file="${data_root}/tvr_val_release.jsonl" 16 | test_data_file1="${data_root}/tvr_test_public_release.jsonl" 17 | sub_data_file="${data_root}/tvqa_preprocessed_subtitles.jsonl" 18 | 19 | ="/net/bvisionserver14/playpen-ssd/jielei/data/tvr/bert_feature" 20 | output_dir="${output_root}/${finetune_mode}" 21 | model_type="roberta" 22 | model_name_or_path="${output_dir}/roberta-base_tuned_model" 23 | 24 | 25 | if [[ ${extraction_mode} == query ]]; then 26 | max_length=30 27 | extra_args=(--train_data_file) 28 | extra_args+=(${train_data_file}) 29 | extra_args+=(${val_data_file}) 30 | extra_args+=(${test_data_file1}) 31 | elif [[ ${extraction_mode} == sub ]]; then 32 | max_length=256 33 | extra_args=(--use_sub) 34 | extra_args+=(--sub_data_file) 35 | extra_args+=(${sub_data_file}) 36 | fi 37 | 38 | python utils/text_feature/lm_finetuning_on_single_sentences.py \ 39 | --output_dir ${output_dir} \ 40 | --model_type ${model_type} \ 41 | --model_name_or_path ${model_name_or_path} \ 42 | --do_extract \ 43 | --extracted_file_name ${extracted_file_name} \ 44 | --block_size ${max_length} \ 45 | ${extra_args[@]} \ 46 | ${@:5} 47 | 
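As a quick sanity check on an extracted `.h5` feature file, the keys and feature shapes can be listed with `h5py`; a small sketch (for subtitle features, the `<vid_name>` / `<vid_name>-lengths` layout is the one consumed by `convert_sub_feature_word_to_clip.py`):
```
# Sketch: inspect an extracted feature .h5 file. For subtitle features each video
# has a "<vid_name>" dataset of shape (n_tokens, hidden_dim) plus a
# "<vid_name>-lengths" dataset holding per-sentence token counts.
import h5py


def inspect_h5(h5_path, max_items=5):
    with h5py.File(h5_path, "r") as f:
        for i, key in enumerate(f.keys()):
            print(key, f[key].shape, f[key].dtype)
            if i + 1 >= max_items:
                break


# e.g., inspect_h5("tvr_sub_pretrained_w_sub_query.h5")
```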
-------------------------------------------------------------------------------- /utils/text_feature/extract_single_sentence_tokens.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/text_feature/extract_single_sentence_tokens.sh \ 4 | # OUTPUT_ROOT FINETUNE_MODE EXTRACTION_MODE SAVE_FILEPATH 5 | # Examples: 6 | # bash utils/text_feature/extract_single_sentence_tokens.sh ${output_root} query query tvr_query_roberta_tokenized.jsonl --debug 7 | output_root=$1 8 | finetune_mode=$2 # sub_query or query_only 9 | extraction_mode=$3 # sub or query 10 | extracted_file_name=$4 # "*jsonl" file 11 | 12 | 13 | 14 | data_root="data" 15 | train_data_file="${data_root}/tvr_train_release.jsonl" 16 | val_data_file="${data_root}/tvr_val_release.jsonl" 17 | test_data_file1="${data_root}/tvr_test_public_release.jsonl" 18 | test_data_file2="${data_root}/tvr_test_challenge_release.jsonl" 19 | sub_data_file="${data_root}/tvqa_preprocessed_subtitles.jsonl" 20 | 21 | output_dir="${output_root}/${finetune_mode}" 22 | model_type="roberta" 23 | model_name_or_path="${output_dir}/roberta-base_tuned_model" 24 | 25 | 26 | if [[ ${extraction_mode} == query ]]; then 27 | max_length=30 28 | extra_args=(--train_data_file) 29 | extra_args+=(${train_data_file}) 30 | extra_args+=(${val_data_file}) 31 | extra_args+=(${test_data_file1}) 32 | extra_args+=(${test_data_file2}) 33 | #elif [[ ${extraction_mode} == sub ]]; then 34 | # max_length=256 35 | # extra_args=(--use_sub) 36 | # extra_args+=(--sub_data_file) 37 | # extra_args+=(${sub_data_file}) 38 | fi 39 | 40 | python utils/text_feature/lm_finetuning_on_single_sentences.py \ 41 | --output_dir ${output_dir} \ 42 | --model_type ${model_type} \ 43 | --model_name_or_path ${model_name_or_path} \ 44 | --do_tokenize \ 45 | --extracted_file_name ${extracted_file_name} \ 46 | ${extra_args[@]} \ 47 | ${@:5} 48 | -------------------------------------------------------------------------------- /utils/text_feature/preprocess_subtitles.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running basic pre-processing for the .srt subtitle files from 3 | http://tvqa.cs.unc.edu/download_tvqa.html#tvqa-download-2. 4 | """ 5 | import re 6 | import os 7 | import pysrt 8 | import glob 9 | from tqdm import tqdm 10 | from utils.basic_utils import save_jsonl 11 | 12 | 13 | def convert_sub_time_to_seconds(sub_time): 14 | """sub_time is a SubRipTime object defined by pysrt""" 15 | return 60 * sub_time.minutes + sub_time.seconds + 0.001 * sub_time.milliseconds 16 | 17 | 18 | def clean_single_sub_sentence(sub_sentence): 19 | """sub_sentence: str, """ 20 | sub_sentence = sub_sentence.replace("\n", " ") 21 | sub_sentence = sub_sentence.replace("(", " ") 22 | sub_sentence = sub_sentence.replace(")", " ") 23 | sub_sentence = sub_sentence.replace(":", " : ") 24 | sub_sentence = re.sub(r"\s{2,}", " ", sub_sentence) 25 | return sub_sentence 26 | 27 | 28 | def preprocess_subtitles_from_dir(srt_dir, save_path): 29 | """ 30 | return: A python dict, the keys are the video names, the entries are lists, 31 | each contains all the text from a .srt file 32 | sub_times are the start time of the sentences. 33 | """ 34 | assert not os.path.exists(save_path), "File {} already exists".format(save_path) 35 | 36 | print("Loading srt files from %s ..." 
% srt_dir)
37 |     srt_paths = glob.glob(os.path.join(srt_dir, "*.srt"))
38 |     srt_datalist = []
39 |     for sub_path in tqdm(srt_paths, desc="Loop over subtitle files"):
40 |         subs = pysrt.open(sub_path, encoding="iso-8859-1")
41 |         if len(subs) == 0:
42 |             subs = pysrt.open(sub_path)
43 | 
44 |         sub_data = []
45 |         for cur_sub in subs:
46 |             sub_data.append(dict(
47 |                 text=clean_single_sub_sentence(cur_sub.text),
48 |                 start=convert_sub_time_to_seconds(cur_sub.start),
49 |                 end=convert_sub_time_to_seconds(cur_sub.end)
50 |             ))
51 | 
52 |         srt_datalist.append(dict(
53 |             vid_name=os.path.splitext(os.path.basename(sub_path))[0],
54 |             sub=sub_data
55 |         ))
56 |     save_jsonl(srt_datalist, save_path)
57 | 
58 | 
59 | if __name__ == '__main__':
60 |     import argparse
61 |     parser = argparse.ArgumentParser()
62 |     parser.add_argument("-srt_dir", type=str,
63 |                         help="path to the dir containing all the TVQA subtitle .srt files")
64 |     parser.add_argument("-save_path", type=str, help="path to save the preprocessed subtitles")
65 |     args = parser.parse_args()
66 | 
67 |     preprocess_subtitles_from_dir(args.srt_dir, args.save_path)
68 | 
--------------------------------------------------------------------------------
/utils/text_feature/train_lm_finetuning_single_sentence.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Usage: bash utils/text_feature/train_lm_finetuning_single_sentence.sh FINETUNE_MODE OUTPUT_ROOT
3 | finetune_mode=$1 # [query_only, sub_query]
4 | output_root=$2 # path to store the generated output
5 | data_root="data"
6 | train_data_file="${data_root}/tvr_train_release.jsonl"
7 | sub_data_file="${data_root}/tvqa_preprocessed_subtitles.jsonl"
8 | model_type="roberta"
9 | model_name_or_path="roberta-base"
10 | 
11 | num_train_epochs=1
12 | output_dir="${output_root}/${finetune_mode}/roberta-base_tuned_model"
13 | 
14 | if [[ ${finetune_mode} == query_only ]]; then
15 |     max_length=32
16 |     gradient_accumulation_steps=1
17 | 
18 |     extra_args=()
19 | elif [[ ${finetune_mode} == sub_query ]]; then
20 |     max_length=256 # since sub is longer
21 |     gradient_accumulation_steps=4
22 | 
23 |     extra_args=(--use_sub)
24 |     extra_args+=(--sub_data_file)
25 |     extra_args+=(${sub_data_file})
26 | fi
27 | 
28 | python utils/text_feature/lm_finetuning_on_single_sentences.py \
29 | --output_dir ${output_dir} \
30 | --model_type ${model_type} \
31 | --model_name_or_path ${model_name_or_path} \
32 | --do_train \
33 | --train_data_file ${train_data_file} \
34 | --gradient_accumulation_steps ${gradient_accumulation_steps} \
35 | --block_size ${max_length} \
36 | --mlm \
37 | --num_train_epochs ${num_train_epochs} \
38 | ${extra_args[@]} \
39 | ${@:3}
40 | 
--------------------------------------------------------------------------------
/utils/video_feature/README.md:
--------------------------------------------------------------------------------
1 | ### video feature extraction
2 | 
3 | #### I3D feature extraction requirements:
4 | - tensorflow-gpu==1.14
5 | - dm-sonnet-gpu==1.32
6 | - opencv-python
7 | 
8 | #### ResNet-152 feature extraction requirements:
9 | - PyTorch
10 | - Torchvision
11 | 
12 | Note that the video features released in
13 | [tvr_feature_release.tar.gz](https://drive.google.com/file/d/1j4mVkXjKCgafW3ReNjZ2Rk6CKx0Fk_n5/view?usp=sharing)
14 | are extracted from 15 FPS frames, which are not publicly available
15 | (we only released [3 FPS frames](http://tvqa.cs.unc.edu/download_tvqa.html#tvqa-download-4)).
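
#### Typical extraction pipeline

The scripts in this folder hard-code `feature_root` / `image_root` paths on the authors' machines and expect some
intermediate files to be placed in sub-folders such as `frm_level_resnet152/` and `i3d_featrues_by_show/`; adapt these
before running. The sketch below shows the order in which the scripts are intended to be chained, inferred from the
scripts themselves rather than an official recipe (`1.5` is the default `clip_length` of the python extractors):

```bash
# 1) per-show frame-level ResNet-152 and clip-level I3D features
for show_name in bbt friends grey house met castle; do
    bash utils/video_feature/extract_resnet152_2048_features.sh ${show_name}
    bash utils/video_feature/extract_i3d_features.sh ${show_name} 1.5
done
# 2) max-pool the frame-level ResNet-152 features into clip-level features
bash utils/video_feature/convert_feature_frm_to_clip.sh 1.5
# 3) merge the per-show I3D features and align their lengths with the ResNet-152 clips
bash utils/video_feature/merge_align_i3d.sh 1.5
# 4) L2-normalize both feature sets and concatenate them into a single .h5 file
bash utils/video_feature/normalize_and_concat.sh 1.5
```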
16 | -------------------------------------------------------------------------------- /utils/video_feature/convert_feature_frm_to_clip.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert frame level (FPS1) features for videos to clip level (FPS2) features, by pooling across multiple frames. 3 | 4 | FeaturePerSecond (FPS): FPS1 > FPS2. 5 | """ 6 | import os 7 | import h5py 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | 12 | def convert_for_single_h5(frm_h5, clip_h5, clip_boundaries_in_frm_idx, pool_type="max", debug=False): 13 | """ 14 | Args: 15 | frm_h5: h5py.File object, containing the frame level features 16 | clip_h5: h5py.File object, containing the clip level features 17 | clip_boundaries_in_frm_idx: list, features belong to clip `clip_idx` should be indexed as 18 | features[clip_boundaries_in_frm_idx[clip_idx]:clip_boundaries_in_frm_idx[clip_idx+1]] 19 | pool_type: max or avg 20 | debug: 21 | Returns: 22 | 23 | """ 24 | assert pool_type in ["max", "avg"] 25 | np_pool_func = np.max if pool_type == "max" else np.mean 26 | for k in tqdm(frm_h5.keys()): 27 | frm_features = frm_h5[k] 28 | clip_features = [] 29 | for idx in range(len(clip_boundaries_in_frm_idx)): 30 | cur_clip_feat = frm_features[clip_boundaries_in_frm_idx[idx]:clip_boundaries_in_frm_idx[idx+1]] 31 | if len(cur_clip_feat) == 0: 32 | break 33 | cur_clip_feat = np_pool_func(cur_clip_feat, axis=0, keepdims=True) 34 | clip_features.append(cur_clip_feat) 35 | clip_h5.create_dataset(k, data=np.concatenate(clip_features, axis=0), dtype=np.float32) 36 | if debug: 37 | break 38 | 39 | 40 | def get_clip2frm_idx_mapping(clip_length=1.5, max_video_length=300): 41 | """ This function depends on how the features are extracted. 42 | original features are extract from frames (video fps=30): 43 | [3, 13, 23] frame in a second. 44 | Args: 45 | clip_length: float, 46 | max_video_length: int, 47 | 48 | Returns: 49 | {clip_idx1 (int): [frm_idx0, frm_idx1, ...], 50 | ... 51 | } 52 | """ 53 | # frame 0 in the feature is actually the frame 3 in the original video, so its 54 | # corresponding time is 3 / 30 = 0.1s. More generally ==> [0.1, 0.43, 0.77] + n. 55 | frm2seconds = np.concatenate([ 56 | np.array([3, 13, 23]) / 30. + offset for offset in np.arange(0, max_video_length)], axis=0) 57 | 58 | clip_boundaries = np.arange(0, max_video_length, clip_length) 59 | # no need to worry about search boundary. 
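    # Worked example (clip_length=1.5): frm2seconds = [0.1, 0.43, 0.77, 1.1, 1.43, 1.77, ...] and
    # clip_boundaries = [0.0, 1.5, 3.0, ...], so np.searchsorted returns [0, 5, 9, ...], i.e. clip 0
    # covers feature rows 0:5 and clip 1 covers rows 5:9.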
60 |     # indexed as clip_boundaries_in_frm_idx[idx]:clip_boundaries_in_frm_idx[idx+1]
61 |     clip_boundaries_in_frm_idx = np.searchsorted(frm2seconds, clip_boundaries)
62 |     return clip_boundaries_in_frm_idx
63 | 
64 | 
65 | def main_convert():
66 |     import argparse
67 |     parser = argparse.ArgumentParser()
68 |     parser.add_argument("--src_h5_files", type=str, nargs='+', help="frm .h5 file paths")
69 |     parser.add_argument("--tgt_h5_file", type=str, help=".h5 path to store the converted data")
70 |     parser.add_argument("--pool_type", type=str, default="max",
71 |                         choices=["max", "avg"], help="how to aggregate frame features")
72 |     parser.add_argument("--clip_length", type=float, default=1.5)
73 |     parser.add_argument("--debug", action="store_true")
74 |     args = parser.parse_args()
75 | 
76 |     clip_boundaries_in_frm_idx = get_clip2frm_idx_mapping(clip_length=args.clip_length)
77 |     assert not os.path.exists(args.tgt_h5_file)
78 |     with h5py.File(args.tgt_h5_file, "a") as tgt_h5:
79 |         for src_f in args.src_h5_files:
80 |             with h5py.File(src_f, "r") as src_h5:
81 |                 convert_for_single_h5(src_h5, tgt_h5, clip_boundaries_in_frm_idx,
82 |                                       pool_type=args.pool_type, debug=args.debug)
83 | 
84 | 
85 | if __name__ == '__main__':
86 |     main_convert()
87 | 
--------------------------------------------------------------------------------
/utils/video_feature/convert_feature_frm_to_clip.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Usage:
3 | # bash utils/video_feature/convert_feature_frm_to_clip.sh [clip_length] ANY_OTHER_PYTHON_ARGS
4 | clip_length=${1}
5 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature
6 | src_h5_files=()
7 | for show_name in bbt friends grey house met castle
8 | do
9 |     cur_src_h5_file=${feature_root}/frm_level_resnet152/tvr_${show_name}_resnet152_3fps.h5
10 |     src_h5_files+=(${cur_src_h5_file})
11 | done
12 | echo "Running with src_h5_files ${src_h5_files[@]}"
13 | 
14 | pool_type=max
15 | tgt_h5_file=${feature_root}/tvr_resnet152_rgb_${pool_type}_cl-${clip_length}.h5
16 | 
17 | python utils/video_feature/convert_feature_frm_to_clip.py \
18 | --src_h5_files ${src_h5_files[@]} \
19 | --tgt_h5_file ${tgt_h5_file} \
20 | --pool_type ${pool_type} \
21 | --clip_length ${clip_length} \
22 | ${@:2}
23 | 
--------------------------------------------------------------------------------
/utils/video_feature/extract_i3d_features.py:
--------------------------------------------------------------------------------
1 | """Extract I3D RGB/Flow features (ActivityNet-style).
2 | Modified from [1] and [2]
3 | [1] https://github.com/deepmind/kinetics-i3d/blob/master/evaluate_sample.py
4 | [2] https://github.com/tensorflow/hub/blob/master/examples/colab/action_recognition_with_tf_hub.ipynb
5 | 
6 | Model Notes:
7 | For model performance on Kinetics-400, please see the repository. In a nutshell,
8 | 1) imagenet_pretrained models are better than scratch models
9 | 2) RGB models are better than Flow models
10 | 
11 | Dataset Notes:
12 | 1) Kinetics-400 has 400 classes, each with at least 400 video clips,
13 | 2) Kinetics-600 has 600 classes, each with at least 600 video clips.
14 | 
15 | Please find any missing files/resources/info from https://github.com/deepmind/kinetics-i3d.
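
Example invocation (mirroring utils/video_feature/extract_i3d_features.sh; the paths below are placeholders):
    python utils/video_feature/extract_i3d_features.py \
        --eval_type rgb600 \
        --batch_size 60 \
        --base_dir /path/to/frames \
        --feature_file /path/to/tvr_SHOW_i3d_rgb600_avg_cl-1.5.h5 \
        --cache_file cache/tvr_SHOW_vid_all_frm_pairs.pkl \
        --clip_length 1.5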
16 | """ 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import cv2 23 | import json 24 | import math 25 | import argparse 26 | import numpy as np 27 | import tensorflow as tf 28 | from tqdm import tqdm 29 | import time 30 | import utils.video_feature.i3d as i3d 31 | import h5py 32 | from multiprocessing import Pool 33 | 34 | from utils.basic_utils import save_lines, read_lines, load_pickle, save_pickle 35 | 36 | _IMAGE_SIZE = 224 37 | MIN_N_FRAMES = 9 38 | CLIP2N_FRAMES = { # fps ==15 39 | 1: 15, 40 | 1.5: 23 # evenly separated every 3 seconds, 41 | } 42 | 43 | 44 | _KINETICS_HOME = "/net/bvisionserver4/playpen10/jielei/tools/VideoFeatureExtraction/kinetics-i3d" 45 | _CHECKPOINT_PATHS = { 46 | "rgb": os.path.join(_KINETICS_HOME, "data/checkpoints/rgb_scratch/model.ckpt"), 47 | "rgb600": os.path.join(_KINETICS_HOME, "data/checkpoints/rgb_scratch_kin600/model.ckpt"), 48 | "flow": os.path.join(_KINETICS_HOME, "data/checkpoints/flow_scratch/model.ckpt"), 49 | "rgb_imagenet": os.path.join(_KINETICS_HOME, "data/checkpoints/rgb_imagenet/model.ckpt"), 50 | "flow_imagenet": os.path.join(_KINETICS_HOME, "data/checkpoints/flow_imagenet/model.ckpt"), 51 | } 52 | 53 | _LABEL_MAP_PATH = os.path.join(_KINETICS_HOME, "data/label_map.txt") 54 | _LABEL_MAP_PATH_600 = os.path.join(_KINETICS_HOME, "data/label_map_600.txt") 55 | 56 | 57 | def crop_center_square(frame): 58 | y, x = frame.shape[0:2] 59 | min_dim = min(y, x) 60 | start_x = (x // 2) - (min_dim // 2) 61 | start_y = (y // 2) - (min_dim // 2) 62 | return frame[start_y:start_y + min_dim, start_x:start_x + min_dim] 63 | 64 | 65 | def process_single_image(image_path, resize=(224, 224)): 66 | img = cv2.imread(image_path) # BGR image 67 | img = crop_center_square(img) 68 | img = cv2.resize(img, resize) 69 | return int(image_path.split("/")[-1].split(".")[0][-5:]), img[:, :, [2, 1, 0]] 70 | 71 | 72 | def process_images(multi_pool, image_paths): 73 | pairs = multi_pool.imap_unordered(process_single_image, image_paths) 74 | pairs = sorted(pairs, key=lambda x: x[0]) 75 | imgs = [e[1] for e in pairs] 76 | return np.array(imgs) / 255.0 77 | 78 | 79 | def mk_divisible(array, divisor): 80 | """array: (N x _IMAGE_SIZE x _IMAGE_SIZE x 3) 81 | append N to make it divisible by 82 | """ 83 | raw_length = len(array) 84 | residual = raw_length % divisor 85 | if residual != 0: 86 | if raw_length < divisor - residual: 87 | array = np.concatenate([array] + [array] * (int((divisor - residual) / raw_length) + 1))[-divisor:] 88 | else: 89 | array = np.concatenate([array, array[-int(divisor-residual):]], axis=0) 90 | return array 91 | 92 | 93 | def mk_batch(images_array, batch_size, clip_length=1.5): 94 | """images_array: N x _IMAGE_SIZE x _IMAGE_SIZE x 3 95 | return [B x _N_FRAMES x _IMAGE_SIZE x _IMAGE_SIZE x 3, ] (B <= batch_size) 96 | """ 97 | assert clip_length in CLIP2N_FRAMES 98 | n_frm = CLIP2N_FRAMES[clip_length] 99 | 100 | if clip_length == 1: 101 | n_frm = 15 102 | images_array = mk_divisible(images_array, n_frm) 103 | elif clip_length == 1.5: 104 | n_frm = 23 # math.ceil(45 / 2) 105 | n_frm_3_secs = 45 106 | clipwise_image_array = [] 107 | for idx in range(math.ceil(len(images_array)/n_frm_3_secs)): 108 | clipwise_image_array.append(images_array[idx * n_frm_3_secs: idx * n_frm_3_secs + n_frm]) 109 | clipwise_image_array.append(images_array[(idx+1) * n_frm_3_secs - n_frm: (idx+1) * n_frm_3_secs]) 110 | images_array = np.concatenate( 111 | [mk_divisible(e, n_frm) for 
e in clipwise_image_array if len(e) > 0], axis=0) 112 | 113 | images_array = images_array.reshape(-1, n_frm, _IMAGE_SIZE, _IMAGE_SIZE, 3) 114 | n_clips = len(images_array) 115 | if n_clips > batch_size: 116 | batches = [images_array[idx * batch_size:(idx + 1) * batch_size] for idx in 117 | range(int(n_clips / batch_size) + 1)] 118 | if len(batches[-1]) == 0: # when n_clips / batch_size is an integer 119 | del batches[-1] 120 | return batches 121 | else: 122 | return [images_array] 123 | 124 | 125 | def get_image_paths(dir_path, image_filename_pattern="img_{:05d}.jpg"): 126 | """each dir contains the same number of flow_x_{:05d}.jpg, flow_y_{:05d}.jpg, img_{:05d}.jpg. 127 | Index starts at 1, not 0, thus there is no img_00000.jpg, etc. 128 | """ 129 | num_rgb_images = int(len(os.listdir(dir_path)) / 3) # must be divisible by 3 130 | # original frames are extracted for the following frames, (video fps=30): [1-5], [11-15], [21-25] + 30*n 131 | selected_img_indices = np.arange(num_rgb_images) + 1 # index starting from 1 132 | return [image_filename_pattern.format(e) for e in selected_img_indices] 133 | 134 | 135 | def get_img_info_by_dir(base_dir, cache_file): 136 | """frm_info_list: list(sublist), 137 | each sublist[0] is vid_name, sublist[1] is an ordered list of image full paths, """ 138 | if os.path.exists(cache_file): 139 | tf.logging.info("Found cache file, loading at {}".format(cache_file)) 140 | return load_pickle(cache_file) 141 | tf.logging.info("Cache file not found, building from scratch") 142 | frm_info_list = [] 143 | sub_dirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] 144 | for k in tqdm(sub_dirs, desc="Get image info from directory"): 145 | frm_info_list.append([k, get_image_paths(os.path.join(base_dir, k))]) 146 | save_pickle(frm_info_list, cache_file) 147 | return frm_info_list 148 | 149 | 150 | def get_args(): 151 | parser = argparse.ArgumentParser("i3d feature extractor") 152 | parser.add_argument("--eval_type", type=str, default="rgb600", choices=["rgb", "rgb600"]) 153 | parser.add_argument("--imagenet_pretrained", default=False, action="store_true") 154 | parser.add_argument("--batch_size", type=int, default=100, help="batch_size * clips") 155 | parser.add_argument("--base_dir", type=str, help="frame_dir/*/*jpg") 156 | parser.add_argument("--feature_file", type=str, help="path to save the features") 157 | parser.add_argument("--cache_file", type=str, help="path to store all the videos") 158 | parser.add_argument("--clip_length", type=float, default=1.5, 159 | help="clip length in seconds, each clip will have its own feature") 160 | parser.add_argument("--debug", action="store_true") 161 | args = parser.parse_args() 162 | tf.logging.info("Args: %s", json.dumps(vars(args), indent=4, sort_keys=True)) 163 | return args 164 | 165 | 166 | def main(unused_argv): 167 | tf.logging.set_verbosity(tf.logging.INFO) 168 | args = get_args() 169 | eval_type = args.eval_type 170 | imagenet_pretrained = args.imagenet_pretrained 171 | 172 | NUM_CLASSES = 600 if eval_type == "rgb600" else 400 173 | 174 | if eval_type not in ["rgb", "rgb600", "flow", "joint"]: 175 | raise ValueError("Bad `eval_type`, must be one of rgb, rgb600, flow, joint") 176 | 177 | frame_infos = get_img_info_by_dir(args.base_dir, cache_file=args.cache_file) 178 | 179 | n_frm = CLIP2N_FRAMES[args.clip_length] 180 | assert n_frm >= MIN_N_FRAMES, "Number of input frames must be larger than or equal to 9" 181 | 182 | # RGB input has 3 channels. 
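    # Each batch element is one clip of n_frm consecutive 224x224 RGB frames (n_frm = 15 for
    # clip_length=1, 23 for clip_length=1.5, see CLIP2N_FRAMES); features are taken from the
    # "avg_pool3d" endpoint below and squeezed to shape (num_clips, feature_dim).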
183 | rgb_input = tf.placeholder(tf.float32, shape=(None, n_frm, _IMAGE_SIZE, _IMAGE_SIZE, 3)) 184 | 185 | with tf.variable_scope("RGB"): 186 | rgb_model = i3d.InceptionI3d(NUM_CLASSES, spatial_squeeze=True, final_endpoint="Logits") 187 | rgb_logits, end_points = rgb_model(rgb_input, is_training=False, dropout_keep_prob=1.0) 188 | 189 | rgb_variable_map = {} 190 | for variable in tf.global_variables(): 191 | if eval_type == "rgb600": 192 | rgb_variable_map[variable.name.replace(":0", "")[len("RGB/inception_i3d/"):]] = variable 193 | else: 194 | rgb_variable_map[variable.name.replace(":0", "")] = variable 195 | 196 | rgb_saver = tf.train.Saver(var_list=rgb_variable_map, reshape=True) 197 | 198 | with tf.Session() as sess: 199 | feed_dict = {} 200 | if imagenet_pretrained: 201 | rgb_saver.restore(sess, _CHECKPOINT_PATHS["rgb_imagenet"]) 202 | else: 203 | rgb_saver.restore(sess, _CHECKPOINT_PATHS[eval_type]) 204 | tf.logging.info("RGB checkpoint restored") 205 | 206 | feed_dict[rgb_input] = np.random.randn(args.batch_size, n_frm, _IMAGE_SIZE, _IMAGE_SIZE, 3) 207 | avg_pool3d_feature = sess.run([end_points["avg_pool3d"]], feed_dict=feed_dict)[0] 208 | avg_pool3d_feature = np.squeeze(avg_pool3d_feature, axis=(1, 2, 3)) 209 | tf.logging.info("Test input size {}, output feature size {}" 210 | .format(feed_dict[rgb_input].shape, avg_pool3d_feature.shape)) 211 | 212 | pool = Pool(24) 213 | feat_h5 = h5py.File(args.feature_file, "a") 214 | exist_keys = list(feat_h5.keys()) 215 | debug_loop_cnt = 10 216 | frame_infos = [e for e in frame_infos if e[0] not in exist_keys] 217 | for videoname, frame_paths in tqdm(frame_infos, desc="Extracting"): 218 | frame_paths = [os.path.join(args.base_dir, videoname, e) for e in frame_paths] 219 | debug_loop_cnt -= 1 220 | if args.debug and debug_loop_cnt == 0: 221 | break 222 | try: 223 | images = process_images(pool, frame_paths) 224 | if len(images) == 0: 225 | continue 226 | 227 | batches = mk_batch(images, args.batch_size, clip_length=args.clip_length) 228 | features = [] 229 | for batch in batches: 230 | feed_dict[rgb_input] = batch 231 | avg_pool3d_feature = sess.run([end_points["avg_pool3d"]], feed_dict=feed_dict)[0] 232 | avg_pool3d_feature = np.squeeze(avg_pool3d_feature, axis=(1, 2, 3)) 233 | features.append(avg_pool3d_feature) 234 | 235 | # write to file 236 | feat_h5.create_dataset(videoname, data=np.concatenate(features, axis=0), dtype=np.float32) 237 | except Exception as e: 238 | print("Exception ", e) 239 | continue 240 | 241 | feat_h5.close() 242 | pool.close() 243 | 244 | 245 | if __name__ == "__main__": 246 | tf.app.run(main) 247 | 248 | 249 | 250 | -------------------------------------------------------------------------------- /utils/video_feature/extract_i3d_features.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | show_name=$1 3 | clip_length=$2 4 | eval_type=rgb600 5 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature 6 | image_root=/net/bvisionserver4/playpen1/jielei/data/preprocessed_video_data/dense_flow_frames_step1_new 7 | feature_file=${feature_root}/tvr_${show_name}_i3d_${eval_type}_avg_cl-${clip_length}.h5 # !!!!! 
TODO 8 | cache_file=cache/tvr_${show_name}_vid_all_frm_pairs.pkl 9 | 10 | 11 | echo "Running with show ${show_name}" 12 | case ${show_name} in 13 | bbt) 14 | base_dir=${image_root}/new_bbt 15 | ;; 16 | friends | grey | house | met | castle) 17 | base_dir=${image_root}/${show_name} 18 | ;; 19 | *) 20 | echo -n "Unknown argument" 21 | ;; 22 | esac 23 | 24 | 25 | python utils/video_feature/extract_i3d_features.py \ 26 | --eval_type=${eval_type} \ 27 | --batch_size=60 \ 28 | --base_dir=${base_dir} \ 29 | --feature_file=${feature_file} \ 30 | --cache_file=${cache_file} \ 31 | --clip_length=${clip_length} \ 32 | ${@:3} 33 | -------------------------------------------------------------------------------- /utils/video_feature/extract_image_features.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import torch 3 | import torch.nn as nn 4 | import torch.backends.cudnn as cudnn 5 | import numpy as np 6 | import sys 7 | import six 8 | import os 9 | 10 | from torchvision import models, transforms 11 | from tqdm import tqdm 12 | from PIL import Image 13 | 14 | import logging 15 | logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 16 | datefmt='%m/%d/%Y %H:%M:%S', 17 | level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | IMAGENET_NORMALIZATION_PARAMS = dict( 22 | mean=[0.485, 0.456, 0.406], 23 | std=[0.229, 0.224, 0.225] 24 | ) 25 | 26 | 27 | class ImageNetResNetFeature(nn.Module): 28 | def __init__(self, output_dim="2048"): 29 | super(ImageNetResNetFeature, self).__init__() 30 | resnet = models.resnet152(pretrained=True) 31 | if output_dim == "2048": 32 | n_layers_to_rm = 1 # remove last fc layer 33 | elif output_dim == "2048x7x7": 34 | n_layers_to_rm = 2 # remove last fc layer and its precedent 7x7 avg pooling layer 35 | else: 36 | raise ValueError("Wrong value for argument output_dim") 37 | self.feature = nn.Sequential(*list(resnet.children())[:-n_layers_to_rm]) 38 | 39 | def forward(self, x): 40 | """return: B x 2048 or B x 2048x7x7""" 41 | return self.feature(x).squeeze() 42 | 43 | 44 | class ResNetC3FeatureExtractor(nn.Module): 45 | def __init__(self): 46 | super(ResNetC3FeatureExtractor, self).__init__() 47 | resnet = models.resnet152(pretrained=True) 48 | component_list = list(resnet.children())[:-3] 49 | component_list.extend(list(resnet.layer4.children())[:2]) 50 | self.resnet_base = nn.Sequential(*component_list) 51 | layer4_children = list(resnet.layer4.children())[2] 52 | 53 | # resnet.layer4[2].downsample is None 54 | self.layer4_head = nn.Sequential( 55 | layer4_children.conv1, 56 | layer4_children.bn1, 57 | layer4_children.relu, 58 | layer4_children.conv2, 59 | layer4_children.bn2, 60 | layer4_children.relu, 61 | ) 62 | 63 | def forward(self, x): 64 | base_out = self.resnet_base(x) 65 | c3_feature = self.layer4_head(base_out) 66 | return c3_feature 67 | 68 | 69 | def make_image_tensor(image_paths, zoom_out=1): 70 | tensors = [] 71 | for ele in image_paths: 72 | image = Image.open(ele).convert('RGB') 73 | image = imagenet_transform(image) 74 | image = image.view(1, 3, 224*zoom_out, 224*zoom_out) 75 | tensors.append(image) 76 | return torch.cat(tensors, 0) 77 | 78 | 79 | def get_image_batch_features(image_paths, net, batch_size, zoom_out=1): 80 | """ 81 | input: 82 | path to the frames for a single video 83 | return: 84 | image features for the frames 85 | """ 86 | num_batches = int(np.ceil(float(len(image_paths)) / batch_size)) 87 | feature_list = [] 88 | for i in 
range(num_batches):
89 |         inputs = make_image_tensor(image_paths[i*batch_size:(i+1)*batch_size], zoom_out=zoom_out)
90 |         inputs = inputs.cuda()
91 |         cur_features = net(inputs)
92 |         feature_list.append(cur_features)
93 |     features = torch.cat(feature_list, 0)
94 |     return features.data.cpu().numpy()
95 | 
96 | 
97 | def extract_all(feature_path, base_dir, video_name2image_filenames, video_names, net, batch_size,
98 |                 zoom_out=1, debug=False):
99 |     """
100 |     Args:
101 |         feature_path: h5py file path to save the features
102 |         base_dir: os.path.join(base_dir, vid_name, image_filename) is the absolute path to the image
103 |         video_name2image_filenames: dict(), with video names as keys, list of image filenames as values
104 |         video_names:
105 |         net:
106 |         batch_size:
107 |         zoom_out:
108 |         debug:
109 | 
110 |     Returns:
111 | 
112 |     """
113 |     feature_h5 = h5py.File(feature_path, "w")
114 | 
115 |     for i in tqdm(range(len(video_names)), desc="Extracting for videos"):
116 |         cur_vname = video_names[i]
117 |         image_paths = [os.path.join(base_dir, cur_vname, e) for e in video_name2image_filenames[cur_vname]]
118 |         try:
119 |             data_features = get_image_batch_features(image_paths, net, batch_size, zoom_out=zoom_out)
120 |         except Exception as e:
121 |             logger.debug(e)
122 |             continue
123 |         feature_h5.create_dataset(cur_vname, data=data_features, dtype=np.float32)
124 | 
125 |         if debug:
126 |             logger.info("subdir (key name) {}, feature shape {}".format(cur_vname, data_features.shape))
127 |             break
128 |     feature_h5.close()
129 | 
130 | 
131 | def get_image_paths(dir_path, image_filename_pattern="img_{:05d}.jpg", fps=15):
132 |     """each dir contains the same number of flow_x_{:05d}.jpg, flow_y_{:05d}.jpg, img_{:05d}.jpg.
133 |     Index starts at 1, not 0, thus there is no img_00000.jpg, etc.
134 |     """
135 |     num_rgb_images = int(len(os.listdir(dir_path)) / 3)  # must be divisible by 3
136 |     offsets_per_second = np.arange(0, num_rgb_images, fps)  # (0, 30, 15) => [0, 15]
137 |     # original frames are extracted for the following frames, (video fps=30): [1-5], [11-15], [21-25] + 30*n
138 |     offsets_inside_second = [3, 8, 13]  # the middle of every 5 frames; note this is not used for indexing.
139 |     selected_img_indices = np.concatenate(
140 |         [offsets_per_second + e for e in offsets_inside_second]
141 |         , axis=0)
142 |     selected_img_indices = selected_img_indices[selected_img_indices <= num_rgb_images]
143 |     return [image_filename_pattern.format(e) for e in selected_img_indices]
144 | 
145 | 
146 | if __name__ == "__main__":
147 |     # settings
148 |     import argparse
149 |     parser = argparse.ArgumentParser()
150 |     parser.add_argument("--feature_file", type=str, default=None)
151 |     parser.add_argument("--base_dir", type=str, default=None)
152 |     parser.add_argument("--feature_type", type=str, default="2048",
153 |                         choices=["2048", "2048x7x7", "c3"])
154 |     parser.add_argument("--zoom_out", type=int, default=1, help="224 * zoom_out is the input spatial size")
155 |     parser.add_argument("--batch_size", type=int, default=300)
156 |     parser.add_argument("--cache_dir", type=str, default="")
157 |     parser.add_argument("--bypass_user_input", action="store_true")
158 |     parser.add_argument("--debug", action="store_true")
159 |     args = parser.parse_args()
160 |     logging.info(vars(args))
161 | 
162 |     logger.info("[Phase 1] Setup feature extractor.")
163 |     # https://github.com/KaimingHe/deep-residual-networks/blob/master/prototxt/ResNet-152-deploy.prototxt
164 |     # see the link above for resnet architecture, layer_name, etc.
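    # feature_type "2048": globally pooled ResNet-152 features of shape (B, 2048);
    # "2048x7x7": the feature map before the final 7x7 average pooling; "c3": features from the
    # truncated layer4 head implemented by ResNetC3FeatureExtractor above.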
165 | feature_type = args.feature_type 166 | if feature_type == "2048": 167 | extractor = ImageNetResNetFeature(output_dim="2048") 168 | elif feature_type == "2048x7x7": 169 | extractor = ImageNetResNetFeature(output_dim="2048x7x7") 170 | elif feature_type == "c3": 171 | extractor = ResNetC3FeatureExtractor() 172 | else: 173 | raise NotImplementedError("Not supported feature type") 174 | 175 | # Step 2, set experiment settings 176 | logger.info("[Phase 2] Config settings.") 177 | 178 | if os.path.exists(args.feature_file): 179 | logger.info("feature_file {} already exists".format(args.feature_file)) 180 | sys.exit(1) 181 | 182 | USE_CUDA = torch.cuda.is_available() 183 | if not USE_CUDA: 184 | logger.info("no GPU available") 185 | sys.exit(1) 186 | cudnn.benchmark = True 187 | 188 | extractor.cuda() 189 | extractor.eval() 190 | 191 | zoom_out = args.zoom_out 192 | # testing 193 | with torch.no_grad(): 194 | sample_input = torch.randn(args.batch_size, 3, 224 * zoom_out, 224 * zoom_out) 195 | if USE_CUDA: 196 | sample_input = sample_input.cuda() 197 | logger.info(" Extraction on GPU.") 198 | sample_output1 = extractor(sample_input) 199 | 200 | logger.info(" Input Size is: {}".format(sample_input.shape)) 201 | logger.info(" Feature Size is: {}".format(sample_output1.shape)) 202 | if args.bypass_user_input: 203 | s = "y" 204 | else: 205 | s = six.moves.input("Do you want to proceed (Y/N): ") 206 | if s.lower() == "y": 207 | imagenet_transform = transforms.Compose([ 208 | transforms.Resize((224 * zoom_out, 224 * zoom_out)), 209 | transforms.ToTensor(), 210 | transforms.Normalize(**IMAGENET_NORMALIZATION_PARAMS), 211 | ]) 212 | 213 | logger.info("[Phase 3] : Feature Extraction") 214 | sub_dirs = [d for d in os.listdir(args.base_dir) if os.path.isdir(os.path.join(args.base_dir, d))] 215 | cache_video_name2image_filenames_path = \ 216 | os.path.join(args.cache_dir, "{}_video_name2image_filenames.cache.pt" 217 | .format(os.path.split(args.feature_file)[-1])) 218 | if os.path.exists(cache_video_name2image_filenames_path): 219 | logger.info("Loading from cache {}".format(cache_video_name2image_filenames_path)) 220 | video_name2image_filenames = torch.load(cache_video_name2image_filenames_path) 221 | else: 222 | logger.info("Cache not found, creating and saving at {}" 223 | .format(cache_video_name2image_filenames_path)) 224 | video_name2image_filenames = { 225 | k: get_image_paths(os.path.join(args.base_dir, k)) 226 | for k in tqdm(sub_dirs, desc="Gathering image paths for each video") 227 | } 228 | torch.save(video_name2image_filenames, cache_video_name2image_filenames_path) 229 | logger.info("video_name2image_filenames len {} keys[:3] {} values [0][:10] {}" 230 | .format(len(video_name2image_filenames), 231 | list(video_name2image_filenames.keys())[:3], 232 | list(video_name2image_filenames.values())[0][:10])) 233 | with torch.no_grad(): 234 | extract_all(args.feature_file, args.base_dir, video_name2image_filenames, sub_dirs, extractor, 235 | args.batch_size, zoom_out=zoom_out, debug=args.debug) 236 | else: 237 | logging.info("Aborting") 238 | -------------------------------------------------------------------------------- /utils/video_feature/extract_resnet152_2048_features.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | show_name=$1 3 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature 4 | image_root=/net/bvisionserver4/playpen1/jielei/data/preprocessed_video_data/dense_flow_frames_step1_new 5 | 
feature_file=${feature_root}/tvr_${show_name}_resnet152_3fps.h5 6 | cache_dir=cache 7 | 8 | 9 | echo "Running with show ${show_name}" 10 | case ${show_name} in 11 | bbt) 12 | base_dir=${image_root}/new_bbt 13 | ;; 14 | friends | grey | house | met | castle) 15 | base_dir=${image_root}/${show_name} 16 | ;; 17 | *) 18 | echo -n "Unknown argument" 19 | ;; 20 | esac 21 | 22 | 23 | python utils/video_feature/extract_image_features.py \ 24 | --feature_file=${feature_file} \ 25 | --base_dir=${base_dir} \ 26 | --feature_type=2048 \ 27 | --batch_size=300 \ 28 | --cache_dir=${cache_dir} \ 29 | ${@:2} 30 | -------------------------------------------------------------------------------- /utils/video_feature/merge_align_i3d.py: -------------------------------------------------------------------------------- 1 | """ 2 | Merge i3d features from all shows. Meanwhile, align it with the imagenet feature 3 | so that they have the same number of feature vectors. 4 | """ 5 | import os 6 | import h5py 7 | import numpy as np 8 | from tqdm import tqdm 9 | from collections import Counter 10 | 11 | 12 | def convert_for_single_h5(src_h5, tgt_h5, align_h5_key2len, debug=False): 13 | """ 14 | Args: 15 | src_h5: h5py.File object, containing the frame level features 16 | tgt_h5: h5py.File object, containing the clip level features 17 | align_h5_key2len: dict, {key: len}, each value indicates the length (L) of the array (L, D) 18 | debug: 19 | Returns: 20 | 21 | """ 22 | for k, feat in tqdm(src_h5.items()): 23 | if k in align_h5_key2len: 24 | if len(feat) != align_h5_key2len[k]: 25 | align_len = align_h5_key2len[k] 26 | aligned_feat = np.zeros((align_h5_key2len[k], feat.shape[1]), dtype=np.float32) 27 | aligned_feat[:len(feat)] = feat[:align_len] 28 | feat = aligned_feat 29 | tgt_h5.create_dataset(k, data=feat, dtype=np.float32) 30 | else: 31 | print("Skipping {}".format(k)) 32 | if debug: 33 | break 34 | 35 | 36 | def get_clip2frm_idx_mapping(clip_length=1.5, max_video_length=300): 37 | """ This function depends on how the features are extracted. 38 | original features are extract from frames (video fps=30): 39 | [3, 13, 23] frame in a second. 40 | Args: 41 | clip_length: float, 42 | max_video_length: int, 43 | 44 | Returns: 45 | {clip_idx1 (int): [frm_idx0, frm_idx1, ...], 46 | ... 47 | } 48 | """ 49 | # frame 0 in the feature is actually the frame 3 in the original video, so its 50 | # corresponding time is 3 / 30 = 0.1s. More generally ==> [0.1, 0.43, 0.77] + n. 51 | frm2seconds = np.concatenate([ 52 | np.array([3, 13, 23]) / 30. + offset for offset in np.arange(0, max_video_length)], axis=0) 53 | 54 | clip_boundaries = np.arange(0, max_video_length, clip_length) 55 | # no need to worry about search boundary. 
56 | # indexed as clip_boundaries_in_frm_idx[idx]:clip_boundaries_in_frm_idx[idx+1] 57 | clip_boundaries_in_frm_idx = np.searchsorted(frm2seconds, clip_boundaries) 58 | return clip_boundaries_in_frm_idx 59 | 60 | 61 | def main_convert(): 62 | import argparse 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("--src_h5_files", type=str, nargs='+', help="frm .h5 file paths") 65 | parser.add_argument("--tgt_h5_file", type=str, help=".h5 path to stores the converted data") 66 | parser.add_argument("--align_h5_file", type=str, help=".h5 path to the file to align at length dim") 67 | parser.add_argument("--check_alignment_only", action="store_true", help="Check alignment only") 68 | parser.add_argument("--debug", action="store_true") 69 | args = parser.parse_args() 70 | 71 | with h5py.File(args.align_h5_file, "r") as align_h5: 72 | align_h5_key2len = {k: len(v) for k, v in tqdm(align_h5.items(), desc="[Get Length] Loop over align h5")} 73 | 74 | src_h5_key2len = {} 75 | for src_f in args.src_h5_files: 76 | with h5py.File(src_f, "r") as src_h5: 77 | for k, v in tqdm(src_h5.items(), desc="[Get length] Loop over one of the src h5"): 78 | src_h5_key2len[k] = len(v) 79 | 80 | not_found_keys = list(set(align_h5_key2len.keys()) - set(src_h5_key2len.keys())) 81 | diff_key2len = {k: align_h5_key2len[k] - src_h5_key2len[k] for k in align_h5_key2len if k in src_h5_key2len} 82 | diff_counter = Counter(list(diff_key2len.values())) 83 | print("Not found keys total {}, examples: {}".format(len(not_found_keys), not_found_keys[:3])) 84 | print("diff_counter {}".format(diff_counter.most_common())) 85 | 86 | if not args.check_alignment_only: 87 | assert not os.path.exists(args.tgt_h5_file) 88 | with h5py.File(args.tgt_h5_file, "a") as tgt_h5: 89 | for src_f in args.src_h5_files: 90 | with h5py.File(src_f, "r") as src_h5: 91 | convert_for_single_h5(src_h5, tgt_h5, align_h5_key2len, debug=args.debug) 92 | 93 | 94 | if __name__ == '__main__': 95 | main_convert() 96 | -------------------------------------------------------------------------------- /utils/video_feature/merge_align_i3d.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/video_feature/merge_align_i3d.sh [clip_length] ANY_OTHER_PYTHON_ARGS 4 | clip_length=${1} 5 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature 6 | src_h5_files=() 7 | for show_name in bbt friends grey house met castle 8 | do 9 | cur_src_h5_file=${feature_root}/i3d_featrues_by_show/tvr_${show_name}_i3d_rgb600_avg_cl-${clip_length}.h5 10 | src_h5_files+=(${cur_src_h5_file}) 11 | done 12 | echo "Running with src_h5_files ${src_h5_files}" 13 | 14 | pool_type=max 15 | tgt_h5_file=${feature_root}/tvr_i3d_rgb600_avg_cl-${clip_length}.h5 16 | align_h5_file=${feature_root}/tvr_resnet152_rgb_max_cl-${clip_length}.h5 17 | 18 | python utils/video_feature/merge_align_i3d.py \ 19 | --src_h5_files ${src_h5_files[@]} \ 20 | --tgt_h5_file ${tgt_h5_file} \ 21 | --align_h5_file ${align_h5_file} \ 22 | ${@:2} 23 | -------------------------------------------------------------------------------- /utils/video_feature/normalize_and_concat.py: -------------------------------------------------------------------------------- 1 | """ 2 | L2 Normalize then concat I3D and ResNet features 3 | """ 4 | import os 5 | import h5py 6 | import numpy as np 7 | from tqdm import tqdm 8 | from utils.basic_utils import l2_normalize_np_array 9 | 10 | 11 | def main_norm_cat(): 12 | import argparse 13 | 
parser = argparse.ArgumentParser() 14 | parser.add_argument("--resnet_h5_file", type=str, help="ResNet .h5 file paths") 15 | parser.add_argument("--i3d_h5_file", type=str, help="I3D .h5 file paths") 16 | parser.add_argument("--tgt_h5_file", type=str, help=".h5 path to stores the converted data") 17 | parser.add_argument("--debug", action="store_true") 18 | args = parser.parse_args() 19 | 20 | assert not os.path.exists(args.tgt_h5_file) 21 | with h5py.File(args.resnet_h5_file, "r") as resnet_h5: 22 | with h5py.File(args.i3d_h5_file, "r") as i3d_h5: 23 | with h5py.File(args.tgt_h5_file, "w") as tgt_h5: 24 | for k in tqdm(resnet_h5.keys()): 25 | resnet_feat = l2_normalize_np_array(resnet_h5[k][:]) 26 | i3d_feat = l2_normalize_np_array(i3d_h5[k][:]) 27 | tgt_h5.create_dataset(k, 28 | data=np.concatenate([resnet_feat, i3d_feat], axis=-1), 29 | dtype=np.float32) 30 | 31 | 32 | if __name__ == '__main__': 33 | main_norm_cat() 34 | -------------------------------------------------------------------------------- /utils/video_feature/normalize_and_concat.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Usage: 3 | # bash utils/video_feature/normalize_and_concat.sh [clip_length] ANY_OTHER_PYTHON_ARGS 4 | clip_length=${1} 5 | feature_root=/net/bvisionserver14/playpen-ssd/jielei/data/tvr/video_feature 6 | resnet_h5_file=${feature_root}/tvr_resnet152_rgb_max_cl-${clip_length}.h5 7 | i3d_h5_file=${feature_root}/tvr_i3d_rgb600_avg_cl-${clip_length}.h5 8 | tgt_h5_file=${feature_root}/tvr_resnet152_rgb_max_i3d_rgb600_avg_cat_cl-${clip_length}.h5 9 | 10 | python utils/video_feature/normalize_and_concat.py \ 11 | --resnet_h5_file ${resnet_h5_file} \ 12 | --i3d_h5_file ${i3d_h5_file} \ 13 | --tgt_h5_file ${tgt_h5_file} \ 14 | ${@:2} 15 | --------------------------------------------------------------------------------