├── .gitignore ├── LICENSE ├── README.md ├── args.py ├── dataloader ├── .gitignore └── cvqa_loader.py ├── datasets └── nextqa │ ├── map_vid_vidorID.json │ ├── test.csv │ ├── train.csv │ ├── val.csv │ └── vlist.json ├── eval_next.py ├── global_parameters.py ├── loss.py ├── main.py ├── misc ├── CoVGT-res.png └── CoVGT.png ├── model ├── .gitignore ├── CoVGT.py ├── EncoderVid.py ├── cmatt.py ├── graph.py ├── language_model.py └── vqa_model.py ├── requirements.txt ├── shells ├── cvid_test.sh ├── cvid_train.sh ├── msrvtt_test.sh ├── msrvtt_train.sh ├── next_test.sh ├── next_train.sh ├── tgif_ftrain.sh ├── tgif_test.sh ├── tgif_train.sh └── webvid_train.sh ├── tools ├── __pycache__ │ └── object_align.cpython-38.pyc ├── bbox_visualizer.py ├── colors.txt ├── datautils │ ├── msrvtt_qa.py │ ├── msvd_qa.py │ ├── nextqa.py │ ├── tgif_qa.py │ └── utils.py ├── demo.py ├── extract_video.py ├── feat_app.sh ├── models │ ├── __init__.py │ ├── densenet.py │ ├── pre_act_resnet.py │ ├── resnet.py │ ├── resnext.py │ └── wide_resnet.py ├── object_align.py ├── preprocess_features.py └── split_dataset_feat.py ├── train ├── __pycache__ │ └── train_covgt.cpython-38.pyc └── train_covgt.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Contrastive Video Question Answering via Video Graph Transformer 2 |
## Abstract
This repo holds the code for our paper CoVGT, accepted to IEEE T-PAMI'23. The work extends our preliminary publication at ECCV'22. We highlight the following differences from the conference version:

* Jointly supervised and self-supervised contrastive objectives to optimize VGT.
* Substitution of BERT with a stronger language model (e.g., RoBERTa) for QA embedding.
* Extended results on Causal-VidQA and STAR, along with more comprehensive ablation studies.

The code is based on VGT.
![Illustration of contrastive learning strategy](misc/CoVGT.png)
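As a rough illustration of the contrastive idea pictured above, below is a minimal InfoNCE-style objective between paired video and QA embeddings. This is only a sketch with assumed `[B, D]` inputs; the actual supervised and self-supervised objectives used by CoVGT are implemented in `loss.py` and `train/train_covgt.py`.
```
import torch
import torch.nn.functional as F

def cross_modal_nce(video_emb, text_emb, temperature=0.07):
    """InfoNCE over a batch: the i-th video is the positive for the i-th QA text."""
    v = F.normalize(video_emb, dim=-1)            # [B, D]
    t = F.normalize(text_emb, dim=-1)             # [B, D]
    logits = v @ t.t() / temperature              # [B, B] similarity matrix
    targets = torch.arange(v.size(0), device=v.device)
    # symmetric loss: video-to-text and text-to-video
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
```
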
## Todo
1. [ ] Release features of other datasets. Please email the first author and specify the reason, as the data is strictly for research purposes.

## Environment
Assuming you have installed Anaconda3, with CUDA > 11.0 and a GPU of >= 24 GB memory, run the following to set up the environment:
```
>conda create -n videoqa python==3.8.16
>conda activate videoqa
>git clone https://github.com/doc-doc/CoVGT.git
>pip install -r requirements.txt
>conda install pytorch==1.8.1 torchvision==0.9.1 cudatoolkit=11.1 -c pytorch -c nvidia
```
## Preparation
Please create a data folder outside this repo, so that you have two folders in your workspace: 'workspace/data/' and 'workspace/CoVGT/'.

Below we use NExT-QA as an example to get you familiar with the code.
Please download the related video features and QA annotations according to the links provided in the `Results and Resources` section. The QA annotations will already be in `workspace/CoVGT/datasets/nextqa/` after you clone this repo; place the video features in `workspace/data/nextqa/` and the checkpoint files in `workspace/data/save_models/nextqa/`. Change the default paths in global_parameters.py and args.py for your own datasets.

## Inference
```
./shells/next_test.sh 0
```
## Evaluation
```
python eval_next.py --folder CoVGT_FTCoWV --mode test
```
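The evaluation script reads the prediction file that main.py saves into `--save_dir` (e.g., `../data/save_models/nextqa/CoVGT_FTCoWV/test-res.json`) and reports accuracy per question type. Below is a sketch of the expected format, with entries keyed by `<video_id>_<qid>` as constructed in eval_next.py; the IDs and values are made up for illustration, and for multiple-choice data the `prediction`/`answer` fields hold the predicted and ground-truth option indices.
```
{
  "3376544720_0": {"prediction": 1, "answer": 1},
  "3376544720_1": {"prediction": 4, "answer": 2}
}
```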

## Results and Resources
**Table 1. VideoQA Accuracy (%) on Test Set.**

| Cross-Modal Pretrain | NExT-QA | Causal-VidQA | STAR | TGIF-QA (Action) | TGIF-QA (Trans) | TGIF-QA (FrameQA) | TGIF-QA-R* (Action) | TGIF-QA-R* (Trans) | MSRVTT-QA |
| :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| - | 59.4 | 59.1 | 44.0 | 94.7 | 97.6 | 61.6 | 60.8 | 73.8 | 38.3 |
| WebVid0.18M | 59.7 | 60.8 | 46.2 | 91.3 | 96.2 | 61.7 | 61.0 | 73.2 | 40.0 |
| - | feats | feats | feats | feats | feats | feats | feats | feats | feats |
| - | videos | videos | videos | videos | videos | videos | videos | videos | videos |
| - | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A |
(The feature files are identical to VGT. We have merged some files of the same dataset to avoid too many links.)

## Train
All the training scripts are provided in the folder 'shells'; start training by specifying the GPU ID(s) behind the script. If you have multiple GPUs, separate them with a comma, e.g., ./shells/next_train.sh 0,1.
```
./shells/next_train.sh 0
```
It will train the model and save checkpoints to '../data/save_models/nextqa/CoVGT/'. You should get results around 60.1% and 59.4% on the val and test sets respectively.
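main.py also mirrors its console output to a stdout.log file inside `--save_dir`, so a running job can be monitored with, for example:
```
tail -f ../data/save_models/nextqa/CoVGT/stdout.log
```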
### Result Visualization (NExT-QA)
![VGT vs VGT without DGT](misc/CoVGT-res.png)
135 | 136 | ## Citations 137 | ``` 138 | @ARTICLE {xiao2023contrastive, 139 | author = {Junbin Xiao and Pan Zhou and Angela Yao and Yicong Li and Richang Hong and Shuicheng Yan and Tat Seng Chua}, 140 | journal = {IEEE Transactions on Pattern Analysis & Machine Intelligence}, 141 | title = {Contrastive Video Question Answering via Video Graph Transformer}, 142 | year = {2023}, 143 | volume = {45}, 144 | number = {11}, 145 | issn = {1939-3539}, 146 | pages = {13265-13280}, 147 | doi = {10.1109/TPAMI.2023.3292266}, 148 | publisher = {IEEE Computer Society}, 149 | address = {Los Alamitos, CA, USA}, 150 | month = {nov} 151 | } 152 | ``` 153 | ``` 154 | @inproceedings{xiao2022video, 155 | title={Video Graph Transformer for Video Question Answering}, 156 | author={Xiao, Junbin and Zhou, Pan and Chua, Tat-Seng and Yan, Shuicheng}, 157 | booktitle={European Conference on Computer Vision}, 158 | pages={39--58}, 159 | year={2022}, 160 | organization={Springer} 161 | } 162 | ``` 163 | ## Notes 164 | If you use any resources from this repo, please kindly cite our paper and acknowledge the source. 165 | ## License 166 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file. 167 | -------------------------------------------------------------------------------- /args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from global_parameters import ( 5 | DEFAULT_DATASET_DIR, 6 | DEFAULT_CKPT_DIR, 7 | TRANSFORMERS_PATH, 8 | SSD_DIR, 9 | dataset2folder, 10 | ) 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--dataset", 16 | type=str, 17 | default="ivqa", 18 | choices=[ 19 | "ivqa", 20 | "msrvtt", 21 | "msrvttmc", 22 | "msvd", 23 | "webvid", 24 | "activitynet", 25 | "howto100m", 26 | "howtovqa", 27 | "how2qa", 28 | "nextqa", 29 | "star", 30 | "tgifqa/transition", 31 | "tgifqa/action", 32 | "tgifqa/frameqa", 33 | "tgifqa2/transition", 34 | "tgifqa2/action", 35 | "causalvid" 36 | ], 37 | ) 38 | parser.add_argument( 39 | "--subset", 40 | type=str, 41 | default="", 42 | choices=["", "1", "10", "20", "50"], 43 | help="use a subset of the generated dataset", 44 | ) 45 | 46 | # Model 47 | parser.add_argument( 48 | "--baseline", 49 | type=str, 50 | default="", 51 | choices=["", "qa"], 52 | help="qa baseline does not use the video, video baseline does not use the question", 53 | ) 54 | parser.add_argument( 55 | "--n_layers", 56 | type=int, 57 | default=2, 58 | help="number of layers in the multi-modal transformer", 59 | ) 60 | parser.add_argument( 61 | "--n_heads", 62 | type=int, 63 | default=8, 64 | help="number of attention heads in the multi-modal transformer", 65 | ) 66 | parser.add_argument( 67 | "--embd_dim", 68 | type=int, 69 | default=512, 70 | help="multi-modal transformer and final embedding dimension", 71 | ) 72 | parser.add_argument( 73 | "--ff_dim", 74 | type=int, 75 | default=2048, 76 | help="multi-modal transformer feed-forward dimension", 77 | ) 78 | parser.add_argument( 79 | "--dropout", 80 | type=float, 81 | default=0.1, 82 | help="dropout rate in the multi-modal transformer", 83 | ) 84 | parser.add_argument( 85 | "--sentence_dim", 86 | type=int, 87 | default=2048, 88 | help="sentence dimension for the differentiable bag-of-words embedding the answers", 89 | ) 90 | parser.add_argument( 91 | "--qmax_words", 92 | type=int, 93 | default=20, 94 | help="maximum number of words in the question", 95 | ) 96 | parser.add_argument( 97 | 
"--amax_words", 98 | type=int, 99 | default=10, 100 | help="maximum number of words in the answer", 101 | ) 102 | parser.add_argument( 103 | "--max_feats", 104 | type=int, 105 | default=20, 106 | help="maximum number of video features considered", 107 | ) 108 | 109 | # Paths 110 | parser.add_argument( 111 | "--dataset_dir", 112 | type=str, 113 | default=DEFAULT_DATASET_DIR, 114 | help="folder where the datasets folders are stored", 115 | ) 116 | parser.add_argument( 117 | "--ssd_dir", 118 | type=str, 119 | default=SSD_DIR, 120 | help="folder with ssd storage where the HowTo100M features are stored", 121 | ) 122 | parser.add_argument( 123 | "--checkpoint_predir", 124 | type=str, 125 | default=DEFAULT_CKPT_DIR, 126 | help="folder to store checkpoints", 127 | ) 128 | parser.add_argument( 129 | "--checkpoint_dir", type=str, default="", help="subfolder to store checkpoint" 130 | ) 131 | parser.add_argument( 132 | "--pretrain_path", type=str, default="", help="path to pretrained checkpoint" 133 | ) 134 | parser.add_argument( 135 | "--bert_path", 136 | type=str, 137 | default=TRANSFORMERS_PATH, 138 | help="path to transformer models checkpoints", 139 | ) 140 | 141 | # Train 142 | parser.add_argument("--batch_size", type=int, default=256) 143 | parser.add_argument("--batch_size_val", type=int, default=2048) 144 | parser.add_argument( 145 | "--n_pair", 146 | type=int, 147 | default=32, 148 | help="number of clips per video to consider to train on HowToVQA69M", 149 | ) 150 | parser.add_argument("--seed", type=int, default=1) 151 | parser.add_argument("--epochs", type=int, default=20) 152 | parser.add_argument( 153 | "--test", type=int, default=0, help="use to evaluate without training" 154 | ) 155 | parser.add_argument( 156 | "--lr", type=float, default=0.00005, help="initial learning rate" 157 | ) 158 | parser.add_argument("--weight_decay", type=float, default=0, help="weight decay") 159 | parser.add_argument( 160 | "--clip", 161 | type=float, 162 | default=12, 163 | help="gradient clipping", 164 | ) 165 | 166 | # Print 167 | parser.add_argument( 168 | "--freq_display", type=int, default=3, help="number of train prints per epoch" 169 | ) 170 | parser.add_argument( 171 | "--num_thread_reader", type=int, default=16, help="number of workers" 172 | ) 173 | 174 | # Masked Language Modeling and Cross-Modal Matching parameters 175 | parser.add_argument("--mlm_prob", type=float, default=0.15) 176 | parser.add_argument("--n_negs", type=int, default=1) 177 | parser.add_argument("--lr_decay", type=float, default=0.9) 178 | parser.add_argument("--min_time", type=int, default=10) 179 | parser.add_argument("--min_words", type=int, default=10) 180 | 181 | # Demo parameters 182 | parser.add_argument( 183 | "--question_example", type=str, default="", help="demo question text" 184 | ) 185 | parser.add_argument("--video_example", type=str, default="", help="demo video path") 186 | parser.add_argument("--port", type=int, default=8899, help="demo port") 187 | parser.add_argument( 188 | "--pretrain_path2", type=str, default="", help="second demo model" 189 | ) 190 | parser.add_argument( 191 | "--save_dir", type=str, default="./save_models/", help="path to save dir" 192 | ) 193 | parser.add_argument( 194 | "--mc", type=int, default=5, help="number of multiple choices" 195 | ) 196 | parser.add_argument( 197 | "--bnum", type=int, default=10, help="number of region proposal" 198 | ) 199 | parser.add_argument( 200 | "--cl_loss", type=float, default=0, help="trade offf with contrastive loss" 201 | ) 202 | parser.add_argument( 
203 | "--lan", type=str, default='RoBERTa', help="BERT or RoBERTa" 204 | ) 205 | 206 | args = parser.parse_args() 207 | 208 | os.environ["TRANSFORMERS_CACHE"] = args.bert_path 209 | # args.save_dir = './save_dir/' 210 | 211 | #args.save_dir = os.path.join(args.checkpoint_predir, args.checkpoint_dir) 212 | 213 | # multiple-choice arg 214 | # args.mc = 4 if args.dataset == "how2qa" else 0 215 | # args.mc = 5 if args.dataset == "nextqa" else 0 216 | 217 | # feature dimension 218 | args.feature_dim = 2048 # S3D:1024 app_mot:4096 #2048 RoI 219 | args.word_dim = 768 # DistilBERT 220 | 221 | # Map from dataset name to folder name 222 | 223 | load_path = os.path.join(args.dataset_dir, args.dataset) 224 | args.load_path = load_path 225 | 226 | if args.dataset not in ["howto100m", "howtovqa"]: # VideoQA dataset 227 | args.features_path = f'../data/{args.dataset}/' #os.path.join(load_path, "s3d.pth") 228 | # args.features_path = f'/data/datasets/{args.dataset}/' 229 | args.train_csv_path = os.path.join(load_path, "train.csv") 230 | if args.dataset == 'tgifqa': 231 | args.val_csv_path = os.path.join(load_path, "test.csv") 232 | else: 233 | args.val_csv_path = os.path.join(load_path, "val.csv") 234 | args.test_csv_path = os.path.join(load_path, "test.csv") 235 | args.vocab_path = os.path.join(load_path, "vocab.json") 236 | else: # Pretraining dataset 237 | args.features_path = os.path.join( 238 | args.ssd_dir, "s3d_features", "howto100m_s3d_features" 239 | ) 240 | if args.dataset == "howto100m": 241 | args.caption_path = os.path.join( 242 | load_path, "caption_howto100m_sw_nointersec_norepeat.pickle" 243 | ) 244 | args.train_csv_path = os.path.join( 245 | load_path, f"s3d_features_nointersec.csv" 246 | ) 247 | args.youcook_val_path = os.path.join( 248 | args.dataset_dir, "YouCook2", "youcook_unpooled_val.pkl" 249 | ) 250 | args.msrvtt_test_csv_path = os.path.join( 251 | args.dataset_dir, "MSR-VTT", "MSRVTT_JSFUSION_test.csv" 252 | ) 253 | args.msrvtt_test_features_path = os.path.join( 254 | args.dataset_dir, "MSR-VTT", "msrvtt_test_unpooled_s3d_features.pth" 255 | ) 256 | elif args.dataset == "howtovqa": 257 | if not args.subset: 258 | args.caption_path = os.path.join(load_path, "howtovqa.pkl") 259 | args.train_csv_path = os.path.join(load_path, "train_howtovqa.csv") 260 | args.val_csv_path = os.path.join(load_path, "val_howtovqa.csv") 261 | else: 262 | args.caption_path = os.path.join( 263 | load_path, f"howtovqa_{args.subset}.pickle" 264 | ) 265 | args.train_csv_path = os.path.join( 266 | load_path, f"train_howtovqa_{args.subset}.csv" 267 | ) 268 | args.val_csv_path = os.path.join( 269 | load_path, f"val_howtovqa_{args.subset}.csv" 270 | ) 271 | 272 | return args 273 | -------------------------------------------------------------------------------- /dataloader/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /eval_next.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from util import load_file 3 | import argparse 4 | 5 | map_name = {'CW': 'Why', 'CH': 'How', 'TN': 'Bef&Aft', 'TC': 'When', 6 | 'DC': 'Cnt', 'DL': 'Loc', 'DO': 'Other', 'C': 'Acc_C', 7 | 'T': 'Acc_T', 'D': 'Acc_D'} 8 | 9 | def accuracy_metric(sample_list, result): 10 | 11 | group = {'CW':[], 'CH':[], 'TN':[], 'TC':[], 'DC':[], 'DL':[], 'DO':[]} 12 | for id, row in sample_list.iterrows(): 13 | qns_id = 
str(row['video_id']) + '_' + str(row['qid']) 14 | qtype = str(row['type']) 15 | #(combine temporal qns of previous and next as 'TN') 16 | if qtype == 'TP': 17 | qtype = 'TN' 18 | group[qtype].append(qns_id) 19 | 20 | preds = result 21 | group_acc = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0} 22 | group_cnt = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0} 23 | overall_acc = {'C':0, 'T':0, 'D':0} 24 | overall_cnt = {'C':0, 'T':0, 'D':0} 25 | all_acc = 0 26 | all_cnt = 0 27 | for qtype, qns_ids in group.items(): 28 | cnt = 0 29 | acc = 0 30 | for qid in qns_ids: 31 | 32 | cnt += 1 33 | answer = preds[qid]['answer'] 34 | pred = preds[qid]['prediction'] 35 | if answer == pred: 36 | acc += 1 37 | 38 | group_cnt[qtype] = cnt 39 | group_acc[qtype] += acc 40 | overall_acc[qtype[0]] += acc 41 | overall_cnt[qtype[0]] += cnt 42 | all_acc += acc 43 | all_cnt += cnt 44 | 45 | 46 | for qtype, value in overall_acc.items(): 47 | group_acc[qtype] = value 48 | group_cnt[qtype] = overall_cnt[qtype] 49 | 50 | for qtype in group_acc: 51 | if group_cnt[qtype] == 0: continue 52 | print(map_name[qtype], end='\t') 53 | print('') 54 | for qtype, acc in group_acc.items(): 55 | if group_cnt[qtype] == 0: continue 56 | print('{:.2f}'.format(acc*100.0/group_cnt[qtype]), end ='\t') 57 | print('') 58 | print('Acc: {:.2f}'.format(all_acc*100.0/all_cnt)) 59 | 60 | 61 | 62 | def accuracy_metric_sub(sample_list, result, sub_ids): 63 | 64 | sub_ids = [int(id) for id in sub_ids] 65 | subset = sample_list.iloc[sub_ids] 66 | 67 | accuracy_metric(subset, result) 68 | 69 | 70 | 71 | def main(result_file, mode='val'): 72 | dataset_dir = '../data/datasets/nextqa/' 73 | data_set = mode 74 | sample_list_file = osp.join(dataset_dir, data_set+'.csv') 75 | print('Evaluating {}'.format(result_file)) 76 | 77 | sample_list = load_file(sample_list_file) 78 | result = load_file(result_file) 79 | accuracy_metric(sample_list, result) 80 | 81 | if mode == 'val': 82 | hard_subset = osp.join(dataset_dir, 'atp-hard-ct4.txt') 83 | sub_ids = load_file(hard_subset) 84 | accuracy_metric_sub(sample_list, result, sub_ids) 85 | 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("--mode", type=str, default='val', choices=['val','test']) 91 | parser.add_argument("--folder", type=str) 92 | args = parser.parse_args() 93 | res_dir = '../data/save_models/nextqa/'+args.folder 94 | #res_dir = '../data/models/nextqa/' 95 | mode = args.mode 96 | model_prefix = 'res' 97 | result_file = '{}/{}-{}.json'.format(res_dir, mode, model_prefix) 98 | main(result_file, mode) 99 | -------------------------------------------------------------------------------- /global_parameters.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Fill the paths 4 | DEFAULT_DATASET_DIR = "./datasets/" # where the datasets folders are 5 | DEFAULT_CKPT_DIR = "../data/models/" # where the training checkpoints and logs will be saved 6 | DEFAULT_MODEL_DIR = "../data/pretrain_models/" # where the pretrained models are 7 | SSD_DIR = "../data/feats/" # where the HowTo100M S3D features are 8 | HOWTO_FEATURES_PATH = os.path.join(SSD_DIR, "s3d_features", "howto100m_s3d_features") 9 | 10 | # Map from dataset name to folder name 11 | dataset2folder = { 12 | "ivqa": "iVQA", 13 | "msrvtt": "MSRVTT-QA", 14 | "msvd": "msvd", 15 | "activitynet": "ActivityNet-QA", 16 | "howto100m": "HowTo100M", 17 | "howtovqa": "HowToVQA69M", 18 | "how2qa": "How2QA", 19 | "nextqa": 
"nextqa" 20 | } 21 | 22 | # Datasets 23 | IVQA_PATH = os.path.join( 24 | DEFAULT_DATASET_DIR, dataset2folder["ivqa"] 25 | ) # Path where iVQA is downloaded 26 | MSRVTT_PATH = os.path.join( 27 | DEFAULT_DATASET_DIR, dataset2folder["msrvtt"] 28 | ) # Path where MSRVTT-QA is downloaded 29 | MSVD_PATH = os.path.join( 30 | DEFAULT_DATASET_DIR, dataset2folder["msvd"] 31 | ) # Path where MSVD-QA is downloaded 32 | ACT_PATH = os.path.join( 33 | DEFAULT_DATASET_DIR, dataset2folder["activitynet"] 34 | ) # Path where ActivityNet-QA is downloaded 35 | HOWTO_PATH = os.path.join( 36 | DEFAULT_DATASET_DIR, dataset2folder["howto100m"] 37 | ) # Path where HowTo100M is downloaded 38 | HOWTOVQA_PATH = os.path.join( 39 | DEFAULT_DATASET_DIR, dataset2folder["howtovqa"] 40 | ) # Path where HowToVQA69M is downloaded / generated 41 | HOW2QA_PATH = os.path.join( 42 | DEFAULT_DATASET_DIR, dataset2folder["how2qa"] 43 | ) # Path where How2QA is downloaded 44 | NEXTQA_PATH = os.path.join( 45 | DEFAULT_DATASET_DIR, dataset2folder["nextqa"] 46 | ) # Path where How2QA is downloaded 47 | 48 | 49 | # Models 50 | S3D_PATH = os.path.join( 51 | DEFAULT_MODEL_DIR, "s3d_howto100m.pth" 52 | ) # Path to S3D checkpoint 53 | S3D_DICT_PATH = os.path.join( 54 | DEFAULT_MODEL_DIR, "s3d_dict.npy" 55 | ) # Path to S3D dictionary 56 | PUNCTUATOR_PATH = os.path.join( 57 | DEFAULT_MODEL_DIR, "INTERSPEECH-T-BRNN.pcl" 58 | ) # Path to Punctuator2 checkpoint 59 | TRANSFORMERS_PATH = os.path.join( 60 | DEFAULT_MODEL_DIR, "transformers" 61 | ) # Path where the transformers checkpoints will be saved 62 | 63 | # Question-answer Generation 64 | punct_dir = os.path.join( 65 | SSD_DIR, "punct" 66 | ) # Path where the punctuated clips will be created (1 file per unique video) 67 | QG_REPO_DIR = "" # Path where the question generation repo is cloned 68 | answers_dir = os.path.join( 69 | SSD_DIR, "ans" 70 | ) # Path where the extracted answers will be saved (1 file per unique video) 71 | qas_dir = os.path.join( 72 | SSD_DIR, "qas" 73 | ) # Path where the generated question-answers will be saved (1 file per unique video) 74 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import torch as torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class Contrastive_Loss(torch.nn.Module): 6 | def __init__(self): 7 | super(Contrastive_Loss, self).__init__() 8 | self.ce_loss = torch.nn.CrossEntropyLoss() 9 | 10 | def forward(self, x, target): 11 | return self.ce_loss(x, target) 12 | 13 | 14 | class LogSoftmax(torch.nn.Module): 15 | def __init__(self, dim): 16 | super(LogSoftmax, self).__init__() 17 | self.dim = dim 18 | 19 | def forward(self, x, a): 20 | nll = -F.log_softmax(x, self.dim, _stacklevel=5) 21 | return (nll * a / a.sum(1, keepdim=True).clamp(min=1)).sum(dim=1).mean() 22 | 23 | 24 | class NCELoss(torch.nn.Module): 25 | def __init__(self, batch_size=4096): 26 | super(NCELoss, self).__init__() 27 | self.ce_loss = torch.nn.CrossEntropyLoss() 28 | 29 | def forward(self, x): 30 | batch_size = len(x) 31 | target = torch.arange(batch_size).cuda() 32 | x = torch.cat((x, x.t()), dim=1) 33 | return self.ce_loss(x, target) 34 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import numpy as np 5 | import random 6 | import os 7 | import 
os.path as osp 8 | import logging 9 | 10 | from transformers import get_cosine_schedule_with_warmup 11 | from args import get_args 12 | from model.CoVGT import VGT 13 | from loss import LogSoftmax 14 | from util import compute_a2v, load_model_by_key, save_to 15 | from dataloader.cvqa_loader import get_videoqa_loaders 16 | from train.train_covgt import train, eval 17 | 18 | 19 | 20 | def main(args): 21 | if not (os.path.isdir(args.save_dir)): 22 | os.mkdir(os.path.join(args.save_dir)) 23 | logging.basicConfig( 24 | level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s" 25 | ) 26 | logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") 27 | rootLogger = logging.getLogger() 28 | fileHandler = logging.FileHandler(os.path.join(args.save_dir, "stdout.log"), "w+") 29 | fileHandler.setFormatter(logFormatter) 30 | rootLogger.addHandler(fileHandler) 31 | logging.info(args) 32 | 33 | 34 | if args.lan == 'BERT': 35 | from transformers import BertTokenizer 36 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 37 | elif args.lan == 'RoBERTa': 38 | from transformers import RobertaTokenizerFast,RobertaTokenizer 39 | tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") 40 | 41 | 42 | a2id, id2a, a2v = None, None, None 43 | if not args.mc: 44 | a2id, id2a, a2v = compute_a2v( 45 | vocab_path=args.vocab_path, 46 | bert_tokenizer=tokenizer, 47 | amax_words=args.amax_words, 48 | ) 49 | logging.info(f"Length of Answer Vocabulary: {len(a2id)}") 50 | 51 | # Model 52 | model = VGT( 53 | tokenizer = tokenizer, 54 | feature_dim=args.feature_dim, 55 | word_dim=args.word_dim, 56 | N=args.n_layers, 57 | d_model=args.embd_dim, 58 | d_ff=args.ff_dim, 59 | h=args.n_heads, 60 | dropout=args.dropout, 61 | T=args.max_feats, 62 | Q=args.qmax_words, 63 | vocab_size = tokenizer.vocab_size, 64 | baseline=args.baseline, 65 | bnum=args.bnum, 66 | lan=args.lan 67 | ) 68 | model.cuda() 69 | logging.info("Using {} GPUs".format(torch.cuda.device_count())) 70 | 71 | # Load pretrain path 72 | model = nn.DataParallel(model) 73 | 74 | if args.pretrain_path != "": 75 | # model.load_state_dict(torch.load(args.pretrain_path)) 76 | model.load_state_dict(load_model_by_key(model, args.pretrain_path)) 77 | logging.info(f"Loaded checkpoint {args.pretrain_path}") 78 | logging.info( 79 | f"Nb of trainable params:{sum(p.numel() for p in model.parameters() if p.requires_grad)}" 80 | ) 81 | 82 | ( 83 | train_loader, 84 | val_loader, 85 | test_loader, 86 | ) = get_videoqa_loaders(args, args.features_path, a2id, tokenizer, test_mode = args.test) 87 | 88 | if args.test: 89 | logging.info("number of test instances: {}".format(len(test_loader.dataset))) 90 | else: 91 | logging.info("number of train instances: {}".format(len(train_loader.dataset))) 92 | logging.info("number of val instances: {}".format(len(val_loader.dataset))) 93 | 94 | 95 | criterion = nn.CrossEntropyLoss(ignore_index=-1) 96 | # criterion = MultipleChoiceLoss() 97 | params_for_optimization = list(p for p in model.parameters() if p.requires_grad) 98 | optimizer = optim.Adam( 99 | params_for_optimization, lr=args.lr, weight_decay=args.weight_decay 100 | ) 101 | criterion.cuda() 102 | 103 | # Training 104 | if not args.test: 105 | scheduler = get_cosine_schedule_with_warmup( 106 | optimizer, 0, len(train_loader) * args.epochs 107 | ) 108 | logging.info( 109 | f"Set cosine schedule with {len(train_loader) * args.epochs} iterations" 110 | ) 111 | if args.pretrain_path != "": 112 | val_acc, results = eval(model, val_loader, a2v, args, 
test=False, tokenizer=tokenizer) # zero-shot VideoQA 113 | save_path = osp.join(args.save_dir, 'val-res0.json') 114 | save_to (save_path, results) 115 | best_val_acc = 0 if args.pretrain_path == "" else val_acc 116 | best_epoch = 0 117 | for epoch in range(args.epochs): 118 | train(model, train_loader, a2v, optimizer, criterion, scheduler, epoch, args, tokenizer) 119 | val_acc, results = eval(model, val_loader, a2v, args, test=False, tokenizer=tokenizer) 120 | if val_acc > best_val_acc: 121 | best_val_acc = val_acc 122 | best_epoch = epoch 123 | torch.save( 124 | model.state_dict(), os.path.join(args.save_dir, "best_model.pth") 125 | ) 126 | save_path = osp.join(args.save_dir, 'val-res.json') 127 | save_to (save_path, results) 128 | if args.dataset == 'webvid': 129 | ep_file = os.path.join(args.save_dir, f"e{epoch}.pth") 130 | torch.save(model.state_dict(), ep_file) 131 | logging.info('Save to '+ep_file) 132 | logging.info(f"Best val model at epoch {best_epoch + 1}") 133 | else: 134 | # Evaluate on test set 135 | test_acc, results = eval(model, test_loader, a2v, args, test=True, tokenizer=tokenizer) 136 | save_path = osp.join(args.save_dir, 'test-res.json') 137 | save_to(save_path, results) 138 | 139 | 140 | if __name__ == "__main__": 141 | # set random seeds 142 | args = get_args() 143 | torch.backends.cudnn.enabled = False 144 | torch.cuda.manual_seed(args.seed) 145 | torch.manual_seed(args.seed) 146 | np.random.seed(args.seed) 147 | random.seed(args.seed) 148 | torch.backends.cudnn.benchmark = True 149 | 150 | main(args) 151 | -------------------------------------------------------------------------------- /misc/CoVGT-res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/misc/CoVGT-res.png -------------------------------------------------------------------------------- /misc/CoVGT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/misc/CoVGT.png -------------------------------------------------------------------------------- /model/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /model/EncoderVid.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Garena Online Private Limited 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
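# EncoderVid encodes per-frame object (RoI) features: the bounding-box coordinates are
# mapped to a positional embedding with 1x1 convolutions, concatenated with the RoI
# appearance features, and projected to the hidden size through a Linear + ELU layer.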
14 | 15 | import torch.nn as nn 16 | import torch 17 | 18 | class EncoderVid(nn.Module): 19 | def __init__(self, feat_dim, bbox_dim, feat_hidden, pos_hidden, input_dropout_p=0.3): 20 | 21 | super(EncoderVid, self).__init__() 22 | self.dim_feat = feat_dim 23 | self.dim_bbox = bbox_dim 24 | self.dim_hidden = feat_hidden 25 | self.input_dropout_p = input_dropout_p 26 | 27 | input_dim = feat_dim 28 | 29 | input_dim += pos_hidden 30 | self.bbox_conv = nn.Sequential( 31 | nn.Conv2d(self.dim_bbox, pos_hidden, kernel_size=1), 32 | nn.BatchNorm2d(pos_hidden), 33 | nn.ReLU(), 34 | nn.Conv2d(pos_hidden, pos_hidden, kernel_size=1), 35 | nn.BatchNorm2d(pos_hidden), 36 | nn.ReLU(), 37 | 38 | ) 39 | 40 | self.tohid = nn.Sequential( 41 | nn.Linear(feat_dim+pos_hidden, feat_hidden), 42 | nn.ELU(inplace=True)) 43 | 44 | # self.roi_conv = nn.Sequential( 45 | # nn.Conv1d(feat_dim, feat_hidden, kernel_size=3, padding=1), 46 | # nn.ELU(inplace=True) 47 | # ) 48 | 49 | # self.roi_conv = nn.Sequential( 50 | # nn.Conv2d(4, 4, kernel_size=1), 51 | # nn.BatchNorm2d(4), 52 | # nn.ReLU(), 53 | # ) 54 | 55 | 56 | def forward(self, video_o): 57 | 58 | bsize, numc, numf, numr, fdim = video_o.shape 59 | 60 | video_o = video_o.view(bsize, numc*numf, numr, fdim) 61 | roi_feat = video_o[:,:,:, :self.dim_feat] 62 | roi_bbox = video_o[:,:,:, self.dim_feat:(self.dim_feat+self.dim_bbox)] 63 | 64 | bbox_pos = self.bbox_conv(roi_bbox.permute( 65 | 0, 3, 1, 2)).permute(0, 2, 3, 1) 66 | 67 | bbox_features = torch.cat([roi_feat, bbox_pos], dim=-1) 68 | 69 | bbox_feat = self.tohid(bbox_features) 70 | 71 | return bbox_feat 72 | -------------------------------------------------------------------------------- /model/cmatt.py: -------------------------------------------------------------------------------- 1 | __author__ = "Jie Lei" 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | 8 | 9 | class CMAtten(nn.Module): 10 | 11 | def __init__(self): 12 | super(CMAtten, self).__init__() 13 | 14 | 15 | def similarity(self, s1, l1, s2, l2): 16 | """ 17 | :param s1: [B, t1, D] 18 | :param l1: [B] 19 | :param s2: [B, t2, D] 20 | :param l2: [B] 21 | :return: 22 | """ 23 | s = torch.bmm(s1, s2.transpose(1, 2)) 24 | 25 | # import ipdb; ipdb.set_trace() 26 | s_mask = s.data.new(*s.size()).fill_(1).bool() # [B, T1, T2] 27 | # Init similarity mask using lengths 28 | for i, (l_1, l_2) in enumerate(zip(l1, l2)): 29 | s_mask[i][:l_1, :l_2] = 0 30 | 31 | s_mask = Variable(s_mask) 32 | s.data.masked_fill_(s_mask.data, -float("inf")) 33 | return s 34 | 35 | @classmethod 36 | def get_u_tile(cls, s, s2): 37 | """ 38 | attended vectors of s2 for each word in s1, 39 | signify which words in s2 are most relevant to words in s1 40 | """ 41 | a_weight = F.softmax(s, dim=2) # [B, l1, l2] 42 | # remove nan from softmax on -inf 43 | # print(a_weight.shape, s2.shape) 44 | a_weight.data.masked_fill_(a_weight.data != a_weight.data, 0) 45 | # [B, l1, l2] * [B, l2, D] -> [B, l1, D] 46 | u_tile = torch.bmm(a_weight, s2) 47 | return u_tile, a_weight 48 | 49 | 50 | def forward(self, s1, l1, s2, l2): 51 | s = self.similarity(s1, l1, s2, l2) 52 | u_tile, a_weight = self.get_u_tile(s, s2) 53 | 54 | return u_tile, a_weight 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /model/graph.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torch.autograd 
import Variable 4 | import torch.nn.functional as F 5 | from torch.nn.parameter import Parameter 6 | import math 7 | 8 | class GraphConvolution(nn.Module): 9 | """ 10 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 11 | """ 12 | 13 | def __init__(self, in_features, out_features, bias=True, skip=True): 14 | super(GraphConvolution, self).__init__() 15 | self.skip = skip 16 | self.in_features = in_features 17 | self.out_features = out_features 18 | self.weight = Parameter(torch.Tensor(in_features, out_features)) 19 | if bias: 20 | self.bias = Parameter(torch.Tensor(out_features)) 21 | else: 22 | self.register_parameter('bias', None) 23 | self.reset_parameters() 24 | 25 | def reset_parameters(self): 26 | stdv = 1. / math.sqrt(self.weight.size(1)) 27 | self.weight.data.uniform_(-stdv, stdv) 28 | if self.bias is not None: 29 | self.bias.data.uniform_(-stdv, stdv) 30 | 31 | def forward(self, input, adj): 32 | # TODO make fc more efficient via "pack_padded_sequence" 33 | 34 | support = torch.bmm(input, self.weight.unsqueeze( 35 | 0).expand(input.shape[0], -1, -1)) 36 | output = torch.bmm(adj, support) 37 | #output = SparseMM(adj)(support) 38 | if self.bias is not None: 39 | output += self.bias.unsqueeze(0).expand(input.shape[0], -1, -1) 40 | if self.skip: 41 | output += support 42 | 43 | return output 44 | 45 | def __repr__(self): 46 | return self.__class__.__name__ + ' (' \ 47 | + str(self.in_features) + ' -> ' \ 48 | + str(self.out_features) + ')' 49 | 50 | 51 | class Graph(nn.Module): 52 | 53 | def __init__(self, dim_in, dim_hidden, dim_out, num_layers, dropout): 54 | super(Graph, self).__init__() 55 | self.fc_k = nn.Linear(dim_in, dim_hidden) 56 | self.fc_q = nn.Linear(dim_in, dim_hidden) 57 | 58 | dim_hidden = dim_out if num_layers == 1 else dim_hidden 59 | self.layers = nn.ModuleList([ 60 | GraphConvolution(dim_in, dim_hidden) 61 | ]) 62 | 63 | for i in range(num_layers - 1): 64 | dim_tmp = dim_out if i == num_layers-2 else dim_hidden 65 | self.layers.append(GraphConvolution(dim_hidden, dim_tmp)) 66 | 67 | self.dropout = dropout 68 | 69 | 70 | def build_graph(self, x): 71 | batch_size, s_len = x.shape[0], x.shape[1] 72 | emb_k = self.fc_k(x) 73 | emb_q = self.fc_q(x) 74 | length = torch.tensor([s_len] * batch_size, dtype=torch.long) 75 | 76 | s = torch.bmm(emb_k, emb_q.transpose(1, 2)) 77 | 78 | s_mask = s.data.new(*s.size()).fill_(1).bool() # [B, T1, T2] 79 | # Init similarity mask using lengths 80 | for i, (l_1, l_2) in enumerate(zip(length, length)): 81 | s_mask[i][:l_1, :l_2] = 0 82 | s_mask = Variable(s_mask) 83 | s.data.masked_fill_(s_mask.data, -float("inf")) 84 | 85 | A = s #F.softmax(s, dim=2) # [B, t1, t2] 86 | 87 | # remove nan from softmax on -inf 88 | A.data.masked_fill_(A.data != A.data, 0) 89 | 90 | return A 91 | 92 | def forward(self, X, A): 93 | for layer in self.layers: 94 | X = F.relu(layer(X, A)) 95 | X = F.dropout(X, self.dropout, training=self.training) 96 | return X 97 | -------------------------------------------------------------------------------- /model/language_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers.activations import gelu 5 | from model.cmatt import CMAtten 6 | 7 | class Bert(nn.Module): 8 | """ Finetuned *BERT module """ 9 | 10 | def __init__(self, tokenizer, lan='RoBERTa'): 11 | super(Bert, self).__init__() 12 | 13 | if lan == 'BERT': 14 | from transformers import BertTokenizer, BertModel, 
BertConfig 15 | config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True) 16 | self.bert = BertModel.from_pretrained("bert-base-uncased", config=config) 17 | elif lan == 'RoBERTa': 18 | from transformers import RobertaModel, RobertaConfig, RobertaTokenizerFast 19 | config = RobertaConfig.from_pretrained("roberta-base", output_hidden_states=True) 20 | self.bert = RobertaModel.from_pretrained("roberta-base", config=config) 21 | self.tokenizer = tokenizer 22 | 23 | # for name, param in self.bert.named_parameters(): 24 | # param.requires_grad = False 25 | 26 | def forward(self, tokens): 27 | attention_mask = (tokens != self.tokenizer.pad_token_id).float() 28 | outs = self.bert(tokens, attention_mask=attention_mask) 29 | embds = outs[0] 30 | return embds, outs[1][-2] 31 | 32 | 33 | class Sentence_Maxpool(nn.Module): 34 | """ Utilitary for the answer module """ 35 | 36 | def __init__(self, word_dimension, output_dim, relu=True): 37 | super(Sentence_Maxpool, self).__init__() 38 | self.fc = nn.Linear(word_dimension, output_dim) 39 | self.out_dim = output_dim 40 | self.relu = relu 41 | 42 | def forward(self, x_in): 43 | x = self.fc(x_in) 44 | x = torch.max(x, dim=1)[0] 45 | if self.relu: 46 | x = F.relu(x) 47 | return x 48 | 49 | 50 | class FFN(nn.Module): 51 | def __init__(self, word_dim, hidden_dim, out_dim, dropout=0.3): 52 | super().__init__() 53 | activation = "gelu" 54 | self.dropout = nn.Dropout(p=dropout) 55 | self.lin1 = nn.Linear(in_features=word_dim, out_features=hidden_dim) 56 | self.lin2 = nn.Linear(in_features=hidden_dim, out_features=out_dim) 57 | assert activation in [ 58 | "relu", 59 | "gelu", 60 | ], "activation ({}) must be in ['relu', 'gelu']".format(activation) 61 | self.activation = gelu if activation == "gelu" else nn.ReLU() 62 | 63 | def forward(self, input): 64 | x = self.lin1(input) 65 | x = self.activation(x) 66 | x = self.lin2(x) 67 | x = self.dropout(x) 68 | return x 69 | 70 | class AModel(nn.Module): 71 | """ 72 | Answer embedding module 73 | """ 74 | 75 | def __init__(self, tokenizer, lan='RoBERTa', word_dim=768, out_dim=512): 76 | super(AModel, self).__init__() 77 | self.bert = Bert(tokenizer, lan=lan) 78 | self.linear_text = nn.Linear(word_dim, out_dim) 79 | 80 | # self.linear_text = FFN(word_dim, out_dim, out_dim) 81 | 82 | def forward(self, answer): 83 | 84 | if len(answer.shape) == 3: 85 | #multi-choice 86 | bs, nans, lans = answer.shape 87 | answer = answer.view(bs * nans, lans) 88 | answer, hd_state = self.bert(answer) 89 | answer = self.linear_text(answer) 90 | answer_g = answer.mean(dim=1) 91 | # answer_g = answer[:, 0, :] 92 | answer_g = answer_g.view(bs, nans, -1) 93 | else: 94 | answer, hd_state = self.bert(answer) 95 | answer = self.linear_text(answer) 96 | answer_g = answer.mean(dim=1) 97 | # answer_g = answer[:, 0, :] 98 | 99 | return answer_g, answer 100 | 101 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py==3.9.0 2 | hostlist==1.4.8 3 | huggingface-hub==0.16.4 4 | numpy==1.22.0 5 | pandas==1.4.1 6 | Pillow==9.3.0 7 | python-dateutil==2.8.2 8 | PyYAML==6.0 9 | scikit-learn==1.0.2 10 | scipy==1.8.0 11 | sentencepiece==0.1.96 12 | tokenizers==0.11.6 13 | torch==1.8.1 14 | torchvision==0.9.1 15 | tqdm==4.63.1 16 | transformers==4.17.0 17 | -------------------------------------------------------------------------------- /shells/cvid_test.sh: 
-------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=causalvid \ 3 | --dataset=causalvid \ 4 | --mc=5 \ 5 | --bnum=10 \ 6 | --test=1 \ 7 | --qmax_words=0 \ 8 | --amax_words=38 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=8 \ 13 | --mlm_prob=0 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --lan="RoBERTa" \ 19 | --save_dir='./save_models/causalvid/CoVGT/' \ 20 | --pretrain_path='./save_models/causalvid/CoVGT/best_model.pth' 21 | -------------------------------------------------------------------------------- /shells/cvid_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=causalvid \ 3 | --dataset=causalvid \ 4 | --mc=5 \ 5 | --bnum=10 \ 6 | --epochs=20 \ 7 | --lr=0.00001 \ 8 | --qmax_words=0 \ 9 | --amax_words=38 \ 10 | --max_feats=32 \ 11 | --batch_size=64 \ 12 | --batch_size_val=64 \ 13 | --num_thread_reader=8 \ 14 | --mlm_prob=0 \ 15 | --n_layers=1 \ 16 | --embd_dim=512 \ 17 | --ff_dim=1024 \ 18 | --dropout=0.3 \ 19 | --seed=666 \ 20 | --cl_loss=0 \ 21 | --lan="RoBERTa" \ 22 | --save_dir='./save_models/causalvid/CoVGT/' \ 23 | --pretrain_path='./save_models/causalvid/CoVGT/best_model.pth' -------------------------------------------------------------------------------- /shells/msrvtt_test.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=msrvtt \ 3 | --dataset=msrvtt \ 4 | --mc=0 \ 5 | --bnum=10 \ 6 | --test=1 \ 7 | --qmax_words=20 \ 8 | --amax_words=5 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=8 \ 13 | --mlm_prob=0 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --save_dir='../data/save_models/msrvtt/180k_ft/' \ 19 | --pretrain_path='../data/save_models/msrvtt/180k_ft/best_model.pth' 20 | -------------------------------------------------------------------------------- /shells/msrvtt_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=msrvtt \ 3 | --dataset=msrvtt \ 4 | --mc=0 \ 5 | --bnum=10 \ 6 | --epochs=30 \ 7 | --lr=0.00001 \ 8 | --qmax_words=20 \ 9 | --amax_words=5 \ 10 | --max_feats=32 \ 11 | --batch_size=64 \ 12 | --batch_size_val=64 \ 13 | --num_thread_reader=8 \ 14 | --mlm_prob=0 \ 15 | --n_layers=1 \ 16 | --embd_dim=512 \ 17 | --ff_dim=1024 \ 18 | --dropout=0.3 \ 19 | --save_dir='../data/save_models/msrvtt/180k+_ft/' \ 20 | --seed=666 \ 21 | --pretrain_path='../data/save_models/msrvtt/180k+_ft/best_model.pth' 22 | -------------------------------------------------------------------------------- /shells/next_test.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=nextqa \ 3 | --dataset=nextqa \ 4 | --mc=5 \ 5 | --bnum=10 \ 6 | --test=1 \ 7 | --qmax_words=0 \ 8 | --amax_words=38 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=4 \ 13 | --mlm_prob=0 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --lan="RoBERTa" \ 19 | 
--save_dir='../data/save_models/nextqa/CoVGT_FTCoWV/' \ 20 | --pretrain_path='../data/save_models/nextqa/CoVGT_FTCoWV/best_model.pth' \ 21 | #--CM_PT=1 22 | -------------------------------------------------------------------------------- /shells/next_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=nextqa \ 3 | --dataset=nextqa \ 4 | --mc=5 \ 5 | --bnum=5 \ 6 | --epochs=20 \ 7 | --lr=0.00001 \ 8 | --qmax_words=30 \ 9 | --amax_words=38 \ 10 | --max_feats=32 \ 11 | --batch_size=64 \ 12 | --batch_size_val=64 \ 13 | --num_thread_reader=8 \ 14 | --mlm_prob=0 \ 15 | --cl_loss=1 \ 16 | --n_layers=1 \ 17 | --embd_dim=512 \ 18 | --ff_dim=1024 \ 19 | --dropout=0.3 \ 20 | --seed=666 \ 21 | --lan="RoBERTa" \ 22 | --save_dir='../data/save_models/nextqa/CoVGT/' \ 23 | #--pretrain_path=../data/save_models/webvid180K/co_e1.pth \ 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /shells/tgif_ftrain.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \ 3 | --dataset=tgifqa/frameqa \ 4 | --mc=0 \ 5 | --bnum=10 \ 6 | --epochs=30 \ 7 | --lr=0.00001 \ 8 | --qmax_words=20 \ 9 | --amax_words=5 \ 10 | --max_feats=32 \ 11 | --batch_size=64 \ 12 | --batch_size_val=64 \ 13 | --num_thread_reader=8 \ 14 | --mlm_prob=0 \ 15 | --n_layers=1 \ 16 | --embd_dim=512 \ 17 | --ff_dim=1024 \ 18 | --dropout=0.3 \ 19 | --save_dir='../data/save_models/tgifqa/frameqa/VGT/' \ 20 | --seed=666 \ 21 | # --pretrain_path='../data/save_models/webvid/180K/e1.pth' -------------------------------------------------------------------------------- /shells/tgif_test.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \ 3 | --dataset=tgifqa/transition \ 4 | --mc=5 \ 5 | --test=1 \ 6 | --qmax_words=0 \ 7 | --amax_words=20 \ 8 | --max_feats=32 \ 9 | --batch_size=64 \ 10 | --batch_size_val=64 \ 11 | --num_thread_reader=8 \ 12 | --mlm_prob=0 \ 13 | --n_layers=1 \ 14 | --embd_dim=512 \ 15 | --ff_dim=1024 \ 16 | --dropout=0.3 \ 17 | --save_dir='../data/save_models/tgifqa/transition/VGT/' \ 18 | --pretrain_path='../data/save_models/tgifqa/transition/VGT/best_model.pth' 19 | -------------------------------------------------------------------------------- /shells/tgif_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \ 3 | --dataset=tgifqa/action \ 4 | --mc=5 \ 5 | --epochs=30 \ 6 | --lr=0.00001 \ 7 | --qmax_words=0 \ 8 | --amax_words=20 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=4 \ 13 | --mlm_prob=0 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --save_dir='../data/save_models/tgifqa/action/VGT/' \ 19 | --seed=666 \ 20 | # --pretrain_path=../data/save_models/webvid/180K/e1.pth 21 | -------------------------------------------------------------------------------- /shells/webvid_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=webvid \ 3 | --dataset=webvid \ 4 | --mc=64 \ 5 | --epochs=3 \ 6 | --lr=0.00005 \ 7 | --qmax_words=0 \ 8 | 
--amax_words=20 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=16 \ 13 | --mlm_prob=0.15 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --save_dir='./save_models/webvid/025/' \ 19 | --seed=666 \ 20 | 21 | -------------------------------------------------------------------------------- /tools/__pycache__/object_align.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/tools/__pycache__/object_align.cpython-38.pyc -------------------------------------------------------------------------------- /tools/bbox_visualizer.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | 4 | def draw_rectangle(img, 5 | bbox, 6 | bbox_color=(255, 255, 255), 7 | thickness=3, 8 | is_opaque=False, 9 | alpha=0.5): 10 | """Draws the rectangle around the object 11 | 12 | Parameters 13 | ---------- 14 | img : ndarray 15 | the actual image 16 | bbox : list 17 | a list containing x_min, y_min, x_max and y_max of the rectangle positions 18 | bbox_color : tuple, optional 19 | the color of the box, by default (255,255,255) 20 | thickness : int, optional 21 | thickness of the outline of the box, by default 3 22 | is_opaque : bool, optional 23 | if False, draws a solid rectangular outline. Else, a filled rectangle which is semi transparent, by default False 24 | alpha : float, optional 25 | strength of the opacity, by default 0.5 26 | 27 | Returns 28 | ------- 29 | ndarray 30 | the image with the bounding box drawn 31 | """ 32 | 33 | output = img.copy() 34 | if not is_opaque: 35 | cv2.rectangle(output, (bbox[0], bbox[1]), (bbox[2], bbox[3]), 36 | bbox_color, thickness) 37 | else: 38 | overlay = img.copy() 39 | 40 | cv2.rectangle(overlay, (bbox[0], bbox[1]), (bbox[2], bbox[3]), 41 | bbox_color, -1) 42 | cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output) 43 | 44 | return output 45 | 46 | 47 | def add_label(img, 48 | label, 49 | bbox, 50 | draw_bg=True, 51 | text_bg_color=(255, 255, 255), 52 | text_color=(0, 0, 0), 53 | top=True): 54 | """adds label, inside or outside the rectangle 55 | 56 | Parameters 57 | ---------- 58 | img : ndarray 59 | the image on which the label is to be written, preferably the image with the rectangular bounding box drawn 60 | label : str 61 | the text (label) to be written 62 | bbox : list 63 | a list containing x_min, y_min, x_max and y_max of the rectangle positions 64 | draw_bg : bool, optional 65 | if True, draws the background of the text, else just the text is written, by default True 66 | text_bg_color : tuple, optional 67 | the background color of the label that is filled, by default (255, 255, 255) 68 | text_color : tuple, optional 69 | color of the text (label) to be written, by default (0, 0, 0) 70 | top : bool, optional 71 | if True, writes the label on top of the bounding box, else inside, by default True 72 | 73 | Returns 74 | ------- 75 | ndarray 76 | the image with the label written 77 | """ 78 | 79 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][0] 80 | 81 | if top: 82 | label_bg = [bbox[0], bbox[1], bbox[0] + text_width, bbox[1] - 30] 83 | if draw_bg: 84 | cv2.rectangle(img, (label_bg[0], label_bg[1]), 85 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1) 86 | cv2.putText(img, label, (bbox[0] + 5, bbox[1] - 5), 87 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2) 88 | 89 | 
else: 90 | label_bg = [bbox[0], bbox[1], bbox[0] + text_width, bbox[1] + 30] 91 | if draw_bg: 92 | cv2.rectangle(img, (label_bg[0], label_bg[1]), 93 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1) 94 | cv2.putText(img, label, (bbox[0] + 5, bbox[1] - 5 + 30), 95 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2) 96 | 97 | return img 98 | 99 | 100 | def add_T_label(img, 101 | label, 102 | bbox, 103 | draw_bg=True, 104 | text_bg_color=(255, 255, 255), 105 | text_color=(0, 0, 0)): 106 | """adds a T label to the rectangle, originating from the top of the rectangle 107 | 108 | Parameters 109 | ---------- 110 | img : ndarray 111 | the image on which the T label is to be written/drawn, preferably the image with the rectangular bounding box drawn 112 | label : str 113 | the text (label) to be written 114 | bbox : list 115 | a list containing x_min, y_min, x_max and y_max of the rectangle positions 116 | draw_bg : bool, optional 117 | if True, draws the background of the text, else just the text is written, by default True 118 | text_bg_color : tuple, optional 119 | the background color of the label that is filled, by default (255, 255, 255) 120 | text_color : tuple, optional 121 | color of the text (label) to be written, by default (0, 0, 0) 122 | 123 | Returns 124 | ------- 125 | ndarray 126 | the image with the T label drawn/written 127 | """ 128 | 129 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][0] 130 | text_height = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][1] 131 | 132 | # draw vertical line 133 | x_center = (bbox[0] + bbox[2]) // 2 134 | y_top = bbox[1] - 50 135 | cv2.line(img, (x_center, bbox[1]), (x_center, y_top), text_bg_color, 3) 136 | 137 | # draw rectangle with label 138 | y_bottom = y_top 139 | y_top = y_bottom - text_height - 5 140 | x_left = x_center - (text_width // 2) - 5 141 | x_right = x_center + (text_width // 2) + 5 142 | if draw_bg: 143 | cv2.rectangle(img, (x_left, y_top - 3), (x_right, y_bottom), 144 | text_bg_color, -1) 145 | cv2.putText(img, label, (x_left + 5, y_bottom - 7), 146 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2) 147 | 148 | return img 149 | 150 | 151 | def draw_flag_with_label(img, 152 | label, 153 | bbox, 154 | write_label=True, 155 | line_color=(255, 255, 255), 156 | text_bg_color=(255, 255, 255), 157 | text_color=(0, 0, 0)): 158 | """draws a pole from the middle of the object that is to be labeled and adds the label to the flag 159 | 160 | Parameters 161 | ---------- 162 | img : ndarray 163 | the image on which the flag is to be drawn 164 | label : str 165 | label that is written inside the flag 166 | bbox : list 167 | a list containing x_min, y_min, x_max and y_max of the rectangle positions 168 | write_label : bool, optional 169 | if True, writes the label, otherwise, it's just a vertical line, by default True 170 | line_color : tuple, optional 171 | the color of the pole of the flag, by default (255, 255, 255) 172 | text_bg_color : tuple, optional 173 | the background color of the label that is filled, by default (255, 255, 255) 174 | text_color : tuple, optional 175 | color of the text (label) to be written, by default (0, 0, 0) 176 | 177 | Returns 178 | ------- 179 | ndarray 180 | the image with flag drawn and the label written in the flag 181 | """ 182 | 183 | # draw vertical line 184 | 185 | x_center = (bbox[0] + bbox[2]) // 2 186 | y_bottom = int((bbox[1] * .75 + bbox[3] * .25)) 187 | y_top = bbox[1] - (y_bottom - bbox[1]) 188 | 189 | start_point = (x_center, y_top) 190 | end_point = (x_center, y_bottom) 
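    # The pole spans from a quarter of the box height below the top edge
    # (y_bottom, just inside the box) to the same distance above the top edge
    # (y_top), centred horizontally on the box.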
191 | 192 | cv2.line(img, start_point, end_point, line_color, 3) 193 | 194 | # write label 195 | 196 | if write_label: 197 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 198 | 2)[0][0] 199 | label_bg = [ 200 | start_point[0], start_point[1], start_point[0] + text_width, 201 | start_point[1] + 30 202 | ] 203 | cv2.rectangle(img, (label_bg[0], label_bg[1]), 204 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1) 205 | cv2.putText(img, label, (start_point[0] + 5, start_point[1] - 5 + 30), 206 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2) 207 | 208 | return img 209 | 210 | 211 | # THE FOLLOWING ARE OPTIONAL FUNCTIONS THAT CAN BE USED FOR DRAWING OR LABELLING MULTIPLE OBJECTS IN THE SAME 212 | # IMAGE. IN ORDER TO HAVE FULL CONTROL OF YOUR VISUALIZATIONS IT IS ADVISABLE TO USE THE ABOVE FUNCTIONS IN FOR LOOPS 213 | # INSTEAD OF THE FUNCTIONS BELOW 214 | 215 | 216 | def draw_multiple_rectangles(img, 217 | bboxes, 218 | bbox_color=(255, 255, 255), 219 | thickness=2, 220 | is_opaque=False, 221 | alpha=0.5): 222 | """draws multiple rectangles 223 | 224 | img : ndarray 225 | the actual image 226 | bboxes : list 227 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions 228 | bbox_color : tuple, optional 229 | the color of the boxes, by default (255,255,255) 230 | thickness : int, optional 231 | thickness of the outline of the boxes, by default 3 232 | is_opaque : bool, optional 233 | if False, draws solid rectangular outlines for rectangles. Else, filled rectangles which are semi transparent, by default False 234 | alpha : float, optional 235 | strength of the opacity, by default 0.5 236 | 237 | Returns 238 | ------- 239 | ndarray 240 | the image with the bounding boxes drawn 241 | """ 242 | 243 | for bid, bbox in enumerate(bboxes): 244 | img = draw_rectangle(img, bbox, bbox_color[bid], thickness, is_opaque, 245 | alpha) 246 | return img 247 | 248 | 249 | def add_multiple_labels(img, 250 | labels, 251 | bboxes, 252 | draw_bg=True, 253 | text_bg_color=(255, 255, 255), 254 | text_color=(0, 0, 0), 255 | top=True): 256 | """add labels, inside or outside the rectangles 257 | 258 | Parameters 259 | ---------- 260 | img : ndarray 261 | the image on which the labels are to be written, preferably the image with the rectangular bounding boxes drawn 262 | labels : list 263 | a list of string of the texts (labels) to be written 264 | bboxes : list 265 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions 266 | draw_bg : bool, optional 267 | if True, draws the background of the texts, else just the texts are written, by default True 268 | text_bg_color : tuple, optional 269 | the background color of the labels that are filled, by default (255, 255, 255) 270 | text_color : tuple, optional 271 | color of the texts (labels) to be written, by default (0, 0, 0) 272 | top : bool, optional 273 | if True, writes the labels on top of the bounding boxes, else inside, by default True 274 | 275 | Returns 276 | ------- 277 | ndarray 278 | the image with the labels written 279 | """ 280 | 281 | for label, bbox in zip(labels, bboxes): 282 | img = add_label(img, label, bbox, draw_bg, text_bg_color, text_color, 283 | top) 284 | 285 | return img 286 | 287 | 288 | def add_multiple_T_labels(img, 289 | labels, 290 | bboxes, 291 | draw_bg=True, 292 | text_bg_color=(255, 255, 255), 293 | text_color=(0, 0, 0)): 294 | """adds T labels to the rectangles, each originating from the top of the rectangle 295 | 296 | 
Parameters 297 | ---------- 298 | img : ndarray 299 | the image on which the T labels are to be written/drawn, preferably the image with the rectangular bounding boxes drawn 300 | labels : list 301 | the texts (labels) to be written 302 | bboxes : list 303 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions 304 | draw_bg : bool, optional 305 | if True, draws the background of the texts, else just the texts are written, by default True 306 | text_bg_color : tuple, optional 307 | the background color of the labels that are filled, by default (255, 255, 255) 308 | text_color : tuple, optional 309 | color of the texts (labels) to be written, by default (0, 0, 0) 310 | 311 | Returns 312 | ------- 313 | ndarray 314 | the image with the T labels drawn/written 315 | """ 316 | 317 | for label, bbox in zip(labels, bboxes): 318 | add_T_label(img, label, bbox, draw_bg, text_bg_color, text_color) 319 | 320 | return img 321 | 322 | 323 | def draw_multiple_flags_with_labels(img, 324 | labels, 325 | bboxes, 326 | write_label=True, 327 | line_color=(255, 255, 255), 328 | text_bg_color=(255, 255, 255), 329 | text_color=(0, 0, 0)): 330 | """draws poles from the middle of the objects that are to be labeled and adds the labels to the flags 331 | 332 | Parameters 333 | ---------- 334 | img : ndarray 335 | the image on which the flags are to be drawn 336 | labels : list 337 | labels that are written inside the flags 338 | bbox : list 339 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions 340 | write_label : bool, optional 341 | if True, writes the labels, otherwise, it's just a vertical line for each object, by default True 342 | line_color : tuple, optional 343 | the color of the pole of the flags, by default (255, 255, 255) 344 | text_bg_color : tuple, optional 345 | the background color of the labels that are filled, by default (255, 255, 255) 346 | text_color : tuple, optional 347 | color of the texts (labels) to be written, by default (0, 0, 0) 348 | 349 | Returns 350 | ------- 351 | ndarray 352 | the image with flags drawn and the labels written in the flags 353 | """ 354 | 355 | for label, bbox in zip(labels, bboxes): 356 | img = draw_flag_with_label(img, label, bbox, write_label, line_color, 357 | text_bg_color, text_color) 358 | return img 359 | -------------------------------------------------------------------------------- /tools/colors.txt: -------------------------------------------------------------------------------- 1 | 255 0 0 2 | 255 255 0 3 | 0 255 0 4 | 255 153 18 5 | 0 255 255 6 | 63 211 144 7 | 240 141 163 8 | 149 139 206 9 | 166 31 247 10 | 210 148 204 11 | 196 142 86 12 | 138 48 98 13 | 85 16 165 14 | 84 103 158 15 | 186 202 87 16 | 149 52 56 17 | 169 184 132 18 | 156 176 226 19 | 233 214 139 20 | 35 124 145 21 | 10 116 109 22 | 89 231 101 23 | 198 145 242 24 | 113 43 121 25 | 49 61 103 26 | 196 239 149 27 | 227 80 71 28 | 70 3 76 29 | 143 43 181 30 | 159 31 2 31 | 171 53 200 32 | 233 49 105 33 | 75 127 208 34 | 221 246 66 35 | 238 11 216 36 | 101 36 178 37 | 198 5 97 38 | 42 179 23 39 | 124 62 186 40 | 25 90 250 41 | 180 50 78 42 | 40 107 146 43 | 147 80 68 44 | 110 147 182 45 | 141 199 99 46 | 183 74 21 47 | 6 157 170 48 | 133 168 215 49 | 18 51 5 50 | 136 196 212 51 | 224 237 188 52 | 172 61 214 53 | -------------------------------------------------------------------------------- /tools/datautils/msrvtt_qa.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from datautils import utils 3 | import nltk 4 | from collections import Counter 5 | 6 | import pickle 7 | import numpy as np 8 | 9 | 10 | def load_video_paths(args): 11 | ''' Load a list of (path,image_id tuples).''' 12 | video_paths = [] 13 | modes = ['train', 'val', 'test'] 14 | for mode in modes: 15 | with open(args.annotation_file.format(mode), 'r') as anno_file: 16 | instances = json.load(anno_file) 17 | video_ids = [instance['video_id'] for instance in instances] 18 | video_ids = set(video_ids) 19 | if mode in ['train', 'val']: 20 | for video_id in video_ids: 21 | video_paths.append((args.video_dir + 'videos/video{}.mp4'.format(video_id), video_id)) 22 | else: 23 | for video_id in video_ids: 24 | video_paths.append((args.video_dir + 'videos/video{}.mp4'.format(video_id), video_id)) 25 | 26 | return video_paths 27 | 28 | 29 | def process_questions(args): 30 | ''' Encode question tokens''' 31 | print('Loading dataset') 32 | with open(args.annotation_file, 'r') as dataset_file: 33 | instances = json.load(dataset_file) 34 | 35 | # Either create the vocab or load it from disk 36 | if args.mode in ['train']: 37 | print('Building vocab') 38 | answer_cnt = {} 39 | for instance in instances: 40 | answer = instance['answer'] 41 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 42 | 43 | answer_token_to_idx = {'': 0, '': 1} 44 | answer_counter = Counter(answer_cnt) 45 | frequent_answers = answer_counter.most_common(args.answer_top) 46 | total_ans = sum(item[1] for item in answer_counter.items()) 47 | total_freq_ans = sum(item[1] for item in frequent_answers) 48 | print("Number of unique answers:", len(answer_counter)) 49 | print("Total number of answers:", total_ans) 50 | print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans)) 51 | 52 | for token, cnt in Counter(answer_cnt).most_common(args.answer_top): 53 | answer_token_to_idx[token] = len(answer_token_to_idx) 54 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 55 | 56 | question_token_to_idx = {'': 0, '': 1} 57 | for i, instance in enumerate(instances): 58 | question = instance['question'].lower()[:-1] 59 | for token in nltk.word_tokenize(question): 60 | if token not in question_token_to_idx: 61 | question_token_to_idx[token] = len(question_token_to_idx) 62 | print('Get question_token_to_idx') 63 | print(len(question_token_to_idx)) 64 | 65 | vocab = { 66 | 'question_token_to_idx': question_token_to_idx, 67 | 'answer_token_to_idx': answer_token_to_idx, 68 | 'question_answer_token_to_idx': {'': 0, '': 1} 69 | } 70 | 71 | print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset)) 72 | with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f: 73 | json.dump(vocab, f, indent=4) 74 | else: 75 | print('Loading vocab') 76 | with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f: 77 | vocab = json.load(f) 78 | 79 | # Encode all questions 80 | print('Encoding dataset') 81 | questions_encoded = [] 82 | questions_len = [] 83 | question_ids = [] 84 | video_ids_tbw = [] 85 | video_names_tbw = [] 86 | all_answers = [] 87 | for idx, instance in enumerate(instances): 88 | question = instance['question'].lower()[:-1] 89 | question_tokens = nltk.word_tokenize(question) 90 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) 91 | questions_encoded.append(question_encoded) 92 | 
questions_len.append(len(question_encoded)) 93 | question_ids.append(idx) 94 | im_name = instance['video_id'] 95 | video_ids_tbw.append(im_name) 96 | video_names_tbw.append(im_name) 97 | 98 | if instance['answer'] in vocab['answer_token_to_idx']: 99 | answer = vocab['answer_token_to_idx'][instance['answer']] 100 | elif args.mode in ['train']: 101 | answer = 0 102 | elif args.mode in ['val', 'test']: 103 | answer = 1 104 | 105 | all_answers.append(answer) 106 | max_question_length = max(len(x) for x in questions_encoded) 107 | for qe in questions_encoded: 108 | while len(qe) < max_question_length: 109 | qe.append(vocab['question_token_to_idx']['']) 110 | 111 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 112 | questions_len = np.asarray(questions_len, dtype=np.int32) 113 | print(questions_encoded.shape) 114 | 115 | glove_matrix = None 116 | if args.mode == 'train': 117 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} 118 | print("Load glove from %s" % args.glove_pt) 119 | glove = pickle.load(open(args.glove_pt, 'rb')) 120 | dim_word = glove['the'].shape[0] 121 | glove_matrix = [] 122 | for i in range(len(token_itow)): 123 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 124 | glove_matrix.append(vector) 125 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 126 | print(glove_matrix.shape) 127 | 128 | print('Writing', args.output_pt.format(args.dataset, args.dataset, args.mode)) 129 | obj = { 130 | 'questions': questions_encoded, 131 | 'questions_len': questions_len, 132 | 'question_id': question_ids, 133 | 'video_ids': np.asarray(video_ids_tbw), 134 | 'video_names': np.array(video_names_tbw), 135 | 'answers': all_answers, 136 | 'glove': glove_matrix, 137 | } 138 | with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f: 139 | pickle.dump(obj, f) 140 | -------------------------------------------------------------------------------- /tools/datautils/msvd_qa.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datautils import utils 3 | import nltk 4 | from collections import Counter 5 | 6 | import pickle 7 | import numpy as np 8 | 9 | 10 | def load_video_paths(args): 11 | ''' Load a list of (path,image_id tuples).''' 12 | video_paths = [] 13 | video_ids = [] 14 | modes = ['train', 'val', 'test'] 15 | for mode in modes: 16 | with open(args.annotation_file.format(mode), 'r') as anno_file: 17 | instances = json.load(anno_file) 18 | [video_ids.append(instance['video_id']) for instance in instances] 19 | video_ids = set(video_ids) 20 | with open(args.video_name_mapping, 'r') as mapping: 21 | mapping_pairs = mapping.read().split('\n') 22 | mapping_dict = {} 23 | for idx in range(len(mapping_pairs)): 24 | cur_pair = mapping_pairs[idx].split(' ') 25 | mapping_dict[cur_pair[1]] = cur_pair[0] 26 | for video_id in video_ids: 27 | video_paths.append((args.video_dir + 'YouTubeClips/{}.avi'.format(mapping_dict['vid' + str(video_id)]), video_id)) 28 | return video_paths 29 | 30 | 31 | def process_questions(args): 32 | ''' Encode question tokens''' 33 | print('Loading dataset') 34 | with open(args.annotation_file, 'r') as dataset_file: 35 | instances = json.load(dataset_file) 36 | 37 | # Either create the vocab or load it from disk 38 | if args.mode in ['train']: 39 | print('Building vocab') 40 | answer_cnt = {} 41 | for instance in instances: 42 | answer = instance['answer'] 43 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 44 | 45 | answer_token_to_idx = 
{'': 0, '': 1} 46 | answer_counter = Counter(answer_cnt) 47 | frequent_answers = answer_counter.most_common(args.answer_top) 48 | total_ans = sum(item[1] for item in answer_counter.items()) 49 | total_freq_ans = sum(item[1] for item in frequent_answers) 50 | print("Number of unique answers:", len(answer_counter)) 51 | print("Total number of answers:", total_ans) 52 | print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans)) 53 | 54 | for token, cnt in Counter(answer_cnt).most_common(args.answer_top): 55 | answer_token_to_idx[token] = len(answer_token_to_idx) 56 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 57 | 58 | question_token_to_idx = {'': 0, '': 1} 59 | for i, instance in enumerate(instances): 60 | question = instance['question'].lower()[:-1] 61 | for token in nltk.word_tokenize(question): 62 | if token not in question_token_to_idx: 63 | question_token_to_idx[token] = len(question_token_to_idx) 64 | print('Get question_token_to_idx') 65 | print(len(question_token_to_idx)) 66 | 67 | vocab = { 68 | 'question_token_to_idx': question_token_to_idx, 69 | 'answer_token_to_idx': answer_token_to_idx, 70 | 'question_answer_token_to_idx': {'': 0, '': 1} 71 | } 72 | 73 | print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset)) 74 | with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f: 75 | json.dump(vocab, f, indent=4) 76 | else: 77 | print('Loading vocab') 78 | with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f: 79 | vocab = json.load(f) 80 | 81 | # Encode all questions 82 | print('Encoding dataset') 83 | questions_encoded = [] 84 | questions_len = [] 85 | question_ids = [] 86 | video_ids_tbw = [] 87 | video_names_tbw = [] 88 | all_answers = [] 89 | for idx, instance in enumerate(instances): 90 | question = instance['question'].lower()[:-1] 91 | question_tokens = nltk.word_tokenize(question) 92 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) 93 | questions_encoded.append(question_encoded) 94 | questions_len.append(len(question_encoded)) 95 | question_ids.append(idx) 96 | im_name = instance['video_id'] 97 | video_ids_tbw.append(im_name) 98 | video_names_tbw.append(im_name) 99 | 100 | if instance['answer'] in vocab['answer_token_to_idx']: 101 | answer = vocab['answer_token_to_idx'][instance['answer']] 102 | elif args.mode in ['train']: 103 | answer = 0 104 | elif args.mode in ['val', 'test']: 105 | answer = 1 106 | 107 | all_answers.append(answer) 108 | max_question_length = max(len(x) for x in questions_encoded) 109 | for qe in questions_encoded: 110 | while len(qe) < max_question_length: 111 | qe.append(vocab['question_token_to_idx']['']) 112 | 113 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 114 | questions_len = np.asarray(questions_len, dtype=np.int32) 115 | print(questions_encoded.shape) 116 | 117 | glove_matrix = None 118 | if args.mode == 'train': 119 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} 120 | print("Load glove from %s" % args.glove_pt) 121 | glove = pickle.load(open(args.glove_pt, 'rb')) 122 | dim_word = glove['the'].shape[0] 123 | glove_matrix = [] 124 | for i in range(len(token_itow)): 125 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 126 | glove_matrix.append(vector) 127 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 128 | print(glove_matrix.shape) 129 | 130 | print('Writing', args.output_pt.format(args.dataset, 
args.dataset, args.mode)) 131 | obj = { 132 | 'questions': questions_encoded, 133 | 'questions_len': questions_len, 134 | 'question_id': question_ids, 135 | 'video_ids': np.asarray(video_ids_tbw), 136 | 'video_names': np.array(video_names_tbw), 137 | 'answers': all_answers, 138 | 'glove': glove_matrix, 139 | } 140 | with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f: 141 | pickle.dump(obj, f) 142 | -------------------------------------------------------------------------------- /tools/datautils/nextqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import json 4 | from datautils import utils 5 | import nltk 6 | import os.path as osp 7 | import pickle 8 | import numpy as np 9 | 10 | 11 | def load_video_paths(args): 12 | ''' Load a list of (path,image_id tuples).''' 13 | input_paths = [] 14 | annotation = pd.read_csv(args.annotation_file.format(args.question_type), delimiter='\t') 15 | gif_names = list(annotation['gif_name']) 16 | keys = list(annotation['key']) 17 | print("Number of questions: {}".format(len(gif_names))) 18 | for idx, gif in enumerate(gif_names): 19 | gif_abs_path = os.path.join(args.video_dir, ''.join([gif, '.gif'])) 20 | input_paths.append((gif_abs_path, keys[idx])) 21 | input_paths = list(set(input_paths)) 22 | print("Number of unique videos: {}".format(len(input_paths))) 23 | 24 | return input_paths 25 | 26 | 27 | def openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='train'): 28 | ''' Encode question tokens''' 29 | print('Encoding dataset') 30 | questions_encoded = [] 31 | questions_len = [] 32 | video_ids_tbw = [] 33 | video_names_tbw = [] 34 | all_answers = [] 35 | question_ids = [] 36 | for idx, question in enumerate(questions): 37 | question = question.lower()[:-1] 38 | question_tokens = nltk.word_tokenize(question) 39 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) 40 | questions_encoded.append(question_encoded) 41 | questions_len.append(len(question_encoded)) 42 | question_ids.append(idx) 43 | video_names_tbw.append(video_names[idx]) 44 | video_ids_tbw.append(video_ids[idx]) 45 | 46 | if args.question_type == "frameqa": 47 | answer = answers[idx] 48 | if answer in vocab['answer_token_to_idx']: 49 | answer = vocab['answer_token_to_idx'][answer] 50 | elif mode in ['train']: 51 | answer = 0 52 | elif mode in ['val', 'test']: 53 | answer = 1 54 | else: 55 | answer = max(int(answers[idx]), 1) 56 | all_answers.append(answer) 57 | 58 | # Pad encoded questions 59 | max_question_length = max(len(x) for x in questions_encoded) 60 | for qe in questions_encoded: 61 | while len(qe) < max_question_length: 62 | qe.append(vocab['question_token_to_idx']['']) 63 | 64 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 65 | questions_len = np.asarray(questions_len, dtype=np.int32) 66 | print(questions_encoded.shape) 67 | 68 | glove_matrix = None 69 | if mode == 'train': 70 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} 71 | print("Load glove from %s" % args.glove_pt) 72 | glove = pickle.load(open(args.glove_pt, 'rb')) 73 | dim_word = glove['the'].shape[0] 74 | glove_matrix = [] 75 | for i in range(len(token_itow)): 76 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 77 | glove_matrix.append(vector) 78 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 79 | print(glove_matrix.shape) 80 | 81 | print('Writing ', 
args.output_pt.format(args.question_type, args.question_type, mode)) 82 | obj = { 83 | 'questions': questions_encoded, 84 | 'questions_len': questions_len, 85 | 'question_id': question_ids, 86 | 'video_ids': np.asarray(video_ids_tbw), 87 | 'video_names': np.array(video_names_tbw), 88 | 'answers': all_answers, 89 | 'glove': glove_matrix, 90 | } 91 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f: 92 | pickle.dump(obj, f) 93 | 94 | def multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers, ans_candidates, mode='train'): 95 | # Encode all questions 96 | print('Encoding dataset') 97 | questions_encoded = [] 98 | questions_len = [] 99 | question_ids = qns_ids 100 | all_answer_cands_encoded = [] 101 | all_answer_cands_len = [] 102 | video_ids_tbw = [] 103 | video_names_tbw = [] 104 | correct_answers = [] 105 | for idx, question in enumerate(questions): 106 | 107 | question = question.lower() 108 | question_tokens = nltk.word_tokenize(question) 109 | question_encoded = utils.encode(question_tokens, vocab['question_answer_token_to_idx'], allow_unk=True) 110 | questions_encoded.append(question_encoded) 111 | questions_len.append(len(question_encoded)) 112 | # question_ids.append(idx) 113 | video_names_tbw.append(video_names[idx]) 114 | video_ids_tbw.append(video_ids[idx]) 115 | # grounthtruth 116 | answer = int(answers[idx]) 117 | correct_answers.append(answer) 118 | # answer candidates 119 | candidates = ans_candidates[idx] 120 | candidates_encoded = [] 121 | candidates_len = [] 122 | for ans in candidates: 123 | 124 | ans = ans.lower() 125 | ans_tokens = nltk.word_tokenize(ans) 126 | cand_encoded = utils.encode(ans_tokens, vocab['question_answer_token_to_idx'], allow_unk=True) 127 | candidates_encoded.append(cand_encoded) 128 | candidates_len.append(len(cand_encoded)) 129 | all_answer_cands_encoded.append(candidates_encoded) 130 | all_answer_cands_len.append(candidates_len) 131 | 132 | # Pad encoded questions 133 | max_question_length = max(len(x) for x in questions_encoded) 134 | for qe in questions_encoded: 135 | while len(qe) < max_question_length: 136 | qe.append(vocab['question_answer_token_to_idx']['']) 137 | 138 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 139 | questions_len = np.asarray(questions_len, dtype=np.int32) 140 | print(questions_encoded.shape) 141 | 142 | # Pad encoded answer candidates 143 | max_answer_cand_length = max(max(len(x) for x in candidate) for candidate in all_answer_cands_encoded) 144 | for ans_cands in all_answer_cands_encoded: 145 | for ans in ans_cands: 146 | while len(ans) < max_answer_cand_length: 147 | ans.append(vocab['question_answer_token_to_idx']['']) 148 | all_answer_cands_encoded = np.asarray(all_answer_cands_encoded, dtype=np.int32) 149 | all_answer_cands_len = np.asarray(all_answer_cands_len, dtype=np.int32) 150 | print(all_answer_cands_encoded.shape) 151 | 152 | glove_matrix = None 153 | # if mode in ['train']: 154 | # token_itow = {i: w for w, i in vocab['question_answer_token_to_idx'].items()} 155 | # print("Load glove from %s" % args.glove_pt) 156 | # glove = pickle.load(open(args.glove_pt, 'rb')) 157 | # dim_word = glove['the'].shape[0] 158 | # glove_matrix = [] 159 | # for i in range(len(token_itow)): 160 | # vector = glove.get(token_itow[i], np.zeros((dim_word,))) 161 | # glove_matrix.append(vector) 162 | # glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 163 | # print(glove_matrix.shape) 164 | 165 | print('Writing ', 
args.output_pt.format(mode)) 166 | obj = { 167 | 'questions': questions_encoded, 168 | 'questions_len': questions_len, 169 | 'question_id': question_ids, 170 | 'video_ids': np.asarray(video_ids_tbw), 171 | 'video_names': np.array(video_names_tbw), 172 | 'ans_candidates': all_answer_cands_encoded, 173 | 'ans_candidates_len': all_answer_cands_len, 174 | 'answers': correct_answers, 175 | 'glove': glove_matrix, 176 | } 177 | with open(args.output_pt.format(mode), 'wb') as f: 178 | pickle.dump(obj, f) 179 | 180 | def process_questions_openended(args): 181 | print('Loading dataset') 182 | if args.mode in ["train"]: 183 | csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t') 184 | else: 185 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t') 186 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))] 187 | questions = list(csv_data['question']) 188 | answers = list(csv_data['answer']) 189 | video_names = list(csv_data['gif_name']) 190 | video_ids = list(csv_data['key']) 191 | 192 | print('number of questions: %s' % len(questions)) 193 | # Either create the vocab or load it from disk 194 | if args.mode in ['train']: 195 | print('Building vocab') 196 | answer_cnt = {} 197 | 198 | if args.question_type == "frameqa": 199 | for i, answer in enumerate(answers): 200 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 201 | 202 | answer_token_to_idx = {'': 0} 203 | for token in answer_cnt: 204 | answer_token_to_idx[token] = len(answer_token_to_idx) 205 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 206 | elif args.question_type == 'count': 207 | answer_token_to_idx = {'': 0} 208 | 209 | question_token_to_idx = {'': 0, '': 1} 210 | for i, q in enumerate(questions): 211 | question = q.lower()[:-1] 212 | for token in nltk.word_tokenize(question): 213 | if token not in question_token_to_idx: 214 | question_token_to_idx[token] = len(question_token_to_idx) 215 | print('Get question_token_to_idx') 216 | print(len(question_token_to_idx)) 217 | 218 | vocab = { 219 | 'question_token_to_idx': question_token_to_idx, 220 | 'answer_token_to_idx': answer_token_to_idx, 221 | 'question_answer_token_to_idx': {'': 0, '': 1} 222 | } 223 | 224 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type)) 225 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f: 226 | json.dump(vocab, f, indent=4) 227 | 228 | # split 10% of questions for evaluation 229 | split = int(0.9 * len(questions)) 230 | train_questions = questions[:split] 231 | train_answers = answers[:split] 232 | train_video_names = video_names[:split] 233 | train_video_ids = video_ids[:split] 234 | 235 | val_questions = questions[split:] 236 | val_answers = answers[split:] 237 | val_video_names = video_names[split:] 238 | val_video_ids = video_ids[split:] 239 | 240 | openeded_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, mode='train') 241 | openeded_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, mode='val') 242 | else: 243 | print('Loading vocab') 244 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f: 245 | vocab = json.load(f) 246 | openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='test') 247 | 248 | 249 | def process_questions_mulchoices(args): 250 | print('Loading dataset') 251 | # if args.mode in ["train", "val"]: 252 | # 
csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter=',') 253 | # else: 254 | # csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter=',') 255 | 256 | if args.mode == 'all': 257 | csv_data = pd.read_csv(args.annotation_file.format(args.mode), delimiter=',').astype('string') 258 | else: 259 | csv_data = pd.read_csv(args.annotation_file.format(args.mode), delimiter=',').astype('string') 260 | 261 | # csv_data = csv_data.iloc[np.random.permutation(len(csv_data))] 262 | questions = list(csv_data['question']) 263 | answers = list(csv_data['answer']) 264 | video_names = list(csv_data['video']) 265 | video_ids = list(csv_data['video']) 266 | qns_ids = list(csv_data['qid']) 267 | qns_ids = [vname+'_'+qid for vname, qid in zip(video_names, qns_ids)] 268 | ans_candidates = np.asarray([csv_data['a0'], csv_data['a1'], csv_data['a2'], csv_data['a3'], csv_data['a4']]) 269 | ans_candidates = ans_candidates.transpose() 270 | print(ans_candidates.shape) 271 | # ans_candidates: (num_ques, 5) 272 | print('number of questions: %s' % len(questions)) 273 | # Either create the vocab or load it from disk 274 | #if args.mode in ['train']: 275 | if not osp.exists(args.vocab_json.format('train')): 276 | print('Building vocab') 277 | answer_token_to_idx = {'': 0, '': 1} 278 | question_answer_token_to_idx = {'': 0, '': 1} 279 | for candidates in ans_candidates: 280 | #print(candidates) 281 | for ans in candidates: 282 | if type(ans) != 'str': continue 283 | ans = ans.lower() 284 | for token in nltk.word_tokenize(ans): 285 | if token not in answer_token_to_idx: 286 | answer_token_to_idx[token] = len(answer_token_to_idx) 287 | if token not in question_answer_token_to_idx: 288 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx) 289 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 290 | 291 | question_token_to_idx = {'': 0, '': 1} 292 | for i, q in enumerate(questions): 293 | question = str(q).lower()[:-1] 294 | for token in nltk.word_tokenize(question): 295 | if token not in question_token_to_idx: 296 | question_token_to_idx[token] = len(question_token_to_idx) 297 | if token not in question_answer_token_to_idx: 298 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx) 299 | 300 | print('Get question_token_to_idx') 301 | print(len(question_token_to_idx)) 302 | print('Get question_answer_token_to_idx') 303 | print(len(question_answer_token_to_idx)) 304 | 305 | vocab = { 306 | 'question_token_to_idx': question_token_to_idx, 307 | 'answer_token_to_idx': answer_token_to_idx, 308 | 'question_answer_token_to_idx': question_answer_token_to_idx, 309 | } 310 | 311 | print('Write into %s' % args.vocab_json.format(args.mode)) 312 | with open(args.vocab_json.format(args.mode), 'w') as f: 313 | json.dump(vocab, f, indent=4) 314 | 315 | # split 10% of questions for evaluation 316 | # split = int(0.9 * len(questions)) 317 | # train_questions = questions[:split] 318 | # train_answers = answers[:split] 319 | # train_video_names = video_names[:split] 320 | # train_video_ids = video_ids[:split] 321 | # train_ans_candidates = ans_candidates[:split, :] 322 | # 323 | # val_questions = questions[split:] 324 | # val_answers = answers[split:] 325 | # val_video_names = video_names[split:] 326 | # val_video_ids = video_ids[split:] 327 | # val_ans_candidates = ans_candidates[split:, :] 328 | 329 | multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers, 330 | ans_candidates, 
mode='train') 331 | # multichoice_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, 332 | # val_ans_candidates, mode='val') 333 | else: 334 | print('Loading vocab') 335 | with open(args.vocab_json.format('train'), 'r') as f: 336 | vocab = json.load(f) 337 | multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers, 338 | ans_candidates, mode=args.mode) 339 | -------------------------------------------------------------------------------- /tools/datautils/tgif_qa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import json 4 | from datautils import utils 5 | import nltk 6 | 7 | import pickle 8 | import numpy as np 9 | 10 | 11 | def load_video_paths(args): 12 | ''' Load a list of (path,image_id tuples).''' 13 | input_paths = [] 14 | annotation = pd.read_csv(args.annotation_file.format(args.question_type), delimiter='\t') 15 | gif_names = list(annotation['gif_name']) 16 | keys = list(annotation['key']) 17 | print("Number of questions: {}".format(len(gif_names))) 18 | for idx, gif in enumerate(gif_names): 19 | gif_abs_path = os.path.join(args.video_dir, ''.join([gif, '.gif'])) 20 | input_paths.append((gif_abs_path, keys[idx])) 21 | input_paths = list(set(input_paths)) 22 | print("Number of unique videos: {}".format(len(input_paths))) 23 | 24 | return input_paths 25 | 26 | 27 | def openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='train'): 28 | ''' Encode question tokens''' 29 | print('Encoding dataset') 30 | questions_encoded = [] 31 | questions_len = [] 32 | video_ids_tbw = [] 33 | video_names_tbw = [] 34 | all_answers = [] 35 | question_ids = [] 36 | for idx, question in enumerate(questions): 37 | question = question.lower()[:-1] 38 | question_tokens = nltk.word_tokenize(question) 39 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) 40 | questions_encoded.append(question_encoded) 41 | questions_len.append(len(question_encoded)) 42 | question_ids.append(idx) 43 | video_names_tbw.append(video_names[idx]) 44 | video_ids_tbw.append(video_ids[idx]) 45 | 46 | if args.question_type == "frameqa": 47 | answer = answers[idx] 48 | if answer in vocab['answer_token_to_idx']: 49 | answer = vocab['answer_token_to_idx'][answer] 50 | elif mode in ['train']: 51 | answer = 0 52 | elif mode in ['val', 'test']: 53 | answer = 1 54 | else: 55 | answer = max(int(answers[idx]), 1) 56 | all_answers.append(answer) 57 | 58 | # Pad encoded questions 59 | max_question_length = max(len(x) for x in questions_encoded) 60 | for qe in questions_encoded: 61 | while len(qe) < max_question_length: 62 | qe.append(vocab['question_token_to_idx']['']) 63 | 64 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 65 | questions_len = np.asarray(questions_len, dtype=np.int32) 66 | print(questions_encoded.shape) 67 | 68 | glove_matrix = None 69 | if mode == 'train': 70 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} 71 | print("Load glove from %s" % args.glove_pt) 72 | glove = pickle.load(open(args.glove_pt, 'rb')) 73 | dim_word = glove['the'].shape[0] 74 | glove_matrix = [] 75 | for i in range(len(token_itow)): 76 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 77 | glove_matrix.append(vector) 78 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 79 | print(glove_matrix.shape) 80 | 81 | print('Writing ', args.output_pt.format(args.question_type, 
args.question_type, mode)) 82 | obj = { 83 | 'questions': questions_encoded, 84 | 'questions_len': questions_len, 85 | 'question_id': question_ids, 86 | 'video_ids': np.asarray(video_ids_tbw), 87 | 'video_names': np.array(video_names_tbw), 88 | 'answers': all_answers, 89 | 'glove': glove_matrix, 90 | } 91 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f: 92 | pickle.dump(obj, f) 93 | 94 | def multichoice_encoding_data(args, vocab, questions, video_names, video_ids, answers, ans_candidates, mode='train'): 95 | # Encode all questions 96 | print('Encoding dataset') 97 | questions_encoded = [] 98 | questions_len = [] 99 | question_ids = [] 100 | all_answer_cands_encoded = [] 101 | all_answer_cands_len = [] 102 | video_ids_tbw = [] 103 | video_names_tbw = [] 104 | correct_answers = [] 105 | for idx, question in enumerate(questions): 106 | question = question.lower()[:-1] 107 | question_tokens = nltk.word_tokenize(question) 108 | question_encoded = utils.encode(question_tokens, vocab['question_answer_token_to_idx'], allow_unk=True) 109 | questions_encoded.append(question_encoded) 110 | questions_len.append(len(question_encoded)) 111 | question_ids.append(idx) 112 | video_names_tbw.append(video_names[idx]) 113 | video_ids_tbw.append(video_ids[idx]) 114 | # grounthtruth 115 | answer = int(answers[idx]) 116 | correct_answers.append(answer) 117 | # answer candidates 118 | candidates = ans_candidates[idx] 119 | candidates_encoded = [] 120 | candidates_len = [] 121 | for ans in candidates: 122 | ans = ans.lower() 123 | ans_tokens = nltk.word_tokenize(ans) 124 | cand_encoded = utils.encode(ans_tokens, vocab['question_answer_token_to_idx'], allow_unk=True) 125 | candidates_encoded.append(cand_encoded) 126 | candidates_len.append(len(cand_encoded)) 127 | all_answer_cands_encoded.append(candidates_encoded) 128 | all_answer_cands_len.append(candidates_len) 129 | 130 | # Pad encoded questions 131 | max_question_length = max(len(x) for x in questions_encoded) 132 | for qe in questions_encoded: 133 | while len(qe) < max_question_length: 134 | qe.append(vocab['question_answer_token_to_idx']['']) 135 | 136 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 137 | questions_len = np.asarray(questions_len, dtype=np.int32) 138 | print(questions_encoded.shape) 139 | 140 | # Pad encoded answer candidates 141 | max_answer_cand_length = max(max(len(x) for x in candidate) for candidate in all_answer_cands_encoded) 142 | for ans_cands in all_answer_cands_encoded: 143 | for ans in ans_cands: 144 | while len(ans) < max_answer_cand_length: 145 | ans.append(vocab['question_answer_token_to_idx']['']) 146 | all_answer_cands_encoded = np.asarray(all_answer_cands_encoded, dtype=np.int32) 147 | all_answer_cands_len = np.asarray(all_answer_cands_len, dtype=np.int32) 148 | print(all_answer_cands_encoded.shape) 149 | 150 | glove_matrix = None 151 | if mode in ['train']: 152 | token_itow = {i: w for w, i in vocab['question_answer_token_to_idx'].items()} 153 | print("Load glove from %s" % args.glove_pt) 154 | glove = pickle.load(open(args.glove_pt, 'rb')) 155 | dim_word = glove['the'].shape[0] 156 | glove_matrix = [] 157 | for i in range(len(token_itow)): 158 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 159 | glove_matrix.append(vector) 160 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 161 | print(glove_matrix.shape) 162 | 163 | print('Writing ', args.output_pt.format(args.question_type, args.question_type, mode)) 164 | obj = { 165 | 'questions': 
questions_encoded, 166 | 'questions_len': questions_len, 167 | 'question_id': question_ids, 168 | 'video_ids': np.asarray(video_ids_tbw), 169 | 'video_names': np.array(video_names_tbw), 170 | 'ans_candidates': all_answer_cands_encoded, 171 | 'ans_candidates_len': all_answer_cands_len, 172 | 'answers': correct_answers, 173 | 'glove': glove_matrix, 174 | } 175 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f: 176 | pickle.dump(obj, f) 177 | 178 | def process_questions_openended(args): 179 | print('Loading dataset') 180 | if args.mode in ["train"]: 181 | csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t') 182 | else: 183 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t') 184 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))] 185 | questions = list(csv_data['question']) 186 | answers = list(csv_data['answer']) 187 | video_names = list(csv_data['gif_name']) 188 | video_ids = list(csv_data['key']) 189 | 190 | print('number of questions: %s' % len(questions)) 191 | # Either create the vocab or load it from disk 192 | if args.mode in ['train']: 193 | print('Building vocab') 194 | answer_cnt = {} 195 | 196 | if args.question_type == "frameqa": 197 | for i, answer in enumerate(answers): 198 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 199 | 200 | answer_token_to_idx = {'': 0} 201 | for token in answer_cnt: 202 | answer_token_to_idx[token] = len(answer_token_to_idx) 203 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 204 | elif args.question_type == 'count': 205 | answer_token_to_idx = {'': 0} 206 | 207 | question_token_to_idx = {'': 0, '': 1} 208 | for i, q in enumerate(questions): 209 | question = q.lower()[:-1] 210 | for token in nltk.word_tokenize(question): 211 | if token not in question_token_to_idx: 212 | question_token_to_idx[token] = len(question_token_to_idx) 213 | print('Get question_token_to_idx') 214 | print(len(question_token_to_idx)) 215 | 216 | vocab = { 217 | 'question_token_to_idx': question_token_to_idx, 218 | 'answer_token_to_idx': answer_token_to_idx, 219 | 'question_answer_token_to_idx': {'': 0, '': 1} 220 | } 221 | 222 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type)) 223 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f: 224 | json.dump(vocab, f, indent=4) 225 | 226 | # split 10% of questions for evaluation 227 | split = int(0.9 * len(questions)) 228 | train_questions = questions[:split] 229 | train_answers = answers[:split] 230 | train_video_names = video_names[:split] 231 | train_video_ids = video_ids[:split] 232 | 233 | val_questions = questions[split:] 234 | val_answers = answers[split:] 235 | val_video_names = video_names[split:] 236 | val_video_ids = video_ids[split:] 237 | 238 | openeded_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, mode='train') 239 | openeded_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, mode='val') 240 | else: 241 | print('Loading vocab') 242 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f: 243 | vocab = json.load(f) 244 | openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='test') 245 | 246 | 247 | 248 | 249 | def process_questions_mulchoices(args): 250 | print('Loading dataset') 251 | if args.mode in ["train", "val"]: 252 | csv_data = 
pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t') 253 | else: 254 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t') 255 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))] 256 | questions = list(csv_data['question']) 257 | answers = list(csv_data['answer']) 258 | video_names = list(csv_data['gif_name']) 259 | video_ids = list(csv_data['key']) 260 | ans_candidates = np.asarray( 261 | [csv_data['a1'], csv_data['a2'], csv_data['a3'], csv_data['a4'], csv_data['a5']]) 262 | ans_candidates = ans_candidates.transpose() 263 | print(ans_candidates.shape) 264 | # ans_candidates: (num_ques, 5) 265 | print('number of questions: %s' % len(questions)) 266 | # Either create the vocab or load it from disk 267 | if args.mode in ['train']: 268 | print('Building vocab') 269 | 270 | answer_token_to_idx = {'': 0, '': 1} 271 | question_answer_token_to_idx = {'': 0, '': 1} 272 | for candidates in ans_candidates: 273 | for ans in candidates: 274 | ans = ans.lower() 275 | for token in nltk.word_tokenize(ans): 276 | if token not in answer_token_to_idx: 277 | answer_token_to_idx[token] = len(answer_token_to_idx) 278 | if token not in question_answer_token_to_idx: 279 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx) 280 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 281 | 282 | question_token_to_idx = {'': 0, '': 1} 283 | for i, q in enumerate(questions): 284 | question = q.lower()[:-1] 285 | for token in nltk.word_tokenize(question): 286 | if token not in question_token_to_idx: 287 | question_token_to_idx[token] = len(question_token_to_idx) 288 | if token not in question_answer_token_to_idx: 289 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx) 290 | 291 | print('Get question_token_to_idx') 292 | print(len(question_token_to_idx)) 293 | print('Get question_answer_token_to_idx') 294 | print(len(question_answer_token_to_idx)) 295 | 296 | vocab = { 297 | 'question_token_to_idx': question_token_to_idx, 298 | 'answer_token_to_idx': answer_token_to_idx, 299 | 'question_answer_token_to_idx': question_answer_token_to_idx, 300 | } 301 | 302 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type)) 303 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f: 304 | json.dump(vocab, f, indent=4) 305 | 306 | # split 10% of questions for evaluation 307 | split = int(0.9 * len(questions)) 308 | train_questions = questions[:split] 309 | train_answers = answers[:split] 310 | train_video_names = video_names[:split] 311 | train_video_ids = video_ids[:split] 312 | train_ans_candidates = ans_candidates[:split, :] 313 | 314 | val_questions = questions[split:] 315 | val_answers = answers[split:] 316 | val_video_names = video_names[split:] 317 | val_video_ids = video_ids[split:] 318 | val_ans_candidates = ans_candidates[split:, :] 319 | 320 | multichoice_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, train_ans_candidates, mode='train') 321 | multichoice_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, 322 | val_ans_candidates, mode='val') 323 | else: 324 | print('Loading vocab') 325 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f: 326 | vocab = json.load(f) 327 | multichoice_encoding_data(args, vocab, questions, video_names, video_ids, answers, 328 | ans_candidates, mode='test') 329 | 
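A minimal sketch of how the pickle written by multichoice_encoding_data above might be loaded downstream; the path below is only a placeholder, since the real location is built from args.output_pt, and the keys simply mirror the dict assembled in that function:

import pickle

# placeholder path -- the actual file comes from args.output_pt.format(question_type, question_type, mode)
with open('../data/tgifqa/action/action_test_questions.pt', 'rb') as f:
    obj = pickle.load(f)

print(obj['questions'].shape)        # (num_questions, max_question_length), int32
print(obj['ans_candidates'].shape)   # (num_questions, 5, max_answer_cand_length), int32
print(len(obj['answers']))           # one correct-answer index per question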
--------------------------------------------------------------------------------
/tools/datautils/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import os
4 | import os.path as osp
5 | 
6 | def load_file(file_name):
7 |     annos = None
8 |     with open(file_name, 'r') as fp:
9 |         if osp.splitext(file_name)[1] == '.txt':
10 |             annos = fp.readlines()
11 |             annos = [line.rstrip() for line in annos]
12 |         if osp.splitext(file_name)[1] == '.json':
13 |             annos = json.load(fp)
14 | 
15 |     return annos
16 | 
17 | def save_file(obj, filename):
18 |     """
19 |     save obj to filename
20 |     :param obj:
21 |     :param filename:
22 |     :return:
23 |     """
24 |     filepath = osp.dirname(filename)
25 |     if filepath != '' and not osp.exists(filepath):
26 |         os.makedirs(filepath)
27 |     # write in every case, not only when the directory already exists
28 |     with open(filename, 'w') as fp:
29 |         json.dump(obj, fp)
30 | 
31 | def encode(seq_tokens, token_to_idx, allow_unk=False):
32 |     seq_idx = []
33 |     for token in seq_tokens:
34 |         if token not in token_to_idx:
35 |             if allow_unk:
36 |                 token = '<UNK>'
37 |             else:
38 |                 raise KeyError('Token "%s" not in vocab' % token)
39 |         seq_idx.append(token_to_idx[token])
40 |     return seq_idx
41 | 
42 | 
43 | def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
44 |     tokens = []
45 |     for idx in seq_idx:
46 |         tokens.append(idx_to_token[idx])
47 |         if stop_at_end and tokens[-1] == '<END>':
48 |             break
49 |     if delim is None:
50 |         return tokens
51 |     else:
52 |         return delim.join(tokens)
53 | 
54 | # --------------------------------------------------------
55 | # Fast R-CNN
56 | # Copyright (c) 2015 Microsoft
57 | # Licensed under The MIT License [see LICENSE for details]
58 | # Written by Ross Girshick
59 | # --------------------------------------------------------
60 | 
61 | class Timer(object):
62 |     """A simple timer."""
63 |     def __init__(self):
64 |         self.total_time = 0.
65 |         self.calls = 0
66 |         self.start_time = 0.
67 |         self.diff = 0.
68 |         self.average_time = 0.
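        # tic()/toc() accumulate total_time across calls; toc(average=True)
        # returns the running mean (average_time), toc(False) the last interval.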
69 | 70 | def tic(self): 71 | # using time.time instead of time.clock because time time.clock 72 | # does not normalize for multithreading 73 | self.start_time = time.time() 74 | 75 | def toc(self, average=True): 76 | self.diff = time.time() - self.start_time 77 | self.total_time += self.diff 78 | self.calls += 1 79 | self.average_time = self.total_time / self.calls 80 | if average: 81 | return self.average_time 82 | else: 83 | return self.diff -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import os 3 | import os.path as osp 4 | import numpy as np 5 | from bbox_visualizer import * 6 | import sys 7 | sys.path.insert(0, '../') 8 | from util import load_file, save_to 9 | 10 | bbox_colors = np.loadtxt('colors.txt') 11 | 12 | 13 | def sample_clips(total_frames, num_clips, num_frames_per_clip): 14 | clips = [] 15 | frames = [str(f+1).zfill(6) for f in range(total_frames)] 16 | for i in np.linspace(0, total_frames, num_clips + 2, dtype=np.int32)[1: num_clips + 1]: 17 | clip_start = int(i) - int(num_frames_per_clip / 2) 18 | clip_end = int(i) + int(num_frames_per_clip / 2) 19 | clip_start = 0 if clip_start < 0 else clip_start 20 | clip_end = total_frames if clip_end > total_frames else clip_end 21 | clip = frames[clip_start:clip_end] 22 | if clip_start == 0 and len(clip) < num_frames_per_clip: 23 | shortage = num_frames_per_clip - (clip_end - clip_start) 24 | added_fids = [] 25 | for _ in range(shortage): 26 | added_fids.append(frames[clip_start]) 27 | if len(added_fids) > 0: 28 | clip = added_fids + clip 29 | if clip_end == total_frames and len(clip) < num_frames_per_clip: 30 | shortage = num_frames_per_clip - (clip_end - clip_start) 31 | added_fids = [] 32 | for _ in range(shortage): 33 | added_fids.append(frames[clip_end-1]) 34 | if len(added_fids) > 0: 35 | clip += added_fids 36 | clip = clip[::4] 37 | clips.append(clip) 38 | clips = clips[::2] 39 | return clips 40 | 41 | 42 | def get_vbbox(feat_file, qvid, bbox_num): 43 | with h5py.File(feat_file, 'r') as fp: 44 | vids = fp['ids'] 45 | bboxes = fp['bbox'] 46 | for id, (vid, bbox) in enumerate(zip(vids, bboxes)): 47 | if str(vid) != qvid: continue 48 | vbbox = bbox[:,:,:bbox_num, :] 49 | 50 | return vbbox 51 | 52 | 53 | def vis_det(feat_file, vname): 54 | bbox_num = 5 55 | vid = vname.split('/')[-1] 56 | vbbox = get_vbbox(feat_file, vid, bbox_num) 57 | fids = os.listdir(vname) 58 | total_frames = len(fids) 59 | clips = sample_clips(total_frames, 16, 16) 60 | # clips = np.asarray(clips).reshape(-1) 61 | out_dir = '../demo/' 62 | 63 | for i, cids in enumerate(clips): 64 | for f, fid in enumerate(cids): 65 | img_path = osp.join(vname, fid+'.jpg') 66 | bboxes = vbbox[i][f] 67 | 68 | bboxes = [[int(np.round(b)) for b in bbox] for bbox in bboxes] 69 | # bbox = [int(np.round(b)) for b in bbox] 70 | img = cv2.imread(img_path) 71 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 72 | output = draw_multiple_rectangles(img, bboxes, bbox_colors) 73 | # output = draw_rectangle(img, bbox) 74 | 75 | out_file = osp.join(out_dir, str(vid)) 76 | if not osp.exists(out_file): 77 | os.makedirs(out_file) 78 | img = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) 79 | cv2.imwrite(osp.join(out_file, fid+'.jpg'), img) 80 | # cv2.imshow('image', output) 81 | # cv2.waitKey(0) 82 | 83 | 84 | 85 | def main(): 86 | dataset = 'nextqa' 87 | feat_file = f'../../data/{dataset}/region_feat_n/acregion_8c20b_val.h5' 88 | #the videos are 
decoded by 6 pfs 89 | frame_dir = '/home/jbxiao/workspace/data/nextqa/frames/' 90 | vname = f'{frame_dir}/3376544720' 91 | vis_det(feat_file, vname) 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /tools/extract_video.py: -------------------------------------------------------------------------------- 1 | # ==================================================== 2 | # @Time : 15/4/21 12:38 PM 3 | # @Author : Xiao Junbin 4 | # @Email : junbin@comp.nus.edu.sg 5 | # @File : extract_video.py.py 6 | # ==================================================== 7 | import os 8 | import os.path as osp 9 | import shutil 10 | import subprocess 11 | import pandas as pd 12 | import json 13 | import sys 14 | sys.path.insert(0, '../') 15 | from util import load_file 16 | 17 | # def load_file(filename): 18 | # with open(filename, 'r') as fp: 19 | # data = json.load(fp) 20 | # return data 21 | 22 | def get_video_list(filename, out_file): 23 | data = load_file(filename) 24 | video_ids = list(data['video_id']) 25 | video_ids = list(set(video_ids)) 26 | # video_ids = os.listdir(filename) 27 | # video_ids = sorted(video_ids) 28 | print(len(video_ids)) 29 | with open(out_file, 'w') as fp: 30 | json.dump(video_ids, fp, indent=4) 31 | return video_ids 32 | 33 | 34 | def extract_frame(video, dst): 35 | 36 | with open(os.devnull, 'w') as ffmpeg_log: 37 | if os.path.exists(dst): 38 | # print(" cleanup: "+dst+"/") 39 | shutil.rmtree(dst) 40 | os.makedirs(dst) 41 | video2frame_cmd = [ 42 | "ffmpeg", 43 | '-y', 44 | '-i', video, 45 | '-r', "6", # 6 frames per second 46 | # '-vf', "scale=400:300", 47 | '-qscale:v', "2", 48 | '{0}/%06d.jpg'.format(dst) 49 | ] 50 | subprocess.call(video2frame_cmd, stdout = ffmpeg_log, stderr=ffmpeg_log) 51 | 52 | 53 | def extract_videos(raw_dir, vlist, frame_dir, map_vid=None): 54 | 55 | vnum = len(vlist) 56 | for id, vid in enumerate(vlist): 57 | # if id <= 400: continue 58 | # if id > 400: break 59 | vid = str(vid) 60 | if map_vid != None: 61 | video = osp.join(raw_dir, f'{map_vid[vid]}.mp4') 62 | else: 63 | video = osp.join(raw_dir, f'{vid}.mp4') 64 | dst = osp.join(frame_dir, vid) 65 | if not osp.exists(video): 66 | print(video) 67 | extract_frame(video, dst) 68 | if id % 20 == 0: 69 | print('{}/{}'.format(id, vnum)) 70 | 71 | 72 | def main(): 73 | video_dir = '/storage/jbxiao/workspace/data/nextqa/' 74 | raw_dir = osp.join(video_dir, 'videos/') 75 | frame_dir = osp.join(video_dir, 'frames_val/') 76 | anno_dir = '../datasets/nextqa/' 77 | vlist_file = osp.join(anno_dir, 'vlist.json') 78 | map_file = osp.join(anno_dir, 'map_vid_vidorID.json') 79 | if not osp.exists(vlist_file): 80 | dset = 'val' #train/test 81 | qa_file = osp.join(anno_dir, f'{dset}.csv') 82 | vlist_file = osp.join(anno_dir, f'vlist_{dset}.json') 83 | vlist = get_video_list(qa_file, vlist_file) 84 | else: 85 | vlist = load_file(vlist_file) 86 | map_vid = load_file(map_file) 87 | extract_videos(raw_dir, vlist, frame_dir, map_vid=map_vid) 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /tools/feat_app.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: feat_app.sh 3 | # Author: Xiao Junbin 4 | # mail: xiaojunbin@u.nus.edu 5 | # Created Time: Sat 19 Sep 2020 09:22:26 PM 6 | 
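# Usage (illustrative sketch inferred from the arguments below): pass the GPU id
# as the first argument, e.g. `bash feat_app.sh 0`; the script then invokes
# preprocess_features.py to extract ResNet-101 appearance features for NExT-QA
# with 224x224 inputs on the selected GPU.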
######################################################################### 7 | #!/bin/bash 8 | GPUID=$1 9 | CUDA_VISIBLE_DEVICES=$GPUID python preprocess_features.py \ 10 | --dataset 'nextqa' \ 11 | --model 'resnet101' \ 12 | --image_width 224 \ 13 | --image_height 224 14 | -------------------------------------------------------------------------------- /tools/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/tools/models/__init__.py -------------------------------------------------------------------------------- /tools/models/densenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import math 6 | 7 | __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264'] 8 | 9 | 10 | def densenet121(**kwargs): 11 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), 12 | **kwargs) 13 | return model 14 | 15 | 16 | def densenet169(**kwargs): 17 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), 18 | **kwargs) 19 | return model 20 | 21 | 22 | def densenet201(**kwargs): 23 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), 24 | **kwargs) 25 | return model 26 | 27 | 28 | def densenet264(**kwargs): 29 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 64, 48), 30 | **kwargs) 31 | return model 32 | 33 | 34 | def get_fine_tuning_parameters(model, ft_begin_index): 35 | if ft_begin_index == 0: 36 | return model.parameters() 37 | 38 | ft_module_names = [] 39 | for i in range(ft_begin_index, 5): 40 | ft_module_names.append('denseblock{}'.format(ft_begin_index)) 41 | ft_module_names.append('transition{}'.format(ft_begin_index)) 42 | ft_module_names.append('norm5') 43 | ft_module_names.append('classifier') 44 | 45 | parameters = [] 46 | for k, v in model.named_parameters(): 47 | for ft_module in ft_module_names: 48 | if ft_module in k: 49 | parameters.append({'params': v}) 50 | break 51 | else: 52 | parameters.append({'params': v, 'lr': 0.0}) 53 | 54 | return parameters 55 | 56 | 57 | class _DenseLayer(nn.Sequential): 58 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 59 | super(_DenseLayer, self).__init__() 60 | self.add_module('norm.1', nn.BatchNorm3d(num_input_features)) 61 | self.add_module('relu.1', nn.ReLU(inplace=True)) 62 | self.add_module('conv.1', nn.Conv3d(num_input_features, bn_size * growth_rate, 63 | kernel_size=1, stride=1, bias=False)) 64 | self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate)) 65 | self.add_module('relu.2', nn.ReLU(inplace=True)) 66 | self.add_module('conv.2', nn.Conv3d(bn_size * growth_rate, growth_rate, 67 | kernel_size=3, stride=1, padding=1, bias=False)) 68 | self.drop_rate = drop_rate 69 | 70 | def forward(self, x): 71 | new_features = super(_DenseLayer, self).forward(x) 72 | if self.drop_rate > 0: 73 | new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) 74 | return torch.cat([x, new_features], 1) 75 | 76 | 77 | class _DenseBlock(nn.Sequential): 78 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate): 79 | super(_DenseBlock, self).__init__() 80 | for i in range(num_layers): 81 | layer = _DenseLayer(num_input_features + i * 
growth_rate, growth_rate, bn_size, drop_rate) 82 | self.add_module('denselayer%d' % (i + 1), layer) 83 | 84 | 85 | class _Transition(nn.Sequential): 86 | def __init__(self, num_input_features, num_output_features): 87 | super(_Transition, self).__init__() 88 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 89 | self.add_module('relu', nn.ReLU(inplace=True)) 90 | self.add_module('conv', nn.Conv3d(num_input_features, num_output_features, 91 | kernel_size=1, stride=1, bias=False)) 92 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 93 | 94 | 95 | class DenseNet(nn.Module): 96 | """Densenet-BC model class 97 | Args: 98 | growth_rate (int) - how many filters to add each layer (k in paper) 99 | block_config (list of 4 ints) - how many layers in each pooling block 100 | num_init_features (int) - the number of filters to learn in the first convolution layer 101 | bn_size (int) - multiplicative factor for number of bottle neck layers 102 | (i.e. bn_size * k features in the bottleneck layer) 103 | drop_rate (float) - dropout rate after each dense layer 104 | num_classes (int) - number of classification classes 105 | """ 106 | def __init__(self, sample_size, sample_duration, growth_rate=32, block_config=(6, 12, 24, 16), 107 | num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, last_fc=True): 108 | 109 | super(DenseNet, self).__init__() 110 | 111 | self.last_fc = last_fc 112 | 113 | self.sample_size = sample_size 114 | self.sample_duration = sample_duration 115 | 116 | # First convolution 117 | self.features = nn.Sequential(OrderedDict([ 118 | ('conv0', nn.Conv3d(3, num_init_features, kernel_size=7, 119 | stride=(1, 2, 2), padding=(3, 3, 3), bias=False)), 120 | ('norm0', nn.BatchNorm3d(num_init_features)), 121 | ('relu0', nn.ReLU(inplace=True)), 122 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)), 123 | ])) 124 | 125 | # Each denseblock 126 | num_features = num_init_features 127 | for i, num_layers in enumerate(block_config): 128 | block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, 129 | bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate) 130 | self.features.add_module('denseblock%d' % (i + 1), block) 131 | num_features = num_features + num_layers * growth_rate 132 | if i != len(block_config) - 1: 133 | trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) 134 | self.features.add_module('transition%d' % (i + 1), trans) 135 | num_features = num_features // 2 136 | 137 | # Final batch norm 138 | self.features.add_module('norm5', nn.BatchNorm2d(num_features)) 139 | 140 | # Linear layer 141 | self.classifier = nn.Linear(num_features, num_classes) 142 | 143 | def forward(self, x): 144 | features = self.features(x) 145 | out = F.relu(features, inplace=True) 146 | last_duration = math.ceil(self.sample_duration / 16) 147 | last_size = math.floor(self.sample_size / 32) 148 | out = F.avg_pool3d(out, kernel_size=(last_duration, last_size, last_size)).view(features.size(0), -1) 149 | if self.last_fc: 150 | out = self.classifier(out) 151 | return out 152 | -------------------------------------------------------------------------------- /tools/models/pre_act_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['PreActivationResNet', 'resnet18', 'resnet34', 
'resnet50', 'resnet101', 'resnet152', 'resnet200'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class PreActivationBasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(PreActivationBasicBlock, self).__init__() 35 | self.bn1 = nn.BatchNorm3d(inplanes) 36 | self.conv1 = conv3x3x3(inplanes, planes, stride) 37 | self.bn2 = nn.BatchNorm3d(planes) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.relu = nn.ReLU(inplace=True) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.bn1(x) 47 | out = self.relu(out) 48 | out = self.conv1(out) 49 | 50 | out = self.bn2(out) 51 | out = self.relu(out) 52 | out = self.conv2(out) 53 | 54 | if self.downsample is not None: 55 | residual = self.downsample(x) 56 | 57 | out += residual 58 | 59 | return out 60 | 61 | 62 | class PreActivationBottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, inplanes, planes, stride=1, downsample=None): 66 | super(PreActivationBottleneck, self).__init__() 67 | self.bn1 = nn.BatchNorm3d(inplanes) 68 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 69 | self.bn2 = nn.BatchNorm3d(planes) 70 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 71 | padding=1, bias=False) 72 | self.bn3 = nn.BatchNorm3d(planes) 73 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.bn1(x) 82 | out = self.relu(out) 83 | out = self.conv1(out) 84 | 85 | out = self.bn2(out) 86 | out = self.relu(out) 87 | out = self.conv2(out) 88 | 89 | out = self.bn3(out) 90 | out = self.relu(out) 91 | out = self.conv3(out) 92 | 93 | if self.downsample is not None: 94 | residual = self.downsample(x) 95 | 96 | out += residual 97 | 98 | return out 99 | 100 | 101 | class PreActivationResNet(nn.Module): 102 | 103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True): 104 | self.last_fc = last_fc 105 | 106 | self.inplanes = 64 107 | super(PreActivationResNet, self).__init__() 108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 109 | padding=(3, 3, 3), bias=False) 110 | self.bn1 = nn.BatchNorm3d(64) 111 | self.relu = nn.ReLU(inplace=True) 112 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2) 115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2) 116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2) 117 | last_duration = math.ceil(sample_duration / 16) 118 | last_size = 
math.ceil(sample_size / 32) 119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 120 | self.fc = nn.Linear(512 * block.expansion, num_classes) 121 | 122 | for m in self.modules(): 123 | if isinstance(m, nn.Conv3d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, math.sqrt(2. / n)) 126 | elif isinstance(m, nn.BatchNorm3d): 127 | m.weight.data.fill_(1) 128 | m.bias.data.zero_() 129 | 130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 131 | downsample = None 132 | if stride != 1 or self.inplanes != planes * block.expansion: 133 | if shortcut_type == 'A': 134 | downsample = partial(downsample_basic_block, 135 | planes=planes * block.expansion, 136 | stride=stride) 137 | else: 138 | downsample = nn.Sequential( 139 | nn.Conv3d(self.inplanes, planes * block.expansion, 140 | kernel_size=1, stride=stride, bias=False), 141 | nn.BatchNorm3d(planes * block.expansion) 142 | ) 143 | 144 | layers = [] 145 | layers.append(block(self.inplanes, planes, stride, downsample)) 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = self.conv1(x) 154 | x = self.bn1(x) 155 | x = self.relu(x) 156 | x = self.maxpool(x) 157 | 158 | x = self.layer1(x) 159 | x = self.layer2(x) 160 | x = self.layer3(x) 161 | x = self.layer4(x) 162 | 163 | x = self.avgpool(x) 164 | 165 | x = x.view(x.size(0), -1) 166 | if self.last_fc: 167 | x = self.fc(x) 168 | 169 | return x 170 | 171 | def get_fine_tuning_parameters(model, ft_begin_index): 172 | if ft_begin_index == 0: 173 | return model.parameters() 174 | 175 | ft_module_names = [] 176 | for i in range(ft_begin_index, 5): 177 | ft_module_names.append('layer{}'.format(ft_begin_index)) 178 | ft_module_names.append('fc') 179 | 180 | parameters = [] 181 | for k, v in model.named_parameters(): 182 | for ft_module in ft_module_names: 183 | if ft_module in k: 184 | parameters.append({'params': v}) 185 | break 186 | else: 187 | parameters.append({'params': v, 'lr': 0.0}) 188 | 189 | return parameters 190 | 191 | def resnet18(**kwargs): 192 | """Constructs a ResNet-18 model. 193 | """ 194 | model = PreActivationResNet(PreActivationBasicBlock, [2, 2, 2, 2], **kwargs) 195 | return model 196 | 197 | def resnet34(**kwargs): 198 | """Constructs a ResNet-34 model. 199 | """ 200 | model = PreActivationResNet(PreActivationBasicBlock, [3, 4, 6, 3], **kwargs) 201 | return model 202 | 203 | 204 | def resnet50(**kwargs): 205 | """Constructs a ResNet-50 model. 206 | """ 207 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 6, 3], **kwargs) 208 | return model 209 | 210 | def resnet101(**kwargs): 211 | """Constructs a ResNet-101 model. 212 | """ 213 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 23, 3], **kwargs) 214 | return model 215 | 216 | def resnet152(**kwargs): 217 | """Constructs a ResNet-101 model. 218 | """ 219 | model = PreActivationResNet(PreActivationBottleneck, [3, 8, 36, 3], **kwargs) 220 | return model 221 | 222 | def resnet200(**kwargs): 223 | """Constructs a ResNet-101 model. 
224 | """ 225 | model = PreActivationResNet(PreActivationBottleneck, [3, 24, 36, 3], **kwargs) 226 | return model 227 | -------------------------------------------------------------------------------- /tools/models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(BasicBlock, self).__init__() 35 | self.conv1 = conv3x3x3(inplanes, planes, stride) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | 56 | out += residual 57 | out = self.relu(out) 58 | 59 | return out 60 | 61 | 62 | class Bottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, inplanes, planes, stride=1, downsample=None): 66 | super(Bottleneck, self).__init__() 67 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 68 | self.bn1 = nn.BatchNorm3d(planes) 69 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 70 | padding=1, bias=False) 71 | self.bn2 = nn.BatchNorm3d(planes) 72 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 73 | self.bn3 = nn.BatchNorm3d(planes * 4) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.conv1(x) 82 | out = self.bn1(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv2(out) 86 | out = self.bn2(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv3(out) 90 | out = self.bn3(out) 91 | 92 | if self.downsample is not None: 93 | residual = self.downsample(x) 94 | 95 | out += residual 96 | out = self.relu(out) 97 | 98 | return out 99 | 100 | 101 | class ResNet(nn.Module): 102 | 103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True): 104 | self.last_fc = last_fc 105 | 106 | self.inplanes = 64 107 | super(ResNet, self).__init__() 108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 109 | padding=(3, 3, 3), bias=False) 110 | self.bn1 = nn.BatchNorm3d(64) 111 | self.relu = nn.ReLU(inplace=True) 112 | self.maxpool = 
nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2) 115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2) 116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2) 117 | last_duration = math.ceil(sample_duration / 16) 118 | last_size = math.ceil(sample_size / 32) 119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 120 | self.fc = nn.Linear(512 * block.expansion, num_classes) 121 | 122 | for m in self.modules(): 123 | if isinstance(m, nn.Conv3d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, math.sqrt(2. / n)) 126 | elif isinstance(m, nn.BatchNorm3d): 127 | m.weight.data.fill_(1) 128 | m.bias.data.zero_() 129 | 130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 131 | downsample = None 132 | if stride != 1 or self.inplanes != planes * block.expansion: 133 | if shortcut_type == 'A': 134 | downsample = partial(downsample_basic_block, 135 | planes=planes * block.expansion, 136 | stride=stride) 137 | else: 138 | downsample = nn.Sequential( 139 | nn.Conv3d(self.inplanes, planes * block.expansion, 140 | kernel_size=1, stride=stride, bias=False), 141 | nn.BatchNorm3d(planes * block.expansion) 142 | ) 143 | 144 | layers = [] 145 | layers.append(block(self.inplanes, planes, stride, downsample)) 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = self.conv1(x) 154 | x = self.bn1(x) 155 | x = self.relu(x) 156 | x = self.maxpool(x) 157 | 158 | x = self.layer1(x) 159 | x = self.layer2(x) 160 | x = self.layer3(x) 161 | x = self.layer4(x) 162 | 163 | 164 | x = self.avgpool(x) 165 | 166 | x = x.view(x.size(0), -1) 167 | if self.last_fc: 168 | x = self.fc(x) 169 | 170 | return x 171 | 172 | 173 | def get_fine_tuning_parameters(model, ft_begin_index): 174 | if ft_begin_index == 0: 175 | return model.parameters() 176 | 177 | ft_module_names = [] 178 | for i in range(ft_begin_index, 5): 179 | ft_module_names.append('layer{}'.format(ft_begin_index)) 180 | ft_module_names.append('fc') 181 | 182 | parameters = [] 183 | for k, v in model.named_parameters(): 184 | for ft_module in ft_module_names: 185 | if ft_module in k: 186 | parameters.append({'params': v}) 187 | break 188 | else: 189 | parameters.append({'params': v, 'lr': 0.0}) 190 | 191 | return parameters 192 | 193 | 194 | def resnet10(**kwargs): 195 | """Constructs a ResNet-18 model. 196 | """ 197 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 198 | return model 199 | 200 | def resnet18(**kwargs): 201 | """Constructs a ResNet-18 model. 202 | """ 203 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 204 | return model 205 | 206 | def resnet34(**kwargs): 207 | """Constructs a ResNet-34 model. 208 | """ 209 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 210 | return model 211 | 212 | def resnet50(**kwargs): 213 | """Constructs a ResNet-50 model. 214 | """ 215 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 216 | return model 217 | 218 | def resnet101(**kwargs): 219 | """Constructs a ResNet-101 model. 
220 | """ 221 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 222 | return model 223 | 224 | def resnet152(**kwargs): 225 | """Constructs a ResNet-101 model. 226 | """ 227 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 228 | return model 229 | 230 | def resnet200(**kwargs): 231 | """Constructs a ResNet-101 model. 232 | """ 233 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 234 | return model 235 | -------------------------------------------------------------------------------- /tools/models/resnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNeXt', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class ResNeXtBottleneck(nn.Module): 31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None): 34 | super(ResNeXtBottleneck, self).__init__() 35 | mid_planes = cardinality * int(planes / 32) 36 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) 37 | self.bn1 = nn.BatchNorm3d(mid_planes) 38 | self.conv2 = nn.Conv3d(mid_planes, mid_planes, kernel_size=3, stride=stride, 39 | padding=1, groups=cardinality, bias=False) 40 | self.bn2 = nn.BatchNorm3d(mid_planes) 41 | self.conv3 = nn.Conv3d(mid_planes, planes * self.expansion, kernel_size=1, bias=False) 42 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 43 | self.relu = nn.ReLU(inplace=True) 44 | self.downsample = downsample 45 | self.stride = stride 46 | 47 | def forward(self, x): 48 | residual = x 49 | 50 | out = self.conv1(x) 51 | out = self.bn1(out) 52 | out = self.relu(out) 53 | 54 | out = self.conv2(out) 55 | out = self.bn2(out) 56 | out = self.relu(out) 57 | 58 | out = self.conv3(out) 59 | out = self.bn3(out) 60 | 61 | if self.downsample is not None: 62 | residual = self.downsample(x) 63 | 64 | out += residual 65 | out = self.relu(out) 66 | 67 | return out 68 | 69 | 70 | class ResNeXt(nn.Module): 71 | 72 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', cardinality=32, num_classes=400, last_fc=True): 73 | self.last_fc = last_fc 74 | 75 | self.inplanes = 64 76 | super(ResNeXt, self).__init__() 77 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 78 | padding=(3, 3, 3), bias=False) 79 | self.bn1 = nn.BatchNorm3d(64) 80 | self.relu = nn.ReLU(inplace=True) 81 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 82 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, cardinality) 83 | self.layer2 = self._make_layer(block, 256, layers[1], shortcut_type, cardinality, stride=2) 84 | self.layer3 = self._make_layer(block, 512, layers[2], shortcut_type, cardinality, stride=2) 85 | self.layer4 = self._make_layer(block, 1024, layers[3], shortcut_type, 
cardinality, stride=2) 86 | last_duration = math.ceil(sample_duration / 16) 87 | last_size = math.ceil(sample_size / 32) 88 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 89 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes) 90 | 91 | for m in self.modules(): 92 | if isinstance(m, nn.Conv3d): 93 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 94 | m.weight.data.normal_(0, math.sqrt(2. / n)) 95 | elif isinstance(m, nn.BatchNorm3d): 96 | m.weight.data.fill_(1) 97 | m.bias.data.zero_() 98 | 99 | def _make_layer(self, block, planes, blocks, shortcut_type, cardinality, stride=1): 100 | downsample = None 101 | if stride != 1 or self.inplanes != planes * block.expansion: 102 | if shortcut_type == 'A': 103 | downsample = partial(downsample_basic_block, 104 | planes=planes * block.expansion, 105 | stride=stride) 106 | else: 107 | downsample = nn.Sequential( 108 | nn.Conv3d(self.inplanes, planes * block.expansion, 109 | kernel_size=1, stride=stride, bias=False), 110 | nn.BatchNorm3d(planes * block.expansion) 111 | ) 112 | 113 | layers = [] 114 | layers.append(block(self.inplanes, planes, cardinality, stride, downsample)) 115 | self.inplanes = planes * block.expansion 116 | for i in range(1, blocks): 117 | layers.append(block(self.inplanes, planes, cardinality)) 118 | 119 | return nn.Sequential(*layers) 120 | 121 | def forward(self, x): 122 | x = self.conv1(x) 123 | x = self.bn1(x) 124 | x = self.relu(x) 125 | x = self.maxpool(x) 126 | 127 | x = self.layer1(x) 128 | x = self.layer2(x) 129 | x = self.layer3(x) 130 | x = self.layer4(x) 131 | 132 | 133 | x = self.avgpool(x) 134 | x = x.view(x.size(0), -1) 135 | if self.last_fc: 136 | x = self.fc(x) 137 | 138 | return x 139 | 140 | def get_fine_tuning_parameters(model, ft_begin_index): 141 | if ft_begin_index == 0: 142 | return model.parameters() 143 | 144 | ft_module_names = [] 145 | for i in range(ft_begin_index, 5): 146 | ft_module_names.append('layer{}'.format(ft_begin_index)) 147 | ft_module_names.append('fc') 148 | 149 | parameters = [] 150 | for k, v in model.named_parameters(): 151 | for ft_module in ft_module_names: 152 | if ft_module in k: 153 | parameters.append({'params': v}) 154 | break 155 | else: 156 | parameters.append({'params': v, 'lr': 0.0}) 157 | 158 | return parameters 159 | 160 | def resnet50(**kwargs): 161 | """Constructs a ResNet-50 model. 162 | """ 163 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) 164 | return model 165 | 166 | def resnet101(**kwargs): 167 | """Constructs a ResNet-101 model. 168 | """ 169 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) 170 | return model 171 | 172 | def resnet152(**kwargs): 173 | """Constructs a ResNet-101 model. 
174 | """ 175 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) 176 | return model 177 | -------------------------------------------------------------------------------- /tools/models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['WideResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class WideBottleneck(nn.Module): 31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(WideBottleneck, self).__init__() 35 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 38 | padding=1, bias=False) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.conv3 = nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False) 41 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 42 | self.relu = nn.ReLU(inplace=True) 43 | self.downsample = downsample 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | residual = x 48 | 49 | out = self.conv1(x) 50 | out = self.bn1(out) 51 | out = self.relu(out) 52 | 53 | out = self.conv2(out) 54 | out = self.bn2(out) 55 | out = self.relu(out) 56 | 57 | out = self.conv3(out) 58 | out = self.bn3(out) 59 | 60 | if self.downsample is not None: 61 | residual = self.downsample(x) 62 | 63 | out += residual 64 | out = self.relu(out) 65 | 66 | return out 67 | 68 | 69 | class WideResNet(nn.Module): 70 | 71 | def __init__(self, block, layers, sample_size, sample_duration, k=1, shortcut_type='B', num_classes=400, last_fc=True): 72 | self.last_fc = last_fc 73 | 74 | self.inplanes = 64 75 | super(WideResNet, self).__init__() 76 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 77 | padding=(3, 3, 3), bias=False) 78 | self.bn1 = nn.BatchNorm3d(64) 79 | self.relu = nn.ReLU(inplace=True) 80 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 81 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type) 82 | self.layer2 = self._make_layer(block, 128 * k, layers[1], shortcut_type, stride=2) 83 | self.layer3 = self._make_layer(block, 256 * k, layers[2], shortcut_type, stride=2) 84 | self.layer4 = self._make_layer(block, 512 * k, layers[3], shortcut_type, stride=2) 85 | last_duration = math.ceil(sample_duration / 16) 86 | last_size = math.ceil(sample_size / 32) 87 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 88 | self.fc = nn.Linear(512 * k * block.expansion, num_classes) 89 | 90 | for m in self.modules(): 91 | if isinstance(m, nn.Conv3d): 92 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 93 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 94 | elif isinstance(m, nn.BatchNorm3d): 95 | m.weight.data.fill_(1) 96 | m.bias.data.zero_() 97 | 98 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 99 | downsample = None 100 | if stride != 1 or self.inplanes != planes * block.expansion: 101 | if shortcut_type == 'A': 102 | downsample = partial(downsample_basic_block, 103 | planes=planes * block.expansion, 104 | stride=stride) 105 | else: 106 | downsample = nn.Sequential( 107 | nn.Conv3d(self.inplanes, planes * block.expansion, 108 | kernel_size=1, stride=stride, bias=False), 109 | nn.BatchNorm3d(planes * block.expansion) 110 | ) 111 | 112 | layers = [] 113 | layers.append(block(self.inplanes, planes, stride, downsample)) 114 | self.inplanes = planes * block.expansion 115 | for i in range(1, blocks): 116 | layers.append(block(self.inplanes, planes)) 117 | 118 | return nn.Sequential(*layers) 119 | 120 | def forward(self, x): 121 | x = self.conv1(x) 122 | x = self.bn1(x) 123 | x = self.relu(x) 124 | x = self.maxpool(x) 125 | 126 | x = self.layer1(x) 127 | x = self.layer2(x) 128 | x = self.layer3(x) 129 | x = self.layer4(x) 130 | 131 | x = self.avgpool(x) 132 | 133 | x = x.view(x.size(0), -1) 134 | if self.last_fc: 135 | x = self.fc(x) 136 | 137 | return x 138 | 139 | def get_fine_tuning_parameters(model, ft_begin_index): 140 | if ft_begin_index == 0: 141 | return model.parameters() 142 | 143 | ft_module_names = [] 144 | for i in range(ft_begin_index, 5): 145 | ft_module_names.append('layer{}'.format(ft_begin_index)) 146 | ft_module_names.append('fc') 147 | 148 | parameters = [] 149 | for k, v in model.named_parameters(): 150 | for ft_module in ft_module_names: 151 | if ft_module in k: 152 | parameters.append({'params': v}) 153 | break 154 | else: 155 | parameters.append({'params': v, 'lr': 0.0}) 156 | 157 | return parameters 158 | 159 | def resnet50(**kwargs): 160 | """Constructs a ResNet-50 model. 
161 | """ 162 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs) 163 | return model 164 | -------------------------------------------------------------------------------- /tools/object_align.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '../') 3 | import h5py 4 | import os.path as osp 5 | import numpy as np 6 | from sklearn.metrics.pairwise import pairwise_distances 7 | from sklearn.preprocessing import normalize 8 | import sys 9 | sys.path.insert(0, '../') 10 | from util import load_file, save_to 11 | import os 12 | 13 | def align_object(video_feature_path, mode): 14 | bbox_feat_file = osp.join(video_feature_path, 'region_8c10b_{}.h5'.format(mode)) 15 | print('Load {}...'.format(bbox_feat_file)) 16 | out_file = osp.join(bbox_feat_file+'.h5') 17 | fout = h5py.File(out_file, 'w') 18 | string_dt = h5py.special_dtype(vlen=str) 19 | with h5py.File(bbox_feat_file, 'r') as fp: 20 | vids = fp['ids'] 21 | feats = fp['feat'] 22 | bboxes = fp['bbox'] 23 | fout.create_dataset('ids', shape=vids.shape, dtype=string_dt, data=vids) 24 | 25 | feat_alns, bbox_alns = [], [] 26 | for id, (vid, feat, bbox) in enumerate(zip(vids, feats, bboxes)): 27 | 28 | cnum, fnum, rnum, _ = feat.shape 29 | cur_feat_aln, cur_bbox_aln = [], [] 30 | for cid, (cur_feat, cur_bbox) in enumerate(zip(feat, bbox)): 31 | vid_feat_aln, vid_bbox_aln = align(cur_feat, cur_bbox, vid, cid) 32 | cur_feat_aln.append(vid_feat_aln) 33 | cur_bbox_aln.append(vid_bbox_aln) 34 | 35 | feat_alns.append(cur_feat_aln) 36 | bbox_alns.append(cur_bbox_aln) 37 | if id % 100 == 0: 38 | print(f'{id}/{len(vids)}') 39 | 40 | feat_alns = np.asarray(feat_alns) 41 | bbox_alns = np.asarray(bbox_alns) 42 | print(feat_alns.shape, bbox_alns.shape) 43 | 44 | fout.create_dataset('feat', shape=feat_alns.shape, dtype=np.float32, data=feat_alns) 45 | fout.create_dataset('bbox', shape=bbox_alns.shape, dtype=np.float32, data=bbox_alns) 46 | 47 | 48 | def align_object_byv(video_feature_path, vlist_file): 49 | vlist = load_file(vlist_file) 50 | indir = osp.join(video_feature_path, 'bbox_feat') 51 | outdir = osp.join(video_feature_path, 'bbox_feat_aln') 52 | vnum = len(vlist) 53 | print(vnum) 54 | for idx, vid in enumerate(vlist): 55 | if idx <= 8000: continue 56 | if idx > 10000: break 57 | outfile = osp.join(outdir, vid+'.npz') 58 | if osp.exists(outfile): 59 | continue 60 | infile = osp.join(indir, vid+'.npz') 61 | region_feat = np.load(infile) 62 | 63 | roi_feat, roi_bbox = align_feat_bbox(region_feat['feat'][:8], region_feat['bbox'][:8], vid) 64 | out_dir = osp.dirname(outfile) 65 | if not osp.exists(out_dir): 66 | os.makedirs(out_dir) 67 | np.savez_compressed(outfile, feat=roi_feat, bbox=roi_bbox) 68 | if idx % 100 == 0: 69 | print(f'{idx}/{vnum}', outfile) 70 | print(roi_feat.shape, roi_bbox.shape) 71 | 72 | 73 | def align_feat_bbox(feat, bbox, vid): 74 | cur_feat_aln, cur_bbox_aln = [], [] 75 | for cid, (cur_feat, cur_bbox) in enumerate(zip(feat, bbox)): 76 | vid_feat_aln, vid_bbox_aln = align(cur_feat, cur_bbox, vid, cid) 77 | cur_feat_aln.append(vid_feat_aln) 78 | cur_bbox_aln.append(vid_bbox_aln) 79 | return np.asarray(cur_feat_aln), np.asarray(cur_bbox_aln) 80 | 81 | 82 | def align(feats, bboxes, vid, cid): 83 | new_feats, new_bboxes = [], [] 84 | paths = get_tracks(feats, bboxes, vid, cid) 85 | for i in range(len(paths)): 86 | obj_feat, obj_pos = [], [] 87 | for fid in range(len(feats)): 88 | feat = feats[fid][paths[i][fid]] 89 | bbox = bboxes[fid][paths[i][fid]] 90 | 
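            # paths[i][fid] is the region index that the greedy tracker
            # (get_tracks / get_link below) linked to object track i in frame fid;
            # candidate links are scored by appearance cosine similarity plus box
            # IoU with equal weight, so feat/bbox here follow one object through the clip.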
obj_feat.append(feat) 91 | obj_pos.append(bbox) 92 | new_feats.append(obj_feat) 93 | new_bboxes.append(obj_pos) 94 | new_feats = np.asarray(new_feats).transpose(1, 0, 2) 95 | new_bboxes = np.asarray(new_bboxes).transpose(1, 0, 2) 96 | return new_feats, new_bboxes 97 | 98 | 99 | def get_tracks(feats, bboxes, vid, cid): 100 | links = get_link(feats, bboxes) 101 | paths = [] 102 | for i in range(bboxes.shape[1]): 103 | max_path = find_max_path_greedy(links, i) 104 | links = update_links(links, max_path) 105 | max_path = [i] + max_path 106 | paths.append(max_path) 107 | # vis_path(vid, cid, bboxes, max_path) 108 | # break 109 | return paths 110 | 111 | 112 | def get_link(feats, bboxes): 113 | fnum = feats.shape[0] 114 | link_cretiria = [] 115 | for fid in range(fnum-1): 116 | feat_p, feat_n = feats[fid], feats[fid+1] 117 | sim_f = pairwise_distances(feat_p, feat_n, 'cosine', n_jobs=1) 118 | sim_f = 1-sim_f 119 | box_p, box_n = bboxes[fid], bboxes[fid+1] 120 | areas_p = np.array([get_area(bbox) for bbox in box_p]) 121 | areas_n = np.array([get_area(bbox) for bbox in box_n]) 122 | op_box = [] 123 | for bid, bbox in enumerate(box_p): 124 | area_p = areas_p[bid] 125 | x1 = np.maximum(bbox[0], box_n[:, 0]) 126 | y1 = np.maximum(bbox[1], box_n[:, 1]) 127 | x2 = np.minimum(bbox[2], box_n[:, 2]) 128 | y2 = np.minimum(bbox[3], box_n[:, 3]) 129 | W = np.maximum(0, x2 - x1 + 1) 130 | H = np.maximum(0, y2 - y1 + 1) 131 | ov_area = W * H 132 | IoUs = ov_area / (area_p + areas_n - ov_area) 133 | op_box.append(IoUs) 134 | scores = np.asarray(op_box) + sim_f #equal importance 135 | link_cretiria.append(scores) 136 | return np.asarray(link_cretiria) 137 | 138 | 139 | def update_links(links, max_path): 140 | """ 141 | remove the nodes at the max_path 142 | """ 143 | for i, v in enumerate(max_path): 144 | links[i][v] = 0 145 | return links 146 | 147 | 148 | def find_max_path_greedy(link_scores, sid): 149 | path = [] 150 | for i in range(link_scores.shape[0]): 151 | sid = np.argmax(link_scores[i][sid]) 152 | path.append(sid) 153 | return path 154 | 155 | 156 | def get_area(bbox): 157 | area = (bbox[2]-bbox[0]+1)*(bbox[3]-bbox[1]+1) 158 | return area 159 | 160 | 161 | def main(): 162 | video_feature_path = f'../../data/feats/nextqa/region_feat_n/' 163 | align_object(video_feature_path, 'test') 164 | # dataset_dir = '../../data/datasets/nextqa/test.csv' 165 | # vlist_file = dataset_dir + 'vlist.json' 166 | # if osp.exists(vlist_file): 167 | # vlist = load_file(vlist_file) 168 | # else: 169 | # data = load_file(dataset_dir) 170 | # vlist = list(set(list(data['video_id']))) 171 | # save_to(vlist_file, vlist) 172 | # align_object_byv(video_feature_path, vlist_file) 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /tools/preprocess_features.py: -------------------------------------------------------------------------------- 1 | import argparse, os 2 | import h5py 3 | from scipy.misc import imresize 4 | import skvideo.io as sio 5 | from PIL import Image 6 | import cv2 7 | import json 8 | import torch 9 | from torch import nn 10 | import torchvision 11 | import random 12 | import numpy as np 13 | import shutil 14 | import subprocess 15 | from models import resnext 16 | from datautils import utils 17 | from datautils import tgif_qa 18 | from datautils import msrvtt_qa 19 | from datautils import msvd_qa 20 | import os.path as osp 21 | import sys 22 | sys.path.insert(0, '../') 23 | import time 24 | from util import load_file, 
save_to 25 | 26 | 27 | def build_resnet(): 28 | if not hasattr(torchvision.models, args.model): 29 | raise ValueError('Invalid model "%s"' % args.model) 30 | if not 'resnet' in args.model: 31 | raise ValueError('Feature extraction only supports ResNets') 32 | cnn = getattr(torchvision.models, args.model)(pretrained=True) 33 | model = torch.nn.Sequential(*list(cnn.children())[:-1]) 34 | 35 | model.cuda() 36 | model.eval() 37 | return model 38 | 39 | 40 | def build_resnext(): 41 | model = resnext.resnet101(num_classes=400, shortcut_type='B', cardinality=32, 42 | sample_size=112, sample_duration=16, 43 | last_fc=False) 44 | model = model.cuda() 45 | model = nn.DataParallel(model, device_ids=None) 46 | assert os.path.exists('../../data/pretrained/resnext-101-kinetics.pth') 47 | # download from https://drive.google.com/drive/folders/1zvl89AgFAApbH0At-gMuZSeQB_LpNP-M 48 | model_data = torch.load('../../data/pretrained/resnext-101-kinetics.pth', map_location='cpu') 49 | model.load_state_dict(model_data['state_dict']) 50 | model.eval() 51 | return model 52 | 53 | def extract_frame(video, dst): 54 | with open(os.devnull, 'w') as ffmpeg_log: 55 | if os.path.exists(dst): 56 | # print(" cleanup: "+dst+"/") 57 | shutil.rmtree(dst) 58 | os.makedirs(dst) 59 | video2frame_cmd = [ 60 | "ffmpeg", 61 | '-y', 62 | '-i', video, 63 | '-r', "10", 64 | # '-vf', "scale=400:300", 65 | '-vsync', '0', 66 | '-qscale:v', "2", 67 | '{0}/%06d.jpg'.format(dst) 68 | ] 69 | subprocess.call(video2frame_cmd, stdout = ffmpeg_log, stderr=ffmpeg_log) 70 | 71 | 72 | def run_batch(cur_batch, model): 73 | """ 74 | Args: 75 | cur_batch: treat a video as a batch of images 76 | model: ResNet model for feature extraction 77 | Returns: 78 | ResNet extracted feature. 79 | """ 80 | mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1) 81 | std = np.array([0.229, 0.224, 0.224]).reshape(1, 3, 1, 1) 82 | 83 | image_batch = np.concatenate(cur_batch, 0).astype(np.float32) 84 | image_batch = (image_batch / 255.0 - mean) / std 85 | image_batch = torch.FloatTensor(image_batch).cuda() 86 | with torch.no_grad(): 87 | image_batch = torch.autograd.Variable(image_batch) 88 | 89 | feats = model(image_batch) 90 | feats = feats.data.cpu().clone().numpy() 91 | 92 | return feats 93 | 94 | 95 | def extract_clips_with_consecutive_frames(path, num_clips, num_frames_per_clip): 96 | """ 97 | Args: 98 | path: path of a video 99 | num_clips: expected numbers of splitted clips 100 | num_frames_per_clip: number of frames in a single clip, pretrained model only supports 16 frames 101 | Returns: 102 | A list of raw features of clips. 
103 | """ 104 | 105 | clips = list() 106 | t1 = time.time() 107 | frame_list = sorted(os.listdir(path)) 108 | video_data = [np.asarray(Image.open(osp.join(path, img))) for img in frame_list] 109 | 110 | valid = True 111 | video_data = np.asarray(video_data) 112 | t2 = time.time() 113 | print(t2-t1) 114 | 115 | total_frames = video_data.shape[0] 116 | img_size = (args.image_height, args.image_width) 117 | for i in np.linspace(0, total_frames, num_clips + 2, dtype=np.int32)[1:num_clips + 1]: 118 | clip_start = int(i) - int(num_frames_per_clip / 2) 119 | clip_end = int(i) + int(num_frames_per_clip / 2) 120 | if clip_start < 0: 121 | clip_start = 0 122 | if clip_end > total_frames: 123 | clip_end = total_frames - 1 124 | clip = video_data[clip_start:clip_end] 125 | 126 | if clip_start == 0: 127 | shortage = num_frames_per_clip - (clip_end - clip_start) 128 | added_frames = [] 129 | for _ in range(shortage): 130 | added_frames.append(np.expand_dims(video_data[clip_start], axis=0)) 131 | if len(added_frames) > 0: 132 | added_frames = np.concatenate(added_frames, axis=0) 133 | clip = np.concatenate((added_frames, clip), axis=0) 134 | if clip_end == (total_frames - 1): 135 | shortage = num_frames_per_clip - (clip_end - clip_start) 136 | added_frames = [] 137 | for _ in range(shortage): 138 | added_frames.append(np.expand_dims(video_data[clip_end], axis=0)) 139 | if len(added_frames) > 0: 140 | added_frames = np.concatenate(added_frames, axis=0) 141 | clip = np.concatenate((clip, added_frames), axis=0) 142 | 143 | 144 | # new_clip = clip #.transpose(0, 3, 1, 2)[None] 145 | # if clip.shape[0] < num_frames_per_clip: 146 | clip = clip[::4] #sample 4 frames per clip 147 | new_clip = [] 148 | # for j in range(num_frames_per_clip): 149 | # if j >= len(clip): 150 | # new_clip.append(new_clip[-1]) 151 | # else: 152 | # new_clip.append(clip[j]) 153 | for frame_data in clip: 154 | # frame_data = clip[j] 155 | img = Image.fromarray(frame_data) 156 | img = imresize(img, img_size, interp='bicubic') 157 | frame_data = np.array(img) 158 | frame_data = frame_data.transpose(2, 0, 1)[None] 159 | new_clip.append(frame_data) 160 | new_clip = np.asarray(new_clip) # (num_frames, width, height, channels) 161 | # print(new_clip.shape) 162 | if args.model in ['resnext101']: 163 | new_clip = np.squeeze(new_clip) 164 | new_clip = np.transpose(new_clip, axes=(1, 0, 2, 3)) 165 | clips.append(new_clip) 166 | 167 | clips = clips[::4] # sample 8 clips per video 168 | t3 = time.time() 169 | 170 | return clips, valid 171 | 172 | def extract_clip_frames(vpath, clips): 173 | """ 174 | Args: 175 | path: path of a video 176 | num_clips: expected numbers of splitted clips 177 | num_frames_per_clip: number of frames in a single clip, pretrained model only supports 16 frames 178 | Returns: 179 | A list of raw features of clips. 
180 | """ 181 | # para_dict = {'r':'10', 'vsync':'0', 'qscale:v':'2'} 182 | # print(vpath) 183 | # rate = 10 184 | # meta = skvideo.io.ffprobe(vpath) 185 | # fp = meta['video']['@avg_frame_rate'] 186 | # tstamp = int(fp.split('/')[0])//rate 187 | try: 188 | video_data = sio.vread(vpath) #ffmpeg as backend 189 | except: 190 | return None 191 | # video_data = video_data[::tstamp] 192 | total_frames, width, height, channel = video_data.shape 193 | # print(video_data.shape) 194 | img_size = (224, 224) #(args.image_height, args.image_width) 195 | img_clip = [] 196 | num_clip = 8 197 | clips = clips[:8] 198 | for i, cids in enumerate(clips): 199 | # if i > 7: break 200 | fids = [int(r) for r in cids] 201 | # print(fids, video_data.shape) 202 | if fids[-1] >= total_frames: 203 | fids[-1] = total_frames -1 204 | clip = video_data[fids] 205 | new_clip = [] 206 | for j in range(4): 207 | frame_data = clip[j] 208 | img = Image.fromarray(frame_data) 209 | img = imresize(img, img_size, interp='bicubic') 210 | img = img.transpose(2, 0, 1)[None] 211 | frame_data = np.array(img) 212 | new_clip.append(frame_data) 213 | # new_clip = np.asarray(new_clip) # (num_frames, width, height, channels) 214 | img_clip.extend(new_clip) 215 | 216 | return img_clip 217 | 218 | 219 | def generate_npy(model, video_dir, clip_file, outfile): 220 | 221 | vclips = load_file(clip_file) 222 | vclips = sorted(vclips.items(), key=lambda a:a[0]) 223 | dataset_size = len(vclips) 224 | print(dataset_size) 225 | 226 | i0 = 0 227 | _t = {'misc': utils.Timer()} 228 | for i, (vname, clip) in enumerate(vclips): 229 | #if i <= 4000: continue 230 | #if i > 10000: break 231 | out_file = osp.join(outfile, vname+'.npy') 232 | if osp.exists(out_file): 233 | continue 234 | video_path = osp.join(video_dir, vname+'.mp4') 235 | if not osp.exists(video_path): 236 | # print(video_path) 237 | continue 238 | clips = extract_clip_frames(video_path, clip) 239 | if clips == None: continue 240 | clips = np.asarray(clips) 241 | clip_feat = run_batch(clips, model) 242 | clip_feat = clip_feat.squeeze()#(32, 2048) 243 | 244 | feat = clip_feat.reshape(8, 4, 2048) 245 | dirname = osp.dirname(out_file) 246 | if not osp.exists(dirname): 247 | os.makedirs(dirname) 248 | np.save(out_file, feat) 249 | if i % 200 == 0: 250 | print(f'{i}/{dataset_size}') 251 | 252 | def prepare_inputs(path, frame_list): 253 | video_data = [np.asarray(Image.open(osp.join(path, img))) for img in frame_list] 254 | video_data = np.asarray(video_data) 255 | total_frames = video_data.shape[0] 256 | img_size = (224, 224) 257 | video_inputs = [] 258 | for j in range(total_frames): 259 | frame_data = video_data[j] 260 | img = Image.fromarray(frame_data) 261 | img = imresize(img, img_size, interp='bicubic') 262 | img = img.transpose(2, 0, 1)[None] 263 | frame_data = np.array(img) 264 | video_inputs.append(frame_data) 265 | video_inputs = np.asarray(video_inputs) 266 | # print(video_inputs.shape) 267 | return video_inputs 268 | 269 | def generate_npy_byframe(model, video_list_file, video_dir, out_dir): 270 | videos = load_file(video_list_file) 271 | vnum = len(videos) 272 | for iv, vname in enumerate(videos): 273 | # if iv <= 2400: continue 274 | # if iv > 3000: break 275 | fpath = f'{video_dir}/{vname}' 276 | frames = sorted(os.listdir(fpath)) 277 | out_path = osp.join(out_dir, vname) 278 | if osp.exists(out_path): continue 279 | videos = prepare_inputs(fpath, frames) 280 | fnum = videos.shape[0] 281 | if fnum > 100: 282 | it = fnum//100 283 | left = fnum % 100 284 | video_feats = [] 285 | for i 
in range(it): 286 | data = run_batch(videos[i*100:100*(i+1)], model) 287 | video_feats.append(data) 288 | if left > 0: 289 | data = run_batch(videos[i*100:(i*100)+left], model) 290 | video_feats.append(data) 291 | # print(len(video_feats)) 292 | video_feats = np.concatenate(video_feats, 0) 293 | assert video_feats.shape[0] == fnum, 'error' 294 | else: 295 | video_feats = run_batch(videos, model) 296 | video_feats = video_feats.squeeze() 297 | if not osp.exists(out_path): 298 | os.makedirs(out_path) 299 | for iff, frame in enumerate(frames): 300 | fname = frame.split('.')[0] 301 | fpath_out = f'{out_path}/{fname}' 302 | # if osp.exists(fpath_out+'.npy'): continue 303 | np.save(fpath_out, video_feats[iff]) 304 | if iv % 100 == 0: 305 | print(f'{iv}/{vnum}') 306 | 307 | 308 | def generate_h5(model, v_path, v_file, num_clips, outfile): 309 | """ 310 | Args: 311 | model: loaded pretrained model for feature extraction 312 | video_ids: list of video ids 313 | num_clips: expected numbers of splitted clips 314 | outfile: path of output file to be written 315 | Returns: 316 | h5 file containing visual features of splitted clips. 317 | """ 318 | if args.dataset == "tgif-qa": 319 | if not os.path.exists('dataset/tgif-qa/{}'.format(args.question_type)): 320 | os.makedirs('dataset/tgif-qa/{}'.format(args.question_type)) 321 | else: 322 | if not os.path.exists(args.dataset): 323 | os.makedirs(args.dataset) 324 | 325 | vlist = load_file(v_file) 326 | dataset_size = len(vlist) 327 | print(dataset_size) 328 | vnames = [] 329 | with h5py.File(outfile, 'w') as fd: 330 | feat_dset = None 331 | video_ids_dset = None 332 | i0 = 0 333 | _t = {'misc': utils.Timer()} 334 | for i in range(0, dataset_size): 335 | # if i < 20: continue 336 | _t['misc'].tic() 337 | 338 | video_path = osp.join(v_path, str(vlist[i])) 339 | 340 | clips, valid = extract_clips_with_consecutive_frames(video_path, num_clips=num_clips, num_frames_per_clip=16) 341 | 342 | nclip, nframe = 8, 4 343 | if args.feature_type == 'appearance': 344 | clip_feat = [] 345 | if valid: 346 | # for clip_id, clip in enumerate(clips): 347 | # feats = run_batch(clip, model) # (16, 2048) 348 | # feats = feats.squeeze() 349 | # clip_feat.append(feats) 350 | # t4 = time.time() 351 | clips = np.asarray(clips).squeeze() 352 | clips = clips.reshape(clips.shape[0]*clips.shape[1], clips.shape[2],clips.shape[3],clips.shape[4]) 353 | 354 | clips = torch.FloatTensor(clips).cuda().squeeze() 355 | # print(clips.shape) 356 | clip_feat = model(clips).squeeze() 357 | # print(clip_feat.shape) 358 | clip_feat = clip_feat.view(nclip, nframe, -1).detach().cpu().numpy() 359 | else: 360 | clip_feat = np.zeros(shape=(nclip, nframe, 2048)) 361 | 362 | if feat_dset is None: 363 | print(clip_feat.shape) 364 | C, F, D = clip_feat.shape 365 | feat_dset = fd.create_dataset('resnet_features', (dataset_size, C, F, D), 366 | dtype=np.float32) 367 | video_ids_dset = fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int) 368 | 369 | elif args.feature_type == 'motion': 370 | if valid: 371 | clip_torch = torch.FloatTensor(np.asarray(clips)).cuda() 372 | clip_feat = model(clip_torch) # (8, 2048) 373 | clip_feat = clip_feat.squeeze() 374 | clip_feat = clip_feat.detach().cpu().numpy() 375 | else: 376 | clip_feat = np.zeros(shape=(nclip, 2048)) 377 | if feat_dset is None: 378 | print(clip_feat.shape) 379 | C, D = clip_feat.shape 380 | feat_dset = fd.create_dataset('resnext_features', (dataset_size, C, D), 381 | dtype=np.float32) 382 | video_ids_dset = fd.create_dataset('ids', 
shape=(dataset_size,), dtype=np.int) 383 | 384 | 385 | i1 = i0 + 1 386 | feat_dset[i0:i1] = clip_feat 387 | video_ids_dset[i0:i1] = int(vlist[i]) 388 | i0 = i1 389 | _t['misc'].toc() 390 | 391 | if (i % 100 == 0): 392 | print('{:d}/{:d} {:.3f}s (projected finish: {:.2f} hours)' \ 393 | .format(i1, dataset_size, _t['misc'].average_time, 394 | _t['misc'].average_time * (dataset_size - i1) / 3600)) 395 | 396 | varry = np.array(vlist, dtype=object) 397 | string_dt = h5py.special_dtype(vlen=str) 398 | fd.create_dataset('ids', data=varry, dtype=string_dt) 399 | 400 | 401 | if __name__ == '__main__': 402 | parser = argparse.ArgumentParser() 403 | parser.add_argument('--gpu_id', type=int, default=0, help='specify which gpu will be used') 404 | # dataset info 405 | parser.add_argument('--dataset', default='nextqa', choices=['tgif-qa', 'msvd', 'star', 'msrvtt', 'nextqa','webvid', 'causalvid'], type=str) 406 | parser.add_argument('--question_type', default='none', choices=['frameqa', 'count', 'transition', 'action', 'none'], type=str) 407 | # output 408 | parser.add_argument('--out', dest='outfile', 409 | help='output filepath', 410 | default="../../data/nextqa/feat_{}.h5", type=str) 411 | # image sizes 412 | parser.add_argument('--num_clips', default=32, type=int) 413 | parser.add_argument('--image_height', default=112*2, type=int) 414 | parser.add_argument('--image_width', default=112*2, type=int) 415 | 416 | # network params 417 | parser.add_argument('--model', default='resnet101', choices=['resnet101', 'resnext101'], type=str) 418 | parser.add_argument('--seed', default='666', type=int, help='random seed') 419 | args = parser.parse_args() 420 | if args.model == 'resnet101': 421 | args.feature_type = 'appearance' 422 | elif args.model == 'resnext101': 423 | args.feature_type = 'motion' 424 | else: 425 | raise Exception('Feature type not supported!') 426 | # set gpu 427 | if args.model != 'resnext101': 428 | torch.cuda.set_device(args.gpu_id) 429 | torch.manual_seed(args.seed) 430 | np.random.seed(args.seed) 431 | 432 | # annotation files 433 | if args.dataset == 'tgifqa': 434 | args.annotation_file = '/storage_fast/jbxiao/workspace/VideoQA/data/{args.dataset}/videos.json' 435 | args.video_dir = '/raid/jbxiao/data/tgifqa/frames/' 436 | args.outfile = '../../data/{}/{}/{}_{}_{}_feat.h5' 437 | video_paths = tgif_qa.load_video_paths(args) 438 | random.shuffle(video_paths) 439 | # load model 440 | if args.model == 'resnet101': 441 | model = build_resnet() 442 | elif args.model == 'resnext101': 443 | model = build_resnext() 444 | generate_h5(model, video_paths, args.num_clips, 445 | args.outfile.format(args.dataset, args.question_type, args.dataset, args.question_type, args.feature_type)) 446 | 447 | elif args.dataset == 'webvid': 448 | args.video_dir = '/raid/jbxiao/data/WebVid/videos/' 449 | if args.model == 'resnet101': 450 | model = build_resnet() 451 | elif args.model == 'resnext101': 452 | model = build_resnext() 453 | clip_file = f'/storage_fast/jbxiao/workspace/VideoQA/data/datasets/webvid/val_clip.json' 454 | generate_npy(model, args.video_dir, clip_file, args.outfile) 455 | 456 | 457 | elif args.dataset == 'msvd-qa': 458 | args.annotation_file = '/ceph-g/lethao/datasets/msvd/MSVD-QA/{}_qa.json' 459 | args.video_dir = '/ceph-g/lethao/datasets/msvd/MSVD-QA/video/' 460 | args.video_name_mapping = '/ceph-g/lethao/datasets/msvd/youtube_mapping.txt' 461 | video_paths = msvd_qa.load_video_paths(args) 462 | random.shuffle(video_paths) 463 | # load model 464 | if args.model == 'resnet101': 465 | model 
466 |         elif args.model == 'resnext101':
467 |             model = build_resnext()
468 |         generate_h5(model, video_paths, args.num_clips,
469 |                     args.outfile.format(args.dataset, args.dataset, args.feature_type))
470 |
471 |     elif args.dataset == 'nextqa':
472 |         args.video_list_file = '../datasets/nextqa/vlist.json' #obtained from train/val/test csv files
473 |         args.video_dir = '/storage/jbxiao/workspace/data/nextqa/frames/' #extracted video frames, refer to extract_video.py
474 |         if args.model == 'resnet101':
475 |             model = build_resnet()
476 |         elif args.model == 'resnext101':
477 |             model = build_resnext()
478 |         args.image_height = 112
479 |         args.image_width = 112
480 |         generate_h5(model, args.video_dir, args.video_list_file, args.num_clips, args.outfile.format(args.feature_type))
481 |
--------------------------------------------------------------------------------
/tools/split_dataset_feat.py:
--------------------------------------------------------------------------------
1 | # ====================================================
2 | # @Time : 6/5/21 1:32 PM
3 | # @Author : Xiao Junbin
4 | # @Email : junbin@comp.nus.edu.sg
5 | # @File : split_dataset_feat.py
6 | # ====================================================
7 | import h5py
8 | import numpy as np
9 | import os
10 | import os.path as osp
11 | import pandas as pd
12 |
13 |
14 | def np2h5(in_dir, out_dir, video_list, mode):
15 |     out_file = osp.join(out_dir, 'region_16c20b_{}.h5'.format(mode))
16 |     video_fd = h5py.File(out_file, 'w')
17 |     feat_dset, bbox_dset, ids_dset = None, None, None
18 |     bbox_num = 20
19 |     for video in video_list:
20 |         bbox_file = osp.join(in_dir, str(video) + '.npz')
21 |         npz = np.load(bbox_file)
22 |         roi_feat = npz['feat']
23 |         bnum = roi_feat.shape[2]
24 |         roi_bbox = npz['bbox']
25 |         # if bnum < bbox_num:
26 |         #     add_num = bbox_num - bnum
27 |         #     print(add_num)
28 |         #     add_feat, add_bbox = [], []
29 |         #     for _ in range(add_num):
30 |         #         add_feat.append(roi_feat[:, :, bnum-1, :])
31 |         #         add_bbox.append(roi_bbox[:, :, bnum-1, :])
32 |         #     add_feat = np.asarray(add_feat).transpose(1, 2, 0, 3)
33 |         #     add_bbox = np.asarray(add_bbox).transpose(1, 2, 0, 3)
34 |         #     print(add_feat.shape, add_bbox.shape)
35 |         #     roi_feat = np.concatenate((roi_feat, add_feat), axis=2)
36 |         #     roi_bbox = np.concatenate((roi_bbox, add_bbox), axis=2)
37 |
38 |         roi_feat = roi_feat[:, :, :bbox_num, :]
39 |
40 |         roi_bbox = roi_bbox[:, :, :bbox_num, :]
41 |         # print(roi_feat.shape, roi_bbox.shape)
42 |         if feat_dset is None:
43 |             dataset_size = len(video_list)
44 |             C, F, R, D = roi_feat.shape
45 |             feat_dset = video_fd.create_dataset('feat', (dataset_size, C, F, R, D),
46 |                                                 dtype=np.float32)
47 |             ids_dset = video_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
48 |             C, F, R, D = roi_bbox.shape
49 |             bbox_dset = video_fd.create_dataset('bbox', shape=(dataset_size, C, F, R, D),
50 |                                                 dtype=np.float32)
51 |             ival = 0
52 |
53 |         feat_dset[ival:(ival + 1)] = roi_feat
54 |         bbox_dset[ival:(ival + 1)] = roi_bbox
55 |         ids_dset[ival:(ival + 1)] = int(video)
56 |
57 |         ival += 1
58 |     print('Save to {}'.format(out_file))
59 |
60 | def split_dataset_feat(filename, out_dir, train_list, val_list, test_list):
61 |
62 |     train_fd = h5py.File(osp.join(out_dir, 'app_feat_train.h5'), 'w')
63 |     val_fd = h5py.File(osp.join(out_dir, 'app_feat_val.h5'), 'w')
64 |     test_fd = h5py.File(osp.join(out_dir, 'app_feat_test.h5'), 'w')
65 |     val_feat_dset, val_ids_dset = None, None
66 |     test_feat_dset, test_ids_dset = None, None
67 |     train_feat_dset, train_ids_dset = None, None
68 |
69 |     feat_name = 'resnet_features'
70 |     with h5py.File(filename, 'r') as fp:
71 |         vids = fp['ids']
72 |         feats = fp[feat_name]
73 |         for vid, feat in zip(vids, feats):
74 |             if vid in val_list:
75 |                 if val_feat_dset is None:
76 |                     dataset_size = len(val_list)
77 |                     C, F, D = feat.shape
78 |                     # C, D = feat.shape
79 |                     val_feat_dset = val_fd.create_dataset(feat_name, (dataset_size, C, F, D),
80 |                                                           dtype=np.float32)
81 |                     val_ids_dset = val_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
82 |                     ival = 0
83 |                 val_feat_dset[ival:(ival+1)] = feat
84 |                 val_ids_dset[ival:(ival+1)] = int(vid)
85 |                 ival += 1
86 |             elif vid in test_list:
87 |                 if test_feat_dset is None:
88 |                     dataset_size = len(test_list)
89 |                     C, F, D = feat.shape
90 |                     # C, D = feat.shape
91 |                     test_feat_dset = test_fd.create_dataset(feat_name, (dataset_size, C, F, D),
92 |                                                             dtype=np.float32)
93 |                     test_ids_dset = test_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
94 |                     itest = 0
95 |
96 |                 test_feat_dset[itest:(itest + 1)] = feat
97 |                 test_ids_dset[itest:(itest + 1)] = int(vid)
98 |                 itest += 1
99 |             else:
100 |                 if train_feat_dset is None:
101 |                     dataset_size = len(train_list)
102 |                     C, F, D = feat.shape
103 |                     # C, D = feat.shape
104 |                     train_feat_dset = train_fd.create_dataset(feat_name, (dataset_size, C, F, D),
105 |                                                               dtype=np.float32)
106 |                     train_ids_dset = train_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
107 |                     itrain = 0
108 |
109 |                 train_feat_dset[itrain:(itrain + 1)] = feat
110 |                 train_ids_dset[itrain:(itrain + 1)] = int(vid)
111 |                 itrain += 1
112 |
113 | def get_video_list(filename):
114 |     samples = pd.read_csv(filename)
115 |     videos = samples['video']
116 |     videos = list(set(videos))
117 |     print(len(videos))
118 |     return sorted(videos)
119 |
120 | def main():
121 |     dataset = 'nextqa'
122 |     data_dir = '../../data/{}/'.format(dataset)
123 |     dataset_dir = '../datasets/{}/'.format(dataset)
124 |     # in_dir = osp.join(data_dir, 'region_n')
125 |     out_dir = osp.join(data_dir, 'frame_feat')
126 |     train_file = osp.join(dataset_dir, 'train.csv')
127 |     val_file = osp.join(dataset_dir, 'val.csv')
128 |     test_file = osp.join(dataset_dir, 'test.csv')
129 |     train_list = get_video_list(train_file)
130 |     val_list = get_video_list(val_file)
131 |     test_list = get_video_list(test_file)
132 |
133 |     # np2h5(in_dir, out_dir, test_list, 'test')
134 |     # np2h5(in_dir, out_dir, val_list, 'val')
135 |     # np2h5(in_dir, out_dir, train_list, 'train')
136 |
137 |     h5filename = osp.join(out_dir, 'feat_appearance.h5')
138 |     split_dataset_feat(h5filename, out_dir, train_list, val_list, test_list)
139 |
140 |
141 | if __name__ == "__main__":
142 |     main()
--------------------------------------------------------------------------------
/train/__pycache__/train_covgt.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/train/__pycache__/train_covgt.cpython-38.pyc
--------------------------------------------------------------------------------
/train/train_covgt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import logging
5 | import collections
6 | from util import compute_aggreeings, AverageMeter, get_mask, mask_tokens
7 | import os.path as osp
8 | import json
9 | #from fvcore.nn import FlopCountAnalysis
10 |
11 | def eval(model, data_loader, a2v, args, test=False, tokenizer="RoBERTa"):
12 |     model.eval()
13 |     count = 0
14 |     metrics, counts = collections.defaultdict(int), collections.defaultdict(int)
15 |
16 |     with torch.no_grad():
17 |         if not args.mc:
18 |             model.module._compute_answer_embedding(a2v)
19 |         results = {}
20 |         for i, batch in enumerate(data_loader):
21 |             answer_id, answer, video_o, video_f, question, question_id, seg_feats, seg_num = (
22 |                 batch["answer_id"],
23 |                 batch["answer"].cuda(),
24 |                 batch["video_o"].cuda(),
25 |                 batch["video_f"].cuda(),
26 |                 batch["question"].cuda(),
27 |                 batch['question_id'],
28 |                 batch['seg_feats'].cuda(),
29 |                 batch['seg_num']
30 |             )
31 |
32 |             video_len = batch["video_len"]
33 |             seq_len = batch["seq_len"]
34 |
35 |             question_mask = (question!=tokenizer.pad_token_id).float() #RoBERTa
36 |             answer_mask = (answer!=tokenizer.pad_token_id).float() #RoBERTa
37 |
38 |             video_mask = get_mask(video_len, video_o.size(1)).cuda()
39 |             count += answer_id.size(0)
40 |             video = (video_o, video_f)
41 |             if not args.mc:
42 |                 predicts = model(
43 |                     video,
44 |                     question,
45 |                     text_mask=question_mask,
46 |                     video_mask=video_mask,
47 |                     seq_len = seq_len
48 |                 )
49 |                 topk = torch.topk(predicts, dim=1, k=10).indices.cpu()
50 |                 if args.dataset != "ivqa":
51 |                     answer_id_expanded = answer_id.view(-1, 1).expand_as(topk)
52 |                 else:
53 |                     answer_id = (answer_id / 2).clamp(max=1)
54 |                     answer_id_expanded = answer_id
55 |                 metrics = compute_aggreeings(
56 |                     topk,
57 |                     answer_id_expanded,
58 |                     [1, 10],
59 |                     ["acc", "acc10"],
60 |                     metrics,
61 |                     ivqa=(args.dataset == "ivqa"),
62 |                 )
63 |                 for bs, qid in enumerate(question_id):
64 |                     results[qid] = {'prediction': int(topk.numpy()[bs,0]), 'answer':int(answer_id.numpy()[bs])}
65 |             else:
66 |                 #############Model FLOPs##########
67 |                 # inputs = (video, question, None, answer.cuda(), seq_len, video_mask, answer_mask)
68 |                 # flops = FlopCountAnalysis(model, inputs)
69 |                 # print('Model FLOPs:', flops.total()/1000000) #use batch_size 1
70 |                 # break
71 |                 ###################################
72 |                 fusion_proj, answer_proj = model(
73 |                     video,
74 |                     question,
75 |                     text_mask=answer_mask,
76 |                     video_mask=video_mask,
77 |                     answer=answer,
78 |                     seq_len = seq_len,
79 |                     seg_feats = seg_feats,
80 |                     seg_num = seg_num
81 |                 )
82 |                 # predicts = fusion_proj.squeeze()
83 |
84 |                 fusion_proj = fusion_proj.unsqueeze(2)
85 |                 predicts = torch.bmm(answer_proj, fusion_proj).squeeze()
86 |
87 |                 predicted = torch.max(predicts, dim=1).indices.cpu()
88 |                 metrics["acc"] += (predicted == answer_id).sum().item()
89 |                 for bs, qid in enumerate(question_id):
90 |                     results[qid] = {'prediction': int(predicted.numpy()[bs]), 'answer':int(answer_id.numpy()[bs])}
91 |
92 |     step = "val" if not test else "test"
93 |
94 |     for k in metrics:
95 |         # print(metrics[k], count)
96 |         v = metrics[k] / count
97 |         logging.info(f"{step} {k}: {v:.2%}")
98 |         break
99 |
100 |     return metrics["acc"] / count, results
101 |
102 |
103 | def train(model, train_loader, a2v, optimizer, criterion, scheduler, epoch, args, tokenizer):
104 |     model.train()
105 |     running_vqa_loss, running_acc, running_mlm_loss, running_cl_loss = (
106 |         AverageMeter(),
107 |         AverageMeter(),
108 |         AverageMeter(),
109 |         AverageMeter()
110 |     )
111 |     for i, batch in enumerate(train_loader):
112 |         answer_id, answer, video_o, video_f, question, seg_feats, seg_num, qsn_id, qsn_token_ids, qsn_seq_len = (
113 |             batch["answer_id"],
114 |             batch["answer"],
115 |             batch["video_o"].cuda(),
116 |             batch["video_f"].cuda(),
117 |             batch["question"].cuda(),
118 |             batch['seg_feats'].cuda(),
119 |             batch['seg_num'],
120 |             batch['qsn_id'],
121 |             batch['qsn_token_ids'],
122 |             batch['qsn_seq_len']
123 |         )
124 |
125 |         video_len = batch["video_len"]
126 |
127 |         question_mask = (question != tokenizer.pad_token_id).float().cuda() #RoBERTa
128 |         answer_mask = (answer!=tokenizer.pad_token_id).float().cuda() #RoBERTa
129 |         video_mask = (
130 |             get_mask(video_len, video_o.size(1)).cuda() if args.max_feats > 0 else None
131 |         )
132 |
133 |         qsn_mask = (qsn_token_ids != tokenizer.pad_token_id).float().cuda()
134 |
135 |         video = (video_o, video_f)
136 |         N = answer_id.size(0)
137 |         seq_len = batch["seq_len"]
138 |         if not args.mc:
139 |             model.module._compute_answer_embedding(a2v)
140 |             predicts = model(
141 |                 video,
142 |                 question,
143 |                 text_mask=question_mask,
144 |                 video_mask=video_mask,
145 |                 seq_len = seq_len
146 |             )
147 |         else:
148 |             fusion_proj, answer_proj = model(
149 |                 video,
150 |                 question,
151 |                 text_mask=answer_mask,
152 |                 video_mask=video_mask,
153 |                 answer=answer.cuda(),
154 |                 seq_len = seq_len,
155 |                 seg_feats = seg_feats,
156 |                 seg_num = seg_num
157 |             )
158 |
159 |             fusion_proj = fusion_proj.unsqueeze(2)
160 |             predicts = torch.bmm(answer_proj, fusion_proj).squeeze()
161 |
162 |         if args.dataset == "ivqa":
163 |             a = (answer_id / 2).clamp(max=1).cuda()
164 |             vqa_loss = criterion(predicts, a)
165 |             predicted = torch.max(predicts, dim=1).indices.cpu()
166 |             predicted = F.one_hot(predicted, num_classes=len(a2v))
167 |             running_acc.update((predicted * a.cpu()).sum().item() / N, N)
168 |         else:
169 |             vqa_loss = criterion(predicts, answer_id.cuda())
170 |             predicted = torch.max(predicts, dim=1).indices.cpu()
171 |             running_acc.update((predicted == answer_id).sum().item() / N, N)
172 |         if args.cl_loss:
173 |             vt_proj, txt_proj = model(
174 |                 video,
175 |                 question,
176 |                 text_mask=qsn_mask,
177 |                 video_mask=video_mask,
178 |                 answer=qsn_token_ids,
179 |                 seq_len = qsn_seq_len,
180 |                 seg_feats = seg_feats,
181 |                 seg_num = seg_num
182 |             )
183 |             vt_proj = vt_proj.unsqueeze(2)
184 |             cl_predicts = torch.bmm(txt_proj, vt_proj).squeeze()
185 |             cl_loss = criterion(cl_predicts, qsn_id.cuda())
186 |             # cl_predicted = torch.max(cl_predicts, dim=1).indices.cpu()
187 |             # running_acc.update((predicted == answer_id).sum().item() / N, N)
188 |
189 |         if args.mlm_prob:
190 |             max_seq_len = args.qmax_words
191 |             if args.mc > 0:
192 |                 tmp_id = [aid+(args.mc*i) for i, aid in enumerate(answer_id)]
193 |                 inputs = answer.view(N*args.mc, -1)[tmp_id,:]
194 |                 # question_mask = (inputs>0).float()
195 |                 question_mask = (inputs!=1).float()
196 |                 max_seq_len = args.amax_words
197 |             else:
198 |                 inputs = batch["question"]
199 |
200 |             inputs, labels = mask_tokens(inputs, tokenizer, mlm_probability=args.mlm_prob)
201 |             mlm_loss = model(
202 |                 video,
203 |                 question=inputs.cuda(),
204 |                 labels=labels.cuda(),
205 |                 text_mask=question_mask,
206 |                 video_mask=video_mask,
207 |                 max_seq_len=max_seq_len,
208 |                 mode="mlm",
209 |             )
210 |             mlm_loss = mlm_loss.mean()
211 |             loss = mlm_loss + vqa_loss
212 |         if args.cl_loss:
213 |             loss = vqa_loss + args.cl_loss*cl_loss
214 |         if args.cl_loss and args.mlm_prob:
215 |             loss = vqa_loss + args.cl_loss*cl_loss + mlm_loss
216 |         if not args.cl_loss and not args.mlm_prob:
217 |             loss = vqa_loss
218 |
219 |         optimizer.zero_grad()
220 |         loss.backward()
221 |         if args.clip:
222 |             nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip)
223 |         optimizer.step()
224 |         scheduler.step()
225 |
226 |         running_vqa_loss.update(vqa_loss.detach().cpu().item(), N)
227 |         if args.mlm_prob:
228 |             running_mlm_loss.update(mlm_loss.detach().cpu().item(), N)
229 |         if args.cl_loss:
230 |             running_cl_loss.update(cl_loss.detach().cpu().item(), N)
231 |         if (i + 1) % (len(train_loader) // args.freq_display) == 0:
232 |             if args.mlm_prob:
233 |                 logging.info(
234 |                     f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
235 |                     f"{running_vqa_loss.avg:.4f}, Training acc: {running_acc.avg:.2%}, MLM loss: {running_mlm_loss.avg:.4f}, Lvq Loss: {running_cl_loss.avg:.4f}"
236 |                 )
237 |             elif args.cl_loss:
238 |                 logging.info(
239 |                     f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
240 |                     f"{running_vqa_loss.avg:.4f}, Train acc: {running_acc.avg:.2%}, Lvq Loss: {running_cl_loss.avg:.4f}"
241 |                 )
242 |             else:
243 |                 logging.info(
244 |                     f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
245 |                     f"{running_vqa_loss.avg:.4f}, Train acc: {running_acc.avg:.2%}"
246 |                 )
247 |             running_acc.reset()
248 |             running_vqa_loss.reset()
249 |             running_mlm_loss.reset()
250 |             running_cl_loss.reset()
251 |
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
1 | import re
2 | import torch
3 | import torch.nn.functional as F
4 | import json
5 | import collections
6 | import numpy as np
7 | import os
8 | import os.path as osp
9 | import pandas as pd
10 | import logging
11 | import pickle as pkl
12 | import string
13 |
14 | def tokenize(
15 |     seq,
16 |     tokenizer,
17 |     add_special_tokens=True,
18 |     max_length=10,
19 |     dynamic_padding=True,
20 |     truncation=True,
21 | ):
22 |     """
23 |     :param seq: sequence of sequences of text
24 |     :param tokenizer: bert_tokenizer
25 |     :return: torch tensor padded up to length max_length of bert tokens
26 |     """
27 |     token_ids = tokenizer.batch_encode_plus(
28 |         seq,
29 |         add_special_tokens=add_special_tokens,
30 |         max_length=max_length,
31 |         padding="longest" if dynamic_padding else "max_length",
32 |         truncation=truncation,
33 |     )["input_ids"]
34 |     # tokens = [tokenizer.tokenize(s, add_special_tokens=add_special_tokens) for s in seq]
35 |     tokens = ''
36 |     return torch.tensor(token_ids, dtype=torch.long), tokens
37 |
38 | def transform_bb(roi_bbox, width, height):
39 |     dshape = list(roi_bbox.shape)
40 |     tmp_bbox = roi_bbox.reshape([-1, 4])
41 |     relative_bbox = tmp_bbox / np.asarray([width, height, width, height])
42 |     relative_area = (tmp_bbox[:, 2] - tmp_bbox[:, 0] + 1) * \
43 |                     (tmp_bbox[:, 3] - tmp_bbox[:, 1] + 1)/ (width*height)
44 |     relative_area = relative_area.reshape(-1, 1)
45 |     bbox_feat = np.hstack((relative_bbox, relative_area))
46 |     dshape[-1] += 1
47 |     bbox_feat = bbox_feat.reshape(dshape)
48 |
49 |     return bbox_feat
50 |
51 |
52 | def compute_aggreeings(topk, answers, thresholds, names, metrics, ivqa=False):
53 |     """ Updates metrics dictionary by computing aggreeings for different thresholds """
54 |     if not ivqa:
55 |         # sp_num = topk.shape[0]
56 |         for i, x in enumerate(thresholds):
57 |             agreeingsx = (topk[:, :x] == answers[:, :x]).sum().item()
58 |             # unk = 0
59 |             # for j in range(sp_num):
60 |             #     if answers[j, 0].item() == 0 and 0 in topk[j, :x].numpy():
61 |             #         unk += 1
62 |             metrics[names[i]] += agreeingsx #-unk
63 |     else:
64 |         for i, x in enumerate(thresholds):
65 |             predicted = F.one_hot(topk[:, :x], num_classes=answers.shape[-1]).sum(1)
66 |             metrics[names[i]] += (predicted * answers).max(1)[0].sum().item()
67 |     return metrics
68 |
69 |
70 | class AverageMeter:
71 |     """ Computes and stores the average and current value for training stats """
72 |
73 |     def __init__(self):
74 |         self.reset()
75 |
76 |     def reset(self):
77 |         """ Reset all statistics """
78 |         self.val = 0
79 |         self.avg = 0
80 |         self.sum = 0
81 |         self.count = 0
82 |
83 |     def update(self, val, n=1):
84 |         """ Update statistics """
85 |         self.val = val
86 |         self.sum += val * n
87 |         self.count += n
88 |         self.avg = self.sum / self.count
89 |
90 |
91 | def get_mask(lengths, max_length):
92 |     """ Computes a batch of padding masks given batched lengths """
93 |     mask = 1 * (
94 |         torch.arange(max_length).unsqueeze(1).to(lengths.device) < lengths
95 |     ).transpose(0, 1)
96 |     return mask
97 |
98 |
99 | def compute_a2v(vocab_path, bert_tokenizer, amax_words):
100 |     """ Precomputes tokenized answer representations for all answers in the vocabulary """
101 |     a2id = json.load(open(vocab_path, "r"))
102 |     # a2id['[UNK]'] = 0
103 |     id2a = {v: k for k, v in a2id.items()}
104 |     a2v, _ = tokenize(
105 |         list(a2id.keys()),
106 |         bert_tokenizer,
107 |         add_special_tokens=True,
108 |         max_length=amax_words,
109 |         dynamic_padding=True,
110 |         truncation=True,
111 |     )
112 |     if torch.cuda.is_available():
113 |         a2v = a2v.cuda()  # (vocabulary_size, 1, we_dim)
114 |     return a2id, id2a, a2v
115 |
116 |
117 | def mask_tokens(inputs, tokenizer, mlm_probability):
118 |     """
119 |     Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
120 |     """
121 |     if tokenizer.mask_token is None:
122 |         raise ValueError(
123 |             "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
124 |         )
125 |
126 |     labels = inputs.clone()
127 |     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
128 |
129 |     probability_matrix = torch.full(labels.shape, mlm_probability)
130 |     # find special token
131 |     special_tokens_mask = [
132 |         tokenizer.get_special_tokens_mask(tkid, already_has_special_tokens=True)
133 |         for tkid in labels.tolist()
134 |     ]
135 |     # do not mask special token
136 |     probability_matrix.masked_fill_(
137 |         torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
138 |     )
139 |
140 |     if tokenizer._pad_token is not None:
141 |         padding_mask = labels.eq(tokenizer.pad_token_id)
142 |         probability_matrix.masked_fill_(padding_mask, value=0.0)
143 |
144 |     masked_indices = torch.bernoulli(probability_matrix).bool()
145 |     labels[~masked_indices] = -100  # We only compute loss on masked tokens
146 |
147 |     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
148 |     indices_replaced = (
149 |         torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
150 |     )
151 |     inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
152 |
153 |     # 10% of the time, we replace masked input tokens with random word
154 |     indices_random = (
155 |         torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
156 |         & masked_indices
157 |         & ~indices_replaced
158 |     )
159 |     random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
160 |     inputs[indices_random] = random_words[indices_random]
161 |
162 |     # The rest of the time (10% of the time) we keep the masked input tokens unchanged
163 |
164 |     return inputs, labels
165 |
166 |
167 | def get_types(dataset):
168 |     """ Type2Id mapping for VideoQA datasets """
169 |     if dataset == "tgif":
170 |         return {"what": 0, "how": 1, "color": 2, "where": 3}
171 |     elif dataset == "activitynet":
172 |         return {
173 |             "motion": 0,
174 |             "spatial": 1,
175 |             "temporal": 2,
176 |             "yesno": 3,
177 |             "color": 4,
178 |             "object": 5,
179 |             "location": 6,
180 |             "number": 7,
181 |             "other": 8,
182 |         }
183 |     elif dataset == "msvd" or dataset == "msrvtt":
184 |         return {"what": 0, "how": 1, "color": 2, "where": 3, "who": 4, "when": 5}
185 |     elif dataset == "ivqa":
186 |         return {"scenes": 0}
187 |     else:
188 |         raise NotImplementedError
189 |
190 |
191 | def get_most_common(loader, ivqa=False, n=4):
192 |     """ Outputs the most common answers and splits the answers into n parts depending on their frequency """
193 |     if ivqa:
194 |         ans = []
195 |         for a1, a2, a3, a4, a5 in zip(
196 |             list(loader.dataset.data["answer1"]),
197 |             list(loader.dataset.data["answer2"]),
198 |             list(loader.dataset.data["answer3"]),
199 |             list(loader.dataset.data["answer4"]),
200 |             list(loader.dataset.data["answer5"]),
201 |         ):
202 |             counteri = collections.Counter([a1, a2, a3, a4, a5])
203 |             for w in counteri:
204 |                 if (
205 |                     counteri[w] >= 2
206 |                 ):  # an answer is considered correct if it has been annotated by at least two workers
207 |                     ans.append(w)
208 |     else:
209 |         ans = list(loader.dataset.data["answer"])
210 |     most_common = collections.Counter(ans).most_common()
211 |
212 |     total = sum(x[1] for x in most_common)
213 |     splits = [0] * (n + 1)
214 |     j = 0
215 |     for i in range(n):
216 |         cur_total = 0
217 |         while j < len(most_common) and cur_total < total / n:
218 |             cur_total += most_common[j][1]
219 |             j += 1
220 |         splits[i + 1] = j
221 |     return most_common, splits, total
222 |
223 |
224 | def compute_word_stats(
225 |     topk, answers, a2id, a2v, most_common, metrics, counts, ivqa, top10=False
226 | ):
227 |     """ Similar to compute_aggreeings: computes agreement and word counts for the most common words """
228 |     if not ivqa:
229 |         for word, cword in most_common:
230 |             if word not in a2id:
231 |                 counts[word] = cword
232 |                 continue
233 |             predicted = topk[:, 0]
234 |             metrics[f"acc_{word}"] += (
235 |                 (predicted[answers == a2id[word]] == a2id[word]).sum().item()
236 |             )
237 |             if top10:
238 |                 predicted10 = topk[:, :10]
239 |                 metrics[f"acc10_{word}"] += (
240 |                     (predicted10[answers == a2id[word]] == a2id[word]).sum().item()
241 |                 )
242 |             counts[word] += (answers == a2id[word]).sum().item()
243 |     else:
244 |         for word, cword in most_common:
245 |             if word not in a2id:
246 |                 counts[word] = cword
247 |                 continue
248 |             predicted = F.one_hot(topk[:, 0], num_classes=len(a2v))
249 |             ans_word = answers[:, a2id[word]]
250 |             metrics[f"acc_{word}"] += (
251 |                 (predicted[:, a2id[word]][ans_word == 1] * ans_word[ans_word == 1])
252 |                 .sum()
253 |                 .item()
254 |             )
255 |             if top10:
256 |                 predicted10 = F.one_hot(topk[:, :10], num_classes=len(a2v)).sum(1)
257 |                 metrics[f"acc10_{word}"] += (
258 |                     (
259 |                         predicted10[:, a2id[word]][ans_word == 1]
260 |                         * ans_word[ans_word == 1]
261 |                     )
262 |                     .sum()
263 |                     .item()
264 |                 )
265 |             counts[word] += (ans_word == 1).sum().item()
266 |     return metrics, counts
267 |
268 |
269 | def compute_metrics(x):
270 |     sx = np.sort(-x, axis=1)
271 |     d = np.diag(-x)
272 |     d = d[:, np.newaxis]
273 |     ind = sx - d
274 |     ind = np.where(ind == 0)
275 |     ind = ind[1]
276 |     metrics = {}
277 |     metrics["R1"] = float(np.sum(ind == 0)) / len(ind)
278 |     metrics["R10"] = float(np.sum(ind < 10)) / len(ind)
279 |     metrics["R100"] = float(np.sum(ind < 100)) / len(ind)
280 |     metrics["MR"] = np.median(ind) + 1
281 |     return metrics
282 |
283 |
284 | def print_computed_metrics(metrics):
285 |     r1 = metrics["R1"]
286 |     r10 = metrics["R10"]
287 |     r100 = metrics["R100"]
288 |     mr = metrics["MR"]
289 |     return "R@1: {:.4f} - R@10: {:.4f} - R@100: {:.4f} - Median R: {}".format(
290 |         r1, r10, r100, mr
291 |     )
292 |
293 |
294 | #added by Junbin
295 | def get_qsn_type(qsn, ans_rsn):
296 |     dos = ['does', 'do', 'did']
297 |     bes = ['was', 'were', 'is', 'are']
298 |     w5h1 = ['what', 'who', 'which', 'why', 'how', 'where']
299 |     qsn_sp = qsn.split()
300 |     type = qsn_sp[0].lower()
301 |     if type == 'what':
302 |         if qsn_sp[1].lower() in dos:
303 |             type = 'whata'
304 |         elif qsn_sp[1].lower() in bes:
305 |             type = 'whatb'
306 |         else:
307 |             type = 'whato'
308 |     elif type == 'how':
309 |         if qsn_sp[1].lower() == 'many':
310 |             type = 'howm'
311 |     elif type not in w5h1:
312 |         type = 'other'
313 |     if ans_rsn in ['pr', 'cr']:
314 |         type += 'r'
315 |     return type
316 |
317 | def major_type(tgroup):
318 |     ans_num = 0
319 |     mtype = ''
320 |     for type, item in tgroup.items():
321 |         if len(item) > ans_num:
322 |             ans_num = len(item)
323 |             mtype = type
324 |     return mtype
325 |
326 | def group(csv_data, gt=True):
327 |     ans_group, qsn_group = {}, {}
328 |     for idx, row in csv_data.iterrows():
329 |         qsn, ans = row['question'], row['answer']
330 |         if gt:
331 |             type = row['type']
332 |             if type == 'TP': type = 'TN'
333 |         else:
334 |             type = 'null' if 'type' not in row else row['type']
335 |             type = get_qsn_type(qsn, type)
336 |         if type not in ans_group:
337 |             ans_group[type] = {ans}
338 |             qsn_group[type] = {qsn}
339 |         else:
340 |             ans_group[type].add(ans)
341 |             qsn_group[type].add(qsn)
342 |     return ans_group, qsn_group
343 |
344 |
345 | def load_model_by_key(cur_model, model_path):
346 |     model_dict = torch.load(model_path)
347 |     new_model_dict = {}
348 |     for k, v in cur_model.state_dict().items():
349 |         if k in model_dict:
350 |             v = model_dict[k]
351 |         else:
352 |             pass
353 |             # print(k)
354 |         new_model_dict[k] = v
355 |     return new_model_dict
356 |
357 |
358 | def load_file(filename):
359 |     '''
360 |     added by junbin Xiao
361 |     '''
362 |     file_type = osp.splitext(filename)[-1]
363 |     if file_type == '.csv':
364 |         data = pd.read_csv(filename)
365 |     else:
366 |         with open(filename, 'r') as fp:
367 |             if file_type == '.json':
368 |                 data = json.load(fp)
369 |             elif file_type == '.txt':
370 |                 data = fp.readlines()
371 |                 data = [datum.rstrip('\n') for datum in data]
372 |     return data
373 |
374 |
375 | def save_to(filename, data):
376 |     '''
377 |     added by junbin Xiao
378 |     '''
379 |     logging.info(f'Save to {filename}')
380 |     dirname = osp.dirname(filename)
381 |     if not osp.exists(dirname):
382 |         os.makedirs(dirname)
383 |     with open(filename, 'w') as fp:
384 |         json.dump(data, fp)
385 |
386 | def pkload(filename):
387 |     with open(filename, 'rb') as fp:
388 |         data = pkl.load(fp)
389 |     return data
390 |
--------------------------------------------------------------------------------