├── .gitignore
├── LICENSE
├── README.md
├── args.py
├── dataloader
│   ├── .gitignore
│   └── cvqa_loader.py
├── datasets
│   └── nextqa
│       ├── map_vid_vidorID.json
│       ├── test.csv
│       ├── train.csv
│       ├── val.csv
│       └── vlist.json
├── eval_next.py
├── global_parameters.py
├── loss.py
├── main.py
├── misc
│   ├── CoVGT-res.png
│   └── CoVGT.png
├── model
│   ├── .gitignore
│   ├── CoVGT.py
│   ├── EncoderVid.py
│   ├── cmatt.py
│   ├── graph.py
│   ├── language_model.py
│   └── vqa_model.py
├── requirements.txt
├── shells
│   ├── cvid_test.sh
│   ├── cvid_train.sh
│   ├── msrvtt_test.sh
│   ├── msrvtt_train.sh
│   ├── next_test.sh
│   ├── next_train.sh
│   ├── tgif_ftrain.sh
│   ├── tgif_test.sh
│   ├── tgif_train.sh
│   └── webvid_train.sh
├── tools
│   ├── __pycache__
│   │   └── object_align.cpython-38.pyc
│   ├── bbox_visualizer.py
│   ├── colors.txt
│   ├── datautils
│   │   ├── msrvtt_qa.py
│   │   ├── msvd_qa.py
│   │   ├── nextqa.py
│   │   ├── tgif_qa.py
│   │   └── utils.py
│   ├── demo.py
│   ├── extract_video.py
│   ├── feat_app.sh
│   ├── models
│   │   ├── __init__.py
│   │   ├── densenet.py
│   │   ├── pre_act_resnet.py
│   │   ├── resnet.py
│   │   ├── resnext.py
│   │   └── wide_resnet.py
│   ├── object_align.py
│   ├── preprocess_features.py
│   └── split_dataset_feat.py
├── train
│   ├── __pycache__
│   │   └── train_covgt.cpython-38.pyc
│   └── train_covgt.py
└── util.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | __pycache__
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Contrastive Video Question Answering via Video Graph Transformer
2 |
3 | Abstract
4 | This repo holds the code for our paper CoVGT accepted to IEEE T-PAMI'23. The work extends our preliminary publication at ECCV'22. We highlight the following differences compared to the conference version:
5 |
6 | * Jointly supervised and self-supervised contrastive objectives to optimize VGT.
7 | * Substitute BERT with a stronger language model (e.g., RoBERTa) for QA embedding.
8 | * Extended results on Causal-VidQA and STAR-QA and more comprehensive ablation studies.
9 |
10 | The code is based on VGT.
11 |
12 |
13 |
14 |
15 | ![CoVGT overview](misc/CoVGT.png)
16 |
17 |
18 | ## Todo
19 | 1. [ ] Release the features of the other datasets. Please email the first author and state your purpose, as the data is strictly for research use.
20 |
21 | ## Environment
22 | Assuming you have installed Anaconda3 and CUDA >= 11.0 on a GPU with at least 24 GB of memory, run the following to set up the environment:
23 | ```
24 | >conda create -n videoqa python==3.8.16
25 | >conda activate videoqa
26 | >git clone https://github.com/doc-doc/CoVGT.git
27 | >pip install -r requirements.txt
28 | >conda install pytorch==1.8.1 torchvision==0.9.1 cudatoolkit=11.1 -c pytorch -c nvidia
29 | ```
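As an optional sanity check (not part of the original instructions), you can verify that PyTorch was installed with GPU support:
```
python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
```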
30 | ## Preparation
31 | Please create a data folder outside this repo, so that your workspace contains two folders: 'workspace/data/' and 'workspace/CoVGT/'.
32 |
33 | Below we use NExT-QA as an example to get you familiar with the code.
34 | Please download the related video features and QA annotations via the links provided in the ```Results and Resources``` section. Note that the QA annotations are placed in ```workspace/CoVGT/datasets/nextqa/``` once you clone this repo; download the video features into ```workspace/data/nextqa/``` and the checkpoint files into ```workspace/data/save_models/nextqa/```. Change the default paths in global_parameters.py and args.py for your own datasets. A sketch of the resulting workspace layout is shown below.
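For reference, an illustrative workspace layout after this step (the exact dataset folders depend on which datasets you download):
```
workspace/
├── CoVGT/                      # this repo
│   └── datasets/nextqa/        # QA annotations (shipped with the repo)
└── data/
    ├── nextqa/                 # downloaded video features
    └── save_models/
        └── nextqa/             # downloaded / trained checkpoints
```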
35 |
36 | ## Inference
37 | ```
38 | ./shells/next_test.sh 0
39 | ```
40 | ## Evaluation
41 | ```
42 | python eval_next.py --folder CoVGT_FTCoWV --mode test
43 | ```
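Per eval_next.py, the evaluation script looks for the prediction file at ```../data/save_models/nextqa/<folder>/<mode>-res.json```, so the command above expects something like:
```
ls ../data/save_models/nextqa/CoVGT_FTCoWV/test-res.json
```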
44 |
45 | ## Results and Resources
46 | **Table 1. VideoQA Accuracy (%) on Test Set.**
47 |
48 | | Cross-Modal Pretrain | NExT-QA | Causal-VidQA | STAR | TGIF-QA (Action) | TGIF-QA (Trans) | TGIF-QA (FrameQA) | TGIF-QA-R* (Action) | TGIF-QA-R* (Trans) | MSRVTT-QA |
49 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
50 | | - | 59.4 | 59.1 | 44.0 | 94.7 | 97.6 | 61.6 | 60.8 | 73.8 | 38.3 |
51 | | WebVid0.18M | 59.7 | 60.8 | 46.2 | 91.3 | 96.2 | 61.7 | 61.0 | 73.2 | 40.0 |
52 | | - | feats | feats | feats | feats | feats | feats | feats | feats | feats |
53 | | - | videos | videos | videos | videos | videos | videos | videos | videos | videos |
54 | | - | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A |
55 |
122 | (The feature files are identical to those of VGT. We have merged some files of the same dataset to reduce the number of download links.)
123 |
124 | ## Train
125 | We provide all the training scripts in the folder 'shells'; start training by appending the GPU ID(s) to the script. (If you have multiple GPUs, separate the IDs with commas: ./shells/next_train.sh 0,1)
126 | ```
127 | ./shells/next_train.sh 0
128 | ```
129 | It will train the model and save the best checkpoint to '../data/save_models/nextqa/CoVGT/'. You should obtain accuracies of around 60.1% on the val set and 59.4% on the test set.
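main.py keeps the best validation checkpoint as best_model.pth and writes the validation predictions to val-res.json under the save_dir set in shells/next_train.sh ('../data/save_models/nextqa/CoVGT/'), so the freshly trained model can be scored with, for example:
```
python eval_next.py --folder CoVGT --mode val
```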
130 |
131 | ### Result Visualization (NExT-QA)
132 |
133 | ![Result visualization on NExT-QA](misc/CoVGT-res.png)
134 |
135 |
136 | ## Citations
137 | ```
138 | @ARTICLE {xiao2023contrastive,
139 | author = {Junbin Xiao and Pan Zhou and Angela Yao and Yicong Li and Richang Hong and Shuicheng Yan and Tat Seng Chua},
140 | journal = {IEEE Transactions on Pattern Analysis & Machine Intelligence},
141 | title = {Contrastive Video Question Answering via Video Graph Transformer},
142 | year = {2023},
143 | volume = {45},
144 | number = {11},
145 | issn = {1939-3539},
146 | pages = {13265-13280},
147 | doi = {10.1109/TPAMI.2023.3292266},
148 | publisher = {IEEE Computer Society},
149 | address = {Los Alamitos, CA, USA},
150 | month = {nov}
151 | }
152 | ```
153 | ```
154 | @inproceedings{xiao2022video,
155 | title={Video Graph Transformer for Video Question Answering},
156 | author={Xiao, Junbin and Zhou, Pan and Chua, Tat-Seng and Yan, Shuicheng},
157 | booktitle={European Conference on Computer Vision},
158 | pages={39--58},
159 | year={2022},
160 | organization={Springer}
161 | }
162 | ```
163 | ## Notes
164 | If you use any resources from this repo, please kindly cite our paper and acknowledge the source.
165 | ## License
166 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file.
167 |
--------------------------------------------------------------------------------
/args.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | from global_parameters import (
5 | DEFAULT_DATASET_DIR,
6 | DEFAULT_CKPT_DIR,
7 | TRANSFORMERS_PATH,
8 | SSD_DIR,
9 | dataset2folder,
10 | )
11 |
12 | def get_args():
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument(
15 | "--dataset",
16 | type=str,
17 | default="ivqa",
18 | choices=[
19 | "ivqa",
20 | "msrvtt",
21 | "msrvttmc",
22 | "msvd",
23 | "webvid",
24 | "activitynet",
25 | "howto100m",
26 | "howtovqa",
27 | "how2qa",
28 | "nextqa",
29 | "star",
30 | "tgifqa/transition",
31 | "tgifqa/action",
32 | "tgifqa/frameqa",
33 | "tgifqa2/transition",
34 | "tgifqa2/action",
35 | "causalvid"
36 | ],
37 | )
38 | parser.add_argument(
39 | "--subset",
40 | type=str,
41 | default="",
42 | choices=["", "1", "10", "20", "50"],
43 | help="use a subset of the generated dataset",
44 | )
45 |
46 | # Model
47 | parser.add_argument(
48 | "--baseline",
49 | type=str,
50 | default="",
51 | choices=["", "qa"],
52 | help="qa baseline does not use the video, video baseline does not use the question",
53 | )
54 | parser.add_argument(
55 | "--n_layers",
56 | type=int,
57 | default=2,
58 | help="number of layers in the multi-modal transformer",
59 | )
60 | parser.add_argument(
61 | "--n_heads",
62 | type=int,
63 | default=8,
64 | help="number of attention heads in the multi-modal transformer",
65 | )
66 | parser.add_argument(
67 | "--embd_dim",
68 | type=int,
69 | default=512,
70 | help="multi-modal transformer and final embedding dimension",
71 | )
72 | parser.add_argument(
73 | "--ff_dim",
74 | type=int,
75 | default=2048,
76 | help="multi-modal transformer feed-forward dimension",
77 | )
78 | parser.add_argument(
79 | "--dropout",
80 | type=float,
81 | default=0.1,
82 | help="dropout rate in the multi-modal transformer",
83 | )
84 | parser.add_argument(
85 | "--sentence_dim",
86 | type=int,
87 | default=2048,
88 |         help="sentence dimension for the differentiable bag-of-words embedding of the answers",
89 | )
90 | parser.add_argument(
91 | "--qmax_words",
92 | type=int,
93 | default=20,
94 | help="maximum number of words in the question",
95 | )
96 | parser.add_argument(
97 | "--amax_words",
98 | type=int,
99 | default=10,
100 | help="maximum number of words in the answer",
101 | )
102 | parser.add_argument(
103 | "--max_feats",
104 | type=int,
105 | default=20,
106 | help="maximum number of video features considered",
107 | )
108 |
109 | # Paths
110 | parser.add_argument(
111 | "--dataset_dir",
112 | type=str,
113 | default=DEFAULT_DATASET_DIR,
114 | help="folder where the datasets folders are stored",
115 | )
116 | parser.add_argument(
117 | "--ssd_dir",
118 | type=str,
119 | default=SSD_DIR,
120 | help="folder with ssd storage where the HowTo100M features are stored",
121 | )
122 | parser.add_argument(
123 | "--checkpoint_predir",
124 | type=str,
125 | default=DEFAULT_CKPT_DIR,
126 | help="folder to store checkpoints",
127 | )
128 | parser.add_argument(
129 | "--checkpoint_dir", type=str, default="", help="subfolder to store checkpoint"
130 | )
131 | parser.add_argument(
132 | "--pretrain_path", type=str, default="", help="path to pretrained checkpoint"
133 | )
134 | parser.add_argument(
135 | "--bert_path",
136 | type=str,
137 | default=TRANSFORMERS_PATH,
138 | help="path to transformer models checkpoints",
139 | )
140 |
141 | # Train
142 | parser.add_argument("--batch_size", type=int, default=256)
143 | parser.add_argument("--batch_size_val", type=int, default=2048)
144 | parser.add_argument(
145 | "--n_pair",
146 | type=int,
147 | default=32,
148 | help="number of clips per video to consider to train on HowToVQA69M",
149 | )
150 | parser.add_argument("--seed", type=int, default=1)
151 | parser.add_argument("--epochs", type=int, default=20)
152 | parser.add_argument(
153 | "--test", type=int, default=0, help="use to evaluate without training"
154 | )
155 | parser.add_argument(
156 | "--lr", type=float, default=0.00005, help="initial learning rate"
157 | )
158 | parser.add_argument("--weight_decay", type=float, default=0, help="weight decay")
159 | parser.add_argument(
160 | "--clip",
161 | type=float,
162 | default=12,
163 | help="gradient clipping",
164 | )
165 |
166 | # Print
167 | parser.add_argument(
168 | "--freq_display", type=int, default=3, help="number of train prints per epoch"
169 | )
170 | parser.add_argument(
171 | "--num_thread_reader", type=int, default=16, help="number of workers"
172 | )
173 |
174 | # Masked Language Modeling and Cross-Modal Matching parameters
175 | parser.add_argument("--mlm_prob", type=float, default=0.15)
176 | parser.add_argument("--n_negs", type=int, default=1)
177 | parser.add_argument("--lr_decay", type=float, default=0.9)
178 | parser.add_argument("--min_time", type=int, default=10)
179 | parser.add_argument("--min_words", type=int, default=10)
180 |
181 | # Demo parameters
182 | parser.add_argument(
183 | "--question_example", type=str, default="", help="demo question text"
184 | )
185 | parser.add_argument("--video_example", type=str, default="", help="demo video path")
186 | parser.add_argument("--port", type=int, default=8899, help="demo port")
187 | parser.add_argument(
188 | "--pretrain_path2", type=str, default="", help="second demo model"
189 | )
190 | parser.add_argument(
191 | "--save_dir", type=str, default="./save_models/", help="path to save dir"
192 | )
193 | parser.add_argument(
194 | "--mc", type=int, default=5, help="number of multiple choices"
195 | )
196 | parser.add_argument(
197 |         "--bnum", type=int, default=10, help="number of region proposals"
198 | )
199 | parser.add_argument(
200 |         "--cl_loss", type=float, default=0, help="trade-off weight for the contrastive loss"
201 | )
202 | parser.add_argument(
203 | "--lan", type=str, default='RoBERTa', help="BERT or RoBERTa"
204 | )
205 |
206 | args = parser.parse_args()
207 |
208 | os.environ["TRANSFORMERS_CACHE"] = args.bert_path
209 | # args.save_dir = './save_dir/'
210 |
211 | #args.save_dir = os.path.join(args.checkpoint_predir, args.checkpoint_dir)
212 |
213 | # multiple-choice arg
214 | # args.mc = 4 if args.dataset == "how2qa" else 0
215 | # args.mc = 5 if args.dataset == "nextqa" else 0
216 |
217 | # feature dimension
218 |     args.feature_dim = 2048  # RoI features (S3D: 1024, app_mot: 4096)
219 |     args.word_dim = 768  # BERT/RoBERTa-base hidden size
220 |
221 | # Map from dataset name to folder name
222 |
223 | load_path = os.path.join(args.dataset_dir, args.dataset)
224 | args.load_path = load_path
225 |
226 | if args.dataset not in ["howto100m", "howtovqa"]: # VideoQA dataset
227 | args.features_path = f'../data/{args.dataset}/' #os.path.join(load_path, "s3d.pth")
228 | # args.features_path = f'/data/datasets/{args.dataset}/'
229 | args.train_csv_path = os.path.join(load_path, "train.csv")
230 | if args.dataset == 'tgifqa':
231 | args.val_csv_path = os.path.join(load_path, "test.csv")
232 | else:
233 | args.val_csv_path = os.path.join(load_path, "val.csv")
234 | args.test_csv_path = os.path.join(load_path, "test.csv")
235 | args.vocab_path = os.path.join(load_path, "vocab.json")
236 | else: # Pretraining dataset
237 | args.features_path = os.path.join(
238 | args.ssd_dir, "s3d_features", "howto100m_s3d_features"
239 | )
240 | if args.dataset == "howto100m":
241 | args.caption_path = os.path.join(
242 | load_path, "caption_howto100m_sw_nointersec_norepeat.pickle"
243 | )
244 | args.train_csv_path = os.path.join(
245 | load_path, f"s3d_features_nointersec.csv"
246 | )
247 | args.youcook_val_path = os.path.join(
248 | args.dataset_dir, "YouCook2", "youcook_unpooled_val.pkl"
249 | )
250 | args.msrvtt_test_csv_path = os.path.join(
251 | args.dataset_dir, "MSR-VTT", "MSRVTT_JSFUSION_test.csv"
252 | )
253 | args.msrvtt_test_features_path = os.path.join(
254 | args.dataset_dir, "MSR-VTT", "msrvtt_test_unpooled_s3d_features.pth"
255 | )
256 | elif args.dataset == "howtovqa":
257 | if not args.subset:
258 | args.caption_path = os.path.join(load_path, "howtovqa.pkl")
259 | args.train_csv_path = os.path.join(load_path, "train_howtovqa.csv")
260 | args.val_csv_path = os.path.join(load_path, "val_howtovqa.csv")
261 | else:
262 | args.caption_path = os.path.join(
263 | load_path, f"howtovqa_{args.subset}.pickle"
264 | )
265 | args.train_csv_path = os.path.join(
266 | load_path, f"train_howtovqa_{args.subset}.csv"
267 | )
268 | args.val_csv_path = os.path.join(
269 | load_path, f"val_howtovqa_{args.subset}.csv"
270 | )
271 |
272 | return args
273 |
--------------------------------------------------------------------------------
/dataloader/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | __pycache__
3 |
--------------------------------------------------------------------------------
/eval_next.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | from util import load_file
3 | import argparse
4 |
5 | map_name = {'CW': 'Why', 'CH': 'How', 'TN': 'Bef&Aft', 'TC': 'When',
6 | 'DC': 'Cnt', 'DL': 'Loc', 'DO': 'Other', 'C': 'Acc_C',
7 | 'T': 'Acc_T', 'D': 'Acc_D'}
8 |
9 | def accuracy_metric(sample_list, result):
10 |
11 | group = {'CW':[], 'CH':[], 'TN':[], 'TC':[], 'DC':[], 'DL':[], 'DO':[]}
12 | for id, row in sample_list.iterrows():
13 | qns_id = str(row['video_id']) + '_' + str(row['qid'])
14 | qtype = str(row['type'])
15 | #(combine temporal qns of previous and next as 'TN')
16 | if qtype == 'TP':
17 | qtype = 'TN'
18 | group[qtype].append(qns_id)
19 |
20 | preds = result
21 | group_acc = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0}
22 | group_cnt = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0}
23 | overall_acc = {'C':0, 'T':0, 'D':0}
24 | overall_cnt = {'C':0, 'T':0, 'D':0}
25 | all_acc = 0
26 | all_cnt = 0
27 | for qtype, qns_ids in group.items():
28 | cnt = 0
29 | acc = 0
30 | for qid in qns_ids:
31 |
32 | cnt += 1
33 | answer = preds[qid]['answer']
34 | pred = preds[qid]['prediction']
35 | if answer == pred:
36 | acc += 1
37 |
38 | group_cnt[qtype] = cnt
39 | group_acc[qtype] += acc
40 | overall_acc[qtype[0]] += acc
41 | overall_cnt[qtype[0]] += cnt
42 | all_acc += acc
43 | all_cnt += cnt
44 |
45 |
46 | for qtype, value in overall_acc.items():
47 | group_acc[qtype] = value
48 | group_cnt[qtype] = overall_cnt[qtype]
49 |
50 | for qtype in group_acc:
51 | if group_cnt[qtype] == 0: continue
52 | print(map_name[qtype], end='\t')
53 | print('')
54 | for qtype, acc in group_acc.items():
55 | if group_cnt[qtype] == 0: continue
56 | print('{:.2f}'.format(acc*100.0/group_cnt[qtype]), end ='\t')
57 | print('')
58 | print('Acc: {:.2f}'.format(all_acc*100.0/all_cnt))
59 |
60 |
61 |
62 | def accuracy_metric_sub(sample_list, result, sub_ids):
63 |
64 | sub_ids = [int(id) for id in sub_ids]
65 | subset = sample_list.iloc[sub_ids]
66 |
67 | accuracy_metric(subset, result)
68 |
69 |
70 |
71 | def main(result_file, mode='val'):
72 | dataset_dir = '../data/datasets/nextqa/'
73 | data_set = mode
74 | sample_list_file = osp.join(dataset_dir, data_set+'.csv')
75 | print('Evaluating {}'.format(result_file))
76 |
77 | sample_list = load_file(sample_list_file)
78 | result = load_file(result_file)
79 | accuracy_metric(sample_list, result)
80 |
81 | if mode == 'val':
82 | hard_subset = osp.join(dataset_dir, 'atp-hard-ct4.txt')
83 | sub_ids = load_file(hard_subset)
84 | accuracy_metric_sub(sample_list, result, sub_ids)
85 |
86 |
87 |
88 | if __name__ == "__main__":
89 | parser = argparse.ArgumentParser()
90 | parser.add_argument("--mode", type=str, default='val', choices=['val','test'])
91 | parser.add_argument("--folder", type=str)
92 | args = parser.parse_args()
93 | res_dir = '../data/save_models/nextqa/'+args.folder
94 | #res_dir = '../data/models/nextqa/'
95 | mode = args.mode
96 | model_prefix = 'res'
97 | result_file = '{}/{}-{}.json'.format(res_dir, mode, model_prefix)
98 | main(result_file, mode)
99 |
--------------------------------------------------------------------------------
/global_parameters.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | # Fill the paths
4 | DEFAULT_DATASET_DIR = "./datasets/" # where the datasets folders are
5 | DEFAULT_CKPT_DIR = "../data/models/" # where the training checkpoints and logs will be saved
6 | DEFAULT_MODEL_DIR = "../data/pretrain_models/" # where the pretrained models are
7 | SSD_DIR = "../data/feats/" # where the HowTo100M S3D features are
8 | HOWTO_FEATURES_PATH = os.path.join(SSD_DIR, "s3d_features", "howto100m_s3d_features")
9 |
10 | # Map from dataset name to folder name
11 | dataset2folder = {
12 | "ivqa": "iVQA",
13 | "msrvtt": "MSRVTT-QA",
14 | "msvd": "msvd",
15 | "activitynet": "ActivityNet-QA",
16 | "howto100m": "HowTo100M",
17 | "howtovqa": "HowToVQA69M",
18 | "how2qa": "How2QA",
19 | "nextqa": "nextqa"
20 | }
21 |
22 | # Datasets
23 | IVQA_PATH = os.path.join(
24 | DEFAULT_DATASET_DIR, dataset2folder["ivqa"]
25 | ) # Path where iVQA is downloaded
26 | MSRVTT_PATH = os.path.join(
27 | DEFAULT_DATASET_DIR, dataset2folder["msrvtt"]
28 | ) # Path where MSRVTT-QA is downloaded
29 | MSVD_PATH = os.path.join(
30 | DEFAULT_DATASET_DIR, dataset2folder["msvd"]
31 | ) # Path where MSVD-QA is downloaded
32 | ACT_PATH = os.path.join(
33 | DEFAULT_DATASET_DIR, dataset2folder["activitynet"]
34 | ) # Path where ActivityNet-QA is downloaded
35 | HOWTO_PATH = os.path.join(
36 | DEFAULT_DATASET_DIR, dataset2folder["howto100m"]
37 | ) # Path where HowTo100M is downloaded
38 | HOWTOVQA_PATH = os.path.join(
39 | DEFAULT_DATASET_DIR, dataset2folder["howtovqa"]
40 | ) # Path where HowToVQA69M is downloaded / generated
41 | HOW2QA_PATH = os.path.join(
42 | DEFAULT_DATASET_DIR, dataset2folder["how2qa"]
43 | ) # Path where How2QA is downloaded
44 | NEXTQA_PATH = os.path.join(
45 | DEFAULT_DATASET_DIR, dataset2folder["nextqa"]
46 | ) # Path where NExT-QA is downloaded
47 |
48 |
49 | # Models
50 | S3D_PATH = os.path.join(
51 | DEFAULT_MODEL_DIR, "s3d_howto100m.pth"
52 | ) # Path to S3D checkpoint
53 | S3D_DICT_PATH = os.path.join(
54 | DEFAULT_MODEL_DIR, "s3d_dict.npy"
55 | ) # Path to S3D dictionary
56 | PUNCTUATOR_PATH = os.path.join(
57 | DEFAULT_MODEL_DIR, "INTERSPEECH-T-BRNN.pcl"
58 | ) # Path to Punctuator2 checkpoint
59 | TRANSFORMERS_PATH = os.path.join(
60 | DEFAULT_MODEL_DIR, "transformers"
61 | ) # Path where the transformers checkpoints will be saved
62 |
63 | # Question-answer Generation
64 | punct_dir = os.path.join(
65 | SSD_DIR, "punct"
66 | ) # Path where the punctuated clips will be created (1 file per unique video)
67 | QG_REPO_DIR = "" # Path where the question generation repo is cloned
68 | answers_dir = os.path.join(
69 | SSD_DIR, "ans"
70 | ) # Path where the extracted answers will be saved (1 file per unique video)
71 | qas_dir = os.path.join(
72 | SSD_DIR, "qas"
73 | ) # Path where the generated question-answers will be saved (1 file per unique video)
74 |
--------------------------------------------------------------------------------
/loss.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 |
4 |
5 | class Contrastive_Loss(torch.nn.Module):
6 | def __init__(self):
7 | super(Contrastive_Loss, self).__init__()
8 | self.ce_loss = torch.nn.CrossEntropyLoss()
9 |
10 | def forward(self, x, target):
11 | return self.ce_loss(x, target)
12 |
13 |
14 | class LogSoftmax(torch.nn.Module):
15 | def __init__(self, dim):
16 | super(LogSoftmax, self).__init__()
17 | self.dim = dim
18 |
19 | def forward(self, x, a):
20 | nll = -F.log_softmax(x, self.dim, _stacklevel=5)
21 | return (nll * a / a.sum(1, keepdim=True).clamp(min=1)).sum(dim=1).mean()
22 |
23 |
24 | class NCELoss(torch.nn.Module):
25 | def __init__(self, batch_size=4096):
26 | super(NCELoss, self).__init__()
27 | self.ce_loss = torch.nn.CrossEntropyLoss()
28 |
29 | def forward(self, x):
30 | batch_size = len(x)
31 | target = torch.arange(batch_size).cuda()
32 | x = torch.cat((x, x.t()), dim=1)
33 | return self.ce_loss(x, target)
34 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 | import numpy as np
5 | import random
6 | import os
7 | import os.path as osp
8 | import logging
9 |
10 | from transformers import get_cosine_schedule_with_warmup
11 | from args import get_args
12 | from model.CoVGT import VGT
13 | from loss import LogSoftmax
14 | from util import compute_a2v, load_model_by_key, save_to
15 | from dataloader.cvqa_loader import get_videoqa_loaders
16 | from train.train_covgt import train, eval
17 |
18 |
19 |
20 | def main(args):
21 | if not (os.path.isdir(args.save_dir)):
22 | os.mkdir(os.path.join(args.save_dir))
23 | logging.basicConfig(
24 | level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s"
25 | )
26 | logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
27 | rootLogger = logging.getLogger()
28 | fileHandler = logging.FileHandler(os.path.join(args.save_dir, "stdout.log"), "w+")
29 | fileHandler.setFormatter(logFormatter)
30 | rootLogger.addHandler(fileHandler)
31 | logging.info(args)
32 |
33 |
34 | if args.lan == 'BERT':
35 | from transformers import BertTokenizer
36 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
37 | elif args.lan == 'RoBERTa':
38 | from transformers import RobertaTokenizerFast,RobertaTokenizer
39 | tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
40 |
41 |
42 | a2id, id2a, a2v = None, None, None
43 | if not args.mc:
44 | a2id, id2a, a2v = compute_a2v(
45 | vocab_path=args.vocab_path,
46 | bert_tokenizer=tokenizer,
47 | amax_words=args.amax_words,
48 | )
49 | logging.info(f"Length of Answer Vocabulary: {len(a2id)}")
50 |
51 | # Model
52 | model = VGT(
53 | tokenizer = tokenizer,
54 | feature_dim=args.feature_dim,
55 | word_dim=args.word_dim,
56 | N=args.n_layers,
57 | d_model=args.embd_dim,
58 | d_ff=args.ff_dim,
59 | h=args.n_heads,
60 | dropout=args.dropout,
61 | T=args.max_feats,
62 | Q=args.qmax_words,
63 | vocab_size = tokenizer.vocab_size,
64 | baseline=args.baseline,
65 | bnum=args.bnum,
66 | lan=args.lan
67 | )
68 | model.cuda()
69 | logging.info("Using {} GPUs".format(torch.cuda.device_count()))
70 |
71 | # Load pretrain path
72 | model = nn.DataParallel(model)
73 |
74 | if args.pretrain_path != "":
75 | # model.load_state_dict(torch.load(args.pretrain_path))
76 | model.load_state_dict(load_model_by_key(model, args.pretrain_path))
77 | logging.info(f"Loaded checkpoint {args.pretrain_path}")
78 | logging.info(
79 | f"Nb of trainable params:{sum(p.numel() for p in model.parameters() if p.requires_grad)}"
80 | )
81 |
82 | (
83 | train_loader,
84 | val_loader,
85 | test_loader,
86 | ) = get_videoqa_loaders(args, args.features_path, a2id, tokenizer, test_mode = args.test)
87 |
88 | if args.test:
89 | logging.info("number of test instances: {}".format(len(test_loader.dataset)))
90 | else:
91 | logging.info("number of train instances: {}".format(len(train_loader.dataset)))
92 | logging.info("number of val instances: {}".format(len(val_loader.dataset)))
93 |
94 |
95 | criterion = nn.CrossEntropyLoss(ignore_index=-1)
96 | # criterion = MultipleChoiceLoss()
97 | params_for_optimization = list(p for p in model.parameters() if p.requires_grad)
98 | optimizer = optim.Adam(
99 | params_for_optimization, lr=args.lr, weight_decay=args.weight_decay
100 | )
101 | criterion.cuda()
102 |
103 | # Training
104 | if not args.test:
105 | scheduler = get_cosine_schedule_with_warmup(
106 | optimizer, 0, len(train_loader) * args.epochs
107 | )
108 | logging.info(
109 | f"Set cosine schedule with {len(train_loader) * args.epochs} iterations"
110 | )
111 | if args.pretrain_path != "":
112 | val_acc, results = eval(model, val_loader, a2v, args, test=False, tokenizer=tokenizer) # zero-shot VideoQA
113 | save_path = osp.join(args.save_dir, 'val-res0.json')
114 | save_to (save_path, results)
115 | best_val_acc = 0 if args.pretrain_path == "" else val_acc
116 | best_epoch = 0
117 | for epoch in range(args.epochs):
118 | train(model, train_loader, a2v, optimizer, criterion, scheduler, epoch, args, tokenizer)
119 | val_acc, results = eval(model, val_loader, a2v, args, test=False, tokenizer=tokenizer)
120 | if val_acc > best_val_acc:
121 | best_val_acc = val_acc
122 | best_epoch = epoch
123 | torch.save(
124 | model.state_dict(), os.path.join(args.save_dir, "best_model.pth")
125 | )
126 | save_path = osp.join(args.save_dir, 'val-res.json')
127 | save_to (save_path, results)
128 | if args.dataset == 'webvid':
129 | ep_file = os.path.join(args.save_dir, f"e{epoch}.pth")
130 | torch.save(model.state_dict(), ep_file)
131 | logging.info('Save to '+ep_file)
132 | logging.info(f"Best val model at epoch {best_epoch + 1}")
133 | else:
134 | # Evaluate on test set
135 | test_acc, results = eval(model, test_loader, a2v, args, test=True, tokenizer=tokenizer)
136 | save_path = osp.join(args.save_dir, 'test-res.json')
137 | save_to(save_path, results)
138 |
139 |
140 | if __name__ == "__main__":
141 | # set random seeds
142 | args = get_args()
143 | torch.backends.cudnn.enabled = False
144 | torch.cuda.manual_seed(args.seed)
145 | torch.manual_seed(args.seed)
146 | np.random.seed(args.seed)
147 | random.seed(args.seed)
148 | torch.backends.cudnn.benchmark = True
149 |
150 | main(args)
151 |
--------------------------------------------------------------------------------
/misc/CoVGT-res.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/misc/CoVGT-res.png
--------------------------------------------------------------------------------
/misc/CoVGT.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/misc/CoVGT.png
--------------------------------------------------------------------------------
/model/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | __pycache__
3 |
--------------------------------------------------------------------------------
/model/EncoderVid.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 Garena Online Private Limited
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import torch.nn as nn
16 | import torch
17 |
18 | class EncoderVid(nn.Module):
19 | def __init__(self, feat_dim, bbox_dim, feat_hidden, pos_hidden, input_dropout_p=0.3):
20 |
21 | super(EncoderVid, self).__init__()
22 | self.dim_feat = feat_dim
23 | self.dim_bbox = bbox_dim
24 | self.dim_hidden = feat_hidden
25 | self.input_dropout_p = input_dropout_p
26 |
27 | input_dim = feat_dim
28 |
29 | input_dim += pos_hidden
30 | self.bbox_conv = nn.Sequential(
31 | nn.Conv2d(self.dim_bbox, pos_hidden, kernel_size=1),
32 | nn.BatchNorm2d(pos_hidden),
33 | nn.ReLU(),
34 | nn.Conv2d(pos_hidden, pos_hidden, kernel_size=1),
35 | nn.BatchNorm2d(pos_hidden),
36 | nn.ReLU(),
37 |
38 | )
39 |
40 | self.tohid = nn.Sequential(
41 | nn.Linear(feat_dim+pos_hidden, feat_hidden),
42 | nn.ELU(inplace=True))
43 |
44 | # self.roi_conv = nn.Sequential(
45 | # nn.Conv1d(feat_dim, feat_hidden, kernel_size=3, padding=1),
46 | # nn.ELU(inplace=True)
47 | # )
48 |
49 | # self.roi_conv = nn.Sequential(
50 | # nn.Conv2d(4, 4, kernel_size=1),
51 | # nn.BatchNorm2d(4),
52 | # nn.ReLU(),
53 | # )
54 |
55 |
56 | def forward(self, video_o):
57 |
58 | bsize, numc, numf, numr, fdim = video_o.shape
59 |
60 | video_o = video_o.view(bsize, numc*numf, numr, fdim)
61 | roi_feat = video_o[:,:,:, :self.dim_feat]
62 | roi_bbox = video_o[:,:,:, self.dim_feat:(self.dim_feat+self.dim_bbox)]
63 |
64 | bbox_pos = self.bbox_conv(roi_bbox.permute(
65 | 0, 3, 1, 2)).permute(0, 2, 3, 1)
66 |
67 | bbox_features = torch.cat([roi_feat, bbox_pos], dim=-1)
68 |
69 | bbox_feat = self.tohid(bbox_features)
70 |
71 | return bbox_feat
72 |
--------------------------------------------------------------------------------
/model/cmatt.py:
--------------------------------------------------------------------------------
1 | __author__ = "Jie Lei"
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from torch.autograd import Variable
7 |
8 |
9 | class CMAtten(nn.Module):
10 |
11 | def __init__(self):
12 | super(CMAtten, self).__init__()
13 |
14 |
15 | def similarity(self, s1, l1, s2, l2):
16 | """
17 | :param s1: [B, t1, D]
18 | :param l1: [B]
19 | :param s2: [B, t2, D]
20 | :param l2: [B]
21 | :return:
22 | """
23 | s = torch.bmm(s1, s2.transpose(1, 2))
24 |
25 | # import ipdb; ipdb.set_trace()
26 | s_mask = s.data.new(*s.size()).fill_(1).bool() # [B, T1, T2]
27 | # Init similarity mask using lengths
28 | for i, (l_1, l_2) in enumerate(zip(l1, l2)):
29 | s_mask[i][:l_1, :l_2] = 0
30 |
31 | s_mask = Variable(s_mask)
32 | s.data.masked_fill_(s_mask.data, -float("inf"))
33 | return s
34 |
35 | @classmethod
36 | def get_u_tile(cls, s, s2):
37 | """
38 | attended vectors of s2 for each word in s1,
39 | signify which words in s2 are most relevant to words in s1
40 | """
41 | a_weight = F.softmax(s, dim=2) # [B, l1, l2]
42 | # remove nan from softmax on -inf
43 | # print(a_weight.shape, s2.shape)
44 | a_weight.data.masked_fill_(a_weight.data != a_weight.data, 0)
45 | # [B, l1, l2] * [B, l2, D] -> [B, l1, D]
46 | u_tile = torch.bmm(a_weight, s2)
47 | return u_tile, a_weight
48 |
49 |
50 | def forward(self, s1, l1, s2, l2):
51 | s = self.similarity(s1, l1, s2, l2)
52 | u_tile, a_weight = self.get_u_tile(s, s2)
53 |
54 | return u_tile, a_weight
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/model/graph.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | from torch.autograd import Variable
4 | import torch.nn.functional as F
5 | from torch.nn.parameter import Parameter
6 | import math
7 |
8 | class GraphConvolution(nn.Module):
9 | """
10 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
11 | """
12 |
13 | def __init__(self, in_features, out_features, bias=True, skip=True):
14 | super(GraphConvolution, self).__init__()
15 | self.skip = skip
16 | self.in_features = in_features
17 | self.out_features = out_features
18 | self.weight = Parameter(torch.Tensor(in_features, out_features))
19 | if bias:
20 | self.bias = Parameter(torch.Tensor(out_features))
21 | else:
22 | self.register_parameter('bias', None)
23 | self.reset_parameters()
24 |
25 | def reset_parameters(self):
26 | stdv = 1. / math.sqrt(self.weight.size(1))
27 | self.weight.data.uniform_(-stdv, stdv)
28 | if self.bias is not None:
29 | self.bias.data.uniform_(-stdv, stdv)
30 |
31 | def forward(self, input, adj):
32 | # TODO make fc more efficient via "pack_padded_sequence"
33 |
34 | support = torch.bmm(input, self.weight.unsqueeze(
35 | 0).expand(input.shape[0], -1, -1))
36 | output = torch.bmm(adj, support)
37 | #output = SparseMM(adj)(support)
38 | if self.bias is not None:
39 | output += self.bias.unsqueeze(0).expand(input.shape[0], -1, -1)
40 | if self.skip:
41 | output += support
42 |
43 | return output
44 |
45 | def __repr__(self):
46 | return self.__class__.__name__ + ' (' \
47 | + str(self.in_features) + ' -> ' \
48 | + str(self.out_features) + ')'
49 |
50 |
51 | class Graph(nn.Module):
52 |
53 | def __init__(self, dim_in, dim_hidden, dim_out, num_layers, dropout):
54 | super(Graph, self).__init__()
55 | self.fc_k = nn.Linear(dim_in, dim_hidden)
56 | self.fc_q = nn.Linear(dim_in, dim_hidden)
57 |
58 | dim_hidden = dim_out if num_layers == 1 else dim_hidden
59 | self.layers = nn.ModuleList([
60 | GraphConvolution(dim_in, dim_hidden)
61 | ])
62 |
63 | for i in range(num_layers - 1):
64 | dim_tmp = dim_out if i == num_layers-2 else dim_hidden
65 | self.layers.append(GraphConvolution(dim_hidden, dim_tmp))
66 |
67 | self.dropout = dropout
68 |
69 |
70 | def build_graph(self, x):
71 | batch_size, s_len = x.shape[0], x.shape[1]
72 | emb_k = self.fc_k(x)
73 | emb_q = self.fc_q(x)
74 | length = torch.tensor([s_len] * batch_size, dtype=torch.long)
75 |
76 | s = torch.bmm(emb_k, emb_q.transpose(1, 2))
77 |
78 | s_mask = s.data.new(*s.size()).fill_(1).bool() # [B, T1, T2]
79 | # Init similarity mask using lengths
80 | for i, (l_1, l_2) in enumerate(zip(length, length)):
81 | s_mask[i][:l_1, :l_2] = 0
82 | s_mask = Variable(s_mask)
83 | s.data.masked_fill_(s_mask.data, -float("inf"))
84 |
85 | A = s #F.softmax(s, dim=2) # [B, t1, t2]
86 |
87 | # remove nan from softmax on -inf
88 | A.data.masked_fill_(A.data != A.data, 0)
89 |
90 | return A
91 |
92 | def forward(self, X, A):
93 | for layer in self.layers:
94 | X = F.relu(layer(X, A))
95 | X = F.dropout(X, self.dropout, training=self.training)
96 | return X
97 |
--------------------------------------------------------------------------------
/model/language_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from transformers.activations import gelu
5 | from model.cmatt import CMAtten
6 |
7 | class Bert(nn.Module):
8 | """ Finetuned *BERT module """
9 |
10 | def __init__(self, tokenizer, lan='RoBERTa'):
11 | super(Bert, self).__init__()
12 |
13 | if lan == 'BERT':
14 | from transformers import BertTokenizer, BertModel, BertConfig
15 | config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
16 | self.bert = BertModel.from_pretrained("bert-base-uncased", config=config)
17 | elif lan == 'RoBERTa':
18 | from transformers import RobertaModel, RobertaConfig, RobertaTokenizerFast
19 | config = RobertaConfig.from_pretrained("roberta-base", output_hidden_states=True)
20 | self.bert = RobertaModel.from_pretrained("roberta-base", config=config)
21 | self.tokenizer = tokenizer
22 |
23 | # for name, param in self.bert.named_parameters():
24 | # param.requires_grad = False
25 |
26 | def forward(self, tokens):
27 | attention_mask = (tokens != self.tokenizer.pad_token_id).float()
28 | outs = self.bert(tokens, attention_mask=attention_mask)
29 | embds = outs[0]
30 | return embds, outs[1][-2]
31 |
32 |
33 | class Sentence_Maxpool(nn.Module):
34 |     """ Utility for the answer module """
35 |
36 | def __init__(self, word_dimension, output_dim, relu=True):
37 | super(Sentence_Maxpool, self).__init__()
38 | self.fc = nn.Linear(word_dimension, output_dim)
39 | self.out_dim = output_dim
40 | self.relu = relu
41 |
42 | def forward(self, x_in):
43 | x = self.fc(x_in)
44 | x = torch.max(x, dim=1)[0]
45 | if self.relu:
46 | x = F.relu(x)
47 | return x
48 |
49 |
50 | class FFN(nn.Module):
51 | def __init__(self, word_dim, hidden_dim, out_dim, dropout=0.3):
52 | super().__init__()
53 | activation = "gelu"
54 | self.dropout = nn.Dropout(p=dropout)
55 | self.lin1 = nn.Linear(in_features=word_dim, out_features=hidden_dim)
56 | self.lin2 = nn.Linear(in_features=hidden_dim, out_features=out_dim)
57 | assert activation in [
58 | "relu",
59 | "gelu",
60 | ], "activation ({}) must be in ['relu', 'gelu']".format(activation)
61 | self.activation = gelu if activation == "gelu" else nn.ReLU()
62 |
63 | def forward(self, input):
64 | x = self.lin1(input)
65 | x = self.activation(x)
66 | x = self.lin2(x)
67 | x = self.dropout(x)
68 | return x
69 |
70 | class AModel(nn.Module):
71 | """
72 | Answer embedding module
73 | """
74 |
75 | def __init__(self, tokenizer, lan='RoBERTa', word_dim=768, out_dim=512):
76 | super(AModel, self).__init__()
77 | self.bert = Bert(tokenizer, lan=lan)
78 | self.linear_text = nn.Linear(word_dim, out_dim)
79 |
80 | # self.linear_text = FFN(word_dim, out_dim, out_dim)
81 |
82 | def forward(self, answer):
83 |
84 | if len(answer.shape) == 3:
85 | #multi-choice
86 | bs, nans, lans = answer.shape
87 | answer = answer.view(bs * nans, lans)
88 | answer, hd_state = self.bert(answer)
89 | answer = self.linear_text(answer)
90 | answer_g = answer.mean(dim=1)
91 | # answer_g = answer[:, 0, :]
92 | answer_g = answer_g.view(bs, nans, -1)
93 | else:
94 | answer, hd_state = self.bert(answer)
95 | answer = self.linear_text(answer)
96 | answer_g = answer.mean(dim=1)
97 | # answer_g = answer[:, 0, :]
98 |
99 | return answer_g, answer
100 |
101 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py==3.9.0
2 | hostlist==1.4.8
3 | huggingface-hub==0.16.4
4 | numpy==1.22.0
5 | pandas==1.4.1
6 | Pillow==9.3.0
7 | python-dateutil==2.8.2
8 | PyYAML==6.0
9 | scikit-learn==1.0.2
10 | scipy==1.8.0
11 | sentencepiece==0.1.96
12 | tokenizers==0.11.6
13 | torch==1.8.1
14 | torchvision==0.9.1
15 | tqdm==4.63.1
16 | transformers==4.17.0
17 |
--------------------------------------------------------------------------------
/shells/cvid_test.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=causalvid \
3 | --dataset=causalvid \
4 | --mc=5 \
5 | --bnum=10 \
6 | --test=1 \
7 | --qmax_words=0 \
8 | --amax_words=38 \
9 | --max_feats=32 \
10 | --batch_size=64 \
11 | --batch_size_val=64 \
12 | --num_thread_reader=8 \
13 | --mlm_prob=0 \
14 | --n_layers=1 \
15 | --embd_dim=512 \
16 | --ff_dim=1024 \
17 | --dropout=0.3 \
18 | --lan="RoBERTa" \
19 | --save_dir='./save_models/causalvid/CoVGT/' \
20 | --pretrain_path='./save_models/causalvid/CoVGT/best_model.pth'
21 |
--------------------------------------------------------------------------------
/shells/cvid_train.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=causalvid \
3 | --dataset=causalvid \
4 | --mc=5 \
5 | --bnum=10 \
6 | --epochs=20 \
7 | --lr=0.00001 \
8 | --qmax_words=0 \
9 | --amax_words=38 \
10 | --max_feats=32 \
11 | --batch_size=64 \
12 | --batch_size_val=64 \
13 | --num_thread_reader=8 \
14 | --mlm_prob=0 \
15 | --n_layers=1 \
16 | --embd_dim=512 \
17 | --ff_dim=1024 \
18 | --dropout=0.3 \
19 | --seed=666 \
20 | --cl_loss=0 \
21 | --lan="RoBERTa" \
22 | --save_dir='./save_models/causalvid/CoVGT/' \
23 | --pretrain_path='./save_models/causalvid/CoVGT/best_model.pth'
--------------------------------------------------------------------------------
/shells/msrvtt_test.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=msrvtt \
3 | --dataset=msrvtt \
4 | --mc=0 \
5 | --bnum=10 \
6 | --test=1 \
7 | --qmax_words=20 \
8 | --amax_words=5 \
9 | --max_feats=32 \
10 | --batch_size=64 \
11 | --batch_size_val=64 \
12 | --num_thread_reader=8 \
13 | --mlm_prob=0 \
14 | --n_layers=1 \
15 | --embd_dim=512 \
16 | --ff_dim=1024 \
17 | --dropout=0.3 \
18 | --save_dir='../data/save_models/msrvtt/180k_ft/' \
19 | --pretrain_path='../data/save_models/msrvtt/180k_ft/best_model.pth'
20 |
--------------------------------------------------------------------------------
/shells/msrvtt_train.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=msrvtt \
3 | --dataset=msrvtt \
4 | --mc=0 \
5 | --bnum=10 \
6 | --epochs=30 \
7 | --lr=0.00001 \
8 | --qmax_words=20 \
9 | --amax_words=5 \
10 | --max_feats=32 \
11 | --batch_size=64 \
12 | --batch_size_val=64 \
13 | --num_thread_reader=8 \
14 | --mlm_prob=0 \
15 | --n_layers=1 \
16 | --embd_dim=512 \
17 | --ff_dim=1024 \
18 | --dropout=0.3 \
19 | --save_dir='../data/save_models/msrvtt/180k+_ft/' \
20 | --seed=666 \
21 | --pretrain_path='../data/save_models/msrvtt/180k+_ft/best_model.pth'
22 |
--------------------------------------------------------------------------------
/shells/next_test.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=nextqa \
3 | --dataset=nextqa \
4 | --mc=5 \
5 | --bnum=10 \
6 | --test=1 \
7 | --qmax_words=0 \
8 | --amax_words=38 \
9 | --max_feats=32 \
10 | --batch_size=64 \
11 | --batch_size_val=64 \
12 | --num_thread_reader=4 \
13 | --mlm_prob=0 \
14 | --n_layers=1 \
15 | --embd_dim=512 \
16 | --ff_dim=1024 \
17 | --dropout=0.3 \
18 | --lan="RoBERTa" \
19 | --save_dir='../data/save_models/nextqa/CoVGT_FTCoWV/' \
20 | --pretrain_path='../data/save_models/nextqa/CoVGT_FTCoWV/best_model.pth' \
21 | #--CM_PT=1
22 |
--------------------------------------------------------------------------------
/shells/next_train.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=nextqa \
3 | --dataset=nextqa \
4 | --mc=5 \
5 | --bnum=5 \
6 | --epochs=20 \
7 | --lr=0.00001 \
8 | --qmax_words=30 \
9 | --amax_words=38 \
10 | --max_feats=32 \
11 | --batch_size=64 \
12 | --batch_size_val=64 \
13 | --num_thread_reader=8 \
14 | --mlm_prob=0 \
15 | --cl_loss=1 \
16 | --n_layers=1 \
17 | --embd_dim=512 \
18 | --ff_dim=1024 \
19 | --dropout=0.3 \
20 | --seed=666 \
21 | --lan="RoBERTa" \
22 | --save_dir='../data/save_models/nextqa/CoVGT/' \
23 | #--pretrain_path=../data/save_models/webvid180K/co_e1.pth \
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/shells/tgif_ftrain.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \
3 | --dataset=tgifqa/frameqa \
4 | --mc=0 \
5 | --bnum=10 \
6 | --epochs=30 \
7 | --lr=0.00001 \
8 | --qmax_words=20 \
9 | --amax_words=5 \
10 | --max_feats=32 \
11 | --batch_size=64 \
12 | --batch_size_val=64 \
13 | --num_thread_reader=8 \
14 | --mlm_prob=0 \
15 | --n_layers=1 \
16 | --embd_dim=512 \
17 | --ff_dim=1024 \
18 | --dropout=0.3 \
19 | --save_dir='../data/save_models/tgifqa/frameqa/VGT/' \
20 | --seed=666
21 | # --pretrain_path='../data/save_models/webvid/180K/e1.pth'
--------------------------------------------------------------------------------
/shells/tgif_test.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \
3 | --dataset=tgifqa/transition \
4 | --mc=5 \
5 | --test=1 \
6 | --qmax_words=0 \
7 | --amax_words=20 \
8 | --max_feats=32 \
9 | --batch_size=64 \
10 | --batch_size_val=64 \
11 | --num_thread_reader=8 \
12 | --mlm_prob=0 \
13 | --n_layers=1 \
14 | --embd_dim=512 \
15 | --ff_dim=1024 \
16 | --dropout=0.3 \
17 | --save_dir='../data/save_models/tgifqa/transition/VGT/' \
18 | --pretrain_path='../data/save_models/tgifqa/transition/VGT/best_model.pth'
19 |
--------------------------------------------------------------------------------
/shells/tgif_train.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \
3 | --dataset=tgifqa/action \
4 | --mc=5 \
5 | --epochs=30 \
6 | --lr=0.00001 \
7 | --qmax_words=0 \
8 | --amax_words=20 \
9 | --max_feats=32 \
10 | --batch_size=64 \
11 | --batch_size_val=64 \
12 | --num_thread_reader=4 \
13 | --mlm_prob=0 \
14 | --n_layers=1 \
15 | --embd_dim=512 \
16 | --ff_dim=1024 \
17 | --dropout=0.3 \
18 | --save_dir='../data/save_models/tgifqa/action/VGT/' \
19 | --seed=666
20 | # --pretrain_path=../data/save_models/webvid/180K/e1.pth
21 |
--------------------------------------------------------------------------------
/shells/webvid_train.sh:
--------------------------------------------------------------------------------
1 | GPU=$1
2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=webvid \
3 | --dataset=webvid \
4 | --mc=64 \
5 | --epochs=3 \
6 | --lr=0.00005 \
7 | --qmax_words=0 \
8 | --amax_words=20 \
9 | --max_feats=32 \
10 | --batch_size=64 \
11 | --batch_size_val=64 \
12 | --num_thread_reader=16 \
13 | --mlm_prob=0.15 \
14 | --n_layers=1 \
15 | --embd_dim=512 \
16 | --ff_dim=1024 \
17 | --dropout=0.3 \
18 | --save_dir='./save_models/webvid/025/' \
19 | --seed=666
20 |
21 |
--------------------------------------------------------------------------------
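Note: every script under shells/ takes the GPU id as its first positional argument (read into GPU=$1 and exported through CUDA_VISIBLE_DEVICES), so an invocation looks like ./shells/next_train.sh 0 to run on GPU 0. The save_dir and pretrain_path values are relative paths and may need to be adapted to the local directory layout.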
/tools/__pycache__/object_align.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/tools/__pycache__/object_align.cpython-38.pyc
--------------------------------------------------------------------------------
/tools/bbox_visualizer.py:
--------------------------------------------------------------------------------
1 | import cv2
2 |
3 |
4 | def draw_rectangle(img,
5 | bbox,
6 | bbox_color=(255, 255, 255),
7 | thickness=3,
8 | is_opaque=False,
9 | alpha=0.5):
10 | """Draws the rectangle around the object
11 |
12 | Parameters
13 | ----------
14 | img : ndarray
15 | the actual image
16 | bbox : list
17 | a list containing x_min, y_min, x_max and y_max of the rectangle positions
18 | bbox_color : tuple, optional
19 | the color of the box, by default (255,255,255)
20 | thickness : int, optional
21 | thickness of the outline of the box, by default 3
22 | is_opaque : bool, optional
23 | if False, draws a solid rectangular outline. Else, a filled rectangle which is semi transparent, by default False
24 | alpha : float, optional
25 | strength of the opacity, by default 0.5
26 |
27 | Returns
28 | -------
29 | ndarray
30 | the image with the bounding box drawn
31 | """
32 |
33 | output = img.copy()
34 | if not is_opaque:
35 | cv2.rectangle(output, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
36 | bbox_color, thickness)
37 | else:
38 | overlay = img.copy()
39 |
40 | cv2.rectangle(overlay, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
41 | bbox_color, -1)
42 | cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output)
43 |
44 | return output
45 |
46 |
47 | def add_label(img,
48 | label,
49 | bbox,
50 | draw_bg=True,
51 | text_bg_color=(255, 255, 255),
52 | text_color=(0, 0, 0),
53 | top=True):
54 | """adds label, inside or outside the rectangle
55 |
56 | Parameters
57 | ----------
58 | img : ndarray
59 | the image on which the label is to be written, preferably the image with the rectangular bounding box drawn
60 | label : str
61 | the text (label) to be written
62 | bbox : list
63 | a list containing x_min, y_min, x_max and y_max of the rectangle positions
64 | draw_bg : bool, optional
65 | if True, draws the background of the text, else just the text is written, by default True
66 | text_bg_color : tuple, optional
67 | the background color of the label that is filled, by default (255, 255, 255)
68 | text_color : tuple, optional
69 | color of the text (label) to be written, by default (0, 0, 0)
70 | top : bool, optional
71 | if True, writes the label on top of the bounding box, else inside, by default True
72 |
73 | Returns
74 | -------
75 | ndarray
76 | the image with the label written
77 | """
78 |
79 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][0]
80 |
81 | if top:
82 | label_bg = [bbox[0], bbox[1], bbox[0] + text_width, bbox[1] - 30]
83 | if draw_bg:
84 | cv2.rectangle(img, (label_bg[0], label_bg[1]),
85 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1)
86 | cv2.putText(img, label, (bbox[0] + 5, bbox[1] - 5),
87 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2)
88 |
89 | else:
90 | label_bg = [bbox[0], bbox[1], bbox[0] + text_width, bbox[1] + 30]
91 | if draw_bg:
92 | cv2.rectangle(img, (label_bg[0], label_bg[1]),
93 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1)
94 | cv2.putText(img, label, (bbox[0] + 5, bbox[1] - 5 + 30),
95 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2)
96 |
97 | return img
98 |
99 |
100 | def add_T_label(img,
101 | label,
102 | bbox,
103 | draw_bg=True,
104 | text_bg_color=(255, 255, 255),
105 | text_color=(0, 0, 0)):
106 | """adds a T label to the rectangle, originating from the top of the rectangle
107 |
108 | Parameters
109 | ----------
110 | img : ndarray
111 | the image on which the T label is to be written/drawn, preferably the image with the rectangular bounding box drawn
112 | label : str
113 | the text (label) to be written
114 | bbox : list
115 | a list containing x_min, y_min, x_max and y_max of the rectangle positions
116 | draw_bg : bool, optional
117 | if True, draws the background of the text, else just the text is written, by default True
118 | text_bg_color : tuple, optional
119 | the background color of the label that is filled, by default (255, 255, 255)
120 | text_color : tuple, optional
121 | color of the text (label) to be written, by default (0, 0, 0)
122 |
123 | Returns
124 | -------
125 | ndarray
126 | the image with the T label drawn/written
127 | """
128 |
129 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][0]
130 | text_height = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][1]
131 |
132 | # draw vertical line
133 | x_center = (bbox[0] + bbox[2]) // 2
134 | y_top = bbox[1] - 50
135 | cv2.line(img, (x_center, bbox[1]), (x_center, y_top), text_bg_color, 3)
136 |
137 | # draw rectangle with label
138 | y_bottom = y_top
139 | y_top = y_bottom - text_height - 5
140 | x_left = x_center - (text_width // 2) - 5
141 | x_right = x_center + (text_width // 2) + 5
142 | if draw_bg:
143 | cv2.rectangle(img, (x_left, y_top - 3), (x_right, y_bottom),
144 | text_bg_color, -1)
145 | cv2.putText(img, label, (x_left + 5, y_bottom - 7),
146 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2)
147 |
148 | return img
149 |
150 |
151 | def draw_flag_with_label(img,
152 | label,
153 | bbox,
154 | write_label=True,
155 | line_color=(255, 255, 255),
156 | text_bg_color=(255, 255, 255),
157 | text_color=(0, 0, 0)):
158 | """draws a pole from the middle of the object that is to be labeled and adds the label to the flag
159 |
160 | Parameters
161 | ----------
162 | img : ndarray
163 | the image on which the flag is to be drawn
164 | label : str
165 | label that is written inside the flag
166 | bbox : list
167 | a list containing x_min, y_min, x_max and y_max of the rectangle positions
168 | write_label : bool, optional
169 | if True, writes the label, otherwise, it's just a vertical line, by default True
170 | line_color : tuple, optional
171 | the color of the pole of the flag, by default (255, 255, 255)
172 | text_bg_color : tuple, optional
173 | the background color of the label that is filled, by default (255, 255, 255)
174 | text_color : tuple, optional
175 | color of the text (label) to be written, by default (0, 0, 0)
176 |
177 | Returns
178 | -------
179 | ndarray
180 | the image with flag drawn and the label written in the flag
181 | """
182 |
183 | # draw vertical line
184 |
185 | x_center = (bbox[0] + bbox[2]) // 2
186 | y_bottom = int((bbox[1] * .75 + bbox[3] * .25))
187 | y_top = bbox[1] - (y_bottom - bbox[1])
188 |
189 | start_point = (x_center, y_top)
190 | end_point = (x_center, y_bottom)
191 |
192 | cv2.line(img, start_point, end_point, line_color, 3)
193 |
194 | # write label
195 |
196 | if write_label:
197 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1,
198 | 2)[0][0]
199 | label_bg = [
200 | start_point[0], start_point[1], start_point[0] + text_width,
201 | start_point[1] + 30
202 | ]
203 | cv2.rectangle(img, (label_bg[0], label_bg[1]),
204 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1)
205 | cv2.putText(img, label, (start_point[0] + 5, start_point[1] - 5 + 30),
206 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2)
207 |
208 | return img
209 |
210 |
211 | # THE FOLLOWING ARE OPTIONAL FUNCTIONS THAT CAN BE USED FOR DRAWING OR LABELLING MULTIPLE OBJECTS IN THE SAME
212 | # IMAGE. IN ORDER TO HAVE FULL CONTROL OF YOUR VISUALIZATIONS IT IS ADVISABLE TO USE THE ABOVE FUNCTIONS IN FOR LOOPS
213 | # INSTEAD OF THE FUNCTIONS BELOW
214 |
215 |
216 | def draw_multiple_rectangles(img,
217 | bboxes,
218 | bbox_color=(255, 255, 255),
219 | thickness=2,
220 | is_opaque=False,
221 | alpha=0.5):
222 | """draws multiple rectangles
223 |
224 | img : ndarray
225 | the actual image
226 | bboxes : list
227 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions
228 | bbox_color : tuple or list, optional
229 | the color of the boxes; the i-th box is drawn with bbox_color[i], by default (255, 255, 255)
230 | thickness : int, optional
231 | thickness of the outline of the boxes, by default 2
232 | is_opaque : bool, optional
233 | if False, draws solid rectangular outlines for rectangles. Else, filled rectangles which are semi transparent, by default False
234 | alpha : float, optional
235 | strength of the opacity, by default 0.5
236 |
237 | Returns
238 | -------
239 | ndarray
240 | the image with the bounding boxes drawn
241 | """
242 |
243 | for bid, bbox in enumerate(bboxes):
244 | img = draw_rectangle(img, bbox, bbox_color[bid], thickness, is_opaque,
245 | alpha)
246 | return img
247 |
248 |
249 | def add_multiple_labels(img,
250 | labels,
251 | bboxes,
252 | draw_bg=True,
253 | text_bg_color=(255, 255, 255),
254 | text_color=(0, 0, 0),
255 | top=True):
256 | """add labels, inside or outside the rectangles
257 |
258 | Parameters
259 | ----------
260 | img : ndarray
261 | the image on which the labels are to be written, preferably the image with the rectangular bounding boxes drawn
262 | labels : list
263 | a list of string of the texts (labels) to be written
264 | bboxes : list
265 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions
266 | draw_bg : bool, optional
267 | if True, draws the background of the texts, else just the texts are written, by default True
268 | text_bg_color : tuple, optional
269 | the background color of the labels that are filled, by default (255, 255, 255)
270 | text_color : tuple, optional
271 | color of the texts (labels) to be written, by default (0, 0, 0)
272 | top : bool, optional
273 | if True, writes the labels on top of the bounding boxes, else inside, by default True
274 |
275 | Returns
276 | -------
277 | ndarray
278 | the image with the labels written
279 | """
280 |
281 | for label, bbox in zip(labels, bboxes):
282 | img = add_label(img, label, bbox, draw_bg, text_bg_color, text_color,
283 | top)
284 |
285 | return img
286 |
287 |
288 | def add_multiple_T_labels(img,
289 | labels,
290 | bboxes,
291 | draw_bg=True,
292 | text_bg_color=(255, 255, 255),
293 | text_color=(0, 0, 0)):
294 | """adds T labels to the rectangles, each originating from the top of the rectangle
295 |
296 | Parameters
297 | ----------
298 | img : ndarray
299 | the image on which the T labels are to be written/drawn, preferably the image with the rectangular bounding boxes drawn
300 | labels : list
301 | the texts (labels) to be written
302 | bboxes : list
303 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions
304 | draw_bg : bool, optional
305 | if True, draws the background of the texts, else just the texts are written, by default True
306 | text_bg_color : tuple, optional
307 | the background color of the labels that are filled, by default (255, 255, 255)
308 | text_color : tuple, optional
309 | color of the texts (labels) to be written, by default (0, 0, 0)
310 |
311 | Returns
312 | -------
313 | ndarray
314 | the image with the T labels drawn/written
315 | """
316 |
317 | for label, bbox in zip(labels, bboxes):
318 | add_T_label(img, label, bbox, draw_bg, text_bg_color, text_color)
319 |
320 | return img
321 |
322 |
323 | def draw_multiple_flags_with_labels(img,
324 | labels,
325 | bboxes,
326 | write_label=True,
327 | line_color=(255, 255, 255),
328 | text_bg_color=(255, 255, 255),
329 | text_color=(0, 0, 0)):
330 | """draws poles from the middle of the objects that are to be labeled and adds the labels to the flags
331 |
332 | Parameters
333 | ----------
334 | img : ndarray
335 | the image on which the flags are to be drawn
336 | labels : list
337 | labels that are written inside the flags
338 | bboxes : list
339 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions
340 | write_label : bool, optional
341 | if True, writes the labels, otherwise, it's just a vertical line for each object, by default True
342 | line_color : tuple, optional
343 | the color of the pole of the flags, by default (255, 255, 255)
344 | text_bg_color : tuple, optional
345 | the background color of the labels that are filled, by default (255, 255, 255)
346 | text_color : tuple, optional
347 | color of the texts (labels) to be written, by default (0, 0, 0)
348 |
349 | Returns
350 | -------
351 | ndarray
352 | the image with flags drawn and the labels written in the flags
353 | """
354 |
355 | for label, bbox in zip(labels, bboxes):
356 | img = draw_flag_with_label(img, label, bbox, write_label, line_color,
357 | text_bg_color, text_color)
358 | return img
359 |
--------------------------------------------------------------------------------
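A minimal usage sketch for the drawing helpers above (the frame path and box coordinates below are made up for illustration):

import cv2
from bbox_visualizer import draw_rectangle, add_label

img = cv2.imread('frame_000001.jpg')                     # hypothetical frame
bbox = [40, 60, 220, 300]                                # x_min, y_min, x_max, y_max
img = draw_rectangle(img, bbox, bbox_color=(0, 255, 0))  # outline the object
img = add_label(img, 'person', bbox, top=True)           # write the label above the box
cv2.imwrite('frame_000001_vis.jpg', img)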
/tools/colors.txt:
--------------------------------------------------------------------------------
1 | 255 0 0
2 | 255 255 0
3 | 0 255 0
4 | 255 153 18
5 | 0 255 255
6 | 63 211 144
7 | 240 141 163
8 | 149 139 206
9 | 166 31 247
10 | 210 148 204
11 | 196 142 86
12 | 138 48 98
13 | 85 16 165
14 | 84 103 158
15 | 186 202 87
16 | 149 52 56
17 | 169 184 132
18 | 156 176 226
19 | 233 214 139
20 | 35 124 145
21 | 10 116 109
22 | 89 231 101
23 | 198 145 242
24 | 113 43 121
25 | 49 61 103
26 | 196 239 149
27 | 227 80 71
28 | 70 3 76
29 | 143 43 181
30 | 159 31 2
31 | 171 53 200
32 | 233 49 105
33 | 75 127 208
34 | 221 246 66
35 | 238 11 216
36 | 101 36 178
37 | 198 5 97
38 | 42 179 23
39 | 124 62 186
40 | 25 90 250
41 | 180 50 78
42 | 40 107 146
43 | 147 80 68
44 | 110 147 182
45 | 141 199 99
46 | 183 74 21
47 | 6 157 170
48 | 133 168 215
49 | 18 51 5
50 | 136 196 212
51 | 224 237 188
52 | 172 61 214
53 |
--------------------------------------------------------------------------------
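colors.txt provides 52 RGB triplets; demo.py later in this listing loads it as a lookup table so that each detected box is drawn in its own color:

import numpy as np
bbox_colors = np.loadtxt('colors.txt')   # shape (52, 3); row i colors the i-th box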
/tools/datautils/msrvtt_qa.py:
--------------------------------------------------------------------------------
1 | import json
2 | from datautils import utils
3 | import nltk
4 | from collections import Counter
5 |
6 | import pickle
7 | import numpy as np
8 |
9 |
10 | def load_video_paths(args):
11 | ''' Load a list of (path, image_id) tuples.'''
12 | video_paths = []
13 | modes = ['train', 'val', 'test']
14 | for mode in modes:
15 | with open(args.annotation_file.format(mode), 'r') as anno_file:
16 | instances = json.load(anno_file)
17 | video_ids = [instance['video_id'] for instance in instances]
18 | video_ids = set(video_ids)
19 | if mode in ['train', 'val']:
20 | for video_id in video_ids:
21 | video_paths.append((args.video_dir + 'videos/video{}.mp4'.format(video_id), video_id))
22 | else:
23 | for video_id in video_ids:
24 | video_paths.append((args.video_dir + 'videos/video{}.mp4'.format(video_id), video_id))
25 |
26 | return video_paths
27 |
28 |
29 | def process_questions(args):
30 | ''' Encode question tokens'''
31 | print('Loading dataset')
32 | with open(args.annotation_file, 'r') as dataset_file:
33 | instances = json.load(dataset_file)
34 |
35 | # Either create the vocab or load it from disk
36 | if args.mode in ['train']:
37 | print('Building vocab')
38 | answer_cnt = {}
39 | for instance in instances:
40 | answer = instance['answer']
41 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1
42 |
43 | answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
44 | answer_counter = Counter(answer_cnt)
45 | frequent_answers = answer_counter.most_common(args.answer_top)
46 | total_ans = sum(item[1] for item in answer_counter.items())
47 | total_freq_ans = sum(item[1] for item in frequent_answers)
48 | print("Number of unique answers:", len(answer_counter))
49 | print("Total number of answers:", total_ans)
50 | print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans))
51 |
52 | for token, cnt in Counter(answer_cnt).most_common(args.answer_top):
53 | answer_token_to_idx[token] = len(answer_token_to_idx)
54 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))
55 |
56 | question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
57 | for i, instance in enumerate(instances):
58 | question = instance['question'].lower()[:-1]
59 | for token in nltk.word_tokenize(question):
60 | if token not in question_token_to_idx:
61 | question_token_to_idx[token] = len(question_token_to_idx)
62 | print('Get question_token_to_idx')
63 | print(len(question_token_to_idx))
64 |
65 | vocab = {
66 | 'question_token_to_idx': question_token_to_idx,
67 | 'answer_token_to_idx': answer_token_to_idx,
68 | 'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1}
69 | }
70 |
71 | print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset))
72 | with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f:
73 | json.dump(vocab, f, indent=4)
74 | else:
75 | print('Loading vocab')
76 | with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f:
77 | vocab = json.load(f)
78 |
79 | # Encode all questions
80 | print('Encoding dataset')
81 | questions_encoded = []
82 | questions_len = []
83 | question_ids = []
84 | video_ids_tbw = []
85 | video_names_tbw = []
86 | all_answers = []
87 | for idx, instance in enumerate(instances):
88 | question = instance['question'].lower()[:-1]
89 | question_tokens = nltk.word_tokenize(question)
90 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
91 | questions_encoded.append(question_encoded)
92 | questions_len.append(len(question_encoded))
93 | question_ids.append(idx)
94 | im_name = instance['video_id']
95 | video_ids_tbw.append(im_name)
96 | video_names_tbw.append(im_name)
97 |
98 | if instance['answer'] in vocab['answer_token_to_idx']:
99 | answer = vocab['answer_token_to_idx'][instance['answer']]
100 | elif args.mode in ['train']:
101 | answer = 0
102 | elif args.mode in ['val', 'test']:
103 | answer = 1
104 |
105 | all_answers.append(answer)
106 | max_question_length = max(len(x) for x in questions_encoded)
107 | for qe in questions_encoded:
108 | while len(qe) < max_question_length:
109 | qe.append(vocab['question_token_to_idx']['<NULL>'])
110 |
111 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
112 | questions_len = np.asarray(questions_len, dtype=np.int32)
113 | print(questions_encoded.shape)
114 |
115 | glove_matrix = None
116 | if args.mode == 'train':
117 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
118 | print("Load glove from %s" % args.glove_pt)
119 | glove = pickle.load(open(args.glove_pt, 'rb'))
120 | dim_word = glove['the'].shape[0]
121 | glove_matrix = []
122 | for i in range(len(token_itow)):
123 | vector = glove.get(token_itow[i], np.zeros((dim_word,)))
124 | glove_matrix.append(vector)
125 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
126 | print(glove_matrix.shape)
127 |
128 | print('Writing', args.output_pt.format(args.dataset, args.dataset, args.mode))
129 | obj = {
130 | 'questions': questions_encoded,
131 | 'questions_len': questions_len,
132 | 'question_id': question_ids,
133 | 'video_ids': np.asarray(video_ids_tbw),
134 | 'video_names': np.array(video_names_tbw),
135 | 'answers': all_answers,
136 | 'glove': glove_matrix,
137 | }
138 | with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f:
139 | pickle.dump(obj, f)
140 |
--------------------------------------------------------------------------------
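The pickle written at the end of process_questions above can be sanity-checked as follows; the file name is whatever args.output_pt expands to, and the one used here is only an example:

import pickle

with open('msrvtt-qa_train_questions.pt', 'rb') as f:   # example output_pt value
    data = pickle.load(f)
print(data['questions'].shape)   # (num_questions, max_question_length) int32 token ids
print(len(data['answers']))      # one answer index per question
print(data['glove'] is None)     # the GloVe matrix is only built when mode == 'train'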
/tools/datautils/msvd_qa.py:
--------------------------------------------------------------------------------
1 | import json
2 | from datautils import utils
3 | import nltk
4 | from collections import Counter
5 |
6 | import pickle
7 | import numpy as np
8 |
9 |
10 | def load_video_paths(args):
11 | ''' Load a list of (path, image_id) tuples.'''
12 | video_paths = []
13 | video_ids = []
14 | modes = ['train', 'val', 'test']
15 | for mode in modes:
16 | with open(args.annotation_file.format(mode), 'r') as anno_file:
17 | instances = json.load(anno_file)
18 | video_ids.extend(instance['video_id'] for instance in instances)
19 | video_ids = set(video_ids)
20 | with open(args.video_name_mapping, 'r') as mapping:
21 | mapping_pairs = mapping.read().split('\n')
22 | mapping_dict = {}
23 | for idx in range(len(mapping_pairs)):
24 | cur_pair = mapping_pairs[idx].split(' ')
25 | mapping_dict[cur_pair[1]] = cur_pair[0]
26 | for video_id in video_ids:
27 | video_paths.append((args.video_dir + 'YouTubeClips/{}.avi'.format(mapping_dict['vid' + str(video_id)]), video_id))
28 | return video_paths
29 |
30 |
31 | def process_questions(args):
32 | ''' Encode question tokens'''
33 | print('Loading dataset')
34 | with open(args.annotation_file, 'r') as dataset_file:
35 | instances = json.load(dataset_file)
36 |
37 | # Either create the vocab or load it from disk
38 | if args.mode in ['train']:
39 | print('Building vocab')
40 | answer_cnt = {}
41 | for instance in instances:
42 | answer = instance['answer']
43 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1
44 |
45 | answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
46 | answer_counter = Counter(answer_cnt)
47 | frequent_answers = answer_counter.most_common(args.answer_top)
48 | total_ans = sum(item[1] for item in answer_counter.items())
49 | total_freq_ans = sum(item[1] for item in frequent_answers)
50 | print("Number of unique answers:", len(answer_counter))
51 | print("Total number of answers:", total_ans)
52 | print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans))
53 |
54 | for token, cnt in Counter(answer_cnt).most_common(args.answer_top):
55 | answer_token_to_idx[token] = len(answer_token_to_idx)
56 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))
57 |
58 | question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
59 | for i, instance in enumerate(instances):
60 | question = instance['question'].lower()[:-1]
61 | for token in nltk.word_tokenize(question):
62 | if token not in question_token_to_idx:
63 | question_token_to_idx[token] = len(question_token_to_idx)
64 | print('Get question_token_to_idx')
65 | print(len(question_token_to_idx))
66 |
67 | vocab = {
68 | 'question_token_to_idx': question_token_to_idx,
69 | 'answer_token_to_idx': answer_token_to_idx,
70 | 'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1}
71 | }
72 |
73 | print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset))
74 | with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f:
75 | json.dump(vocab, f, indent=4)
76 | else:
77 | print('Loading vocab')
78 | with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f:
79 | vocab = json.load(f)
80 |
81 | # Encode all questions
82 | print('Encoding dataset')
83 | questions_encoded = []
84 | questions_len = []
85 | question_ids = []
86 | video_ids_tbw = []
87 | video_names_tbw = []
88 | all_answers = []
89 | for idx, instance in enumerate(instances):
90 | question = instance['question'].lower()[:-1]
91 | question_tokens = nltk.word_tokenize(question)
92 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
93 | questions_encoded.append(question_encoded)
94 | questions_len.append(len(question_encoded))
95 | question_ids.append(idx)
96 | im_name = instance['video_id']
97 | video_ids_tbw.append(im_name)
98 | video_names_tbw.append(im_name)
99 |
100 | if instance['answer'] in vocab['answer_token_to_idx']:
101 | answer = vocab['answer_token_to_idx'][instance['answer']]
102 | elif args.mode in ['train']:
103 | answer = 0
104 | elif args.mode in ['val', 'test']:
105 | answer = 1
106 |
107 | all_answers.append(answer)
108 | max_question_length = max(len(x) for x in questions_encoded)
109 | for qe in questions_encoded:
110 | while len(qe) < max_question_length:
111 | qe.append(vocab['question_token_to_idx']['<NULL>'])
112 |
113 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
114 | questions_len = np.asarray(questions_len, dtype=np.int32)
115 | print(questions_encoded.shape)
116 |
117 | glove_matrix = None
118 | if args.mode == 'train':
119 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
120 | print("Load glove from %s" % args.glove_pt)
121 | glove = pickle.load(open(args.glove_pt, 'rb'))
122 | dim_word = glove['the'].shape[0]
123 | glove_matrix = []
124 | for i in range(len(token_itow)):
125 | vector = glove.get(token_itow[i], np.zeros((dim_word,)))
126 | glove_matrix.append(vector)
127 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
128 | print(glove_matrix.shape)
129 |
130 | print('Writing', args.output_pt.format(args.dataset, args.dataset, args.mode))
131 | obj = {
132 | 'questions': questions_encoded,
133 | 'questions_len': questions_len,
134 | 'question_id': question_ids,
135 | 'video_ids': np.asarray(video_ids_tbw),
136 | 'video_names': np.array(video_names_tbw),
137 | 'answers': all_answers,
138 | 'glove': glove_matrix,
139 | }
140 | with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f:
141 | pickle.dump(obj, f)
142 |
--------------------------------------------------------------------------------
/tools/datautils/nextqa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import json
4 | from datautils import utils
5 | import nltk
6 | import os.path as osp
7 | import pickle
8 | import numpy as np
9 |
10 |
11 | def load_video_paths(args):
12 | ''' Load a list of (path, image_id) tuples.'''
13 | input_paths = []
14 | annotation = pd.read_csv(args.annotation_file.format(args.question_type), delimiter='\t')
15 | gif_names = list(annotation['gif_name'])
16 | keys = list(annotation['key'])
17 | print("Number of questions: {}".format(len(gif_names)))
18 | for idx, gif in enumerate(gif_names):
19 | gif_abs_path = os.path.join(args.video_dir, ''.join([gif, '.gif']))
20 | input_paths.append((gif_abs_path, keys[idx]))
21 | input_paths = list(set(input_paths))
22 | print("Number of unique videos: {}".format(len(input_paths)))
23 |
24 | return input_paths
25 |
26 |
27 | def openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='train'):
28 | ''' Encode question tokens'''
29 | print('Encoding dataset')
30 | questions_encoded = []
31 | questions_len = []
32 | video_ids_tbw = []
33 | video_names_tbw = []
34 | all_answers = []
35 | question_ids = []
36 | for idx, question in enumerate(questions):
37 | question = question.lower()[:-1]
38 | question_tokens = nltk.word_tokenize(question)
39 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
40 | questions_encoded.append(question_encoded)
41 | questions_len.append(len(question_encoded))
42 | question_ids.append(idx)
43 | video_names_tbw.append(video_names[idx])
44 | video_ids_tbw.append(video_ids[idx])
45 |
46 | if args.question_type == "frameqa":
47 | answer = answers[idx]
48 | if answer in vocab['answer_token_to_idx']:
49 | answer = vocab['answer_token_to_idx'][answer]
50 | elif mode in ['train']:
51 | answer = 0
52 | elif mode in ['val', 'test']:
53 | answer = 1
54 | else:
55 | answer = max(int(answers[idx]), 1)
56 | all_answers.append(answer)
57 |
58 | # Pad encoded questions
59 | max_question_length = max(len(x) for x in questions_encoded)
60 | for qe in questions_encoded:
61 | while len(qe) < max_question_length:
62 | qe.append(vocab['question_token_to_idx']['<NULL>'])
63 |
64 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
65 | questions_len = np.asarray(questions_len, dtype=np.int32)
66 | print(questions_encoded.shape)
67 |
68 | glove_matrix = None
69 | if mode == 'train':
70 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
71 | print("Load glove from %s" % args.glove_pt)
72 | glove = pickle.load(open(args.glove_pt, 'rb'))
73 | dim_word = glove['the'].shape[0]
74 | glove_matrix = []
75 | for i in range(len(token_itow)):
76 | vector = glove.get(token_itow[i], np.zeros((dim_word,)))
77 | glove_matrix.append(vector)
78 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
79 | print(glove_matrix.shape)
80 |
81 | print('Writing ', args.output_pt.format(args.question_type, args.question_type, mode))
82 | obj = {
83 | 'questions': questions_encoded,
84 | 'questions_len': questions_len,
85 | 'question_id': question_ids,
86 | 'video_ids': np.asarray(video_ids_tbw),
87 | 'video_names': np.array(video_names_tbw),
88 | 'answers': all_answers,
89 | 'glove': glove_matrix,
90 | }
91 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f:
92 | pickle.dump(obj, f)
93 |
94 | def multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers, ans_candidates, mode='train'):
95 | # Encode all questions
96 | print('Encoding dataset')
97 | questions_encoded = []
98 | questions_len = []
99 | question_ids = qns_ids
100 | all_answer_cands_encoded = []
101 | all_answer_cands_len = []
102 | video_ids_tbw = []
103 | video_names_tbw = []
104 | correct_answers = []
105 | for idx, question in enumerate(questions):
106 |
107 | question = question.lower()
108 | question_tokens = nltk.word_tokenize(question)
109 | question_encoded = utils.encode(question_tokens, vocab['question_answer_token_to_idx'], allow_unk=True)
110 | questions_encoded.append(question_encoded)
111 | questions_len.append(len(question_encoded))
112 | # question_ids.append(idx)
113 | video_names_tbw.append(video_names[idx])
114 | video_ids_tbw.append(video_ids[idx])
115 | # ground truth
116 | answer = int(answers[idx])
117 | correct_answers.append(answer)
118 | # answer candidates
119 | candidates = ans_candidates[idx]
120 | candidates_encoded = []
121 | candidates_len = []
122 | for ans in candidates:
123 |
124 | ans = ans.lower()
125 | ans_tokens = nltk.word_tokenize(ans)
126 | cand_encoded = utils.encode(ans_tokens, vocab['question_answer_token_to_idx'], allow_unk=True)
127 | candidates_encoded.append(cand_encoded)
128 | candidates_len.append(len(cand_encoded))
129 | all_answer_cands_encoded.append(candidates_encoded)
130 | all_answer_cands_len.append(candidates_len)
131 |
132 | # Pad encoded questions
133 | max_question_length = max(len(x) for x in questions_encoded)
134 | for qe in questions_encoded:
135 | while len(qe) < max_question_length:
136 | qe.append(vocab['question_answer_token_to_idx']['<NULL>'])
137 |
138 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
139 | questions_len = np.asarray(questions_len, dtype=np.int32)
140 | print(questions_encoded.shape)
141 |
142 | # Pad encoded answer candidates
143 | max_answer_cand_length = max(max(len(x) for x in candidate) for candidate in all_answer_cands_encoded)
144 | for ans_cands in all_answer_cands_encoded:
145 | for ans in ans_cands:
146 | while len(ans) < max_answer_cand_length:
147 | ans.append(vocab['question_answer_token_to_idx']['<NULL>'])
148 | all_answer_cands_encoded = np.asarray(all_answer_cands_encoded, dtype=np.int32)
149 | all_answer_cands_len = np.asarray(all_answer_cands_len, dtype=np.int32)
150 | print(all_answer_cands_encoded.shape)
151 |
152 | glove_matrix = None
153 | # if mode in ['train']:
154 | # token_itow = {i: w for w, i in vocab['question_answer_token_to_idx'].items()}
155 | # print("Load glove from %s" % args.glove_pt)
156 | # glove = pickle.load(open(args.glove_pt, 'rb'))
157 | # dim_word = glove['the'].shape[0]
158 | # glove_matrix = []
159 | # for i in range(len(token_itow)):
160 | # vector = glove.get(token_itow[i], np.zeros((dim_word,)))
161 | # glove_matrix.append(vector)
162 | # glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
163 | # print(glove_matrix.shape)
164 |
165 | print('Writing ', args.output_pt.format(mode))
166 | obj = {
167 | 'questions': questions_encoded,
168 | 'questions_len': questions_len,
169 | 'question_id': question_ids,
170 | 'video_ids': np.asarray(video_ids_tbw),
171 | 'video_names': np.array(video_names_tbw),
172 | 'ans_candidates': all_answer_cands_encoded,
173 | 'ans_candidates_len': all_answer_cands_len,
174 | 'answers': correct_answers,
175 | 'glove': glove_matrix,
176 | }
177 | with open(args.output_pt.format(mode), 'wb') as f:
178 | pickle.dump(obj, f)
179 |
180 | def process_questions_openended(args):
181 | print('Loading dataset')
182 | if args.mode in ["train"]:
183 | csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t')
184 | else:
185 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t')
186 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))]
187 | questions = list(csv_data['question'])
188 | answers = list(csv_data['answer'])
189 | video_names = list(csv_data['gif_name'])
190 | video_ids = list(csv_data['key'])
191 |
192 | print('number of questions: %s' % len(questions))
193 | # Either create the vocab or load it from disk
194 | if args.mode in ['train']:
195 | print('Building vocab')
196 | answer_cnt = {}
197 |
198 | if args.question_type == "frameqa":
199 | for i, answer in enumerate(answers):
200 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1
201 |
202 | answer_token_to_idx = {'<UNK>': 0}
203 | for token in answer_cnt:
204 | answer_token_to_idx[token] = len(answer_token_to_idx)
205 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))
206 | elif args.question_type == 'count':
207 | answer_token_to_idx = {'<UNK>': 0}
208 |
209 | question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
210 | for i, q in enumerate(questions):
211 | question = q.lower()[:-1]
212 | for token in nltk.word_tokenize(question):
213 | if token not in question_token_to_idx:
214 | question_token_to_idx[token] = len(question_token_to_idx)
215 | print('Get question_token_to_idx')
216 | print(len(question_token_to_idx))
217 |
218 | vocab = {
219 | 'question_token_to_idx': question_token_to_idx,
220 | 'answer_token_to_idx': answer_token_to_idx,
221 | 'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1}
222 | }
223 |
224 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type))
225 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f:
226 | json.dump(vocab, f, indent=4)
227 |
228 | # split 10% of questions for evaluation
229 | split = int(0.9 * len(questions))
230 | train_questions = questions[:split]
231 | train_answers = answers[:split]
232 | train_video_names = video_names[:split]
233 | train_video_ids = video_ids[:split]
234 |
235 | val_questions = questions[split:]
236 | val_answers = answers[split:]
237 | val_video_names = video_names[split:]
238 | val_video_ids = video_ids[split:]
239 |
240 | openeded_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, mode='train')
241 | openeded_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, mode='val')
242 | else:
243 | print('Loading vocab')
244 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f:
245 | vocab = json.load(f)
246 | openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='test')
247 |
248 |
249 | def process_questions_mulchoices(args):
250 | print('Loading dataset')
251 | # if args.mode in ["train", "val"]:
252 | # csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter=',')
253 | # else:
254 | # csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter=',')
255 |
256 | if args.mode == 'all':
257 | csv_data = pd.read_csv(args.annotation_file.format(args.mode), delimiter=',').astype('string')
258 | else:
259 | csv_data = pd.read_csv(args.annotation_file.format(args.mode), delimiter=',').astype('string')
260 |
261 | # csv_data = csv_data.iloc[np.random.permutation(len(csv_data))]
262 | questions = list(csv_data['question'])
263 | answers = list(csv_data['answer'])
264 | video_names = list(csv_data['video'])
265 | video_ids = list(csv_data['video'])
266 | qns_ids = list(csv_data['qid'])
267 | qns_ids = [vname+'_'+qid for vname, qid in zip(video_names, qns_ids)]
268 | ans_candidates = np.asarray([csv_data['a0'], csv_data['a1'], csv_data['a2'], csv_data['a3'], csv_data['a4']])
269 | ans_candidates = ans_candidates.transpose()
270 | print(ans_candidates.shape)
271 | # ans_candidates: (num_ques, 5)
272 | print('number of questions: %s' % len(questions))
273 | # Either create the vocab or load it from disk
274 | #if args.mode in ['train']:
275 | if not osp.exists(args.vocab_json.format('train')):
276 | print('Building vocab')
277 | answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
278 | question_answer_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
279 | for candidates in ans_candidates:
280 | #print(candidates)
281 | for ans in candidates:
282 | if not isinstance(ans, str): continue  # skip missing candidates
283 | ans = ans.lower()
284 | for token in nltk.word_tokenize(ans):
285 | if token not in answer_token_to_idx:
286 | answer_token_to_idx[token] = len(answer_token_to_idx)
287 | if token not in question_answer_token_to_idx:
288 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx)
289 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))
290 |
291 | question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
292 | for i, q in enumerate(questions):
293 | question = str(q).lower()[:-1]
294 | for token in nltk.word_tokenize(question):
295 | if token not in question_token_to_idx:
296 | question_token_to_idx[token] = len(question_token_to_idx)
297 | if token not in question_answer_token_to_idx:
298 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx)
299 |
300 | print('Get question_token_to_idx')
301 | print(len(question_token_to_idx))
302 | print('Get question_answer_token_to_idx')
303 | print(len(question_answer_token_to_idx))
304 |
305 | vocab = {
306 | 'question_token_to_idx': question_token_to_idx,
307 | 'answer_token_to_idx': answer_token_to_idx,
308 | 'question_answer_token_to_idx': question_answer_token_to_idx,
309 | }
310 |
311 | print('Write into %s' % args.vocab_json.format(args.mode))
312 | with open(args.vocab_json.format(args.mode), 'w') as f:
313 | json.dump(vocab, f, indent=4)
314 |
315 | # split 10% of questions for evaluation
316 | # split = int(0.9 * len(questions))
317 | # train_questions = questions[:split]
318 | # train_answers = answers[:split]
319 | # train_video_names = video_names[:split]
320 | # train_video_ids = video_ids[:split]
321 | # train_ans_candidates = ans_candidates[:split, :]
322 | #
323 | # val_questions = questions[split:]
324 | # val_answers = answers[split:]
325 | # val_video_names = video_names[split:]
326 | # val_video_ids = video_ids[split:]
327 | # val_ans_candidates = ans_candidates[split:, :]
328 |
329 | multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers,
330 | ans_candidates, mode='train')
331 | # multichoice_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers,
332 | # val_ans_candidates, mode='val')
333 | else:
334 | print('Loading vocab')
335 | with open(args.vocab_json.format('train'), 'r') as f:
336 | vocab = json.load(f)
337 | multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers,
338 | ans_candidates, mode=args.mode)
339 |
--------------------------------------------------------------------------------
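For reference, process_questions_mulchoices above reads per-question rows with the columns video, qid, question, answer (index of the correct option) and a0-a4 (the five candidates); a made-up row would look like this:

import pandas as pd

example = pd.DataFrame([{
    'video': '1234567890', 'qid': '0',
    'question': 'why did the boy pick up the toy',
    'answer': '2',                     # index into a0..a4
    'a0': 'to throw it', 'a1': 'to hide it', 'a2': 'to play with it',
    'a3': 'to put it away', 'a4': 'to give it to the girl',
}])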
/tools/datautils/tgif_qa.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import json
4 | from datautils import utils
5 | import nltk
6 |
7 | import pickle
8 | import numpy as np
9 |
10 |
11 | def load_video_paths(args):
12 | ''' Load a list of (path, image_id) tuples.'''
13 | input_paths = []
14 | annotation = pd.read_csv(args.annotation_file.format(args.question_type), delimiter='\t')
15 | gif_names = list(annotation['gif_name'])
16 | keys = list(annotation['key'])
17 | print("Number of questions: {}".format(len(gif_names)))
18 | for idx, gif in enumerate(gif_names):
19 | gif_abs_path = os.path.join(args.video_dir, ''.join([gif, '.gif']))
20 | input_paths.append((gif_abs_path, keys[idx]))
21 | input_paths = list(set(input_paths))
22 | print("Number of unique videos: {}".format(len(input_paths)))
23 |
24 | return input_paths
25 |
26 |
27 | def openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='train'):
28 | ''' Encode question tokens'''
29 | print('Encoding dataset')
30 | questions_encoded = []
31 | questions_len = []
32 | video_ids_tbw = []
33 | video_names_tbw = []
34 | all_answers = []
35 | question_ids = []
36 | for idx, question in enumerate(questions):
37 | question = question.lower()[:-1]
38 | question_tokens = nltk.word_tokenize(question)
39 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True)
40 | questions_encoded.append(question_encoded)
41 | questions_len.append(len(question_encoded))
42 | question_ids.append(idx)
43 | video_names_tbw.append(video_names[idx])
44 | video_ids_tbw.append(video_ids[idx])
45 |
46 | if args.question_type == "frameqa":
47 | answer = answers[idx]
48 | if answer in vocab['answer_token_to_idx']:
49 | answer = vocab['answer_token_to_idx'][answer]
50 | elif mode in ['train']:
51 | answer = 0
52 | elif mode in ['val', 'test']:
53 | answer = 1
54 | else:
55 | answer = max(int(answers[idx]), 1)
56 | all_answers.append(answer)
57 |
58 | # Pad encoded questions
59 | max_question_length = max(len(x) for x in questions_encoded)
60 | for qe in questions_encoded:
61 | while len(qe) < max_question_length:
62 | qe.append(vocab['question_token_to_idx']['<NULL>'])
63 |
64 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
65 | questions_len = np.asarray(questions_len, dtype=np.int32)
66 | print(questions_encoded.shape)
67 |
68 | glove_matrix = None
69 | if mode == 'train':
70 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
71 | print("Load glove from %s" % args.glove_pt)
72 | glove = pickle.load(open(args.glove_pt, 'rb'))
73 | dim_word = glove['the'].shape[0]
74 | glove_matrix = []
75 | for i in range(len(token_itow)):
76 | vector = glove.get(token_itow[i], np.zeros((dim_word,)))
77 | glove_matrix.append(vector)
78 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
79 | print(glove_matrix.shape)
80 |
81 | print('Writing ', args.output_pt.format(args.question_type, args.question_type, mode))
82 | obj = {
83 | 'questions': questions_encoded,
84 | 'questions_len': questions_len,
85 | 'question_id': question_ids,
86 | 'video_ids': np.asarray(video_ids_tbw),
87 | 'video_names': np.array(video_names_tbw),
88 | 'answers': all_answers,
89 | 'glove': glove_matrix,
90 | }
91 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f:
92 | pickle.dump(obj, f)
93 |
94 | def multichoice_encoding_data(args, vocab, questions, video_names, video_ids, answers, ans_candidates, mode='train'):
95 | # Encode all questions
96 | print('Encoding dataset')
97 | questions_encoded = []
98 | questions_len = []
99 | question_ids = []
100 | all_answer_cands_encoded = []
101 | all_answer_cands_len = []
102 | video_ids_tbw = []
103 | video_names_tbw = []
104 | correct_answers = []
105 | for idx, question in enumerate(questions):
106 | question = question.lower()[:-1]
107 | question_tokens = nltk.word_tokenize(question)
108 | question_encoded = utils.encode(question_tokens, vocab['question_answer_token_to_idx'], allow_unk=True)
109 | questions_encoded.append(question_encoded)
110 | questions_len.append(len(question_encoded))
111 | question_ids.append(idx)
112 | video_names_tbw.append(video_names[idx])
113 | video_ids_tbw.append(video_ids[idx])
114 | # ground truth
115 | answer = int(answers[idx])
116 | correct_answers.append(answer)
117 | # answer candidates
118 | candidates = ans_candidates[idx]
119 | candidates_encoded = []
120 | candidates_len = []
121 | for ans in candidates:
122 | ans = ans.lower()
123 | ans_tokens = nltk.word_tokenize(ans)
124 | cand_encoded = utils.encode(ans_tokens, vocab['question_answer_token_to_idx'], allow_unk=True)
125 | candidates_encoded.append(cand_encoded)
126 | candidates_len.append(len(cand_encoded))
127 | all_answer_cands_encoded.append(candidates_encoded)
128 | all_answer_cands_len.append(candidates_len)
129 |
130 | # Pad encoded questions
131 | max_question_length = max(len(x) for x in questions_encoded)
132 | for qe in questions_encoded:
133 | while len(qe) < max_question_length:
134 | qe.append(vocab['question_answer_token_to_idx']['<NULL>'])
135 |
136 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
137 | questions_len = np.asarray(questions_len, dtype=np.int32)
138 | print(questions_encoded.shape)
139 |
140 | # Pad encoded answer candidates
141 | max_answer_cand_length = max(max(len(x) for x in candidate) for candidate in all_answer_cands_encoded)
142 | for ans_cands in all_answer_cands_encoded:
143 | for ans in ans_cands:
144 | while len(ans) < max_answer_cand_length:
145 | ans.append(vocab['question_answer_token_to_idx']['<NULL>'])
146 | all_answer_cands_encoded = np.asarray(all_answer_cands_encoded, dtype=np.int32)
147 | all_answer_cands_len = np.asarray(all_answer_cands_len, dtype=np.int32)
148 | print(all_answer_cands_encoded.shape)
149 |
150 | glove_matrix = None
151 | if mode in ['train']:
152 | token_itow = {i: w for w, i in vocab['question_answer_token_to_idx'].items()}
153 | print("Load glove from %s" % args.glove_pt)
154 | glove = pickle.load(open(args.glove_pt, 'rb'))
155 | dim_word = glove['the'].shape[0]
156 | glove_matrix = []
157 | for i in range(len(token_itow)):
158 | vector = glove.get(token_itow[i], np.zeros((dim_word,)))
159 | glove_matrix.append(vector)
160 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
161 | print(glove_matrix.shape)
162 |
163 | print('Writing ', args.output_pt.format(args.question_type, args.question_type, mode))
164 | obj = {
165 | 'questions': questions_encoded,
166 | 'questions_len': questions_len,
167 | 'question_id': question_ids,
168 | 'video_ids': np.asarray(video_ids_tbw),
169 | 'video_names': np.array(video_names_tbw),
170 | 'ans_candidates': all_answer_cands_encoded,
171 | 'ans_candidates_len': all_answer_cands_len,
172 | 'answers': correct_answers,
173 | 'glove': glove_matrix,
174 | }
175 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f:
176 | pickle.dump(obj, f)
177 |
178 | def process_questions_openended(args):
179 | print('Loading dataset')
180 | if args.mode in ["train"]:
181 | csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t')
182 | else:
183 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t')
184 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))]
185 | questions = list(csv_data['question'])
186 | answers = list(csv_data['answer'])
187 | video_names = list(csv_data['gif_name'])
188 | video_ids = list(csv_data['key'])
189 |
190 | print('number of questions: %s' % len(questions))
191 | # Either create the vocab or load it from disk
192 | if args.mode in ['train']:
193 | print('Building vocab')
194 | answer_cnt = {}
195 |
196 | if args.question_type == "frameqa":
197 | for i, answer in enumerate(answers):
198 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1
199 |
200 | answer_token_to_idx = {'<UNK>': 0}
201 | for token in answer_cnt:
202 | answer_token_to_idx[token] = len(answer_token_to_idx)
203 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))
204 | elif args.question_type == 'count':
205 | answer_token_to_idx = {'<UNK>': 0}
206 |
207 | question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
208 | for i, q in enumerate(questions):
209 | question = q.lower()[:-1]
210 | for token in nltk.word_tokenize(question):
211 | if token not in question_token_to_idx:
212 | question_token_to_idx[token] = len(question_token_to_idx)
213 | print('Get question_token_to_idx')
214 | print(len(question_token_to_idx))
215 |
216 | vocab = {
217 | 'question_token_to_idx': question_token_to_idx,
218 | 'answer_token_to_idx': answer_token_to_idx,
219 | 'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1}
220 | }
221 |
222 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type))
223 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f:
224 | json.dump(vocab, f, indent=4)
225 |
226 | # split 10% of questions for evaluation
227 | split = int(0.9 * len(questions))
228 | train_questions = questions[:split]
229 | train_answers = answers[:split]
230 | train_video_names = video_names[:split]
231 | train_video_ids = video_ids[:split]
232 |
233 | val_questions = questions[split:]
234 | val_answers = answers[split:]
235 | val_video_names = video_names[split:]
236 | val_video_ids = video_ids[split:]
237 |
238 | openeded_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, mode='train')
239 | openeded_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, mode='val')
240 | else:
241 | print('Loading vocab')
242 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f:
243 | vocab = json.load(f)
244 | openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='test')
245 |
246 |
247 |
248 |
249 | def process_questions_mulchoices(args):
250 | print('Loading dataset')
251 | if args.mode in ["train", "val"]:
252 | csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t')
253 | else:
254 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t')
255 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))]
256 | questions = list(csv_data['question'])
257 | answers = list(csv_data['answer'])
258 | video_names = list(csv_data['gif_name'])
259 | video_ids = list(csv_data['key'])
260 | ans_candidates = np.asarray(
261 | [csv_data['a1'], csv_data['a2'], csv_data['a3'], csv_data['a4'], csv_data['a5']])
262 | ans_candidates = ans_candidates.transpose()
263 | print(ans_candidates.shape)
264 | # ans_candidates: (num_ques, 5)
265 | print('number of questions: %s' % len(questions))
266 | # Either create the vocab or load it from disk
267 | if args.mode in ['train']:
268 | print('Building vocab')
269 |
270 | answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
271 | question_answer_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
272 | for candidates in ans_candidates:
273 | for ans in candidates:
274 | ans = ans.lower()
275 | for token in nltk.word_tokenize(ans):
276 | if token not in answer_token_to_idx:
277 | answer_token_to_idx[token] = len(answer_token_to_idx)
278 | if token not in question_answer_token_to_idx:
279 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx)
280 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))
281 |
282 | question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
283 | for i, q in enumerate(questions):
284 | question = q.lower()[:-1]
285 | for token in nltk.word_tokenize(question):
286 | if token not in question_token_to_idx:
287 | question_token_to_idx[token] = len(question_token_to_idx)
288 | if token not in question_answer_token_to_idx:
289 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx)
290 |
291 | print('Get question_token_to_idx')
292 | print(len(question_token_to_idx))
293 | print('Get question_answer_token_to_idx')
294 | print(len(question_answer_token_to_idx))
295 |
296 | vocab = {
297 | 'question_token_to_idx': question_token_to_idx,
298 | 'answer_token_to_idx': answer_token_to_idx,
299 | 'question_answer_token_to_idx': question_answer_token_to_idx,
300 | }
301 |
302 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type))
303 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f:
304 | json.dump(vocab, f, indent=4)
305 |
306 | # split 10% of questions for evaluation
307 | split = int(0.9 * len(questions))
308 | train_questions = questions[:split]
309 | train_answers = answers[:split]
310 | train_video_names = video_names[:split]
311 | train_video_ids = video_ids[:split]
312 | train_ans_candidates = ans_candidates[:split, :]
313 |
314 | val_questions = questions[split:]
315 | val_answers = answers[split:]
316 | val_video_names = video_names[split:]
317 | val_video_ids = video_ids[split:]
318 | val_ans_candidates = ans_candidates[split:, :]
319 |
320 | multichoice_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, train_ans_candidates, mode='train')
321 | multichoice_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers,
322 | val_ans_candidates, mode='val')
323 | else:
324 | print('Loading vocab')
325 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f:
326 | vocab = json.load(f)
327 | multichoice_encoding_data(args, vocab, questions, video_names, video_ids, answers,
328 | ans_candidates, mode='test')
329 |
--------------------------------------------------------------------------------
/tools/datautils/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import os
4 | import os.path as osp
5 |
6 | def load_file(file_name):
7 | annos = None
8 | with open(file_name, 'r') as fp:
9 | if osp.splitext(file_name)[1]== '.txt':
10 | annos = fp.readlines()
11 | annos = [line.rstrip() for line in annos]
12 | if osp.splitext(file_name)[1] == '.json':
13 | annos = json.load(fp)
14 |
15 | return annos
16 |
17 | def save_file(obj, filename):
18 | """
19 | save obj to filename
20 | :param obj:
21 | :param filename:
22 | :return:
23 | """
24 | filepath = osp.dirname(filename)
25 | if filepath != '' and not osp.exists(filepath):
26 | os.makedirs(filepath)
27 | # always write the file once its parent directory exists
28 | with open(filename, 'w') as fp:
29 | json.dump(obj, fp)
30 |
31 | def encode(seq_tokens, token_to_idx, allow_unk=False):
32 | seq_idx = []
33 | for token in seq_tokens:
34 | if token not in token_to_idx:
35 | if allow_unk:
36 | token = '<UNK>'
37 | else:
38 | raise KeyError('Token "%s" not in vocab' % token)
39 | seq_idx.append(token_to_idx[token])
40 | return seq_idx
41 |
42 |
43 | def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
44 | tokens = []
45 | for idx in seq_idx:
46 | tokens.append(idx_to_token[idx])
47 | if stop_at_end and tokens[-1] == '<END>':
48 | break
49 | if delim is None:
50 | return tokens
51 | else:
52 | return delim.join(tokens)
53 |
54 | # --------------------------------------------------------
55 | # Fast R-CNN
56 | # Copyright (c) 2015 Microsoft
57 | # Licensed under The MIT License [see LICENSE for details]
58 | # Written by Ross Girshick
59 | # --------------------------------------------------------
60 |
61 | class Timer(object):
62 | """A simple timer."""
63 | def __init__(self):
64 | self.total_time = 0.
65 | self.calls = 0
66 | self.start_time = 0.
67 | self.diff = 0.
68 | self.average_time = 0.
69 |
70 | def tic(self):
71 |         # using time.time instead of time.clock because time.clock
72 | # does not normalize for multithreading
73 | self.start_time = time.time()
74 |
75 | def toc(self, average=True):
76 | self.diff = time.time() - self.start_time
77 | self.total_time += self.diff
78 | self.calls += 1
79 | self.average_time = self.total_time / self.calls
80 | if average:
81 | return self.average_time
82 | else:
83 | return self.diff
--------------------------------------------------------------------------------
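A minimal usage sketch for the encode/decode helpers above (the toy vocabulary is made up for illustration; run from tools/ so the datautils package resolves):

    from datautils.utils import encode, decode

    token_to_idx = {'': 0, 'what': 1, 'is': 2, 'he': 3, 'doing': 4}
    idx_to_token = {v: k for k, v in token_to_idx.items()}

    seq = encode(['what', 'is', 'he', 'doing'], token_to_idx)  # -> [1, 2, 3, 4]
    print(decode(seq, idx_to_token, delim=' '))                # -> 'what is he doing'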
/tools/demo.py:
--------------------------------------------------------------------------------
1 | import h5py
2 | import os
3 | import os.path as osp
4 | import numpy as np
5 | from bbox_visualizer import *
6 | import sys
7 | sys.path.insert(0, '../')
8 | from util import load_file, save_to
9 | import cv2
10 | bbox_colors = np.loadtxt('colors.txt')
11 |
12 |
13 | def sample_clips(total_frames, num_clips, num_frames_per_clip):
14 | clips = []
15 | frames = [str(f+1).zfill(6) for f in range(total_frames)]
16 | for i in np.linspace(0, total_frames, num_clips + 2, dtype=np.int32)[1: num_clips + 1]:
17 | clip_start = int(i) - int(num_frames_per_clip / 2)
18 | clip_end = int(i) + int(num_frames_per_clip / 2)
19 | clip_start = 0 if clip_start < 0 else clip_start
20 | clip_end = total_frames if clip_end > total_frames else clip_end
21 | clip = frames[clip_start:clip_end]
22 | if clip_start == 0 and len(clip) < num_frames_per_clip:
23 | shortage = num_frames_per_clip - (clip_end - clip_start)
24 | added_fids = []
25 | for _ in range(shortage):
26 | added_fids.append(frames[clip_start])
27 | if len(added_fids) > 0:
28 | clip = added_fids + clip
29 | if clip_end == total_frames and len(clip) < num_frames_per_clip:
30 | shortage = num_frames_per_clip - (clip_end - clip_start)
31 | added_fids = []
32 | for _ in range(shortage):
33 | added_fids.append(frames[clip_end-1])
34 | if len(added_fids) > 0:
35 | clip += added_fids
36 | clip = clip[::4]
37 | clips.append(clip)
38 | clips = clips[::2]
39 | return clips
40 |
41 |
42 | def get_vbbox(feat_file, qvid, bbox_num):
43 | with h5py.File(feat_file, 'r') as fp:
44 | vids = fp['ids']
45 | bboxes = fp['bbox']
46 | for id, (vid, bbox) in enumerate(zip(vids, bboxes)):
47 | if str(vid) != qvid: continue
48 | vbbox = bbox[:,:,:bbox_num, :]
49 |
50 | return vbbox
51 |
52 |
53 | def vis_det(feat_file, vname):
54 | bbox_num = 5
55 | vid = vname.split('/')[-1]
56 | vbbox = get_vbbox(feat_file, vid, bbox_num)
57 | fids = os.listdir(vname)
58 | total_frames = len(fids)
59 | clips = sample_clips(total_frames, 16, 16)
60 | # clips = np.asarray(clips).reshape(-1)
61 | out_dir = '../demo/'
62 |
63 | for i, cids in enumerate(clips):
64 | for f, fid in enumerate(cids):
65 | img_path = osp.join(vname, fid+'.jpg')
66 | bboxes = vbbox[i][f]
67 |
68 | bboxes = [[int(np.round(b)) for b in bbox] for bbox in bboxes]
69 | # bbox = [int(np.round(b)) for b in bbox]
70 | img = cv2.imread(img_path)
71 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
72 | output = draw_multiple_rectangles(img, bboxes, bbox_colors)
73 | # output = draw_rectangle(img, bbox)
74 |
75 | out_file = osp.join(out_dir, str(vid))
76 | if not osp.exists(out_file):
77 | os.makedirs(out_file)
78 | img = cv2.cvtColor(output, cv2.COLOR_RGB2BGR)
79 | cv2.imwrite(osp.join(out_file, fid+'.jpg'), img)
80 | # cv2.imshow('image', output)
81 | # cv2.waitKey(0)
82 |
83 |
84 |
85 | def main():
86 | dataset = 'nextqa'
87 | feat_file = f'../../data/{dataset}/region_feat_n/acregion_8c20b_val.h5'
88 |     # the videos are decoded at 6 fps
89 | frame_dir = '/home/jbxiao/workspace/data/nextqa/frames/'
90 | vname = f'{frame_dir}/3376544720'
91 | vis_det(feat_file, vname)
92 |
93 | if __name__ == "__main__":
94 | main()
95 |
--------------------------------------------------------------------------------
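A small sketch of what sample_clips above produces, assuming a 100-frame video (run from tools/ with OpenCV installed, so that importing demo, which loads colors.txt and bbox_visualizer, succeeds):

    from demo import sample_clips

    # 16 clip centres are spread over the video with np.linspace; each clip keeps
    # every 4th of its 16 frames, and clips[::2] keeps every other clip.
    clips = sample_clips(total_frames=100, num_clips=16, num_frames_per_clip=16)
    print(len(clips), len(clips[0]))  # -> 8 4
    print(clips[0])                   # zero-padded frame ids, e.g. ['000001', '000002', ...]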
/tools/extract_video.py:
--------------------------------------------------------------------------------
1 | # ====================================================
2 | # @Time : 15/4/21 12:38 PM
3 | # @Author : Xiao Junbin
4 | # @Email : junbin@comp.nus.edu.sg
5 | # @File : extract_video.py
6 | # ====================================================
7 | import os
8 | import os.path as osp
9 | import shutil
10 | import subprocess
11 | import pandas as pd
12 | import json
13 | import sys
14 | sys.path.insert(0, '../')
15 | from util import load_file
16 |
17 | # def load_file(filename):
18 | # with open(filename, 'r') as fp:
19 | # data = json.load(fp)
20 | # return data
21 |
22 | def get_video_list(filename, out_file):
23 | data = load_file(filename)
24 | video_ids = list(data['video_id'])
25 | video_ids = list(set(video_ids))
26 | # video_ids = os.listdir(filename)
27 | # video_ids = sorted(video_ids)
28 | print(len(video_ids))
29 | with open(out_file, 'w') as fp:
30 | json.dump(video_ids, fp, indent=4)
31 | return video_ids
32 |
33 |
34 | def extract_frame(video, dst):
35 |
36 | with open(os.devnull, 'w') as ffmpeg_log:
37 | if os.path.exists(dst):
38 | # print(" cleanup: "+dst+"/")
39 | shutil.rmtree(dst)
40 | os.makedirs(dst)
41 | video2frame_cmd = [
42 | "ffmpeg",
43 | '-y',
44 | '-i', video,
45 | '-r', "6", # 6 frames per second
46 | # '-vf', "scale=400:300",
47 | '-qscale:v', "2",
48 | '{0}/%06d.jpg'.format(dst)
49 | ]
50 | subprocess.call(video2frame_cmd, stdout = ffmpeg_log, stderr=ffmpeg_log)
51 |
52 |
53 | def extract_videos(raw_dir, vlist, frame_dir, map_vid=None):
54 |
55 | vnum = len(vlist)
56 | for id, vid in enumerate(vlist):
57 | # if id <= 400: continue
58 | # if id > 400: break
59 | vid = str(vid)
60 |         if map_vid is not None:
61 | video = osp.join(raw_dir, f'{map_vid[vid]}.mp4')
62 | else:
63 | video = osp.join(raw_dir, f'{vid}.mp4')
64 | dst = osp.join(frame_dir, vid)
65 |         if not osp.exists(video):
66 |             print(video); continue  # skip videos that are not available locally
67 |         extract_frame(video, dst)
68 | if id % 20 == 0:
69 | print('{}/{}'.format(id, vnum))
70 |
71 |
72 | def main():
73 | video_dir = '/storage/jbxiao/workspace/data/nextqa/'
74 | raw_dir = osp.join(video_dir, 'videos/')
75 | frame_dir = osp.join(video_dir, 'frames_val/')
76 | anno_dir = '../datasets/nextqa/'
77 | vlist_file = osp.join(anno_dir, 'vlist.json')
78 | map_file = osp.join(anno_dir, 'map_vid_vidorID.json')
79 | if not osp.exists(vlist_file):
80 | dset = 'val' #train/test
81 | qa_file = osp.join(anno_dir, f'{dset}.csv')
82 | vlist_file = osp.join(anno_dir, f'vlist_{dset}.json')
83 | vlist = get_video_list(qa_file, vlist_file)
84 | else:
85 | vlist = load_file(vlist_file)
86 | map_vid = load_file(map_file)
87 | extract_videos(raw_dir, vlist, frame_dir, map_vid=map_vid)
88 |
89 |
90 | if __name__ == "__main__":
91 | main()
92 |
--------------------------------------------------------------------------------
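A quick usage sketch for the frame extractor above (the paths are placeholders and ffmpeg must be on PATH):

    from extract_video import extract_frame

    # decode one video at 6 fps into numbered JPEGs: <dst>/000001.jpg, 000002.jpg, ...
    extract_frame('/path/to/videos/3376544720.mp4', '/path/to/frames/3376544720')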
/tools/feat_app.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #########################################################################
3 | # File Name: feat_app.sh
4 | # Author: Xiao Junbin
5 | # mail: xiaojunbin@u.nus.edu
6 | # Created Time: Sat 19 Sep 2020 09:22:26 PM
7 | #########################################################################
8 | GPUID=$1
9 | CUDA_VISIBLE_DEVICES=$GPUID python preprocess_features.py \
10 | --dataset 'nextqa' \
11 | --model 'resnet101' \
12 | --image_width 224 \
13 | --image_height 224
14 |
--------------------------------------------------------------------------------
/tools/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/tools/models/__init__.py
--------------------------------------------------------------------------------
/tools/models/densenet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from collections import OrderedDict
5 | import math
6 |
7 | __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264']
8 |
9 |
10 | def densenet121(**kwargs):
11 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16),
12 | **kwargs)
13 | return model
14 |
15 |
16 | def densenet169(**kwargs):
17 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32),
18 | **kwargs)
19 | return model
20 |
21 |
22 | def densenet201(**kwargs):
23 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32),
24 | **kwargs)
25 | return model
26 |
27 |
28 | def densenet264(**kwargs):
29 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 64, 48),
30 | **kwargs)
31 | return model
32 |
33 |
34 | def get_fine_tuning_parameters(model, ft_begin_index):
35 | if ft_begin_index == 0:
36 | return model.parameters()
37 |
38 | ft_module_names = []
39 | for i in range(ft_begin_index, 5):
40 |         ft_module_names.append('denseblock{}'.format(i))
41 |         ft_module_names.append('transition{}'.format(i))
42 | ft_module_names.append('norm5')
43 | ft_module_names.append('classifier')
44 |
45 | parameters = []
46 | for k, v in model.named_parameters():
47 | for ft_module in ft_module_names:
48 | if ft_module in k:
49 | parameters.append({'params': v})
50 | break
51 | else:
52 | parameters.append({'params': v, 'lr': 0.0})
53 |
54 | return parameters
55 |
56 |
57 | class _DenseLayer(nn.Sequential):
58 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate):
59 | super(_DenseLayer, self).__init__()
60 |         self.add_module('norm1', nn.BatchNorm3d(num_input_features))
61 |         self.add_module('relu1', nn.ReLU(inplace=True))
62 |         self.add_module('conv1', nn.Conv3d(num_input_features, bn_size * growth_rate,
63 |                                            kernel_size=1, stride=1, bias=False))
64 |         self.add_module('norm2', nn.BatchNorm3d(bn_size * growth_rate))
65 |         self.add_module('relu2', nn.ReLU(inplace=True))
66 |         self.add_module('conv2', nn.Conv3d(bn_size * growth_rate, growth_rate,
67 |                                            kernel_size=3, stride=1, padding=1, bias=False))
68 | self.drop_rate = drop_rate
69 |
70 | def forward(self, x):
71 | new_features = super(_DenseLayer, self).forward(x)
72 | if self.drop_rate > 0:
73 | new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
74 | return torch.cat([x, new_features], 1)
75 |
76 |
77 | class _DenseBlock(nn.Sequential):
78 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate):
79 | super(_DenseBlock, self).__init__()
80 | for i in range(num_layers):
81 | layer = _DenseLayer(num_input_features + i * growth_rate, growth_rate, bn_size, drop_rate)
82 | self.add_module('denselayer%d' % (i + 1), layer)
83 |
84 |
85 | class _Transition(nn.Sequential):
86 | def __init__(self, num_input_features, num_output_features):
87 | super(_Transition, self).__init__()
88 | self.add_module('norm', nn.BatchNorm3d(num_input_features))
89 | self.add_module('relu', nn.ReLU(inplace=True))
90 | self.add_module('conv', nn.Conv3d(num_input_features, num_output_features,
91 | kernel_size=1, stride=1, bias=False))
92 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2))
93 |
94 |
95 | class DenseNet(nn.Module):
96 | """Densenet-BC model class
97 | Args:
98 | growth_rate (int) - how many filters to add each layer (k in paper)
99 | block_config (list of 4 ints) - how many layers in each pooling block
100 | num_init_features (int) - the number of filters to learn in the first convolution layer
101 | bn_size (int) - multiplicative factor for number of bottle neck layers
102 | (i.e. bn_size * k features in the bottleneck layer)
103 | drop_rate (float) - dropout rate after each dense layer
104 | num_classes (int) - number of classification classes
105 | """
106 | def __init__(self, sample_size, sample_duration, growth_rate=32, block_config=(6, 12, 24, 16),
107 | num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, last_fc=True):
108 |
109 | super(DenseNet, self).__init__()
110 |
111 | self.last_fc = last_fc
112 |
113 | self.sample_size = sample_size
114 | self.sample_duration = sample_duration
115 |
116 | # First convolution
117 | self.features = nn.Sequential(OrderedDict([
118 | ('conv0', nn.Conv3d(3, num_init_features, kernel_size=7,
119 | stride=(1, 2, 2), padding=(3, 3, 3), bias=False)),
120 | ('norm0', nn.BatchNorm3d(num_init_features)),
121 | ('relu0', nn.ReLU(inplace=True)),
122 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)),
123 | ]))
124 |
125 | # Each denseblock
126 | num_features = num_init_features
127 | for i, num_layers in enumerate(block_config):
128 | block = _DenseBlock(num_layers=num_layers, num_input_features=num_features,
129 | bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate)
130 | self.features.add_module('denseblock%d' % (i + 1), block)
131 | num_features = num_features + num_layers * growth_rate
132 | if i != len(block_config) - 1:
133 | trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2)
134 | self.features.add_module('transition%d' % (i + 1), trans)
135 | num_features = num_features // 2
136 |
137 | # Final batch norm
138 |         self.features.add_module('norm5', nn.BatchNorm3d(num_features))
139 |
140 | # Linear layer
141 | self.classifier = nn.Linear(num_features, num_classes)
142 |
143 | def forward(self, x):
144 | features = self.features(x)
145 | out = F.relu(features, inplace=True)
146 | last_duration = math.ceil(self.sample_duration / 16)
147 | last_size = math.floor(self.sample_size / 32)
148 | out = F.avg_pool3d(out, kernel_size=(last_duration, last_size, last_size)).view(features.size(0), -1)
149 | if self.last_fc:
150 | out = self.classifier(out)
151 | return out
152 |
--------------------------------------------------------------------------------
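A minimal sketch of building one of the 3D DenseNets above as a clip-level feature extractor (input shape assumed here: 16-frame clips at 112x112; run from tools/):

    import torch
    from models import densenet

    model = densenet.densenet121(sample_size=112, sample_duration=16,
                                 num_classes=400, last_fc=False)
    clip = torch.randn(2, 3, 16, 112, 112)  # (batch, channels, frames, height, width)
    feat = model(clip)                       # pooled clip-level feature
    print(feat.shape)                        # -> torch.Size([2, 1024])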
/tools/models/pre_act_resnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import math
6 | from functools import partial
7 |
8 | __all__ = ['PreActivationResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200']
9 |
10 |
11 | def conv3x3x3(in_planes, out_planes, stride=1):
12 | # 3x3x3 convolution with padding
13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3,
14 | stride=stride, padding=1, bias=False)
15 |
16 |
17 | def downsample_basic_block(x, planes, stride):
18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride)
19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
20 | out.size(2), out.size(3),
21 | out.size(4)).zero_()
22 | if isinstance(out.data, torch.cuda.FloatTensor):
23 | zero_pads = zero_pads.cuda()
24 |
25 | out = Variable(torch.cat([out.data, zero_pads], dim=1))
26 |
27 | return out
28 |
29 |
30 | class PreActivationBasicBlock(nn.Module):
31 | expansion = 1
32 |
33 | def __init__(self, inplanes, planes, stride=1, downsample=None):
34 | super(PreActivationBasicBlock, self).__init__()
35 | self.bn1 = nn.BatchNorm3d(inplanes)
36 | self.conv1 = conv3x3x3(inplanes, planes, stride)
37 | self.bn2 = nn.BatchNorm3d(planes)
38 | self.conv2 = conv3x3x3(planes, planes)
39 | self.relu = nn.ReLU(inplace=True)
40 | self.downsample = downsample
41 | self.stride = stride
42 |
43 | def forward(self, x):
44 | residual = x
45 |
46 | out = self.bn1(x)
47 | out = self.relu(out)
48 | out = self.conv1(out)
49 |
50 | out = self.bn2(out)
51 | out = self.relu(out)
52 | out = self.conv2(out)
53 |
54 | if self.downsample is not None:
55 | residual = self.downsample(x)
56 |
57 | out += residual
58 |
59 | return out
60 |
61 |
62 | class PreActivationBottleneck(nn.Module):
63 | expansion = 4
64 |
65 | def __init__(self, inplanes, planes, stride=1, downsample=None):
66 | super(PreActivationBottleneck, self).__init__()
67 | self.bn1 = nn.BatchNorm3d(inplanes)
68 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
69 | self.bn2 = nn.BatchNorm3d(planes)
70 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride,
71 | padding=1, bias=False)
72 | self.bn3 = nn.BatchNorm3d(planes)
73 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)
74 | self.relu = nn.ReLU(inplace=True)
75 | self.downsample = downsample
76 | self.stride = stride
77 |
78 | def forward(self, x):
79 | residual = x
80 |
81 | out = self.bn1(x)
82 | out = self.relu(out)
83 | out = self.conv1(out)
84 |
85 | out = self.bn2(out)
86 | out = self.relu(out)
87 | out = self.conv2(out)
88 |
89 | out = self.bn3(out)
90 | out = self.relu(out)
91 | out = self.conv3(out)
92 |
93 | if self.downsample is not None:
94 | residual = self.downsample(x)
95 |
96 | out += residual
97 |
98 | return out
99 |
100 |
101 | class PreActivationResNet(nn.Module):
102 |
103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True):
104 | self.last_fc = last_fc
105 |
106 | self.inplanes = 64
107 | super(PreActivationResNet, self).__init__()
108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2),
109 | padding=(3, 3, 3), bias=False)
110 | self.bn1 = nn.BatchNorm3d(64)
111 | self.relu = nn.ReLU(inplace=True)
112 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2)
115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2)
116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2)
117 | last_duration = math.ceil(sample_duration / 16)
118 | last_size = math.ceil(sample_size / 32)
119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1)
120 | self.fc = nn.Linear(512 * block.expansion, num_classes)
121 |
122 | for m in self.modules():
123 | if isinstance(m, nn.Conv3d):
124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
125 | m.weight.data.normal_(0, math.sqrt(2. / n))
126 | elif isinstance(m, nn.BatchNorm3d):
127 | m.weight.data.fill_(1)
128 | m.bias.data.zero_()
129 |
130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
131 | downsample = None
132 | if stride != 1 or self.inplanes != planes * block.expansion:
133 | if shortcut_type == 'A':
134 | downsample = partial(downsample_basic_block,
135 | planes=planes * block.expansion,
136 | stride=stride)
137 | else:
138 | downsample = nn.Sequential(
139 | nn.Conv3d(self.inplanes, planes * block.expansion,
140 | kernel_size=1, stride=stride, bias=False),
141 | nn.BatchNorm3d(planes * block.expansion)
142 | )
143 |
144 | layers = []
145 | layers.append(block(self.inplanes, planes, stride, downsample))
146 | self.inplanes = planes * block.expansion
147 | for i in range(1, blocks):
148 | layers.append(block(self.inplanes, planes))
149 |
150 | return nn.Sequential(*layers)
151 |
152 | def forward(self, x):
153 | x = self.conv1(x)
154 | x = self.bn1(x)
155 | x = self.relu(x)
156 | x = self.maxpool(x)
157 |
158 | x = self.layer1(x)
159 | x = self.layer2(x)
160 | x = self.layer3(x)
161 | x = self.layer4(x)
162 |
163 | x = self.avgpool(x)
164 |
165 | x = x.view(x.size(0), -1)
166 | if self.last_fc:
167 | x = self.fc(x)
168 |
169 | return x
170 |
171 | def get_fine_tuning_parameters(model, ft_begin_index):
172 | if ft_begin_index == 0:
173 | return model.parameters()
174 |
175 | ft_module_names = []
176 | for i in range(ft_begin_index, 5):
177 |         ft_module_names.append('layer{}'.format(i))
178 | ft_module_names.append('fc')
179 |
180 | parameters = []
181 | for k, v in model.named_parameters():
182 | for ft_module in ft_module_names:
183 | if ft_module in k:
184 | parameters.append({'params': v})
185 | break
186 | else:
187 | parameters.append({'params': v, 'lr': 0.0})
188 |
189 | return parameters
190 |
191 | def resnet18(**kwargs):
192 | """Constructs a ResNet-18 model.
193 | """
194 | model = PreActivationResNet(PreActivationBasicBlock, [2, 2, 2, 2], **kwargs)
195 | return model
196 |
197 | def resnet34(**kwargs):
198 | """Constructs a ResNet-34 model.
199 | """
200 | model = PreActivationResNet(PreActivationBasicBlock, [3, 4, 6, 3], **kwargs)
201 | return model
202 |
203 |
204 | def resnet50(**kwargs):
205 | """Constructs a ResNet-50 model.
206 | """
207 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 6, 3], **kwargs)
208 | return model
209 |
210 | def resnet101(**kwargs):
211 | """Constructs a ResNet-101 model.
212 | """
213 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 23, 3], **kwargs)
214 | return model
215 |
216 | def resnet152(**kwargs):
217 |     """Constructs a ResNet-152 model.
218 | """
219 | model = PreActivationResNet(PreActivationBottleneck, [3, 8, 36, 3], **kwargs)
220 | return model
221 |
222 | def resnet200(**kwargs):
223 |     """Constructs a ResNet-200 model.
224 | """
225 | model = PreActivationResNet(PreActivationBottleneck, [3, 24, 36, 3], **kwargs)
226 | return model
227 |
--------------------------------------------------------------------------------
/tools/models/resnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import math
6 | from functools import partial
7 |
8 | __all__ = ['ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200']
9 |
10 |
11 | def conv3x3x3(in_planes, out_planes, stride=1):
12 | # 3x3x3 convolution with padding
13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3,
14 | stride=stride, padding=1, bias=False)
15 |
16 |
17 | def downsample_basic_block(x, planes, stride):
18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride)
19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
20 | out.size(2), out.size(3),
21 | out.size(4)).zero_()
22 | if isinstance(out.data, torch.cuda.FloatTensor):
23 | zero_pads = zero_pads.cuda()
24 |
25 | out = Variable(torch.cat([out.data, zero_pads], dim=1))
26 |
27 | return out
28 |
29 |
30 | class BasicBlock(nn.Module):
31 | expansion = 1
32 |
33 | def __init__(self, inplanes, planes, stride=1, downsample=None):
34 | super(BasicBlock, self).__init__()
35 | self.conv1 = conv3x3x3(inplanes, planes, stride)
36 | self.bn1 = nn.BatchNorm3d(planes)
37 | self.relu = nn.ReLU(inplace=True)
38 | self.conv2 = conv3x3x3(planes, planes)
39 | self.bn2 = nn.BatchNorm3d(planes)
40 | self.downsample = downsample
41 | self.stride = stride
42 |
43 | def forward(self, x):
44 | residual = x
45 |
46 | out = self.conv1(x)
47 | out = self.bn1(out)
48 | out = self.relu(out)
49 |
50 | out = self.conv2(out)
51 | out = self.bn2(out)
52 |
53 | if self.downsample is not None:
54 | residual = self.downsample(x)
55 |
56 | out += residual
57 | out = self.relu(out)
58 |
59 | return out
60 |
61 |
62 | class Bottleneck(nn.Module):
63 | expansion = 4
64 |
65 | def __init__(self, inplanes, planes, stride=1, downsample=None):
66 | super(Bottleneck, self).__init__()
67 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
68 | self.bn1 = nn.BatchNorm3d(planes)
69 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride,
70 | padding=1, bias=False)
71 | self.bn2 = nn.BatchNorm3d(planes)
72 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)
73 | self.bn3 = nn.BatchNorm3d(planes * 4)
74 | self.relu = nn.ReLU(inplace=True)
75 | self.downsample = downsample
76 | self.stride = stride
77 |
78 | def forward(self, x):
79 | residual = x
80 |
81 | out = self.conv1(x)
82 | out = self.bn1(out)
83 | out = self.relu(out)
84 |
85 | out = self.conv2(out)
86 | out = self.bn2(out)
87 | out = self.relu(out)
88 |
89 | out = self.conv3(out)
90 | out = self.bn3(out)
91 |
92 | if self.downsample is not None:
93 | residual = self.downsample(x)
94 |
95 | out += residual
96 | out = self.relu(out)
97 |
98 | return out
99 |
100 |
101 | class ResNet(nn.Module):
102 |
103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True):
104 | self.last_fc = last_fc
105 |
106 | self.inplanes = 64
107 | super(ResNet, self).__init__()
108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2),
109 | padding=(3, 3, 3), bias=False)
110 | self.bn1 = nn.BatchNorm3d(64)
111 | self.relu = nn.ReLU(inplace=True)
112 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2)
115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2)
116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2)
117 | last_duration = math.ceil(sample_duration / 16)
118 | last_size = math.ceil(sample_size / 32)
119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1)
120 | self.fc = nn.Linear(512 * block.expansion, num_classes)
121 |
122 | for m in self.modules():
123 | if isinstance(m, nn.Conv3d):
124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
125 | m.weight.data.normal_(0, math.sqrt(2. / n))
126 | elif isinstance(m, nn.BatchNorm3d):
127 | m.weight.data.fill_(1)
128 | m.bias.data.zero_()
129 |
130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
131 | downsample = None
132 | if stride != 1 or self.inplanes != planes * block.expansion:
133 | if shortcut_type == 'A':
134 | downsample = partial(downsample_basic_block,
135 | planes=planes * block.expansion,
136 | stride=stride)
137 | else:
138 | downsample = nn.Sequential(
139 | nn.Conv3d(self.inplanes, planes * block.expansion,
140 | kernel_size=1, stride=stride, bias=False),
141 | nn.BatchNorm3d(planes * block.expansion)
142 | )
143 |
144 | layers = []
145 | layers.append(block(self.inplanes, planes, stride, downsample))
146 | self.inplanes = planes * block.expansion
147 | for i in range(1, blocks):
148 | layers.append(block(self.inplanes, planes))
149 |
150 | return nn.Sequential(*layers)
151 |
152 | def forward(self, x):
153 | x = self.conv1(x)
154 | x = self.bn1(x)
155 | x = self.relu(x)
156 | x = self.maxpool(x)
157 |
158 | x = self.layer1(x)
159 | x = self.layer2(x)
160 | x = self.layer3(x)
161 | x = self.layer4(x)
162 |
163 |
164 | x = self.avgpool(x)
165 |
166 | x = x.view(x.size(0), -1)
167 | if self.last_fc:
168 | x = self.fc(x)
169 |
170 | return x
171 |
172 |
173 | def get_fine_tuning_parameters(model, ft_begin_index):
174 | if ft_begin_index == 0:
175 | return model.parameters()
176 |
177 | ft_module_names = []
178 | for i in range(ft_begin_index, 5):
179 |         ft_module_names.append('layer{}'.format(i))
180 | ft_module_names.append('fc')
181 |
182 | parameters = []
183 | for k, v in model.named_parameters():
184 | for ft_module in ft_module_names:
185 | if ft_module in k:
186 | parameters.append({'params': v})
187 | break
188 | else:
189 | parameters.append({'params': v, 'lr': 0.0})
190 |
191 | return parameters
192 |
193 |
194 | def resnet10(**kwargs):
195 |     """Constructs a ResNet-10 model.
196 | """
197 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs)
198 | return model
199 |
200 | def resnet18(**kwargs):
201 | """Constructs a ResNet-18 model.
202 | """
203 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
204 | return model
205 |
206 | def resnet34(**kwargs):
207 | """Constructs a ResNet-34 model.
208 | """
209 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
210 | return model
211 |
212 | def resnet50(**kwargs):
213 | """Constructs a ResNet-50 model.
214 | """
215 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
216 | return model
217 |
218 | def resnet101(**kwargs):
219 | """Constructs a ResNet-101 model.
220 | """
221 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
222 | return model
223 |
224 | def resnet152(**kwargs):
225 |     """Constructs a ResNet-152 model.
226 | """
227 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
228 | return model
229 |
230 | def resnet200(**kwargs):
231 |     """Constructs a ResNet-200 model.
232 | """
233 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs)
234 | return model
235 |
--------------------------------------------------------------------------------
/tools/models/resnext.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import math
6 | from functools import partial
7 |
8 | __all__ = ['ResNeXt', 'resnet50', 'resnet101', 'resnet152']
9 |
10 |
11 | def conv3x3x3(in_planes, out_planes, stride=1):
12 | # 3x3x3 convolution with padding
13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3,
14 | stride=stride, padding=1, bias=False)
15 |
16 |
17 | def downsample_basic_block(x, planes, stride):
18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride)
19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
20 | out.size(2), out.size(3),
21 | out.size(4)).zero_()
22 | if isinstance(out.data, torch.cuda.FloatTensor):
23 | zero_pads = zero_pads.cuda()
24 |
25 | out = Variable(torch.cat([out.data, zero_pads], dim=1))
26 |
27 | return out
28 |
29 |
30 | class ResNeXtBottleneck(nn.Module):
31 | expansion = 2
32 |
33 | def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None):
34 | super(ResNeXtBottleneck, self).__init__()
35 | mid_planes = cardinality * int(planes / 32)
36 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False)
37 | self.bn1 = nn.BatchNorm3d(mid_planes)
38 | self.conv2 = nn.Conv3d(mid_planes, mid_planes, kernel_size=3, stride=stride,
39 | padding=1, groups=cardinality, bias=False)
40 | self.bn2 = nn.BatchNorm3d(mid_planes)
41 | self.conv3 = nn.Conv3d(mid_planes, planes * self.expansion, kernel_size=1, bias=False)
42 | self.bn3 = nn.BatchNorm3d(planes * self.expansion)
43 | self.relu = nn.ReLU(inplace=True)
44 | self.downsample = downsample
45 | self.stride = stride
46 |
47 | def forward(self, x):
48 | residual = x
49 |
50 | out = self.conv1(x)
51 | out = self.bn1(out)
52 | out = self.relu(out)
53 |
54 | out = self.conv2(out)
55 | out = self.bn2(out)
56 | out = self.relu(out)
57 |
58 | out = self.conv3(out)
59 | out = self.bn3(out)
60 |
61 | if self.downsample is not None:
62 | residual = self.downsample(x)
63 |
64 | out += residual
65 | out = self.relu(out)
66 |
67 | return out
68 |
69 |
70 | class ResNeXt(nn.Module):
71 |
72 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', cardinality=32, num_classes=400, last_fc=True):
73 | self.last_fc = last_fc
74 |
75 | self.inplanes = 64
76 | super(ResNeXt, self).__init__()
77 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2),
78 | padding=(3, 3, 3), bias=False)
79 | self.bn1 = nn.BatchNorm3d(64)
80 | self.relu = nn.ReLU(inplace=True)
81 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
82 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, cardinality)
83 | self.layer2 = self._make_layer(block, 256, layers[1], shortcut_type, cardinality, stride=2)
84 | self.layer3 = self._make_layer(block, 512, layers[2], shortcut_type, cardinality, stride=2)
85 | self.layer4 = self._make_layer(block, 1024, layers[3], shortcut_type, cardinality, stride=2)
86 | last_duration = math.ceil(sample_duration / 16)
87 | last_size = math.ceil(sample_size / 32)
88 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1)
89 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes)
90 |
91 | for m in self.modules():
92 | if isinstance(m, nn.Conv3d):
93 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
94 | m.weight.data.normal_(0, math.sqrt(2. / n))
95 | elif isinstance(m, nn.BatchNorm3d):
96 | m.weight.data.fill_(1)
97 | m.bias.data.zero_()
98 |
99 | def _make_layer(self, block, planes, blocks, shortcut_type, cardinality, stride=1):
100 | downsample = None
101 | if stride != 1 or self.inplanes != planes * block.expansion:
102 | if shortcut_type == 'A':
103 | downsample = partial(downsample_basic_block,
104 | planes=planes * block.expansion,
105 | stride=stride)
106 | else:
107 | downsample = nn.Sequential(
108 | nn.Conv3d(self.inplanes, planes * block.expansion,
109 | kernel_size=1, stride=stride, bias=False),
110 | nn.BatchNorm3d(planes * block.expansion)
111 | )
112 |
113 | layers = []
114 | layers.append(block(self.inplanes, planes, cardinality, stride, downsample))
115 | self.inplanes = planes * block.expansion
116 | for i in range(1, blocks):
117 | layers.append(block(self.inplanes, planes, cardinality))
118 |
119 | return nn.Sequential(*layers)
120 |
121 | def forward(self, x):
122 | x = self.conv1(x)
123 | x = self.bn1(x)
124 | x = self.relu(x)
125 | x = self.maxpool(x)
126 |
127 | x = self.layer1(x)
128 | x = self.layer2(x)
129 | x = self.layer3(x)
130 | x = self.layer4(x)
131 |
132 |
133 | x = self.avgpool(x)
134 | x = x.view(x.size(0), -1)
135 | if self.last_fc:
136 | x = self.fc(x)
137 |
138 | return x
139 |
140 | def get_fine_tuning_parameters(model, ft_begin_index):
141 | if ft_begin_index == 0:
142 | return model.parameters()
143 |
144 | ft_module_names = []
145 | for i in range(ft_begin_index, 5):
146 |         ft_module_names.append('layer{}'.format(i))
147 | ft_module_names.append('fc')
148 |
149 | parameters = []
150 | for k, v in model.named_parameters():
151 | for ft_module in ft_module_names:
152 | if ft_module in k:
153 | parameters.append({'params': v})
154 | break
155 | else:
156 | parameters.append({'params': v, 'lr': 0.0})
157 |
158 | return parameters
159 |
160 | def resnet50(**kwargs):
161 |     """Constructs a ResNeXt-50 model.
162 | """
163 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs)
164 | return model
165 |
166 | def resnet101(**kwargs):
167 |     """Constructs a ResNeXt-101 model.
168 | """
169 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs)
170 | return model
171 |
172 | def resnet152(**kwargs):
173 |     """Constructs a ResNeXt-152 model.
174 | """
175 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs)
176 | return model
177 |
--------------------------------------------------------------------------------
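A minimal sketch of the ResNeXt-101 motion-feature extractor above; it mirrors how build_resnext() in tools/preprocess_features.py instantiates the model, minus the Kinetics checkpoint loading and DataParallel wrapping:

    import torch
    from models import resnext

    model = resnext.resnet101(num_classes=400, shortcut_type='B', cardinality=32,
                              sample_size=112, sample_duration=16, last_fc=False)
    clip = torch.randn(1, 3, 16, 112, 112)  # (batch, channels, frames, height, width)
    print(model(clip).shape)                # -> torch.Size([1, 2048])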
/tools/models/wide_resnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import math
6 | from functools import partial
7 |
8 | __all__ = ['WideResNet', 'resnet50']
9 |
10 |
11 | def conv3x3x3(in_planes, out_planes, stride=1):
12 | # 3x3x3 convolution with padding
13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3,
14 | stride=stride, padding=1, bias=False)
15 |
16 |
17 | def downsample_basic_block(x, planes, stride):
18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride)
19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1),
20 | out.size(2), out.size(3),
21 | out.size(4)).zero_()
22 | if isinstance(out.data, torch.cuda.FloatTensor):
23 | zero_pads = zero_pads.cuda()
24 |
25 | out = Variable(torch.cat([out.data, zero_pads], dim=1))
26 |
27 | return out
28 |
29 |
30 | class WideBottleneck(nn.Module):
31 | expansion = 2
32 |
33 | def __init__(self, inplanes, planes, stride=1, downsample=None):
34 | super(WideBottleneck, self).__init__()
35 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
36 | self.bn1 = nn.BatchNorm3d(planes)
37 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride,
38 | padding=1, bias=False)
39 | self.bn2 = nn.BatchNorm3d(planes)
40 | self.conv3 = nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False)
41 | self.bn3 = nn.BatchNorm3d(planes * self.expansion)
42 | self.relu = nn.ReLU(inplace=True)
43 | self.downsample = downsample
44 | self.stride = stride
45 |
46 | def forward(self, x):
47 | residual = x
48 |
49 | out = self.conv1(x)
50 | out = self.bn1(out)
51 | out = self.relu(out)
52 |
53 | out = self.conv2(out)
54 | out = self.bn2(out)
55 | out = self.relu(out)
56 |
57 | out = self.conv3(out)
58 | out = self.bn3(out)
59 |
60 | if self.downsample is not None:
61 | residual = self.downsample(x)
62 |
63 | out += residual
64 | out = self.relu(out)
65 |
66 | return out
67 |
68 |
69 | class WideResNet(nn.Module):
70 |
71 | def __init__(self, block, layers, sample_size, sample_duration, k=1, shortcut_type='B', num_classes=400, last_fc=True):
72 | self.last_fc = last_fc
73 |
74 | self.inplanes = 64
75 | super(WideResNet, self).__init__()
76 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2),
77 | padding=(3, 3, 3), bias=False)
78 | self.bn1 = nn.BatchNorm3d(64)
79 | self.relu = nn.ReLU(inplace=True)
80 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
81 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type)
82 | self.layer2 = self._make_layer(block, 128 * k, layers[1], shortcut_type, stride=2)
83 | self.layer3 = self._make_layer(block, 256 * k, layers[2], shortcut_type, stride=2)
84 | self.layer4 = self._make_layer(block, 512 * k, layers[3], shortcut_type, stride=2)
85 | last_duration = math.ceil(sample_duration / 16)
86 | last_size = math.ceil(sample_size / 32)
87 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1)
88 | self.fc = nn.Linear(512 * k * block.expansion, num_classes)
89 |
90 | for m in self.modules():
91 | if isinstance(m, nn.Conv3d):
92 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
93 | m.weight.data.normal_(0, math.sqrt(2. / n))
94 | elif isinstance(m, nn.BatchNorm3d):
95 | m.weight.data.fill_(1)
96 | m.bias.data.zero_()
97 |
98 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
99 | downsample = None
100 | if stride != 1 or self.inplanes != planes * block.expansion:
101 | if shortcut_type == 'A':
102 | downsample = partial(downsample_basic_block,
103 | planes=planes * block.expansion,
104 | stride=stride)
105 | else:
106 | downsample = nn.Sequential(
107 | nn.Conv3d(self.inplanes, planes * block.expansion,
108 | kernel_size=1, stride=stride, bias=False),
109 | nn.BatchNorm3d(planes * block.expansion)
110 | )
111 |
112 | layers = []
113 | layers.append(block(self.inplanes, planes, stride, downsample))
114 | self.inplanes = planes * block.expansion
115 | for i in range(1, blocks):
116 | layers.append(block(self.inplanes, planes))
117 |
118 | return nn.Sequential(*layers)
119 |
120 | def forward(self, x):
121 | x = self.conv1(x)
122 | x = self.bn1(x)
123 | x = self.relu(x)
124 | x = self.maxpool(x)
125 |
126 | x = self.layer1(x)
127 | x = self.layer2(x)
128 | x = self.layer3(x)
129 | x = self.layer4(x)
130 |
131 | x = self.avgpool(x)
132 |
133 | x = x.view(x.size(0), -1)
134 | if self.last_fc:
135 | x = self.fc(x)
136 |
137 | return x
138 |
139 | def get_fine_tuning_parameters(model, ft_begin_index):
140 | if ft_begin_index == 0:
141 | return model.parameters()
142 |
143 | ft_module_names = []
144 | for i in range(ft_begin_index, 5):
145 |         ft_module_names.append('layer{}'.format(i))
146 | ft_module_names.append('fc')
147 |
148 | parameters = []
149 | for k, v in model.named_parameters():
150 | for ft_module in ft_module_names:
151 | if ft_module in k:
152 | parameters.append({'params': v})
153 | break
154 | else:
155 | parameters.append({'params': v, 'lr': 0.0})
156 |
157 | return parameters
158 |
159 | def resnet50(**kwargs):
160 |     """Constructs a Wide ResNet-50 model.
161 | """
162 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs)
163 | return model
164 |
--------------------------------------------------------------------------------
/tools/object_align.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.insert(0, '../')
4 | import h5py
5 | import os.path as osp
6 | import numpy as np
7 | from sklearn.metrics.pairwise import pairwise_distances
8 | from sklearn.preprocessing import normalize
9 | from util import load_file, save_to
10 |
11 |
12 |
13 | def align_object(video_feature_path, mode):
14 | bbox_feat_file = osp.join(video_feature_path, 'region_8c10b_{}.h5'.format(mode))
15 | print('Load {}...'.format(bbox_feat_file))
16 | out_file = osp.join(bbox_feat_file+'.h5')
17 | fout = h5py.File(out_file, 'w')
18 | string_dt = h5py.special_dtype(vlen=str)
19 | with h5py.File(bbox_feat_file, 'r') as fp:
20 | vids = fp['ids']
21 | feats = fp['feat']
22 | bboxes = fp['bbox']
23 | fout.create_dataset('ids', shape=vids.shape, dtype=string_dt, data=vids)
24 |
25 | feat_alns, bbox_alns = [], []
26 | for id, (vid, feat, bbox) in enumerate(zip(vids, feats, bboxes)):
27 |
28 | cnum, fnum, rnum, _ = feat.shape
29 | cur_feat_aln, cur_bbox_aln = [], []
30 | for cid, (cur_feat, cur_bbox) in enumerate(zip(feat, bbox)):
31 | vid_feat_aln, vid_bbox_aln = align(cur_feat, cur_bbox, vid, cid)
32 | cur_feat_aln.append(vid_feat_aln)
33 | cur_bbox_aln.append(vid_bbox_aln)
34 |
35 | feat_alns.append(cur_feat_aln)
36 | bbox_alns.append(cur_bbox_aln)
37 | if id % 100 == 0:
38 | print(f'{id}/{len(vids)}')
39 |
40 | feat_alns = np.asarray(feat_alns)
41 | bbox_alns = np.asarray(bbox_alns)
42 | print(feat_alns.shape, bbox_alns.shape)
43 |
44 | fout.create_dataset('feat', shape=feat_alns.shape, dtype=np.float32, data=feat_alns)
45 | fout.create_dataset('bbox', shape=bbox_alns.shape, dtype=np.float32, data=bbox_alns)
46 |
47 |
48 | def align_object_byv(video_feature_path, vlist_file):
49 | vlist = load_file(vlist_file)
50 | indir = osp.join(video_feature_path, 'bbox_feat')
51 | outdir = osp.join(video_feature_path, 'bbox_feat_aln')
52 | vnum = len(vlist)
53 | print(vnum)
54 | for idx, vid in enumerate(vlist):
55 | if idx <= 8000: continue
56 | if idx > 10000: break
57 | outfile = osp.join(outdir, vid+'.npz')
58 | if osp.exists(outfile):
59 | continue
60 | infile = osp.join(indir, vid+'.npz')
61 | region_feat = np.load(infile)
62 |
63 | roi_feat, roi_bbox = align_feat_bbox(region_feat['feat'][:8], region_feat['bbox'][:8], vid)
64 | out_dir = osp.dirname(outfile)
65 | if not osp.exists(out_dir):
66 | os.makedirs(out_dir)
67 | np.savez_compressed(outfile, feat=roi_feat, bbox=roi_bbox)
68 | if idx % 100 == 0:
69 | print(f'{idx}/{vnum}', outfile)
70 | print(roi_feat.shape, roi_bbox.shape)
71 |
72 |
73 | def align_feat_bbox(feat, bbox, vid):
74 | cur_feat_aln, cur_bbox_aln = [], []
75 | for cid, (cur_feat, cur_bbox) in enumerate(zip(feat, bbox)):
76 | vid_feat_aln, vid_bbox_aln = align(cur_feat, cur_bbox, vid, cid)
77 | cur_feat_aln.append(vid_feat_aln)
78 | cur_bbox_aln.append(vid_bbox_aln)
79 | return np.asarray(cur_feat_aln), np.asarray(cur_bbox_aln)
80 |
81 |
82 | def align(feats, bboxes, vid, cid):
83 | new_feats, new_bboxes = [], []
84 | paths = get_tracks(feats, bboxes, vid, cid)
85 | for i in range(len(paths)):
86 | obj_feat, obj_pos = [], []
87 | for fid in range(len(feats)):
88 | feat = feats[fid][paths[i][fid]]
89 | bbox = bboxes[fid][paths[i][fid]]
90 | obj_feat.append(feat)
91 | obj_pos.append(bbox)
92 | new_feats.append(obj_feat)
93 | new_bboxes.append(obj_pos)
94 | new_feats = np.asarray(new_feats).transpose(1, 0, 2)
95 | new_bboxes = np.asarray(new_bboxes).transpose(1, 0, 2)
96 | return new_feats, new_bboxes
97 |
98 |
99 | def get_tracks(feats, bboxes, vid, cid):
100 | links = get_link(feats, bboxes)
101 | paths = []
102 | for i in range(bboxes.shape[1]):
103 | max_path = find_max_path_greedy(links, i)
104 | links = update_links(links, max_path)
105 | max_path = [i] + max_path
106 | paths.append(max_path)
107 | # vis_path(vid, cid, bboxes, max_path)
108 | # break
109 | return paths
110 |
111 |
112 | def get_link(feats, bboxes):
113 | fnum = feats.shape[0]
114 | link_cretiria = []
115 | for fid in range(fnum-1):
116 | feat_p, feat_n = feats[fid], feats[fid+1]
117 | sim_f = pairwise_distances(feat_p, feat_n, 'cosine', n_jobs=1)
118 | sim_f = 1-sim_f
119 | box_p, box_n = bboxes[fid], bboxes[fid+1]
120 | areas_p = np.array([get_area(bbox) for bbox in box_p])
121 | areas_n = np.array([get_area(bbox) for bbox in box_n])
122 | op_box = []
123 | for bid, bbox in enumerate(box_p):
124 | area_p = areas_p[bid]
125 | x1 = np.maximum(bbox[0], box_n[:, 0])
126 | y1 = np.maximum(bbox[1], box_n[:, 1])
127 | x2 = np.minimum(bbox[2], box_n[:, 2])
128 | y2 = np.minimum(bbox[3], box_n[:, 3])
129 | W = np.maximum(0, x2 - x1 + 1)
130 | H = np.maximum(0, y2 - y1 + 1)
131 | ov_area = W * H
132 | IoUs = ov_area / (area_p + areas_n - ov_area)
133 | op_box.append(IoUs)
134 | scores = np.asarray(op_box) + sim_f #equal importance
135 | link_cretiria.append(scores)
136 | return np.asarray(link_cretiria)
137 |
138 |
139 | def update_links(links, max_path):
140 | """
141 | remove the nodes at the max_path
142 | """
143 | for i, v in enumerate(max_path):
144 | links[i][v] = 0
145 | return links
146 |
147 |
148 | def find_max_path_greedy(link_scores, sid):
149 | path = []
150 | for i in range(link_scores.shape[0]):
151 | sid = np.argmax(link_scores[i][sid])
152 | path.append(sid)
153 | return path
154 |
155 |
156 | def get_area(bbox):
157 | area = (bbox[2]-bbox[0]+1)*(bbox[3]-bbox[1]+1)
158 | return area
159 |
160 |
161 | def main():
162 | video_feature_path = f'../../data/feats/nextqa/region_feat_n/'
163 | align_object(video_feature_path, 'test')
164 | # dataset_dir = '../../data/datasets/nextqa/test.csv'
165 | # vlist_file = dataset_dir + 'vlist.json'
166 | # if osp.exists(vlist_file):
167 | # vlist = load_file(vlist_file)
168 | # else:
169 | # data = load_file(dataset_dir)
170 | # vlist = list(set(list(data['video_id'])))
171 | # save_to(vlist_file, vlist)
172 | # align_object_byv(video_feature_path, vlist_file)
173 |
174 |
175 | if __name__ == "__main__":
176 | main()
177 |
--------------------------------------------------------------------------------
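A toy illustration of the greedy linking used above: find_max_path_greedy follows the highest combined IoU + feature-similarity score from frame to frame, and update_links zeroes out the chosen detections so the next track cannot reuse them (the scores below are made up; run from tools/ with scikit-learn installed so importing object_align works):

    import numpy as np
    from object_align import find_max_path_greedy, update_links

    # link scores for 3 detections across 2 consecutive-frame transitions
    links = np.array([[[0.9, 0.1, 0.0],
                       [0.2, 0.8, 0.0],
                       [0.0, 0.1, 0.9]],
                      [[0.7, 0.2, 0.1],
                       [0.1, 0.9, 0.0],
                       [0.0, 0.2, 0.8]]])
    path = find_max_path_greedy(links, 0)        # start from detection 0 in the first frame
    print([0] + [int(p) for p in path])          # -> [0, 0, 0]
    links = update_links(links, path)            # zero out this track's detections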
/tools/preprocess_features.py:
--------------------------------------------------------------------------------
1 | import argparse, os
2 | import h5py
3 | from scipy.misc import imresize
4 | import skvideo.io as sio
5 | from PIL import Image
6 | import cv2
7 | import json
8 | import torch
9 | from torch import nn
10 | import torchvision
11 | import random
12 | import numpy as np
13 | import shutil
14 | import subprocess
15 | from models import resnext
16 | from datautils import utils
17 | from datautils import tgif_qa
18 | from datautils import msrvtt_qa
19 | from datautils import msvd_qa
20 | import os.path as osp
21 | import sys
22 | sys.path.insert(0, '../')
23 | import time
24 | from util import load_file, save_to
25 |
26 |
27 | def build_resnet():
28 | if not hasattr(torchvision.models, args.model):
29 | raise ValueError('Invalid model "%s"' % args.model)
30 |     if 'resnet' not in args.model:
31 | raise ValueError('Feature extraction only supports ResNets')
32 | cnn = getattr(torchvision.models, args.model)(pretrained=True)
33 | model = torch.nn.Sequential(*list(cnn.children())[:-1])
34 |
35 | model.cuda()
36 | model.eval()
37 | return model
38 |
39 |
40 | def build_resnext():
41 | model = resnext.resnet101(num_classes=400, shortcut_type='B', cardinality=32,
42 | sample_size=112, sample_duration=16,
43 | last_fc=False)
44 | model = model.cuda()
45 | model = nn.DataParallel(model, device_ids=None)
46 | assert os.path.exists('../../data/pretrained/resnext-101-kinetics.pth')
47 | # download from https://drive.google.com/drive/folders/1zvl89AgFAApbH0At-gMuZSeQB_LpNP-M
48 | model_data = torch.load('../../data/pretrained/resnext-101-kinetics.pth', map_location='cpu')
49 | model.load_state_dict(model_data['state_dict'])
50 | model.eval()
51 | return model
52 |
53 | def extract_frame(video, dst):
54 | with open(os.devnull, 'w') as ffmpeg_log:
55 | if os.path.exists(dst):
56 | # print(" cleanup: "+dst+"/")
57 | shutil.rmtree(dst)
58 | os.makedirs(dst)
59 | video2frame_cmd = [
60 | "ffmpeg",
61 | '-y',
62 | '-i', video,
63 | '-r', "10",
64 | # '-vf', "scale=400:300",
65 | '-vsync', '0',
66 | '-qscale:v', "2",
67 | '{0}/%06d.jpg'.format(dst)
68 | ]
69 | subprocess.call(video2frame_cmd, stdout = ffmpeg_log, stderr=ffmpeg_log)
70 |
71 |
72 | def run_batch(cur_batch, model):
73 | """
74 | Args:
75 | cur_batch: treat a video as a batch of images
76 | model: ResNet model for feature extraction
77 | Returns:
78 | ResNet extracted feature.
79 | """
80 | mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1)
81 |     std = np.array([0.229, 0.224, 0.225]).reshape(1, 3, 1, 1)
82 |
83 | image_batch = np.concatenate(cur_batch, 0).astype(np.float32)
84 | image_batch = (image_batch / 255.0 - mean) / std
85 | image_batch = torch.FloatTensor(image_batch).cuda()
86 | with torch.no_grad():
87 | image_batch = torch.autograd.Variable(image_batch)
88 |
89 | feats = model(image_batch)
90 | feats = feats.data.cpu().clone().numpy()
91 |
92 | return feats
93 |
94 |
95 | def extract_clips_with_consecutive_frames(path, num_clips, num_frames_per_clip):
96 | """
97 | Args:
98 | path: path of a video
99 |         num_clips: expected number of clips to split the video into
100 |         num_frames_per_clip: number of frames in a single clip, pretrained model only supports 16 frames
101 |     Returns:
102 |         A list of raw frame clips and a validity flag.
103 |     """
104 |
105 | clips = list()
106 | t1 = time.time()
107 | frame_list = sorted(os.listdir(path))
108 | video_data = [np.asarray(Image.open(osp.join(path, img))) for img in frame_list]
109 |
110 | valid = True
111 | video_data = np.asarray(video_data)
112 | t2 = time.time()
113 | print(t2-t1)
114 |
115 | total_frames = video_data.shape[0]
116 | img_size = (args.image_height, args.image_width)
117 | for i in np.linspace(0, total_frames, num_clips + 2, dtype=np.int32)[1:num_clips + 1]:
118 | clip_start = int(i) - int(num_frames_per_clip / 2)
119 | clip_end = int(i) + int(num_frames_per_clip / 2)
120 | if clip_start < 0:
121 | clip_start = 0
122 | if clip_end > total_frames:
123 | clip_end = total_frames - 1
124 | clip = video_data[clip_start:clip_end]
125 |
126 | if clip_start == 0:
127 | shortage = num_frames_per_clip - (clip_end - clip_start)
128 | added_frames = []
129 | for _ in range(shortage):
130 | added_frames.append(np.expand_dims(video_data[clip_start], axis=0))
131 | if len(added_frames) > 0:
132 | added_frames = np.concatenate(added_frames, axis=0)
133 | clip = np.concatenate((added_frames, clip), axis=0)
134 | if clip_end == (total_frames - 1):
135 | shortage = num_frames_per_clip - (clip_end - clip_start)
136 | added_frames = []
137 | for _ in range(shortage):
138 | added_frames.append(np.expand_dims(video_data[clip_end], axis=0))
139 | if len(added_frames) > 0:
140 | added_frames = np.concatenate(added_frames, axis=0)
141 | clip = np.concatenate((clip, added_frames), axis=0)
142 |
143 |
144 | # new_clip = clip #.transpose(0, 3, 1, 2)[None]
145 | # if clip.shape[0] < num_frames_per_clip:
146 | clip = clip[::4] #sample 4 frames per clip
147 | new_clip = []
148 | # for j in range(num_frames_per_clip):
149 | # if j >= len(clip):
150 | # new_clip.append(new_clip[-1])
151 | # else:
152 | # new_clip.append(clip[j])
153 | for frame_data in clip:
154 | # frame_data = clip[j]
155 | img = Image.fromarray(frame_data)
156 | img = imresize(img, img_size, interp='bicubic')
157 | frame_data = np.array(img)
158 | frame_data = frame_data.transpose(2, 0, 1)[None]
159 | new_clip.append(frame_data)
160 |         new_clip = np.asarray(new_clip)  # (num_frames, 1, channels, height, width)
161 | # print(new_clip.shape)
162 | if args.model in ['resnext101']:
163 | new_clip = np.squeeze(new_clip)
164 | new_clip = np.transpose(new_clip, axes=(1, 0, 2, 3))
165 | clips.append(new_clip)
166 |
167 | clips = clips[::4] # sample 8 clips per video
168 | t3 = time.time()
169 |
170 | return clips, valid
171 |
172 | def extract_clip_frames(vpath, clips):
173 |     """
174 |     Args:
175 |         vpath: path of a video file
176 |         clips: list of frame-id lists, one list per clip
177 |     Returns:
178 |         A list of resized frame arrays (4 frames from each of the first 8 clips),
179 |         or None if the video cannot be decoded.
180 |     """
181 | # para_dict = {'r':'10', 'vsync':'0', 'qscale:v':'2'}
182 | # print(vpath)
183 | # rate = 10
184 | # meta = skvideo.io.ffprobe(vpath)
185 | # fp = meta['video']['@avg_frame_rate']
186 | # tstamp = int(fp.split('/')[0])//rate
187 | try:
188 | video_data = sio.vread(vpath) #ffmpeg as backend
189 | except:
190 | return None
191 | # video_data = video_data[::tstamp]
192 | total_frames, width, height, channel = video_data.shape
193 | # print(video_data.shape)
194 | img_size = (224, 224) #(args.image_height, args.image_width)
195 | img_clip = []
196 | num_clip = 8
197 | clips = clips[:8]
198 | for i, cids in enumerate(clips):
199 | # if i > 7: break
200 | fids = [int(r) for r in cids]
201 | # print(fids, video_data.shape)
202 | if fids[-1] >= total_frames:
203 | fids[-1] = total_frames -1
204 | clip = video_data[fids]
205 | new_clip = []
206 | for j in range(4):
207 | frame_data = clip[j]
208 | img = Image.fromarray(frame_data)
209 | img = imresize(img, img_size, interp='bicubic')
210 | img = img.transpose(2, 0, 1)[None]
211 | frame_data = np.array(img)
212 | new_clip.append(frame_data)
213 | # new_clip = np.asarray(new_clip) # (num_frames, width, height, channels)
214 | img_clip.extend(new_clip)
215 |
216 | return img_clip
217 |
218 |
219 | def generate_npy(model, video_dir, clip_file, outfile):
220 |
221 | vclips = load_file(clip_file)
222 | vclips = sorted(vclips.items(), key=lambda a:a[0])
223 | dataset_size = len(vclips)
224 | print(dataset_size)
225 |
226 | i0 = 0
227 | _t = {'misc': utils.Timer()}
228 | for i, (vname, clip) in enumerate(vclips):
229 | #if i <= 4000: continue
230 | #if i > 10000: break
231 | out_file = osp.join(outfile, vname+'.npy')
232 | if osp.exists(out_file):
233 | continue
234 | video_path = osp.join(video_dir, vname+'.mp4')
235 | if not osp.exists(video_path):
236 | # print(video_path)
237 | continue
238 | clips = extract_clip_frames(video_path, clip)
239 |         if clips is None: continue
240 | clips = np.asarray(clips)
241 | clip_feat = run_batch(clips, model)
242 | clip_feat = clip_feat.squeeze()#(32, 2048)
243 |
244 | feat = clip_feat.reshape(8, 4, 2048)
245 | dirname = osp.dirname(out_file)
246 | if not osp.exists(dirname):
247 | os.makedirs(dirname)
248 | np.save(out_file, feat)
249 | if i % 200 == 0:
250 | print(f'{i}/{dataset_size}')
251 |
252 | def prepare_inputs(path, frame_list):
253 | video_data = [np.asarray(Image.open(osp.join(path, img))) for img in frame_list]
254 | video_data = np.asarray(video_data)
255 | total_frames = video_data.shape[0]
256 | img_size = (224, 224)
257 | video_inputs = []
258 | for j in range(total_frames):
259 | frame_data = video_data[j]
260 | img = Image.fromarray(frame_data)
261 | img = imresize(img, img_size, interp='bicubic')
262 | img = img.transpose(2, 0, 1)[None]
263 | frame_data = np.array(img)
264 | video_inputs.append(frame_data)
265 | video_inputs = np.asarray(video_inputs)
266 | # print(video_inputs.shape)
267 | return video_inputs
268 |
269 | def generate_npy_byframe(model, video_list_file, video_dir, out_dir):
270 | videos = load_file(video_list_file)
271 | vnum = len(videos)
272 | for iv, vname in enumerate(videos):
273 | # if iv <= 2400: continue
274 | # if iv > 3000: break
275 | fpath = f'{video_dir}/{vname}'
276 | frames = sorted(os.listdir(fpath))
277 | out_path = osp.join(out_dir, vname)
278 | if osp.exists(out_path): continue
279 | videos = prepare_inputs(fpath, frames)
280 | fnum = videos.shape[0]
281 | if fnum > 100:
282 | it = fnum//100
283 | left = fnum % 100
284 | video_feats = []
285 | for i in range(it):
286 | data = run_batch(videos[i*100:100*(i+1)], model)
287 | video_feats.append(data)
288 | if left > 0:
289 |                 data = run_batch(videos[it*100:(it*100)+left], model)
290 | video_feats.append(data)
291 | # print(len(video_feats))
292 | video_feats = np.concatenate(video_feats, 0)
293 | assert video_feats.shape[0] == fnum, 'error'
294 | else:
295 | video_feats = run_batch(videos, model)
296 | video_feats = video_feats.squeeze()
297 | if not osp.exists(out_path):
298 | os.makedirs(out_path)
299 | for iff, frame in enumerate(frames):
300 | fname = frame.split('.')[0]
301 | fpath_out = f'{out_path}/{fname}'
302 | # if osp.exists(fpath_out+'.npy'): continue
303 | np.save(fpath_out, video_feats[iff])
304 | if iv % 100 == 0:
305 | print(f'{iv}/{vnum}')
306 |
307 |
308 | def generate_h5(model, v_path, v_file, num_clips, outfile):
309 |     """
310 |     Args:
311 |         model: loaded pretrained model for feature extraction
312 |         v_path: directory of extracted video frames
313 |         v_file: file listing the video ids; num_clips: expected number of clips per video
314 |         outfile: path of output file to be written
315 |     Returns:
316 |         h5 file containing visual features of the sampled clips.
317 |     """
318 | if args.dataset == "tgif-qa":
319 | if not os.path.exists('dataset/tgif-qa/{}'.format(args.question_type)):
320 | os.makedirs('dataset/tgif-qa/{}'.format(args.question_type))
321 | else:
322 | if not os.path.exists(args.dataset):
323 | os.makedirs(args.dataset)
324 |
325 | vlist = load_file(v_file)
326 | dataset_size = len(vlist)
327 | print(dataset_size)
328 | vnames = []
329 | with h5py.File(outfile, 'w') as fd:
330 | feat_dset = None
331 | video_ids_dset = None
332 | i0 = 0
333 | _t = {'misc': utils.Timer()}
334 | for i in range(0, dataset_size):
335 | # if i < 20: continue
336 | _t['misc'].tic()
337 |
338 | video_path = osp.join(v_path, str(vlist[i]))
339 |
340 | clips, valid = extract_clips_with_consecutive_frames(video_path, num_clips=num_clips, num_frames_per_clip=16)
341 |
342 | nclip, nframe = 8, 4
343 | if args.feature_type == 'appearance':
344 | clip_feat = []
345 | if valid:
346 | # for clip_id, clip in enumerate(clips):
347 | # feats = run_batch(clip, model) # (16, 2048)
348 | # feats = feats.squeeze()
349 | # clip_feat.append(feats)
350 | # t4 = time.time()
351 | clips = np.asarray(clips).squeeze()
352 | clips = clips.reshape(clips.shape[0]*clips.shape[1], clips.shape[2],clips.shape[3],clips.shape[4])
353 |
354 | clips = torch.FloatTensor(clips).cuda().squeeze()
355 | # print(clips.shape)
356 | clip_feat = model(clips).squeeze()
357 | # print(clip_feat.shape)
358 | clip_feat = clip_feat.view(nclip, nframe, -1).detach().cpu().numpy()
359 | else:
360 | clip_feat = np.zeros(shape=(nclip, nframe, 2048))
361 |
362 | if feat_dset is None:
363 | print(clip_feat.shape)
364 | C, F, D = clip_feat.shape
365 | feat_dset = fd.create_dataset('resnet_features', (dataset_size, C, F, D),
366 | dtype=np.float32)
367 | video_ids_dset = fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
368 |
369 | elif args.feature_type == 'motion':
370 | if valid:
371 | clip_torch = torch.FloatTensor(np.asarray(clips)).cuda()
372 | clip_feat = model(clip_torch) # (8, 2048)
373 | clip_feat = clip_feat.squeeze()
374 | clip_feat = clip_feat.detach().cpu().numpy()
375 | else:
376 | clip_feat = np.zeros(shape=(nclip, 2048))
377 | if feat_dset is None:
378 | print(clip_feat.shape)
379 | C, D = clip_feat.shape
380 | feat_dset = fd.create_dataset('resnext_features', (dataset_size, C, D),
381 | dtype=np.float32)
382 | video_ids_dset = fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
383 |
384 |
385 | i1 = i0 + 1
386 | feat_dset[i0:i1] = clip_feat
387 | video_ids_dset[i0:i1] = int(vlist[i])
388 | i0 = i1
389 | _t['misc'].toc()
390 |
391 | if (i % 100 == 0):
392 | print('{:d}/{:d} {:.3f}s (projected finish: {:.2f} hours)' \
393 | .format(i1, dataset_size, _t['misc'].average_time,
394 | _t['misc'].average_time * (dataset_size - i1) / 3600))
395 |
396 | varry = np.array(vlist, dtype=object)
397 | string_dt = h5py.special_dtype(vlen=str)
398 | del fd['ids']; fd.create_dataset('ids', data=varry, dtype=string_dt)  # replace the per-video int ids written above with the string video names
399 |
400 |
401 | if __name__ == '__main__':
402 | parser = argparse.ArgumentParser()
403 | parser.add_argument('--gpu_id', type=int, default=0, help='specify which gpu will be used')
404 | # dataset info
405 | parser.add_argument('--dataset', default='nextqa', choices=['tgif-qa', 'msvd', 'star', 'msrvtt', 'nextqa','webvid', 'causalvid'], type=str)
406 | parser.add_argument('--question_type', default='none', choices=['frameqa', 'count', 'transition', 'action', 'none'], type=str)
407 | # output
408 | parser.add_argument('--out', dest='outfile',
409 | help='output filepath',
410 | default="../../data/nextqa/feat_{}.h5", type=str)
411 | # image sizes
412 | parser.add_argument('--num_clips', default=32, type=int)
413 | parser.add_argument('--image_height', default=112*2, type=int)
414 | parser.add_argument('--image_width', default=112*2, type=int)
415 |
416 | # network params
417 | parser.add_argument('--model', default='resnet101', choices=['resnet101', 'resnext101'], type=str)
418 | parser.add_argument('--seed', default='666', type=int, help='random seed')
419 | args = parser.parse_args()
420 | if args.model == 'resnet101':
421 | args.feature_type = 'appearance'
422 | elif args.model == 'resnext101':
423 | args.feature_type = 'motion'
424 | else:
425 | raise Exception('Feature type not supported!')
426 | # set gpu
427 | if args.model != 'resnext101':
428 | torch.cuda.set_device(args.gpu_id)
429 | torch.manual_seed(args.seed)
430 | np.random.seed(args.seed)
431 |
432 | # annotation files
433 | if args.dataset == 'tgif-qa':
434 | args.annotation_file = f'/storage_fast/jbxiao/workspace/VideoQA/data/{args.dataset}/videos.json'
435 | args.video_dir = '/raid/jbxiao/data/tgifqa/frames/'
436 | args.outfile = '../../data/{}/{}/{}_{}_{}_feat.h5'
437 | video_paths = tgif_qa.load_video_paths(args)
438 | random.shuffle(video_paths)
439 | # load model
440 | if args.model == 'resnet101':
441 | model = build_resnet()
442 | elif args.model == 'resnext101':
443 | model = build_resnext()
444 | generate_h5(model, video_paths, args.num_clips,
445 | args.outfile.format(args.dataset, args.question_type, args.dataset, args.question_type, args.feature_type))
446 |
447 | elif args.dataset == 'webvid':
448 | args.video_dir = '/raid/jbxiao/data/WebVid/videos/'
449 | if args.model == 'resnet101':
450 | model = build_resnet()
451 | elif args.model == 'resnext101':
452 | model = build_resnext()
453 | clip_file = f'/storage_fast/jbxiao/workspace/VideoQA/data/datasets/webvid/val_clip.json'
454 | generate_npy(model, args.video_dir, clip_file, args.outfile)
455 |
456 |
457 | elif args.dataset == 'msvd':
458 | args.annotation_file = '/ceph-g/lethao/datasets/msvd/MSVD-QA/{}_qa.json'
459 | args.video_dir = '/ceph-g/lethao/datasets/msvd/MSVD-QA/video/'
460 | args.video_name_mapping = '/ceph-g/lethao/datasets/msvd/youtube_mapping.txt'
461 | video_paths = msvd_qa.load_video_paths(args)
462 | random.shuffle(video_paths)
463 | # load model
464 | if args.model == 'resnet101':
465 | model = build_resnet()
466 | elif args.model == 'resnext101':
467 | model = build_resnext()
468 | generate_h5(model, video_paths, args.num_clips,
469 | args.outfile.format(args.dataset, args.dataset, args.feature_type))
470 |
471 | elif args.dataset == 'nextqa':
472 | args.video_list_file = '../datasets/nextqa/vlist.json'  # obtained from the train/val/test csv files
473 | args.video_dir = '/storage/jbxiao/workspace/data/nextqa/frames/'  # extracted video frames, refer to extract_video.py
474 | if args.model == 'resnet101':
475 | model = build_resnet()
476 | elif args.model == 'resnext101':
477 | model = build_resnext()
478 | args.image_height = 112
479 | args.image_width = 112
480 | generate_h5(model, args.video_dir, args.video_list_file, args.num_clips, args.outfile.format(args.feature_type))
481 |
--------------------------------------------------------------------------------
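The h5 written by generate_h5 above stores one row per video: a 'resnet_features' dataset for appearance (or 'resnext_features' for motion) plus an index-aligned 'ids' dataset. A minimal sketch, not part of the repository, of reading one video's features back under those assumptions:

    import h5py
    import numpy as np

    def read_appearance_feat(h5_path, index):
        # return the id and the feature row of the index-th video
        with h5py.File(h5_path, 'r') as fd:
            feat = np.asarray(fd['resnet_features'][index])
            vid = fd['ids'][index]
        return vid, feat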
/tools/split_dataset_feat.py:
--------------------------------------------------------------------------------
1 | # ====================================================
2 | # @Time : 6/5/21 1:32 PM
3 | # @Author : Xiao Junbin
4 | # @Email : junbin@comp.nus.edu.sg
5 | # @File : split_dataset_feat.py
6 | # ====================================================
7 | import h5py
8 | import numpy as np
9 | import os
10 | import os.path as osp
11 | import pandas as pd
12 |
13 |
14 | def np2h5(in_dir, out_dir, video_list, mode):
15 | out_file = osp.join(out_dir, 'region_16c20b_{}.h5'.format(mode))
16 | video_fd = h5py.File(out_file, 'w')
17 | feat_dset, bbox_dset, ids_dset = None, None, None
18 | bbox_num = 20
19 | for video in video_list:
20 | bbox_file = osp.join(in_dir, str(video) + '.npz')
21 | npz = np.load(bbox_file)
22 | roi_feat = npz['feat']
23 | bnum = roi_feat.shape[2]
24 | roi_bbox = npz['bbox']
25 | # if bnum < bbox_num:
26 | # add_num = bbox_num - bnum
27 | # print(add_num)
28 | # add_feat, add_bbox = [], []
29 | # for _ in range(add_num):
30 | # add_feat.append(roi_feat[:, :, bnum-1, :])
31 | # add_bbox.append(roi_bbox[:, :, bnum-1, :])
32 | # add_feat = np.asarray(add_feat).transpose(1, 2, 0, 3)
33 | # add_bbox = np.asarray(add_bbox).transpose(1, 2, 0, 3)
34 | # print(add_feat.shape, add_bbox.shape)
35 | # roi_feat = np.concatenate((roi_feat, add_feat), axis=2)
36 | # roi_bbox = np.concatenate((roi_bbox, add_bbox), axis=2)
37 |
38 | roi_feat = roi_feat[:, :, :bbox_num, :]
39 |
40 | roi_bbox = roi_bbox[:, :, :bbox_num, :]
41 | # print(roi_feat.shape, roi_bbox.shape)
42 | if feat_dset is None:
43 | dataset_size = len(video_list)
44 | C, F, R, D = roi_feat.shape
45 | feat_dset = video_fd.create_dataset('feat', (dataset_size, C, F, R, D),
46 | dtype=np.float32)
47 | ids_dset = video_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
48 | C, F, R, D = roi_bbox.shape
49 | bbox_dset = video_fd.create_dataset('bbox', shape=(dataset_size, C, F, R, D),
50 | dtype=np.float32)
51 | ival = 0
52 |
53 | feat_dset[ival:(ival + 1)] = roi_feat
54 | bbox_dset[ival:(ival + 1)] = roi_bbox
55 | ids_dset[ival:(ival + 1)] = int(video)
56 |
57 | ival += 1
58 | print('Save to {}'.format(out_file))
59 |
60 | def split_dataset_feat(filename, out_dir, train_list, val_list, test_list):
61 |
62 | train_fd = h5py.File(osp.join(out_dir, 'app_feat_train.h5'), 'w')
63 | val_fd = h5py.File(osp.join(out_dir, 'app_feat_val.h5'), 'w')
64 | test_fd = h5py.File(osp.join(out_dir, 'app_feat_test.h5'), 'w')
65 | val_feat_dset, val_ids_dset = None, None
66 | test_feat_dset, test_ids_dset = None, None
67 | train_feat_dset, train_ids_dset = None, None
68 |
69 | feat_name = 'resnet_features'
70 | with h5py.File(filename, 'r') as fp:
71 | vids = fp['ids']
72 | feats = fp[feat_name]
73 | for vid, feat in zip(vids, feats):
74 | if vid in val_list:
75 | if val_feat_dset is None:
76 | dataset_size = len(val_list)
77 | C, F, D = feat.shape
78 | # C, D = feat.shape
79 | val_feat_dset = val_fd.create_dataset(feat_name, (dataset_size, C, F, D),
80 | dtype=np.float32)
81 | val_ids_dset = val_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
82 | ival = 0
83 | val_feat_dset[ival:(ival+1)] = feat
84 | val_ids_dset[ival:(ival+1)] = int(vid)
85 | ival += 1
86 | elif vid in test_list:
87 | if test_feat_dset is None:
88 | dataset_size = len(test_list)
89 | C, F, D = feat.shape
90 | # C, D = feat.shape
91 | test_feat_dset = test_fd.create_dataset(feat_name, (dataset_size, C, F, D),
92 | dtype=np.float32)
93 | test_ids_dset = test_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
94 | itest = 0
95 |
96 | test_feat_dset[itest:(itest + 1)] = feat
97 | test_ids_dset[itest:(itest + 1)] = int(vid)
98 | itest += 1
99 | else:
100 | if train_feat_dset is None:
101 | dataset_size = len(train_list)
102 | C, F, D = feat.shape
103 | # C, D = feat.shape
104 | train_feat_dset = train_fd.create_dataset(feat_name, (dataset_size, C, F, D),
105 | dtype=np.float32)
106 | train_ids_dset = train_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
107 | itrain = 0
108 |
109 | train_feat_dset[itrain:(itrain + 1)] = feat
110 | train_ids_dset[itrain:(itrain + 1)] = int(vid)
111 | itrain += 1
112 |
113 | def get_video_list(filename):
114 | samples = pd.read_csv(filename)
115 | videos = samples['video']
116 | videos = list(set(videos))
117 | print(len(videos))
118 | return sorted(videos)
119 |
120 | def main():
121 | dataset = 'nextqa'
122 | data_dir = '../../data/{}/'.format(dataset)
123 | dataset_dir = '../datasets/{}/'.format(dataset)
124 | # in_dir = osp.join(data_dir, 'region_n')
125 | out_dir = osp.join(data_dir, 'frame_feat')
126 | train_file = osp.join(dataset_dir, 'train.csv')
127 | val_file = osp.join(dataset_dir, 'val.csv')
128 | test_file = osp.join(dataset_dir, 'test.csv')
129 | train_list = get_video_list(train_file)
130 | val_list = get_video_list(val_file)
131 | test_list = get_video_list(test_file)
132 |
133 | # np2h5(in_dir, out_dir, test_list, 'test')
134 | # np2h5(in_dir, out_dir, val_list, 'val')
135 | # np2h5(in_dir, out_dir, train_list, 'train')
136 |
137 | h5filename = osp.join(out_dir, 'feat_appearance.h5')
138 | split_dataset_feat(h5filename, out_dir, train_list, val_list, test_list)
139 |
140 |
141 | if __name__ == "__main__":
142 | main()
--------------------------------------------------------------------------------
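split_dataset_feat above writes one file per split (app_feat_train.h5, app_feat_val.h5, app_feat_test.h5), each with a 'resnet_features' dataset and an index-aligned 'ids' dataset. A minimal sketch, not part of the repository, of building the id-to-row lookup a loader would typically need from one of those files:

    import h5py

    def build_vid_index(h5_path):
        # map each video id to its row in a split file written by split_dataset_feat
        with h5py.File(h5_path, 'r') as fd:
            ids = fd['ids'][:]
        return {int(vid): i for i, vid in enumerate(ids)}

    # vid2idx = build_vid_index('app_feat_val.h5')  # hypothetical path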
/train/__pycache__/train_covgt.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/train/__pycache__/train_covgt.cpython-38.pyc
--------------------------------------------------------------------------------
/train/train_covgt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import logging
5 | import collections
6 | from util import compute_aggreeings, AverageMeter, get_mask, mask_tokens
7 | import os.path as osp
8 | import json
9 | #from fvcore.nn import FlopCountAnalysis
10 |
11 | def eval(model, data_loader, a2v, args, test=False, tokenizer=None):
12 | model.eval()
13 | count = 0
14 | metrics, counts = collections.defaultdict(int), collections.defaultdict(int)
15 |
16 | with torch.no_grad():
17 | if not args.mc:
18 | model.module._compute_answer_embedding(a2v)
19 | results = {}
20 | for i, batch in enumerate(data_loader):
21 | answer_id, answer, video_o, video_f, question, question_id, seg_feats, seg_num = (
22 | batch["answer_id"],
23 | batch["answer"].cuda(),
24 | batch["video_o"].cuda(),
25 | batch["video_f"].cuda(),
26 | batch["question"].cuda(),
27 | batch['question_id'],
28 | batch['seg_feats'].cuda(),
29 | batch['seg_num']
30 | )
31 |
32 | video_len = batch["video_len"]
33 | seq_len = batch["seq_len"]
34 |
35 | question_mask = (question != tokenizer.pad_token_id).float()  # RoBERTa
36 | answer_mask = (answer != tokenizer.pad_token_id).float()  # RoBERTa
37 |
38 | video_mask = get_mask(video_len, video_o.size(1)).cuda()
39 | count += answer_id.size(0)
40 | video = (video_o, video_f)
41 | if not args.mc:
42 | predicts = model(
43 | video,
44 | question,
45 | text_mask=question_mask,
46 | video_mask=video_mask,
47 | seq_len = seq_len
48 | )
49 | topk = torch.topk(predicts, dim=1, k=10).indices.cpu()
50 | if args.dataset != "ivqa":
51 | answer_id_expanded = answer_id.view(-1, 1).expand_as(topk)
52 | else:
53 | answer_id = (answer_id / 2).clamp(max=1)
54 | answer_id_expanded = answer_id
55 | metrics = compute_aggreeings(
56 | topk,
57 | answer_id_expanded,
58 | [1, 10],
59 | ["acc", "acc10"],
60 | metrics,
61 | ivqa=(args.dataset == "ivqa"),
62 | )
63 | for bs, qid in enumerate(question_id):
64 | results[qid] = {'prediction': int(topk.numpy()[bs,0]), 'answer':int(answer_id.numpy()[bs])}
65 | else:
66 | #############Model FLOPs##########
67 | # inputs = (video, question, None, answer.cuda(), seq_len, video_mask, answer_mask)
68 | # flops = FlopCountAnalysis(model, inputs)
69 | # print('Model FLOPs:', flops.total()/1000000) #use batch_size 1
70 | # break
71 | ###################################
72 | fusion_proj, answer_proj = model(
73 | video,
74 | question,
75 | text_mask=answer_mask,
76 | video_mask=video_mask,
77 | answer=answer,
78 | seq_len = seq_len,
79 | seg_feats = seg_feats,
80 | seg_num = seg_num
81 | )
82 | # predicts = fusion_proj.squeeze()
83 |
84 | fusion_proj = fusion_proj.unsqueeze(2)
85 | predicts = torch.bmm(answer_proj, fusion_proj).squeeze()
86 |
87 | predicted = torch.max(predicts, dim=1).indices.cpu()
88 | metrics["acc"] += (predicted == answer_id).sum().item()
89 | for bs, qid in enumerate(question_id):
90 | results[qid] = {'prediction': int(predicted.numpy()[bs]), 'answer':int(answer_id.numpy()[bs])}
91 |
92 | step = "val" if not test else "test"
93 |
94 | for k in metrics:
95 | # print(metrics[k], count)
96 | v = metrics[k] / count
97 | logging.info(f"{step} {k}: {v:.2%}")
98 | break
99 |
100 | return metrics["acc"] / count, results
101 |
102 |
103 | def train(model, train_loader, a2v, optimizer, criterion, scheduler, epoch, args, tokenizer):
104 | model.train()
105 | running_vqa_loss, running_acc, running_mlm_loss, running_cl_loss = (
106 | AverageMeter(),
107 | AverageMeter(),
108 | AverageMeter(),
109 | AverageMeter()
110 | )
111 | for i, batch in enumerate(train_loader):
112 | answer_id, answer, video_o, video_f, question, seg_feats, seg_num, qsn_id, qsn_token_ids, qsn_seq_len = (
113 | batch["answer_id"],
114 | batch["answer"],
115 | batch["video_o"].cuda(),
116 | batch["video_f"].cuda(),
117 | batch["question"].cuda(),
118 | batch['seg_feats'].cuda(),
119 | batch['seg_num'],
120 | batch['qsn_id'],
121 | batch['qsn_token_ids'],
122 | batch['qsn_seq_len']
123 | )
124 |
125 | video_len = batch["video_len"]
126 |
127 | question_mask = (question != tokenizer.pad_token_id).float().cuda()  # RoBERTa
128 | answer_mask = (answer != tokenizer.pad_token_id).float().cuda()  # RoBERTa
129 | video_mask = (
130 | get_mask(video_len, video_o.size(1)).cuda() if args.max_feats > 0 else None
131 | )
132 |
133 | qsn_mask = (qsn_token_ids != tokenizer.pad_token_id).float().cuda()
134 |
135 | video = (video_o, video_f)
136 | N = answer_id.size(0)
137 | seq_len = batch["seq_len"]
138 | if not args.mc:
139 | model.module._compute_answer_embedding(a2v)
140 | predicts = model(
141 | video,
142 | question,
143 | text_mask=question_mask,
144 | video_mask=video_mask,
145 | seq_len = seq_len
146 | )
147 | else:
148 | fusion_proj, answer_proj = model(
149 | video,
150 | question,
151 | text_mask=answer_mask,
152 | video_mask=video_mask,
153 | answer=answer.cuda(),
154 | seq_len = seq_len,
155 | seg_feats = seg_feats,
156 | seg_num = seg_num
157 | )
158 |
159 | fusion_proj = fusion_proj.unsqueeze(2)
160 | predicts = torch.bmm(answer_proj, fusion_proj).squeeze()
161 |
162 | if args.dataset == "ivqa":
163 | a = (answer_id / 2).clamp(max=1).cuda()
164 | vqa_loss = criterion(predicts, a)
165 | predicted = torch.max(predicts, dim=1).indices.cpu()
166 | predicted = F.one_hot(predicted, num_classes=len(a2v))
167 | running_acc.update((predicted * a.cpu()).sum().item() / N, N)
168 | else:
169 | vqa_loss = criterion(predicts, answer_id.cuda())
170 | predicted = torch.max(predicts, dim=1).indices.cpu()
171 | running_acc.update((predicted == answer_id).sum().item() / N, N)
172 | if args.cl_loss:
173 | vt_proj, txt_proj = model(
174 | video,
175 | question,
176 | text_mask=qsn_mask,
177 | video_mask=video_mask,
178 | answer=qsn_token_ids,
179 | seq_len = qsn_seq_len,
180 | seg_feats = seg_feats,
181 | seg_num = seg_num
182 | )
183 | vt_proj = vt_proj.unsqueeze(2)
184 | cl_predicts = torch.bmm(txt_proj, vt_proj).squeeze()
185 | cl_loss = criterion(cl_predicts, qsn_id.cuda())
186 | # cl_predicted = torch.max(cl_predicts, dim=1).indices.cpu()
187 | # running_acc.update((predicted == answer_id).sum().item() / N, N)
188 |
189 | if args.mlm_prob:
190 | max_seq_len = args.qmax_words
191 | if args.mc > 0:
192 | tmp_id = [aid+(args.mc*i) for i, aid in enumerate(answer_id)]
193 | inputs = answer.view(N*args.mc, -1)[tmp_id,:]
194 | # question_mask = (inputs>0).float()
195 | question_mask = (inputs != tokenizer.pad_token_id).float()  # pad id is 1 for RoBERTa
196 | max_seq_len = args.amax_words
197 | else:
198 | inputs = batch["question"]
199 |
200 | inputs, labels = mask_tokens(inputs, tokenizer, mlm_probability=args.mlm_prob)
201 | mlm_loss = model(
202 | video,
203 | question=inputs.cuda(),
204 | labels=labels.cuda(),
205 | text_mask=question_mask,
206 | video_mask=video_mask,
207 | max_seq_len=max_seq_len,
208 | mode="mlm",
209 | )
210 | mlm_loss = mlm_loss.mean()
211 | loss = mlm_loss + vqa_loss
212 | if args.cl_loss:
213 | loss = vqa_loss + args.cl_loss*cl_loss
214 | if args.cl_loss and args.mlm_prob:
215 | loss = vqa_loss + args.cl_loss*cl_loss + mlm_loss
216 | if not args.cl_loss and not args.mlm_prob:
217 | loss = vqa_loss
218 |
219 | optimizer.zero_grad()
220 | loss.backward()
221 | if args.clip:
222 | nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip)
223 | optimizer.step()
224 | scheduler.step()
225 |
226 | running_vqa_loss.update(vqa_loss.detach().cpu().item(), N)
227 | if args.mlm_prob:
228 | running_mlm_loss.update(mlm_loss.detach().cpu().item(), N)
229 | if args.cl_loss:
230 | running_cl_loss.update(cl_loss.detach().cpu().item(), N)
231 | if (i + 1) % (len(train_loader) // args.freq_display) == 0:
232 | if args.mlm_prob:
233 | logging.info(
234 | f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
235 | f"{running_vqa_loss.avg:.4f}, Training acc: {running_acc.avg:.2%}, MLM loss: {running_mlm_loss.avg:.4f}, Lvq Loss: {running_cl_loss.avg:.4f}"
236 | )
237 | elif args.cl_loss:
238 | logging.info(
239 | f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
240 | f"{running_vqa_loss.avg:.4f}, Train acc: {running_acc.avg:.2%}, Lvq Loss: {running_cl_loss.avg:.4f}"
241 | )
242 | else:
243 | logging.info(
244 | f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
245 | f"{running_vqa_loss.avg:.4f}, Train acc: {running_acc.avg:.2%}"
246 | )
247 | running_acc.reset()
248 | running_vqa_loss.reset()
249 | running_mlm_loss.reset()
250 | running_cl_loss.reset()
251 |
--------------------------------------------------------------------------------
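In the multiple-choice branches above, the model returns one fused video-question vector per sample and one projected embedding per candidate answer; candidate scores are their dot products, computed with a batched matrix multiply, and a cross-entropy over the candidate scores then gives a VQA-style loss. A toy sketch with random tensors (shapes assumed from the calls above, not taken from the model definition):

    import torch
    import torch.nn.functional as F

    B, num_choices, dim = 2, 5, 768                   # assumed sizes for illustration
    fusion_proj = torch.randn(B, dim)                 # fused video+question representation
    answer_proj = torch.randn(B, num_choices, dim)    # one projection per candidate answer
    answer_id = torch.zeros(B, dtype=torch.long)      # dummy ground-truth choice indices

    scores = torch.bmm(answer_proj, fusion_proj.unsqueeze(2)).squeeze(-1)  # (B, num_choices)
    predicted = scores.argmax(dim=1)                  # highest-scoring candidate per sample
    vqa_loss = F.cross_entropy(scores, answer_id)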
/util.py:
--------------------------------------------------------------------------------
1 | import re
2 | import torch
3 | import torch.nn.functional as F
4 | import json
5 | import collections
6 | import numpy as np
7 | import os
8 | import os.path as osp
9 | import pandas as pd
10 | import logging
11 | import pickle as pkl
12 | import string
13 |
14 | def tokenize(
15 | seq,
16 | tokenizer,
17 | add_special_tokens=True,
18 | max_length=10,
19 | dynamic_padding=True,
20 | truncation=True,
21 | ):
22 | """
23 | :param seq: list of text sequences
24 | :param tokenizer: HuggingFace tokenizer (e.g., BERT/RoBERTa)
25 | :return: LongTensor of token ids padded up to max_length, plus a placeholder for the raw tokens
26 | """
27 | token_ids = tokenizer.batch_encode_plus(
28 | seq,
29 | add_special_tokens=add_special_tokens,
30 | max_length=max_length,
31 | padding="longest" if dynamic_padding else "max_length",
32 | truncation=truncation,
33 | )["input_ids"]
34 | # tokens = [tokenizer.tokenize(s, add_special_tokens=add_special_tokens) for s in seq]
35 | tokens = ''
36 | return torch.tensor(token_ids, dtype=torch.long), tokens
37 |
38 | def transform_bb(roi_bbox, width, height):
39 | dshape = list(roi_bbox.shape)
40 | tmp_bbox = roi_bbox.reshape([-1, 4])
41 | relative_bbox = tmp_bbox / np.asarray([width, height, width, height])
42 | relative_area = (tmp_bbox[:, 2] - tmp_bbox[:, 0] + 1) * \
43 | (tmp_bbox[:, 3] - tmp_bbox[:, 1] + 1)/ (width*height)
44 | relative_area = relative_area.reshape(-1, 1)
45 | bbox_feat = np.hstack((relative_bbox, relative_area))
46 | dshape[-1] += 1
47 | bbox_feat = bbox_feat.reshape(dshape)
48 |
49 | return bbox_feat
50 |
51 |
52 | def compute_aggreeings(topk, answers, thresholds, names, metrics, ivqa=False):
53 | """ Updates the metrics dictionary by counting top-k agreements for the different thresholds """
54 | if not ivqa:
55 | # sp_num = topk.shape[0]
56 | for i, x in enumerate(thresholds):
57 | agreeingsx = (topk[:, :x] == answers[:, :x]).sum().item()
58 | # unk = 0
59 | # for j in range(sp_num):
60 | # if answers[j, 0].item() == 0 and 0 in topk[j, :x].numpy():
61 | # unk += 1
62 | metrics[names[i]] += agreeingsx #-unk
63 | else:
64 | for i, x in enumerate(thresholds):
65 | predicted = F.one_hot(topk[:, :x], num_classes=answers.shape[-1]).sum(1)
66 | metrics[names[i]] += (predicted * answers).max(1)[0].sum().item()
67 | return metrics
68 |
69 |
70 | class AverageMeter:
71 | """ Computes and stores the average and current value for training stats """
72 |
73 | def __init__(self):
74 | self.reset()
75 |
76 | def reset(self):
77 | """ Reset all statistics """
78 | self.val = 0
79 | self.avg = 0
80 | self.sum = 0
81 | self.count = 0
82 |
83 | def update(self, val, n=1):
84 | """ Update statistics """
85 | self.val = val
86 | self.sum += val * n
87 | self.count += n
88 | self.avg = self.sum / self.count
89 |
90 |
91 | def get_mask(lengths, max_length):
92 | """ Computes a batch of padding masks given batched lengths """
93 | mask = 1 * (
94 | torch.arange(max_length).unsqueeze(1).to(lengths.device) < lengths
95 | ).transpose(0, 1)
96 | return mask
97 |
98 |
99 | def compute_a2v(vocab_path, bert_tokenizer, amax_words):
100 | """ Precomputes tokenized answer ids for all answers in the vocabulary """
101 | a2id = json.load(open(vocab_path, "r"))
102 | # a2id['[UNK]'] = 0
103 | id2a = {v: k for k, v in a2id.items()}
104 | a2v, _ = tokenize(
105 | list(a2id.keys()),
106 | bert_tokenizer,
107 | add_special_tokens=True,
108 | max_length=amax_words,
109 | dynamic_padding=True,
110 | truncation=True,
111 | )
112 | if torch.cuda.is_available():
113 | a2v = a2v.cuda()  # (num_answers, max_answer_len) token ids
114 | return a2id, id2a, a2v
115 |
116 |
117 | def mask_tokens(inputs, tokenizer, mlm_probability):
118 | """
119 | Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
120 | """
121 | if tokenizer.mask_token is None:
122 | raise ValueError(
123 | "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
124 | )
125 |
126 | labels = inputs.clone()
127 | # We sample a few tokens in each sequence for masked-LM training (probability mlm_probability, 0.15 by default in BERT/RoBERTa)
128 |
129 | probability_matrix = torch.full(labels.shape, mlm_probability)
130 | # find special token
131 | special_tokens_mask = [
132 | tokenizer.get_special_tokens_mask(tkid, already_has_special_tokens=True)
133 | for tkid in labels.tolist()
134 | ]
135 | # do not mask special token
136 | probability_matrix.masked_fill_(
137 | torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
138 | )
139 |
140 | if tokenizer._pad_token is not None:
141 | padding_mask = labels.eq(tokenizer.pad_token_id)
142 | probability_matrix.masked_fill_(padding_mask, value=0.0)
143 |
144 | masked_indices = torch.bernoulli(probability_matrix).bool()
145 | labels[~masked_indices] = -100 # We only compute loss on masked tokens
146 |
147 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
148 | indices_replaced = (
149 | torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
150 | )
151 | inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
152 |
153 | # 10% of the time (half of the remaining 20%), we replace masked input tokens with a random word
154 | indices_random = (
155 | torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
156 | & masked_indices
157 | & ~indices_replaced
158 | )
159 | random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
160 | inputs[indices_random] = random_words[indices_random]
161 |
162 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged
163 |
164 | return inputs, labels
165 |
166 |
167 | def get_types(dataset):
168 | """ Type2Id mapping for VideoQA datasets """
169 | if dataset == "tgif":
170 | return {"what": 0, "how": 1, "color": 2, "where": 3}
171 | elif dataset == "activitynet":
172 | return {
173 | "motion": 0,
174 | "spatial": 1,
175 | "temporal": 2,
176 | "yesno": 3,
177 | "color": 4,
178 | "object": 5,
179 | "location": 6,
180 | "number": 7,
181 | "other": 8,
182 | }
183 | elif dataset == "msvd" or dataset == "msrvtt":
184 | return {"what": 0, "how": 1, "color": 2, "where": 3, "who": 4, "when": 5}
185 | elif dataset == "ivqa":
186 | return {"scenes": 0}
187 | else:
188 | raise NotImplementedError
189 |
190 |
191 | def get_most_common(loader, ivqa=False, n=4):
192 | """ Returns the most common answers and splits them into n parts according to their frequency """
193 | if ivqa:
194 | ans = []
195 | for a1, a2, a3, a4, a5 in zip(
196 | list(loader.dataset.data["answer1"]),
197 | list(loader.dataset.data["answer2"]),
198 | list(loader.dataset.data["answer3"]),
199 | list(loader.dataset.data["answer4"]),
200 | list(loader.dataset.data["answer5"]),
201 | ):
202 | counteri = collections.Counter([a1, a2, a3, a4, a5])
203 | for w in counteri:
204 | if (
205 | counteri[w] >= 2
206 | ): # an answer is considered right if it has been annotated by at least two workers
207 | ans.append(w)
208 | else:
209 | ans = list(loader.dataset.data["answer"])
210 | most_common = collections.Counter(ans).most_common()
211 |
212 | total = sum(x[1] for x in most_common)
213 | splits = [0] * (n + 1)
214 | j = 0
215 | for i in range(n):
216 | cur_total = 0
217 | while j < len(most_common) and cur_total < total / n:
218 | cur_total += most_common[j][1]
219 | j += 1
220 | splits[i + 1] = j
221 | return most_common, splits, total
222 |
223 |
224 | def compute_word_stats(
225 | topk, answers, a2id, a2v, most_common, metrics, counts, ivqa, top10=False
226 | ):
227 | """ Similar to compute_aggreeings; computes agreements and counts for the most common words """
228 | if not ivqa:
229 | for word, cword in most_common:
230 | if word not in a2id:
231 | counts[word] = cword
232 | continue
233 | predicted = topk[:, 0]
234 | metrics[f"acc_{word}"] += (
235 | (predicted[answers == a2id[word]] == a2id[word]).sum().item()
236 | )
237 | if top10:
238 | predicted10 = topk[:, :10]
239 | metrics[f"acc10_{word}"] += (
240 | (predicted10[answers == a2id[word]] == a2id[word]).sum().item()
241 | )
242 | counts[word] += (answers == a2id[word]).sum().item()
243 | else:
244 | for word, cword in most_common:
245 | if word not in a2id:
246 | counts[word] = cword
247 | continue
248 | predicted = F.one_hot(topk[:, 0], num_classes=len(a2v))
249 | ans_word = answers[:, a2id[word]]
250 | metrics[f"acc_{word}"] += (
251 | (predicted[:, a2id[word]][ans_word == 1] * ans_word[ans_word == 1])
252 | .sum()
253 | .item()
254 | )
255 | if top10:
256 | predicted10 = F.one_hot(topk[:, :10], num_classes=len(a2v)).sum(1)
257 | metrics[f"acc10_{word}"] += (
258 | (
259 | predicted10[:, a2id[word]][ans_word == 1]
260 | * ans_word[ans_word == 1]
261 | )
262 | .sum()
263 | .item()
264 | )
265 | counts[word] += (ans_word == 1).sum().item()
266 | return metrics, counts
267 |
268 |
269 | def compute_metrics(x):
270 | sx = np.sort(-x, axis=1)
271 | d = np.diag(-x)
272 | d = d[:, np.newaxis]
273 | ind = sx - d
274 | ind = np.where(ind == 0)
275 | ind = ind[1]
276 | metrics = {}
277 | metrics["R1"] = float(np.sum(ind == 0)) / len(ind)
278 | metrics["R10"] = float(np.sum(ind < 10)) / len(ind)
279 | metrics["R100"] = float(np.sum(ind < 100)) / len(ind)
280 | metrics["MR"] = np.median(ind) + 1
281 | return metrics
282 |
283 |
284 | def print_computed_metrics(metrics):
285 | r1 = metrics["R1"]
286 | r10 = metrics["R10"]
287 | r100 = metrics["R100"]
288 | mr = metrics["MR"]
289 | return "R@1: {:.4f} - R@10: {:.4f} - R@100: {:.4f} - Median R: {}".format(
290 | r1, r10, r100, mr
291 | )
292 |
293 |
294 | #added by Junbin
295 | def get_qsn_type(qsn, ans_rsn):
296 | dos = ['does', 'do', 'did']
297 | bes = ['was', 'were', 'is', 'are']
298 | w5h1 = ['what', 'who', 'which', 'why', 'how', 'where']
299 | qsn_sp = qsn.split()
300 | type = qsn_sp[0].lower()
301 | if type == 'what':
302 | if qsn_sp[1].lower() in dos:
303 | type = 'whata'
304 | elif qsn_sp[1].lower() in bes:
305 | type = 'whatb'
306 | else:
307 | type = 'whato'
308 | elif type == 'how':
309 | if qsn_sp[1].lower() == 'many':
310 | type = 'howm'
311 | elif type not in w5h1:
312 | type = 'other'
313 | if ans_rsn in ['pr', 'cr']:
314 | type += 'r'
315 | return type
316 |
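# Illustrative aside (not in the original file): example mappings produced by get_qsn_type
# above; the second argument is the answer-reason tag, and 'pr'/'cr' append an 'r' suffix.
#   get_qsn_type('what did the baby do after standing', '')  -> 'whata'
#   get_qsn_type('how many people are in the room', '')      -> 'howm'
#   get_qsn_type('why did the girl pick up the toy', 'cr')   -> 'whyr'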
317 | def major_type(tgroup):
318 | ans_num = 0
319 | mtype = ''
320 | for type, item in tgroup.items():
321 | if len(item) > ans_num:
322 | ans_num = len(item)
323 | mtype = type
324 | return mtype
325 |
326 | def group(csv_data, gt=True):
327 | ans_group, qsn_group = {}, {}
328 | for idx, row in csv_data.iterrows():
329 | qsn, ans = row['question'], row['answer']
330 | if gt:
331 | type = row['type']
332 | if type == 'TP': type = 'TN'
333 | else:
334 | type = 'null' if 'type' not in row else row['type']
335 | type = get_qsn_type(qsn, type)
336 | if type not in ans_group:
337 | ans_group[type] = {ans}
338 | qsn_group[type] = {qsn}
339 | else:
340 | ans_group[type].add(ans)
341 | qsn_group[type].add(qsn)
342 | return ans_group, qsn_group
343 |
344 |
345 | def load_model_by_key(cur_model, model_path):
346 | model_dict = torch.load(model_path)
347 | new_model_dict = {}
348 | for k, v in cur_model.state_dict().items():
349 | if k in model_dict:
350 | v = model_dict[k]
351 | else:
352 | pass
353 | # print(k)
354 | new_model_dict[k] = v
355 | return new_model_dict
356 |
357 |
358 | def load_file(filename):
359 | '''
360 | added by junbin Xiao
361 | '''
362 | file_type = osp.splitext(filename)[-1]
363 | if file_type == '.csv':
364 | data = pd.read_csv(filename)
365 | else:
366 | with open(filename, 'r') as fp:
367 | if file_type == '.json':
368 | data = json.load(fp)
369 | elif file_type == '.txt':
370 | data = fp.readlines()
371 | data = [datum.rstrip('\n') for datum in data]
372 | return data
373 |
374 |
375 | def save_to(filename, data):
376 | '''
377 | added by junbin Xiao
378 | '''
379 | logging.info(f'Save to {filename}')
380 | dirname = osp.dirname(filename)
381 | if not osp.exists(dirname):
382 | os.makedirs(dirname)
383 | with open(filename, 'w') as fp:
384 | json.dump(data, fp)
385 |
386 | def pkload(filename):
387 | with open(filename, 'rb') as fp:
388 | data = pkl.load(fp)
389 | return data
390 |
--------------------------------------------------------------------------------
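get_mask above turns a tensor of per-sample sequence lengths into a (batch, max_length) 0/1 padding mask; the video and text masks built in train_covgt.py come from it. A small self-contained check, not part of the repository, that replays its logic on toy lengths:

    import torch

    def get_mask(lengths, max_length):
        # same expression as util.get_mask, copied here for a standalone check
        return 1 * (torch.arange(max_length).unsqueeze(1).to(lengths.device) < lengths).transpose(0, 1)

    mask = get_mask(torch.tensor([2, 4]), 4)
    # tensor([[1, 1, 0, 0],
    #         [1, 1, 1, 1]])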