├── .gitignore ├── LICENSE ├── README.md ├── args.py ├── dataloader ├── .gitignore └── cvqa_loader.py ├── datasets └── nextqa │ ├── map_vid_vidorID.json │ ├── test.csv │ ├── train.csv │ ├── val.csv │ └── vlist.json ├── eval_next.py ├── global_parameters.py ├── loss.py ├── main.py ├── misc ├── CoVGT-res.png └── CoVGT.png ├── model ├── .gitignore ├── CoVGT.py ├── EncoderVid.py ├── cmatt.py ├── graph.py ├── language_model.py └── vqa_model.py ├── requirements.txt ├── shells ├── cvid_test.sh ├── cvid_train.sh ├── msrvtt_test.sh ├── msrvtt_train.sh ├── next_test.sh ├── next_train.sh ├── tgif_ftrain.sh ├── tgif_test.sh ├── tgif_train.sh └── webvid_train.sh ├── tools ├── __pycache__ │ └── object_align.cpython-38.pyc ├── bbox_visualizer.py ├── colors.txt ├── datautils │ ├── msrvtt_qa.py │ ├── msvd_qa.py │ ├── nextqa.py │ ├── tgif_qa.py │ └── utils.py ├── demo.py ├── extract_video.py ├── feat_app.sh ├── models │ ├── __init__.py │ ├── densenet.py │ ├── pre_act_resnet.py │ ├── resnet.py │ ├── resnext.py │ └── wide_resnet.py ├── object_align.py ├── preprocess_features.py └── split_dataset_feat.py ├── train ├── __pycache__ │ └── train_covgt.cpython-38.pyc └── train_covgt.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Contrastive Video Question Answering via Video Graph Transformer 2 |
## Abstract
This repo holds the code for our paper CoVGT, accepted to IEEE T-PAMI'23. The work extends our preliminary publication at ECCV'22. We highlight the following differences from the conference version:

* Jointly supervised and self-supervised contrastive objectives to optimize VGT.
* Substitution of BERT with a stronger language model (e.g., RoBERTa) for QA embedding.
* Extended results on Causal-VidQA and STAR, along with more comprehensive ablation studies.

The code is based on VGT.
![Illustration of contrastive learning strategy](misc/CoVGT.png)
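As a rough illustration of the contrastive idea pictured above, below is a minimal InfoNCE-style objective between paired video and QA embeddings. This is only a sketch with assumed `[B, D]` inputs; the actual supervised and self-supervised objectives used by CoVGT are implemented in `loss.py` and `train/train_covgt.py`.
```
import torch
import torch.nn.functional as F

def cross_modal_nce(video_emb, text_emb, temperature=0.07):
    """InfoNCE over a batch: the i-th video is the positive for the i-th QA text."""
    v = F.normalize(video_emb, dim=-1)            # [B, D]
    t = F.normalize(text_emb, dim=-1)             # [B, D]
    logits = v @ t.t() / temperature              # [B, B] similarity matrix
    targets = torch.arange(v.size(0), device=v.device)
    # symmetric loss: video-to-text and text-to-video
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
```
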
## Todo
1. [ ] Release features of other datasets. Please email the first author and specify the reason, as the data is strictly for research purposes.

## Environment
Assuming you have installed Anaconda3, with CUDA > 11.0 and a GPU of >= 24 GB memory, run the following to set up the environment:
```
>conda create -n videoqa python==3.8.16
>conda activate videoqa
>git clone https://github.com/doc-doc/CoVGT.git
>pip install -r requirements.txt
>conda install pytorch==1.8.1 torchvision==0.9.1 cudatoolkit=11.1 -c pytorch -c nvidia
```
## Preparation
Please create a data folder outside this repo, so that you have two folders in your workspace: 'workspace/data/' and 'workspace/CoVGT/'.

Below we use NExT-QA as an example to get you familiar with the code.
Please download the related video features and QA annotations according to the links provided in the `Results and Resources` section. The QA annotations will already be in `workspace/CoVGT/datasets/nextqa/` after you clone this repo; place the video features in `workspace/data/nextqa/` and the checkpoint files in `workspace/data/save_models/nextqa/`. Change the default paths in global_parameters.py and args.py for your own datasets.

## Inference
```
./shells/next_test.sh 0
```
## Evaluation
```
python eval_next.py --folder CoVGT_FTCoWV --mode test
```
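The evaluation script reads the prediction file that main.py saves into `--save_dir` (e.g., `../data/save_models/nextqa/CoVGT_FTCoWV/test-res.json`) and reports accuracy per question type. Below is a sketch of the expected format, with entries keyed by `<video_id>_<qid>` as constructed in eval_next.py; the IDs and values are made up for illustration, and for multiple-choice data the `prediction`/`answer` fields hold the predicted and ground-truth option indices.
```
{
  "3376544720_0": {"prediction": 1, "answer": 1},
  "3376544720_1": {"prediction": 4, "answer": 2}
}
```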

## Results and Resources
**Table 1. VideoQA Accuracy (%) on Test Set.**

| Cross-Modal Pretrain | NExT-QA | Causal-VidQA | STAR | TGIF-QA (Action) | TGIF-QA (Trans) | TGIF-QA (FrameQA) | TGIF-QA-R* (Action) | TGIF-QA-R* (Trans) | MSRVTT-QA |
| :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| - | 59.4 | 59.1 | 44.0 | 94.7 | 97.6 | 61.6 | 60.8 | 73.8 | 38.3 |
| WebVid0.18M | 59.7 | 60.8 | 46.2 | 91.3 | 96.2 | 61.7 | 61.0 | 73.2 | 40.0 |
| - | feats | feats | feats | feats | feats | feats | feats | feats | feats |
| - | videos | videos | videos | videos | videos | videos | videos | videos | videos |
| - | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A | Q&A |
(The feature files are identical to VGT. We have merged some files of the same dataset to avoid too many links.)

## Train
All the training scripts are provided in the folder 'shells'; start training by specifying the GPU ID(s) behind the script. If you have multiple GPUs, separate them with a comma, e.g., ./shells/next_train.sh 0,1.
```
./shells/next_train.sh 0
```
It will train the model and save checkpoints to '../data/save_models/nextqa/CoVGT/'. You should get results around 60.1% and 59.4% on the val and test sets respectively.
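main.py also mirrors its console output to a stdout.log file inside `--save_dir`, so a running job can be monitored with, for example:
```
tail -f ../data/save_models/nextqa/CoVGT/stdout.log
```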
### Result Visualization (NExT-QA)
![VGT vs VGT without DGT](misc/CoVGT-res.png)
135 | 136 | ## Citations 137 | ``` 138 | @ARTICLE {xiao2023contrastive, 139 | author = {Junbin Xiao and Pan Zhou and Angela Yao and Yicong Li and Richang Hong and Shuicheng Yan and Tat Seng Chua}, 140 | journal = {IEEE Transactions on Pattern Analysis & Machine Intelligence}, 141 | title = {Contrastive Video Question Answering via Video Graph Transformer}, 142 | year = {2023}, 143 | volume = {45}, 144 | number = {11}, 145 | issn = {1939-3539}, 146 | pages = {13265-13280}, 147 | doi = {10.1109/TPAMI.2023.3292266}, 148 | publisher = {IEEE Computer Society}, 149 | address = {Los Alamitos, CA, USA}, 150 | month = {nov} 151 | } 152 | ``` 153 | ``` 154 | @inproceedings{xiao2022video, 155 | title={Video Graph Transformer for Video Question Answering}, 156 | author={Xiao, Junbin and Zhou, Pan and Chua, Tat-Seng and Yan, Shuicheng}, 157 | booktitle={European Conference on Computer Vision}, 158 | pages={39--58}, 159 | year={2022}, 160 | organization={Springer} 161 | } 162 | ``` 163 | ## Notes 164 | If you use any resources from this repo, please kindly cite our paper and acknowledge the source. 165 | ## License 166 | This repository is released under the Apache 2.0 license as found in the [LICENSE](LICENSE) file. 167 | -------------------------------------------------------------------------------- /args.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from global_parameters import ( 5 | DEFAULT_DATASET_DIR, 6 | DEFAULT_CKPT_DIR, 7 | TRANSFORMERS_PATH, 8 | SSD_DIR, 9 | dataset2folder, 10 | ) 11 | 12 | def get_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "--dataset", 16 | type=str, 17 | default="ivqa", 18 | choices=[ 19 | "ivqa", 20 | "msrvtt", 21 | "msrvttmc", 22 | "msvd", 23 | "webvid", 24 | "activitynet", 25 | "howto100m", 26 | "howtovqa", 27 | "how2qa", 28 | "nextqa", 29 | "star", 30 | "tgifqa/transition", 31 | "tgifqa/action", 32 | "tgifqa/frameqa", 33 | "tgifqa2/transition", 34 | "tgifqa2/action", 35 | "causalvid" 36 | ], 37 | ) 38 | parser.add_argument( 39 | "--subset", 40 | type=str, 41 | default="", 42 | choices=["", "1", "10", "20", "50"], 43 | help="use a subset of the generated dataset", 44 | ) 45 | 46 | # Model 47 | parser.add_argument( 48 | "--baseline", 49 | type=str, 50 | default="", 51 | choices=["", "qa"], 52 | help="qa baseline does not use the video, video baseline does not use the question", 53 | ) 54 | parser.add_argument( 55 | "--n_layers", 56 | type=int, 57 | default=2, 58 | help="number of layers in the multi-modal transformer", 59 | ) 60 | parser.add_argument( 61 | "--n_heads", 62 | type=int, 63 | default=8, 64 | help="number of attention heads in the multi-modal transformer", 65 | ) 66 | parser.add_argument( 67 | "--embd_dim", 68 | type=int, 69 | default=512, 70 | help="multi-modal transformer and final embedding dimension", 71 | ) 72 | parser.add_argument( 73 | "--ff_dim", 74 | type=int, 75 | default=2048, 76 | help="multi-modal transformer feed-forward dimension", 77 | ) 78 | parser.add_argument( 79 | "--dropout", 80 | type=float, 81 | default=0.1, 82 | help="dropout rate in the multi-modal transformer", 83 | ) 84 | parser.add_argument( 85 | "--sentence_dim", 86 | type=int, 87 | default=2048, 88 | help="sentence dimension for the differentiable bag-of-words embedding the answers", 89 | ) 90 | parser.add_argument( 91 | "--qmax_words", 92 | type=int, 93 | default=20, 94 | help="maximum number of words in the question", 95 | ) 96 | parser.add_argument( 97 | 
"--amax_words", 98 | type=int, 99 | default=10, 100 | help="maximum number of words in the answer", 101 | ) 102 | parser.add_argument( 103 | "--max_feats", 104 | type=int, 105 | default=20, 106 | help="maximum number of video features considered", 107 | ) 108 | 109 | # Paths 110 | parser.add_argument( 111 | "--dataset_dir", 112 | type=str, 113 | default=DEFAULT_DATASET_DIR, 114 | help="folder where the datasets folders are stored", 115 | ) 116 | parser.add_argument( 117 | "--ssd_dir", 118 | type=str, 119 | default=SSD_DIR, 120 | help="folder with ssd storage where the HowTo100M features are stored", 121 | ) 122 | parser.add_argument( 123 | "--checkpoint_predir", 124 | type=str, 125 | default=DEFAULT_CKPT_DIR, 126 | help="folder to store checkpoints", 127 | ) 128 | parser.add_argument( 129 | "--checkpoint_dir", type=str, default="", help="subfolder to store checkpoint" 130 | ) 131 | parser.add_argument( 132 | "--pretrain_path", type=str, default="", help="path to pretrained checkpoint" 133 | ) 134 | parser.add_argument( 135 | "--bert_path", 136 | type=str, 137 | default=TRANSFORMERS_PATH, 138 | help="path to transformer models checkpoints", 139 | ) 140 | 141 | # Train 142 | parser.add_argument("--batch_size", type=int, default=256) 143 | parser.add_argument("--batch_size_val", type=int, default=2048) 144 | parser.add_argument( 145 | "--n_pair", 146 | type=int, 147 | default=32, 148 | help="number of clips per video to consider to train on HowToVQA69M", 149 | ) 150 | parser.add_argument("--seed", type=int, default=1) 151 | parser.add_argument("--epochs", type=int, default=20) 152 | parser.add_argument( 153 | "--test", type=int, default=0, help="use to evaluate without training" 154 | ) 155 | parser.add_argument( 156 | "--lr", type=float, default=0.00005, help="initial learning rate" 157 | ) 158 | parser.add_argument("--weight_decay", type=float, default=0, help="weight decay") 159 | parser.add_argument( 160 | "--clip", 161 | type=float, 162 | default=12, 163 | help="gradient clipping", 164 | ) 165 | 166 | # Print 167 | parser.add_argument( 168 | "--freq_display", type=int, default=3, help="number of train prints per epoch" 169 | ) 170 | parser.add_argument( 171 | "--num_thread_reader", type=int, default=16, help="number of workers" 172 | ) 173 | 174 | # Masked Language Modeling and Cross-Modal Matching parameters 175 | parser.add_argument("--mlm_prob", type=float, default=0.15) 176 | parser.add_argument("--n_negs", type=int, default=1) 177 | parser.add_argument("--lr_decay", type=float, default=0.9) 178 | parser.add_argument("--min_time", type=int, default=10) 179 | parser.add_argument("--min_words", type=int, default=10) 180 | 181 | # Demo parameters 182 | parser.add_argument( 183 | "--question_example", type=str, default="", help="demo question text" 184 | ) 185 | parser.add_argument("--video_example", type=str, default="", help="demo video path") 186 | parser.add_argument("--port", type=int, default=8899, help="demo port") 187 | parser.add_argument( 188 | "--pretrain_path2", type=str, default="", help="second demo model" 189 | ) 190 | parser.add_argument( 191 | "--save_dir", type=str, default="./save_models/", help="path to save dir" 192 | ) 193 | parser.add_argument( 194 | "--mc", type=int, default=5, help="number of multiple choices" 195 | ) 196 | parser.add_argument( 197 | "--bnum", type=int, default=10, help="number of region proposal" 198 | ) 199 | parser.add_argument( 200 | "--cl_loss", type=float, default=0, help="trade offf with contrastive loss" 201 | ) 202 | parser.add_argument( 
203 | "--lan", type=str, default='RoBERTa', help="BERT or RoBERTa" 204 | ) 205 | 206 | args = parser.parse_args() 207 | 208 | os.environ["TRANSFORMERS_CACHE"] = args.bert_path 209 | # args.save_dir = './save_dir/' 210 | 211 | #args.save_dir = os.path.join(args.checkpoint_predir, args.checkpoint_dir) 212 | 213 | # multiple-choice arg 214 | # args.mc = 4 if args.dataset == "how2qa" else 0 215 | # args.mc = 5 if args.dataset == "nextqa" else 0 216 | 217 | # feature dimension 218 | args.feature_dim = 2048 # S3D:1024 app_mot:4096 #2048 RoI 219 | args.word_dim = 768 # DistilBERT 220 | 221 | # Map from dataset name to folder name 222 | 223 | load_path = os.path.join(args.dataset_dir, args.dataset) 224 | args.load_path = load_path 225 | 226 | if args.dataset not in ["howto100m", "howtovqa"]: # VideoQA dataset 227 | args.features_path = f'../data/{args.dataset}/' #os.path.join(load_path, "s3d.pth") 228 | # args.features_path = f'/data/datasets/{args.dataset}/' 229 | args.train_csv_path = os.path.join(load_path, "train.csv") 230 | if args.dataset == 'tgifqa': 231 | args.val_csv_path = os.path.join(load_path, "test.csv") 232 | else: 233 | args.val_csv_path = os.path.join(load_path, "val.csv") 234 | args.test_csv_path = os.path.join(load_path, "test.csv") 235 | args.vocab_path = os.path.join(load_path, "vocab.json") 236 | else: # Pretraining dataset 237 | args.features_path = os.path.join( 238 | args.ssd_dir, "s3d_features", "howto100m_s3d_features" 239 | ) 240 | if args.dataset == "howto100m": 241 | args.caption_path = os.path.join( 242 | load_path, "caption_howto100m_sw_nointersec_norepeat.pickle" 243 | ) 244 | args.train_csv_path = os.path.join( 245 | load_path, f"s3d_features_nointersec.csv" 246 | ) 247 | args.youcook_val_path = os.path.join( 248 | args.dataset_dir, "YouCook2", "youcook_unpooled_val.pkl" 249 | ) 250 | args.msrvtt_test_csv_path = os.path.join( 251 | args.dataset_dir, "MSR-VTT", "MSRVTT_JSFUSION_test.csv" 252 | ) 253 | args.msrvtt_test_features_path = os.path.join( 254 | args.dataset_dir, "MSR-VTT", "msrvtt_test_unpooled_s3d_features.pth" 255 | ) 256 | elif args.dataset == "howtovqa": 257 | if not args.subset: 258 | args.caption_path = os.path.join(load_path, "howtovqa.pkl") 259 | args.train_csv_path = os.path.join(load_path, "train_howtovqa.csv") 260 | args.val_csv_path = os.path.join(load_path, "val_howtovqa.csv") 261 | else: 262 | args.caption_path = os.path.join( 263 | load_path, f"howtovqa_{args.subset}.pickle" 264 | ) 265 | args.train_csv_path = os.path.join( 266 | load_path, f"train_howtovqa_{args.subset}.csv" 267 | ) 268 | args.val_csv_path = os.path.join( 269 | load_path, f"val_howtovqa_{args.subset}.csv" 270 | ) 271 | 272 | return args 273 | -------------------------------------------------------------------------------- /dataloader/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /eval_next.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from util import load_file 3 | import argparse 4 | 5 | map_name = {'CW': 'Why', 'CH': 'How', 'TN': 'Bef&Aft', 'TC': 'When', 6 | 'DC': 'Cnt', 'DL': 'Loc', 'DO': 'Other', 'C': 'Acc_C', 7 | 'T': 'Acc_T', 'D': 'Acc_D'} 8 | 9 | def accuracy_metric(sample_list, result): 10 | 11 | group = {'CW':[], 'CH':[], 'TN':[], 'TC':[], 'DC':[], 'DL':[], 'DO':[]} 12 | for id, row in sample_list.iterrows(): 13 | qns_id = 
str(row['video_id']) + '_' + str(row['qid']) 14 | qtype = str(row['type']) 15 | #(combine temporal qns of previous and next as 'TN') 16 | if qtype == 'TP': 17 | qtype = 'TN' 18 | group[qtype].append(qns_id) 19 | 20 | preds = result 21 | group_acc = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0} 22 | group_cnt = {'CW': 0, 'CH': 0, 'TN': 0, 'TC': 0, 'DC': 0, 'DL': 0, 'DO': 0} 23 | overall_acc = {'C':0, 'T':0, 'D':0} 24 | overall_cnt = {'C':0, 'T':0, 'D':0} 25 | all_acc = 0 26 | all_cnt = 0 27 | for qtype, qns_ids in group.items(): 28 | cnt = 0 29 | acc = 0 30 | for qid in qns_ids: 31 | 32 | cnt += 1 33 | answer = preds[qid]['answer'] 34 | pred = preds[qid]['prediction'] 35 | if answer == pred: 36 | acc += 1 37 | 38 | group_cnt[qtype] = cnt 39 | group_acc[qtype] += acc 40 | overall_acc[qtype[0]] += acc 41 | overall_cnt[qtype[0]] += cnt 42 | all_acc += acc 43 | all_cnt += cnt 44 | 45 | 46 | for qtype, value in overall_acc.items(): 47 | group_acc[qtype] = value 48 | group_cnt[qtype] = overall_cnt[qtype] 49 | 50 | for qtype in group_acc: 51 | if group_cnt[qtype] == 0: continue 52 | print(map_name[qtype], end='\t') 53 | print('') 54 | for qtype, acc in group_acc.items(): 55 | if group_cnt[qtype] == 0: continue 56 | print('{:.2f}'.format(acc*100.0/group_cnt[qtype]), end ='\t') 57 | print('') 58 | print('Acc: {:.2f}'.format(all_acc*100.0/all_cnt)) 59 | 60 | 61 | 62 | def accuracy_metric_sub(sample_list, result, sub_ids): 63 | 64 | sub_ids = [int(id) for id in sub_ids] 65 | subset = sample_list.iloc[sub_ids] 66 | 67 | accuracy_metric(subset, result) 68 | 69 | 70 | 71 | def main(result_file, mode='val'): 72 | dataset_dir = '../data/datasets/nextqa/' 73 | data_set = mode 74 | sample_list_file = osp.join(dataset_dir, data_set+'.csv') 75 | print('Evaluating {}'.format(result_file)) 76 | 77 | sample_list = load_file(sample_list_file) 78 | result = load_file(result_file) 79 | accuracy_metric(sample_list, result) 80 | 81 | if mode == 'val': 82 | hard_subset = osp.join(dataset_dir, 'atp-hard-ct4.txt') 83 | sub_ids = load_file(hard_subset) 84 | accuracy_metric_sub(sample_list, result, sub_ids) 85 | 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("--mode", type=str, default='val', choices=['val','test']) 91 | parser.add_argument("--folder", type=str) 92 | args = parser.parse_args() 93 | res_dir = '../data/save_models/nextqa/'+args.folder 94 | #res_dir = '../data/models/nextqa/' 95 | mode = args.mode 96 | model_prefix = 'res' 97 | result_file = '{}/{}-{}.json'.format(res_dir, mode, model_prefix) 98 | main(result_file, mode) 99 | -------------------------------------------------------------------------------- /global_parameters.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Fill the paths 4 | DEFAULT_DATASET_DIR = "./datasets/" # where the datasets folders are 5 | DEFAULT_CKPT_DIR = "../data/models/" # where the training checkpoints and logs will be saved 6 | DEFAULT_MODEL_DIR = "../data/pretrain_models/" # where the pretrained models are 7 | SSD_DIR = "../data/feats/" # where the HowTo100M S3D features are 8 | HOWTO_FEATURES_PATH = os.path.join(SSD_DIR, "s3d_features", "howto100m_s3d_features") 9 | 10 | # Map from dataset name to folder name 11 | dataset2folder = { 12 | "ivqa": "iVQA", 13 | "msrvtt": "MSRVTT-QA", 14 | "msvd": "msvd", 15 | "activitynet": "ActivityNet-QA", 16 | "howto100m": "HowTo100M", 17 | "howtovqa": "HowToVQA69M", 18 | "how2qa": "How2QA", 19 | "nextqa": 
"nextqa" 20 | } 21 | 22 | # Datasets 23 | IVQA_PATH = os.path.join( 24 | DEFAULT_DATASET_DIR, dataset2folder["ivqa"] 25 | ) # Path where iVQA is downloaded 26 | MSRVTT_PATH = os.path.join( 27 | DEFAULT_DATASET_DIR, dataset2folder["msrvtt"] 28 | ) # Path where MSRVTT-QA is downloaded 29 | MSVD_PATH = os.path.join( 30 | DEFAULT_DATASET_DIR, dataset2folder["msvd"] 31 | ) # Path where MSVD-QA is downloaded 32 | ACT_PATH = os.path.join( 33 | DEFAULT_DATASET_DIR, dataset2folder["activitynet"] 34 | ) # Path where ActivityNet-QA is downloaded 35 | HOWTO_PATH = os.path.join( 36 | DEFAULT_DATASET_DIR, dataset2folder["howto100m"] 37 | ) # Path where HowTo100M is downloaded 38 | HOWTOVQA_PATH = os.path.join( 39 | DEFAULT_DATASET_DIR, dataset2folder["howtovqa"] 40 | ) # Path where HowToVQA69M is downloaded / generated 41 | HOW2QA_PATH = os.path.join( 42 | DEFAULT_DATASET_DIR, dataset2folder["how2qa"] 43 | ) # Path where How2QA is downloaded 44 | NEXTQA_PATH = os.path.join( 45 | DEFAULT_DATASET_DIR, dataset2folder["nextqa"] 46 | ) # Path where How2QA is downloaded 47 | 48 | 49 | # Models 50 | S3D_PATH = os.path.join( 51 | DEFAULT_MODEL_DIR, "s3d_howto100m.pth" 52 | ) # Path to S3D checkpoint 53 | S3D_DICT_PATH = os.path.join( 54 | DEFAULT_MODEL_DIR, "s3d_dict.npy" 55 | ) # Path to S3D dictionary 56 | PUNCTUATOR_PATH = os.path.join( 57 | DEFAULT_MODEL_DIR, "INTERSPEECH-T-BRNN.pcl" 58 | ) # Path to Punctuator2 checkpoint 59 | TRANSFORMERS_PATH = os.path.join( 60 | DEFAULT_MODEL_DIR, "transformers" 61 | ) # Path where the transformers checkpoints will be saved 62 | 63 | # Question-answer Generation 64 | punct_dir = os.path.join( 65 | SSD_DIR, "punct" 66 | ) # Path where the punctuated clips will be created (1 file per unique video) 67 | QG_REPO_DIR = "" # Path where the question generation repo is cloned 68 | answers_dir = os.path.join( 69 | SSD_DIR, "ans" 70 | ) # Path where the extracted answers will be saved (1 file per unique video) 71 | qas_dir = os.path.join( 72 | SSD_DIR, "qas" 73 | ) # Path where the generated question-answers will be saved (1 file per unique video) 74 | -------------------------------------------------------------------------------- /loss.py: -------------------------------------------------------------------------------- 1 | import torch as torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class Contrastive_Loss(torch.nn.Module): 6 | def __init__(self): 7 | super(Contrastive_Loss, self).__init__() 8 | self.ce_loss = torch.nn.CrossEntropyLoss() 9 | 10 | def forward(self, x, target): 11 | return self.ce_loss(x, target) 12 | 13 | 14 | class LogSoftmax(torch.nn.Module): 15 | def __init__(self, dim): 16 | super(LogSoftmax, self).__init__() 17 | self.dim = dim 18 | 19 | def forward(self, x, a): 20 | nll = -F.log_softmax(x, self.dim, _stacklevel=5) 21 | return (nll * a / a.sum(1, keepdim=True).clamp(min=1)).sum(dim=1).mean() 22 | 23 | 24 | class NCELoss(torch.nn.Module): 25 | def __init__(self, batch_size=4096): 26 | super(NCELoss, self).__init__() 27 | self.ce_loss = torch.nn.CrossEntropyLoss() 28 | 29 | def forward(self, x): 30 | batch_size = len(x) 31 | target = torch.arange(batch_size).cuda() 32 | x = torch.cat((x, x.t()), dim=1) 33 | return self.ce_loss(x, target) 34 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import numpy as np 5 | import random 6 | import os 7 | import 
os.path as osp 8 | import logging 9 | 10 | from transformers import get_cosine_schedule_with_warmup 11 | from args import get_args 12 | from model.CoVGT import VGT 13 | from loss import LogSoftmax 14 | from util import compute_a2v, load_model_by_key, save_to 15 | from dataloader.cvqa_loader import get_videoqa_loaders 16 | from train.train_covgt import train, eval 17 | 18 | 19 | 20 | def main(args): 21 | if not (os.path.isdir(args.save_dir)): 22 | os.mkdir(os.path.join(args.save_dir)) 23 | logging.basicConfig( 24 | level=logging.INFO, format="%(asctime)s %(levelname)-8s %(message)s" 25 | ) 26 | logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") 27 | rootLogger = logging.getLogger() 28 | fileHandler = logging.FileHandler(os.path.join(args.save_dir, "stdout.log"), "w+") 29 | fileHandler.setFormatter(logFormatter) 30 | rootLogger.addHandler(fileHandler) 31 | logging.info(args) 32 | 33 | 34 | if args.lan == 'BERT': 35 | from transformers import BertTokenizer 36 | tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") 37 | elif args.lan == 'RoBERTa': 38 | from transformers import RobertaTokenizerFast,RobertaTokenizer 39 | tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base") 40 | 41 | 42 | a2id, id2a, a2v = None, None, None 43 | if not args.mc: 44 | a2id, id2a, a2v = compute_a2v( 45 | vocab_path=args.vocab_path, 46 | bert_tokenizer=tokenizer, 47 | amax_words=args.amax_words, 48 | ) 49 | logging.info(f"Length of Answer Vocabulary: {len(a2id)}") 50 | 51 | # Model 52 | model = VGT( 53 | tokenizer = tokenizer, 54 | feature_dim=args.feature_dim, 55 | word_dim=args.word_dim, 56 | N=args.n_layers, 57 | d_model=args.embd_dim, 58 | d_ff=args.ff_dim, 59 | h=args.n_heads, 60 | dropout=args.dropout, 61 | T=args.max_feats, 62 | Q=args.qmax_words, 63 | vocab_size = tokenizer.vocab_size, 64 | baseline=args.baseline, 65 | bnum=args.bnum, 66 | lan=args.lan 67 | ) 68 | model.cuda() 69 | logging.info("Using {} GPUs".format(torch.cuda.device_count())) 70 | 71 | # Load pretrain path 72 | model = nn.DataParallel(model) 73 | 74 | if args.pretrain_path != "": 75 | # model.load_state_dict(torch.load(args.pretrain_path)) 76 | model.load_state_dict(load_model_by_key(model, args.pretrain_path)) 77 | logging.info(f"Loaded checkpoint {args.pretrain_path}") 78 | logging.info( 79 | f"Nb of trainable params:{sum(p.numel() for p in model.parameters() if p.requires_grad)}" 80 | ) 81 | 82 | ( 83 | train_loader, 84 | val_loader, 85 | test_loader, 86 | ) = get_videoqa_loaders(args, args.features_path, a2id, tokenizer, test_mode = args.test) 87 | 88 | if args.test: 89 | logging.info("number of test instances: {}".format(len(test_loader.dataset))) 90 | else: 91 | logging.info("number of train instances: {}".format(len(train_loader.dataset))) 92 | logging.info("number of val instances: {}".format(len(val_loader.dataset))) 93 | 94 | 95 | criterion = nn.CrossEntropyLoss(ignore_index=-1) 96 | # criterion = MultipleChoiceLoss() 97 | params_for_optimization = list(p for p in model.parameters() if p.requires_grad) 98 | optimizer = optim.Adam( 99 | params_for_optimization, lr=args.lr, weight_decay=args.weight_decay 100 | ) 101 | criterion.cuda() 102 | 103 | # Training 104 | if not args.test: 105 | scheduler = get_cosine_schedule_with_warmup( 106 | optimizer, 0, len(train_loader) * args.epochs 107 | ) 108 | logging.info( 109 | f"Set cosine schedule with {len(train_loader) * args.epochs} iterations" 110 | ) 111 | if args.pretrain_path != "": 112 | val_acc, results = eval(model, val_loader, a2v, args, 
test=False, tokenizer=tokenizer) # zero-shot VideoQA 113 | save_path = osp.join(args.save_dir, 'val-res0.json') 114 | save_to (save_path, results) 115 | best_val_acc = 0 if args.pretrain_path == "" else val_acc 116 | best_epoch = 0 117 | for epoch in range(args.epochs): 118 | train(model, train_loader, a2v, optimizer, criterion, scheduler, epoch, args, tokenizer) 119 | val_acc, results = eval(model, val_loader, a2v, args, test=False, tokenizer=tokenizer) 120 | if val_acc > best_val_acc: 121 | best_val_acc = val_acc 122 | best_epoch = epoch 123 | torch.save( 124 | model.state_dict(), os.path.join(args.save_dir, "best_model.pth") 125 | ) 126 | save_path = osp.join(args.save_dir, 'val-res.json') 127 | save_to (save_path, results) 128 | if args.dataset == 'webvid': 129 | ep_file = os.path.join(args.save_dir, f"e{epoch}.pth") 130 | torch.save(model.state_dict(), ep_file) 131 | logging.info('Save to '+ep_file) 132 | logging.info(f"Best val model at epoch {best_epoch + 1}") 133 | else: 134 | # Evaluate on test set 135 | test_acc, results = eval(model, test_loader, a2v, args, test=True, tokenizer=tokenizer) 136 | save_path = osp.join(args.save_dir, 'test-res.json') 137 | save_to(save_path, results) 138 | 139 | 140 | if __name__ == "__main__": 141 | # set random seeds 142 | args = get_args() 143 | torch.backends.cudnn.enabled = False 144 | torch.cuda.manual_seed(args.seed) 145 | torch.manual_seed(args.seed) 146 | np.random.seed(args.seed) 147 | random.seed(args.seed) 148 | torch.backends.cudnn.benchmark = True 149 | 150 | main(args) 151 | -------------------------------------------------------------------------------- /misc/CoVGT-res.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/misc/CoVGT-res.png -------------------------------------------------------------------------------- /misc/CoVGT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/misc/CoVGT.png -------------------------------------------------------------------------------- /model/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /model/EncoderVid.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Garena Online Private Limited 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
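# EncoderVid encodes per-frame object (RoI) features: the bounding-box coordinates are
# mapped to a positional embedding with 1x1 convolutions, concatenated with the RoI
# appearance features, and projected to the hidden size through a Linear + ELU layer.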
14 | 15 | import torch.nn as nn 16 | import torch 17 | 18 | class EncoderVid(nn.Module): 19 | def __init__(self, feat_dim, bbox_dim, feat_hidden, pos_hidden, input_dropout_p=0.3): 20 | 21 | super(EncoderVid, self).__init__() 22 | self.dim_feat = feat_dim 23 | self.dim_bbox = bbox_dim 24 | self.dim_hidden = feat_hidden 25 | self.input_dropout_p = input_dropout_p 26 | 27 | input_dim = feat_dim 28 | 29 | input_dim += pos_hidden 30 | self.bbox_conv = nn.Sequential( 31 | nn.Conv2d(self.dim_bbox, pos_hidden, kernel_size=1), 32 | nn.BatchNorm2d(pos_hidden), 33 | nn.ReLU(), 34 | nn.Conv2d(pos_hidden, pos_hidden, kernel_size=1), 35 | nn.BatchNorm2d(pos_hidden), 36 | nn.ReLU(), 37 | 38 | ) 39 | 40 | self.tohid = nn.Sequential( 41 | nn.Linear(feat_dim+pos_hidden, feat_hidden), 42 | nn.ELU(inplace=True)) 43 | 44 | # self.roi_conv = nn.Sequential( 45 | # nn.Conv1d(feat_dim, feat_hidden, kernel_size=3, padding=1), 46 | # nn.ELU(inplace=True) 47 | # ) 48 | 49 | # self.roi_conv = nn.Sequential( 50 | # nn.Conv2d(4, 4, kernel_size=1), 51 | # nn.BatchNorm2d(4), 52 | # nn.ReLU(), 53 | # ) 54 | 55 | 56 | def forward(self, video_o): 57 | 58 | bsize, numc, numf, numr, fdim = video_o.shape 59 | 60 | video_o = video_o.view(bsize, numc*numf, numr, fdim) 61 | roi_feat = video_o[:,:,:, :self.dim_feat] 62 | roi_bbox = video_o[:,:,:, self.dim_feat:(self.dim_feat+self.dim_bbox)] 63 | 64 | bbox_pos = self.bbox_conv(roi_bbox.permute( 65 | 0, 3, 1, 2)).permute(0, 2, 3, 1) 66 | 67 | bbox_features = torch.cat([roi_feat, bbox_pos], dim=-1) 68 | 69 | bbox_feat = self.tohid(bbox_features) 70 | 71 | return bbox_feat 72 | -------------------------------------------------------------------------------- /model/cmatt.py: -------------------------------------------------------------------------------- 1 | __author__ = "Jie Lei" 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | 8 | 9 | class CMAtten(nn.Module): 10 | 11 | def __init__(self): 12 | super(CMAtten, self).__init__() 13 | 14 | 15 | def similarity(self, s1, l1, s2, l2): 16 | """ 17 | :param s1: [B, t1, D] 18 | :param l1: [B] 19 | :param s2: [B, t2, D] 20 | :param l2: [B] 21 | :return: 22 | """ 23 | s = torch.bmm(s1, s2.transpose(1, 2)) 24 | 25 | # import ipdb; ipdb.set_trace() 26 | s_mask = s.data.new(*s.size()).fill_(1).bool() # [B, T1, T2] 27 | # Init similarity mask using lengths 28 | for i, (l_1, l_2) in enumerate(zip(l1, l2)): 29 | s_mask[i][:l_1, :l_2] = 0 30 | 31 | s_mask = Variable(s_mask) 32 | s.data.masked_fill_(s_mask.data, -float("inf")) 33 | return s 34 | 35 | @classmethod 36 | def get_u_tile(cls, s, s2): 37 | """ 38 | attended vectors of s2 for each word in s1, 39 | signify which words in s2 are most relevant to words in s1 40 | """ 41 | a_weight = F.softmax(s, dim=2) # [B, l1, l2] 42 | # remove nan from softmax on -inf 43 | # print(a_weight.shape, s2.shape) 44 | a_weight.data.masked_fill_(a_weight.data != a_weight.data, 0) 45 | # [B, l1, l2] * [B, l2, D] -> [B, l1, D] 46 | u_tile = torch.bmm(a_weight, s2) 47 | return u_tile, a_weight 48 | 49 | 50 | def forward(self, s1, l1, s2, l2): 51 | s = self.similarity(s1, l1, s2, l2) 52 | u_tile, a_weight = self.get_u_tile(s, s2) 53 | 54 | return u_tile, a_weight 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /model/graph.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torch.autograd 
import Variable 4 | import torch.nn.functional as F 5 | from torch.nn.parameter import Parameter 6 | import math 7 | 8 | class GraphConvolution(nn.Module): 9 | """ 10 | Simple GCN layer, similar to https://arxiv.org/abs/1609.02907 11 | """ 12 | 13 | def __init__(self, in_features, out_features, bias=True, skip=True): 14 | super(GraphConvolution, self).__init__() 15 | self.skip = skip 16 | self.in_features = in_features 17 | self.out_features = out_features 18 | self.weight = Parameter(torch.Tensor(in_features, out_features)) 19 | if bias: 20 | self.bias = Parameter(torch.Tensor(out_features)) 21 | else: 22 | self.register_parameter('bias', None) 23 | self.reset_parameters() 24 | 25 | def reset_parameters(self): 26 | stdv = 1. / math.sqrt(self.weight.size(1)) 27 | self.weight.data.uniform_(-stdv, stdv) 28 | if self.bias is not None: 29 | self.bias.data.uniform_(-stdv, stdv) 30 | 31 | def forward(self, input, adj): 32 | # TODO make fc more efficient via "pack_padded_sequence" 33 | 34 | support = torch.bmm(input, self.weight.unsqueeze( 35 | 0).expand(input.shape[0], -1, -1)) 36 | output = torch.bmm(adj, support) 37 | #output = SparseMM(adj)(support) 38 | if self.bias is not None: 39 | output += self.bias.unsqueeze(0).expand(input.shape[0], -1, -1) 40 | if self.skip: 41 | output += support 42 | 43 | return output 44 | 45 | def __repr__(self): 46 | return self.__class__.__name__ + ' (' \ 47 | + str(self.in_features) + ' -> ' \ 48 | + str(self.out_features) + ')' 49 | 50 | 51 | class Graph(nn.Module): 52 | 53 | def __init__(self, dim_in, dim_hidden, dim_out, num_layers, dropout): 54 | super(Graph, self).__init__() 55 | self.fc_k = nn.Linear(dim_in, dim_hidden) 56 | self.fc_q = nn.Linear(dim_in, dim_hidden) 57 | 58 | dim_hidden = dim_out if num_layers == 1 else dim_hidden 59 | self.layers = nn.ModuleList([ 60 | GraphConvolution(dim_in, dim_hidden) 61 | ]) 62 | 63 | for i in range(num_layers - 1): 64 | dim_tmp = dim_out if i == num_layers-2 else dim_hidden 65 | self.layers.append(GraphConvolution(dim_hidden, dim_tmp)) 66 | 67 | self.dropout = dropout 68 | 69 | 70 | def build_graph(self, x): 71 | batch_size, s_len = x.shape[0], x.shape[1] 72 | emb_k = self.fc_k(x) 73 | emb_q = self.fc_q(x) 74 | length = torch.tensor([s_len] * batch_size, dtype=torch.long) 75 | 76 | s = torch.bmm(emb_k, emb_q.transpose(1, 2)) 77 | 78 | s_mask = s.data.new(*s.size()).fill_(1).bool() # [B, T1, T2] 79 | # Init similarity mask using lengths 80 | for i, (l_1, l_2) in enumerate(zip(length, length)): 81 | s_mask[i][:l_1, :l_2] = 0 82 | s_mask = Variable(s_mask) 83 | s.data.masked_fill_(s_mask.data, -float("inf")) 84 | 85 | A = s #F.softmax(s, dim=2) # [B, t1, t2] 86 | 87 | # remove nan from softmax on -inf 88 | A.data.masked_fill_(A.data != A.data, 0) 89 | 90 | return A 91 | 92 | def forward(self, X, A): 93 | for layer in self.layers: 94 | X = F.relu(layer(X, A)) 95 | X = F.dropout(X, self.dropout, training=self.training) 96 | return X 97 | -------------------------------------------------------------------------------- /model/language_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers.activations import gelu 5 | from model.cmatt import CMAtten 6 | 7 | class Bert(nn.Module): 8 | """ Finetuned *BERT module """ 9 | 10 | def __init__(self, tokenizer, lan='RoBERTa'): 11 | super(Bert, self).__init__() 12 | 13 | if lan == 'BERT': 14 | from transformers import BertTokenizer, BertModel, 
BertConfig 15 | config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True) 16 | self.bert = BertModel.from_pretrained("bert-base-uncased", config=config) 17 | elif lan == 'RoBERTa': 18 | from transformers import RobertaModel, RobertaConfig, RobertaTokenizerFast 19 | config = RobertaConfig.from_pretrained("roberta-base", output_hidden_states=True) 20 | self.bert = RobertaModel.from_pretrained("roberta-base", config=config) 21 | self.tokenizer = tokenizer 22 | 23 | # for name, param in self.bert.named_parameters(): 24 | # param.requires_grad = False 25 | 26 | def forward(self, tokens): 27 | attention_mask = (tokens != self.tokenizer.pad_token_id).float() 28 | outs = self.bert(tokens, attention_mask=attention_mask) 29 | embds = outs[0] 30 | return embds, outs[1][-2] 31 | 32 | 33 | class Sentence_Maxpool(nn.Module): 34 | """ Utilitary for the answer module """ 35 | 36 | def __init__(self, word_dimension, output_dim, relu=True): 37 | super(Sentence_Maxpool, self).__init__() 38 | self.fc = nn.Linear(word_dimension, output_dim) 39 | self.out_dim = output_dim 40 | self.relu = relu 41 | 42 | def forward(self, x_in): 43 | x = self.fc(x_in) 44 | x = torch.max(x, dim=1)[0] 45 | if self.relu: 46 | x = F.relu(x) 47 | return x 48 | 49 | 50 | class FFN(nn.Module): 51 | def __init__(self, word_dim, hidden_dim, out_dim, dropout=0.3): 52 | super().__init__() 53 | activation = "gelu" 54 | self.dropout = nn.Dropout(p=dropout) 55 | self.lin1 = nn.Linear(in_features=word_dim, out_features=hidden_dim) 56 | self.lin2 = nn.Linear(in_features=hidden_dim, out_features=out_dim) 57 | assert activation in [ 58 | "relu", 59 | "gelu", 60 | ], "activation ({}) must be in ['relu', 'gelu']".format(activation) 61 | self.activation = gelu if activation == "gelu" else nn.ReLU() 62 | 63 | def forward(self, input): 64 | x = self.lin1(input) 65 | x = self.activation(x) 66 | x = self.lin2(x) 67 | x = self.dropout(x) 68 | return x 69 | 70 | class AModel(nn.Module): 71 | """ 72 | Answer embedding module 73 | """ 74 | 75 | def __init__(self, tokenizer, lan='RoBERTa', word_dim=768, out_dim=512): 76 | super(AModel, self).__init__() 77 | self.bert = Bert(tokenizer, lan=lan) 78 | self.linear_text = nn.Linear(word_dim, out_dim) 79 | 80 | # self.linear_text = FFN(word_dim, out_dim, out_dim) 81 | 82 | def forward(self, answer): 83 | 84 | if len(answer.shape) == 3: 85 | #multi-choice 86 | bs, nans, lans = answer.shape 87 | answer = answer.view(bs * nans, lans) 88 | answer, hd_state = self.bert(answer) 89 | answer = self.linear_text(answer) 90 | answer_g = answer.mean(dim=1) 91 | # answer_g = answer[:, 0, :] 92 | answer_g = answer_g.view(bs, nans, -1) 93 | else: 94 | answer, hd_state = self.bert(answer) 95 | answer = self.linear_text(answer) 96 | answer_g = answer.mean(dim=1) 97 | # answer_g = answer[:, 0, :] 98 | 99 | return answer_g, answer 100 | 101 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py==3.9.0 2 | hostlist==1.4.8 3 | huggingface-hub==0.16.4 4 | numpy==1.22.0 5 | pandas==1.4.1 6 | Pillow==9.3.0 7 | python-dateutil==2.8.2 8 | PyYAML==6.0 9 | scikit-learn==1.0.2 10 | scipy==1.8.0 11 | sentencepiece==0.1.96 12 | tokenizers==0.11.6 13 | torch==1.8.1 14 | torchvision==0.9.1 15 | tqdm==4.63.1 16 | transformers==4.17.0 17 | -------------------------------------------------------------------------------- /shells/cvid_test.sh: 
-------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=causalvid \ 3 | --dataset=causalvid \ 4 | --mc=5 \ 5 | --bnum=10 \ 6 | --test=1 \ 7 | --qmax_words=0 \ 8 | --amax_words=38 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=8 \ 13 | --mlm_prob=0 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --lan="RoBERTa" \ 19 | --save_dir='./save_models/causalvid/CoVGT/' \ 20 | --pretrain_path='./save_models/causalvid/CoVGT/best_model.pth' 21 | -------------------------------------------------------------------------------- /shells/cvid_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=causalvid \ 3 | --dataset=causalvid \ 4 | --mc=5 \ 5 | --bnum=10 \ 6 | --epochs=20 \ 7 | --lr=0.00001 \ 8 | --qmax_words=0 \ 9 | --amax_words=38 \ 10 | --max_feats=32 \ 11 | --batch_size=64 \ 12 | --batch_size_val=64 \ 13 | --num_thread_reader=8 \ 14 | --mlm_prob=0 \ 15 | --n_layers=1 \ 16 | --embd_dim=512 \ 17 | --ff_dim=1024 \ 18 | --dropout=0.3 \ 19 | --seed=666 \ 20 | --cl_loss=0 \ 21 | --lan="RoBERTa" \ 22 | --save_dir='./save_models/causalvid/CoVGT/' \ 23 | --pretrain_path='./save_models/causalvid/CoVGT/best_model.pth' -------------------------------------------------------------------------------- /shells/msrvtt_test.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=msrvtt \ 3 | --dataset=msrvtt \ 4 | --mc=0 \ 5 | --bnum=10 \ 6 | --test=1 \ 7 | --qmax_words=20 \ 8 | --amax_words=5 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=8 \ 13 | --mlm_prob=0 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --save_dir='../data/save_models/msrvtt/180k_ft/' \ 19 | --pretrain_path='../data/save_models/msrvtt/180k_ft/best_model.pth' 20 | -------------------------------------------------------------------------------- /shells/msrvtt_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=msrvtt \ 3 | --dataset=msrvtt \ 4 | --mc=0 \ 5 | --bnum=10 \ 6 | --epochs=30 \ 7 | --lr=0.00001 \ 8 | --qmax_words=20 \ 9 | --amax_words=5 \ 10 | --max_feats=32 \ 11 | --batch_size=64 \ 12 | --batch_size_val=64 \ 13 | --num_thread_reader=8 \ 14 | --mlm_prob=0 \ 15 | --n_layers=1 \ 16 | --embd_dim=512 \ 17 | --ff_dim=1024 \ 18 | --dropout=0.3 \ 19 | --save_dir='../data/save_models/msrvtt/180k+_ft/' \ 20 | --seed=666 \ 21 | --pretrain_path='../data/save_models/msrvtt/180k+_ft/best_model.pth' 22 | -------------------------------------------------------------------------------- /shells/next_test.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=nextqa \ 3 | --dataset=nextqa \ 4 | --mc=5 \ 5 | --bnum=10 \ 6 | --test=1 \ 7 | --qmax_words=0 \ 8 | --amax_words=38 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=4 \ 13 | --mlm_prob=0 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --lan="RoBERTa" \ 19 | 
--save_dir='../data/save_models/nextqa/CoVGT_FTCoWV/' \ 20 | --pretrain_path='../data/save_models/nextqa/CoVGT_FTCoWV/best_model.pth' \ 21 | #--CM_PT=1 22 | -------------------------------------------------------------------------------- /shells/next_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=nextqa \ 3 | --dataset=nextqa \ 4 | --mc=5 \ 5 | --bnum=5 \ 6 | --epochs=20 \ 7 | --lr=0.00001 \ 8 | --qmax_words=30 \ 9 | --amax_words=38 \ 10 | --max_feats=32 \ 11 | --batch_size=64 \ 12 | --batch_size_val=64 \ 13 | --num_thread_reader=8 \ 14 | --mlm_prob=0 \ 15 | --cl_loss=1 \ 16 | --n_layers=1 \ 17 | --embd_dim=512 \ 18 | --ff_dim=1024 \ 19 | --dropout=0.3 \ 20 | --seed=666 \ 21 | --lan="RoBERTa" \ 22 | --save_dir='../data/save_models/nextqa/CoVGT/' \ 23 | #--pretrain_path=../data/save_models/webvid180K/co_e1.pth \ 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /shells/tgif_ftrain.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \ 3 | --dataset=tgifqa/frameqa \ 4 | --mc=0 \ 5 | --bnum=10 \ 6 | --epochs=30 \ 7 | --lr=0.00001 \ 8 | --qmax_words=20 \ 9 | --amax_words=5 \ 10 | --max_feats=32 \ 11 | --batch_size=64 \ 12 | --batch_size_val=64 \ 13 | --num_thread_reader=8 \ 14 | --mlm_prob=0 \ 15 | --n_layers=1 \ 16 | --embd_dim=512 \ 17 | --ff_dim=1024 \ 18 | --dropout=0.3 \ 19 | --save_dir='../data/save_models/tgifqa/frameqa/VGT/' \ 20 | --seed=666 \ 21 | # --pretrain_path='../data/save_models/webvid/180K/e1.pth' -------------------------------------------------------------------------------- /shells/tgif_test.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \ 3 | --dataset=tgifqa/transition \ 4 | --mc=5 \ 5 | --test=1 \ 6 | --qmax_words=0 \ 7 | --amax_words=20 \ 8 | --max_feats=32 \ 9 | --batch_size=64 \ 10 | --batch_size_val=64 \ 11 | --num_thread_reader=8 \ 12 | --mlm_prob=0 \ 13 | --n_layers=1 \ 14 | --embd_dim=512 \ 15 | --ff_dim=1024 \ 16 | --dropout=0.3 \ 17 | --save_dir='../data/save_models/tgifqa/transition/VGT/' \ 18 | --pretrain_path='../data/save_models/tgifqa/transition/VGT/best_model.pth' 19 | -------------------------------------------------------------------------------- /shells/tgif_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=tgifqa \ 3 | --dataset=tgifqa/action \ 4 | --mc=5 \ 5 | --epochs=30 \ 6 | --lr=0.00001 \ 7 | --qmax_words=0 \ 8 | --amax_words=20 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=4 \ 13 | --mlm_prob=0 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --save_dir='../data/save_models/tgifqa/action/VGT/' \ 19 | --seed=666 \ 20 | # --pretrain_path=../data/save_models/webvid/180K/e1.pth 21 | -------------------------------------------------------------------------------- /shells/webvid_train.sh: -------------------------------------------------------------------------------- 1 | GPU=$1 2 | CUDA_VISIBLE_DEVICES=$GPU python main.py --checkpoint_dir=webvid \ 3 | --dataset=webvid \ 4 | --mc=64 \ 5 | --epochs=3 \ 6 | --lr=0.00005 \ 7 | --qmax_words=0 \ 8 | 
--amax_words=20 \ 9 | --max_feats=32 \ 10 | --batch_size=64 \ 11 | --batch_size_val=64 \ 12 | --num_thread_reader=16 \ 13 | --mlm_prob=0.15 \ 14 | --n_layers=1 \ 15 | --embd_dim=512 \ 16 | --ff_dim=1024 \ 17 | --dropout=0.3 \ 18 | --save_dir='./save_models/webvid/025/' \ 19 | --seed=666 \ 20 | 21 | -------------------------------------------------------------------------------- /tools/__pycache__/object_align.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/tools/__pycache__/object_align.cpython-38.pyc -------------------------------------------------------------------------------- /tools/bbox_visualizer.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | 4 | def draw_rectangle(img, 5 | bbox, 6 | bbox_color=(255, 255, 255), 7 | thickness=3, 8 | is_opaque=False, 9 | alpha=0.5): 10 | """Draws the rectangle around the object 11 | 12 | Parameters 13 | ---------- 14 | img : ndarray 15 | the actual image 16 | bbox : list 17 | a list containing x_min, y_min, x_max and y_max of the rectangle positions 18 | bbox_color : tuple, optional 19 | the color of the box, by default (255,255,255) 20 | thickness : int, optional 21 | thickness of the outline of the box, by default 3 22 | is_opaque : bool, optional 23 | if False, draws a solid rectangular outline. Else, a filled rectangle which is semi transparent, by default False 24 | alpha : float, optional 25 | strength of the opacity, by default 0.5 26 | 27 | Returns 28 | ------- 29 | ndarray 30 | the image with the bounding box drawn 31 | """ 32 | 33 | output = img.copy() 34 | if not is_opaque: 35 | cv2.rectangle(output, (bbox[0], bbox[1]), (bbox[2], bbox[3]), 36 | bbox_color, thickness) 37 | else: 38 | overlay = img.copy() 39 | 40 | cv2.rectangle(overlay, (bbox[0], bbox[1]), (bbox[2], bbox[3]), 41 | bbox_color, -1) 42 | cv2.addWeighted(overlay, alpha, output, 1 - alpha, 0, output) 43 | 44 | return output 45 | 46 | 47 | def add_label(img, 48 | label, 49 | bbox, 50 | draw_bg=True, 51 | text_bg_color=(255, 255, 255), 52 | text_color=(0, 0, 0), 53 | top=True): 54 | """adds label, inside or outside the rectangle 55 | 56 | Parameters 57 | ---------- 58 | img : ndarray 59 | the image on which the label is to be written, preferably the image with the rectangular bounding box drawn 60 | label : str 61 | the text (label) to be written 62 | bbox : list 63 | a list containing x_min, y_min, x_max and y_max of the rectangle positions 64 | draw_bg : bool, optional 65 | if True, draws the background of the text, else just the text is written, by default True 66 | text_bg_color : tuple, optional 67 | the background color of the label that is filled, by default (255, 255, 255) 68 | text_color : tuple, optional 69 | color of the text (label) to be written, by default (0, 0, 0) 70 | top : bool, optional 71 | if True, writes the label on top of the bounding box, else inside, by default True 72 | 73 | Returns 74 | ------- 75 | ndarray 76 | the image with the label written 77 | """ 78 | 79 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][0] 80 | 81 | if top: 82 | label_bg = [bbox[0], bbox[1], bbox[0] + text_width, bbox[1] - 30] 83 | if draw_bg: 84 | cv2.rectangle(img, (label_bg[0], label_bg[1]), 85 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1) 86 | cv2.putText(img, label, (bbox[0] + 5, bbox[1] - 5), 87 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2) 88 | 89 | 
else: 90 | label_bg = [bbox[0], bbox[1], bbox[0] + text_width, bbox[1] + 30] 91 | if draw_bg: 92 | cv2.rectangle(img, (label_bg[0], label_bg[1]), 93 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1) 94 | cv2.putText(img, label, (bbox[0] + 5, bbox[1] - 5 + 30), 95 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2) 96 | 97 | return img 98 | 99 | 100 | def add_T_label(img, 101 | label, 102 | bbox, 103 | draw_bg=True, 104 | text_bg_color=(255, 255, 255), 105 | text_color=(0, 0, 0)): 106 | """adds a T label to the rectangle, originating from the top of the rectangle 107 | 108 | Parameters 109 | ---------- 110 | img : ndarray 111 | the image on which the T label is to be written/drawn, preferably the image with the rectangular bounding box drawn 112 | label : str 113 | the text (label) to be written 114 | bbox : list 115 | a list containing x_min, y_min, x_max and y_max of the rectangle positions 116 | draw_bg : bool, optional 117 | if True, draws the background of the text, else just the text is written, by default True 118 | text_bg_color : tuple, optional 119 | the background color of the label that is filled, by default (255, 255, 255) 120 | text_color : tuple, optional 121 | color of the text (label) to be written, by default (0, 0, 0) 122 | 123 | Returns 124 | ------- 125 | ndarray 126 | the image with the T label drawn/written 127 | """ 128 | 129 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][0] 130 | text_height = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)[0][1] 131 | 132 | # draw vertical line 133 | x_center = (bbox[0] + bbox[2]) // 2 134 | y_top = bbox[1] - 50 135 | cv2.line(img, (x_center, bbox[1]), (x_center, y_top), text_bg_color, 3) 136 | 137 | # draw rectangle with label 138 | y_bottom = y_top 139 | y_top = y_bottom - text_height - 5 140 | x_left = x_center - (text_width // 2) - 5 141 | x_right = x_center + (text_width // 2) + 5 142 | if draw_bg: 143 | cv2.rectangle(img, (x_left, y_top - 3), (x_right, y_bottom), 144 | text_bg_color, -1) 145 | cv2.putText(img, label, (x_left + 5, y_bottom - 7), 146 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2) 147 | 148 | return img 149 | 150 | 151 | def draw_flag_with_label(img, 152 | label, 153 | bbox, 154 | write_label=True, 155 | line_color=(255, 255, 255), 156 | text_bg_color=(255, 255, 255), 157 | text_color=(0, 0, 0)): 158 | """draws a pole from the middle of the object that is to be labeled and adds the label to the flag 159 | 160 | Parameters 161 | ---------- 162 | img : ndarray 163 | the image on which the flag is to be drawn 164 | label : str 165 | label that is written inside the flag 166 | bbox : list 167 | a list containing x_min, y_min, x_max and y_max of the rectangle positions 168 | write_label : bool, optional 169 | if True, writes the label, otherwise, it's just a vertical line, by default True 170 | line_color : tuple, optional 171 | the color of the pole of the flag, by default (255, 255, 255) 172 | text_bg_color : tuple, optional 173 | the background color of the label that is filled, by default (255, 255, 255) 174 | text_color : tuple, optional 175 | color of the text (label) to be written, by default (0, 0, 0) 176 | 177 | Returns 178 | ------- 179 | ndarray 180 | the image with flag drawn and the label written in the flag 181 | """ 182 | 183 | # draw vertical line 184 | 185 | x_center = (bbox[0] + bbox[2]) // 2 186 | y_bottom = int((bbox[1] * .75 + bbox[3] * .25)) 187 | y_top = bbox[1] - (y_bottom - bbox[1]) 188 | 189 | start_point = (x_center, y_top) 190 | end_point = (x_center, y_bottom) 
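    # The pole spans from a quarter of the box height below the top edge
    # (y_bottom, just inside the box) to the same distance above the top edge
    # (y_top), centred horizontally on the box.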
191 | 192 | cv2.line(img, start_point, end_point, line_color, 3) 193 | 194 | # write label 195 | 196 | if write_label: 197 | text_width = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 1, 198 | 2)[0][0] 199 | label_bg = [ 200 | start_point[0], start_point[1], start_point[0] + text_width, 201 | start_point[1] + 30 202 | ] 203 | cv2.rectangle(img, (label_bg[0], label_bg[1]), 204 | (label_bg[2] + 5, label_bg[3]), text_bg_color, -1) 205 | cv2.putText(img, label, (start_point[0] + 5, start_point[1] - 5 + 30), 206 | cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2) 207 | 208 | return img 209 | 210 | 211 | # THE FOLLOWING ARE OPTIONAL FUNCTIONS THAT CAN BE USED FOR DRAWING OR LABELLING MULTIPLE OBJECTS IN THE SAME 212 | # IMAGE. IN ORDER TO HAVE FULL CONTROL OF YOUR VISUALIZATIONS IT IS ADVISABLE TO USE THE ABOVE FUNCTIONS IN FOR LOOPS 213 | # INSTEAD OF THE FUNCTIONS BELOW 214 | 215 | 216 | def draw_multiple_rectangles(img, 217 | bboxes, 218 | bbox_color=(255, 255, 255), 219 | thickness=2, 220 | is_opaque=False, 221 | alpha=0.5): 222 | """draws multiple rectangles 223 | 224 | img : ndarray 225 | the actual image 226 | bboxes : list 227 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions 228 | bbox_color : tuple, optional 229 | the color of the boxes, by default (255,255,255) 230 | thickness : int, optional 231 | thickness of the outline of the boxes, by default 3 232 | is_opaque : bool, optional 233 | if False, draws solid rectangular outlines for rectangles. Else, filled rectangles which are semi transparent, by default False 234 | alpha : float, optional 235 | strength of the opacity, by default 0.5 236 | 237 | Returns 238 | ------- 239 | ndarray 240 | the image with the bounding boxes drawn 241 | """ 242 | 243 | for bid, bbox in enumerate(bboxes): 244 | img = draw_rectangle(img, bbox, bbox_color[bid], thickness, is_opaque, 245 | alpha) 246 | return img 247 | 248 | 249 | def add_multiple_labels(img, 250 | labels, 251 | bboxes, 252 | draw_bg=True, 253 | text_bg_color=(255, 255, 255), 254 | text_color=(0, 0, 0), 255 | top=True): 256 | """add labels, inside or outside the rectangles 257 | 258 | Parameters 259 | ---------- 260 | img : ndarray 261 | the image on which the labels are to be written, preferably the image with the rectangular bounding boxes drawn 262 | labels : list 263 | a list of string of the texts (labels) to be written 264 | bboxes : list 265 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions 266 | draw_bg : bool, optional 267 | if True, draws the background of the texts, else just the texts are written, by default True 268 | text_bg_color : tuple, optional 269 | the background color of the labels that are filled, by default (255, 255, 255) 270 | text_color : tuple, optional 271 | color of the texts (labels) to be written, by default (0, 0, 0) 272 | top : bool, optional 273 | if True, writes the labels on top of the bounding boxes, else inside, by default True 274 | 275 | Returns 276 | ------- 277 | ndarray 278 | the image with the labels written 279 | """ 280 | 281 | for label, bbox in zip(labels, bboxes): 282 | img = add_label(img, label, bbox, draw_bg, text_bg_color, text_color, 283 | top) 284 | 285 | return img 286 | 287 | 288 | def add_multiple_T_labels(img, 289 | labels, 290 | bboxes, 291 | draw_bg=True, 292 | text_bg_color=(255, 255, 255), 293 | text_color=(0, 0, 0)): 294 | """adds T labels to the rectangles, each originating from the top of the rectangle 295 | 296 | 
Parameters 297 | ---------- 298 | img : ndarray 299 | the image on which the T labels are to be written/drawn, preferably the image with the rectangular bounding boxes drawn 300 | labels : list 301 | the texts (labels) to be written 302 | bboxes : list 303 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions 304 | draw_bg : bool, optional 305 | if True, draws the background of the texts, else just the texts are written, by default True 306 | text_bg_color : tuple, optional 307 | the background color of the labels that are filled, by default (255, 255, 255) 308 | text_color : tuple, optional 309 | color of the texts (labels) to be written, by default (0, 0, 0) 310 | 311 | Returns 312 | ------- 313 | ndarray 314 | the image with the T labels drawn/written 315 | """ 316 | 317 | for label, bbox in zip(labels, bboxes): 318 | add_T_label(img, label, bbox, draw_bg, text_bg_color, text_color) 319 | 320 | return img 321 | 322 | 323 | def draw_multiple_flags_with_labels(img, 324 | labels, 325 | bboxes, 326 | write_label=True, 327 | line_color=(255, 255, 255), 328 | text_bg_color=(255, 255, 255), 329 | text_color=(0, 0, 0)): 330 | """draws poles from the middle of the objects that are to be labeled and adds the labels to the flags 331 | 332 | Parameters 333 | ---------- 334 | img : ndarray 335 | the image on which the flags are to be drawn 336 | labels : list 337 | labels that are written inside the flags 338 | bbox : list 339 | a list of lists, each inner list containing x_min, y_min, x_max and y_max of the rectangle positions 340 | write_label : bool, optional 341 | if True, writes the labels, otherwise, it's just a vertical line for each object, by default True 342 | line_color : tuple, optional 343 | the color of the pole of the flags, by default (255, 255, 255) 344 | text_bg_color : tuple, optional 345 | the background color of the labels that are filled, by default (255, 255, 255) 346 | text_color : tuple, optional 347 | color of the texts (labels) to be written, by default (0, 0, 0) 348 | 349 | Returns 350 | ------- 351 | ndarray 352 | the image with flags drawn and the labels written in the flags 353 | """ 354 | 355 | for label, bbox in zip(labels, bboxes): 356 | img = draw_flag_with_label(img, label, bbox, write_label, line_color, 357 | text_bg_color, text_color) 358 | return img 359 | -------------------------------------------------------------------------------- /tools/colors.txt: -------------------------------------------------------------------------------- 1 | 255 0 0 2 | 255 255 0 3 | 0 255 0 4 | 255 153 18 5 | 0 255 255 6 | 63 211 144 7 | 240 141 163 8 | 149 139 206 9 | 166 31 247 10 | 210 148 204 11 | 196 142 86 12 | 138 48 98 13 | 85 16 165 14 | 84 103 158 15 | 186 202 87 16 | 149 52 56 17 | 169 184 132 18 | 156 176 226 19 | 233 214 139 20 | 35 124 145 21 | 10 116 109 22 | 89 231 101 23 | 198 145 242 24 | 113 43 121 25 | 49 61 103 26 | 196 239 149 27 | 227 80 71 28 | 70 3 76 29 | 143 43 181 30 | 159 31 2 31 | 171 53 200 32 | 233 49 105 33 | 75 127 208 34 | 221 246 66 35 | 238 11 216 36 | 101 36 178 37 | 198 5 97 38 | 42 179 23 39 | 124 62 186 40 | 25 90 250 41 | 180 50 78 42 | 40 107 146 43 | 147 80 68 44 | 110 147 182 45 | 141 199 99 46 | 183 74 21 47 | 6 157 170 48 | 133 168 215 49 | 18 51 5 50 | 136 196 212 51 | 224 237 188 52 | 172 61 214 53 | -------------------------------------------------------------------------------- /tools/datautils/msrvtt_qa.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | from datautils import utils 3 | import nltk 4 | from collections import Counter 5 | 6 | import pickle 7 | import numpy as np 8 | 9 | 10 | def load_video_paths(args): 11 | ''' Load a list of (path,image_id tuples).''' 12 | video_paths = [] 13 | modes = ['train', 'val', 'test'] 14 | for mode in modes: 15 | with open(args.annotation_file.format(mode), 'r') as anno_file: 16 | instances = json.load(anno_file) 17 | video_ids = [instance['video_id'] for instance in instances] 18 | video_ids = set(video_ids) 19 | if mode in ['train', 'val']: 20 | for video_id in video_ids: 21 | video_paths.append((args.video_dir + 'videos/video{}.mp4'.format(video_id), video_id)) 22 | else: 23 | for video_id in video_ids: 24 | video_paths.append((args.video_dir + 'videos/video{}.mp4'.format(video_id), video_id)) 25 | 26 | return video_paths 27 | 28 | 29 | def process_questions(args): 30 | ''' Encode question tokens''' 31 | print('Loading dataset') 32 | with open(args.annotation_file, 'r') as dataset_file: 33 | instances = json.load(dataset_file) 34 | 35 | # Either create the vocab or load it from disk 36 | if args.mode in ['train']: 37 | print('Building vocab') 38 | answer_cnt = {} 39 | for instance in instances: 40 | answer = instance['answer'] 41 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 42 | 43 | answer_token_to_idx = {'': 0, '': 1} 44 | answer_counter = Counter(answer_cnt) 45 | frequent_answers = answer_counter.most_common(args.answer_top) 46 | total_ans = sum(item[1] for item in answer_counter.items()) 47 | total_freq_ans = sum(item[1] for item in frequent_answers) 48 | print("Number of unique answers:", len(answer_counter)) 49 | print("Total number of answers:", total_ans) 50 | print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans)) 51 | 52 | for token, cnt in Counter(answer_cnt).most_common(args.answer_top): 53 | answer_token_to_idx[token] = len(answer_token_to_idx) 54 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 55 | 56 | question_token_to_idx = {'': 0, '': 1} 57 | for i, instance in enumerate(instances): 58 | question = instance['question'].lower()[:-1] 59 | for token in nltk.word_tokenize(question): 60 | if token not in question_token_to_idx: 61 | question_token_to_idx[token] = len(question_token_to_idx) 62 | print('Get question_token_to_idx') 63 | print(len(question_token_to_idx)) 64 | 65 | vocab = { 66 | 'question_token_to_idx': question_token_to_idx, 67 | 'answer_token_to_idx': answer_token_to_idx, 68 | 'question_answer_token_to_idx': {'': 0, '': 1} 69 | } 70 | 71 | print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset)) 72 | with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f: 73 | json.dump(vocab, f, indent=4) 74 | else: 75 | print('Loading vocab') 76 | with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f: 77 | vocab = json.load(f) 78 | 79 | # Encode all questions 80 | print('Encoding dataset') 81 | questions_encoded = [] 82 | questions_len = [] 83 | question_ids = [] 84 | video_ids_tbw = [] 85 | video_names_tbw = [] 86 | all_answers = [] 87 | for idx, instance in enumerate(instances): 88 | question = instance['question'].lower()[:-1] 89 | question_tokens = nltk.word_tokenize(question) 90 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) 91 | questions_encoded.append(question_encoded) 92 | 
questions_len.append(len(question_encoded)) 93 | question_ids.append(idx) 94 | im_name = instance['video_id'] 95 | video_ids_tbw.append(im_name) 96 | video_names_tbw.append(im_name) 97 | 98 | if instance['answer'] in vocab['answer_token_to_idx']: 99 | answer = vocab['answer_token_to_idx'][instance['answer']] 100 | elif args.mode in ['train']: 101 | answer = 0 102 | elif args.mode in ['val', 'test']: 103 | answer = 1 104 | 105 | all_answers.append(answer) 106 | max_question_length = max(len(x) for x in questions_encoded) 107 | for qe in questions_encoded: 108 | while len(qe) < max_question_length: 109 | qe.append(vocab['question_token_to_idx']['']) 110 | 111 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 112 | questions_len = np.asarray(questions_len, dtype=np.int32) 113 | print(questions_encoded.shape) 114 | 115 | glove_matrix = None 116 | if args.mode == 'train': 117 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} 118 | print("Load glove from %s" % args.glove_pt) 119 | glove = pickle.load(open(args.glove_pt, 'rb')) 120 | dim_word = glove['the'].shape[0] 121 | glove_matrix = [] 122 | for i in range(len(token_itow)): 123 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 124 | glove_matrix.append(vector) 125 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 126 | print(glove_matrix.shape) 127 | 128 | print('Writing', args.output_pt.format(args.dataset, args.dataset, args.mode)) 129 | obj = { 130 | 'questions': questions_encoded, 131 | 'questions_len': questions_len, 132 | 'question_id': question_ids, 133 | 'video_ids': np.asarray(video_ids_tbw), 134 | 'video_names': np.array(video_names_tbw), 135 | 'answers': all_answers, 136 | 'glove': glove_matrix, 137 | } 138 | with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f: 139 | pickle.dump(obj, f) 140 | -------------------------------------------------------------------------------- /tools/datautils/msvd_qa.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datautils import utils 3 | import nltk 4 | from collections import Counter 5 | 6 | import pickle 7 | import numpy as np 8 | 9 | 10 | def load_video_paths(args): 11 | ''' Load a list of (path,image_id tuples).''' 12 | video_paths = [] 13 | video_ids = [] 14 | modes = ['train', 'val', 'test'] 15 | for mode in modes: 16 | with open(args.annotation_file.format(mode), 'r') as anno_file: 17 | instances = json.load(anno_file) 18 | [video_ids.append(instance['video_id']) for instance in instances] 19 | video_ids = set(video_ids) 20 | with open(args.video_name_mapping, 'r') as mapping: 21 | mapping_pairs = mapping.read().split('\n') 22 | mapping_dict = {} 23 | for idx in range(len(mapping_pairs)): 24 | cur_pair = mapping_pairs[idx].split(' ') 25 | mapping_dict[cur_pair[1]] = cur_pair[0] 26 | for video_id in video_ids: 27 | video_paths.append((args.video_dir + 'YouTubeClips/{}.avi'.format(mapping_dict['vid' + str(video_id)]), video_id)) 28 | return video_paths 29 | 30 | 31 | def process_questions(args): 32 | ''' Encode question tokens''' 33 | print('Loading dataset') 34 | with open(args.annotation_file, 'r') as dataset_file: 35 | instances = json.load(dataset_file) 36 | 37 | # Either create the vocab or load it from disk 38 | if args.mode in ['train']: 39 | print('Building vocab') 40 | answer_cnt = {} 41 | for instance in instances: 42 | answer = instance['answer'] 43 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 44 | 45 | answer_token_to_idx = 
{'': 0, '': 1} 46 | answer_counter = Counter(answer_cnt) 47 | frequent_answers = answer_counter.most_common(args.answer_top) 48 | total_ans = sum(item[1] for item in answer_counter.items()) 49 | total_freq_ans = sum(item[1] for item in frequent_answers) 50 | print("Number of unique answers:", len(answer_counter)) 51 | print("Total number of answers:", total_ans) 52 | print("Top %i answers account for %f%%" % (len(frequent_answers), total_freq_ans * 100.0 / total_ans)) 53 | 54 | for token, cnt in Counter(answer_cnt).most_common(args.answer_top): 55 | answer_token_to_idx[token] = len(answer_token_to_idx) 56 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 57 | 58 | question_token_to_idx = {'': 0, '': 1} 59 | for i, instance in enumerate(instances): 60 | question = instance['question'].lower()[:-1] 61 | for token in nltk.word_tokenize(question): 62 | if token not in question_token_to_idx: 63 | question_token_to_idx[token] = len(question_token_to_idx) 64 | print('Get question_token_to_idx') 65 | print(len(question_token_to_idx)) 66 | 67 | vocab = { 68 | 'question_token_to_idx': question_token_to_idx, 69 | 'answer_token_to_idx': answer_token_to_idx, 70 | 'question_answer_token_to_idx': {'': 0, '': 1} 71 | } 72 | 73 | print('Write into %s' % args.vocab_json.format(args.dataset, args.dataset)) 74 | with open(args.vocab_json.format(args.dataset, args.dataset), 'w') as f: 75 | json.dump(vocab, f, indent=4) 76 | else: 77 | print('Loading vocab') 78 | with open(args.vocab_json.format(args.dataset, args.dataset), 'r') as f: 79 | vocab = json.load(f) 80 | 81 | # Encode all questions 82 | print('Encoding dataset') 83 | questions_encoded = [] 84 | questions_len = [] 85 | question_ids = [] 86 | video_ids_tbw = [] 87 | video_names_tbw = [] 88 | all_answers = [] 89 | for idx, instance in enumerate(instances): 90 | question = instance['question'].lower()[:-1] 91 | question_tokens = nltk.word_tokenize(question) 92 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) 93 | questions_encoded.append(question_encoded) 94 | questions_len.append(len(question_encoded)) 95 | question_ids.append(idx) 96 | im_name = instance['video_id'] 97 | video_ids_tbw.append(im_name) 98 | video_names_tbw.append(im_name) 99 | 100 | if instance['answer'] in vocab['answer_token_to_idx']: 101 | answer = vocab['answer_token_to_idx'][instance['answer']] 102 | elif args.mode in ['train']: 103 | answer = 0 104 | elif args.mode in ['val', 'test']: 105 | answer = 1 106 | 107 | all_answers.append(answer) 108 | max_question_length = max(len(x) for x in questions_encoded) 109 | for qe in questions_encoded: 110 | while len(qe) < max_question_length: 111 | qe.append(vocab['question_token_to_idx']['']) 112 | 113 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 114 | questions_len = np.asarray(questions_len, dtype=np.int32) 115 | print(questions_encoded.shape) 116 | 117 | glove_matrix = None 118 | if args.mode == 'train': 119 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} 120 | print("Load glove from %s" % args.glove_pt) 121 | glove = pickle.load(open(args.glove_pt, 'rb')) 122 | dim_word = glove['the'].shape[0] 123 | glove_matrix = [] 124 | for i in range(len(token_itow)): 125 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 126 | glove_matrix.append(vector) 127 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 128 | print(glove_matrix.shape) 129 | 130 | print('Writing', args.output_pt.format(args.dataset, 
args.dataset, args.mode)) 131 | obj = { 132 | 'questions': questions_encoded, 133 | 'questions_len': questions_len, 134 | 'question_id': question_ids, 135 | 'video_ids': np.asarray(video_ids_tbw), 136 | 'video_names': np.array(video_names_tbw), 137 | 'answers': all_answers, 138 | 'glove': glove_matrix, 139 | } 140 | with open(args.output_pt.format(args.dataset, args.dataset, args.mode), 'wb') as f: 141 | pickle.dump(obj, f) 142 | -------------------------------------------------------------------------------- /tools/datautils/nextqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import json 4 | from datautils import utils 5 | import nltk 6 | import os.path as osp 7 | import pickle 8 | import numpy as np 9 | 10 | 11 | def load_video_paths(args): 12 | ''' Load a list of (path,image_id tuples).''' 13 | input_paths = [] 14 | annotation = pd.read_csv(args.annotation_file.format(args.question_type), delimiter='\t') 15 | gif_names = list(annotation['gif_name']) 16 | keys = list(annotation['key']) 17 | print("Number of questions: {}".format(len(gif_names))) 18 | for idx, gif in enumerate(gif_names): 19 | gif_abs_path = os.path.join(args.video_dir, ''.join([gif, '.gif'])) 20 | input_paths.append((gif_abs_path, keys[idx])) 21 | input_paths = list(set(input_paths)) 22 | print("Number of unique videos: {}".format(len(input_paths))) 23 | 24 | return input_paths 25 | 26 | 27 | def openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='train'): 28 | ''' Encode question tokens''' 29 | print('Encoding dataset') 30 | questions_encoded = [] 31 | questions_len = [] 32 | video_ids_tbw = [] 33 | video_names_tbw = [] 34 | all_answers = [] 35 | question_ids = [] 36 | for idx, question in enumerate(questions): 37 | question = question.lower()[:-1] 38 | question_tokens = nltk.word_tokenize(question) 39 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) 40 | questions_encoded.append(question_encoded) 41 | questions_len.append(len(question_encoded)) 42 | question_ids.append(idx) 43 | video_names_tbw.append(video_names[idx]) 44 | video_ids_tbw.append(video_ids[idx]) 45 | 46 | if args.question_type == "frameqa": 47 | answer = answers[idx] 48 | if answer in vocab['answer_token_to_idx']: 49 | answer = vocab['answer_token_to_idx'][answer] 50 | elif mode in ['train']: 51 | answer = 0 52 | elif mode in ['val', 'test']: 53 | answer = 1 54 | else: 55 | answer = max(int(answers[idx]), 1) 56 | all_answers.append(answer) 57 | 58 | # Pad encoded questions 59 | max_question_length = max(len(x) for x in questions_encoded) 60 | for qe in questions_encoded: 61 | while len(qe) < max_question_length: 62 | qe.append(vocab['question_token_to_idx']['']) 63 | 64 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 65 | questions_len = np.asarray(questions_len, dtype=np.int32) 66 | print(questions_encoded.shape) 67 | 68 | glove_matrix = None 69 | if mode == 'train': 70 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} 71 | print("Load glove from %s" % args.glove_pt) 72 | glove = pickle.load(open(args.glove_pt, 'rb')) 73 | dim_word = glove['the'].shape[0] 74 | glove_matrix = [] 75 | for i in range(len(token_itow)): 76 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 77 | glove_matrix.append(vector) 78 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 79 | print(glove_matrix.shape) 80 | 81 | print('Writing ', 
args.output_pt.format(args.question_type, args.question_type, mode)) 82 | obj = { 83 | 'questions': questions_encoded, 84 | 'questions_len': questions_len, 85 | 'question_id': question_ids, 86 | 'video_ids': np.asarray(video_ids_tbw), 87 | 'video_names': np.array(video_names_tbw), 88 | 'answers': all_answers, 89 | 'glove': glove_matrix, 90 | } 91 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f: 92 | pickle.dump(obj, f) 93 | 94 | def multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers, ans_candidates, mode='train'): 95 | # Encode all questions 96 | print('Encoding dataset') 97 | questions_encoded = [] 98 | questions_len = [] 99 | question_ids = qns_ids 100 | all_answer_cands_encoded = [] 101 | all_answer_cands_len = [] 102 | video_ids_tbw = [] 103 | video_names_tbw = [] 104 | correct_answers = [] 105 | for idx, question in enumerate(questions): 106 | 107 | question = question.lower() 108 | question_tokens = nltk.word_tokenize(question) 109 | question_encoded = utils.encode(question_tokens, vocab['question_answer_token_to_idx'], allow_unk=True) 110 | questions_encoded.append(question_encoded) 111 | questions_len.append(len(question_encoded)) 112 | # question_ids.append(idx) 113 | video_names_tbw.append(video_names[idx]) 114 | video_ids_tbw.append(video_ids[idx]) 115 | # grounthtruth 116 | answer = int(answers[idx]) 117 | correct_answers.append(answer) 118 | # answer candidates 119 | candidates = ans_candidates[idx] 120 | candidates_encoded = [] 121 | candidates_len = [] 122 | for ans in candidates: 123 | 124 | ans = ans.lower() 125 | ans_tokens = nltk.word_tokenize(ans) 126 | cand_encoded = utils.encode(ans_tokens, vocab['question_answer_token_to_idx'], allow_unk=True) 127 | candidates_encoded.append(cand_encoded) 128 | candidates_len.append(len(cand_encoded)) 129 | all_answer_cands_encoded.append(candidates_encoded) 130 | all_answer_cands_len.append(candidates_len) 131 | 132 | # Pad encoded questions 133 | max_question_length = max(len(x) for x in questions_encoded) 134 | for qe in questions_encoded: 135 | while len(qe) < max_question_length: 136 | qe.append(vocab['question_answer_token_to_idx']['']) 137 | 138 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 139 | questions_len = np.asarray(questions_len, dtype=np.int32) 140 | print(questions_encoded.shape) 141 | 142 | # Pad encoded answer candidates 143 | max_answer_cand_length = max(max(len(x) for x in candidate) for candidate in all_answer_cands_encoded) 144 | for ans_cands in all_answer_cands_encoded: 145 | for ans in ans_cands: 146 | while len(ans) < max_answer_cand_length: 147 | ans.append(vocab['question_answer_token_to_idx']['']) 148 | all_answer_cands_encoded = np.asarray(all_answer_cands_encoded, dtype=np.int32) 149 | all_answer_cands_len = np.asarray(all_answer_cands_len, dtype=np.int32) 150 | print(all_answer_cands_encoded.shape) 151 | 152 | glove_matrix = None 153 | # if mode in ['train']: 154 | # token_itow = {i: w for w, i in vocab['question_answer_token_to_idx'].items()} 155 | # print("Load glove from %s" % args.glove_pt) 156 | # glove = pickle.load(open(args.glove_pt, 'rb')) 157 | # dim_word = glove['the'].shape[0] 158 | # glove_matrix = [] 159 | # for i in range(len(token_itow)): 160 | # vector = glove.get(token_itow[i], np.zeros((dim_word,))) 161 | # glove_matrix.append(vector) 162 | # glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 163 | # print(glove_matrix.shape) 164 | 165 | print('Writing ', 
args.output_pt.format(mode)) 166 | obj = { 167 | 'questions': questions_encoded, 168 | 'questions_len': questions_len, 169 | 'question_id': question_ids, 170 | 'video_ids': np.asarray(video_ids_tbw), 171 | 'video_names': np.array(video_names_tbw), 172 | 'ans_candidates': all_answer_cands_encoded, 173 | 'ans_candidates_len': all_answer_cands_len, 174 | 'answers': correct_answers, 175 | 'glove': glove_matrix, 176 | } 177 | with open(args.output_pt.format(mode), 'wb') as f: 178 | pickle.dump(obj, f) 179 | 180 | def process_questions_openended(args): 181 | print('Loading dataset') 182 | if args.mode in ["train"]: 183 | csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t') 184 | else: 185 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t') 186 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))] 187 | questions = list(csv_data['question']) 188 | answers = list(csv_data['answer']) 189 | video_names = list(csv_data['gif_name']) 190 | video_ids = list(csv_data['key']) 191 | 192 | print('number of questions: %s' % len(questions)) 193 | # Either create the vocab or load it from disk 194 | if args.mode in ['train']: 195 | print('Building vocab') 196 | answer_cnt = {} 197 | 198 | if args.question_type == "frameqa": 199 | for i, answer in enumerate(answers): 200 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 201 | 202 | answer_token_to_idx = {'': 0} 203 | for token in answer_cnt: 204 | answer_token_to_idx[token] = len(answer_token_to_idx) 205 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 206 | elif args.question_type == 'count': 207 | answer_token_to_idx = {'': 0} 208 | 209 | question_token_to_idx = {'': 0, '': 1} 210 | for i, q in enumerate(questions): 211 | question = q.lower()[:-1] 212 | for token in nltk.word_tokenize(question): 213 | if token not in question_token_to_idx: 214 | question_token_to_idx[token] = len(question_token_to_idx) 215 | print('Get question_token_to_idx') 216 | print(len(question_token_to_idx)) 217 | 218 | vocab = { 219 | 'question_token_to_idx': question_token_to_idx, 220 | 'answer_token_to_idx': answer_token_to_idx, 221 | 'question_answer_token_to_idx': {'': 0, '': 1} 222 | } 223 | 224 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type)) 225 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f: 226 | json.dump(vocab, f, indent=4) 227 | 228 | # split 10% of questions for evaluation 229 | split = int(0.9 * len(questions)) 230 | train_questions = questions[:split] 231 | train_answers = answers[:split] 232 | train_video_names = video_names[:split] 233 | train_video_ids = video_ids[:split] 234 | 235 | val_questions = questions[split:] 236 | val_answers = answers[split:] 237 | val_video_names = video_names[split:] 238 | val_video_ids = video_ids[split:] 239 | 240 | openeded_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, mode='train') 241 | openeded_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, mode='val') 242 | else: 243 | print('Loading vocab') 244 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f: 245 | vocab = json.load(f) 246 | openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='test') 247 | 248 | 249 | def process_questions_mulchoices(args): 250 | print('Loading dataset') 251 | # if args.mode in ["train", "val"]: 252 | # 
csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter=',') 253 | # else: 254 | # csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter=',') 255 | 256 | if args.mode == 'all': 257 | csv_data = pd.read_csv(args.annotation_file.format(args.mode), delimiter=',').astype('string') 258 | else: 259 | csv_data = pd.read_csv(args.annotation_file.format(args.mode), delimiter=',').astype('string') 260 | 261 | # csv_data = csv_data.iloc[np.random.permutation(len(csv_data))] 262 | questions = list(csv_data['question']) 263 | answers = list(csv_data['answer']) 264 | video_names = list(csv_data['video']) 265 | video_ids = list(csv_data['video']) 266 | qns_ids = list(csv_data['qid']) 267 | qns_ids = [vname+'_'+qid for vname, qid in zip(video_names, qns_ids)] 268 | ans_candidates = np.asarray([csv_data['a0'], csv_data['a1'], csv_data['a2'], csv_data['a3'], csv_data['a4']]) 269 | ans_candidates = ans_candidates.transpose() 270 | print(ans_candidates.shape) 271 | # ans_candidates: (num_ques, 5) 272 | print('number of questions: %s' % len(questions)) 273 | # Either create the vocab or load it from disk 274 | #if args.mode in ['train']: 275 | if not osp.exists(args.vocab_json.format('train')): 276 | print('Building vocab') 277 | answer_token_to_idx = {'': 0, '': 1} 278 | question_answer_token_to_idx = {'': 0, '': 1} 279 | for candidates in ans_candidates: 280 | #print(candidates) 281 | for ans in candidates: 282 | if type(ans) != 'str': continue 283 | ans = ans.lower() 284 | for token in nltk.word_tokenize(ans): 285 | if token not in answer_token_to_idx: 286 | answer_token_to_idx[token] = len(answer_token_to_idx) 287 | if token not in question_answer_token_to_idx: 288 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx) 289 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 290 | 291 | question_token_to_idx = {'': 0, '': 1} 292 | for i, q in enumerate(questions): 293 | question = str(q).lower()[:-1] 294 | for token in nltk.word_tokenize(question): 295 | if token not in question_token_to_idx: 296 | question_token_to_idx[token] = len(question_token_to_idx) 297 | if token not in question_answer_token_to_idx: 298 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx) 299 | 300 | print('Get question_token_to_idx') 301 | print(len(question_token_to_idx)) 302 | print('Get question_answer_token_to_idx') 303 | print(len(question_answer_token_to_idx)) 304 | 305 | vocab = { 306 | 'question_token_to_idx': question_token_to_idx, 307 | 'answer_token_to_idx': answer_token_to_idx, 308 | 'question_answer_token_to_idx': question_answer_token_to_idx, 309 | } 310 | 311 | print('Write into %s' % args.vocab_json.format(args.mode)) 312 | with open(args.vocab_json.format(args.mode), 'w') as f: 313 | json.dump(vocab, f, indent=4) 314 | 315 | # split 10% of questions for evaluation 316 | # split = int(0.9 * len(questions)) 317 | # train_questions = questions[:split] 318 | # train_answers = answers[:split] 319 | # train_video_names = video_names[:split] 320 | # train_video_ids = video_ids[:split] 321 | # train_ans_candidates = ans_candidates[:split, :] 322 | # 323 | # val_questions = questions[split:] 324 | # val_answers = answers[split:] 325 | # val_video_names = video_names[split:] 326 | # val_video_ids = video_ids[split:] 327 | # val_ans_candidates = ans_candidates[split:, :] 328 | 329 | multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers, 330 | ans_candidates, 
mode='train') 331 | # multichoice_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, 332 | # val_ans_candidates, mode='val') 333 | else: 334 | print('Loading vocab') 335 | with open(args.vocab_json.format('train'), 'r') as f: 336 | vocab = json.load(f) 337 | multichoice_encoding_data(args, vocab, questions, qns_ids, video_names, video_ids, answers, 338 | ans_candidates, mode=args.mode) 339 | -------------------------------------------------------------------------------- /tools/datautils/tgif_qa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import json 4 | from datautils import utils 5 | import nltk 6 | 7 | import pickle 8 | import numpy as np 9 | 10 | 11 | def load_video_paths(args): 12 | ''' Load a list of (path,image_id tuples).''' 13 | input_paths = [] 14 | annotation = pd.read_csv(args.annotation_file.format(args.question_type), delimiter='\t') 15 | gif_names = list(annotation['gif_name']) 16 | keys = list(annotation['key']) 17 | print("Number of questions: {}".format(len(gif_names))) 18 | for idx, gif in enumerate(gif_names): 19 | gif_abs_path = os.path.join(args.video_dir, ''.join([gif, '.gif'])) 20 | input_paths.append((gif_abs_path, keys[idx])) 21 | input_paths = list(set(input_paths)) 22 | print("Number of unique videos: {}".format(len(input_paths))) 23 | 24 | return input_paths 25 | 26 | 27 | def openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='train'): 28 | ''' Encode question tokens''' 29 | print('Encoding dataset') 30 | questions_encoded = [] 31 | questions_len = [] 32 | video_ids_tbw = [] 33 | video_names_tbw = [] 34 | all_answers = [] 35 | question_ids = [] 36 | for idx, question in enumerate(questions): 37 | question = question.lower()[:-1] 38 | question_tokens = nltk.word_tokenize(question) 39 | question_encoded = utils.encode(question_tokens, vocab['question_token_to_idx'], allow_unk=True) 40 | questions_encoded.append(question_encoded) 41 | questions_len.append(len(question_encoded)) 42 | question_ids.append(idx) 43 | video_names_tbw.append(video_names[idx]) 44 | video_ids_tbw.append(video_ids[idx]) 45 | 46 | if args.question_type == "frameqa": 47 | answer = answers[idx] 48 | if answer in vocab['answer_token_to_idx']: 49 | answer = vocab['answer_token_to_idx'][answer] 50 | elif mode in ['train']: 51 | answer = 0 52 | elif mode in ['val', 'test']: 53 | answer = 1 54 | else: 55 | answer = max(int(answers[idx]), 1) 56 | all_answers.append(answer) 57 | 58 | # Pad encoded questions 59 | max_question_length = max(len(x) for x in questions_encoded) 60 | for qe in questions_encoded: 61 | while len(qe) < max_question_length: 62 | qe.append(vocab['question_token_to_idx']['']) 63 | 64 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 65 | questions_len = np.asarray(questions_len, dtype=np.int32) 66 | print(questions_encoded.shape) 67 | 68 | glove_matrix = None 69 | if mode == 'train': 70 | token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()} 71 | print("Load glove from %s" % args.glove_pt) 72 | glove = pickle.load(open(args.glove_pt, 'rb')) 73 | dim_word = glove['the'].shape[0] 74 | glove_matrix = [] 75 | for i in range(len(token_itow)): 76 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 77 | glove_matrix.append(vector) 78 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 79 | print(glove_matrix.shape) 80 | 81 | print('Writing ', args.output_pt.format(args.question_type, 
args.question_type, mode)) 82 | obj = { 83 | 'questions': questions_encoded, 84 | 'questions_len': questions_len, 85 | 'question_id': question_ids, 86 | 'video_ids': np.asarray(video_ids_tbw), 87 | 'video_names': np.array(video_names_tbw), 88 | 'answers': all_answers, 89 | 'glove': glove_matrix, 90 | } 91 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f: 92 | pickle.dump(obj, f) 93 | 94 | def multichoice_encoding_data(args, vocab, questions, video_names, video_ids, answers, ans_candidates, mode='train'): 95 | # Encode all questions 96 | print('Encoding dataset') 97 | questions_encoded = [] 98 | questions_len = [] 99 | question_ids = [] 100 | all_answer_cands_encoded = [] 101 | all_answer_cands_len = [] 102 | video_ids_tbw = [] 103 | video_names_tbw = [] 104 | correct_answers = [] 105 | for idx, question in enumerate(questions): 106 | question = question.lower()[:-1] 107 | question_tokens = nltk.word_tokenize(question) 108 | question_encoded = utils.encode(question_tokens, vocab['question_answer_token_to_idx'], allow_unk=True) 109 | questions_encoded.append(question_encoded) 110 | questions_len.append(len(question_encoded)) 111 | question_ids.append(idx) 112 | video_names_tbw.append(video_names[idx]) 113 | video_ids_tbw.append(video_ids[idx]) 114 | # grounthtruth 115 | answer = int(answers[idx]) 116 | correct_answers.append(answer) 117 | # answer candidates 118 | candidates = ans_candidates[idx] 119 | candidates_encoded = [] 120 | candidates_len = [] 121 | for ans in candidates: 122 | ans = ans.lower() 123 | ans_tokens = nltk.word_tokenize(ans) 124 | cand_encoded = utils.encode(ans_tokens, vocab['question_answer_token_to_idx'], allow_unk=True) 125 | candidates_encoded.append(cand_encoded) 126 | candidates_len.append(len(cand_encoded)) 127 | all_answer_cands_encoded.append(candidates_encoded) 128 | all_answer_cands_len.append(candidates_len) 129 | 130 | # Pad encoded questions 131 | max_question_length = max(len(x) for x in questions_encoded) 132 | for qe in questions_encoded: 133 | while len(qe) < max_question_length: 134 | qe.append(vocab['question_answer_token_to_idx']['']) 135 | 136 | questions_encoded = np.asarray(questions_encoded, dtype=np.int32) 137 | questions_len = np.asarray(questions_len, dtype=np.int32) 138 | print(questions_encoded.shape) 139 | 140 | # Pad encoded answer candidates 141 | max_answer_cand_length = max(max(len(x) for x in candidate) for candidate in all_answer_cands_encoded) 142 | for ans_cands in all_answer_cands_encoded: 143 | for ans in ans_cands: 144 | while len(ans) < max_answer_cand_length: 145 | ans.append(vocab['question_answer_token_to_idx']['']) 146 | all_answer_cands_encoded = np.asarray(all_answer_cands_encoded, dtype=np.int32) 147 | all_answer_cands_len = np.asarray(all_answer_cands_len, dtype=np.int32) 148 | print(all_answer_cands_encoded.shape) 149 | 150 | glove_matrix = None 151 | if mode in ['train']: 152 | token_itow = {i: w for w, i in vocab['question_answer_token_to_idx'].items()} 153 | print("Load glove from %s" % args.glove_pt) 154 | glove = pickle.load(open(args.glove_pt, 'rb')) 155 | dim_word = glove['the'].shape[0] 156 | glove_matrix = [] 157 | for i in range(len(token_itow)): 158 | vector = glove.get(token_itow[i], np.zeros((dim_word,))) 159 | glove_matrix.append(vector) 160 | glove_matrix = np.asarray(glove_matrix, dtype=np.float32) 161 | print(glove_matrix.shape) 162 | 163 | print('Writing ', args.output_pt.format(args.question_type, args.question_type, mode)) 164 | obj = { 165 | 'questions': 
questions_encoded, 166 | 'questions_len': questions_len, 167 | 'question_id': question_ids, 168 | 'video_ids': np.asarray(video_ids_tbw), 169 | 'video_names': np.array(video_names_tbw), 170 | 'ans_candidates': all_answer_cands_encoded, 171 | 'ans_candidates_len': all_answer_cands_len, 172 | 'answers': correct_answers, 173 | 'glove': glove_matrix, 174 | } 175 | with open(args.output_pt.format(args.question_type, args.question_type, mode), 'wb') as f: 176 | pickle.dump(obj, f) 177 | 178 | def process_questions_openended(args): 179 | print('Loading dataset') 180 | if args.mode in ["train"]: 181 | csv_data = pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t') 182 | else: 183 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t') 184 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))] 185 | questions = list(csv_data['question']) 186 | answers = list(csv_data['answer']) 187 | video_names = list(csv_data['gif_name']) 188 | video_ids = list(csv_data['key']) 189 | 190 | print('number of questions: %s' % len(questions)) 191 | # Either create the vocab or load it from disk 192 | if args.mode in ['train']: 193 | print('Building vocab') 194 | answer_cnt = {} 195 | 196 | if args.question_type == "frameqa": 197 | for i, answer in enumerate(answers): 198 | answer_cnt[answer] = answer_cnt.get(answer, 0) + 1 199 | 200 | answer_token_to_idx = {'': 0} 201 | for token in answer_cnt: 202 | answer_token_to_idx[token] = len(answer_token_to_idx) 203 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 204 | elif args.question_type == 'count': 205 | answer_token_to_idx = {'': 0} 206 | 207 | question_token_to_idx = {'': 0, '': 1} 208 | for i, q in enumerate(questions): 209 | question = q.lower()[:-1] 210 | for token in nltk.word_tokenize(question): 211 | if token not in question_token_to_idx: 212 | question_token_to_idx[token] = len(question_token_to_idx) 213 | print('Get question_token_to_idx') 214 | print(len(question_token_to_idx)) 215 | 216 | vocab = { 217 | 'question_token_to_idx': question_token_to_idx, 218 | 'answer_token_to_idx': answer_token_to_idx, 219 | 'question_answer_token_to_idx': {'': 0, '': 1} 220 | } 221 | 222 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type)) 223 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f: 224 | json.dump(vocab, f, indent=4) 225 | 226 | # split 10% of questions for evaluation 227 | split = int(0.9 * len(questions)) 228 | train_questions = questions[:split] 229 | train_answers = answers[:split] 230 | train_video_names = video_names[:split] 231 | train_video_ids = video_ids[:split] 232 | 233 | val_questions = questions[split:] 234 | val_answers = answers[split:] 235 | val_video_names = video_names[split:] 236 | val_video_ids = video_ids[split:] 237 | 238 | openeded_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, mode='train') 239 | openeded_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, mode='val') 240 | else: 241 | print('Loading vocab') 242 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f: 243 | vocab = json.load(f) 244 | openeded_encoding_data(args, vocab, questions, video_names, video_ids, answers, mode='test') 245 | 246 | 247 | 248 | 249 | def process_questions_mulchoices(args): 250 | print('Loading dataset') 251 | if args.mode in ["train", "val"]: 252 | csv_data = 
pd.read_csv(args.annotation_file.format("Train", args.question_type), delimiter='\t') 253 | else: 254 | csv_data = pd.read_csv(args.annotation_file.format("Test", args.question_type), delimiter='\t') 255 | csv_data = csv_data.iloc[np.random.permutation(len(csv_data))] 256 | questions = list(csv_data['question']) 257 | answers = list(csv_data['answer']) 258 | video_names = list(csv_data['gif_name']) 259 | video_ids = list(csv_data['key']) 260 | ans_candidates = np.asarray( 261 | [csv_data['a1'], csv_data['a2'], csv_data['a3'], csv_data['a4'], csv_data['a5']]) 262 | ans_candidates = ans_candidates.transpose() 263 | print(ans_candidates.shape) 264 | # ans_candidates: (num_ques, 5) 265 | print('number of questions: %s' % len(questions)) 266 | # Either create the vocab or load it from disk 267 | if args.mode in ['train']: 268 | print('Building vocab') 269 | 270 | answer_token_to_idx = {'': 0, '': 1} 271 | question_answer_token_to_idx = {'': 0, '': 1} 272 | for candidates in ans_candidates: 273 | for ans in candidates: 274 | ans = ans.lower() 275 | for token in nltk.word_tokenize(ans): 276 | if token not in answer_token_to_idx: 277 | answer_token_to_idx[token] = len(answer_token_to_idx) 278 | if token not in question_answer_token_to_idx: 279 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx) 280 | print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx)) 281 | 282 | question_token_to_idx = {'': 0, '': 1} 283 | for i, q in enumerate(questions): 284 | question = q.lower()[:-1] 285 | for token in nltk.word_tokenize(question): 286 | if token not in question_token_to_idx: 287 | question_token_to_idx[token] = len(question_token_to_idx) 288 | if token not in question_answer_token_to_idx: 289 | question_answer_token_to_idx[token] = len(question_answer_token_to_idx) 290 | 291 | print('Get question_token_to_idx') 292 | print(len(question_token_to_idx)) 293 | print('Get question_answer_token_to_idx') 294 | print(len(question_answer_token_to_idx)) 295 | 296 | vocab = { 297 | 'question_token_to_idx': question_token_to_idx, 298 | 'answer_token_to_idx': answer_token_to_idx, 299 | 'question_answer_token_to_idx': question_answer_token_to_idx, 300 | } 301 | 302 | print('Write into %s' % args.vocab_json.format(args.question_type, args.question_type)) 303 | with open(args.vocab_json.format(args.question_type, args.question_type), 'w') as f: 304 | json.dump(vocab, f, indent=4) 305 | 306 | # split 10% of questions for evaluation 307 | split = int(0.9 * len(questions)) 308 | train_questions = questions[:split] 309 | train_answers = answers[:split] 310 | train_video_names = video_names[:split] 311 | train_video_ids = video_ids[:split] 312 | train_ans_candidates = ans_candidates[:split, :] 313 | 314 | val_questions = questions[split:] 315 | val_answers = answers[split:] 316 | val_video_names = video_names[split:] 317 | val_video_ids = video_ids[split:] 318 | val_ans_candidates = ans_candidates[split:, :] 319 | 320 | multichoice_encoding_data(args, vocab, train_questions, train_video_names, train_video_ids, train_answers, train_ans_candidates, mode='train') 321 | multichoice_encoding_data(args, vocab, val_questions, val_video_names, val_video_ids, val_answers, 322 | val_ans_candidates, mode='val') 323 | else: 324 | print('Loading vocab') 325 | with open(args.vocab_json.format(args.question_type, args.question_type), 'r') as f: 326 | vocab = json.load(f) 327 | multichoice_encoding_data(args, vocab, questions, video_names, video_ids, answers, 328 | ans_candidates, mode='test') 329 | 
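A minimal sketch of how the pickle written by multichoice_encoding_data above might be loaded downstream; the path below is only a placeholder, since the real location is built from args.output_pt, and the keys simply mirror the dict assembled in that function:

import pickle

# placeholder path -- the actual file comes from args.output_pt.format(question_type, question_type, mode)
with open('../data/tgifqa/action/action_test_questions.pt', 'rb') as f:
    obj = pickle.load(f)

print(obj['questions'].shape)        # (num_questions, max_question_length), int32
print(obj['ans_candidates'].shape)   # (num_questions, 5, max_answer_cand_length), int32
print(len(obj['answers']))           # one correct-answer index per question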
--------------------------------------------------------------------------------
/tools/datautils/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import os
4 | import os.path as osp
5 | 
6 | def load_file(file_name):
7 |     annos = None
8 |     with open(file_name, 'r') as fp:
9 |         if osp.splitext(file_name)[1] == '.txt':
10 |             annos = fp.readlines()
11 |             annos = [line.rstrip() for line in annos]
12 |         if osp.splitext(file_name)[1] == '.json':
13 |             annos = json.load(fp)
14 | 
15 |     return annos
16 | 
17 | def save_file(obj, filename):
18 |     """
19 |     save obj to filename
20 |     :param obj:
21 |     :param filename:
22 |     :return:
23 |     """
24 |     filepath = osp.dirname(filename)
25 |     if filepath != '' and not osp.exists(filepath):
26 |         os.makedirs(filepath)
27 |     # write in every case, not only when the directory already exists
28 |     with open(filename, 'w') as fp:
29 |         json.dump(obj, fp)
30 | 
31 | def encode(seq_tokens, token_to_idx, allow_unk=False):
32 |     seq_idx = []
33 |     for token in seq_tokens:
34 |         if token not in token_to_idx:
35 |             if allow_unk:
36 |                 token = '<UNK>'
37 |             else:
38 |                 raise KeyError('Token "%s" not in vocab' % token)
39 |         seq_idx.append(token_to_idx[token])
40 |     return seq_idx
41 | 
42 | 
43 | def decode(seq_idx, idx_to_token, delim=None, stop_at_end=True):
44 |     tokens = []
45 |     for idx in seq_idx:
46 |         tokens.append(idx_to_token[idx])
47 |         if stop_at_end and tokens[-1] == '<END>':
48 |             break
49 |     if delim is None:
50 |         return tokens
51 |     else:
52 |         return delim.join(tokens)
53 | 
54 | # --------------------------------------------------------
55 | # Fast R-CNN
56 | # Copyright (c) 2015 Microsoft
57 | # Licensed under The MIT License [see LICENSE for details]
58 | # Written by Ross Girshick
59 | # --------------------------------------------------------
60 | 
61 | class Timer(object):
62 |     """A simple timer."""
63 |     def __init__(self):
64 |         self.total_time = 0.
65 |         self.calls = 0
66 |         self.start_time = 0.
67 |         self.diff = 0.
68 |         self.average_time = 0.
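        # tic()/toc() accumulate total_time across calls; toc(average=True)
        # returns the running mean (average_time), toc(False) the last interval.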
69 | 70 | def tic(self): 71 | # using time.time instead of time.clock because time time.clock 72 | # does not normalize for multithreading 73 | self.start_time = time.time() 74 | 75 | def toc(self, average=True): 76 | self.diff = time.time() - self.start_time 77 | self.total_time += self.diff 78 | self.calls += 1 79 | self.average_time = self.total_time / self.calls 80 | if average: 81 | return self.average_time 82 | else: 83 | return self.diff -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import os 3 | import os.path as osp 4 | import numpy as np 5 | from bbox_visualizer import * 6 | import sys 7 | sys.path.insert(0, '../') 8 | from util import load_file, save_to 9 | 10 | bbox_colors = np.loadtxt('colors.txt') 11 | 12 | 13 | def sample_clips(total_frames, num_clips, num_frames_per_clip): 14 | clips = [] 15 | frames = [str(f+1).zfill(6) for f in range(total_frames)] 16 | for i in np.linspace(0, total_frames, num_clips + 2, dtype=np.int32)[1: num_clips + 1]: 17 | clip_start = int(i) - int(num_frames_per_clip / 2) 18 | clip_end = int(i) + int(num_frames_per_clip / 2) 19 | clip_start = 0 if clip_start < 0 else clip_start 20 | clip_end = total_frames if clip_end > total_frames else clip_end 21 | clip = frames[clip_start:clip_end] 22 | if clip_start == 0 and len(clip) < num_frames_per_clip: 23 | shortage = num_frames_per_clip - (clip_end - clip_start) 24 | added_fids = [] 25 | for _ in range(shortage): 26 | added_fids.append(frames[clip_start]) 27 | if len(added_fids) > 0: 28 | clip = added_fids + clip 29 | if clip_end == total_frames and len(clip) < num_frames_per_clip: 30 | shortage = num_frames_per_clip - (clip_end - clip_start) 31 | added_fids = [] 32 | for _ in range(shortage): 33 | added_fids.append(frames[clip_end-1]) 34 | if len(added_fids) > 0: 35 | clip += added_fids 36 | clip = clip[::4] 37 | clips.append(clip) 38 | clips = clips[::2] 39 | return clips 40 | 41 | 42 | def get_vbbox(feat_file, qvid, bbox_num): 43 | with h5py.File(feat_file, 'r') as fp: 44 | vids = fp['ids'] 45 | bboxes = fp['bbox'] 46 | for id, (vid, bbox) in enumerate(zip(vids, bboxes)): 47 | if str(vid) != qvid: continue 48 | vbbox = bbox[:,:,:bbox_num, :] 49 | 50 | return vbbox 51 | 52 | 53 | def vis_det(feat_file, vname): 54 | bbox_num = 5 55 | vid = vname.split('/')[-1] 56 | vbbox = get_vbbox(feat_file, vid, bbox_num) 57 | fids = os.listdir(vname) 58 | total_frames = len(fids) 59 | clips = sample_clips(total_frames, 16, 16) 60 | # clips = np.asarray(clips).reshape(-1) 61 | out_dir = '../demo/' 62 | 63 | for i, cids in enumerate(clips): 64 | for f, fid in enumerate(cids): 65 | img_path = osp.join(vname, fid+'.jpg') 66 | bboxes = vbbox[i][f] 67 | 68 | bboxes = [[int(np.round(b)) for b in bbox] for bbox in bboxes] 69 | # bbox = [int(np.round(b)) for b in bbox] 70 | img = cv2.imread(img_path) 71 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 72 | output = draw_multiple_rectangles(img, bboxes, bbox_colors) 73 | # output = draw_rectangle(img, bbox) 74 | 75 | out_file = osp.join(out_dir, str(vid)) 76 | if not osp.exists(out_file): 77 | os.makedirs(out_file) 78 | img = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) 79 | cv2.imwrite(osp.join(out_file, fid+'.jpg'), img) 80 | # cv2.imshow('image', output) 81 | # cv2.waitKey(0) 82 | 83 | 84 | 85 | def main(): 86 | dataset = 'nextqa' 87 | feat_file = f'../../data/{dataset}/region_feat_n/acregion_8c20b_val.h5' 88 | #the videos are 
decoded by 6 pfs 89 | frame_dir = '/home/jbxiao/workspace/data/nextqa/frames/' 90 | vname = f'{frame_dir}/3376544720' 91 | vis_det(feat_file, vname) 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /tools/extract_video.py: -------------------------------------------------------------------------------- 1 | # ==================================================== 2 | # @Time : 15/4/21 12:38 PM 3 | # @Author : Xiao Junbin 4 | # @Email : junbin@comp.nus.edu.sg 5 | # @File : extract_video.py.py 6 | # ==================================================== 7 | import os 8 | import os.path as osp 9 | import shutil 10 | import subprocess 11 | import pandas as pd 12 | import json 13 | import sys 14 | sys.path.insert(0, '../') 15 | from util import load_file 16 | 17 | # def load_file(filename): 18 | # with open(filename, 'r') as fp: 19 | # data = json.load(fp) 20 | # return data 21 | 22 | def get_video_list(filename, out_file): 23 | data = load_file(filename) 24 | video_ids = list(data['video_id']) 25 | video_ids = list(set(video_ids)) 26 | # video_ids = os.listdir(filename) 27 | # video_ids = sorted(video_ids) 28 | print(len(video_ids)) 29 | with open(out_file, 'w') as fp: 30 | json.dump(video_ids, fp, indent=4) 31 | return video_ids 32 | 33 | 34 | def extract_frame(video, dst): 35 | 36 | with open(os.devnull, 'w') as ffmpeg_log: 37 | if os.path.exists(dst): 38 | # print(" cleanup: "+dst+"/") 39 | shutil.rmtree(dst) 40 | os.makedirs(dst) 41 | video2frame_cmd = [ 42 | "ffmpeg", 43 | '-y', 44 | '-i', video, 45 | '-r', "6", # 6 frames per second 46 | # '-vf', "scale=400:300", 47 | '-qscale:v', "2", 48 | '{0}/%06d.jpg'.format(dst) 49 | ] 50 | subprocess.call(video2frame_cmd, stdout = ffmpeg_log, stderr=ffmpeg_log) 51 | 52 | 53 | def extract_videos(raw_dir, vlist, frame_dir, map_vid=None): 54 | 55 | vnum = len(vlist) 56 | for id, vid in enumerate(vlist): 57 | # if id <= 400: continue 58 | # if id > 400: break 59 | vid = str(vid) 60 | if map_vid != None: 61 | video = osp.join(raw_dir, f'{map_vid[vid]}.mp4') 62 | else: 63 | video = osp.join(raw_dir, f'{vid}.mp4') 64 | dst = osp.join(frame_dir, vid) 65 | if not osp.exists(video): 66 | print(video) 67 | extract_frame(video, dst) 68 | if id % 20 == 0: 69 | print('{}/{}'.format(id, vnum)) 70 | 71 | 72 | def main(): 73 | video_dir = '/storage/jbxiao/workspace/data/nextqa/' 74 | raw_dir = osp.join(video_dir, 'videos/') 75 | frame_dir = osp.join(video_dir, 'frames_val/') 76 | anno_dir = '../datasets/nextqa/' 77 | vlist_file = osp.join(anno_dir, 'vlist.json') 78 | map_file = osp.join(anno_dir, 'map_vid_vidorID.json') 79 | if not osp.exists(vlist_file): 80 | dset = 'val' #train/test 81 | qa_file = osp.join(anno_dir, f'{dset}.csv') 82 | vlist_file = osp.join(anno_dir, f'vlist_{dset}.json') 83 | vlist = get_video_list(qa_file, vlist_file) 84 | else: 85 | vlist = load_file(vlist_file) 86 | map_vid = load_file(map_file) 87 | extract_videos(raw_dir, vlist, frame_dir, map_vid=map_vid) 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /tools/feat_app.sh: -------------------------------------------------------------------------------- 1 | ######################################################################### 2 | # File Name: feat_app.sh 3 | # Author: Xiao Junbin 4 | # mail: xiaojunbin@u.nus.edu 5 | # Created Time: Sat 19 Sep 2020 09:22:26 PM 6 | 
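# Usage (illustrative sketch inferred from the arguments below): pass the GPU id
# as the first argument, e.g. `bash feat_app.sh 0`; the script then invokes
# preprocess_features.py to extract ResNet-101 appearance features for NExT-QA
# with 224x224 inputs on the selected GPU.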
######################################################################### 7 | #!/bin/bash 8 | GPUID=$1 9 | CUDA_VISIBLE_DEVICES=$GPUID python preprocess_features.py \ 10 | --dataset 'nextqa' \ 11 | --model 'resnet101' \ 12 | --image_width 224 \ 13 | --image_height 224 14 | -------------------------------------------------------------------------------- /tools/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/tools/models/__init__.py -------------------------------------------------------------------------------- /tools/models/densenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | import math 6 | 7 | __all__ = ['DenseNet', 'densenet121', 'densenet169', 'densenet201', 'densenet264'] 8 | 9 | 10 | def densenet121(**kwargs): 11 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 24, 16), 12 | **kwargs) 13 | return model 14 | 15 | 16 | def densenet169(**kwargs): 17 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 32, 32), 18 | **kwargs) 19 | return model 20 | 21 | 22 | def densenet201(**kwargs): 23 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 48, 32), 24 | **kwargs) 25 | return model 26 | 27 | 28 | def densenet264(**kwargs): 29 | model = DenseNet(num_init_features=64, growth_rate=32, block_config=(6, 12, 64, 48), 30 | **kwargs) 31 | return model 32 | 33 | 34 | def get_fine_tuning_parameters(model, ft_begin_index): 35 | if ft_begin_index == 0: 36 | return model.parameters() 37 | 38 | ft_module_names = [] 39 | for i in range(ft_begin_index, 5): 40 | ft_module_names.append('denseblock{}'.format(ft_begin_index)) 41 | ft_module_names.append('transition{}'.format(ft_begin_index)) 42 | ft_module_names.append('norm5') 43 | ft_module_names.append('classifier') 44 | 45 | parameters = [] 46 | for k, v in model.named_parameters(): 47 | for ft_module in ft_module_names: 48 | if ft_module in k: 49 | parameters.append({'params': v}) 50 | break 51 | else: 52 | parameters.append({'params': v, 'lr': 0.0}) 53 | 54 | return parameters 55 | 56 | 57 | class _DenseLayer(nn.Sequential): 58 | def __init__(self, num_input_features, growth_rate, bn_size, drop_rate): 59 | super(_DenseLayer, self).__init__() 60 | self.add_module('norm.1', nn.BatchNorm3d(num_input_features)) 61 | self.add_module('relu.1', nn.ReLU(inplace=True)) 62 | self.add_module('conv.1', nn.Conv3d(num_input_features, bn_size * growth_rate, 63 | kernel_size=1, stride=1, bias=False)) 64 | self.add_module('norm.2', nn.BatchNorm3d(bn_size * growth_rate)) 65 | self.add_module('relu.2', nn.ReLU(inplace=True)) 66 | self.add_module('conv.2', nn.Conv3d(bn_size * growth_rate, growth_rate, 67 | kernel_size=3, stride=1, padding=1, bias=False)) 68 | self.drop_rate = drop_rate 69 | 70 | def forward(self, x): 71 | new_features = super(_DenseLayer, self).forward(x) 72 | if self.drop_rate > 0: 73 | new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) 74 | return torch.cat([x, new_features], 1) 75 | 76 | 77 | class _DenseBlock(nn.Sequential): 78 | def __init__(self, num_layers, num_input_features, bn_size, growth_rate, drop_rate): 79 | super(_DenseBlock, self).__init__() 80 | for i in range(num_layers): 81 | layer = _DenseLayer(num_input_features + i * 
growth_rate, growth_rate, bn_size, drop_rate) 82 | self.add_module('denselayer%d' % (i + 1), layer) 83 | 84 | 85 | class _Transition(nn.Sequential): 86 | def __init__(self, num_input_features, num_output_features): 87 | super(_Transition, self).__init__() 88 | self.add_module('norm', nn.BatchNorm3d(num_input_features)) 89 | self.add_module('relu', nn.ReLU(inplace=True)) 90 | self.add_module('conv', nn.Conv3d(num_input_features, num_output_features, 91 | kernel_size=1, stride=1, bias=False)) 92 | self.add_module('pool', nn.AvgPool3d(kernel_size=2, stride=2)) 93 | 94 | 95 | class DenseNet(nn.Module): 96 | """Densenet-BC model class 97 | Args: 98 | growth_rate (int) - how many filters to add each layer (k in paper) 99 | block_config (list of 4 ints) - how many layers in each pooling block 100 | num_init_features (int) - the number of filters to learn in the first convolution layer 101 | bn_size (int) - multiplicative factor for number of bottle neck layers 102 | (i.e. bn_size * k features in the bottleneck layer) 103 | drop_rate (float) - dropout rate after each dense layer 104 | num_classes (int) - number of classification classes 105 | """ 106 | def __init__(self, sample_size, sample_duration, growth_rate=32, block_config=(6, 12, 24, 16), 107 | num_init_features=64, bn_size=4, drop_rate=0, num_classes=1000, last_fc=True): 108 | 109 | super(DenseNet, self).__init__() 110 | 111 | self.last_fc = last_fc 112 | 113 | self.sample_size = sample_size 114 | self.sample_duration = sample_duration 115 | 116 | # First convolution 117 | self.features = nn.Sequential(OrderedDict([ 118 | ('conv0', nn.Conv3d(3, num_init_features, kernel_size=7, 119 | stride=(1, 2, 2), padding=(3, 3, 3), bias=False)), 120 | ('norm0', nn.BatchNorm3d(num_init_features)), 121 | ('relu0', nn.ReLU(inplace=True)), 122 | ('pool0', nn.MaxPool3d(kernel_size=3, stride=2, padding=1)), 123 | ])) 124 | 125 | # Each denseblock 126 | num_features = num_init_features 127 | for i, num_layers in enumerate(block_config): 128 | block = _DenseBlock(num_layers=num_layers, num_input_features=num_features, 129 | bn_size=bn_size, growth_rate=growth_rate, drop_rate=drop_rate) 130 | self.features.add_module('denseblock%d' % (i + 1), block) 131 | num_features = num_features + num_layers * growth_rate 132 | if i != len(block_config) - 1: 133 | trans = _Transition(num_input_features=num_features, num_output_features=num_features // 2) 134 | self.features.add_module('transition%d' % (i + 1), trans) 135 | num_features = num_features // 2 136 | 137 | # Final batch norm 138 | self.features.add_module('norm5', nn.BatchNorm2d(num_features)) 139 | 140 | # Linear layer 141 | self.classifier = nn.Linear(num_features, num_classes) 142 | 143 | def forward(self, x): 144 | features = self.features(x) 145 | out = F.relu(features, inplace=True) 146 | last_duration = math.ceil(self.sample_duration / 16) 147 | last_size = math.floor(self.sample_size / 32) 148 | out = F.avg_pool3d(out, kernel_size=(last_duration, last_size, last_size)).view(features.size(0), -1) 149 | if self.last_fc: 150 | out = self.classifier(out) 151 | return out 152 | -------------------------------------------------------------------------------- /tools/models/pre_act_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['PreActivationResNet', 'resnet18', 'resnet34', 
'resnet50', 'resnet101', 'resnet152', 'resnet200'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class PreActivationBasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(PreActivationBasicBlock, self).__init__() 35 | self.bn1 = nn.BatchNorm3d(inplanes) 36 | self.conv1 = conv3x3x3(inplanes, planes, stride) 37 | self.bn2 = nn.BatchNorm3d(planes) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.relu = nn.ReLU(inplace=True) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.bn1(x) 47 | out = self.relu(out) 48 | out = self.conv1(out) 49 | 50 | out = self.bn2(out) 51 | out = self.relu(out) 52 | out = self.conv2(out) 53 | 54 | if self.downsample is not None: 55 | residual = self.downsample(x) 56 | 57 | out += residual 58 | 59 | return out 60 | 61 | 62 | class PreActivationBottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, inplanes, planes, stride=1, downsample=None): 66 | super(PreActivationBottleneck, self).__init__() 67 | self.bn1 = nn.BatchNorm3d(inplanes) 68 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 69 | self.bn2 = nn.BatchNorm3d(planes) 70 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 71 | padding=1, bias=False) 72 | self.bn3 = nn.BatchNorm3d(planes) 73 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.bn1(x) 82 | out = self.relu(out) 83 | out = self.conv1(out) 84 | 85 | out = self.bn2(out) 86 | out = self.relu(out) 87 | out = self.conv2(out) 88 | 89 | out = self.bn3(out) 90 | out = self.relu(out) 91 | out = self.conv3(out) 92 | 93 | if self.downsample is not None: 94 | residual = self.downsample(x) 95 | 96 | out += residual 97 | 98 | return out 99 | 100 | 101 | class PreActivationResNet(nn.Module): 102 | 103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True): 104 | self.last_fc = last_fc 105 | 106 | self.inplanes = 64 107 | super(PreActivationResNet, self).__init__() 108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 109 | padding=(3, 3, 3), bias=False) 110 | self.bn1 = nn.BatchNorm3d(64) 111 | self.relu = nn.ReLU(inplace=True) 112 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2) 115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2) 116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2) 117 | last_duration = math.ceil(sample_duration / 16) 118 | last_size = 
math.ceil(sample_size / 32) 119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 120 | self.fc = nn.Linear(512 * block.expansion, num_classes) 121 | 122 | for m in self.modules(): 123 | if isinstance(m, nn.Conv3d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, math.sqrt(2. / n)) 126 | elif isinstance(m, nn.BatchNorm3d): 127 | m.weight.data.fill_(1) 128 | m.bias.data.zero_() 129 | 130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 131 | downsample = None 132 | if stride != 1 or self.inplanes != planes * block.expansion: 133 | if shortcut_type == 'A': 134 | downsample = partial(downsample_basic_block, 135 | planes=planes * block.expansion, 136 | stride=stride) 137 | else: 138 | downsample = nn.Sequential( 139 | nn.Conv3d(self.inplanes, planes * block.expansion, 140 | kernel_size=1, stride=stride, bias=False), 141 | nn.BatchNorm3d(planes * block.expansion) 142 | ) 143 | 144 | layers = [] 145 | layers.append(block(self.inplanes, planes, stride, downsample)) 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = self.conv1(x) 154 | x = self.bn1(x) 155 | x = self.relu(x) 156 | x = self.maxpool(x) 157 | 158 | x = self.layer1(x) 159 | x = self.layer2(x) 160 | x = self.layer3(x) 161 | x = self.layer4(x) 162 | 163 | x = self.avgpool(x) 164 | 165 | x = x.view(x.size(0), -1) 166 | if self.last_fc: 167 | x = self.fc(x) 168 | 169 | return x 170 | 171 | def get_fine_tuning_parameters(model, ft_begin_index): 172 | if ft_begin_index == 0: 173 | return model.parameters() 174 | 175 | ft_module_names = [] 176 | for i in range(ft_begin_index, 5): 177 | ft_module_names.append('layer{}'.format(ft_begin_index)) 178 | ft_module_names.append('fc') 179 | 180 | parameters = [] 181 | for k, v in model.named_parameters(): 182 | for ft_module in ft_module_names: 183 | if ft_module in k: 184 | parameters.append({'params': v}) 185 | break 186 | else: 187 | parameters.append({'params': v, 'lr': 0.0}) 188 | 189 | return parameters 190 | 191 | def resnet18(**kwargs): 192 | """Constructs a ResNet-18 model. 193 | """ 194 | model = PreActivationResNet(PreActivationBasicBlock, [2, 2, 2, 2], **kwargs) 195 | return model 196 | 197 | def resnet34(**kwargs): 198 | """Constructs a ResNet-34 model. 199 | """ 200 | model = PreActivationResNet(PreActivationBasicBlock, [3, 4, 6, 3], **kwargs) 201 | return model 202 | 203 | 204 | def resnet50(**kwargs): 205 | """Constructs a ResNet-50 model. 206 | """ 207 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 6, 3], **kwargs) 208 | return model 209 | 210 | def resnet101(**kwargs): 211 | """Constructs a ResNet-101 model. 212 | """ 213 | model = PreActivationResNet(PreActivationBottleneck, [3, 4, 23, 3], **kwargs) 214 | return model 215 | 216 | def resnet152(**kwargs): 217 | """Constructs a ResNet-101 model. 218 | """ 219 | model = PreActivationResNet(PreActivationBottleneck, [3, 8, 36, 3], **kwargs) 220 | return model 221 | 222 | def resnet200(**kwargs): 223 | """Constructs a ResNet-101 model. 
224 | """ 225 | model = PreActivationResNet(PreActivationBottleneck, [3, 24, 36, 3], **kwargs) 226 | return model 227 | -------------------------------------------------------------------------------- /tools/models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNet', 'resnet10', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnet200'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class BasicBlock(nn.Module): 31 | expansion = 1 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(BasicBlock, self).__init__() 35 | self.conv1 = conv3x3x3(inplanes, planes, stride) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.relu = nn.ReLU(inplace=True) 38 | self.conv2 = conv3x3x3(planes, planes) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.downsample = downsample 41 | self.stride = stride 42 | 43 | def forward(self, x): 44 | residual = x 45 | 46 | out = self.conv1(x) 47 | out = self.bn1(out) 48 | out = self.relu(out) 49 | 50 | out = self.conv2(out) 51 | out = self.bn2(out) 52 | 53 | if self.downsample is not None: 54 | residual = self.downsample(x) 55 | 56 | out += residual 57 | out = self.relu(out) 58 | 59 | return out 60 | 61 | 62 | class Bottleneck(nn.Module): 63 | expansion = 4 64 | 65 | def __init__(self, inplanes, planes, stride=1, downsample=None): 66 | super(Bottleneck, self).__init__() 67 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 68 | self.bn1 = nn.BatchNorm3d(planes) 69 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 70 | padding=1, bias=False) 71 | self.bn2 = nn.BatchNorm3d(planes) 72 | self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False) 73 | self.bn3 = nn.BatchNorm3d(planes * 4) 74 | self.relu = nn.ReLU(inplace=True) 75 | self.downsample = downsample 76 | self.stride = stride 77 | 78 | def forward(self, x): 79 | residual = x 80 | 81 | out = self.conv1(x) 82 | out = self.bn1(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv2(out) 86 | out = self.bn2(out) 87 | out = self.relu(out) 88 | 89 | out = self.conv3(out) 90 | out = self.bn3(out) 91 | 92 | if self.downsample is not None: 93 | residual = self.downsample(x) 94 | 95 | out += residual 96 | out = self.relu(out) 97 | 98 | return out 99 | 100 | 101 | class ResNet(nn.Module): 102 | 103 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400, last_fc=True): 104 | self.last_fc = last_fc 105 | 106 | self.inplanes = 64 107 | super(ResNet, self).__init__() 108 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 109 | padding=(3, 3, 3), bias=False) 110 | self.bn1 = nn.BatchNorm3d(64) 111 | self.relu = nn.ReLU(inplace=True) 112 | self.maxpool = 
nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 113 | self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type) 114 | self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2) 115 | self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2) 116 | self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2) 117 | last_duration = math.ceil(sample_duration / 16) 118 | last_size = math.ceil(sample_size / 32) 119 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 120 | self.fc = nn.Linear(512 * block.expansion, num_classes) 121 | 122 | for m in self.modules(): 123 | if isinstance(m, nn.Conv3d): 124 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 125 | m.weight.data.normal_(0, math.sqrt(2. / n)) 126 | elif isinstance(m, nn.BatchNorm3d): 127 | m.weight.data.fill_(1) 128 | m.bias.data.zero_() 129 | 130 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 131 | downsample = None 132 | if stride != 1 or self.inplanes != planes * block.expansion: 133 | if shortcut_type == 'A': 134 | downsample = partial(downsample_basic_block, 135 | planes=planes * block.expansion, 136 | stride=stride) 137 | else: 138 | downsample = nn.Sequential( 139 | nn.Conv3d(self.inplanes, planes * block.expansion, 140 | kernel_size=1, stride=stride, bias=False), 141 | nn.BatchNorm3d(planes * block.expansion) 142 | ) 143 | 144 | layers = [] 145 | layers.append(block(self.inplanes, planes, stride, downsample)) 146 | self.inplanes = planes * block.expansion 147 | for i in range(1, blocks): 148 | layers.append(block(self.inplanes, planes)) 149 | 150 | return nn.Sequential(*layers) 151 | 152 | def forward(self, x): 153 | x = self.conv1(x) 154 | x = self.bn1(x) 155 | x = self.relu(x) 156 | x = self.maxpool(x) 157 | 158 | x = self.layer1(x) 159 | x = self.layer2(x) 160 | x = self.layer3(x) 161 | x = self.layer4(x) 162 | 163 | 164 | x = self.avgpool(x) 165 | 166 | x = x.view(x.size(0), -1) 167 | if self.last_fc: 168 | x = self.fc(x) 169 | 170 | return x 171 | 172 | 173 | def get_fine_tuning_parameters(model, ft_begin_index): 174 | if ft_begin_index == 0: 175 | return model.parameters() 176 | 177 | ft_module_names = [] 178 | for i in range(ft_begin_index, 5): 179 | ft_module_names.append('layer{}'.format(ft_begin_index)) 180 | ft_module_names.append('fc') 181 | 182 | parameters = [] 183 | for k, v in model.named_parameters(): 184 | for ft_module in ft_module_names: 185 | if ft_module in k: 186 | parameters.append({'params': v}) 187 | break 188 | else: 189 | parameters.append({'params': v, 'lr': 0.0}) 190 | 191 | return parameters 192 | 193 | 194 | def resnet10(**kwargs): 195 | """Constructs a ResNet-18 model. 196 | """ 197 | model = ResNet(BasicBlock, [1, 1, 1, 1], **kwargs) 198 | return model 199 | 200 | def resnet18(**kwargs): 201 | """Constructs a ResNet-18 model. 202 | """ 203 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 204 | return model 205 | 206 | def resnet34(**kwargs): 207 | """Constructs a ResNet-34 model. 208 | """ 209 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 210 | return model 211 | 212 | def resnet50(**kwargs): 213 | """Constructs a ResNet-50 model. 214 | """ 215 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 216 | return model 217 | 218 | def resnet101(**kwargs): 219 | """Constructs a ResNet-101 model. 
220 | """ 221 | model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 222 | return model 223 | 224 | def resnet152(**kwargs): 225 | """Constructs a ResNet-101 model. 226 | """ 227 | model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 228 | return model 229 | 230 | def resnet200(**kwargs): 231 | """Constructs a ResNet-101 model. 232 | """ 233 | model = ResNet(Bottleneck, [3, 24, 36, 3], **kwargs) 234 | return model 235 | -------------------------------------------------------------------------------- /tools/models/resnext.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['ResNeXt', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class ResNeXtBottleneck(nn.Module): 31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, cardinality, stride=1, downsample=None): 34 | super(ResNeXtBottleneck, self).__init__() 35 | mid_planes = cardinality * int(planes / 32) 36 | self.conv1 = nn.Conv3d(inplanes, mid_planes, kernel_size=1, bias=False) 37 | self.bn1 = nn.BatchNorm3d(mid_planes) 38 | self.conv2 = nn.Conv3d(mid_planes, mid_planes, kernel_size=3, stride=stride, 39 | padding=1, groups=cardinality, bias=False) 40 | self.bn2 = nn.BatchNorm3d(mid_planes) 41 | self.conv3 = nn.Conv3d(mid_planes, planes * self.expansion, kernel_size=1, bias=False) 42 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 43 | self.relu = nn.ReLU(inplace=True) 44 | self.downsample = downsample 45 | self.stride = stride 46 | 47 | def forward(self, x): 48 | residual = x 49 | 50 | out = self.conv1(x) 51 | out = self.bn1(out) 52 | out = self.relu(out) 53 | 54 | out = self.conv2(out) 55 | out = self.bn2(out) 56 | out = self.relu(out) 57 | 58 | out = self.conv3(out) 59 | out = self.bn3(out) 60 | 61 | if self.downsample is not None: 62 | residual = self.downsample(x) 63 | 64 | out += residual 65 | out = self.relu(out) 66 | 67 | return out 68 | 69 | 70 | class ResNeXt(nn.Module): 71 | 72 | def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', cardinality=32, num_classes=400, last_fc=True): 73 | self.last_fc = last_fc 74 | 75 | self.inplanes = 64 76 | super(ResNeXt, self).__init__() 77 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 78 | padding=(3, 3, 3), bias=False) 79 | self.bn1 = nn.BatchNorm3d(64) 80 | self.relu = nn.ReLU(inplace=True) 81 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 82 | self.layer1 = self._make_layer(block, 128, layers[0], shortcut_type, cardinality) 83 | self.layer2 = self._make_layer(block, 256, layers[1], shortcut_type, cardinality, stride=2) 84 | self.layer3 = self._make_layer(block, 512, layers[2], shortcut_type, cardinality, stride=2) 85 | self.layer4 = self._make_layer(block, 1024, layers[3], shortcut_type, 
cardinality, stride=2) 86 | last_duration = math.ceil(sample_duration / 16) 87 | last_size = math.ceil(sample_size / 32) 88 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 89 | self.fc = nn.Linear(cardinality * 32 * block.expansion, num_classes) 90 | 91 | for m in self.modules(): 92 | if isinstance(m, nn.Conv3d): 93 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 94 | m.weight.data.normal_(0, math.sqrt(2. / n)) 95 | elif isinstance(m, nn.BatchNorm3d): 96 | m.weight.data.fill_(1) 97 | m.bias.data.zero_() 98 | 99 | def _make_layer(self, block, planes, blocks, shortcut_type, cardinality, stride=1): 100 | downsample = None 101 | if stride != 1 or self.inplanes != planes * block.expansion: 102 | if shortcut_type == 'A': 103 | downsample = partial(downsample_basic_block, 104 | planes=planes * block.expansion, 105 | stride=stride) 106 | else: 107 | downsample = nn.Sequential( 108 | nn.Conv3d(self.inplanes, planes * block.expansion, 109 | kernel_size=1, stride=stride, bias=False), 110 | nn.BatchNorm3d(planes * block.expansion) 111 | ) 112 | 113 | layers = [] 114 | layers.append(block(self.inplanes, planes, cardinality, stride, downsample)) 115 | self.inplanes = planes * block.expansion 116 | for i in range(1, blocks): 117 | layers.append(block(self.inplanes, planes, cardinality)) 118 | 119 | return nn.Sequential(*layers) 120 | 121 | def forward(self, x): 122 | x = self.conv1(x) 123 | x = self.bn1(x) 124 | x = self.relu(x) 125 | x = self.maxpool(x) 126 | 127 | x = self.layer1(x) 128 | x = self.layer2(x) 129 | x = self.layer3(x) 130 | x = self.layer4(x) 131 | 132 | 133 | x = self.avgpool(x) 134 | x = x.view(x.size(0), -1) 135 | if self.last_fc: 136 | x = self.fc(x) 137 | 138 | return x 139 | 140 | def get_fine_tuning_parameters(model, ft_begin_index): 141 | if ft_begin_index == 0: 142 | return model.parameters() 143 | 144 | ft_module_names = [] 145 | for i in range(ft_begin_index, 5): 146 | ft_module_names.append('layer{}'.format(ft_begin_index)) 147 | ft_module_names.append('fc') 148 | 149 | parameters = [] 150 | for k, v in model.named_parameters(): 151 | for ft_module in ft_module_names: 152 | if ft_module in k: 153 | parameters.append({'params': v}) 154 | break 155 | else: 156 | parameters.append({'params': v, 'lr': 0.0}) 157 | 158 | return parameters 159 | 160 | def resnet50(**kwargs): 161 | """Constructs a ResNet-50 model. 162 | """ 163 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 6, 3], **kwargs) 164 | return model 165 | 166 | def resnet101(**kwargs): 167 | """Constructs a ResNet-101 model. 168 | """ 169 | model = ResNeXt(ResNeXtBottleneck, [3, 4, 23, 3], **kwargs) 170 | return model 171 | 172 | def resnet152(**kwargs): 173 | """Constructs a ResNet-101 model. 
174 | """ 175 | model = ResNeXt(ResNeXtBottleneck, [3, 8, 36, 3], **kwargs) 176 | return model 177 | -------------------------------------------------------------------------------- /tools/models/wide_resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import math 6 | from functools import partial 7 | 8 | __all__ = ['WideResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101'] 9 | 10 | 11 | def conv3x3x3(in_planes, out_planes, stride=1): 12 | # 3x3x3 convolution with padding 13 | return nn.Conv3d(in_planes, out_planes, kernel_size=3, 14 | stride=stride, padding=1, bias=False) 15 | 16 | 17 | def downsample_basic_block(x, planes, stride): 18 | out = F.avg_pool3d(x, kernel_size=1, stride=stride) 19 | zero_pads = torch.Tensor(out.size(0), planes - out.size(1), 20 | out.size(2), out.size(3), 21 | out.size(4)).zero_() 22 | if isinstance(out.data, torch.cuda.FloatTensor): 23 | zero_pads = zero_pads.cuda() 24 | 25 | out = Variable(torch.cat([out.data, zero_pads], dim=1)) 26 | 27 | return out 28 | 29 | 30 | class WideBottleneck(nn.Module): 31 | expansion = 2 32 | 33 | def __init__(self, inplanes, planes, stride=1, downsample=None): 34 | super(WideBottleneck, self).__init__() 35 | self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) 36 | self.bn1 = nn.BatchNorm3d(planes) 37 | self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, 38 | padding=1, bias=False) 39 | self.bn2 = nn.BatchNorm3d(planes) 40 | self.conv3 = nn.Conv3d(planes, planes * self.expansion, kernel_size=1, bias=False) 41 | self.bn3 = nn.BatchNorm3d(planes * self.expansion) 42 | self.relu = nn.ReLU(inplace=True) 43 | self.downsample = downsample 44 | self.stride = stride 45 | 46 | def forward(self, x): 47 | residual = x 48 | 49 | out = self.conv1(x) 50 | out = self.bn1(out) 51 | out = self.relu(out) 52 | 53 | out = self.conv2(out) 54 | out = self.bn2(out) 55 | out = self.relu(out) 56 | 57 | out = self.conv3(out) 58 | out = self.bn3(out) 59 | 60 | if self.downsample is not None: 61 | residual = self.downsample(x) 62 | 63 | out += residual 64 | out = self.relu(out) 65 | 66 | return out 67 | 68 | 69 | class WideResNet(nn.Module): 70 | 71 | def __init__(self, block, layers, sample_size, sample_duration, k=1, shortcut_type='B', num_classes=400, last_fc=True): 72 | self.last_fc = last_fc 73 | 74 | self.inplanes = 64 75 | super(WideResNet, self).__init__() 76 | self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), 77 | padding=(3, 3, 3), bias=False) 78 | self.bn1 = nn.BatchNorm3d(64) 79 | self.relu = nn.ReLU(inplace=True) 80 | self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1) 81 | self.layer1 = self._make_layer(block, 64 * k, layers[0], shortcut_type) 82 | self.layer2 = self._make_layer(block, 128 * k, layers[1], shortcut_type, stride=2) 83 | self.layer3 = self._make_layer(block, 256 * k, layers[2], shortcut_type, stride=2) 84 | self.layer4 = self._make_layer(block, 512 * k, layers[3], shortcut_type, stride=2) 85 | last_duration = math.ceil(sample_duration / 16) 86 | last_size = math.ceil(sample_size / 32) 87 | self.avgpool = nn.AvgPool3d((last_duration, last_size, last_size), stride=1) 88 | self.fc = nn.Linear(512 * k * block.expansion, num_classes) 89 | 90 | for m in self.modules(): 91 | if isinstance(m, nn.Conv3d): 92 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 93 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 94 | elif isinstance(m, nn.BatchNorm3d): 95 | m.weight.data.fill_(1) 96 | m.bias.data.zero_() 97 | 98 | def _make_layer(self, block, planes, blocks, shortcut_type, stride=1): 99 | downsample = None 100 | if stride != 1 or self.inplanes != planes * block.expansion: 101 | if shortcut_type == 'A': 102 | downsample = partial(downsample_basic_block, 103 | planes=planes * block.expansion, 104 | stride=stride) 105 | else: 106 | downsample = nn.Sequential( 107 | nn.Conv3d(self.inplanes, planes * block.expansion, 108 | kernel_size=1, stride=stride, bias=False), 109 | nn.BatchNorm3d(planes * block.expansion) 110 | ) 111 | 112 | layers = [] 113 | layers.append(block(self.inplanes, planes, stride, downsample)) 114 | self.inplanes = planes * block.expansion 115 | for i in range(1, blocks): 116 | layers.append(block(self.inplanes, planes)) 117 | 118 | return nn.Sequential(*layers) 119 | 120 | def forward(self, x): 121 | x = self.conv1(x) 122 | x = self.bn1(x) 123 | x = self.relu(x) 124 | x = self.maxpool(x) 125 | 126 | x = self.layer1(x) 127 | x = self.layer2(x) 128 | x = self.layer3(x) 129 | x = self.layer4(x) 130 | 131 | x = self.avgpool(x) 132 | 133 | x = x.view(x.size(0), -1) 134 | if self.last_fc: 135 | x = self.fc(x) 136 | 137 | return x 138 | 139 | def get_fine_tuning_parameters(model, ft_begin_index): 140 | if ft_begin_index == 0: 141 | return model.parameters() 142 | 143 | ft_module_names = [] 144 | for i in range(ft_begin_index, 5): 145 | ft_module_names.append('layer{}'.format(ft_begin_index)) 146 | ft_module_names.append('fc') 147 | 148 | parameters = [] 149 | for k, v in model.named_parameters(): 150 | for ft_module in ft_module_names: 151 | if ft_module in k: 152 | parameters.append({'params': v}) 153 | break 154 | else: 155 | parameters.append({'params': v, 'lr': 0.0}) 156 | 157 | return parameters 158 | 159 | def resnet50(**kwargs): 160 | """Constructs a ResNet-50 model. 
161 | """ 162 | model = WideResNet(WideBottleneck, [3, 4, 6, 3], **kwargs) 163 | return model 164 | -------------------------------------------------------------------------------- /tools/object_align.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '../') 3 | import h5py 4 | import os.path as osp 5 | import numpy as np 6 | from sklearn.metrics.pairwise import pairwise_distances 7 | from sklearn.preprocessing import normalize 8 | import sys 9 | sys.path.insert(0, '../') 10 | from util import load_file, save_to 11 | import os 12 | 13 | def align_object(video_feature_path, mode): 14 | bbox_feat_file = osp.join(video_feature_path, 'region_8c10b_{}.h5'.format(mode)) 15 | print('Load {}...'.format(bbox_feat_file)) 16 | out_file = osp.join(bbox_feat_file+'.h5') 17 | fout = h5py.File(out_file, 'w') 18 | string_dt = h5py.special_dtype(vlen=str) 19 | with h5py.File(bbox_feat_file, 'r') as fp: 20 | vids = fp['ids'] 21 | feats = fp['feat'] 22 | bboxes = fp['bbox'] 23 | fout.create_dataset('ids', shape=vids.shape, dtype=string_dt, data=vids) 24 | 25 | feat_alns, bbox_alns = [], [] 26 | for id, (vid, feat, bbox) in enumerate(zip(vids, feats, bboxes)): 27 | 28 | cnum, fnum, rnum, _ = feat.shape 29 | cur_feat_aln, cur_bbox_aln = [], [] 30 | for cid, (cur_feat, cur_bbox) in enumerate(zip(feat, bbox)): 31 | vid_feat_aln, vid_bbox_aln = align(cur_feat, cur_bbox, vid, cid) 32 | cur_feat_aln.append(vid_feat_aln) 33 | cur_bbox_aln.append(vid_bbox_aln) 34 | 35 | feat_alns.append(cur_feat_aln) 36 | bbox_alns.append(cur_bbox_aln) 37 | if id % 100 == 0: 38 | print(f'{id}/{len(vids)}') 39 | 40 | feat_alns = np.asarray(feat_alns) 41 | bbox_alns = np.asarray(bbox_alns) 42 | print(feat_alns.shape, bbox_alns.shape) 43 | 44 | fout.create_dataset('feat', shape=feat_alns.shape, dtype=np.float32, data=feat_alns) 45 | fout.create_dataset('bbox', shape=bbox_alns.shape, dtype=np.float32, data=bbox_alns) 46 | 47 | 48 | def align_object_byv(video_feature_path, vlist_file): 49 | vlist = load_file(vlist_file) 50 | indir = osp.join(video_feature_path, 'bbox_feat') 51 | outdir = osp.join(video_feature_path, 'bbox_feat_aln') 52 | vnum = len(vlist) 53 | print(vnum) 54 | for idx, vid in enumerate(vlist): 55 | if idx <= 8000: continue 56 | if idx > 10000: break 57 | outfile = osp.join(outdir, vid+'.npz') 58 | if osp.exists(outfile): 59 | continue 60 | infile = osp.join(indir, vid+'.npz') 61 | region_feat = np.load(infile) 62 | 63 | roi_feat, roi_bbox = align_feat_bbox(region_feat['feat'][:8], region_feat['bbox'][:8], vid) 64 | out_dir = osp.dirname(outfile) 65 | if not osp.exists(out_dir): 66 | os.makedirs(out_dir) 67 | np.savez_compressed(outfile, feat=roi_feat, bbox=roi_bbox) 68 | if idx % 100 == 0: 69 | print(f'{idx}/{vnum}', outfile) 70 | print(roi_feat.shape, roi_bbox.shape) 71 | 72 | 73 | def align_feat_bbox(feat, bbox, vid): 74 | cur_feat_aln, cur_bbox_aln = [], [] 75 | for cid, (cur_feat, cur_bbox) in enumerate(zip(feat, bbox)): 76 | vid_feat_aln, vid_bbox_aln = align(cur_feat, cur_bbox, vid, cid) 77 | cur_feat_aln.append(vid_feat_aln) 78 | cur_bbox_aln.append(vid_bbox_aln) 79 | return np.asarray(cur_feat_aln), np.asarray(cur_bbox_aln) 80 | 81 | 82 | def align(feats, bboxes, vid, cid): 83 | new_feats, new_bboxes = [], [] 84 | paths = get_tracks(feats, bboxes, vid, cid) 85 | for i in range(len(paths)): 86 | obj_feat, obj_pos = [], [] 87 | for fid in range(len(feats)): 88 | feat = feats[fid][paths[i][fid]] 89 | bbox = bboxes[fid][paths[i][fid]] 90 | 
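            # paths[i][fid] is the region index that the greedy tracker
            # (get_tracks / get_link below) linked to object track i in frame fid;
            # candidate links are scored by appearance cosine similarity plus box
            # IoU with equal weight, so feat/bbox here follow one object through the clip.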
obj_feat.append(feat) 91 | obj_pos.append(bbox) 92 | new_feats.append(obj_feat) 93 | new_bboxes.append(obj_pos) 94 | new_feats = np.asarray(new_feats).transpose(1, 0, 2) 95 | new_bboxes = np.asarray(new_bboxes).transpose(1, 0, 2) 96 | return new_feats, new_bboxes 97 | 98 | 99 | def get_tracks(feats, bboxes, vid, cid): 100 | links = get_link(feats, bboxes) 101 | paths = [] 102 | for i in range(bboxes.shape[1]): 103 | max_path = find_max_path_greedy(links, i) 104 | links = update_links(links, max_path) 105 | max_path = [i] + max_path 106 | paths.append(max_path) 107 | # vis_path(vid, cid, bboxes, max_path) 108 | # break 109 | return paths 110 | 111 | 112 | def get_link(feats, bboxes): 113 | fnum = feats.shape[0] 114 | link_cretiria = [] 115 | for fid in range(fnum-1): 116 | feat_p, feat_n = feats[fid], feats[fid+1] 117 | sim_f = pairwise_distances(feat_p, feat_n, 'cosine', n_jobs=1) 118 | sim_f = 1-sim_f 119 | box_p, box_n = bboxes[fid], bboxes[fid+1] 120 | areas_p = np.array([get_area(bbox) for bbox in box_p]) 121 | areas_n = np.array([get_area(bbox) for bbox in box_n]) 122 | op_box = [] 123 | for bid, bbox in enumerate(box_p): 124 | area_p = areas_p[bid] 125 | x1 = np.maximum(bbox[0], box_n[:, 0]) 126 | y1 = np.maximum(bbox[1], box_n[:, 1]) 127 | x2 = np.minimum(bbox[2], box_n[:, 2]) 128 | y2 = np.minimum(bbox[3], box_n[:, 3]) 129 | W = np.maximum(0, x2 - x1 + 1) 130 | H = np.maximum(0, y2 - y1 + 1) 131 | ov_area = W * H 132 | IoUs = ov_area / (area_p + areas_n - ov_area) 133 | op_box.append(IoUs) 134 | scores = np.asarray(op_box) + sim_f #equal importance 135 | link_cretiria.append(scores) 136 | return np.asarray(link_cretiria) 137 | 138 | 139 | def update_links(links, max_path): 140 | """ 141 | remove the nodes at the max_path 142 | """ 143 | for i, v in enumerate(max_path): 144 | links[i][v] = 0 145 | return links 146 | 147 | 148 | def find_max_path_greedy(link_scores, sid): 149 | path = [] 150 | for i in range(link_scores.shape[0]): 151 | sid = np.argmax(link_scores[i][sid]) 152 | path.append(sid) 153 | return path 154 | 155 | 156 | def get_area(bbox): 157 | area = (bbox[2]-bbox[0]+1)*(bbox[3]-bbox[1]+1) 158 | return area 159 | 160 | 161 | def main(): 162 | video_feature_path = f'../../data/feats/nextqa/region_feat_n/' 163 | align_object(video_feature_path, 'test') 164 | # dataset_dir = '../../data/datasets/nextqa/test.csv' 165 | # vlist_file = dataset_dir + 'vlist.json' 166 | # if osp.exists(vlist_file): 167 | # vlist = load_file(vlist_file) 168 | # else: 169 | # data = load_file(dataset_dir) 170 | # vlist = list(set(list(data['video_id']))) 171 | # save_to(vlist_file, vlist) 172 | # align_object_byv(video_feature_path, vlist_file) 173 | 174 | 175 | if __name__ == "__main__": 176 | main() 177 | -------------------------------------------------------------------------------- /tools/preprocess_features.py: -------------------------------------------------------------------------------- 1 | import argparse, os 2 | import h5py 3 | from scipy.misc import imresize 4 | import skvideo.io as sio 5 | from PIL import Image 6 | import cv2 7 | import json 8 | import torch 9 | from torch import nn 10 | import torchvision 11 | import random 12 | import numpy as np 13 | import shutil 14 | import subprocess 15 | from models import resnext 16 | from datautils import utils 17 | from datautils import tgif_qa 18 | from datautils import msrvtt_qa 19 | from datautils import msvd_qa 20 | import os.path as osp 21 | import sys 22 | sys.path.insert(0, '../') 23 | import time 24 | from util import load_file, 
save_to 25 | 26 | 27 | def build_resnet(): 28 | if not hasattr(torchvision.models, args.model): 29 | raise ValueError('Invalid model "%s"' % args.model) 30 | if not 'resnet' in args.model: 31 | raise ValueError('Feature extraction only supports ResNets') 32 | cnn = getattr(torchvision.models, args.model)(pretrained=True) 33 | model = torch.nn.Sequential(*list(cnn.children())[:-1]) 34 | 35 | model.cuda() 36 | model.eval() 37 | return model 38 | 39 | 40 | def build_resnext(): 41 | model = resnext.resnet101(num_classes=400, shortcut_type='B', cardinality=32, 42 | sample_size=112, sample_duration=16, 43 | last_fc=False) 44 | model = model.cuda() 45 | model = nn.DataParallel(model, device_ids=None) 46 | assert os.path.exists('../../data/pretrained/resnext-101-kinetics.pth') 47 | # download from https://drive.google.com/drive/folders/1zvl89AgFAApbH0At-gMuZSeQB_LpNP-M 48 | model_data = torch.load('../../data/pretrained/resnext-101-kinetics.pth', map_location='cpu') 49 | model.load_state_dict(model_data['state_dict']) 50 | model.eval() 51 | return model 52 | 53 | def extract_frame(video, dst): 54 | with open(os.devnull, 'w') as ffmpeg_log: 55 | if os.path.exists(dst): 56 | # print(" cleanup: "+dst+"/") 57 | shutil.rmtree(dst) 58 | os.makedirs(dst) 59 | video2frame_cmd = [ 60 | "ffmpeg", 61 | '-y', 62 | '-i', video, 63 | '-r', "10", 64 | # '-vf', "scale=400:300", 65 | '-vsync', '0', 66 | '-qscale:v', "2", 67 | '{0}/%06d.jpg'.format(dst) 68 | ] 69 | subprocess.call(video2frame_cmd, stdout = ffmpeg_log, stderr=ffmpeg_log) 70 | 71 | 72 | def run_batch(cur_batch, model): 73 | """ 74 | Args: 75 | cur_batch: treat a video as a batch of images 76 | model: ResNet model for feature extraction 77 | Returns: 78 | ResNet extracted feature. 79 | """ 80 | mean = np.array([0.485, 0.456, 0.406]).reshape(1, 3, 1, 1) 81 | std = np.array([0.229, 0.224, 0.224]).reshape(1, 3, 1, 1) 82 | 83 | image_batch = np.concatenate(cur_batch, 0).astype(np.float32) 84 | image_batch = (image_batch / 255.0 - mean) / std 85 | image_batch = torch.FloatTensor(image_batch).cuda() 86 | with torch.no_grad(): 87 | image_batch = torch.autograd.Variable(image_batch) 88 | 89 | feats = model(image_batch) 90 | feats = feats.data.cpu().clone().numpy() 91 | 92 | return feats 93 | 94 | 95 | def extract_clips_with_consecutive_frames(path, num_clips, num_frames_per_clip): 96 | """ 97 | Args: 98 | path: path of a video 99 | num_clips: expected numbers of splitted clips 100 | num_frames_per_clip: number of frames in a single clip, pretrained model only supports 16 frames 101 | Returns: 102 | A list of raw features of clips. 
103 | """ 104 | 105 | clips = list() 106 | t1 = time.time() 107 | frame_list = sorted(os.listdir(path)) 108 | video_data = [np.asarray(Image.open(osp.join(path, img))) for img in frame_list] 109 | 110 | valid = True 111 | video_data = np.asarray(video_data) 112 | t2 = time.time() 113 | print(t2-t1) 114 | 115 | total_frames = video_data.shape[0] 116 | img_size = (args.image_height, args.image_width) 117 | for i in np.linspace(0, total_frames, num_clips + 2, dtype=np.int32)[1:num_clips + 1]: 118 | clip_start = int(i) - int(num_frames_per_clip / 2) 119 | clip_end = int(i) + int(num_frames_per_clip / 2) 120 | if clip_start < 0: 121 | clip_start = 0 122 | if clip_end > total_frames: 123 | clip_end = total_frames - 1 124 | clip = video_data[clip_start:clip_end] 125 | 126 | if clip_start == 0: 127 | shortage = num_frames_per_clip - (clip_end - clip_start) 128 | added_frames = [] 129 | for _ in range(shortage): 130 | added_frames.append(np.expand_dims(video_data[clip_start], axis=0)) 131 | if len(added_frames) > 0: 132 | added_frames = np.concatenate(added_frames, axis=0) 133 | clip = np.concatenate((added_frames, clip), axis=0) 134 | if clip_end == (total_frames - 1): 135 | shortage = num_frames_per_clip - (clip_end - clip_start) 136 | added_frames = [] 137 | for _ in range(shortage): 138 | added_frames.append(np.expand_dims(video_data[clip_end], axis=0)) 139 | if len(added_frames) > 0: 140 | added_frames = np.concatenate(added_frames, axis=0) 141 | clip = np.concatenate((clip, added_frames), axis=0) 142 | 143 | 144 | # new_clip = clip #.transpose(0, 3, 1, 2)[None] 145 | # if clip.shape[0] < num_frames_per_clip: 146 | clip = clip[::4] #sample 4 frames per clip 147 | new_clip = [] 148 | # for j in range(num_frames_per_clip): 149 | # if j >= len(clip): 150 | # new_clip.append(new_clip[-1]) 151 | # else: 152 | # new_clip.append(clip[j]) 153 | for frame_data in clip: 154 | # frame_data = clip[j] 155 | img = Image.fromarray(frame_data) 156 | img = imresize(img, img_size, interp='bicubic') 157 | frame_data = np.array(img) 158 | frame_data = frame_data.transpose(2, 0, 1)[None] 159 | new_clip.append(frame_data) 160 | new_clip = np.asarray(new_clip) # (num_frames, width, height, channels) 161 | # print(new_clip.shape) 162 | if args.model in ['resnext101']: 163 | new_clip = np.squeeze(new_clip) 164 | new_clip = np.transpose(new_clip, axes=(1, 0, 2, 3)) 165 | clips.append(new_clip) 166 | 167 | clips = clips[::4] # sample 8 clips per video 168 | t3 = time.time() 169 | 170 | return clips, valid 171 | 172 | def extract_clip_frames(vpath, clips): 173 | """ 174 | Args: 175 | path: path of a video 176 | num_clips: expected numbers of splitted clips 177 | num_frames_per_clip: number of frames in a single clip, pretrained model only supports 16 frames 178 | Returns: 179 | A list of raw features of clips. 
180 | """ 181 | # para_dict = {'r':'10', 'vsync':'0', 'qscale:v':'2'} 182 | # print(vpath) 183 | # rate = 10 184 | # meta = skvideo.io.ffprobe(vpath) 185 | # fp = meta['video']['@avg_frame_rate'] 186 | # tstamp = int(fp.split('/')[0])//rate 187 | try: 188 | video_data = sio.vread(vpath) #ffmpeg as backend 189 | except: 190 | return None 191 | # video_data = video_data[::tstamp] 192 | total_frames, width, height, channel = video_data.shape 193 | # print(video_data.shape) 194 | img_size = (224, 224) #(args.image_height, args.image_width) 195 | img_clip = [] 196 | num_clip = 8 197 | clips = clips[:8] 198 | for i, cids in enumerate(clips): 199 | # if i > 7: break 200 | fids = [int(r) for r in cids] 201 | # print(fids, video_data.shape) 202 | if fids[-1] >= total_frames: 203 | fids[-1] = total_frames -1 204 | clip = video_data[fids] 205 | new_clip = [] 206 | for j in range(4): 207 | frame_data = clip[j] 208 | img = Image.fromarray(frame_data) 209 | img = imresize(img, img_size, interp='bicubic') 210 | img = img.transpose(2, 0, 1)[None] 211 | frame_data = np.array(img) 212 | new_clip.append(frame_data) 213 | # new_clip = np.asarray(new_clip) # (num_frames, width, height, channels) 214 | img_clip.extend(new_clip) 215 | 216 | return img_clip 217 | 218 | 219 | def generate_npy(model, video_dir, clip_file, outfile): 220 | 221 | vclips = load_file(clip_file) 222 | vclips = sorted(vclips.items(), key=lambda a:a[0]) 223 | dataset_size = len(vclips) 224 | print(dataset_size) 225 | 226 | i0 = 0 227 | _t = {'misc': utils.Timer()} 228 | for i, (vname, clip) in enumerate(vclips): 229 | #if i <= 4000: continue 230 | #if i > 10000: break 231 | out_file = osp.join(outfile, vname+'.npy') 232 | if osp.exists(out_file): 233 | continue 234 | video_path = osp.join(video_dir, vname+'.mp4') 235 | if not osp.exists(video_path): 236 | # print(video_path) 237 | continue 238 | clips = extract_clip_frames(video_path, clip) 239 | if clips == None: continue 240 | clips = np.asarray(clips) 241 | clip_feat = run_batch(clips, model) 242 | clip_feat = clip_feat.squeeze()#(32, 2048) 243 | 244 | feat = clip_feat.reshape(8, 4, 2048) 245 | dirname = osp.dirname(out_file) 246 | if not osp.exists(dirname): 247 | os.makedirs(dirname) 248 | np.save(out_file, feat) 249 | if i % 200 == 0: 250 | print(f'{i}/{dataset_size}') 251 | 252 | def prepare_inputs(path, frame_list): 253 | video_data = [np.asarray(Image.open(osp.join(path, img))) for img in frame_list] 254 | video_data = np.asarray(video_data) 255 | total_frames = video_data.shape[0] 256 | img_size = (224, 224) 257 | video_inputs = [] 258 | for j in range(total_frames): 259 | frame_data = video_data[j] 260 | img = Image.fromarray(frame_data) 261 | img = imresize(img, img_size, interp='bicubic') 262 | img = img.transpose(2, 0, 1)[None] 263 | frame_data = np.array(img) 264 | video_inputs.append(frame_data) 265 | video_inputs = np.asarray(video_inputs) 266 | # print(video_inputs.shape) 267 | return video_inputs 268 | 269 | def generate_npy_byframe(model, video_list_file, video_dir, out_dir): 270 | videos = load_file(video_list_file) 271 | vnum = len(videos) 272 | for iv, vname in enumerate(videos): 273 | # if iv <= 2400: continue 274 | # if iv > 3000: break 275 | fpath = f'{video_dir}/{vname}' 276 | frames = sorted(os.listdir(fpath)) 277 | out_path = osp.join(out_dir, vname) 278 | if osp.exists(out_path): continue 279 | videos = prepare_inputs(fpath, frames) 280 | fnum = videos.shape[0] 281 | if fnum > 100: 282 | it = fnum//100 283 | left = fnum % 100 284 | video_feats = [] 285 | for i 
in range(it): 286 | data = run_batch(videos[i*100:100*(i+1)], model) 287 | video_feats.append(data) 288 | if left > 0: 289 | data = run_batch(videos[i*100:(i*100)+left], model) 290 | video_feats.append(data) 291 | # print(len(video_feats)) 292 | video_feats = np.concatenate(video_feats, 0) 293 | assert video_feats.shape[0] == fnum, 'error' 294 | else: 295 | video_feats = run_batch(videos, model) 296 | video_feats = video_feats.squeeze() 297 | if not osp.exists(out_path): 298 | os.makedirs(out_path) 299 | for iff, frame in enumerate(frames): 300 | fname = frame.split('.')[0] 301 | fpath_out = f'{out_path}/{fname}' 302 | # if osp.exists(fpath_out+'.npy'): continue 303 | np.save(fpath_out, video_feats[iff]) 304 | if iv % 100 == 0: 305 | print(f'{iv}/{vnum}') 306 | 307 | 308 | def generate_h5(model, v_path, v_file, num_clips, outfile): 309 | """ 310 | Args: 311 | model: loaded pretrained model for feature extraction 312 | video_ids: list of video ids 313 | num_clips: expected numbers of splitted clips 314 | outfile: path of output file to be written 315 | Returns: 316 | h5 file containing visual features of splitted clips. 317 | """ 318 | if args.dataset == "tgif-qa": 319 | if not os.path.exists('dataset/tgif-qa/{}'.format(args.question_type)): 320 | os.makedirs('dataset/tgif-qa/{}'.format(args.question_type)) 321 | else: 322 | if not os.path.exists(args.dataset): 323 | os.makedirs(args.dataset) 324 | 325 | vlist = load_file(v_file) 326 | dataset_size = len(vlist) 327 | print(dataset_size) 328 | vnames = [] 329 | with h5py.File(outfile, 'w') as fd: 330 | feat_dset = None 331 | video_ids_dset = None 332 | i0 = 0 333 | _t = {'misc': utils.Timer()} 334 | for i in range(0, dataset_size): 335 | # if i < 20: continue 336 | _t['misc'].tic() 337 | 338 | video_path = osp.join(v_path, str(vlist[i])) 339 | 340 | clips, valid = extract_clips_with_consecutive_frames(video_path, num_clips=num_clips, num_frames_per_clip=16) 341 | 342 | nclip, nframe = 8, 4 343 | if args.feature_type == 'appearance': 344 | clip_feat = [] 345 | if valid: 346 | # for clip_id, clip in enumerate(clips): 347 | # feats = run_batch(clip, model) # (16, 2048) 348 | # feats = feats.squeeze() 349 | # clip_feat.append(feats) 350 | # t4 = time.time() 351 | clips = np.asarray(clips).squeeze() 352 | clips = clips.reshape(clips.shape[0]*clips.shape[1], clips.shape[2],clips.shape[3],clips.shape[4]) 353 | 354 | clips = torch.FloatTensor(clips).cuda().squeeze() 355 | # print(clips.shape) 356 | clip_feat = model(clips).squeeze() 357 | # print(clip_feat.shape) 358 | clip_feat = clip_feat.view(nclip, nframe, -1).detach().cpu().numpy() 359 | else: 360 | clip_feat = np.zeros(shape=(nclip, nframe, 2048)) 361 | 362 | if feat_dset is None: 363 | print(clip_feat.shape) 364 | C, F, D = clip_feat.shape 365 | feat_dset = fd.create_dataset('resnet_features', (dataset_size, C, F, D), 366 | dtype=np.float32) 367 | video_ids_dset = fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int) 368 | 369 | elif args.feature_type == 'motion': 370 | if valid: 371 | clip_torch = torch.FloatTensor(np.asarray(clips)).cuda() 372 | clip_feat = model(clip_torch) # (8, 2048) 373 | clip_feat = clip_feat.squeeze() 374 | clip_feat = clip_feat.detach().cpu().numpy() 375 | else: 376 | clip_feat = np.zeros(shape=(nclip, 2048)) 377 | if feat_dset is None: 378 | print(clip_feat.shape) 379 | C, D = clip_feat.shape 380 | feat_dset = fd.create_dataset('resnext_features', (dataset_size, C, D), 381 | dtype=np.float32) 382 | video_ids_dset = fd.create_dataset('ids', 
shape=(dataset_size,), dtype=np.int) 383 | 384 | 385 | i1 = i0 + 1 386 | feat_dset[i0:i1] = clip_feat 387 | video_ids_dset[i0:i1] = int(vlist[i]) 388 | i0 = i1 389 | _t['misc'].toc() 390 | 391 | if (i % 100 == 0): 392 | print('{:d}/{:d} {:.3f}s (projected finish: {:.2f} hours)' \ 393 | .format(i1, dataset_size, _t['misc'].average_time, 394 | _t['misc'].average_time * (dataset_size - i1) / 3600)) 395 | 396 | varry = np.array(vlist, dtype=object) 397 | string_dt = h5py.special_dtype(vlen=str) 398 | fd.create_dataset('ids', data=varry, dtype=string_dt) 399 | 400 | 401 | if __name__ == '__main__': 402 | parser = argparse.ArgumentParser() 403 | parser.add_argument('--gpu_id', type=int, default=0, help='specify which gpu will be used') 404 | # dataset info 405 | parser.add_argument('--dataset', default='nextqa', choices=['tgif-qa', 'msvd', 'star', 'msrvtt', 'nextqa','webvid', 'causalvid'], type=str) 406 | parser.add_argument('--question_type', default='none', choices=['frameqa', 'count', 'transition', 'action', 'none'], type=str) 407 | # output 408 | parser.add_argument('--out', dest='outfile', 409 | help='output filepath', 410 | default="../../data/nextqa/feat_{}.h5", type=str) 411 | # image sizes 412 | parser.add_argument('--num_clips', default=32, type=int) 413 | parser.add_argument('--image_height', default=112*2, type=int) 414 | parser.add_argument('--image_width', default=112*2, type=int) 415 | 416 | # network params 417 | parser.add_argument('--model', default='resnet101', choices=['resnet101', 'resnext101'], type=str) 418 | parser.add_argument('--seed', default='666', type=int, help='random seed') 419 | args = parser.parse_args() 420 | if args.model == 'resnet101': 421 | args.feature_type = 'appearance' 422 | elif args.model == 'resnext101': 423 | args.feature_type = 'motion' 424 | else: 425 | raise Exception('Feature type not supported!') 426 | # set gpu 427 | if args.model != 'resnext101': 428 | torch.cuda.set_device(args.gpu_id) 429 | torch.manual_seed(args.seed) 430 | np.random.seed(args.seed) 431 | 432 | # annotation files 433 | if args.dataset == 'tgifqa': 434 | args.annotation_file = '/storage_fast/jbxiao/workspace/VideoQA/data/{args.dataset}/videos.json' 435 | args.video_dir = '/raid/jbxiao/data/tgifqa/frames/' 436 | args.outfile = '../../data/{}/{}/{}_{}_{}_feat.h5' 437 | video_paths = tgif_qa.load_video_paths(args) 438 | random.shuffle(video_paths) 439 | # load model 440 | if args.model == 'resnet101': 441 | model = build_resnet() 442 | elif args.model == 'resnext101': 443 | model = build_resnext() 444 | generate_h5(model, video_paths, args.num_clips, 445 | args.outfile.format(args.dataset, args.question_type, args.dataset, args.question_type, args.feature_type)) 446 | 447 | elif args.dataset == 'webvid': 448 | args.video_dir = '/raid/jbxiao/data/WebVid/videos/' 449 | if args.model == 'resnet101': 450 | model = build_resnet() 451 | elif args.model == 'resnext101': 452 | model = build_resnext() 453 | clip_file = f'/storage_fast/jbxiao/workspace/VideoQA/data/datasets/webvid/val_clip.json' 454 | generate_npy(model, args.video_dir, clip_file, args.outfile) 455 | 456 | 457 | elif args.dataset == 'msvd-qa': 458 | args.annotation_file = '/ceph-g/lethao/datasets/msvd/MSVD-QA/{}_qa.json' 459 | args.video_dir = '/ceph-g/lethao/datasets/msvd/MSVD-QA/video/' 460 | args.video_name_mapping = '/ceph-g/lethao/datasets/msvd/youtube_mapping.txt' 461 | video_paths = msvd_qa.load_video_paths(args) 462 | random.shuffle(video_paths) 463 | # load model 464 | if args.model == 'resnet101': 465 | model 
466 |         elif args.model == 'resnext101':
467 |             model = build_resnext()
468 |         generate_h5(model, video_paths, args.num_clips,
469 |                     args.outfile.format(args.dataset, args.dataset, args.feature_type))
470 |
471 |     elif args.dataset == 'nextqa':
472 |         args.video_list_file = '../datasets/nextqa/vlist.json' #obtained from train/val/test csv files
473 |         args.video_dir = '/storage/jbxiao/workspace/data/nextqa/frames/' #extracted video frames, refer to extract_video.py
474 |         if args.model == 'resnet101':
475 |             model = build_resnet()
476 |         elif args.model == 'resnext101':
477 |             model = build_resnext()
478 |         args.image_height = 112
479 |         args.image_width = 112
480 |         generate_h5(model, args.video_dir, args.video_list_file, args.num_clips, args.outfile.format(args.feature_type))
481 |
--------------------------------------------------------------------------------
/tools/split_dataset_feat.py:
--------------------------------------------------------------------------------
1 | # ====================================================
2 | # @Time : 6/5/21 1:32 PM
3 | # @Author : Xiao Junbin
4 | # @Email : junbin@comp.nus.edu.sg
5 | # @File : split_dataset_feat.py
6 | # ====================================================
7 | import h5py
8 | import numpy as np
9 | import os
10 | import os.path as osp
11 | import pandas as pd
12 |
13 |
14 | def np2h5(in_dir, out_dir, video_list, mode):
15 |     out_file = osp.join(out_dir, 'region_16c20b_{}.h5'.format(mode))
16 |     video_fd = h5py.File(out_file, 'w')
17 |     feat_dset, bbox_dset, ids_dset = None, None, None
18 |     bbox_num = 20
19 |     for video in video_list:
20 |         bbox_file = osp.join(in_dir, str(video) + '.npz')
21 |         npz = np.load(bbox_file)
22 |         roi_feat = npz['feat']
23 |         bnum = roi_feat.shape[2]
24 |         roi_bbox = npz['bbox']
25 |         # if bnum < bbox_num:
26 |         #     add_num = bbox_num - bnum
27 |         #     print(add_num)
28 |         #     add_feat, add_bbox = [], []
29 |         #     for _ in range(add_num):
30 |         #         add_feat.append(roi_feat[:, :, bnum-1, :])
31 |         #         add_bbox.append(roi_bbox[:, :, bnum-1, :])
32 |         #     add_feat = np.asarray(add_feat).transpose(1, 2, 0, 3)
33 |         #     add_bbox = np.asarray(add_bbox).transpose(1, 2, 0, 3)
34 |         #     print(add_feat.shape, add_bbox.shape)
35 |         #     roi_feat = np.concatenate((roi_feat, add_feat), axis=2)
36 |         #     roi_bbox = np.concatenate((roi_bbox, add_bbox), axis=2)
37 |
38 |         roi_feat = roi_feat[:, :, :bbox_num, :]
39 |
40 |         roi_bbox = roi_bbox[:, :, :bbox_num, :]
41 |         # print(roi_feat.shape, roi_bbox.shape)
42 |         if feat_dset is None:
43 |             dataset_size = len(video_list)
44 |             C, F, R, D = roi_feat.shape
45 |             feat_dset = video_fd.create_dataset('feat', (dataset_size, C, F, R, D),
46 |                                                 dtype=np.float32)
47 |             ids_dset = video_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
48 |             C, F, R, D = roi_bbox.shape
49 |             bbox_dset = video_fd.create_dataset('bbox', shape=(dataset_size, C, F, R, D),
50 |                                                 dtype=np.float32)
51 |             ival = 0
52 |
53 |         feat_dset[ival:(ival + 1)] = roi_feat
54 |         bbox_dset[ival:(ival + 1)] = roi_bbox
55 |         ids_dset[ival:(ival + 1)] = int(video)
56 |
57 |         ival += 1
58 |     print('Save to {}'.format(out_file))
59 |
60 | def split_dataset_feat(filename, out_dir, train_list, val_list, test_list):
61 |
62 |     train_fd = h5py.File(osp.join(out_dir, 'app_feat_train.h5'), 'w')
63 |     val_fd = h5py.File(osp.join(out_dir, 'app_feat_val.h5'), 'w')
64 |     test_fd = h5py.File(osp.join(out_dir, 'app_feat_test.h5'), 'w')
65 |     val_feat_dset, val_ids_dset = None, None
66 |     test_feat_dset, test_ids_dset = None, None
67 |     train_feat_dset, train_ids_dset = None, None
68 |
69 |     feat_name = 'resnet_features'
70 |     with h5py.File(filename, 'r') as fp:
71 |         vids = fp['ids']
72 |         feats = fp[feat_name]
73 |         for vid, feat in zip(vids, feats):
74 |             if vid in val_list:
75 |                 if val_feat_dset is None:
76 |                     dataset_size = len(val_list)
77 |                     C, F, D = feat.shape
78 |                     # C, D = feat.shape
79 |                     val_feat_dset = val_fd.create_dataset(feat_name, (dataset_size, C, F, D),
80 |                                                           dtype=np.float32)
81 |                     val_ids_dset = val_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
82 |                     ival = 0
83 |                 val_feat_dset[ival:(ival+1)] = feat
84 |                 val_ids_dset[ival:(ival+1)] = int(vid)
85 |                 ival += 1
86 |             elif vid in test_list:
87 |                 if test_feat_dset is None:
88 |                     dataset_size = len(test_list)
89 |                     C, F, D = feat.shape
90 |                     # C, D = feat.shape
91 |                     test_feat_dset = test_fd.create_dataset(feat_name, (dataset_size, C, F, D),
92 |                                                             dtype=np.float32)
93 |                     test_ids_dset = test_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
94 |                     itest = 0
95 |
96 |                 test_feat_dset[itest:(itest + 1)] = feat
97 |                 test_ids_dset[itest:(itest + 1)] = int(vid)
98 |                 itest += 1
99 |             else:
100 |                 if train_feat_dset is None:
101 |                     dataset_size = len(train_list)
102 |                     C, F, D = feat.shape
103 |                     # C, D = feat.shape
104 |                     train_feat_dset = train_fd.create_dataset(feat_name, (dataset_size, C, F, D),
105 |                                                               dtype=np.float32)
106 |                     train_ids_dset = train_fd.create_dataset('ids', shape=(dataset_size,), dtype=np.int64)
107 |                     itrain = 0
108 |
109 |                 train_feat_dset[itrain:(itrain + 1)] = feat
110 |                 train_ids_dset[itrain:(itrain + 1)] = int(vid)
111 |                 itrain += 1
112 |
113 | def get_video_list(filename):
114 |     samples = pd.read_csv(filename)
115 |     videos = samples['video']
116 |     videos = list(set(videos))
117 |     print(len(videos))
118 |     return sorted(videos)
119 |
120 | def main():
121 |     dataset = 'nextqa'
122 |     data_dir = '../../data/{}/'.format(dataset)
123 |     dataset_dir = '../datasets/{}/'.format(dataset)
124 |     # in_dir = osp.join(data_dir, 'region_n')
125 |     out_dir = osp.join(data_dir, 'frame_feat')
126 |     train_file = osp.join(dataset_dir, 'train.csv')
127 |     val_file = osp.join(dataset_dir, 'val.csv')
128 |     test_file = osp.join(dataset_dir, 'test.csv')
129 |     train_list = get_video_list(train_file)
130 |     val_list = get_video_list(val_file)
131 |     test_list = get_video_list(test_file)
132 |
133 |     # np2h5(in_dir, out_dir, test_list, 'test')
134 |     # np2h5(in_dir, out_dir, val_list, 'val')
135 |     # np2h5(in_dir, out_dir, train_list, 'train')
136 |
137 |     h5filename = osp.join(out_dir, 'feat_appearance.h5')
138 |     split_dataset_feat(h5filename, out_dir, train_list, val_list, test_list)
139 |
140 |
141 | if __name__ == "__main__":
142 |     main()
--------------------------------------------------------------------------------
/train/__pycache__/train_covgt.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/doc-doc/CoVGT/cbc9fa7830b304f3c3f9c53040489ea9ad35a9aa/train/__pycache__/train_covgt.cpython-38.pyc
--------------------------------------------------------------------------------
/train/train_covgt.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | import logging
5 | import collections
6 | from util import compute_aggreeings, AverageMeter, get_mask, mask_tokens
7 | import os.path as osp
8 | import json
9 | #from fvcore.nn import FlopCountAnalysis
10 |
11 | def eval(model, data_loader, a2v, args, test=False, tokenizer="RoBERTa"):
12 |     model.eval()
13 |     count = 0
14 |     metrics, counts = collections.defaultdict(int), collections.defaultdict(int)
15 |
16 |     with torch.no_grad():
17 |         if not args.mc:
18 |             model.module._compute_answer_embedding(a2v)
19 |         results = {}
20 |         for i, batch in enumerate(data_loader):
21 |             answer_id, answer, video_o, video_f, question, question_id, seg_feats, seg_num = (
22 |                 batch["answer_id"],
23 |                 batch["answer"].cuda(),
24 |                 batch["video_o"].cuda(),
25 |                 batch["video_f"].cuda(),
26 |                 batch["question"].cuda(),
27 |                 batch['question_id'],
28 |                 batch['seg_feats'].cuda(),
29 |                 batch['seg_num']
30 |             )
31 |
32 |             video_len = batch["video_len"]
33 |             seq_len = batch["seq_len"]
34 |
35 |             question_mask = (question!=tokenizer.pad_token_id).float() #RoBERTa
36 |             answer_mask = (answer!=tokenizer.pad_token_id).float() #RoBERTa
37 |
38 |             video_mask = get_mask(video_len, video_o.size(1)).cuda()
39 |             count += answer_id.size(0)
40 |             video = (video_o, video_f)
41 |             if not args.mc:
42 |                 predicts = model(
43 |                     video,
44 |                     question,
45 |                     text_mask=question_mask,
46 |                     video_mask=video_mask,
47 |                     seq_len = seq_len
48 |                 )
49 |                 topk = torch.topk(predicts, dim=1, k=10).indices.cpu()
50 |                 if args.dataset != "ivqa":
51 |                     answer_id_expanded = answer_id.view(-1, 1).expand_as(topk)
52 |                 else:
53 |                     answer_id = (answer_id / 2).clamp(max=1)
54 |                     answer_id_expanded = answer_id
55 |                 metrics = compute_aggreeings(
56 |                     topk,
57 |                     answer_id_expanded,
58 |                     [1, 10],
59 |                     ["acc", "acc10"],
60 |                     metrics,
61 |                     ivqa=(args.dataset == "ivqa"),
62 |                 )
63 |                 for bs, qid in enumerate(question_id):
64 |                     results[qid] = {'prediction': int(topk.numpy()[bs,0]), 'answer':int(answer_id.numpy()[bs])}
65 |             else:
66 |                 #############Model FLOPs##########
67 |                 # inputs = (video, question, None, answer.cuda(), seq_len, video_mask, answer_mask)
68 |                 # flops = FlopCountAnalysis(model, inputs)
69 |                 # print('Model FLOPs:', flops.total()/1000000) #use batch_size 1
70 |                 # break
71 |                 ###################################
72 |                 fusion_proj, answer_proj = model(
73 |                     video,
74 |                     question,
75 |                     text_mask=answer_mask,
76 |                     video_mask=video_mask,
77 |                     answer=answer,
78 |                     seq_len = seq_len,
79 |                     seg_feats = seg_feats,
80 |                     seg_num = seg_num
81 |                 )
82 |                 # predicts = fusion_proj.squeeze()
83 |
84 |                 fusion_proj = fusion_proj.unsqueeze(2)
85 |                 predicts = torch.bmm(answer_proj, fusion_proj).squeeze()
86 |
87 |                 predicted = torch.max(predicts, dim=1).indices.cpu()
88 |                 metrics["acc"] += (predicted == answer_id).sum().item()
89 |                 for bs, qid in enumerate(question_id):
90 |                     results[qid] = {'prediction': int(predicted.numpy()[bs]), 'answer':int(answer_id.numpy()[bs])}
91 |
92 |     step = "val" if not test else "test"
93 |
94 |     for k in metrics:
95 |         # print(metrics[k], count)
96 |         v = metrics[k] / count
97 |         logging.info(f"{step} {k}: {v:.2%}")
98 |         break
99 |
100 |     return metrics["acc"] / count, results
101 |
102 |
103 | def train(model, train_loader, a2v, optimizer, criterion, scheduler, epoch, args, tokenizer):
104 |     model.train()
105 |     running_vqa_loss, running_acc, running_mlm_loss, running_cl_loss = (
106 |         AverageMeter(),
107 |         AverageMeter(),
108 |         AverageMeter(),
109 |         AverageMeter()
110 |     )
111 |     for i, batch in enumerate(train_loader):
112 |         answer_id, answer, video_o, video_f, question, seg_feats, seg_num, qsn_id, qsn_token_ids, qsn_seq_len = (
113 |             batch["answer_id"],
114 |             batch["answer"],
115 |             batch["video_o"].cuda(),
116 |             batch["video_f"].cuda(),
117 |             batch["question"].cuda(),
118 |             batch['seg_feats'].cuda(),
119 |             batch['seg_num'],
120 |             batch['qsn_id'],
121 |             batch['qsn_token_ids'],
122 |             batch['qsn_seq_len']
123 |         )
124 |
125 |         video_len = batch["video_len"]
126 |
127 |         question_mask = (question != tokenizer.pad_token_id).float().cuda() #RoBERTa
128 |         answer_mask = (answer!=tokenizer.pad_token_id).float().cuda() #RoBERTa
129 |         video_mask = (
130 |             get_mask(video_len, video_o.size(1)).cuda() if args.max_feats > 0 else None
131 |         )
132 |
133 |         qsn_mask = (qsn_token_ids != tokenizer.pad_token_id).float().cuda()
134 |
135 |         video = (video_o, video_f)
136 |         N = answer_id.size(0)
137 |         seq_len = batch["seq_len"]
138 |         if not args.mc:
139 |             model.module._compute_answer_embedding(a2v)
140 |             predicts = model(
141 |                 video,
142 |                 question,
143 |                 text_mask=question_mask,
144 |                 video_mask=video_mask,
145 |                 seq_len = seq_len
146 |             )
147 |         else:
148 |             fusion_proj, answer_proj = model(
149 |                 video,
150 |                 question,
151 |                 text_mask=answer_mask,
152 |                 video_mask=video_mask,
153 |                 answer=answer.cuda(),
154 |                 seq_len = seq_len,
155 |                 seg_feats = seg_feats,
156 |                 seg_num = seg_num
157 |             )
158 |
159 |             fusion_proj = fusion_proj.unsqueeze(2)
160 |             predicts = torch.bmm(answer_proj, fusion_proj).squeeze()
161 |
162 |         if args.dataset == "ivqa":
163 |             a = (answer_id / 2).clamp(max=1).cuda()
164 |             vqa_loss = criterion(predicts, a)
165 |             predicted = torch.max(predicts, dim=1).indices.cpu()
166 |             predicted = F.one_hot(predicted, num_classes=len(a2v))
167 |             running_acc.update((predicted * a.cpu()).sum().item() / N, N)
168 |         else:
169 |             vqa_loss = criterion(predicts, answer_id.cuda())
170 |             predicted = torch.max(predicts, dim=1).indices.cpu()
171 |             running_acc.update((predicted == answer_id).sum().item() / N, N)
172 |         if args.cl_loss:
173 |             vt_proj, txt_proj = model(
174 |                 video,
175 |                 question,
176 |                 text_mask=qsn_mask,
177 |                 video_mask=video_mask,
178 |                 answer=qsn_token_ids,
179 |                 seq_len = qsn_seq_len,
180 |                 seg_feats = seg_feats,
181 |                 seg_num = seg_num
182 |             )
183 |             vt_proj = vt_proj.unsqueeze(2)
184 |             cl_predicts = torch.bmm(txt_proj, vt_proj).squeeze()
185 |             cl_loss = criterion(cl_predicts, qsn_id.cuda())
186 |             # cl_predicted = torch.max(cl_predicts, dim=1).indices.cpu()
187 |             # running_acc.update((predicted == answer_id).sum().item() / N, N)
188 |
189 |         if args.mlm_prob:
190 |             max_seq_len = args.qmax_words
191 |             if args.mc > 0:
192 |                 tmp_id = [aid+(args.mc*i) for i, aid in enumerate(answer_id)]
193 |                 inputs = answer.view(N*args.mc, -1)[tmp_id,:]
194 |                 # question_mask = (inputs>0).float()
195 |                 question_mask = (inputs!=1).float()
196 |                 max_seq_len = args.amax_words
197 |             else:
198 |                 inputs = batch["question"]
199 |
200 |             inputs, labels = mask_tokens(inputs, tokenizer, mlm_probability=args.mlm_prob)
201 |             mlm_loss = model(
202 |                 video,
203 |                 question=inputs.cuda(),
204 |                 labels=labels.cuda(),
205 |                 text_mask=question_mask,
206 |                 video_mask=video_mask,
207 |                 max_seq_len=max_seq_len,
208 |                 mode="mlm",
209 |             )
210 |             mlm_loss = mlm_loss.mean()
211 |             loss = mlm_loss + vqa_loss
212 |         if args.cl_loss:
213 |             loss = vqa_loss + args.cl_loss*cl_loss
214 |         if args.cl_loss and args.mlm_prob:
215 |             loss = vqa_loss + args.cl_loss*cl_loss + mlm_loss
216 |         if not args.cl_loss and not args.mlm_prob:
217 |             loss = vqa_loss
218 |
219 |         optimizer.zero_grad()
220 |         loss.backward()
221 |         if args.clip:
222 |             nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip)
223 |         optimizer.step()
224 |         scheduler.step()
225 |
226 |         running_vqa_loss.update(vqa_loss.detach().cpu().item(), N)
227 |         if args.mlm_prob:
228 |             running_mlm_loss.update(mlm_loss.detach().cpu().item(), N)
229 |         if args.cl_loss:
230 |             running_cl_loss.update(cl_loss.detach().cpu().item(), N)
231 |         if (i + 1) % (len(train_loader) // args.freq_display) == 0:
232 |             if args.mlm_prob:
233 |                 logging.info(
234 |                     f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
235 |                     f"{running_vqa_loss.avg:.4f}, Training acc: {running_acc.avg:.2%}, MLM loss: {running_mlm_loss.avg:.4f}, Lvq Loss: {running_cl_loss.avg:.4f}"
236 |                 )
237 |             elif args.cl_loss:
238 |                 logging.info(
239 |                     f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
240 |                     f"{running_vqa_loss.avg:.4f}, Train acc: {running_acc.avg:.2%}, Lvq Loss: {running_cl_loss.avg:.4f}"
241 |                 )
242 |             else:
243 |                 logging.info(
244 |                     f"Epoch {epoch + 1}/{args.epochs}, Progress: {float(i + 1) / len(train_loader):.4f}, Lvqa loss: "
245 |                     f"{running_vqa_loss.avg:.4f}, Train acc: {running_acc.avg:.2%}"
246 |                 )
247 |             running_acc.reset()
248 |             running_vqa_loss.reset()
249 |             running_mlm_loss.reset()
250 |             running_cl_loss.reset()
251 |
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
1 | import re
2 | import torch
3 | import torch.nn.functional as F
4 | import json
5 | import collections
6 | import numpy as np
7 | import os
8 | import os.path as osp
9 | import pandas as pd
10 | import logging
11 | import pickle as pkl
12 | import string
13 |
14 | def tokenize(
15 |     seq,
16 |     tokenizer,
17 |     add_special_tokens=True,
18 |     max_length=10,
19 |     dynamic_padding=True,
20 |     truncation=True,
21 | ):
22 |     """
23 |     :param seq: sequence of sequences of text
24 |     :param tokenizer: bert_tokenizer
25 |     :return: torch tensor padded up to length max_length of bert tokens
26 |     """
27 |     token_ids = tokenizer.batch_encode_plus(
28 |         seq,
29 |         add_special_tokens=add_special_tokens,
30 |         max_length=max_length,
31 |         padding="longest" if dynamic_padding else "max_length",
32 |         truncation=truncation,
33 |     )["input_ids"]
34 |     # tokens = [tokenizer.tokenize(s, add_special_tokens=add_special_tokens) for s in seq]
35 |     tokens = ''
36 |     return torch.tensor(token_ids, dtype=torch.long), tokens
37 |
38 | def transform_bb(roi_bbox, width, height):
39 |     dshape = list(roi_bbox.shape)
40 |     tmp_bbox = roi_bbox.reshape([-1, 4])
41 |     relative_bbox = tmp_bbox / np.asarray([width, height, width, height])
42 |     relative_area = (tmp_bbox[:, 2] - tmp_bbox[:, 0] + 1) * \
43 |                     (tmp_bbox[:, 3] - tmp_bbox[:, 1] + 1)/ (width*height)
44 |     relative_area = relative_area.reshape(-1, 1)
45 |     bbox_feat = np.hstack((relative_bbox, relative_area))
46 |     dshape[-1] += 1
47 |     bbox_feat = bbox_feat.reshape(dshape)
48 |
49 |     return bbox_feat
50 |
51 |
52 | def compute_aggreeings(topk, answers, thresholds, names, metrics, ivqa=False):
53 |     """ Updates metrics dictionary by computing aggreeings for different thresholds """
54 |     if not ivqa:
55 |         # sp_num = topk.shape[0]
56 |         for i, x in enumerate(thresholds):
57 |             agreeingsx = (topk[:, :x] == answers[:, :x]).sum().item()
58 |             # unk = 0
59 |             # for j in range(sp_num):
60 |             #     if answers[j, 0].item() == 0 and 0 in topk[j, :x].numpy():
61 |             #         unk += 1
62 |             metrics[names[i]] += agreeingsx #-unk
63 |     else:
64 |         for i, x in enumerate(thresholds):
65 |             predicted = F.one_hot(topk[:, :x], num_classes=answers.shape[-1]).sum(1)
66 |             metrics[names[i]] += (predicted * answers).max(1)[0].sum().item()
67 |     return metrics
68 |
69 |
70 | class AverageMeter:
71 |     """ Computes and stores the average and current value for training stats """
72 |
73 |     def __init__(self):
74 |         self.reset()
75 |
76 |     def reset(self):
77 |         """ Reset all statistics """
78 |         self.val = 0
79 |         self.avg = 0
80 |         self.sum = 0
81 |         self.count = 0
82 |
83 |     def update(self, val, n=1):
84 |         """ Update statistics """
85 |         self.val = val
86 |         self.sum += val * n
87 |         self.count += n
88 |         self.avg = self.sum / self.count
89 |
90 |
91 | def get_mask(lengths, max_length):
92 |     """ Computes a batch of padding masks given batched lengths """
93 |     mask = 1 * (
94 |         torch.arange(max_length).unsqueeze(1).to(lengths.device) < lengths
95 |     ).transpose(0, 1)
96 |     return mask
97 |
98 |
99 | def compute_a2v(vocab_path, bert_tokenizer, amax_words):
100 |     """ Precomputes tokenized answer representations for all answers in the vocabulary """
101 |     a2id = json.load(open(vocab_path, "r"))
102 |     # a2id['[UNK]'] = 0
103 |     id2a = {v: k for k, v in a2id.items()}
104 |     a2v, _ = tokenize(
105 |         list(a2id.keys()),
106 |         bert_tokenizer,
107 |         add_special_tokens=True,
108 |         max_length=amax_words,
109 |         dynamic_padding=True,
110 |         truncation=True,
111 |     )
112 |     if torch.cuda.is_available():
113 |         a2v = a2v.cuda()  # (vocabulary_size, 1, we_dim)
114 |     return a2id, id2a, a2v
115 |
116 |
117 | def mask_tokens(inputs, tokenizer, mlm_probability):
118 |     """
119 |     Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
120 |     """
121 |     if tokenizer.mask_token is None:
122 |         raise ValueError(
123 |             "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
124 |         )
125 |
126 |     labels = inputs.clone()
127 |     # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
128 |
129 |     probability_matrix = torch.full(labels.shape, mlm_probability)
130 |     # find special token
131 |     special_tokens_mask = [
132 |         tokenizer.get_special_tokens_mask(tkid, already_has_special_tokens=True)
133 |         for tkid in labels.tolist()
134 |     ]
135 |     # do not mask special token
136 |     probability_matrix.masked_fill_(
137 |         torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0
138 |     )
139 |
140 |     if tokenizer._pad_token is not None:
141 |         padding_mask = labels.eq(tokenizer.pad_token_id)
142 |         probability_matrix.masked_fill_(padding_mask, value=0.0)
143 |
144 |     masked_indices = torch.bernoulli(probability_matrix).bool()
145 |     labels[~masked_indices] = -100  # We only compute loss on masked tokens
146 |
147 |     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
148 |     indices_replaced = (
149 |         torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
150 |     )
151 |     inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
152 |
153 |     # 10% of the time, we replace masked input tokens with random word
154 |     indices_random = (
155 |         torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
156 |         & masked_indices
157 |         & ~indices_replaced
158 |     )
159 |     random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
160 |     inputs[indices_random] = random_words[indices_random]
161 |
162 |     # The rest of the time (10% of the time) we keep the masked input tokens unchanged
163 |
164 |     return inputs, labels
165 |
166 |
167 | def get_types(dataset):
168 |     """ Type2Id mapping for VideoQA datasets """
169 |     if dataset == "tgif":
170 |         return {"what": 0, "how": 1, "color": 2, "where": 3}
171 |     elif dataset == "activitynet":
172 |         return {
173 |             "motion": 0,
174 |             "spatial": 1,
175 |             "temporal": 2,
176 |             "yesno": 3,
177 |             "color": 4,
178 |             "object": 5,
179 |             "location": 6,
180 |             "number": 7,
181 |             "other": 8,
182 |         }
183 |     elif dataset == "msvd" or dataset == "msrvtt":
184 |         return {"what": 0, "how": 1, "color": 2, "where": 3, "who": 4, "when": 5}
185 |     elif dataset == "ivqa":
186 |         return {"scenes": 0}
187 |     else:
188 |         raise NotImplementedError
189 |
190 |
191 | def get_most_common(loader, ivqa=False, n=4):
192 |     """ Outputs the most common answers and splits the answers into n parts depending on their frequency """
193 |     if ivqa:
194 |         ans = []
195 |         for a1, a2, a3, a4, a5 in zip(
196 |             list(loader.dataset.data["answer1"]),
197 |             list(loader.dataset.data["answer2"]),
198 |             list(loader.dataset.data["answer3"]),
199 |             list(loader.dataset.data["answer4"]),
200 |             list(loader.dataset.data["answer5"]),
201 |         ):
202 |             counteri = collections.Counter([a1, a2, a3, a4, a5])
203 |             for w in counteri:
204 |                 if (
205 |                     counteri[w] >= 2
206 |                 ):  # an answer is considered correct if it has been annotated by at least two workers
207 |                     ans.append(w)
208 |     else:
209 |         ans = list(loader.dataset.data["answer"])
210 |     most_common = collections.Counter(ans).most_common()
211 |
212 |     total = sum(x[1] for x in most_common)
213 |     splits = [0] * (n + 1)
214 |     j = 0
215 |     for i in range(n):
216 |         cur_total = 0
217 |         while j < len(most_common) and cur_total < total / n:
218 |             cur_total += most_common[j][1]
219 |             j += 1
220 |         splits[i + 1] = j
221 |     return most_common, splits, total
222 |
223 |
224 | def compute_word_stats(
225 |     topk, answers, a2id, a2v, most_common, metrics, counts, ivqa, top10=False
226 | ):
227 |     """ Similar to compute_aggreeings: computes agreement and word counts for the most common words """
228 |     if not ivqa:
229 |         for word, cword in most_common:
230 |             if word not in a2id:
231 |                 counts[word] = cword
232 |                 continue
233 |             predicted = topk[:, 0]
234 |             metrics[f"acc_{word}"] += (
235 |                 (predicted[answers == a2id[word]] == a2id[word]).sum().item()
236 |             )
237 |             if top10:
238 |                 predicted10 = topk[:, :10]
239 |                 metrics[f"acc10_{word}"] += (
240 |                     (predicted10[answers == a2id[word]] == a2id[word]).sum().item()
241 |                 )
242 |             counts[word] += (answers == a2id[word]).sum().item()
243 |     else:
244 |         for word, cword in most_common:
245 |             if word not in a2id:
246 |                 counts[word] = cword
247 |                 continue
248 |             predicted = F.one_hot(topk[:, 0], num_classes=len(a2v))
249 |             ans_word = answers[:, a2id[word]]
250 |             metrics[f"acc_{word}"] += (
251 |                 (predicted[:, a2id[word]][ans_word == 1] * ans_word[ans_word == 1])
252 |                 .sum()
253 |                 .item()
254 |             )
255 |             if top10:
256 |                 predicted10 = F.one_hot(topk[:, :10], num_classes=len(a2v)).sum(1)
257 |                 metrics[f"acc10_{word}"] += (
258 |                     (
259 |                         predicted10[:, a2id[word]][ans_word == 1]
260 |                         * ans_word[ans_word == 1]
261 |                     )
262 |                     .sum()
263 |                     .item()
264 |                 )
265 |             counts[word] += (ans_word == 1).sum().item()
266 |     return metrics, counts
267 |
268 |
269 | def compute_metrics(x):
270 |     sx = np.sort(-x, axis=1)
271 |     d = np.diag(-x)
272 |     d = d[:, np.newaxis]
273 |     ind = sx - d
274 |     ind = np.where(ind == 0)
275 |     ind = ind[1]
276 |     metrics = {}
277 |     metrics["R1"] = float(np.sum(ind == 0)) / len(ind)
278 |     metrics["R10"] = float(np.sum(ind < 10)) / len(ind)
279 |     metrics["R100"] = float(np.sum(ind < 100)) / len(ind)
280 |     metrics["MR"] = np.median(ind) + 1
281 |     return metrics
282 |
283 |
284 | def print_computed_metrics(metrics):
285 |     r1 = metrics["R1"]
286 |     r10 = metrics["R10"]
287 |     r100 = metrics["R100"]
288 |     mr = metrics["MR"]
289 |     return "R@1: {:.4f} - R@10: {:.4f} - R@100: {:.4f} - Median R: {}".format(
290 |         r1, r10, r100, mr
291 |     )
292 |
293 |
294 | #added by Junbin
295 | def get_qsn_type(qsn, ans_rsn):
296 |     dos = ['does', 'do', 'did']
297 |     bes = ['was', 'were', 'is', 'are']
298 |     w5h1 = ['what', 'who', 'which', 'why', 'how', 'where']
299 |     qsn_sp = qsn.split()
300 |     type = qsn_sp[0].lower()
301 |     if type == 'what':
302 |         if qsn_sp[1].lower() in dos:
303 |             type = 'whata'
304 |         elif qsn_sp[1].lower() in bes:
305 |             type = 'whatb'
306 |         else:
307 |             type = 'whato'
308 |     elif type == 'how':
309 |         if qsn_sp[1].lower() == 'many':
310 |             type = 'howm'
311 |     elif type not in w5h1:
312 |         type = 'other'
313 |     if ans_rsn in ['pr', 'cr']:
314 |         type += 'r'
315 |     return type
316 |
317 | def major_type(tgroup):
318 |     ans_num = 0
319 |     mtype = ''
320 |     for type, item in tgroup.items():
321 |         if len(item) > ans_num:
322 |             ans_num = len(item)
323 |             mtype = type
324 |     return mtype
325 |
326 | def group(csv_data, gt=True):
327 |     ans_group, qsn_group = {}, {}
328 |     for idx, row in csv_data.iterrows():
329 |         qsn, ans = row['question'], row['answer']
330 |         if gt:
331 |             type = row['type']
332 |             if type == 'TP': type = 'TN'
333 |         else:
334 |             type = 'null' if 'type' not in row else row['type']
335 |             type = get_qsn_type(qsn, type)
336 |         if type not in ans_group:
337 |             ans_group[type] = {ans}
338 |             qsn_group[type] = {qsn}
339 |         else:
340 |             ans_group[type].add(ans)
341 |             qsn_group[type].add(qsn)
342 |     return ans_group, qsn_group
343 |
344 |
345 | def load_model_by_key(cur_model, model_path):
346 |     model_dict = torch.load(model_path)
347 |     new_model_dict = {}
348 |     for k, v in cur_model.state_dict().items():
349 |         if k in model_dict:
350 |             v = model_dict[k]
351 |         else:
352 |             pass
353 |             # print(k)
354 |         new_model_dict[k] = v
355 |     return new_model_dict
356 |
357 |
358 | def load_file(filename):
359 |     '''
360 |     added by junbin Xiao
361 |     '''
362 |     file_type = osp.splitext(filename)[-1]
363 |     if file_type == '.csv':
364 |         data = pd.read_csv(filename)
365 |     else:
366 |         with open(filename, 'r') as fp:
367 |             if file_type == '.json':
368 |                 data = json.load(fp)
369 |             elif file_type == '.txt':
370 |                 data = fp.readlines()
371 |                 data = [datum.rstrip('\n') for datum in data]
372 |     return data
373 |
374 |
375 | def save_to(filename, data):
376 |     '''
377 |     added by junbin Xiao
378 |     '''
379 |     logging.info(f'Save to {filename}')
380 |     dirname = osp.dirname(filename)
381 |     if not osp.exists(dirname):
382 |         os.makedirs(dirname)
383 |     with open(filename, 'w') as fp:
384 |         json.dump(data, fp)
385 |
386 | def pkload(filename):
387 |     with open(filename, 'rb') as fp:
388 |         data = pkl.load(fp)
389 |     return data
390 |
--------------------------------------------------------------------------------