├── .gitignore
├── LICENSE
├── README.md
├── assets
│   └── cfvqa.png
├── cfvqa
│   ├── __init__.py
│   ├── __version__.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── factory.py
│   │   ├── scripts
│   │   │   ├── download_vqa2.sh
│   │   │   └── download_vqacp2.sh
│   │   ├── vqa2.py
│   │   ├── vqacp.py
│   │   └── vqacp2.py
│   ├── models
│   │   ├── criterions
│   │   │   ├── __init__.py
│   │   │   ├── cfvqa_criterion.py
│   │   │   ├── factory.py
│   │   │   └── rubi_criterion.py
│   │   ├── metrics
│   │   │   ├── __init__.py
│   │   │   ├── factory.py
│   │   │   ├── vqa_cfvqa_metrics.py
│   │   │   ├── vqa_cfvqasimple_metrics.py
│   │   │   └── vqa_rubi_metrics.py
│   │   └── networks
│   │       ├── __init__.py
│   │       ├── cfvqa.py
│   │       ├── factory.py
│   │       ├── rubi.py
│   │       ├── san_net.py
│   │       ├── smrl_net.py
│   │       ├── updn_net.py
│   │       └── utils.py
│   ├── optimizers
│   │   ├── __init__.py
│   │   └── factory.py
│   └── options
│       ├── vqa2
│       │   ├── smrl_baseline.yaml
│       │   ├── smrl_cfvqa_hm.yaml
│       │   ├── smrl_cfvqa_sum.yaml
│       │   ├── smrl_cfvqasimple_hm.yaml
│       │   ├── smrl_cfvqasimple_rubi.yaml
│       │   ├── smrl_cfvqasimple_sum.yaml
│       │   └── smrl_rubi.yaml
│       └── vqacp2
│           ├── smrl_baseline.yaml
│           ├── smrl_cfvqa_hm.yaml
│           ├── smrl_cfvqa_sum.yaml
│           ├── smrl_cfvqasimple_hm.yaml
│           ├── smrl_cfvqasimple_rubi.yaml
│           ├── smrl_cfvqasimple_sum.yaml
│           └── smrl_rubi.yaml
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | data/
132 | data
133 | logs/
134 | logs
135 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Counterfactual VQA (CF-VQA)
2 |
3 | This repository is the PyTorch implementation of our paper ["Counterfactual VQA: A Cause-Effect Look at Language Bias"](https://arxiv.org/abs/2006.04315) in CVPR 2021. The code is implemented as a fork of [RUBi][1].
4 |
5 | CF-VQA is proposed to capture and mitigate language bias in VQA from a causal perspective. CF-VQA (1) captures the language bias as the direct causal effect of questions on answers, and (2) reduces the bias by subtracting the direct language effect from the total causal effect.
6 |
7 |
8 | ![CF-VQA overview](assets/cfvqa.png)
9 |
10 |
11 |
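In other words, at inference time CF-VQA keeps only the total indirect effect: it computes the fused (total-effect) logits and subtracts the logits produced by the question-only branch (the natural direct effect). Below is a minimal sketch of this counterfactual step, with illustrative tensor names rather than the repository's exact API:

```python
import torch

def counterfactual_inference(z_te: torch.Tensor, z_nde: torch.Tensor) -> torch.Tensor:
    """Debiased prediction as the total indirect effect: TIE = TE - NDE.

    z_te:  fused logits from question + image (total effect)
    z_nde: logits from the question-only branch (natural direct effect)
    """
    return z_te - z_nde

# Toy usage: a batch of 4 questions over a 1000-answer vocabulary.
z_te, z_nde = torch.randn(4, 1000), torch.randn(4, 1000)
answer_ids = counterfactual_inference(z_te, z_nde).argmax(dim=-1)
```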
12 | If you find this paper helpful for your research, please kindly consider citing it in your publications.
13 | ```
14 | @inproceedings{niu2020counterfactual,
15 | title={Counterfactual VQA: A Cause-Effect Look at Language Bias},
16 | author={Niu, Yulei and Tang, Kaihua and Zhang, Hanwang and Lu, Zhiwu and Hua, Xian-Sheng and Wen, Ji-Rong},
17 | booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
18 | year={2021}
19 | }
20 | ```
21 | ## Summary
22 |
23 | * [Installation](#installation)
24 | * [Setup and dependencies](#1-setup-and-dependencies)
25 | * [Download datasets](#2-download-datasets)
26 | * [Quick start](#quick-start)
27 | * [Train a model](#train-a-model)
28 | * [Evaluate a model](#evaluate-a-model)
29 | * [Useful commands](#useful-commands)
30 | * [Acknowledgment](#acknowledgment)
31 |
32 | ## Installation
33 |
34 |
35 | ### 1. Setup and dependencies
36 |
37 | Install the Anaconda or Miniconda distribution (Python 3+) from the official download site.
38 |
39 | ```bash
40 | conda create --name cfvqa python=3.7
41 | source activate cfvqa
42 | pip install -r requirements.txt
43 | ```
44 |
45 | ### 2. Download datasets
46 |
47 | Download annotations, images and features for VQA experiments:
48 | ```bash
49 | bash cfvqa/datasets/scripts/download_vqa2.sh
50 | bash cfvqa/datasets/scripts/download_vqacp2.sh
51 | ```
52 |
53 |
54 | ## Quick start
55 |
56 |
57 | ### Train a model
58 |
59 | The [bootstrap/run.py](https://github.com/Cadene/bootstrap.pytorch/blob/master/bootstrap/run.py) file loads the options contained in a yaml file, creates the corresponding experiment directory, and starts the training procedure. For instance, you can train our best model on VQA-CP v2 (CFVQA+SUM+SMRL) by running:
60 | ```bash
61 | python -m bootstrap.run -o cfvqa/options/vqacp2/smrl_cfvqa_sum.yaml
62 | ```
63 | Then several files will be created in `logs/vqacp2/smrl_cfvqa_sum/`:
64 | - [options.yaml] (copy of options)
65 | - [logs.txt] (history of printed logs)
66 | - [logs.json] (batch and epoch statistics)
67 | - **[\_vq\_val\_oe.json] (statistics for the language-prior based strategy, e.g., RUBi)**
68 | - **[\_cfvqa\_val\_oe.json] (statistics for CF-VQA)**
69 | - [\_q\_val\_oe.json] (statistics for language-only branch)
70 | - [\_v\_val\_oe.json] (statistics for vision-only branch)
71 | - [\_all\_val\_oe.json] (statistics for the ensembled branch)
72 | - ckpt_last_engine.pth.tar (checkpoints of last epoch)
73 | - ckpt_last_model.pth.tar
74 | - ckpt_last_optimizer.pth.tar
75 |
76 | Many options are available in the options directory. The `cfvqa` options (e.g., `smrl_cfvqa_sum.yaml`) use the complete causal graph, while the `cfvqasimple` options (e.g., `smrl_cfvqasimple_sum.yaml`) use the simplified causal graph.
77 |
78 | ### Evaluate a model
79 |
80 | There is no test set for VQA-CP v2, our main dataset, so evaluation is done on the validation set. For a model trained on VQA v2, you can evaluate your model on the test set. In this example, [bootstrap/run.py](https://github.com/Cadene/bootstrap.pytorch/blob/master/bootstrap/run.py) loads the options from your experiment directory, resumes the last checkpoint, and starts an evaluation on the validation set while skipping the training set (train_split is empty). Thanks to `--misc.logs_name`, the logs will be written to new `logs_test.txt` and `logs_test.json` files, instead of being appended to the `logs.txt` and `logs.json` files.
81 | ```bash
82 | python -m bootstrap.run \
83 | -o ./logs/vqacp2/smrl_cfvqa_sum/options.yaml \
84 | --exp.resume last \
85 | --dataset.train_split ''\
86 | --dataset.eval_split val \
87 | --misc.logs_name test
88 | ```
89 |
90 | ## Useful commands
91 |
92 |
93 | ### Use a specific GPU
94 |
95 | For a specific experiment:
96 | ```bash
97 | CUDA_VISIBLE_DEVICES=0 python -m bootstrap.run -o cfvqa/options/vqacp2/smrl_cfvqa_sum.yaml
98 | ```
99 |
100 | For the current terminal session:
101 | ```bash
102 | export CUDA_VISIBLE_DEVICES=0
103 | ```
104 |
105 | ### Overwrite an option
106 |
107 | The bootstrap.pytorch framework makes it easy to overwrite a hyperparameter. In this example, we run an experiment with a non-default learning rate, and thus also overwrite the experiment directory path:
108 | ```bash
109 | python -m bootstrap.run -o cfvqa/options/vqacp2/smrl_cfvqa_sum.yaml \
110 | --optimizer.lr 0.0003 \
111 | --exp.dir logs/vqacp2/smrl_cfvqa_sum_lr,0.0003
112 | ```
113 |
114 | ### Resume training
115 |
116 | If a problem occurs, it is easy to resume from the last epoch by specifying the options file from the experiment directory while overwriting the `exp.resume` option (default is None):
117 | ```bash
118 | python -m bootstrap.run -o logs/vqacp2/smrl_cfvqa_sum/options.yaml \
119 | --exp.resume last
120 | ```
121 |
122 |
123 | ## Acknowledgment
124 |
125 | Special thanks to the authors of [RUBi][1], [BLOCK][2], and [bootstrap.pytorch][3], and the datasets used in this research project.
126 |
127 |
128 | [1]: https://github.com/cdancette/rubi.bootstrap.pytorch
129 | [2]: https://github.com/Cadene/block.bootstrap.pytorch
130 | [3]: https://github.com/Cadene/bootstrap.pytorch
131 |
--------------------------------------------------------------------------------
/assets/cfvqa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuleiniu/cfvqa/736795bef50db9c76818f9a08202c7f325489afd/assets/cfvqa.png
--------------------------------------------------------------------------------
/cfvqa/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuleiniu/cfvqa/736795bef50db9c76818f9a08202c7f325489afd/cfvqa/__init__.py
--------------------------------------------------------------------------------
/cfvqa/__version__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.0.0'
2 |
--------------------------------------------------------------------------------
/cfvqa/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuleiniu/cfvqa/736795bef50db9c76818f9a08202c7f325489afd/cfvqa/datasets/__init__.py
--------------------------------------------------------------------------------
/cfvqa/datasets/factory.py:
--------------------------------------------------------------------------------
1 | from bootstrap.lib.options import Options
2 | from block.datasets.tdiuc import TDIUC
3 | from block.datasets.vrd import VRD
4 | from block.datasets.vg import VG
5 | from block.datasets.vqa_utils import ListVQADatasets
6 | from .vqa2 import VQA2
7 | from .vqacp2 import VQACP2
8 | from .vqacp import VQACP
9 |
10 | def factory(engine=None):
11 | opt = Options()['dataset']
12 |
13 | dataset = {}
14 | if opt.get('train_split', None):
15 | dataset['train'] = factory_split(opt['train_split'])
16 | if opt.get('eval_split', None):
17 | dataset['eval'] = factory_split(opt['eval_split'])
18 |
19 | return dataset
20 |
21 | def factory_split(split):
22 | opt = Options()['dataset']
23 | shuffle = ('train' in split)
24 |
25 | if opt['name'] == 'vqacp2':
26 | assert(split in ['train', 'val', 'test'])
27 | samplingans = (opt['samplingans'] and split == 'train')
28 |
29 | dataset = VQACP2(
30 | dir_data=opt['dir'],
31 | split=split,
32 | batch_size=opt['batch_size'],
33 | nb_threads=opt['nb_threads'],
34 | pin_memory=Options()['misc']['cuda'],
35 | shuffle=shuffle,
36 | nans=opt['nans'],
37 | minwcount=opt['minwcount'],
38 | nlp=opt['nlp'],
39 | proc_split=opt['proc_split'],
40 | samplingans=samplingans,
41 | dir_rcnn=opt['dir_rcnn'],
42 | dir_cnn=opt.get('dir_cnn', None),
43 | dir_vgg16=opt.get('dir_vgg16', None),
44 | )
45 | elif opt['name'] == 'vqacp':
46 | assert(split in ['train', 'val', 'test'])
47 | samplingans = (opt['samplingans'] and split == 'train')
48 |
49 | dataset = VQACP(
50 | dir_data=opt['dir'],
51 | split=split,
52 | batch_size=opt['batch_size'],
53 | nb_threads=opt['nb_threads'],
54 | pin_memory=Options()['misc']['cuda'],
55 | shuffle=shuffle,
56 | nans=opt['nans'],
57 | minwcount=opt['minwcount'],
58 | nlp=opt['nlp'],
59 | proc_split=opt['proc_split'],
60 | samplingans=samplingans,
61 | dir_rcnn=opt['dir_rcnn'],
62 | dir_cnn=opt.get('dir_cnn', None),
63 | dir_vgg16=opt.get('dir_vgg16', None),
64 | )
65 |
66 | elif opt['name'] == 'vqacpv2-with-testdev':
67 | assert(split in ['train', 'val', 'test'])
68 | samplingans = (opt['samplingans'] and split == 'train')
69 | dataset = VQACP2(
70 | dir_data=opt['dir'],
71 | split=split,
72 | batch_size=opt['batch_size'],
73 | nb_threads=opt['nb_threads'],
74 | pin_memory=Options()['misc']['cuda'],
75 | shuffle=shuffle,
76 | nans=opt['nans'],
77 | minwcount=opt['minwcount'],
78 | nlp=opt['nlp'],
79 | proc_split=opt['proc_split'],
80 | samplingans=samplingans,
81 | dir_rcnn=opt['dir_rcnn'],
82 | dir_cnn=opt.get('dir_cnn', None),
83 | dir_vgg16=opt.get('dir_vgg16', None),
84 | has_testdevset=True,
85 | )
86 |
87 | elif opt['name'] == 'vqa2':
88 | assert(split in ['train', 'val', 'test'])
89 | samplingans = (opt['samplingans'] and split == 'train')
90 |
91 | if opt['vg']:
92 | assert(opt['proc_split'] == 'trainval')
93 |
94 | # trainvalset
95 | vqa2 = VQA2(
96 | dir_data=opt['dir'],
97 | split='train',
98 | nans=opt['nans'],
99 | minwcount=opt['minwcount'],
100 | nlp=opt['nlp'],
101 | proc_split=opt['proc_split'],
102 | samplingans=samplingans,
103 | dir_rcnn=opt['dir_rcnn'])
104 |
105 | vg = VG(
106 | dir_data=opt['dir_vg'],
107 | split='train',
108 | nans=10000,
109 | minwcount=0,
110 | nlp=opt['nlp'],
111 | dir_rcnn=opt['dir_rcnn_vg'])
112 |
113 | vqa2vg = ListVQADatasets(
114 | [vqa2,vg],
115 | split='train',
116 | batch_size=opt['batch_size'],
117 | nb_threads=opt['nb_threads'],
118 | pin_memory=Options()['misc.cuda'],
119 | shuffle=shuffle)
120 |
121 | if split == 'train':
122 | dataset = vqa2vg
123 | else:
124 | dataset = VQA2(
125 | dir_data=opt['dir'],
126 | split=split,
127 | batch_size=opt['batch_size'],
128 | nb_threads=opt['nb_threads'],
129 | pin_memory=Options()['misc.cuda'],
130 | shuffle=False,
131 | nans=opt['nans'],
132 | minwcount=opt['minwcount'],
133 | nlp=opt['nlp'],
134 | proc_split=opt['proc_split'],
135 | samplingans=samplingans,
136 | dir_rcnn=opt['dir_rcnn'])
137 | dataset.sync_from(vqa2vg)
138 |
139 | else:
140 | dataset = VQA2(
141 | dir_data=opt['dir'],
142 | split=split,
143 | batch_size=opt['batch_size'],
144 | nb_threads=opt['nb_threads'],
145 | pin_memory=Options()['misc.cuda'],
146 | shuffle=shuffle,
147 | nans=opt['nans'],
148 | minwcount=opt['minwcount'],
149 | nlp=opt['nlp'],
150 | proc_split=opt['proc_split'],
151 | samplingans=samplingans,
152 | dir_rcnn=opt['dir_rcnn'],
153 | dir_cnn=opt.get('dir_cnn', None),
154 | )
155 |
156 | return dataset
157 |
--------------------------------------------------------------------------------
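A hedged usage sketch for the dataset factory above (normally `bootstrap.run` drives this after parsing the yaml passed with `-o`; the snippet only illustrates the flow and assumes bootstrap's `Options` singleton has already been initialized):

```python
from cfvqa.datasets import factory

# factory() reads Options()['dataset'] internally, so bootstrap's Options
# singleton must be initialized first (bootstrap.run does this from -o).
datasets = factory.factory()
train_set = datasets.get('train')  # built when dataset.train_split is set
eval_set = datasets.get('eval')    # built when dataset.eval_split is set
```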
/cfvqa/datasets/scripts/download_vqa2.sh:
--------------------------------------------------------------------------------
1 | mkdir -p data/vqa
2 | cd data/vqa
3 | wget http://data.lip6.fr/cadene/block/vqa2.tar.gz
4 | wget http://data.lip6.fr/cadene/block/coco.tar.gz
5 | tar -xzvf vqa2.tar.gz
6 | tar -xzvf coco.tar.gz
7 |
8 | mkdir -p coco/extract_rcnn  # still inside data/vqa from the cd above
9 | cd coco/extract_rcnn
10 | wget http://data.lip6.fr/cadene/block/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36.tar
11 | tar -xvf 2018-04-27_bottom-up-attention_fixed_36.tar
12 |
--------------------------------------------------------------------------------
/cfvqa/datasets/scripts/download_vqacp2.sh:
--------------------------------------------------------------------------------
1 | mkdir -p data/vqa
2 | cd data/vqa
3 | wget http://data.lip6.fr/cadene/murel/vqacp2.tar.gz
4 | tar -xzvf vqacp2.tar.gz
5 |
--------------------------------------------------------------------------------
/cfvqa/datasets/vqa2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import copy
4 | import json
5 | import torch
6 | import numpy as np
7 | from os import path as osp
8 | from bootstrap.lib.logger import Logger
9 | from bootstrap.lib.options import Options
10 | from block.datasets.vqa_utils import AbstractVQA
11 | from copy import deepcopy
12 | import random
13 | import tqdm
14 | import h5py
15 |
16 | class VQA2(AbstractVQA):
17 |
18 | def __init__(self,
19 | dir_data='data/vqa2',
20 | split='train',
21 | batch_size=10,
22 | nb_threads=4,
23 | pin_memory=False,
24 | shuffle=False,
25 | nans=1000,
26 | minwcount=10,
27 | nlp='mcb',
28 | proc_split='train',
29 | samplingans=False,
30 | dir_rcnn='data/coco/extract_rcnn',
31 | adversarial=False,
32 | dir_cnn=None
33 | ):
34 |
35 | super(VQA2, self).__init__(
36 | dir_data=dir_data,
37 | split=split,
38 | batch_size=batch_size,
39 | nb_threads=nb_threads,
40 | pin_memory=pin_memory,
41 | shuffle=shuffle,
42 | nans=nans,
43 | minwcount=minwcount,
44 | nlp=nlp,
45 | proc_split=proc_split,
46 | samplingans=samplingans,
47 | has_valset=True,
48 | has_testset=True,
49 | has_answers_occurence=True,
50 | do_tokenize_answers=False)
51 |
52 | self.dir_rcnn = dir_rcnn
53 | self.dir_cnn = dir_cnn
54 | self.load_image_features()
55 | # to activate manually in visualization context (notebook)
56 | self.load_original_annotation = False
57 |
58 | def add_rcnn_to_item(self, item):
59 | path_rcnn = os.path.join(self.dir_rcnn, '{}.pth'.format(item['image_name']))
60 | item_rcnn = torch.load(path_rcnn)
61 | item['visual'] = item_rcnn['pooled_feat']
62 | item['coord'] = item_rcnn['rois']
63 | item['norm_coord'] = item_rcnn.get('norm_rois', None)
64 | item['nb_regions'] = item['visual'].size(0)
65 | return item
66 |
67 | def add_cnn_to_item(self, item):
68 | image_name = item['image_name']
69 | if image_name in self.image_names_to_index_train:
70 | index = self.image_names_to_index_train[image_name]
71 | image = torch.tensor(self.image_features_train['att'][index])
72 | elif image_name in self.image_names_to_index_val:
73 | index = self.image_names_to_index_val[image_name]
74 | image = torch.tensor(self.image_features_val['att'][index])
75 | image = image.permute(1, 2, 0).view(196, 2048)
76 | item['visual'] = image
77 | return item
78 |
79 | def load_image_features(self):
80 | if self.dir_cnn:
81 | filename_train = os.path.join(self.dir_cnn, 'trainset.hdf5')
82 | filename_val = os.path.join(self.dir_cnn, 'valset.hdf5')
83 | Logger()(f"Opening file {filename_train}, {filename_val}")
84 | self.image_features_train = h5py.File(filename_train, 'r', swmr=True)
85 | self.image_features_val = h5py.File(filename_val, 'r', swmr=True)
86 | # load txt
87 | with open(os.path.join(self.dir_cnn, 'trainset.txt'), 'r') as f:
88 | self.image_names_to_index_train = {}
89 | for i, line in enumerate(f):
90 | self.image_names_to_index_train[line.strip()] = i
91 | with open(os.path.join(self.dir_cnn, 'valset.txt'), 'r') as f:
92 | self.image_names_to_index_val = {}
93 | for i, line in enumerate(f):
94 | self.image_names_to_index_val[line.strip()] = i
95 |
96 | def __getitem__(self, index):
97 | item = {}
98 | item['index'] = index
99 |
100 | # Process Question (word token)
101 | question = self.dataset['questions'][index]
102 | if self.load_original_annotation:
103 | item['original_question'] = question
104 |
105 | item['question_id'] = question['question_id']
106 |
107 | item['question'] = torch.tensor(question['question_wids'], dtype=torch.long)
108 | item['lengths'] = torch.tensor([len(question['question_wids'])], dtype=torch.long)
109 | item['image_name'] = question['image_name']
110 |
111 | # Process Object, Attribute and Relational features
113 | if self.dir_rcnn:
114 | item = self.add_rcnn_to_item(item)
115 | elif self.dir_cnn:
116 | item = self.add_cnn_to_item(item)
117 |
118 | # Process Answer if exists
119 | if 'annotations' in self.dataset:
120 | annotation = self.dataset['annotations'][index]
121 | if self.load_original_annotation:
122 | item['original_annotation'] = annotation
123 | if 'train' in self.split and self.samplingans:
124 | proba = annotation['answers_count']
125 | proba = proba / np.sum(proba)
126 | item['answer_id'] = int(np.random.choice(annotation['answers_id'], p=proba))
127 | else:
128 | item['answer_id'] = annotation['answer_id']
129 | item['class_id'] = torch.tensor([item['answer_id']], dtype=torch.long)
130 | item['answer'] = annotation['answer']
131 | item['question_type'] = annotation['question_type']
132 | else:
133 | if item['question_id'] in self.is_qid_testdev:
134 | item['is_testdev'] = True
135 | else:
136 | item['is_testdev'] = False
137 |
138 | # if Options()['model.network.name'] == 'xmn_net':
139 | # num_feat = 36
140 | # relation_mask = np.zeros((num_feat, num_feat))
141 | # boxes = item['coord']
142 | # for i in range(num_feat):
143 | # for j in range(i+1, num_feat):
144 | # # if there is no overlap between two bounding box
145 | # if boxes[0,i]>boxes[2,j] or boxes[0,j]>boxes[2,i] or boxes[1,i]>boxes[3,j] or boxes[1,j]>boxes[3,i]:
146 | # pass
147 | # else:
148 | # relation_mask[i,j] = relation_mask[j,i] = 1
149 | # relation_mask = torch.from_numpy(relation_mask).byte()
150 | # item['relation_mask'] = relation_mask
151 |
152 | return item
153 |
154 | def download(self):
155 | dir_zip = osp.join(self.dir_raw, 'zip')
156 | os.system('mkdir -p '+dir_zip)
157 | dir_ann = osp.join(self.dir_raw, 'annotations')
158 | os.system('mkdir -p '+dir_ann)
159 | os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Questions_Train_mscoco.zip -P '+dir_zip)
160 | os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Questions_Val_mscoco.zip -P '+dir_zip)
161 | os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Questions_Test_mscoco.zip -P '+dir_zip)
162 | os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Annotations_Train_mscoco.zip -P '+dir_zip)
163 | os.system('wget http://visualqa.org/data/mscoco/vqa/v2_Annotations_Val_mscoco.zip -P '+dir_zip)
164 | os.system('unzip '+osp.join(dir_zip, 'v2_Questions_Train_mscoco.zip')+' -d '+dir_ann)
165 | os.system('unzip '+osp.join(dir_zip, 'v2_Questions_Val_mscoco.zip')+' -d '+dir_ann)
166 | os.system('unzip '+osp.join(dir_zip, 'v2_Questions_Test_mscoco.zip')+' -d '+dir_ann)
167 | os.system('unzip '+osp.join(dir_zip, 'v2_Annotations_Train_mscoco.zip')+' -d '+dir_ann)
168 | os.system('unzip '+osp.join(dir_zip, 'v2_Annotations_Val_mscoco.zip')+' -d '+dir_ann)
169 | os.system('mv '+osp.join(dir_ann, 'v2_mscoco_train2014_annotations.json')+' '
170 | +osp.join(dir_ann, 'mscoco_train2014_annotations.json'))
171 | os.system('mv '+osp.join(dir_ann, 'v2_mscoco_val2014_annotations.json')+' '
172 | +osp.join(dir_ann, 'mscoco_val2014_annotations.json'))
173 | os.system('mv '+osp.join(dir_ann, 'v2_OpenEnded_mscoco_train2014_questions.json')+' '
174 | +osp.join(dir_ann, 'OpenEnded_mscoco_train2014_questions.json'))
175 | os.system('mv '+osp.join(dir_ann, 'v2_OpenEnded_mscoco_val2014_questions.json')+' '
176 | +osp.join(dir_ann, 'OpenEnded_mscoco_val2014_questions.json'))
177 | os.system('mv '+osp.join(dir_ann, 'v2_OpenEnded_mscoco_test2015_questions.json')+' '
178 | +osp.join(dir_ann, 'OpenEnded_mscoco_test2015_questions.json'))
179 | os.system('mv '+osp.join(dir_ann, 'v2_OpenEnded_mscoco_test-dev2015_questions.json')+' '
180 | +osp.join(dir_ann, 'OpenEnded_mscoco_test-dev2015_questions.json'))
181 |
--------------------------------------------------------------------------------
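A hedged construction example for `VQA2` (the directory paths are assumptions based on the download scripts above, and instantiating the dataset triggers the usual preprocessing on first use):

```python
from cfvqa.datasets.vqa2 import VQA2

# Illustrative paths only: they assume cfvqa/datasets/scripts/download_vqa2.sh
# was run from the repository root.
trainset = VQA2(
    dir_data='data/vqa/vqa2',
    split='train',
    dir_rcnn='data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36',
)
item = trainset[0]
print(item['question'].shape)  # tokenized question word ids
print(item['visual'].shape)    # pooled bottom-up region features
```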
/cfvqa/datasets/vqacp.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import copy
4 | import json
5 | import torch
6 | import numpy as np
7 | from tqdm import tqdm
8 | from os import path as osp
9 | from bootstrap.lib.logger import Logger
10 | from block.datasets.vqa_utils import AbstractVQA
11 | from copy import deepcopy
12 | import random
13 | import h5py
14 |
15 | class VQACP(AbstractVQA):
16 |
17 | def __init__(self,
18 | dir_data='data/vqa/vqacp2',
19 | split='train',
20 | batch_size=80,
21 | nb_threads=4,
22 | pin_memory=False,
23 | shuffle=False,
24 | nans=1000,
25 | minwcount=10,
26 | nlp='mcb',
27 | proc_split='train',
28 | samplingans=False,
29 | dir_rcnn='data/coco/extract_rcnn',
30 | dir_cnn=None,
31 | dir_vgg16=None,
32 | has_testdevset=False,
33 | ):
34 | super(VQACP, self).__init__(
35 | dir_data=dir_data,
36 | split=split,
37 | batch_size=batch_size,
38 | nb_threads=nb_threads,
39 | pin_memory=pin_memory,
40 | shuffle=shuffle,
41 | nans=nans,
42 | minwcount=minwcount,
43 | nlp=nlp,
44 | proc_split=proc_split,
45 | samplingans=samplingans,
46 | has_valset=True,
47 | has_testset=False,
48 | has_testdevset=has_testdevset,
49 | has_testset_anno=False,
50 | has_answers_occurence=True,
51 | do_tokenize_answers=False)
52 | self.dir_rcnn = dir_rcnn
53 | self.dir_cnn = dir_cnn
54 | self.dir_vgg16 = dir_vgg16
55 | self.load_image_features()
56 | self.load_original_annotation = False
57 |
58 | def add_rcnn_to_item(self, item):
59 | path_rcnn = os.path.join(self.dir_rcnn, '{}.pth'.format(item['image_name']))
60 | item_rcnn = torch.load(path_rcnn)
61 | item['visual'] = item_rcnn['pooled_feat']
62 | item['coord'] = item_rcnn['rois']
63 | item['norm_coord'] = item_rcnn['norm_rois']
64 | item['nb_regions'] = item['visual'].size(0)
65 | return item
66 |
67 | def load_image_features(self):
68 | if self.dir_cnn:
69 | filename_train = os.path.join(self.dir_cnn, 'trainset.hdf5')
70 | filename_val = os.path.join(self.dir_cnn, 'valset.hdf5')
71 | Logger()(f"Opening file {filename_train}, {filename_val}")
72 | self.image_features_train = h5py.File(filename_train, 'r', swmr=True)
73 | self.image_features_val = h5py.File(filename_val, 'r', swmr=True)
74 | # load txt
75 | with open(os.path.join(self.dir_cnn, 'trainset.txt'), 'r') as f:
76 | self.image_names_to_index_train = {}
77 | for i, line in enumerate(f):
78 | self.image_names_to_index_train[line.strip()] = i
79 | with open(os.path.join(self.dir_cnn, 'valset.txt'), 'r') as f:
80 | self.image_names_to_index_val = {}
81 | for i, line in enumerate(f):
82 | self.image_names_to_index_val[line.strip()] = i
83 | elif self.dir_vgg16:
84 | # list filenames
85 | self.filenames_train = os.listdir(os.path.join(self.dir_vgg16, 'train'))
86 | self.filenames_val = os.listdir(os.path.join(self.dir_vgg16, 'val'))
87 |
88 |
89 | def add_vgg_to_item(self, item):
90 | image_name = item['image_name']
91 | filename = image_name + '.pth'
92 | if filename in self.filenames_train:
93 | path = os.path.join(self.dir_vgg16, 'train', filename)
94 | elif filename in self.filenames_val:
95 | path = os.path.join(self.dir_vgg16, 'val', filename)
96 | visual = torch.load(path)
97 | visual = visual.permute(1, 2, 0).view(14*14, 512)
98 | item['visual'] = visual
99 | return item
100 |
101 | def add_cnn_to_item(self, item):
102 | image_name = item['image_name']
103 | if image_name in self.image_names_to_index_train:
104 | index = self.image_names_to_index_train[image_name]
105 | image = torch.tensor(self.image_features_train['att'][index])
106 | elif image_name in self.image_names_to_index_val:
107 | index = self.image_names_to_index_val[image_name]
108 | image = torch.tensor(self.image_features_val['att'][index])
109 | image = image.permute(1, 2, 0).view(196, 2048)
110 | item['visual'] = image
111 | return item
112 |
113 | def __getitem__(self, index):
114 | item = {}
115 | item['index'] = index
116 |
117 | # Process Question (word token)
118 | question = self.dataset['questions'][index]
119 | if self.load_original_annotation:
120 | item['original_question'] = question
121 | item['question_id'] = question['question_id']
122 | item['question'] = torch.LongTensor(question['question_wids'])
123 | item['lengths'] = torch.LongTensor([len(question['question_wids'])])
124 | item['image_name'] = question['image_name']
125 |
126 | # Process Object, Attribute and Relational features
127 | if self.dir_rcnn:
128 | item = self.add_rcnn_to_item(item)
129 | elif self.dir_cnn:
130 | item = self.add_cnn_to_item(item)
131 | elif self.dir_vgg16:
132 | item = self.add_vgg_to_item(item)
133 |
134 | # Process Answer if exists
135 | if 'annotations' in self.dataset:
136 | annotation = self.dataset['annotations'][index]
137 | if self.load_original_annotation:
138 | item['original_annotation'] = annotation
139 | if 'train' in self.split and self.samplingans:
140 | proba = annotation['answers_count']
141 | proba = proba / np.sum(proba)
142 | item['answer_id'] = int(np.random.choice(annotation['answers_id'], p=proba))
143 | else:
144 | item['answer_id'] = annotation['answer_id']
145 | item['class_id'] = torch.LongTensor([item['answer_id']])
146 | item['answer'] = annotation['answer']
147 | item['question_type'] = annotation['question_type']
148 |
149 | return item
150 |
151 | def download(self):
152 | dir_ann = osp.join(self.dir_raw, 'annotations')
153 | os.system('mkdir -p '+dir_ann)
154 | os.system('wget https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_train_questions.json -P' + dir_ann)
155 | os.system('wget https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_test_questions.json -P' + dir_ann)
156 | os.system('wget https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_train_annotations.json -P' + dir_ann)
157 | os.system('wget https://computing.ece.vt.edu/~aish/vqacp/vqacp_v1_test_annotations.json -P' + dir_ann)
158 | train_q = {"questions":json.load(open(osp.join(dir_ann, "vqacp_v1_train_questions.json")))}
159 | val_q = {"questions":json.load(open(osp.join(dir_ann, "vqacp_v1_test_questions.json")))}
160 | train_ann = {"annotations":json.load(open(osp.join(dir_ann, "vqacp_v1_train_annotations.json")))}
161 | val_ann = {"annotations":json.load(open(osp.join(dir_ann, "vqacp_v1_test_annotations.json")))}
162 | train_q['info'] = {}
163 | train_q['data_type'] = 'mscoco'
164 | train_q['data_subtype'] = "train2014cp"
165 | train_q['task_type'] = "Open-Ended"
166 | train_q['license'] = {}
167 | val_q['info'] = {}
168 | val_q['data_type'] = 'mscoco'
169 | val_q['data_subtype'] = "val2014cp"
170 | val_q['task_type'] = "Open-Ended"
171 | val_q['license'] = {}
172 | for k in ["info", 'data_type','data_subtype', 'license']:
173 | train_ann[k] = train_q[k]
174 | val_ann[k] = val_q[k]
175 | with open(osp.join(dir_ann, "OpenEnded_mscoco_train2014_questions.json"), 'w') as F:
176 | F.write(json.dumps(train_q))
177 | with open(osp.join(dir_ann, "OpenEnded_mscoco_val2014_questions.json"), 'w') as F:
178 | F.write(json.dumps(val_q))
179 | with open(osp.join(dir_ann, "mscoco_train2014_annotations.json"), 'w') as F:
180 | F.write(json.dumps(train_ann))
181 | with open(osp.join(dir_ann, "mscoco_val2014_annotations.json"), 'w') as F:
182 | F.write(json.dumps(val_ann))
183 |
184 | def add_image_names(self, dataset):
185 | for q in dataset['questions']:
186 | q['image_name'] = 'COCO_%s_%012d.jpg'%(q['coco_split'],q['image_id'])
187 | return dataset
188 |
189 |
--------------------------------------------------------------------------------
/cfvqa/datasets/vqacp2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import csv
3 | import copy
4 | import json
5 | import torch
6 | import numpy as np
7 | from tqdm import tqdm
8 | from os import path as osp
9 | from bootstrap.lib.logger import Logger
10 | from block.datasets.vqa_utils import AbstractVQA
11 | from copy import deepcopy
12 | import random
13 | import h5py
14 |
15 | class VQACP2(AbstractVQA):
16 |
17 | def __init__(self,
18 | dir_data='data/vqa/vqacp2',
19 | split='train',
20 | batch_size=80,
21 | nb_threads=4,
22 | pin_memory=False,
23 | shuffle=False,
24 | nans=1000,
25 | minwcount=10,
26 | nlp='mcb',
27 | proc_split='train',
28 | samplingans=False,
29 | dir_rcnn='data/coco/extract_rcnn',
30 | dir_cnn=None,
31 | dir_vgg16=None,
32 | has_testdevset=False,
33 | ):
34 | super(VQACP2, self).__init__(
35 | dir_data=dir_data,
36 | split=split,
37 | batch_size=batch_size,
38 | nb_threads=nb_threads,
39 | pin_memory=pin_memory,
40 | shuffle=shuffle,
41 | nans=nans,
42 | minwcount=minwcount,
43 | nlp=nlp,
44 | proc_split=proc_split,
45 | samplingans=samplingans,
46 | has_valset=True,
47 | has_testset=False,
48 | has_testdevset=has_testdevset,
49 | has_testset_anno=False,
50 | has_answers_occurence=True,
51 | do_tokenize_answers=False)
52 | self.dir_rcnn = dir_rcnn
53 | self.dir_cnn = dir_cnn
54 | self.dir_vgg16 = dir_vgg16
55 | self.load_image_features()
56 | self.load_original_annotation = False
57 |
58 | def add_rcnn_to_item(self, item):
59 | path_rcnn = os.path.join(self.dir_rcnn, '{}.pth'.format(item['image_name']))
60 | item_rcnn = torch.load(path_rcnn)
61 | item['visual'] = item_rcnn['pooled_feat']
62 | item['coord'] = item_rcnn['rois']
63 | item['norm_coord'] = item_rcnn['norm_rois']
64 | item['nb_regions'] = item['visual'].size(0)
65 | return item
66 |
67 | def load_image_features(self):
68 | if self.dir_cnn:
69 | filename_train = os.path.join(self.dir_cnn, 'trainset.hdf5')
70 | filename_val = os.path.join(self.dir_cnn, 'valset.hdf5')
71 | Logger()(f"Opening file {filename_train}, {filename_val}")
72 | self.image_features_train = h5py.File(filename_train, 'r', swmr=True)
73 | self.image_features_val = h5py.File(filename_val, 'r', swmr=True)
74 | # load txt
75 | with open(os.path.join(self.dir_cnn, 'trainset.txt'), 'r') as f:
76 | self.image_names_to_index_train = {}
77 | for i, line in enumerate(f):
78 | self.image_names_to_index_train[line.strip()] = i
79 | with open(os.path.join(self.dir_cnn, 'valset.txt'), 'r') as f:
80 | self.image_names_to_index_val = {}
81 | for i, line in enumerate(f):
82 | self.image_names_to_index_val[line.strip()] = i
83 | elif self.dir_vgg16:
84 | # list filenames
85 | self.filenames_train = os.listdir(os.path.join(self.dir_vgg16, 'train'))
86 | self.filenames_val = os.listdir(os.path.join(self.dir_vgg16, 'val'))
87 |
88 |
89 | def add_vgg_to_item(self, item):
90 | image_name = item['image_name']
91 | filename = image_name + '.pth'
92 | if filename in self.filenames_train:
93 | path = os.path.join(self.dir_vgg16, 'train', filename)
94 | elif filename in self.filenames_val:
95 | path = os.path.join(self.dir_vgg16, 'val', filename)
96 | visual = torch.load(path)
97 | visual = visual.permute(1, 2, 0).view(14*14, 512)
98 | item['visual'] = visual
99 | return item
100 |
101 | def add_cnn_to_item(self, item):
102 | image_name = item['image_name']
103 | if image_name in self.image_names_to_index_train:
104 | index = self.image_names_to_index_train[image_name]
105 | image = torch.tensor(self.image_features_train['att'][index])
106 | elif image_name in self.image_names_to_index_val:
107 | index = self.image_names_to_index_val[image_name]
108 | image = torch.tensor(self.image_features_val['att'][index])
109 | image = image.permute(1, 2, 0).view(196, 2048)
110 | item['visual'] = image
111 | return item
112 |
113 | def __getitem__(self, index):
114 | item = {}
115 | item['index'] = index
116 |
117 | # Process Question (word token)
118 | question = self.dataset['questions'][index]
119 | if self.load_original_annotation:
120 | item['original_question'] = question
121 | item['question_id'] = question['question_id']
122 | item['question'] = torch.LongTensor(question['question_wids'])
123 | item['lengths'] = torch.LongTensor([len(question['question_wids'])])
124 | item['image_name'] = question['image_name']
125 |
126 | # Process Object, Attribute and Relational features
127 | if self.dir_rcnn:
128 | item = self.add_rcnn_to_item(item)
129 | elif self.dir_cnn:
130 | item = self.add_cnn_to_item(item)
131 | elif self.dir_vgg16:
132 | item = self.add_vgg_to_item(item)
133 |
134 | # Process Answer if exists
135 | if 'annotations' in self.dataset:
136 | annotation = self.dataset['annotations'][index]
137 | if self.load_original_annotation:
138 | item['original_annotation'] = annotation
139 | if 'train' in self.split and self.samplingans:
140 | proba = annotation['answers_count']
141 | proba = proba / np.sum(proba)
142 | item['answer_id'] = int(np.random.choice(annotation['answers_id'], p=proba))
143 | else:
144 | item['answer_id'] = annotation['answer_id']
145 | item['class_id'] = torch.LongTensor([item['answer_id']])
146 | item['answer'] = annotation['answer']
147 | item['question_type'] = annotation['question_type']
148 |
149 | return item
150 |
151 | def download(self):
152 | dir_ann = osp.join(self.dir_raw, 'annotations')
153 | os.system('mkdir -p '+dir_ann)
154 | os.system('wget https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_train_questions.json -P' + dir_ann)
155 | os.system('wget https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_test_questions.json -P' + dir_ann)
156 | os.system('wget https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_train_annotations.json -P' + dir_ann)
157 | os.system('wget https://computing.ece.vt.edu/~aish/vqacp/vqacp_v2_test_annotations.json -P' + dir_ann)
158 | train_q = {"questions":json.load(open(osp.join(dir_ann, "vqacp_v2_train_questions.json")))}
159 | val_q = {"questions":json.load(open(osp.join(dir_ann, "vqacp_v2_test_questions.json")))}
160 | train_ann = {"annotations":json.load(open(osp.join(dir_ann, "vqacp_v2_train_annotations.json")))}
161 | val_ann = {"annotations":json.load(open(osp.join(dir_ann, "vqacp_v2_test_annotations.json")))}
162 | train_q['info'] = {}
163 | train_q['data_type'] = 'mscoco'
164 | train_q['data_subtype'] = "train2014cp"
165 | train_q['task_type'] = "Open-Ended"
166 | train_q['license'] = {}
167 | val_q['info'] = {}
168 | val_q['data_type'] = 'mscoco'
169 | val_q['data_subtype'] = "val2014cp"
170 | val_q['task_type'] = "Open-Ended"
171 | val_q['license'] = {}
172 | for k in ["info", 'data_type','data_subtype', 'license']:
173 | train_ann[k] = train_q[k]
174 | val_ann[k] = val_q[k]
175 | with open(osp.join(dir_ann, "OpenEnded_mscoco_train2014_questions.json"), 'w') as F:
176 | F.write(json.dumps(train_q))
177 | with open(osp.join(dir_ann, "OpenEnded_mscoco_val2014_questions.json"), 'w') as F:
178 | F.write(json.dumps(val_q))
179 | with open(osp.join(dir_ann, "mscoco_train2014_annotations.json"), 'w') as F:
180 | F.write(json.dumps(train_ann))
181 | with open(osp.join(dir_ann, "mscoco_val2014_annotations.json"), 'w') as F:
182 | F.write(json.dumps(val_ann))
183 |
184 | def add_image_names(self, dataset):
185 | for q in dataset['questions']:
186 | q['image_name'] = 'COCO_%s_%012d.jpg'%(q['coco_split'],q['image_id'])
187 | return dataset
188 |
189 |
--------------------------------------------------------------------------------
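The `samplingans` branch of `__getitem__` above draws the training label in proportion to annotator agreement rather than always using the majority answer. A self-contained illustration (the answer ids and counts below are made up):

```python
import numpy as np

# Three candidate answers, given by 8, 1, and 1 annotators respectively:
# the majority answer is sampled 80% of the time.
answers_id = [3, 17, 42]
answers_count = np.array([8, 1, 1], dtype=np.float64)
proba = answers_count / answers_count.sum()
answer_id = int(np.random.choice(answers_id, p=proba))
print(answer_id)
```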
/cfvqa/models/criterions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuleiniu/cfvqa/736795bef50db9c76818f9a08202c7f325489afd/cfvqa/models/criterions/__init__.py
--------------------------------------------------------------------------------
/cfvqa/models/criterions/cfvqa_criterion.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import torch.nn.functional as F
4 | from bootstrap.lib.logger import Logger
5 | from bootstrap.lib.options import Options
6 |
7 | class CFVQACriterion(nn.Module):
8 |
9 | def __init__(self, question_loss_weight=1.0, vision_loss_weight=1.0, is_va=True):
10 | super().__init__()
11 | self.is_va = is_va
12 |
13 | Logger()(f'CFVQACriterion, with question_loss_weight = ({question_loss_weight})')
14 | if self.is_va:
15 | Logger()(f'CFVQACriterion, with vision_loss_weight = ({vision_loss_weight})')
16 |
17 | self.fusion_loss = nn.CrossEntropyLoss()
18 | self.question_loss = nn.CrossEntropyLoss()
19 | self.question_loss_weight = question_loss_weight
20 | if self.is_va:
21 | self.vision_loss = nn.CrossEntropyLoss()
22 | self.vision_loss_weight = vision_loss_weight
23 |
24 | def forward(self, net_out, batch):
25 | out = {}
26 | class_id = batch['class_id'].squeeze(1)
27 |
28 | logits_rubi = net_out['logits_all']
29 | fusion_loss = self.fusion_loss(logits_rubi, class_id)
30 |
31 | logits_q = net_out['logits_q']
32 | question_loss = self.question_loss(logits_q, class_id)
33 |
34 | if self.is_va:
35 | logits_v = net_out['logits_v']
36 | vision_loss = self.vision_loss(logits_v, class_id)
37 |
38 | nde = net_out['z_nde']
39 | p_te = torch.nn.functional.softmax(logits_rubi, -1).clone().detach()
40 | p_nde = torch.nn.functional.softmax(nde, -1)
41 | kl_loss = - p_te*p_nde.log()  # cross-entropy between detached p_te and p_nde; same gradient as KL(p_te || p_nde)
42 | kl_loss = kl_loss.sum(1).mean()
43 |
44 | loss = fusion_loss \
45 | + self.question_loss_weight * question_loss \
46 | + kl_loss
47 | if self.is_va:
48 | loss += self.vision_loss_weight * vision_loss
49 |
50 | out['loss'] = loss
51 | out['loss_mm_q'] = fusion_loss
52 | out['loss_q'] = question_loss
53 | if self.is_va:
54 | out['loss_v'] = vision_loss
55 | return out
56 |
--------------------------------------------------------------------------------
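A hedged smoke test for `CFVQACriterion` (the batch size and answer-vocabulary size are illustrative; it assumes `bootstrap.pytorch` is installed, since the constructor logs through it):

```python
import torch
from cfvqa.models.criterions.cfvqa_criterion import CFVQACriterion

criterion = CFVQACriterion(question_loss_weight=1.0, vision_loss_weight=1.0, is_va=True)

batch_size, n_answers = 4, 1000
net_out = {
    'logits_all': torch.randn(batch_size, n_answers),  # fused (total effect)
    'logits_q':   torch.randn(batch_size, n_answers),  # question-only branch
    'logits_v':   torch.randn(batch_size, n_answers),  # vision-only branch
    'z_nde':      torch.randn(batch_size, n_answers),  # natural direct effect
}
batch = {'class_id': torch.randint(0, n_answers, (batch_size, 1))}

out = criterion(net_out, batch)  # keys: loss, loss_mm_q, loss_q, loss_v
```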
/cfvqa/models/criterions/factory.py:
--------------------------------------------------------------------------------
1 | from bootstrap.lib.options import Options
2 | from block.models.criterions.vqa_cross_entropy import VQACrossEntropyLoss
3 | from .rubi_criterion import RUBiCriterion
4 | from .cfvqa_criterion import CFVQACriterion
5 |
6 | def factory(engine, mode):
7 | name = Options()['model.criterion.name']
8 | split = engine.dataset[mode].split
9 | eval_only = 'train' not in engine.dataset
10 |
11 | opt = Options()['model.criterion']
12 | if split == "test" and 'tdiuc' not in Options()['dataset.name']:
13 | return None
14 | if name == 'vqa_cross_entropy':
15 | criterion = VQACrossEntropyLoss()
16 | elif name == "rubi_criterion":
17 | criterion = RUBiCriterion(
18 | question_loss_weight=opt['question_loss_weight']
19 | )
20 | elif name == "cfvqa_criterion":
21 | criterion = CFVQACriterion(
22 | question_loss_weight=opt['question_loss_weight'],
23 | vision_loss_weight=opt['vision_loss_weight'],
24 | is_va=True
25 | )
26 | elif name == "cfvqasimple_criterion":
27 | criterion = CFVQACriterion(
28 | question_loss_weight=opt['question_loss_weight'],
29 | is_va=False
30 | )
31 | else:
32 | raise ValueError(name)
33 | return criterion
34 |
--------------------------------------------------------------------------------
/cfvqa/models/criterions/rubi_criterion.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import torch.nn.functional as F
4 | from bootstrap.lib.logger import Logger
5 | from bootstrap.lib.options import Options
6 |
7 | class RUBiCriterion(nn.Module):
8 |
9 | def __init__(self, question_loss_weight=1.0):
10 | super().__init__()
11 |
12 | Logger()(f'RUBiCriterion, with question_loss_weight = ({question_loss_weight})')
13 |
14 | self.question_loss_weight = question_loss_weight
15 | self.fusion_loss = nn.CrossEntropyLoss()
16 | self.question_loss = nn.CrossEntropyLoss()
17 |
18 | def forward(self, net_out, batch):
19 | out = {}
20 | # logits = net_out['logits']
21 | logits_q = net_out['logits_q']
22 | logits_rubi = net_out['logits_all']
23 | class_id = batch['class_id'].squeeze(1)
24 | fusion_loss = self.fusion_loss(logits_rubi, class_id)
25 | question_loss = self.question_loss(logits_q, class_id)
26 | loss = fusion_loss + self.question_loss_weight * question_loss
27 |
28 | out['loss'] = loss
29 | out['loss_mm_q'] = fusion_loss
30 | out['loss_q'] = question_loss
31 | return out
32 |
--------------------------------------------------------------------------------
/cfvqa/models/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuleiniu/cfvqa/736795bef50db9c76818f9a08202c7f325489afd/cfvqa/models/metrics/__init__.py
--------------------------------------------------------------------------------
/cfvqa/models/metrics/factory.py:
--------------------------------------------------------------------------------
1 | from bootstrap.lib.options import Options
2 | from block.models.metrics.vqa_accuracies import VQAAccuracies
3 | from .vqa_rubi_metrics import VQARUBiMetrics
4 | from .vqa_cfvqa_metrics import VQACFVQAMetrics
5 | from .vqa_cfvqasimple_metrics import VQACFVQASimpleMetrics
6 |
7 | def factory(engine, mode):
8 | name = Options()['model.metric.name']
9 | metric = None
10 |
11 | if name == 'vqa_accuracies':
12 | open_ended = ('tdiuc' not in Options()['dataset.name'] and 'gqa' not in Options()['dataset.name'])
13 | if mode == 'train':
14 | split = engine.dataset['train'].split
15 | if split == 'train':
16 | metric = VQAAccuracies(engine,
17 | mode='train',
18 | open_ended=open_ended,
19 | tdiuc=True,
20 | dir_exp=Options()['exp.dir'],
21 | dir_vqa=Options()['dataset.dir'])
22 | elif split == 'trainval':
23 | metric = None
24 | else:
25 | raise ValueError(split)
26 | elif mode == 'eval':
27 | metric = VQAAccuracies(engine,
28 | mode='eval',
29 | open_ended=open_ended,
30 | tdiuc=('tdiuc' in Options()['dataset.name'] or Options()['dataset.eval_split'] != 'test'),
31 | dir_exp=Options()['exp.dir'],
32 | dir_vqa=Options()['dataset.dir'])
33 | else:
34 | metric = None
35 |
36 | elif name == "vqa_rubi_metrics":
37 | open_ended = ('tdiuc' not in Options()['dataset.name'] and 'gqa' not in Options()['dataset.name'])
38 | metric = VQARUBiMetrics(engine,
39 | mode=mode,
40 | open_ended=open_ended,
41 | tdiuc=True,
42 | dir_exp=Options()['exp.dir'],
43 | dir_vqa=Options()['dataset.dir']
44 | )
45 |
46 | elif name == "vqa_cfvqa_metrics":
47 | open_ended = ('tdiuc' not in Options()['dataset.name'] and 'gqa' not in Options()['dataset.name'])
48 | metric = VQACFVQAMetrics(engine,
49 | mode=mode,
50 | open_ended=open_ended,
51 | tdiuc=True,
52 | dir_exp=Options()['exp.dir'],
53 | dir_vqa=Options()['dataset.dir'],
54 | )
55 |
56 | elif name == "vqa_cfvqasimple_metrics":
57 | open_ended = ('tdiuc' not in Options()['dataset.name'] and 'gqa' not in Options()['dataset.name'])
58 | metric = VQACFVQASimpleMetrics(engine,
59 | mode=mode,
60 | open_ended=open_ended,
61 | tdiuc=True,
62 | dir_exp=Options()['exp.dir'],
63 | dir_vqa=Options()['dataset.dir'],
64 | )
65 |
66 | else:
67 | raise ValueError(name)
68 | return metric
69 |
--------------------------------------------------------------------------------
/cfvqa/models/metrics/vqa_cfvqa_metrics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import os
4 | import json
5 | from scipy import stats
6 | import numpy as np
7 | from collections import defaultdict
8 |
9 | from bootstrap.models.metrics.accuracy import accuracy
10 | from block.models.metrics.vqa_accuracies import VQAAccuracies
11 | from bootstrap.lib.logger import Logger
12 | from bootstrap.lib.options import Options
13 |
14 |
15 | class VQAAccuracy(nn.Module):
16 |
17 | def __init__(self, topk=[1,5]):
18 | super().__init__()
19 | self.topk = topk
20 | self.metric_list = ['_all', '_vq', '_cfvqa', '_q', '_v']
21 |
22 | def forward(self, cri_out, net_out, batch):
23 | out = {}
24 | class_id = batch['class_id'].data.cpu()
25 | for key in self.metric_list:
26 | logits = net_out[f'logits{key}'].data.cpu()
27 | acc_out = accuracy(logits, class_id, topk=self.topk)
28 | for i, k in enumerate(self.topk):
29 | out[f'accuracy{key}_top{k}'] = acc_out[i]
30 | return out
31 |
32 |
33 | class VQACFVQAMetrics(VQAAccuracies):
34 |
35 | def __init__(self, *args, **kwargs):
36 | super().__init__(*args, **kwargs)
37 | self.metric_list = ['_all', '_vq', '_cfvqa', '_q', '_v']
38 | if Options()['dataset.eval_split'] == 'test': # 0430
39 | self.accuracy = None
40 | else:
41 | self.accuracy = VQAAccuracy()
42 | self.rm_dir_rslt = 1 if Options()['dataset.train_split'] is not None else 0
43 |
44 | def forward(self, cri_out, net_out, batch):
45 | out = {}
46 | if self.accuracy is not None:
47 | out = self.accuracy(cri_out, net_out, batch)
48 |
49 | # add answers and answer_ids keys to net_out
50 | net_out = self.engine.model.network.process_answers(net_out)
51 |
52 | batch_size = len(batch['index'])
53 | for i in range(batch_size):
54 |
55 | # Open Ended Accuracy (VQA-VQA2)
56 | if self.open_ended:
57 | for key in self.metric_list:
58 | pred_item = {
59 | 'question_id': batch['question_id'][i],
60 | 'answer': net_out[f'answers{key}'][i]
61 | }
62 | self.results[key].append(pred_item)
63 |
64 | # if self.dataset.split == 'test': # 0430
65 | # pred_item = {
66 | # 'question_id': batch['question_id'][i],
67 | # 'answer': net_out[f'answers{key}'][i]
68 | # # 'answer': net_out[f'answers'][i]
69 | # }
70 | # # if 'is_testdev' in batch and batch['is_testdev'][i]: # 0430
71 | # # self.results_testdev.append(pred_item)
72 |
73 | # if self.logits['tensor'] is None:
74 | # self.logits['tensor'] = torch.FloatTensor(len(self.dataset), logits.size(1))
75 |
76 | # self.logits['tensor'][self.idx] = logits[i]
77 | # self.logits['qid_to_idx'][batch['question_id'][i]] = self.idx
78 |
79 | # self.idx += 1
80 |
81 | # TDIUC metrics
82 | if self.tdiuc:
83 | gt_aid = batch['answer_id'][i]
84 | gt_ans = batch['answer'][i]
85 | gt_type = batch['question_type'][i]
86 | self.gt_types.append(gt_type)
87 | if gt_ans in self.ans_to_aid:
88 | self.gt_aids.append(gt_aid)
89 | else:
90 | self.gt_aids.append(-1)
91 | self.gt_aid_not_found += 1
92 |
93 | for key in self.metric_list:
94 | qid = batch['question_id'][i]
95 | pred_aid = net_out[f'answer_ids{key}'][i]
96 | self.pred_aids[key].append(pred_aid)
97 |
98 | self.res_by_type[key][gt_type+'_pred'].append(pred_aid)
99 |
100 | if gt_ans in self.ans_to_aid:
101 | self.res_by_type[key][gt_type+'_gt'].append(gt_aid)
102 | if gt_aid == pred_aid:
103 | self.res_by_type[key][gt_type+'_t'].append(pred_aid)
104 | else:
105 | self.res_by_type[key][gt_type+'_f'].append(pred_aid)
106 | else:
107 | self.res_by_type[key][gt_type+'_gt'].append(-1)
108 | self.res_by_type[key][gt_type+'_f'].append(pred_aid)
109 | return out
110 |
111 | def reset_oe(self):
112 | self.results = dict()
113 | self.dir_rslt = dict()
114 | self.path_rslt = dict()
115 | for key in self.metric_list:
116 | self.results[key] = []
117 | self.dir_rslt[key] = os.path.join(
118 | self.dir_exp,
119 | f'results{key}',
120 | self.dataset.split,
121 | 'epoch,{}'.format(self.engine.epoch))
122 | os.system('mkdir -p '+self.dir_rslt[key])
123 | self.path_rslt[key] = os.path.join(
124 | self.dir_rslt[key],
125 | 'OpenEnded_mscoco_{}_model_results.json'.format(
126 | self.dataset.get_subtype()))
127 |
128 | if self.dataset.split == 'test':
129 | pass
130 | # self.results_testdev = []
131 | # self.path_rslt_testdev = os.path.join(
132 | # self.dir_rslt,
133 | # 'OpenEnded_mscoco_{}_model_results.json'.format(
134 | # self.dataset.get_subtype(testdev=True)))
135 |
136 | # self.path_logits = os.path.join(self.dir_rslt, 'logits.pth')
137 | # os.system('mkdir -p '+os.path.dirname(self.path_logits))
138 |
139 | # self.logits = {}
140 | # self.logits['aid_to_ans'] = self.engine.model.network.aid_to_ans
141 | # self.logits['qid_to_idx'] = {}
142 | # self.logits['tensor'] = None
143 |
144 | # self.idx = 0
145 |
146 | # path_aid_to_ans = os.path.join(self.dir_rslt, 'aid_to_ans.json')
147 | # with open(path_aid_to_ans, 'w') as f:
148 | # json.dump(self.engine.model.network.aid_to_ans, f)
149 |
150 |
151 | def reset_tdiuc(self):
152 | self.pred_aids = defaultdict(list)
153 | self.gt_aids = []
154 | self.gt_types = []
155 | self.gt_aid_not_found = 0
156 | self.res_by_type = {key: defaultdict(list) for key in self.metric_list}
157 |
158 |
159 | def compute_oe_accuracy(self):
160 | logs_name_prefix = Options()['misc'].get('logs_name', '') or ''
161 |
162 | for key in self.metric_list:
163 | logs_name = (logs_name_prefix + key) or "logs"
164 | with open(self.path_rslt[key], 'w') as f:
165 | json.dump(self.results[key], f)
166 |
167 | # if self.dataset.split == 'test':
168 | # with open(self.path_rslt_testdev, 'w') as f:
169 | # json.dump(self.results_testdev, f)
170 |
171 |             if 'test' not in self.dataset.split:  # inside the loop so the accuracy job runs for every result key
172 |                 call_to_prog = 'python -m block.models.metrics.compute_oe_accuracy '\
173 |                     + '--dir_vqa {} --dir_exp {} --dir_rslt {} --epoch {} --split {} --logs_name {} --rm {} &'\
174 |                     .format(self.dir_vqa, self.dir_exp, self.dir_rslt[key], self.engine.epoch, self.dataset.split, logs_name, self.rm_dir_rslt)
175 |                 Logger()('`'+call_to_prog+'`')
176 |                 os.system(call_to_prog)
177 |
178 |
179 | def compute_tdiuc_metrics(self):
180 | Logger()('{} of validation answers were not found in ans_to_aid'.format(self.gt_aid_not_found))
181 |
182 | for key in self.metric_list:
183 | Logger()(f'Computing TDIUC metrics for logits{key}')
184 | accuracy = float(100*np.mean(np.array(self.pred_aids[key])==np.array(self.gt_aids)))
185 | Logger()('Overall Traditional Accuracy is {:.2f}'.format(accuracy))
186 | Logger().log_value('{}_epoch.tdiuc.accuracy{}'.format(self.mode, key), accuracy, should_print=False)
187 |
188 | types = list(set(self.gt_types))
189 | sum_acc = []
190 | eps = 1e-10
191 |
192 | Logger()('---------------------------------------')
193 | Logger()('Not using per-answer normalization...')
194 | for tp in types:
195 | acc = 100*(len(self.res_by_type[key][tp+'_t'])/len(self.res_by_type[key][tp+'_t']+self.res_by_type[key][tp+'_f']))
196 | sum_acc.append(acc+eps)
197 | Logger()(f"Accuracy {key} for class '{tp}' is {acc:.2f}")
198 | Logger().log_value('{}_epoch.tdiuc{}.perQuestionType.{}'.format(self.mode, key, tp), acc, should_print=False)
199 |
200 | acc_mpt_a = float(np.mean(np.array(sum_acc)))
201 | Logger()('Arithmetic MPT Accuracy {} is {:.2f}'.format(key, acc_mpt_a))
202 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_a'.format(self.mode, key), acc_mpt_a, should_print=False)
203 |
204 | acc_mpt_h = float(stats.hmean(sum_acc))
205 | Logger()('Harmonic MPT Accuracy {} is {:.2f}'.format(key, acc_mpt_h))
206 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_h'.format(self.mode, key), acc_mpt_h, should_print=False)
207 |
208 | Logger()('---------------------------------------')
209 |             Logger()('Using per-answer normalization...'); sum_acc = []  # reset so the normalized MPT does not mix with the pass above
210 | for tp in types:
211 | per_ans_stat = defaultdict(int)
212 | for g,p in zip(self.res_by_type[key][tp+'_gt'],self.res_by_type[key][tp+'_pred']):
213 | per_ans_stat[str(g)+'_gt']+=1
214 | if g==p:
215 | per_ans_stat[str(g)]+=1
216 | unq_acc = 0
217 | for unq_ans in set(self.res_by_type[key][tp+'_gt']):
218 | acc_curr_ans = per_ans_stat[str(unq_ans)]/per_ans_stat[str(unq_ans)+'_gt']
219 | unq_acc +=acc_curr_ans
220 | acc = 100*unq_acc/len(set(self.res_by_type[key][tp+'_gt']))
221 | sum_acc.append(acc+eps)
222 | Logger()("Accuracy {} for class '{}' is {:.2f}".format(key, tp, acc))
223 | Logger().log_value('{}_epoch.tdiuc{}.perQuestionType_norm.{}'.format(self.mode, key, tp), acc, should_print=False)
224 |
225 | acc_mpt_a = float(np.mean(np.array(sum_acc)))
226 | Logger()('Arithmetic MPT Accuracy is {:.2f}'.format(acc_mpt_a))
227 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_a_norm'.format(self.mode, key), acc_mpt_a, should_print=False)
228 |
229 | acc_mpt_h = float(stats.hmean(sum_acc))
230 | Logger()('Harmonic MPT Accuracy is {:.2f}'.format(acc_mpt_h))
231 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_h_norm'.format(self.mode, key), acc_mpt_h, should_print=False)
232 |
--------------------------------------------------------------------------------
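The two MPT (mean-per-type) aggregates in compute_tdiuc_metrics differ only in which mean they take over the per-question-type accuracies; a numeric sketch with made-up accuracies:

    import numpy as np
    from scipy import stats

    per_type_acc = [80.0, 60.0, 20.0]             # accuracy per TDIUC question type (illustrative)
    acc_mpt_a = float(np.mean(per_type_acc))      # arithmetic MPT: 53.33
    acc_mpt_h = float(stats.hmean(per_type_acc))  # harmonic MPT: ~37.9, punishes weak types much harder
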
/cfvqa/models/metrics/vqa_cfvqasimple_metrics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import os
4 | import json
5 | from scipy import stats
6 | import numpy as np
7 | from collections import defaultdict
8 |
9 | from bootstrap.models.metrics.accuracy import accuracy
10 | from block.models.metrics.vqa_accuracies import VQAAccuracies
11 | from bootstrap.lib.logger import Logger
12 | from bootstrap.lib.options import Options
13 |
14 |
15 | class VQAAccuracy(nn.Module):
16 |
17 | def __init__(self, topk=[1,5]):
18 | super().__init__()
19 | self.topk = topk
20 | self.metric_list = ['_all', '_vq', '_cfvqa', '_q']
21 |
22 | def forward(self, cri_out, net_out, batch):
23 | out = {}
24 | class_id = batch['class_id'].data.cpu()
25 | for key in self.metric_list:
26 | logits = net_out[f'logits{key}'].data.cpu()
27 | acc_out = accuracy(logits, class_id, topk=self.topk)
28 | for i, k in enumerate(self.topk):
29 | out[f'accuracy{key}_top{k}'] = acc_out[i]
30 | return out
31 |
32 |
33 | class VQACFVQASimpleMetrics(VQAAccuracies):
34 |
35 | def __init__(self, *args, **kwargs):
36 | super().__init__(*args, **kwargs)
37 | self.metric_list = ['_all', '_vq', '_cfvqa', '_q']
38 | if Options()['dataset.eval_split'] == 'test': # 0430
39 | self.accuracy = None
40 | else:
41 | self.accuracy = VQAAccuracy()
42 | self.rm_dir_rslt = 1 if Options()['dataset.train_split'] is not None else 0
43 |
44 | def forward(self, cri_out, net_out, batch):
45 | out = {}
46 | if self.accuracy is not None:
47 | out = self.accuracy(cri_out, net_out, batch)
48 |
49 | # add answers and answer_ids keys to net_out
50 | net_out = self.engine.model.network.process_answers(net_out)
51 |
52 | batch_size = len(batch['index'])
53 | for i in range(batch_size):
54 |
55 | # Open Ended Accuracy (VQA-VQA2)
56 | if self.open_ended:
57 | for key in self.metric_list:
58 | pred_item = {
59 | 'question_id': batch['question_id'][i],
60 | 'answer': net_out[f'answers{key}'][i]
61 | }
62 | self.results[key].append(pred_item)
63 |
64 | # if self.dataset.split == 'test': # 0430
65 | # pred_item = {
66 | # 'question_id': batch['question_id'][i],
67 | # 'answer': net_out[f'answers{key}'][i]
68 | # # 'answer': net_out[f'answers'][i]
69 | # }
70 | # # if 'is_testdev' in batch and batch['is_testdev'][i]: # 0430
71 | # # self.results_testdev.append(pred_item)
72 |
73 | # if self.logits['tensor'] is None:
74 | # self.logits['tensor'] = torch.FloatTensor(len(self.dataset), logits.size(1))
75 |
76 | # self.logits['tensor'][self.idx] = logits[i]
77 | # self.logits['qid_to_idx'][batch['question_id'][i]] = self.idx
78 |
79 | # self.idx += 1
80 |
81 | # TDIUC metrics
82 | if self.tdiuc:
83 | gt_aid = batch['answer_id'][i]
84 | gt_ans = batch['answer'][i]
85 | gt_type = batch['question_type'][i]
86 | self.gt_types.append(gt_type)
87 | if gt_ans in self.ans_to_aid:
88 | self.gt_aids.append(gt_aid)
89 | else:
90 | self.gt_aids.append(-1)
91 | self.gt_aid_not_found += 1
92 |
93 | for key in self.metric_list:
94 | qid = batch['question_id'][i]
95 | pred_aid = net_out[f'answer_ids{key}'][i]
96 | self.pred_aids[key].append(pred_aid)
97 |
98 | self.res_by_type[key][gt_type+'_pred'].append(pred_aid)
99 |
100 | if gt_ans in self.ans_to_aid:
101 | self.res_by_type[key][gt_type+'_gt'].append(gt_aid)
102 | if gt_aid == pred_aid:
103 | self.res_by_type[key][gt_type+'_t'].append(pred_aid)
104 | else:
105 | self.res_by_type[key][gt_type+'_f'].append(pred_aid)
106 | else:
107 | self.res_by_type[key][gt_type+'_gt'].append(-1)
108 | self.res_by_type[key][gt_type+'_f'].append(pred_aid)
109 | return out
110 |
111 | def reset_oe(self):
112 | self.results = dict()
113 | self.dir_rslt = dict()
114 | self.path_rslt = dict()
115 | for key in self.metric_list:
116 | self.results[key] = []
117 | self.dir_rslt[key] = os.path.join(
118 | self.dir_exp,
119 | f'results{key}',
120 | self.dataset.split,
121 | 'epoch,{}'.format(self.engine.epoch))
122 | os.system('mkdir -p '+self.dir_rslt[key])
123 | self.path_rslt[key] = os.path.join(
124 | self.dir_rslt[key],
125 | 'OpenEnded_mscoco_{}_model_results.json'.format(
126 | self.dataset.get_subtype()))
127 |
128 | if self.dataset.split == 'test':
129 | pass
130 | # self.results_testdev = []
131 | # self.path_rslt_testdev = os.path.join(
132 | # self.dir_rslt,
133 | # 'OpenEnded_mscoco_{}_model_results.json'.format(
134 | # self.dataset.get_subtype(testdev=True)))
135 |
136 | # self.path_logits = os.path.join(self.dir_rslt, 'logits.pth')
137 | # os.system('mkdir -p '+os.path.dirname(self.path_logits))
138 |
139 | # self.logits = {}
140 | # self.logits['aid_to_ans'] = self.engine.model.network.aid_to_ans
141 | # self.logits['qid_to_idx'] = {}
142 | # self.logits['tensor'] = None
143 |
144 | # self.idx = 0
145 |
146 | # path_aid_to_ans = os.path.join(self.dir_rslt, 'aid_to_ans.json')
147 | # with open(path_aid_to_ans, 'w') as f:
148 | # json.dump(self.engine.model.network.aid_to_ans, f)
149 |
150 |
151 | def reset_tdiuc(self):
152 | self.pred_aids = defaultdict(list)
153 | self.gt_aids = []
154 | self.gt_types = []
155 | self.gt_aid_not_found = 0
156 | self.res_by_type = {key: defaultdict(list) for key in self.metric_list}
157 |
158 |
159 | def compute_oe_accuracy(self):
160 | logs_name_prefix = Options()['misc'].get('logs_name', '') or ''
161 |
162 | for key in self.metric_list:
163 | logs_name = (logs_name_prefix + key) or "logs"
164 | with open(self.path_rslt[key], 'w') as f:
165 | json.dump(self.results[key], f)
166 |
167 | # if self.dataset.split == 'test':
168 | # with open(self.path_rslt_testdev, 'w') as f:
169 | # json.dump(self.results_testdev, f)
170 |
171 |             if 'test' not in self.dataset.split:  # inside the loop so the accuracy job runs for every result key
172 |                 call_to_prog = 'python -m block.models.metrics.compute_oe_accuracy '\
173 |                     + '--dir_vqa {} --dir_exp {} --dir_rslt {} --epoch {} --split {} --logs_name {} --rm {} &'\
174 |                     .format(self.dir_vqa, self.dir_exp, self.dir_rslt[key], self.engine.epoch, self.dataset.split, logs_name, self.rm_dir_rslt)
175 |                 Logger()('`'+call_to_prog+'`')
176 |                 os.system(call_to_prog)
177 |
178 |
179 | def compute_tdiuc_metrics(self):
180 | Logger()('{} of validation answers were not found in ans_to_aid'.format(self.gt_aid_not_found))
181 |
182 | for key in self.metric_list:
183 | Logger()(f'Computing TDIUC metrics for logits{key}')
184 | accuracy = float(100*np.mean(np.array(self.pred_aids[key])==np.array(self.gt_aids)))
185 | Logger()('Overall Traditional Accuracy is {:.2f}'.format(accuracy))
186 | Logger().log_value('{}_epoch.tdiuc.accuracy{}'.format(self.mode, key), accuracy, should_print=False)
187 |
188 | types = list(set(self.gt_types))
189 | sum_acc = []
190 | eps = 1e-10
191 |
192 | Logger()('---------------------------------------')
193 | Logger()('Not using per-answer normalization...')
194 | for tp in types:
195 | acc = 100*(len(self.res_by_type[key][tp+'_t'])/len(self.res_by_type[key][tp+'_t']+self.res_by_type[key][tp+'_f']))
196 | sum_acc.append(acc+eps)
197 | Logger()(f"Accuracy {key} for class '{tp}' is {acc:.2f}")
198 | Logger().log_value('{}_epoch.tdiuc{}.perQuestionType.{}'.format(self.mode, key, tp), acc, should_print=False)
199 |
200 | acc_mpt_a = float(np.mean(np.array(sum_acc)))
201 | Logger()('Arithmetic MPT Accuracy {} is {:.2f}'.format(key, acc_mpt_a))
202 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_a'.format(self.mode, key), acc_mpt_a, should_print=False)
203 |
204 | acc_mpt_h = float(stats.hmean(sum_acc))
205 | Logger()('Harmonic MPT Accuracy {} is {:.2f}'.format(key, acc_mpt_h))
206 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_h'.format(self.mode, key), acc_mpt_h, should_print=False)
207 |
208 | Logger()('---------------------------------------')
209 |             Logger()('Using per-answer normalization...'); sum_acc = []  # reset so the normalized MPT does not mix with the pass above
210 | for tp in types:
211 | per_ans_stat = defaultdict(int)
212 | for g,p in zip(self.res_by_type[key][tp+'_gt'],self.res_by_type[key][tp+'_pred']):
213 | per_ans_stat[str(g)+'_gt']+=1
214 | if g==p:
215 | per_ans_stat[str(g)]+=1
216 | unq_acc = 0
217 | for unq_ans in set(self.res_by_type[key][tp+'_gt']):
218 | acc_curr_ans = per_ans_stat[str(unq_ans)]/per_ans_stat[str(unq_ans)+'_gt']
219 | unq_acc +=acc_curr_ans
220 | acc = 100*unq_acc/len(set(self.res_by_type[key][tp+'_gt']))
221 | sum_acc.append(acc+eps)
222 | Logger()("Accuracy {} for class '{}' is {:.2f}".format(key, tp, acc))
223 | Logger().log_value('{}_epoch.tdiuc{}.perQuestionType_norm.{}'.format(self.mode, key, tp), acc, should_print=False)
224 |
225 | acc_mpt_a = float(np.mean(np.array(sum_acc)))
226 | Logger()('Arithmetic MPT Accuracy is {:.2f}'.format(acc_mpt_a))
227 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_a_norm'.format(self.mode, key), acc_mpt_a, should_print=False)
228 |
229 | acc_mpt_h = float(stats.hmean(sum_acc))
230 | Logger()('Harmonic MPT Accuracy is {:.2f}'.format(acc_mpt_h))
231 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_h_norm'.format(self.mode, key), acc_mpt_h, should_print=False)
232 |
--------------------------------------------------------------------------------
/cfvqa/models/metrics/vqa_rubi_metrics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import os
4 | import json
5 | from scipy import stats
6 | import numpy as np
7 | from collections import defaultdict
8 |
9 | from bootstrap.models.metrics.accuracy import accuracy
10 | from block.models.metrics.vqa_accuracies import VQAAccuracies
11 | from bootstrap.lib.logger import Logger
12 | from bootstrap.lib.options import Options
13 |
14 |
15 | class VQAAccuracy(nn.Module):
16 |
17 | def __init__(self, topk=[1,5]):
18 | super().__init__()
19 | self.topk = topk
20 |
21 | def forward(self, cri_out, net_out, batch):
22 | out = {}
23 | class_id = batch['class_id'].data.cpu()
24 | for key in ['', '_all', '_q']:
25 | logits = net_out[f'logits{key}'].data.cpu()
26 | acc_out = accuracy(logits, class_id, topk=self.topk)
27 | for i, k in enumerate(self.topk):
28 | out[f'accuracy{key}_top{k}'] = acc_out[i]
29 | return out
30 |
31 |
32 | class VQARUBiMetrics(VQAAccuracies):
33 |
34 | def __init__(self, *args, **kwargs):
35 | super().__init__(*args, **kwargs)
36 | self.accuracy = VQAAccuracy()
37 | self.rm_dir_rslt = 1 if Options()['dataset.train_split'] is not None else 0
38 |
39 | def forward(self, cri_out, net_out, batch):
40 | out = {}
41 | if self.accuracy is not None:
42 | out = self.accuracy(cri_out, net_out, batch)
43 |
44 | # add answers and answer_ids keys to net_out
45 | net_out = self.engine.model.network.process_answers(net_out)
46 |
47 | batch_size = len(batch['index'])
48 | for i in range(batch_size):
49 |
50 | # Open Ended Accuracy (VQA-VQA2)
51 | if self.open_ended:
52 | for key in ['', '_all', '_q']:
53 | pred_item = {
54 | 'question_id': batch['question_id'][i],
55 | 'answer': net_out[f'answers{key}'][i]
56 | }
57 | self.results[key].append(pred_item)
58 |
59 |                 # if self.dataset.split == 'test':  # disabled (cf. the CF-VQA metrics): results_testdev/self.logits are never initialized in reset_oe and `logits` is undefined here
60 |                 #     pred_item = {
61 |                 #         'question_id': batch['question_id'][i],
62 |                 #         'answer': net_out['answers'][i]
63 |                 #     }
64 |                 #     if 'is_testdev' in batch and batch['is_testdev'][i]:
65 |                 #         self.results_testdev.append(pred_item)
66 |
67 |                 #     if self.logits['tensor'] is None:
68 |                 #         self.logits['tensor'] = torch.FloatTensor(len(self.dataset), logits.size(1))
69 |
70 |                 #     self.logits['tensor'][self.idx] = logits[i]
71 |                 #     self.logits['qid_to_idx'][batch['question_id'][i]] = self.idx
72 |
73 |                 #     self.idx += 1
74 |
75 | # TDIUC metrics
76 | if self.tdiuc:
77 | gt_aid = batch['answer_id'][i]
78 | gt_ans = batch['answer'][i]
79 | gt_type = batch['question_type'][i]
80 | self.gt_types.append(gt_type)
81 | if gt_ans in self.ans_to_aid:
82 | self.gt_aids.append(gt_aid)
83 | else:
84 | self.gt_aids.append(-1)
85 | self.gt_aid_not_found += 1
86 |
87 | for key in ['', '_all', '_q']:
88 | qid = batch['question_id'][i]
89 | pred_aid = net_out[f'answer_ids{key}'][i]
90 | self.pred_aids[key].append(pred_aid)
91 |
92 | self.res_by_type[key][gt_type+'_pred'].append(pred_aid)
93 |
94 | if gt_ans in self.ans_to_aid:
95 | self.res_by_type[key][gt_type+'_gt'].append(gt_aid)
96 | if gt_aid == pred_aid:
97 | self.res_by_type[key][gt_type+'_t'].append(pred_aid)
98 | else:
99 | self.res_by_type[key][gt_type+'_f'].append(pred_aid)
100 | else:
101 | self.res_by_type[key][gt_type+'_gt'].append(-1)
102 | self.res_by_type[key][gt_type+'_f'].append(pred_aid)
103 | return out
104 |
105 | def reset_oe(self):
106 | self.results = dict()
107 | self.dir_rslt = dict()
108 | self.path_rslt = dict()
109 | for key in ['', '_q', '_all']:
110 | self.results[key] = []
111 | self.dir_rslt[key] = os.path.join(
112 | self.dir_exp,
113 | f'results{key}',
114 | self.dataset.split,
115 | 'epoch,{}'.format(self.engine.epoch))
116 | os.system('mkdir -p '+self.dir_rslt[key])
117 | self.path_rslt[key] = os.path.join(
118 | self.dir_rslt[key],
119 | 'OpenEnded_mscoco_{}_model_results.json'.format(
120 | self.dataset.get_subtype()))
121 |
122 | if self.dataset.split == 'test':
123 | pass
124 | # self.results_testdev = []
125 | # self.path_rslt_testdev = os.path.join(
126 | # self.dir_rslt,
127 | # 'OpenEnded_mscoco_{}_model_results.json'.format(
128 | # self.dataset.get_subtype(testdev=True)))
129 |
130 | # self.path_logits = os.path.join(self.dir_rslt, 'logits.pth')
131 | # os.system('mkdir -p '+os.path.dirname(self.path_logits))
132 |
133 | # self.logits = {}
134 | # self.logits['aid_to_ans'] = self.engine.model.network.aid_to_ans
135 | # self.logits['qid_to_idx'] = {}
136 | # self.logits['tensor'] = None
137 |
138 | # self.idx = 0
139 |
140 | # path_aid_to_ans = os.path.join(self.dir_rslt, 'aid_to_ans.json')
141 | # with open(path_aid_to_ans, 'w') as f:
142 | # json.dump(self.engine.model.network.aid_to_ans, f)
143 |
144 |
145 | def reset_tdiuc(self):
146 | self.pred_aids = defaultdict(list)
147 | self.gt_aids = []
148 | self.gt_types = []
149 | self.gt_aid_not_found = 0
150 | self.res_by_type = {key: defaultdict(list) for key in ['', '_all', '_q']}
151 |
152 |
153 | def compute_oe_accuracy(self):
154 | logs_name_prefix = Options()['misc'].get('logs_name', '') or ''
155 |
156 | for key in ['', '_all', '_q']:
157 | logs_name = (logs_name_prefix + key) or "logs"
158 | with open(self.path_rslt[key], 'w') as f:
159 | json.dump(self.results[key], f)
160 |
161 | # if self.dataset.split == 'test':
162 | # with open(self.path_rslt_testdev, 'w') as f:
163 | # json.dump(self.results_testdev, f)
164 |
165 |             if 'test' not in self.dataset.split:  # inside the loop so the accuracy job runs for every result key
166 |                 call_to_prog = 'python -m block.models.metrics.compute_oe_accuracy '\
167 |                     + '--dir_vqa {} --dir_exp {} --dir_rslt {} --epoch {} --split {} --logs_name {} --rm {} &'\
168 |                     .format(self.dir_vqa, self.dir_exp, self.dir_rslt[key], self.engine.epoch, self.dataset.split, logs_name, self.rm_dir_rslt)
169 |                 Logger()('`'+call_to_prog+'`')
170 |                 os.system(call_to_prog)
171 |
172 |
173 | def compute_tdiuc_metrics(self):
174 | Logger()('{} of validation answers were not found in ans_to_aid'.format(self.gt_aid_not_found))
175 |
176 | for key in ['', '_all', '_q']:
177 | Logger()(f'Computing TDIUC metrics for logits{key}')
178 | accuracy = float(100*np.mean(np.array(self.pred_aids[key])==np.array(self.gt_aids)))
179 | Logger()('Overall Traditional Accuracy is {:.2f}'.format(accuracy))
180 | Logger().log_value('{}_epoch.tdiuc.accuracy{}'.format(self.mode, key), accuracy, should_print=False)
181 |
182 | types = list(set(self.gt_types))
183 | sum_acc = []
184 | eps = 1e-10
185 |
186 | Logger()('---------------------------------------')
187 | Logger()('Not using per-answer normalization...')
188 | for tp in types:
189 | acc = 100*(len(self.res_by_type[key][tp+'_t'])/len(self.res_by_type[key][tp+'_t']+self.res_by_type[key][tp+'_f']))
190 | sum_acc.append(acc+eps)
191 | Logger()(f"Accuracy {key} for class '{tp}' is {acc:.2f}")
192 | Logger().log_value('{}_epoch.tdiuc{}.perQuestionType.{}'.format(self.mode, key, tp), acc, should_print=False)
193 |
194 | acc_mpt_a = float(np.mean(np.array(sum_acc)))
195 | Logger()('Arithmetic MPT Accuracy {} is {:.2f}'.format(key, acc_mpt_a))
196 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_a'.format(self.mode, key), acc_mpt_a, should_print=False)
197 |
198 | acc_mpt_h = float(stats.hmean(sum_acc))
199 | Logger()('Harmonic MPT Accuracy {} is {:.2f}'.format(key, acc_mpt_h))
200 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_h'.format(self.mode, key), acc_mpt_h, should_print=False)
201 |
202 | Logger()('---------------------------------------')
203 |             Logger()('Using per-answer normalization...'); sum_acc = []  # reset so the normalized MPT does not mix with the pass above
204 | for tp in types:
205 | per_ans_stat = defaultdict(int)
206 | for g,p in zip(self.res_by_type[key][tp+'_gt'],self.res_by_type[key][tp+'_pred']):
207 | per_ans_stat[str(g)+'_gt']+=1
208 | if g==p:
209 | per_ans_stat[str(g)]+=1
210 | unq_acc = 0
211 | for unq_ans in set(self.res_by_type[key][tp+'_gt']):
212 | acc_curr_ans = per_ans_stat[str(unq_ans)]/per_ans_stat[str(unq_ans)+'_gt']
213 | unq_acc +=acc_curr_ans
214 | acc = 100*unq_acc/len(set(self.res_by_type[key][tp+'_gt']))
215 | sum_acc.append(acc+eps)
216 | Logger()("Accuracy {} for class '{}' is {:.2f}".format(key, tp, acc))
217 | Logger().log_value('{}_epoch.tdiuc{}.perQuestionType_norm.{}'.format(self.mode, key, tp), acc, should_print=False)
218 |
219 | acc_mpt_a = float(np.mean(np.array(sum_acc)))
220 | Logger()('Arithmetic MPT Accuracy is {:.2f}'.format(acc_mpt_a))
221 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_a_norm'.format(self.mode, key), acc_mpt_a, should_print=False)
222 |
223 | acc_mpt_h = float(stats.hmean(sum_acc))
224 | Logger()('Harmonic MPT Accuracy is {:.2f}'.format(acc_mpt_h))
225 | Logger().log_value('{}_epoch.tdiuc{}.acc_mpt_h_norm'.format(self.mode, key), acc_mpt_h, should_print=False)
226 |
--------------------------------------------------------------------------------
/cfvqa/models/networks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuleiniu/cfvqa/736795bef50db9c76818f9a08202c7f325489afd/cfvqa/models/networks/__init__.py
--------------------------------------------------------------------------------
/cfvqa/models/networks/cfvqa.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from block.models.networks.mlp import MLP
4 | from .utils import grad_mul_const # mask_softmax, grad_reverse, grad_reverse_mask,
5 |
6 | eps = 1e-12
7 |
8 | class CFVQA(nn.Module):
9 | """
10 |     Wraps a base VQA model for counterfactual inference.
11 |     The wrapped model must return a dictionary containing the 'logits' key (predictions before softmax).
12 |     Returns:
13 |     - logits_vq: the original predictions of the wrapped VQ model
14 |     - logits_q: the predictions from the question-only branch
15 |     - logits_v: the predictions from the vision-only branch
16 |     - logits_all: the fused predictions of all branches, i.e., the total effect (TE)
17 |     - logits_cfvqa: the debiased CF-VQA predictions, i.e., the total indirect effect (TIE)
18 |     => Use `logits_all`, `logits_q` and `logits_v` for the loss
19 | """
20 | def __init__(self, model, output_size, classif_q, classif_v, fusion_mode, end_classif=True, is_va=True):
21 | super().__init__()
22 | self.net = model
23 | self.end_classif = end_classif
24 |
25 | assert fusion_mode in ['rubi', 'hm', 'sum'], "Fusion mode should be rubi/hm/sum."
26 | self.fusion_mode = fusion_mode
27 | self.is_va = is_va and (not fusion_mode=='rubi') # RUBi does not consider V->A
28 |
29 | # Q->A branch
30 | self.q_1 = MLP(**classif_q)
31 | if self.end_classif: # default: True (following RUBi)
32 | self.q_2 = nn.Linear(output_size, output_size)
33 |
34 | # V->A branch
35 | if self.is_va: # default: True (containing V->A)
36 | self.v_1 = MLP(**classif_v)
37 | if self.end_classif: # default: True (following RUBi)
38 | self.v_2 = nn.Linear(output_size, output_size)
39 |
40 | self.constant = nn.Parameter(torch.tensor(0.0))
41 |
42 | def forward(self, batch):
43 | out = {}
44 | # model prediction
45 | net_out = self.net(batch)
46 | logits = net_out['logits']
47 |
48 | # Q->A branch
49 | q_embedding = net_out['q_emb'] # N * q_emb
50 | q_embedding = grad_mul_const(q_embedding, 0.0) # don't backpropagate
51 | q_pred = self.q_1(q_embedding)
52 |
53 | # V->A branch
54 | if self.is_va:
55 | v_embedding = net_out['v_emb'] # N * v_emb
56 | v_embedding = grad_mul_const(v_embedding, 0.0) # don't backpropagate
57 | v_pred = self.v_1(v_embedding)
58 | else:
59 | v_pred = None
60 |
61 |         # q, k and v are all facts: the total effect (TE)
62 |         z_qkv = self.fusion(logits, q_pred, v_pred, q_fact=True, k_fact=True, v_fact=True)
63 |         # q is the fact while k and v are counterfactuals: the natural direct effect of the question (NDE)
64 |         z_q = self.fusion(logits, q_pred, v_pred, q_fact=True, k_fact=False, v_fact=False)
65 |
66 |         logits_cfvqa = z_qkv - z_q  # total indirect effect: TIE = TE - NDE
67 |
68 | if self.end_classif:
69 | q_out = self.q_2(q_pred)
70 | if self.is_va:
71 | v_out = self.v_2(v_pred)
72 | else:
73 | q_out = q_pred
74 | if self.is_va:
75 | v_out = v_pred
76 |
77 |         out['logits_all'] = z_qkv # TE, for optimization
78 |         out['logits_vq'] = logits # predictions of the original VQ branch
79 |         out['logits_cfvqa'] = logits_cfvqa # debiased CF-VQA predictions, i.e., TIE
80 |         out['logits_q'] = q_out # for optimization of the Q->A branch
81 |         if self.is_va:
82 |             out['logits_v'] = v_out # for optimization of the V->A branch
83 |
84 |         if self.is_va:
85 |             out['z_nde'] = self.fusion(logits.clone().detach(), q_pred.clone().detach(), v_pred.clone().detach(), q_fact=True, k_fact=False, v_fact=False) # detached NDE, recomputed for the criterion
86 |         else:
87 |             out['z_nde'] = self.fusion(logits.clone().detach(), q_pred.clone().detach(), None, q_fact=True, k_fact=False, v_fact=False) # detached NDE, recomputed for the criterion
88 |
89 | return out
90 |
91 | def process_answers(self, out, key=''):
92 | out = self.net.process_answers(out, key='_all')
93 | out = self.net.process_answers(out, key='_vq')
94 | out = self.net.process_answers(out, key='_cfvqa')
95 | out = self.net.process_answers(out, key='_q')
96 | if self.is_va:
97 | out = self.net.process_answers(out, key='_v')
98 | return out
99 |
100 | def fusion(self, z_k, z_q, z_v, q_fact=False, k_fact=False, v_fact=False):
101 |
102 | z_k, z_q, z_v = self.transform(z_k, z_q, z_v, q_fact, k_fact, v_fact)
103 |
104 | if self.fusion_mode == 'rubi':
105 | z = z_k * torch.sigmoid(z_q)
106 |
107 | elif self.fusion_mode == 'hm':
108 | if self.is_va:
109 | z = z_k * z_q * z_v
110 | else:
111 | z = z_k * z_q
112 | z = torch.log(z + eps) - torch.log1p(z)
113 |
114 | elif self.fusion_mode == 'sum':
115 | if self.is_va:
116 | z = z_k + z_q + z_v
117 | else:
118 | z = z_k + z_q
119 | z = torch.log(torch.sigmoid(z) + eps)
120 |
121 | return z
122 |
123 | def transform(self, z_k, z_q, z_v, q_fact=False, k_fact=False, v_fact=False):
124 |
125 |         if not k_fact:
126 |             z_k = self.constant * torch.ones_like(z_k)  # ones_like already matches the device of z_k, so no .cuda() is needed
127 |
128 |         if not q_fact:
129 |             z_q = self.constant * torch.ones_like(z_q)
130 |
131 |         if self.is_va:
132 |             if not v_fact:
133 |                 z_v = self.constant * torch.ones_like(z_v)
134 |
135 | if self.fusion_mode == 'hm':
136 | z_k = torch.sigmoid(z_k)
137 | z_q = torch.sigmoid(z_q)
138 | if self.is_va:
139 | z_v = torch.sigmoid(z_v)
140 |
141 | return z_k, z_q, z_v
--------------------------------------------------------------------------------
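The heart of CFVQA.forward is a subtraction of two fused terms: the total effect (all branches factual) minus the natural direct effect (only the question factual). A minimal numeric sketch for the 'sum' fusion mode without the V->A branch; names and shapes are illustrative:

    import torch

    eps = 1e-12
    c = torch.tensor(0.0)       # stands in for the learned constant used for counterfactual branches
    z_k = torch.randn(2, 10)    # VQ-branch logits
    z_q = torch.randn(2, 10)    # Q->A branch logits

    def fuse(zk, zq):
        return torch.log(torch.sigmoid(zk + zq) + eps)

    z_te = fuse(z_k, z_q)                        # TE: both branches factual
    z_nde = fuse(c * torch.ones_like(z_k), z_q)  # NDE: VQ branch replaced by the constant
    logits_cfvqa = z_te - z_nde                  # TIE, used for debiased inference
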
/cfvqa/models/networks/factory.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import copy
3 | import torch
4 | import torch.nn as nn
5 | import os
6 | import json
7 | from bootstrap.lib.options import Options
8 | from bootstrap.models.networks.data_parallel import DataParallel
9 | from block.models.networks.vqa_net import VQANet as AttentionNet
10 | from bootstrap.lib.logger import Logger
11 |
12 | from .rubi import RUBiNet
13 | from .cfvqa import CFVQA
14 |
15 | def factory(engine):
16 | mode = list(engine.dataset.keys())[0]
17 | dataset = engine.dataset[mode]
18 | opt = Options()['model.network']
19 |
20 |
21 | if opt['base'] == 'smrl':
22 | from .smrl_net import SMRLNet as BaselineNet
23 | elif opt['base'] == 'updn':
24 | from .updn_net import UpDnNet as BaselineNet
25 | elif opt['base'] == 'san':
26 | from .san_net import SANNet as BaselineNet
27 | else:
28 | raise ValueError(opt['base'])
29 |
30 | orig_net = BaselineNet(
31 | txt_enc=opt['txt_enc'],
32 | self_q_att=opt['self_q_att'],
33 | agg=opt['agg'],
34 | classif=opt['classif'],
35 | wid_to_word=dataset.wid_to_word,
36 | word_to_wid=dataset.word_to_wid,
37 | aid_to_ans=dataset.aid_to_ans,
38 | ans_to_aid=dataset.ans_to_aid,
39 | fusion=opt['fusion'],
40 | residual=opt['residual'],
41 | q_single=opt['q_single'],
42 | )
43 |
44 | if opt['name'] == 'baseline':
45 | net = orig_net
46 |
47 | elif opt['name'] == 'rubi':
48 | net = RUBiNet(
49 | model=orig_net,
50 | output_size=len(dataset.aid_to_ans),
51 | classif=opt['rubi_params']['mlp_q']
52 | )
53 |
54 | elif opt['name'] == 'cfvqa':
55 | net = CFVQA(
56 | model=orig_net,
57 | output_size=len(dataset.aid_to_ans),
58 | classif_q=opt['cfvqa_params']['mlp_q'],
59 | classif_v=opt['cfvqa_params']['mlp_v'],
60 | fusion_mode=opt['fusion_mode'],
61 | is_va=True
62 | )
63 |
64 | elif opt['name'] == 'cfvqasimple':
65 | net = CFVQA(
66 | model=orig_net,
67 | output_size=len(dataset.aid_to_ans),
68 | classif_q=opt['cfvqa_params']['mlp_q'],
69 | classif_v=None,
70 | fusion_mode=opt['fusion_mode'],
71 | is_va=False
72 | )
73 |
74 | else:
75 | raise ValueError(opt['name'])
76 |
77 | if Options()['misc.cuda'] and torch.cuda.device_count() > 1:
78 | net = DataParallel(net)
79 |
80 | return net
81 |
82 |
--------------------------------------------------------------------------------
/cfvqa/models/networks/rubi.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from block.models.networks.mlp import MLP
4 | from .utils import grad_mul_const # mask_softmax, grad_reverse, grad_reverse_mask,
5 |
6 |
7 | class RUBiNet(nn.Module):
8 | """
9 |     Wraps a base VQA model with the RUBi question-only branch.
10 |     The wrapped model must return a dictionary containing the 'logits' key (predictions before softmax).
11 |     Returns:
12 |     - logits: the original predictions of the wrapped model
13 |     - logits_q: the predictions from the question-only branch
14 |     - logits_all: the model predictions rescaled by the question-only mask (read as logits_rubi by the criterion)
15 |     => Use `logits_all` and `logits_q` for the loss
16 | """
17 | def __init__(self, model, output_size, classif, end_classif=True):
18 | super().__init__()
19 | self.net = model
20 | self.c_1 = MLP(**classif)
21 | self.end_classif = end_classif
22 | if self.end_classif:
23 | self.c_2 = nn.Linear(output_size, output_size)
24 |
25 | def forward(self, batch):
26 | out = {}
27 | # model prediction
28 | net_out = self.net(batch)
29 | logits = net_out['logits']
30 |
31 | q_embedding = net_out['q_emb'] # N * q_emb
32 | q_embedding = grad_mul_const(q_embedding, 0.0) # don't backpropagate through question encoder
33 | q_pred = self.c_1(q_embedding)
34 | fusion_pred = logits * torch.sigmoid(q_pred)
35 |
36 | if self.end_classif:
37 | q_out = self.c_2(q_pred)
38 | else:
39 | q_out = q_pred
40 |
41 | out['logits'] = net_out['logits']
42 | out['logits_all'] = fusion_pred
43 | out['logits_q'] = q_out
44 | return out
45 |
46 | def process_answers(self, out, key=''):
47 | out = self.net.process_answers(out)
48 | out = self.net.process_answers(out, key='_all')
49 | out = self.net.process_answers(out, key='_q')
50 | return out
51 |
--------------------------------------------------------------------------------
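The entire RUBi intervention is one elementwise rescaling of the base logits by a question-only gate; a two-line sketch with made-up shapes:

    import torch

    logits = torch.randn(2, 10)                  # base model predictions
    q_pred = torch.randn(2, 10)                  # question-only predictions
    logits_all = logits * torch.sigmoid(q_pred)  # answers favoured by the question bias are amplified during training
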
/cfvqa/models/networks/san_net.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | import itertools
3 | import os
4 | import numpy as np
5 | import scipy
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | from bootstrap.lib.options import Options
10 | from bootstrap.lib.logger import Logger
11 | import block
12 | from block.models.networks.vqa_net import factory_text_enc
13 | from block.models.networks.mlp import MLP
14 |
15 | from .utils import mask_softmax
16 |
17 | from torch.nn.utils.weight_norm import weight_norm
18 | from torch.autograd import Variable
19 |
20 | class SANNet(nn.Module):
21 |
22 | def __init__(self,
23 | txt_enc={},
24 | self_q_att=False,
25 | agg={},
26 | classif={},
27 | wid_to_word={},
28 | word_to_wid={},
29 | aid_to_ans=[],
30 | ans_to_aid={},
31 | fusion={},
32 | residual=False,
33 | q_single=False
34 | ):
35 | super().__init__()
36 | self.self_q_att = self_q_att
37 | self.agg = agg
38 | assert self.agg['type'] in ['max', 'mean']
39 | self.classif = classif
40 | self.wid_to_word = wid_to_word
41 | self.word_to_wid = word_to_wid
42 | self.aid_to_ans = aid_to_ans
43 | self.ans_to_aid = ans_to_aid
44 | self.fusion = fusion
45 | self.residual = residual
46 |
47 | # Modules
48 | self.txt_enc = self.get_text_enc(self.wid_to_word, txt_enc)
49 | if self.self_q_att:
50 | self.q_att_linear0 = nn.Linear(2400, 512)
51 | self.q_att_linear1 = nn.Linear(512, 2)
52 |
53 | if q_single:
54 | self.txt_enc_single = self.get_text_enc(self.wid_to_word, txt_enc)
55 | if self.self_q_att:
56 | self.q_att_linear0_single = nn.Linear(2400, 512)
57 | self.q_att_linear1_single = nn.Linear(512, 2)
58 |
59 | if self.classif['mlp']['dimensions'][-1] != len(self.aid_to_ans):
60 |             Logger()(f"Warning, the classif_mm output dimension ({self.classif['mlp']['dimensions'][-1]}) "
61 | f"doesn't match the number of answers ({len(self.aid_to_ans)}). Modifying the output dimension.")
62 | self.classif['mlp']['dimensions'][-1] = len(self.aid_to_ans)
63 |
64 | self.classif_module = MLP(**self.classif['mlp'])
65 |
66 | # UpDn
67 | q_dim = self.fusion['input_dims'][0]
68 | v_dim = self.fusion['input_dims'][1]
69 | output_dim = self.fusion['output_dim']
70 | att_size = 512
71 | self.v_att = Attention(v_dim, v_dim, att_size, 36, output_dim, drop_ratio=0.5)
72 | self.txt_enc.rnn = QuestionEmbedding(620, q_dim, 1, False, 0.0)
73 |
74 | self.q_net = FCNet([q_dim, output_dim])
75 | # self.v_net = FCNet([v_dim, output_dim])
76 |
77 | Logger().log_value('nparams',
78 | sum(p.numel() for p in self.parameters() if p.requires_grad),
79 | should_print=True)
80 |
81 | Logger().log_value('nparams_txt_enc',
82 | self.get_nparams_txt_enc(),
83 | should_print=True)
84 |
85 |
86 | def get_text_enc(self, vocab_words, options):
87 | """
88 | returns the text encoding network.
89 | """
90 | return factory_text_enc(self.wid_to_word, options)
91 |
92 | def get_nparams_txt_enc(self):
93 | params = [p.numel() for p in self.txt_enc.parameters() if p.requires_grad]
94 | if self.self_q_att:
95 | params += [p.numel() for p in self.q_att_linear0.parameters() if p.requires_grad]
96 | params += [p.numel() for p in self.q_att_linear1.parameters() if p.requires_grad]
97 | return sum(params)
98 |
99 | def forward(self, batch):
100 | v = batch['visual']
101 | q = batch['question']
102 | l = batch['lengths'].data
103 | c = batch['norm_coord']
104 | nb_regions = batch.get('nb_regions')
105 |
106 | out = {}
107 |
108 | q_emb = self.process_question(q, l,)
109 | out['v_emb'] = v.mean(1)
110 | out['q_emb'] = q_emb
111 |
112 | # single txt encoder
113 |         if getattr(self, 'txt_enc_single', None) is not None:  # the attribute is only created when q_single=True
114 | out['q_emb'] = self.process_question(q, l, self.txt_enc_single, self.q_att_linear0_single, self.q_att_linear1_single)
115 |
116 | # New
117 | q_repr = self.q_net(q_emb)
118 | joint_repr = self.v_att(q_repr, v)
119 |
120 | logits = self.classif_module(joint_repr)
121 | out['logits'] = logits
122 |
123 | return out
124 |
125 | def process_question(self, q, l, txt_enc=None, q_att_linear0=None, q_att_linear1=None):
126 | if txt_enc is None:
127 | txt_enc = self.txt_enc
128 | q_emb = txt_enc.embedding(q)
129 | q = txt_enc.rnn(q_emb)
130 | return q
131 |
132 | def process_answers(self, out, key=''):
133 | batch_size = out[f'logits{key}'].shape[0]
134 | _, pred = out[f'logits{key}'].data.max(1)
135 | pred.squeeze_()
136 | if batch_size != 1:
137 | out[f'answers{key}'] = [self.aid_to_ans[pred[i].item()] for i in range(batch_size)]
138 | out[f'answer_ids{key}'] = [pred[i].item() for i in range(batch_size)]
139 | else:
140 | out[f'answers{key}'] = [self.aid_to_ans[pred.item()]]
141 | out[f'answer_ids{key}'] = [pred.item()]
142 | return out
143 |
144 | class Attention(nn.Module): # Extend PyTorch's Module class
145 | def __init__(self, v_dim, q_dim, att_size, img_seq_size, output_size, drop_ratio):
146 | super(Attention, self).__init__() # Must call super __init__()
147 | self.v_dim = v_dim
148 | self.q_dim = q_dim
149 | self.att_size = att_size
150 | self.img_seq_size = img_seq_size
151 | self.output_size = output_size
152 | self.drop_ratio = drop_ratio
153 |
154 | self.tan = nn.Tanh()
155 | self.dp = nn.Dropout(drop_ratio)
156 |         self.sf = nn.Softmax(dim=1)  # explicit dim: inputs are reshaped to [batch, img_seq_size]
157 |
158 | self.fc11 = nn.Linear(q_dim, 768, bias=True)
159 | # self.fc111 = nn.Linear(768, 640, bias=True)
160 | self.fc111 = nn.Linear(768, att_size, bias=True)
161 | self.fc12 = nn.Linear(v_dim, 768, bias=False)
162 | # self.fc121 = nn.Linear(768, 640, bias=False)
163 | self.fc121 = nn.Linear(768, att_size, bias=False)
164 | self.linear_second = nn.Linear(att_size, att_size, bias=False)
165 | # self.linear_second = nn.Linear(att_size, img_seq_size, bias=False)
166 | self.fc13 = nn.Linear(att_size, 1, bias=True)
167 |
168 | self.fc21 = nn.Linear(q_dim, att_size, bias=True)
169 | self.fc22 = nn.Linear(v_dim, att_size, bias=False)
170 | self.fc23 = nn.Linear(att_size, 1, bias=True)
171 |
172 | self.fc = nn.Linear(v_dim, output_size, bias=True)
173 |
174 | # d = input_size | m = img_seq_size | k = att_size
175 | def forward(self, ques_feat, img_feat): # ques_feat -- [batch, d] | img_feat -- [batch_size, m, d]
176 | # print(img_feat.size(), ques_feat.size())
177 | # print(self.v_dim, self.q_dim)
178 | # print("=======================================================================")
179 | B = ques_feat.size(0)
180 |
181 | # Stack 1
182 |
183 | ques_emb_1 = self.fc11(ques_feat)
184 | ques_emb_1 = self.fc111(ques_emb_1) # [batch_size, att_size]
185 | img_emb_1 = self.fc12(img_feat)
186 | img_emb_1 = self.fc121(img_emb_1)
187 |
188 | # print(ques_emb_1.size(), img_emb_1.size())
189 | # print("=======================================================================")
190 |
191 | # h1 = self.tan(ques_emb_1.view(B, 1, self.att_size) + img_emb_1)
192 | h1 = self.tan(ques_emb_1.view(B, 1, self.att_size) + img_emb_1)
193 | h1_emb = self.linear_second(h1)
194 | h1_emb = self.fc13(h1_emb)
195 |
196 | p1 = self.sf(h1_emb.view(-1, self.img_seq_size)).view(B, 1, self.img_seq_size)
197 |
198 | # Weighted sum
199 | img_att1 = p1.matmul(img_feat)
200 | u1 = ques_feat + img_att1.view(-1, self.v_dim)
201 |
202 | # Stack 2
203 | ques_emb_2 = self.fc21(u1) # [batch_size, att_size]
204 | img_emb_2 = self.fc22(img_feat)
205 |
206 | h2 = self.tan(ques_emb_2.view(B, 1, self.att_size) + img_emb_2)
207 |
208 | h2_emb = self.fc23(self.dp(h2))
209 | p2 = self.sf(h2_emb.view(-1, self.img_seq_size)).view(B, 1, self.img_seq_size)
210 |
211 | # Weighted sum
212 | img_att2 = p2.matmul(img_feat)
213 | u2 = u1 + img_att2.view(-1, self.v_dim)
214 |
215 | return u2
216 |
217 | class FCNet(nn.Module):
218 |     """Simple class for a non-linear fully connected network
219 | """
220 | def __init__(self, dims):
221 | super(FCNet, self).__init__()
222 |
223 | layers = []
224 | for i in range(len(dims)-2):
225 | in_dim = dims[i]
226 | out_dim = dims[i+1]
227 | layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None))
228 | layers.append(nn.ReLU())
229 | layers.append(weight_norm(nn.Linear(dims[-2], dims[-1]), dim=None))
230 | layers.append(nn.ReLU())
231 |
232 | self.main = nn.Sequential(*layers)
233 |
234 | def forward(self, x):
235 | return self.main(x)
236 |
237 |
238 | class QuestionEmbedding(nn.Module):
239 | def __init__(self, in_dim, num_hid, nlayers, bidirect, dropout, rnn_type='GRU'):
240 | """Module for question embedding
241 | """
242 | super(QuestionEmbedding, self).__init__()
243 | assert rnn_type == 'LSTM' or rnn_type == 'GRU'
244 | rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU
245 |
246 | self.rnn = rnn_cls(
247 | in_dim, num_hid, nlayers,
248 | bidirectional=bidirect,
249 | dropout=dropout,
250 | batch_first=True)
251 |
252 | self.in_dim = in_dim
253 | self.num_hid = num_hid
254 | self.nlayers = nlayers
255 | self.rnn_type = rnn_type
256 | self.ndirections = 1 + int(bidirect)
257 |
258 | def init_hidden(self, batch):
259 | # just to get the type of tensor
260 | weight = next(self.parameters()).data
261 | hid_shape = (self.nlayers * self.ndirections, batch, self.num_hid)
262 | if self.rnn_type == 'LSTM':
263 | return (Variable(weight.new(*hid_shape).zero_()),
264 | Variable(weight.new(*hid_shape).zero_()))
265 | else:
266 | return Variable(weight.new(*hid_shape).zero_())
267 |
268 | def forward(self, x):
269 | # x: [batch, sequence, in_dim]
270 | batch = x.size(0)
271 | hidden = self.init_hidden(batch)
272 | self.rnn.flatten_parameters()
273 | output, hidden = self.rnn(x, hidden)
274 |
275 | if self.ndirections == 1:
276 | return output[:, -1]
277 |
278 | forward_ = output[:, -1, :self.num_hid]
279 | backward = output[:, 0, self.num_hid:]
280 | return torch.cat((forward_, backward), dim=1)
281 |
282 | def forward_all(self, x):
283 | # x: [batch, sequence, in_dim]
284 | batch = x.size(0)
285 | hidden = self.init_hidden(batch)
286 | self.rnn.flatten_parameters()
287 | output, hidden = self.rnn(x, hidden)
288 | return output
289 |
--------------------------------------------------------------------------------
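Attention.forward above is the classic two-hop SAN refinement: attend, add the attended image feature to the query, attend again. Stripped of the learned projections, the update reduces to the sketch below (the attention maps are random stand-ins for the learned ones):

    import torch

    B, m, d = 2, 36, 512
    img = torch.randn(B, m, d)                       # region features
    u0 = torch.randn(B, d)                           # initial question query
    p1 = torch.softmax(torch.randn(B, 1, m), dim=2)  # first attention map (stand-in)
    u1 = u0 + p1.matmul(img).view(B, d)              # first refined query
    p2 = torch.softmax(torch.randn(B, 1, m), dim=2)  # second attention map (stand-in)
    u2 = u1 + p2.matmul(img).view(B, d)              # returned as the joint representation
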
/cfvqa/models/networks/smrl_net.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | import itertools
3 | import os
4 | import numpy as np
5 | import scipy
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | from bootstrap.lib.options import Options
10 | from bootstrap.lib.logger import Logger
11 | import block
12 | from block.models.networks.vqa_net import factory_text_enc
13 | from block.models.networks.mlp import MLP
14 |
15 | from .utils import mask_softmax
16 |
17 | class SMRLNet(nn.Module):
18 |
19 | def __init__(self,
20 | txt_enc={},
21 | self_q_att=False,
22 | agg={},
23 | classif={},
24 | wid_to_word={},
25 | word_to_wid={},
26 | aid_to_ans=[],
27 | ans_to_aid={},
28 | fusion={},
29 | residual=False,
30 | q_single=False,
31 | ):
32 | super().__init__()
33 | self.self_q_att = self_q_att
34 | self.agg = agg
35 | assert self.agg['type'] in ['max', 'mean']
36 | self.classif = classif
37 | self.wid_to_word = wid_to_word
38 | self.word_to_wid = word_to_wid
39 | self.aid_to_ans = aid_to_ans
40 | self.ans_to_aid = ans_to_aid
41 | self.fusion = fusion
42 | self.residual = residual
43 |
44 | # Modules
45 | self.txt_enc = self.get_text_enc(self.wid_to_word, txt_enc)
46 | if self.self_q_att:
47 | self.q_att_linear0 = nn.Linear(2400, 512)
48 | self.q_att_linear1 = nn.Linear(512, 2)
49 |
50 | if q_single:
51 | self.txt_enc_single = self.get_text_enc(self.wid_to_word, txt_enc)
52 | if self.self_q_att:
53 | self.q_att_linear0_single = nn.Linear(2400, 512)
54 | self.q_att_linear1_single = nn.Linear(512, 2)
55 | else:
56 | self.txt_enc_single = None
57 |
58 | self.fusion_module = block.factory_fusion(self.fusion)
59 |
60 | if self.classif['mlp']['dimensions'][-1] != len(self.aid_to_ans):
61 |             Logger()(f"Warning, the classif_mm output dimension ({self.classif['mlp']['dimensions'][-1]}) "
62 | f"doesn't match the number of answers ({len(self.aid_to_ans)}). Modifying the output dimension.")
63 | self.classif['mlp']['dimensions'][-1] = len(self.aid_to_ans)
64 |
65 | self.classif_module = MLP(**self.classif['mlp'])
66 |
67 | Logger().log_value('nparams',
68 | sum(p.numel() for p in self.parameters() if p.requires_grad),
69 | should_print=True)
70 |
71 | Logger().log_value('nparams_txt_enc',
72 | self.get_nparams_txt_enc(),
73 | should_print=True)
74 |
75 |
76 | def get_text_enc(self, vocab_words, options):
77 | """
78 | returns the text encoding network.
79 | """
80 | return factory_text_enc(self.wid_to_word, options)
81 |
82 | def get_nparams_txt_enc(self):
83 | params = [p.numel() for p in self.txt_enc.parameters() if p.requires_grad]
84 | if self.self_q_att:
85 | params += [p.numel() for p in self.q_att_linear0.parameters() if p.requires_grad]
86 | params += [p.numel() for p in self.q_att_linear1.parameters() if p.requires_grad]
87 | return sum(params)
88 |
89 | def process_fusion(self, q, mm):
90 | bsize = mm.shape[0]
91 | n_regions = mm.shape[1]
92 |
93 | mm = mm.contiguous().view(bsize*n_regions, -1)
94 | mm = self.fusion_module([q, mm])
95 | mm = mm.view(bsize, n_regions, -1)
96 | return mm
97 |
98 | def forward(self, batch):
99 | v = batch['visual']
100 | q = batch['question']
101 | l = batch['lengths'].data
102 | c = batch['norm_coord']
103 | nb_regions = batch.get('nb_regions')
104 | bsize = v.shape[0]
105 | n_regions = v.shape[1]
106 |
107 | out = {}
108 |
109 | q = self.process_question(q, l,)
110 | out['q_emb'] = q
111 | q_expand = q[:,None,:].expand(bsize, n_regions, q.shape[1])
112 | q_expand = q_expand.contiguous().view(bsize*n_regions, -1)
113 |
114 |         # single txt encoder (q now holds the pooled embedding, so re-encode from the raw question tokens)
115 |         if self.txt_enc_single is not None:
116 |             out['q_emb'] = self.process_question(batch['question'], l, self.txt_enc_single, self.q_att_linear0_single, self.q_att_linear1_single)
117 |
118 | mm = self.process_fusion(q_expand, v,)
119 |
120 | if self.residual:
121 | mm = v + mm
122 |
123 |         if self.agg['type'] == 'max':
124 |             mm, mm_argmax = torch.max(mm, 1)
125 |         elif self.agg['type'] == 'mean':
126 |             mm, mm_argmax = mm.mean(1), None  # mean pooling has no argmax; avoids a NameError below
127 |
128 | out['v_emb'] = v.mean(1)
129 | out['mm'] = mm
130 | out['mm_argmax'] = mm_argmax
131 |
132 | logits = self.classif_module(mm)
133 | out['logits'] = logits
134 | return out
135 |
136 | def process_question(self, q, l, txt_enc=None, q_att_linear0=None, q_att_linear1=None):
137 | if txt_enc is None:
138 | txt_enc = self.txt_enc
139 | if q_att_linear0 is None:
140 | q_att_linear0 = self.q_att_linear0
141 | if q_att_linear1 is None:
142 | q_att_linear1 = self.q_att_linear1
143 | q_emb = txt_enc.embedding(q)
144 |
145 | q, _ = txt_enc.rnn(q_emb)
146 |
147 | if self.self_q_att:
148 | q_att = q_att_linear0(q)
149 | q_att = F.relu(q_att)
150 | q_att = q_att_linear1(q_att)
151 | q_att = mask_softmax(q_att, l)
152 | #self.q_att_coeffs = q_att
153 | if q_att.size(2) > 1:
154 | q_atts = torch.unbind(q_att, dim=2)
155 | q_outs = []
156 | for q_att in q_atts:
157 | q_att = q_att.unsqueeze(2)
158 | q_att = q_att.expand_as(q)
159 | q_out = q_att*q
160 | q_out = q_out.sum(1)
161 | q_outs.append(q_out)
162 | q = torch.cat(q_outs, dim=1)
163 | else:
164 | q_att = q_att.expand_as(q)
165 | q = q_att * q
166 | q = q.sum(1)
167 | else:
168 | # l contains the number of words for each question
169 | # in case of multi-gpus it must be a Tensor
170 | # thus we convert it into a list during the forward pass
171 | l = list(l.data[:,0])
172 | q = txt_enc._select_last(q, l)
173 |
174 | return q
175 |
176 | def process_answers(self, out, key=''):
177 | batch_size = out[f'logits{key}'].shape[0]
178 | _, pred = out[f'logits{key}'].data.max(1)
179 | pred.squeeze_()
180 | if batch_size != 1:
181 | out[f'answers{key}'] = [self.aid_to_ans[pred[i].item()] for i in range(batch_size)]
182 | out[f'answer_ids{key}'] = [pred[i].item() for i in range(batch_size)]
183 | else:
184 | out[f'answers{key}'] = [self.aid_to_ans[pred.item()]]
185 | out[f'answer_ids{key}'] = [pred.item()]
186 | return out
187 |
--------------------------------------------------------------------------------
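process_question pools the word features with a 2-glimpse self-attention. With the linear layers and length masking abstracted away, the pooling reduces to the following (shapes assumed; a plain softmax stands in for mask_softmax):

    import torch

    q = torch.randn(4, 14, 2400)                         # word features: batch x words x dim
    q_att = torch.softmax(torch.randn(4, 14, 2), dim=1)  # two attention maps over the words
    glimpses = [(q_att[:, :, g:g + 1] * q).sum(1) for g in range(2)]
    q_pooled = torch.cat(glimpses, dim=1)                # batch x 4800, stored as out['q_emb']
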
/cfvqa/models/networks/updn_net.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | import itertools
3 | import os
4 | import numpy as np
5 | import scipy
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | from bootstrap.lib.options import Options
10 | from bootstrap.lib.logger import Logger
11 | import block
12 | from block.models.networks.vqa_net import factory_text_enc
13 | from block.models.networks.mlp import MLP
14 |
15 | from .utils import mask_softmax
16 |
17 | from torch.nn.utils.weight_norm import weight_norm
18 |
19 | class UpDnNet(nn.Module):
20 |
21 | def __init__(self,
22 | txt_enc={},
23 | self_q_att=False,
24 | agg={},
25 | classif={},
26 | wid_to_word={},
27 | word_to_wid={},
28 | aid_to_ans=[],
29 | ans_to_aid={},
30 | fusion={},
31 | residual=False,
32 | q_single=False,
33 | ):
34 | super().__init__()
35 | self.self_q_att = self_q_att
36 | self.agg = agg
37 | assert self.agg['type'] in ['max', 'mean']
38 | self.classif = classif
39 | self.wid_to_word = wid_to_word
40 | self.word_to_wid = word_to_wid
41 | self.aid_to_ans = aid_to_ans
42 | self.ans_to_aid = ans_to_aid
43 | self.fusion = fusion
44 | self.residual = residual
45 |
46 | # Modules
47 | self.txt_enc = self.get_text_enc(self.wid_to_word, txt_enc)
48 | if self.self_q_att:
49 | self.q_att_linear0 = nn.Linear(2400, 512)
50 | self.q_att_linear1 = nn.Linear(512, 2)
51 |
52 | if q_single:
53 | self.txt_enc_single = self.get_text_enc(self.wid_to_word, txt_enc)
54 | if self.self_q_att:
55 | self.q_att_linear0_single = nn.Linear(2400, 512)
56 | self.q_att_linear1_single = nn.Linear(512, 2)
57 | else:
58 | self.txt_enc_single = None
59 |
60 | if self.classif['mlp']['dimensions'][-1] != len(self.aid_to_ans):
61 |             Logger()(f"Warning, the classif_mm output dimension ({self.classif['mlp']['dimensions'][-1]}) "
62 | f"doesn't match the number of answers ({len(self.aid_to_ans)}). Modifying the output dimension.")
63 | self.classif['mlp']['dimensions'][-1] = len(self.aid_to_ans)
64 |
65 | self.classif_module = MLP(**self.classif['mlp'])
66 |
67 | # UpDn
68 | q_dim = self.fusion['input_dims'][0]
69 | v_dim = self.fusion['input_dims'][1]
70 | output_dim = self.fusion['output_dim']
71 | self.v_att = Attention(v_dim, q_dim, output_dim)
72 | self.q_net = FCNet([q_dim, output_dim])
73 | self.v_net = FCNet([v_dim, output_dim])
74 |
75 | Logger().log_value('nparams',
76 | sum(p.numel() for p in self.parameters() if p.requires_grad),
77 | should_print=True)
78 |
79 | Logger().log_value('nparams_txt_enc',
80 | self.get_nparams_txt_enc(),
81 | should_print=True)
82 |
83 |
84 | def get_text_enc(self, vocab_words, options):
85 | """
86 | returns the text encoding network.
87 | """
88 | return factory_text_enc(self.wid_to_word, options)
89 |
90 | def get_nparams_txt_enc(self):
91 | params = [p.numel() for p in self.txt_enc.parameters() if p.requires_grad]
92 | if self.self_q_att:
93 | params += [p.numel() for p in self.q_att_linear0.parameters() if p.requires_grad]
94 | params += [p.numel() for p in self.q_att_linear1.parameters() if p.requires_grad]
95 | return sum(params)
96 |
97 | def forward(self, batch):
98 | v = batch['visual']
99 | q = batch['question']
100 | l = batch['lengths'].data
101 | c = batch['norm_coord']
102 | nb_regions = batch.get('nb_regions')
103 |
104 | out = {}
105 |
106 | q_emb = self.process_question(q, l,)
107 | out['v_emb'] = v.mean(1)
108 | out['q_emb'] = q_emb
109 |
110 | # single txt encoder
111 | if self.txt_enc_single is not None:
112 | out['q_emb'] = self.process_question(q, l, self.txt_enc_single, self.q_att_linear0_single, self.q_att_linear1_single)
113 |
114 | # New
115 | att = self.v_att(v, q_emb)
116 | v_emb = (att * v).sum(1)
117 | q_repr = self.q_net(q_emb)
118 | v_repr = self.v_net(v_emb)
119 | joint_repr = q_repr * v_repr
120 |
121 | logits = self.classif_module(joint_repr)
122 | out['logits'] = logits
123 |
124 | return out
125 |
126 | def process_question(self, q, l, txt_enc=None, q_att_linear0=None, q_att_linear1=None):
127 | if txt_enc is None:
128 | txt_enc = self.txt_enc
129 | if q_att_linear0 is None:
130 | q_att_linear0 = self.q_att_linear0
131 | if q_att_linear1 is None:
132 | q_att_linear1 = self.q_att_linear1
133 | q_emb = txt_enc.embedding(q)
134 |
135 | q, _ = txt_enc.rnn(q_emb)
136 |
137 | if self.self_q_att:
138 | q_att = q_att_linear0(q)
139 | q_att = F.relu(q_att)
140 | q_att = q_att_linear1(q_att)
141 | q_att = mask_softmax(q_att, l)
142 | #self.q_att_coeffs = q_att
143 | if q_att.size(2) > 1:
144 | q_atts = torch.unbind(q_att, dim=2)
145 | q_outs = []
146 | for q_att in q_atts:
147 | q_att = q_att.unsqueeze(2)
148 | q_att = q_att.expand_as(q)
149 | q_out = q_att*q
150 | q_out = q_out.sum(1)
151 | q_outs.append(q_out)
152 | q = torch.cat(q_outs, dim=1)
153 | else:
154 | q_att = q_att.expand_as(q)
155 | q = q_att * q
156 | q = q.sum(1)
157 | else:
158 | # l contains the number of words for each question
159 | # in the multi-GPU setting it must be a Tensor,
160 | # so we convert it to a list during the forward pass
161 | l = list(l.data[:,0])
162 | q = txt_enc._select_last(q, l)
163 |
164 | return q
165 |
166 | def process_answers(self, out, key=''):
167 | batch_size = out[f'logits{key}'].shape[0]
168 | _, pred = out[f'logits{key}'].data.max(1)
169 | pred.squeeze_()
170 | if batch_size != 1:
171 | out[f'answers{key}'] = [self.aid_to_ans[pred[i].item()] for i in range(batch_size)]
172 | out[f'answer_ids{key}'] = [pred[i].item() for i in range(batch_size)]
173 | else:
174 | out[f'answers{key}'] = [self.aid_to_ans[pred.item()]]
175 | out[f'answer_ids{key}'] = [pred.item()]
176 | return out
177 |
178 | class Attention(nn.Module):
179 | def __init__(self, v_dim, q_dim, num_hid, dropout=0.2):
180 | super(Attention, self).__init__()
181 |
182 | self.v_proj = FCNet([v_dim, num_hid])
183 | self.q_proj = FCNet([q_dim, num_hid])
184 | self.dropout = nn.Dropout(dropout)
185 | self.linear = weight_norm(nn.Linear(num_hid, 1), dim=None)
186 |
187 | def forward(self, v, q):
188 | """
189 | v: [batch, k, vdim]
190 | q: [batch, qdim]
191 | """
192 | logits = self.logits(v, q)
193 | w = nn.functional.softmax(logits, 1)
194 | return w
195 |
196 | def logits(self, v, q):
197 | batch, k, _ = v.size()
198 | v_proj = self.v_proj(v) # [batch, k, qdim]
199 | q_proj = self.q_proj(q).unsqueeze(1).repeat(1, k, 1)
200 | joint_repr = v_proj * q_proj
201 | joint_repr = self.dropout(joint_repr)
202 | logits = self.linear(joint_repr)
203 | return logits
204 |
205 | class FCNet(nn.Module):
206 | """Simple class for non-linear fully connect network
207 | """
208 | def __init__(self, dims):
209 | super(FCNet, self).__init__()
210 |
211 | layers = []
212 | for i in range(len(dims)-2):
213 | in_dim = dims[i]
214 | out_dim = dims[i+1]
215 | layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None))
216 | layers.append(nn.ReLU())
217 | layers.append(weight_norm(nn.Linear(dims[-2], dims[-1]), dim=None))
218 | layers.append(nn.ReLU())
219 |
220 | self.main = nn.Sequential(*layers)
221 |
222 | def forward(self, x):
223 | return self.main(x)
--------------------------------------------------------------------------------
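The Attention and FCNet modules above implement UpDn-style top-down attention: every region feature is projected into a joint space, fused multiplicatively with the projected question, scored by a weight-normed linear layer, and softmax-normalized over the k regions; the attended visual feature is then gated against the question via q_net/v_net in forward. A minimal shape-level sketch of the attention step (plain nn.Linear stands in for the weight-normed FCNet layers; dropout is omitted):

import torch
import torch.nn as nn

batch, k, v_dim, q_dim, num_hid = 2, 36, 2048, 4800, 1024

v = torch.randn(batch, k, v_dim)               # region features
q = torch.randn(batch, q_dim)                  # question embedding
v_proj = nn.Linear(v_dim, num_hid)
q_proj = nn.Linear(q_dim, num_hid)
score = nn.Linear(num_hid, 1)

joint = v_proj(v) * q_proj(q).unsqueeze(1)     # [batch, k, num_hid], broadcast over k
w = torch.softmax(score(joint), dim=1)         # attention weights over the k regions
v_att = (w * v).sum(1)                         # attended visual feature, [batch, v_dim]
assert v_att.shape == (batch, v_dim)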
/cfvqa/models/networks/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def mask_softmax(x, lengths):#, dim=1)
4 | mask = torch.zeros_like(x).to(device=x.device, non_blocking=True)
5 | t_lengths = lengths[:,:,None].expand_as(mask)
6 | arange_id = torch.arange(mask.size(1)).to(device=x.device, non_blocking=True)
7 | arange_id = arange_id[None,:,None].expand_as(mask)
8 |
9 | mask[arange_id < t_lengths] = 1
10 | # numerically stable softmax restricted to the unmasked positions:
11 | # e^x_i / sum_j e^x_j = e^(x_i - x_max) / sum_j e^(x_j - x_max)
12 | x2 = torch.exp(x - torch.max(x))
13 | x3 = x2 * mask
14 | epsilon = 1e-5
15 | x3_sum = torch.sum(x3, dim=1, keepdim=True) + epsilon
16 | x4 = x3 / x3_sum.expand_as(x3)
17 | return x4
18 |
--------------------------------------------------------------------------------
/cfvqa/optimizers/factory.py:
--------------------------------------------------------------------------------
[... lines 1-24 of this file are missing from the dump; the surviving tail is the Xavier initialization of network weights inside the optimizer factory ...]
25 | if p.dim() >= 2:
26 | nn.init.xavier_uniform_(p.data)
27 | else:
28 | raise ValueError(p.dim())
29 |
30 | return optimizer
31 |
--------------------------------------------------------------------------------
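mask_softmax restricts the attention softmax over question words to the first `lengths` positions of each padded sequence, so padding contributes zero weight. A small sanity check, assuming the reconstruction above is faithful and that lengths arrives as a [batch, 1] tensor, as in process_question:

import torch
from cfvqa.models.networks.utils import mask_softmax

x = torch.randn(2, 5, 1)               # [batch, n_words, n_glimpses]
lengths = torch.tensor([[3], [5]])     # true question lengths
att = mask_softmax(x, lengths)
print(att[0, 3:].abs().sum())          # 0: padded positions get no attention
print(att.sum(1))                      # ~1 per (batch, glimpse), up to the epsilon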
/cfvqa/options/vqa2/smrl_baseline.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqa2/smrl_baseline
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqa2 # or vqa2vg
7 | dir: data/vqa/vqa2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | vg: false
19 | model:
20 | name: default
21 | network:
22 | import: cfvqa.models.networks.factory
23 | base: smrl
24 | name: baseline
25 | rubi_params:
26 | mlp_q:
27 | input_dim: 4800
28 | dimensions: [1024,1024,3000]
29 | txt_enc:
30 | name: skipthoughts
31 | type: BayesianUniSkip
32 | dropout: 0.25
33 | fixed_emb: False
34 | dir_st: data/skip-thoughts
35 | self_q_att: True
36 | residual: False
37 | q_single: False
38 | fusion:
39 | type: block
40 | input_dims: [4800, 2048]
41 | output_dim: 2048
42 | mm_dim: 1000
43 | chunks: 20
44 | rank: 15
45 | dropout_input: 0.
46 | dropout_pre_lin: 0.
47 | agg:
48 | type: max
49 | classif:
50 | mlp:
51 | input_dim: 2048
52 | dimensions: [1024,1024,3000]
53 | criterion:
54 | import: cfvqa.models.criterions.factory
55 | name: vqa_cross_entropy
56 | metric:
57 | import: cfvqa.models.metrics.factory
58 | name: vqa_accuracies
59 | optimizer:
60 | import: cfvqa.optimizers.factory
61 | name: Adam
62 | lr: 0.0003
63 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
64 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
65 | lr_decay_epochs: [14, 24, 2] #range
66 | lr_decay_rate: .25
67 | engine:
68 | name: logger
69 | debug: False
70 | print_freq: 10
71 | nb_epochs: 22
72 | saving_criteria:
73 | - eval_epoch.accuracy_top1:max
74 | misc:
75 | logs_name:
76 | cuda: True
77 | seed: 1337
78 |
--------------------------------------------------------------------------------
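The comments in the optimizer block encode a two-phase learning-rate schedule: gradual_warmup_steps is fed to torch.linspace and lr_decay_epochs to range. One plausible reading (the actual logic lives in cfvqa/optimizers/factory.py, whose opening lines are missing from this dump) is warmup multipliers on the base lr for the first seven epochs, then a 0.25x decay every second epoch from 14 on:

import torch

base_lr = 3e-4                               # optimizer.lr
warmup = torch.linspace(0.5, 2.0, 7)         # gradual_warmup_steps
decay_epochs = range(14, 24, 2)              # lr_decay_epochs
decay_rate = 0.25                            # lr_decay_rate

lr = base_lr
for epoch in range(22):                      # engine.nb_epochs
    if epoch < len(warmup):
        lr = base_lr * warmup[epoch].item()
    elif epoch in decay_epochs:
        lr *= decay_rate
    print(epoch, f"{lr:.2e}")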
/cfvqa/options/vqa2/smrl_cfvqa_hm.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqa2/smrl_cfvqa_hm
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqa2 # or vqa2vg
7 | dir: data/vqa/vqa2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | vg: false
19 | model:
20 | name: default
21 | network:
22 | import: cfvqa.models.networks.factory
23 | base: smrl
24 | name: cfvqa
25 | fusion_mode: hm
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | mlp_v:
31 | input_dim: 2048
32 | dimensions: [1024,1024,3000]
33 | txt_enc:
34 | name: skipthoughts
35 | type: BayesianUniSkip
36 | dropout: 0.25
37 | fixed_emb: False
38 | dir_st: data/skip-thoughts
39 | self_q_att: True
40 | residual: False
41 | q_single: False
42 | fusion:
43 | type: block
44 | input_dims: [4800, 2048]
45 | output_dim: 2048
46 | mm_dim: 1000
47 | chunks: 20
48 | rank: 15
49 | dropout_input: 0.
50 | dropout_pre_lin: 0.
51 | agg:
52 | type: max
53 | classif:
54 | mlp:
55 | input_dim: 2048
56 | dimensions: [1024,1024,3000]
57 | criterion:
58 | import: cfvqa.models.criterions.factory
59 | name: cfvqa_criterion
60 | question_loss_weight: 1.0
61 | vision_loss_weight: 1.0
62 | metric:
63 | import: cfvqa.models.metrics.factory
64 | name: vqa_cfvqa_metrics
65 | optimizer:
66 | import: cfvqa.optimizers.factory
67 | name: Adam
68 | lr: 0.0003
69 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
70 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
71 | lr_decay_epochs: [14, 24, 2] #range
72 | lr_decay_rate: .25
73 | engine:
74 | name: logger
75 | debug: False
76 | print_freq: 10
77 | nb_epochs: 22
78 | saving_criteria:
79 | - eval_epoch.accuracy_all_top1:max
80 | - eval_epoch.accuracy_vq_top1:max
81 | - eval_epoch.accuracy_cfvqa_top1:max
82 | misc:
83 | logs_name:
84 | cuda: True
85 | seed: 1337
86 |
--------------------------------------------------------------------------------
/cfvqa/options/vqa2/smrl_cfvqa_sum.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqa2/smrl_cfvqa_sum
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqa2 # or vqa2vg
7 | dir: data/vqa/vqa2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | vg: false
19 | model:
20 | name: default
21 | network:
22 | import: cfvqa.models.networks.factory
23 | base: smrl
24 | name: cfvqa
25 | fusion_mode: sum
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | mlp_v:
31 | input_dim: 2048
32 | dimensions: [1024,1024,3000]
33 | txt_enc:
34 | name: skipthoughts
35 | type: BayesianUniSkip
36 | dropout: 0.25
37 | fixed_emb: False
38 | dir_st: data/skip-thoughts
39 | self_q_att: True
40 | residual: False
41 | q_single: False
42 | fusion:
43 | type: block
44 | input_dims: [4800, 2048]
45 | output_dim: 2048
46 | mm_dim: 1000
47 | chunks: 20
48 | rank: 15
49 | dropout_input: 0.
50 | dropout_pre_lin: 0.
51 | agg:
52 | type: max
53 | classif:
54 | mlp:
55 | input_dim: 2048
56 | dimensions: [1024,1024,3000]
57 | criterion:
58 | import: cfvqa.models.criterions.factory
59 | name: cfvqa_criterion
60 | question_loss_weight: 1.0
61 | vision_loss_weight: 1.0
62 | metric:
63 | import: cfvqa.models.metrics.factory
64 | name: vqa_cfvqa_metrics
65 | optimizer:
66 | import: cfvqa.optimizers.factory
67 | name: Adam
68 | lr: 0.0003
69 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
70 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
71 | lr_decay_epochs: [14, 24, 2] #range
72 | lr_decay_rate: .25
73 | engine:
74 | name: logger
75 | debug: False
76 | print_freq: 10
77 | nb_epochs: 22
78 | saving_criteria:
79 | - eval_epoch.accuracy_all_top1:max
80 | - eval_epoch.accuracy_vq_top1:max
81 | - eval_epoch.accuracy_cfvqa_top1:max
82 | misc:
83 | logs_name:
84 | cuda: True
85 | seed: 1337
86 |
--------------------------------------------------------------------------------
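The two cfvqa configs above are identical except for fusion_mode, which selects how the fused-branch logits, the question-only logits, and the vision-only logits are combined inside cfvqa/models/networks/cfvqa.py (not included in this section). As a rough illustration of the two strategies named here, with invented variable names and an assumed log-sigmoid normalization rather than a transcription of the repository code:

import torch

z_vqk = torch.randn(2, 3000)   # fused (vision+question) branch logits
z_q = torch.randn(2, 3000)     # question-only branch
z_v = torch.randn(2, 3000)     # vision-only branch

# sum-style fusion: add branch logits, renormalize in log space
z_sum = torch.log(torch.sigmoid(z_vqk + z_q + z_v) + 1e-12)

# hm-style (harmonic) fusion: multiply sigmoided branch scores
z = torch.sigmoid(z_vqk) * torch.sigmoid(z_q) * torch.sigmoid(z_v)
z_hm = torch.log(z + 1e-12) - torch.log1p(z)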
/cfvqa/options/vqa2/smrl_cfvqasimple_hm.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqa2/smrl_cfvqasimple_hm
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqa2 # or vqa2vg
7 | dir: data/vqa/vqa2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | vg: false
19 | model:
20 | name: default
21 | network:
22 | import: cfvqa.models.networks.factory
23 | base: smrl
24 | name: cfvqasimple
25 | fusion_mode: hm
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | txt_enc:
31 | name: skipthoughts
32 | type: BayesianUniSkip
33 | dropout: 0.25
34 | fixed_emb: False
35 | dir_st: data/skip-thoughts
36 | self_q_att: True
37 | residual: False
38 | q_single: False
39 | fusion:
40 | type: block
41 | input_dims: [4800, 2048]
42 | output_dim: 2048
43 | mm_dim: 1000
44 | chunks: 20
45 | rank: 15
46 | dropout_input: 0.
47 | dropout_pre_lin: 0.
48 | agg:
49 | type: max
50 | classif:
51 | mlp:
52 | input_dim: 2048
53 | dimensions: [1024,1024,3000]
54 | criterion:
55 | import: cfvqa.models.criterions.factory
56 | name: cfvqasimple_criterion
57 | question_loss_weight: 1.0
58 | metric:
59 | import: cfvqa.models.metrics.factory
60 | name: vqa_cfvqasimple_metrics
61 | optimizer:
62 | import: cfvqa.optimizers.factory
63 | name: Adam
64 | lr: 0.0003
65 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
66 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
67 | lr_decay_epochs: [14, 24, 2] #range
68 | lr_decay_rate: .25
69 | engine:
70 | name: logger
71 | debug: False
72 | print_freq: 10
73 | nb_epochs: 22
74 | saving_criteria:
75 | - eval_epoch.accuracy_all_top1:max
76 | - eval_epoch.accuracy_vq_top1:max
77 | - eval_epoch.accuracy_cfvqa_top1:max
78 | misc:
79 | logs_name:
80 | cuda: True
81 | seed: 1337
82 |
--------------------------------------------------------------------------------
/cfvqa/options/vqa2/smrl_cfvqasimple_rubi.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqa2/smrl_cfvqasimple_rubi
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqa2 # or vqa2vg
7 | dir: data/vqa/vqa2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | vg: false
19 | model:
20 | name: default
21 | network:
22 | import: cfvqa.models.networks.factory
23 | base: smrl
24 | name: cfvqasimple
25 | fusion_mode: rubi
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | txt_enc:
31 | name: skipthoughts
32 | type: BayesianUniSkip
33 | dropout: 0.25
34 | fixed_emb: False
35 | dir_st: data/skip-thoughts
36 | self_q_att: True
37 | residual: False
38 | q_single: False
39 | fusion:
40 | type: block
41 | input_dims: [4800, 2048]
42 | output_dim: 2048
43 | mm_dim: 1000
44 | chunks: 20
45 | rank: 15
46 | dropout_input: 0.
47 | dropout_pre_lin: 0.
48 | agg:
49 | type: max
50 | classif:
51 | mlp:
52 | input_dim: 2048
53 | dimensions: [1024,1024,3000]
54 | criterion:
55 | import: cfvqa.models.criterions.factory
56 | name: cfvqasimple_criterion
57 | question_loss_weight: 1.0
58 | metric:
59 | import: cfvqa.models.metrics.factory
60 | name: vqa_cfvqasimple_metrics
61 | optimizer:
62 | import: cfvqa.optimizers.factory
63 | name: Adam
64 | lr: 0.0003
65 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
66 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
67 | lr_decay_epochs: [14, 24, 2] #range
68 | lr_decay_rate: .25
69 | engine:
70 | name: logger
71 | debug: False
72 | print_freq: 10
73 | nb_epochs: 22
74 | saving_criteria:
75 | - eval_epoch.accuracy_all_top1:max
76 | - eval_epoch.accuracy_vq_top1:max
77 | - eval_epoch.accuracy_cfvqa_top1:max
78 | misc:
79 | logs_name:
80 | cuda: True
81 | seed: 1337
82 |
--------------------------------------------------------------------------------
/cfvqa/options/vqa2/smrl_cfvqasimple_sum.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqa2/smrl_cfvqasimple_sum
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqa2 # or vqa2vg
7 | dir: data/vqa/vqa2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | vg: false
19 | model:
20 | name: default
21 | network:
22 | import: cfvqa.models.networks.factory
23 | base: smrl
24 | name: cfvqasimple
25 | fusion_mode: sum
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | txt_enc:
31 | name: skipthoughts
32 | type: BayesianUniSkip
33 | dropout: 0.25
34 | fixed_emb: False
35 | dir_st: data/skip-thoughts
36 | self_q_att: True
37 | residual: False
38 | q_single: False
39 | fusion:
40 | type: block
41 | input_dims: [4800, 2048]
42 | output_dim: 2048
43 | mm_dim: 1000
44 | chunks: 20
45 | rank: 15
46 | dropout_input: 0.
47 | dropout_pre_lin: 0.
48 | agg:
49 | type: max
50 | classif:
51 | mlp:
52 | input_dim: 2048
53 | dimensions: [1024,1024,3000]
54 | criterion:
55 | import: cfvqa.models.criterions.factory
56 | name: cfvqasimple_criterion
57 | question_loss_weight: 1.0
58 | metric:
59 | import: cfvqa.models.metrics.factory
60 | name: vqa_cfvqasimple_metrics
61 | optimizer:
62 | import: cfvqa.optimizers.factory
63 | name: Adam
64 | lr: 0.0003
65 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
66 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
67 | lr_decay_epochs: [14, 24, 2] #range
68 | lr_decay_rate: .25
69 | engine:
70 | name: logger
71 | debug: False
72 | print_freq: 10
73 | nb_epochs: 22
74 | saving_criteria:
75 | - eval_epoch.accuracy_all_top1:max
76 | - eval_epoch.accuracy_vq_top1:max
77 | - eval_epoch.accuracy_cfvqa_top1:max
78 | misc:
79 | logs_name:
80 | cuda: True
81 | seed: 1337
82 |
--------------------------------------------------------------------------------
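The cfvqasimple variants above differ from the full cfvqa configs in that they drop the vision-only branch: cfvqa_params has no mlp_v and the criterion has no vision_loss_weight. Under the same illustrative reading as the earlier sketch, sum-style fusion then combines only two branches:

# illustrative only, continuing the variable names from the previous sketch
z_sum_simple = torch.log(torch.sigmoid(z_vqk + z_q) + 1e-12)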
/cfvqa/options/vqa2/smrl_rubi.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqa2/smrl_rubi
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqa2 # or vqa2vg
7 | dir: data/vqa/vqa2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | vg: false
19 | model:
20 | name: default
21 | network:
22 | import: cfvqa.models.networks.factory
23 | base: smrl
24 | name: rubi
25 | rubi_params:
26 | mlp_q:
27 | input_dim: 4800
28 | dimensions: [1024,1024,3000]
29 | txt_enc:
30 | name: skipthoughts
31 | type: BayesianUniSkip
32 | dropout: 0.25
33 | fixed_emb: False
34 | dir_st: data/skip-thoughts
35 | self_q_att: True
36 | residual: False
37 | q_single: False
38 | fusion:
39 | type: block
40 | input_dims: [4800, 2048]
41 | output_dim: 2048
42 | mm_dim: 1000
43 | chunks: 20
44 | rank: 15
45 | dropout_input: 0.
46 | dropout_pre_lin: 0.
47 | agg:
48 | type: max
49 | classif:
50 | mlp:
51 | input_dim: 2048
52 | dimensions: [1024,1024,3000]
53 | criterion:
54 | import: cfvqa.models.criterions.factory
55 | name: rubi_criterion
56 | question_loss_weight: 1.0
57 | metric:
58 | import: cfvqa.models.metrics.factory
59 | name: vqa_rubi_metrics
60 | optimizer:
61 | import: cfvqa.optimizers.factory
62 | name: Adam
63 | lr: 0.0003
64 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
65 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
66 | lr_decay_epochs: [14, 24, 2] #range
67 | lr_decay_rate: .25
68 | engine:
69 | name: logger
70 | debug: False
71 | print_freq: 10
72 | nb_epochs: 22
73 | saving_criteria:
74 | - eval_epoch.accuracy_top1:max
75 | - eval_epoch.accuracy_all_top1:max
76 | misc:
77 | logs_name:
78 | cuda: True
79 | seed: 1337
80 |
--------------------------------------------------------------------------------
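smrl_rubi.yaml trains the RuBi baseline, whose core trick is to mask the multimodal logits with a sigmoid of the question-only logits so the main branch cannot win by exploiting question shortcuts alone. A one-line sketch of that masking as described in the RuBi paper (the repository implementation is in cfvqa/models/networks/rubi.py, outside this section):

import torch

logits_mm = torch.randn(2, 3000)                     # multimodal branch
logits_q = torch.randn(2, 3000)                      # question-only branch
logits_rubi = logits_mm * torch.sigmoid(logits_q)    # bias-masked logits used for the fused loss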
/cfvqa/options/vqacp2/smrl_baseline.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqacp2/smrl_baseline
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqacp2 # or vqa2vg
7 | dir: data/vqa/vqacp2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | model:
19 | name: default
20 | network:
21 | import: cfvqa.models.networks.factory
22 | base: smrl
23 | name: baseline
24 | txt_enc:
25 | name: skipthoughts
26 | type: BayesianUniSkip
27 | dropout: 0.25
28 | fixed_emb: False
29 | dir_st: data/skip-thoughts
30 | self_q_att: True
31 | residual: False
32 | q_single: False
33 | fusion:
34 | type: block
35 | input_dims: [4800, 2048]
36 | output_dim: 2048
37 | mm_dim: 1000
38 | chunks: 20
39 | rank: 15
40 | dropout_input: 0.
41 | dropout_pre_lin: 0.
42 | agg:
43 | type: max
44 | classif:
45 | mlp:
46 | input_dim: 2048
47 | dimensions: [1024,1024,3000]
48 | criterion:
49 | import: cfvqa.models.criterions.factory
50 | name: vqa_cross_entropy
51 | metric:
52 | import: cfvqa.models.metrics.factory
53 | name: vqa_accuracies
54 | optimizer:
55 | import: cfvqa.optimizers.factory
56 | name: Adam
57 | lr: 0.0003
58 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
59 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
60 | lr_decay_epochs: [14, 24, 2] #range
61 | lr_decay_rate: .25
62 | engine:
63 | name: logger
64 | debug: False
65 | print_freq: 10
66 | nb_epochs: 22
67 | saving_criteria:
68 | - eval_epoch.accuracy_top1:max
69 | misc:
70 | logs_name:
71 | cuda: True
72 | seed: 1337
73 |
--------------------------------------------------------------------------------
/cfvqa/options/vqacp2/smrl_cfvqa_hm.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqacp2/smrl_cfvqa_hm
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqacp2 # or vqa2vg
7 | dir: data/vqa/vqacp2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | model:
19 | name: default
20 | network:
21 | import: cfvqa.models.networks.factory
22 | base: smrl
23 | name: cfvqa
24 | fusion_mode: hm
25 | is_vq: True
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | mlp_v:
31 | input_dim: 2048
32 | dimensions: [1024,1024,3000]
33 | txt_enc:
34 | name: skipthoughts
35 | type: BayesianUniSkip
36 | dropout: 0.25
37 | fixed_emb: False
38 | dir_st: data/skip-thoughts
39 | self_q_att: True
40 | residual: False
41 | q_single: False
42 | fusion:
43 | type: block
44 | input_dims: [4800, 2048]
45 | output_dim: 2048
46 | mm_dim: 1000
47 | chunks: 20
48 | rank: 15
49 | dropout_input: 0.
50 | dropout_pre_lin: 0.
51 | agg:
52 | type: max
53 | classif:
54 | mlp:
55 | input_dim: 2048
56 | dimensions: [1024,1024,3000]
57 | criterion:
58 | import: cfvqa.models.criterions.factory
59 | name: cfvqa_criterion
60 | question_loss_weight: 1.0
61 | vision_loss_weight: 1.0
62 | metric:
63 | import: cfvqa.models.metrics.factory
64 | name: vqa_cfvqa_metrics
65 | optimizer:
66 | import: cfvqa.optimizers.factory
67 | name: Adam
68 | lr: 0.0003
69 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
70 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
71 | lr_decay_epochs: [14, 24, 2] #range
72 | lr_decay_rate: .25
73 | engine:
74 | name: logger
75 | debug: False
76 | print_freq: 10
77 | nb_epochs: 22
78 | saving_criteria:
79 | - eval_epoch.accuracy_all_top1:max
80 | - eval_epoch.accuracy_vq_top1:max
81 | - eval_epoch.accuracy_cfvqa_top1:max
82 | misc:
83 | logs_name:
84 | cuda: True
85 | seed: 1337
86 |
--------------------------------------------------------------------------------
/cfvqa/options/vqacp2/smrl_cfvqa_sum.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqacp2/smrl_cfvqa_sum
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqacp2 # or vqa2vg
7 | dir: data/vqa/vqacp2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | model:
19 | name: default
20 | network:
21 | import: cfvqa.models.networks.factory
22 | base: smrl
23 | name: cfvqa
24 | fusion_mode: sum
25 | is_vq: True
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | mlp_v:
31 | input_dim: 2048
32 | dimensions: [1024,1024,3000]
33 | txt_enc:
34 | name: skipthoughts
35 | type: BayesianUniSkip
36 | dropout: 0.25
37 | fixed_emb: False
38 | dir_st: data/skip-thoughts
39 | self_q_att: True
40 | residual: False
41 | q_single: False
42 | fusion:
43 | type: block
44 | input_dims: [4800, 2048]
45 | output_dim: 2048
46 | mm_dim: 1000
47 | chunks: 20
48 | rank: 15
49 | dropout_input: 0.
50 | dropout_pre_lin: 0.
51 | agg:
52 | type: max
53 | classif:
54 | mlp:
55 | input_dim: 2048
56 | dimensions: [1024,1024,3000]
57 | criterion:
58 | import: cfvqa.models.criterions.factory
59 | name: cfvqa_criterion
60 | question_loss_weight: 1.0
61 | vision_loss_weight: 1.0
62 | metric:
63 | import: cfvqa.models.metrics.factory
64 | name: vqa_cfvqa_metrics
65 | optimizer:
66 | import: cfvqa.optimizers.factory
67 | name: Adam
68 | lr: 0.0003
69 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
70 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
71 | lr_decay_epochs: [14, 24, 2] #range
72 | lr_decay_rate: .25
73 | engine:
74 | name: logger
75 | debug: False
76 | print_freq: 10
77 | nb_epochs: 22
78 | saving_criteria:
79 | - eval_epoch.accuracy_all_top1:max
80 | - eval_epoch.accuracy_vq_top1:max
81 | - eval_epoch.accuracy_cfvqa_top1:max
82 | misc:
83 | logs_name:
84 | cuda: True
85 | seed: 1337
86 |
--------------------------------------------------------------------------------
/cfvqa/options/vqacp2/smrl_cfvqasimple_hm.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqacp2/smrl_cfvqasimple_hm
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqacp2 # or vqa2vg
7 | dir: data/vqa/vqacp2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | model:
19 | name: default
20 | network:
21 | import: cfvqa.models.networks.factory
22 | base: smrl
23 | name: cfvqasimple
24 | fusion_mode: hm
25 | is_vq: False
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | txt_enc:
31 | name: skipthoughts
32 | type: BayesianUniSkip
33 | dropout: 0.25
34 | fixed_emb: False
35 | dir_st: data/skip-thoughts
36 | self_q_att: True
37 | residual: False
38 | q_single: False
39 | fusion:
40 | type: block
41 | input_dims: [4800, 2048]
42 | output_dim: 2048
43 | mm_dim: 1000
44 | chunks: 20
45 | rank: 15
46 | dropout_input: 0.
47 | dropout_pre_lin: 0.
48 | agg:
49 | type: max
50 | classif:
51 | mlp:
52 | input_dim: 2048
53 | dimensions: [1024,1024,3000]
54 | criterion:
55 | import: cfvqa.models.criterions.factory
56 | name: cfvqasimple_criterion
57 | question_loss_weight: 1.0
58 | metric:
59 | import: cfvqa.models.metrics.factory
60 | name: vqa_cfvqasimple_metrics
61 | optimizer:
62 | import: cfvqa.optimizers.factory
63 | name: Adam
64 | lr: 0.0003
65 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
66 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
67 | lr_decay_epochs: [14, 24, 2] #range
68 | lr_decay_rate: .25
69 | engine:
70 | name: logger
71 | debug: False
72 | print_freq: 10
73 | nb_epochs: 22
74 | saving_criteria:
75 | - eval_epoch.accuracy_all_top1:max
76 | - eval_epoch.accuracy_vq_top1:max
77 | - eval_epoch.accuracy_cfvqa_top1:max
78 | misc:
79 | logs_name:
80 | cuda: True
81 | seed: 1337
82 |
--------------------------------------------------------------------------------
/cfvqa/options/vqacp2/smrl_cfvqasimple_rubi.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqacp2/smrl_cfvqasimple_rubi
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqacp2 # or vqa2vg
7 | dir: data/vqa/vqacp2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | model:
19 | name: default
20 | network:
21 | import: cfvqa.models.networks.factory
22 | base: smrl
23 | name: cfvqasimple
24 | fusion_mode: rubi
25 | is_vq: False
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | txt_enc:
31 | name: skipthoughts
32 | type: BayesianUniSkip
33 | dropout: 0.25
34 | fixed_emb: False
35 | dir_st: data/skip-thoughts
36 | self_q_att: True
37 | residual: False
38 | q_single: False
39 | fusion:
40 | type: block
41 | input_dims: [4800, 2048]
42 | output_dim: 2048
43 | mm_dim: 1000
44 | chunks: 20
45 | rank: 15
46 | dropout_input: 0.
47 | dropout_pre_lin: 0.
48 | agg:
49 | type: max
50 | classif:
51 | mlp:
52 | input_dim: 2048
53 | dimensions: [1024,1024,3000]
54 | criterion:
55 | import: cfvqa.models.criterions.factory
56 | name: cfvqasimple_criterion
57 | question_loss_weight: 1.0
58 | metric:
59 | import: cfvqa.models.metrics.factory
60 | name: vqa_cfvqasimple_metrics
61 | optimizer:
62 | import: cfvqa.optimizers.factory
63 | name: Adam
64 | lr: 0.0003
65 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
66 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
67 | lr_decay_epochs: [14, 24, 2] #range
68 | lr_decay_rate: .25
69 | engine:
70 | name: logger
71 | debug: False
72 | print_freq: 10
73 | nb_epochs: 22
74 | saving_criteria:
75 | - eval_epoch.accuracy_all_top1:max
76 | - eval_epoch.accuracy_vq_top1:max
77 | - eval_epoch.accuracy_cfvqa_top1:max
78 | misc:
79 | logs_name:
80 | cuda: True
81 | seed: 1337
82 |
--------------------------------------------------------------------------------
/cfvqa/options/vqacp2/smrl_cfvqasimple_sum.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqacp2/smrl_cfvqasimple_sum
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqacp2 # or vqa2vg
7 | dir: data/vqa/vqacp2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | model:
19 | name: default
20 | network:
21 | import: cfvqa.models.networks.factory
22 | base: smrl
23 | name: cfvqasimple
24 | fusion_mode: sum
25 | is_vq: False
26 | cfvqa_params:
27 | mlp_q:
28 | input_dim: 4800
29 | dimensions: [1024,1024,3000]
30 | txt_enc:
31 | name: skipthoughts
32 | type: BayesianUniSkip
33 | dropout: 0.25
34 | fixed_emb: False
35 | dir_st: data/skip-thoughts
36 | self_q_att: True
37 | residual: False
38 | q_single: False
39 | fusion:
40 | type: block
41 | input_dims: [4800, 2048]
42 | output_dim: 2048
43 | mm_dim: 1000
44 | chunks: 20
45 | rank: 15
46 | dropout_input: 0.
47 | dropout_pre_lin: 0.
48 | agg:
49 | type: max
50 | classif:
51 | mlp:
52 | input_dim: 2048
53 | dimensions: [1024,1024,3000]
54 | criterion:
55 | import: cfvqa.models.criterions.factory
56 | name: cfvqasimple_criterion
57 | question_loss_weight: 1.0
58 | metric:
59 | import: cfvqa.models.metrics.factory
60 | name: vqa_cfvqasimple_metrics
61 | optimizer:
62 | import: cfvqa.optimizers.factory
63 | name: Adam
64 | lr: 0.0003
65 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
66 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
67 | lr_decay_epochs: [14, 24, 2] #range
68 | lr_decay_rate: .25
69 | engine:
70 | name: logger
71 | debug: False
72 | print_freq: 10
73 | nb_epochs: 22
74 | saving_criteria:
75 | - eval_epoch.accuracy_all_top1:max
76 | - eval_epoch.accuracy_vq_top1:max
77 | - eval_epoch.accuracy_cfvqa_top1:max
78 | misc:
79 | logs_name:
80 | cuda: True
81 | seed: 1337
82 |
--------------------------------------------------------------------------------
/cfvqa/options/vqacp2/smrl_rubi.yaml:
--------------------------------------------------------------------------------
1 | exp:
2 | dir: logs/vqacp2/smrl_rubi
3 | resume: # last, best_[...], or empty (from scratch)
4 | dataset:
5 | import: cfvqa.datasets.factory
6 | name: vqacp2 # or vqa2vg
7 | dir: data/vqa/vqacp2
8 | train_split: train
9 | eval_split: val # or test
10 | proc_split: train # or trainval (preprocessing split, must be equal to train_split)
11 | nb_threads: 4
12 | batch_size: 256
13 | nans: 3000
14 | minwcount: 0
15 | nlp: mcb
16 | samplingans: True
17 | dir_rcnn: data/vqa/coco/extract_rcnn/2018-04-27_bottom-up-attention_fixed_36
18 | model:
19 | name: default
20 | network:
21 | import: cfvqa.models.networks.factory
22 | base: smrl
23 | name: rubi
24 | rubi_params:
25 | mlp_q:
26 | input_dim: 4800
27 | dimensions: [1024,1024,3000]
28 | txt_enc:
29 | name: skipthoughts
30 | type: BayesianUniSkip
31 | dropout: 0.25
32 | fixed_emb: False
33 | dir_st: data/skip-thoughts
34 | self_q_att: True
35 | residual: False
36 | q_single: False
37 | fusion:
38 | type: block
39 | input_dims: [4800, 2048]
40 | output_dim: 2048
41 | mm_dim: 1000
42 | chunks: 20
43 | rank: 15
44 | dropout_input: 0.
45 | dropout_pre_lin: 0.
46 | agg:
47 | type: max
48 | classif:
49 | mlp:
50 | input_dim: 2048
51 | dimensions: [1024,1024,3000]
52 | criterion:
53 | import: cfvqa.models.criterions.factory
54 | name: rubi_criterion
55 | question_loss_weight: 1.0
56 | metric:
57 | import: cfvqa.models.metrics.factory
58 | name: vqa_rubi_metrics
59 | optimizer:
60 | import: cfvqa.optimizers.factory
61 | name: Adam
62 | lr: 0.0003
63 | gradual_warmup_steps: [0.5, 2.0, 7.0] #torch.linspace
64 | gradual_warmup_steps_mm: [0.5, 2.0, 7.0] #torch.linspace
65 | lr_decay_epochs: [14, 24, 2] #range
66 | lr_decay_rate: .25
67 | engine:
68 | name: logger
69 | debug: False
70 | print_freq: 10
71 | nb_epochs: 22
72 | saving_criteria:
73 | - eval_epoch.accuracy_top1:max
74 | - eval_epoch.accuracy_all_top1:max
75 | misc:
76 | logs_name:
77 | cuda: True
78 | seed: 1337
79 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | block.bootstrap.pytorch
2 | h5py
3 | plotly==3.10.0
--------------------------------------------------------------------------------
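With these requirements installed (block.bootstrap.pytorch pulls in the bootstrap.pytorch framework that every options file above targets), an experiment is launched by pointing bootstrap at one of the YAML files, following the framework's standard convention of selecting the options file with -o, e.g.:

pip install -r requirements.txt
python -m bootstrap.run -o cfvqa/options/vqacp2/smrl_cfvqa_sum.yaml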