├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── config └── clevr │ ├── config.baseline.json │ ├── config.cbn.json │ └── config.film.json └── src └── clevr ├── data_provider ├── clevr_batchifier.py ├── clevr_dataset.py └── clevr_tokenizer.py ├── models ├── baseline_network.py ├── film_network.py └── network_factory.py ├── preprocess_data ├── create_dictionary.py ├── extract_image_features.py ├── extract_raw_image.py └── test.py └── train └── train_clevr.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/generic"] 2 | path = src/generic 3 | url = https://github.com/GuessWhatGame/generic 4 | [submodule "src/neural_toolbox"] 5 | path = src/neural_toolbox 6 | url = https://github.com/GuessWhatGame/neural_toolbox 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CLEVR models 2 | 3 | This repo aims at reproducing the CLEVR results of the following papers: 4 | - Learning Visual Reasoning Without Strong Priors [1] https://arxiv.org/abs/1707.03017 5 | - FiLM: Visual Reasoning with a General Conditioning Layer [2] https://arxiv.org/abs/1709.07871 6 | 7 | and unpublished results from: 8 | - Modulating early visual processing by language [3] https://arxiv.org/abs/1707.00683 9 | 10 | The code was developed in equal parts by Florian Strub (University of Lille) and Harm de Vries (University of Montreal). 11 | 12 | The project is part of the CHISTERA - IGLU Project. 13 | 14 | #### Summary: 15 | 16 | * [Introduction](#introduction) 17 | * [Installation](#installation) 18 | * [Download](#download) 19 | * [Requirements](#requirements) 20 | * [File architecture](#file-architecture) 21 | * [Data](#data) 22 | * [Pretrained models](#pretrained-models) 23 | * [Reproducing results](#reproducing-results) 24 | * [Process Data](#process-data) 25 | * [Train Model](#train-model) 26 | * [FAQ](#faq) 27 | * [Citation](#citation) 28 | 29 | ## Introduction 30 | 31 | We introduce new CLEVR baselines based on FiLM layers and the Conditional Batch Normalization technique. 32 | 33 | ## Installation 34 | 35 | 36 | ### Download 37 | 38 | Our code has internal dependencies called submodules. To properly clone the repository, please use the following git command: 39 | 40 | ``` 41 | git clone --recursive https://github.com/GuessWhatGame/clevr.git 42 | ``` 43 | 44 | ### Requirements 45 | 46 | The code works with both Python 2 and 3. It relies on the TensorFlow Python API. 47 | It requires the following Python packages: 48 | 49 | ``` 50 | pip install \ 51 | tensorflow-gpu \ 52 | nltk \ 53 | tqdm 54 | ``` 55 | 56 | 57 | ### File architecture 58 | In the following, we assume that the following file/folder architecture is respected: 59 | 60 | ``` 61 | clevr 62 | ├── config # store the configuration files to create/train models 63 | | └── clevr 64 | | 65 | ├── out # store the output experiments (checkpoints, logs etc.) 66 | | └── clevr 67 | | 68 | ├── data # contains the CLEVR data 69 | | 70 | └── src # source files 71 | ``` 72 | 73 | To complete the git-clone file architecture, you can do: 74 | 75 | ``` 76 | cd clevr 77 | mkdir data; 78 | mkdir out; mkdir out/clevr 79 | ``` 80 | 81 | Of course, one is free to change this file architecture! 82 | 83 | ### Data 84 | This project relies on the CLEVR dataset: http://cs.stanford.edu/people/jcjohns/clevr/ 85 | 86 | To download the CLEVR dataset, please use wget: 87 | ``` 88 | wget https://s3-us-west-1.amazonaws.com/clevr/CLEVR_v1.0.zip -P data/ 89 | ``` 90 | 91 | ## Reproducing results 92 | 93 | To launch the experiments from the repository root, you first have to set the Python path: 94 | ``` 95 | export PYTHONPATH=src:${PYTHONPATH} 96 | ``` 97 | Note that you can also directly execute the experiments in the source folder.
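Before moving on to preprocessing, it is worth checking that the downloaded archive has the layout the data provider expects: src/clevr/data_provider/clevr_dataset.py reads data/CLEVR_v1.0/questions/CLEVR_<set>_questions.json, so the zip must first be extracted into data/. Below is a quick sanity check (a sketch only, assuming the archive was extracted into data/), e.g. from a Python shell:

```
import json

# These question files are the ones consumed by clevr_dataset.py and the preprocessing scripts.
with open("data/CLEVR_v1.0/questions/CLEVR_val_questions.json") as f:
    data = json.load(f)

print(data["info"]["version"], data["info"]["split"])   # dataset version and split name
print(len(data["questions"]), "questions")
print(sorted(data["questions"][0].keys()))              # question, answer, image_filename, ...
```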
98 | 99 | ### Process Data 100 | 101 | Before starting the training, one needs to create a dictionary and, optionally, pre-extract the image features. 102 | 103 | #### Extract image features 104 | 105 | You do not need to extract image features for the CBN setup, which works on raw images. 106 | Yet, this code does support any kind of image features as input. 107 | 108 | Following the original papers, we are going to extract intermediate conv features (block3) from the CLEVR images by using a ResNet-101 network. 109 | 110 | First, you need to download the ResNet-101 pretrained network provided by [slim-tensorflow](https://github.com/tensorflow/models/tree/master/research/slim): 111 | 112 | ``` 113 | wget http://download.tensorflow.org/models/resnet_v1_101_2016_08_28.tar.gz -P data/ 114 | tar zxvf data/resnet_v1_101_2016_08_28.tar.gz -C data/ 115 | ``` 116 | 117 | Then, use the script src/clevr/preprocess_data/extract_image_features.py (it processes the train/val/test splits listed in -set_type): 118 | 119 | ``` 120 | python src/clevr/preprocess_data/extract_image_features.py \ 121 | -img_dir data/CLEVR_v1.0/images \ 122 | -data_dir data/CLEVR_v1.0 \ 123 | -out_dir data/CLEVR_v1.0 \ 124 | -img_size 224 \ 125 | -ckpt data/resnet_v1_101.ckpt \ 126 | -feature_name block3/unit_22/bottleneck_v1 127 | ``` 128 | 129 | 130 | #### Create dictionary 131 | 132 | To create the CLEVR dictionary, you need to use the Python script src/clevr/preprocess_data/create_dictionary.py: 133 | 134 | ``` 135 | python src/clevr/preprocess_data/create_dictionary.py -data_dir data/CLEVR_v1.0 -dict_file dict.json 136 | ``` 137 | 138 | ### Train Model 139 | To train the network, you need to select/configure the kind of neural architecture you want. 140 | To do so, you have to update one of the files in config/clevr/ (e.g. config/clevr/config.film.json). 141 | 142 | Once the config file is set, you can launch the training step: 143 | ``` 144 | python src/clevr/train/train_clevr.py \ 145 | -data_dir data/CLEVR_v1.0 \ 146 | -img_dir data/CLEVR_v1.0 \ 147 | -config config/clevr/config.film.json \ 148 | -out_dir out/clevr \ 149 | -no_thread 2 150 | ``` 151 | 152 | After training, we obtained the following results: 153 | 154 | 155 | Temporary results: 156 | ------------------------- 157 | FiLM: ~96% accuracy on val 158 | Please note that this score is a bit lower than the PyTorch version. 159 | We assume that this difference is mainly due to numerical stability. 160 | 161 | 162 | ## Citation 163 | 164 | 165 | ``` 166 | @inproceedings{perez2017learning, 167 | title={Learning Visual Reasoning Without Strong Priors}, 168 | author={Perez, Ethan and de Vries, Harm and Strub, Florian and Dumoulin, Vincent and Courville, Aaron}, 169 | booktitle={ICML Machine Learning in Speech and Language Processing Workshop}, 170 | year={2017} 171 | } 172 | 173 | @article{perez2017film, 174 | title={FiLM: Visual Reasoning with a General Conditioning Layer}, 175 | author={Perez, Ethan and Strub, Florian and de Vries, Harm and Dumoulin, Vincent and Courville, Aaron}, 176 | journal={arXiv preprint arXiv:1709.07871}, 177 | year={2017} 178 | } 179 | 180 | @inproceedings{guesswhat_game, 181 | author = {Harm de Vries and Florian Strub and J\'er\'emie Mary and Hugo Larochelle and Olivier Pietquin and Aaron C.
Courville}, 182 | title = {Modulating early visual processing by language}, 183 | booktitle = {Advances in Neural Information Processing Systems 30}, 184 | year = {2017}, 185 | url = {https://arxiv.org/abs/1707.00683} 186 | } 187 | ``` 188 | 189 | 190 | ## Acknowledgement 191 | - SequeL Team 192 | - Mila Team 193 | 194 | 195 | 196 | 197 | 198 | 199 | -------------------------------------------------------------------------------- /config/clevr/config.baseline.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "Baseline", 3 | 4 | "model": { 5 | 6 | "type" : "baseline", 7 | 8 | "image": { 9 | "image_input": "conv", 10 | "dim": [14, 14, 1024], 11 | "normalize": false 12 | }, 13 | 14 | "question": { 15 | "word_embedding_dim": 200, 16 | "glove" : false, 17 | 18 | "cell": "gru", 19 | "rnn_state_size": 2048, 20 | "bidirectional" : true, 21 | "layer_norm" : true, 22 | "max_pool" : false 23 | }, 24 | 25 | "pooling" : { 26 | "mode": "classic", 27 | "no_attention_mlp": 256 28 | }, 29 | 30 | "fusion": { 31 | "mode" : "vis", 32 | "projection_size": 512 33 | }, 34 | 35 | "classifier": 36 | { 37 | "no_mlp_units": 512 38 | }, 39 | 40 | "dropout_keep_prob": 0.5 41 | }, 42 | 43 | "optimizer": { 44 | "no_epoch": 20, 45 | "learning_rate": 1e-4, 46 | "clip_val": 0, 47 | "batch_size": 64, 48 | "weight_decay": 0, 49 | "weight_decay_add": [], 50 | "weight_decay_remove": [] 51 | }, 52 | 53 | "seed": -1 54 | } -------------------------------------------------------------------------------- /config/clevr/config.cbn.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "CBN", 3 | 4 | "model": { 5 | 6 | "type" : "baseline", 7 | 8 | "image": { 9 | "image_input": "raw", 10 | "dim": [224, 224, 3], 11 | "normalize": true, 12 | 13 | "resnet_version": 50 14 | }, 15 | 16 | "cbn": { 17 | "use_cbn": true, 18 | "cbn_embedding_size": 128, 19 | "excluded_scope_names": ["block1", "block2", "block3"] 20 | }, 21 | 22 | "question": { 23 | "word_embedding_dim": 200, 24 | "glove" : false, 25 | 26 | "cell": "gru", 27 | "rnn_state_size": 1024, 28 | "bidirectional" : false, 29 | "layer_norm" : false, 30 | "max_pool" : false 31 | }, 32 | 33 | "pooling" : { 34 | "mode": "classic", 35 | "no_attention_mlp": 256 36 | }, 37 | 38 | "fusion": { 39 | "mode" : "vis", 40 | "projection_size": 512 41 | }, 42 | 43 | "classifier": 44 | { 45 | "no_mlp_units": 512 46 | }, 47 | 48 | "dropout_keep_prob": 0.5 49 | }, 50 | 51 | "optimizer": { 52 | "no_epoch": 20, 53 | "learning_rate": 1e-4, 54 | "clip_val": 0, 55 | "batch_size": 64, 56 | "weight_decay": 0, 57 | "weight_decay_add": [], 58 | "weight_decay_remove": [] 59 | }, 60 | 61 | "seed": -1 62 | } -------------------------------------------------------------------------------- /config/clevr/config.film.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "FiLM", 3 | 4 | "model": { 5 | 6 | "type" : "film", 7 | 8 | "question": { 9 | "word_embedding_dim": 200, 10 | "glove" : false, 11 | 12 | "cell": "gru", 13 | "rnn_state_size": 2048, 14 | "bidirectional" : true, 15 | "layer_norm" : true, 16 | "max_pool" : false 17 | }, 18 | 19 | "image": { 20 | "image_input": "conv", 21 | "dim": [14, 14, 1024], 22 | "normalize": false 23 | }, 24 | 25 | "film_block": 26 | { 27 | "stem" : { 28 | "spatial_location" : true, 29 | "conv_out": 256, 30 | "conv_kernel": [3,3] 31 | }, 32 | 33 | "resblock" : { 34 | "feature_size" : [128, 128, 128, 128], 35 |
"spatial_location" : true, 36 | "kernel1" : [1,1], 37 | "kernel2" : [3,3] 38 | }, 39 | 40 | "head" : { 41 | "spatial_location" : true, 42 | "conv_out": 512, 43 | "conv_kernel": [1,1] 44 | } 45 | }, 46 | 47 | "pooling" : { 48 | "mode": "max" 49 | }, 50 | 51 | "classifier": 52 | { 53 | "no_mlp_units": 512 54 | }, 55 | 56 | "dropout_keep_prob" : 1.0 57 | 58 | }, 59 | 60 | "optimizer": { 61 | "no_epoch": 10, 62 | "learning_rate": 3e-4, 63 | "batch_size": 64, 64 | "clip_val": 10.0, 65 | "weight_decay": 5e-6, 66 | "weight_decay_add": ["film_stack"], 67 | "weight_decay_remove": ["FiLM_layer"] 68 | }, 69 | 70 | "seed": -1 71 | } -------------------------------------------------------------------------------- /src/clevr/data_provider/clevr_batchifier.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | from generic.data_provider.nlp_utils import padder 4 | from generic.data_provider.batchifier import AbstractBatchifier 5 | 6 | 7 | class CLEVRBatchifier(AbstractBatchifier): 8 | 9 | def __init__(self, tokenizer): 10 | self.tokenizer = tokenizer 11 | 12 | def apply(self, games): 13 | 14 | batch = collections.defaultdict(list) 15 | batch_size = len(games) 16 | 17 | assert batch_size > 0 18 | 19 | for i, game in enumerate(games): 20 | 21 | batch["raw"].append(game) 22 | 23 | # Get question 24 | question = self.tokenizer.encode_question(game.question) 25 | batch['question'].append(question) 26 | 27 | # Get answers 28 | answer = self.tokenizer.encode_answer(game.answer) 29 | batch['answer'].append(answer) 30 | 31 | # retrieve the image source type 32 | img = game.image.get_image() 33 | if "image" not in batch: # initialize an empty array for better memory consumption 34 | batch["image"] = np.zeros((batch_size,) + img.shape, dtype=np.float32) 35 | batch["image"][i] = img 36 | 37 | # pad the questions 38 | batch['question'], batch['seq_length'] = padder(batch['question'], 39 | padding_symbol=self.tokenizer.padding_token) 40 | 41 | return batch 42 | -------------------------------------------------------------------------------- /src/clevr/data_provider/clevr_dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import collections 4 | 5 | from generic.data_provider.dataset import AbstractDataset 6 | import os 7 | 8 | use_100 = False 9 | 10 | 11 | class Image: 12 | def __init__(self, id, filename, image_builder, which_set): 13 | self.id = id 14 | self.filename = filename 15 | 16 | if image_builder is not None: 17 | self.image_loader = image_builder.build(id, filename=filename, which_set=which_set) 18 | 19 | def get_image(self, **kwargs): 20 | return self.image_loader.get_image(**kwargs) 21 | 22 | 23 | class Game(object): 24 | def __init__(self, id, image, question, answer, question_family_index): 25 | self.id = id 26 | self.image = image 27 | self.question = question 28 | self.answer = answer 29 | self.question_family_index = question_family_index 30 | 31 | def __str__(self): 32 | return "[#q:{}, #p:{}] {} - {} ({})".format(self.id, self.image.id, self.question, self.answer, self.question_family_index) 33 | 34 | 35 | class CLEVRDataset(AbstractDataset): 36 | """Loads the dataset.""" 37 | 38 | def __init__(self, folder, which_set, image_builder=None, games_to_load=float("inf")): 39 | 40 | question_file_path = '{}/questions/CLEVR_{}_questions.json'.format(folder, which_set) 41 | 42 | games = [] 43 | self.question_family_index = collections.Counter() 44 | 
self.answer_counter = collections.Counter() 45 | 46 | with open(question_file_path) as question_file: 47 | print("Loading questions...") 48 | data = json.load(question_file) 49 | info = data["info"] 50 | samples = data["questions"] 51 | 52 | assert info["split"] == which_set 53 | 54 | print("Successfully Loaded CLEVR v{} ({})".format(info["version"], which_set)) 55 | 56 | for sample in samples: 57 | 58 | question_id = int(sample["question_index"]) 59 | question = sample["question"] 60 | question_family_index = sample.get("question_family_index", -1) # -1 for test set 61 | 62 | answer = sample.get("answer", None) # None for test set 63 | 64 | image_id = sample["image_index"] 65 | image_filename = sample["image_filename"] 66 | image_filename = os.path.join(which_set, image_filename) 67 | 68 | games.append(Game(id=question_id, 69 | image=Image(image_id, image_filename, image_builder, which_set), 70 | question=question, 71 | answer=answer, 72 | question_family_index=question_family_index)) 73 | 74 | self.question_family_index[question_family_index] += 1 75 | self.answer_counter[answer] += 1 76 | 77 | if len(games) >= games_to_load: 78 | break 79 | 80 | print('{} games loaded...'.format(len(games))) 81 | super(CLEVRDataset, self).__init__(games) 82 | 83 | 84 | if __name__ == '__main__': 85 | dataset = CLEVRDataset("/home/fstrub/Projects/clevr_data/", which_set="val") 86 | 87 | for d in dataset.games: 88 | if "How many things are" in d.question: 89 | print(d) -------------------------------------------------------------------------------- /src/clevr/data_provider/clevr_tokenizer.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import TweetTokenizer 2 | import json 3 | import re 4 | 5 | # Note that this is a copy/past of VQATokenizer 6 | 7 | class CLEVRTokenizer: 8 | """ """ 9 | def __init__(self, dictionary_file): 10 | 11 | self.tokenizer = TweetTokenizer(preserve_case=False) 12 | with open(dictionary_file, 'r') as f: 13 | data = json.load(f) 14 | self.word2i = data['word2i'] 15 | self.answer2i = data['answer2i'] 16 | 17 | self.dictionary_file = dictionary_file 18 | 19 | self.i2word = {} 20 | for (k, v) in self.word2i.items(): 21 | self.i2word[v] = k 22 | 23 | self.i2answer = {} 24 | for (k, v) in self.answer2i.items(): 25 | self.i2answer[v] = k 26 | 27 | # Retrieve key values 28 | self.no_words = len(self.word2i) 29 | self.no_answers = len(self.answer2i) 30 | 31 | self.start_token = self.word2i[""] 32 | self.unknown_question_token = self.word2i[""] 33 | self.padding_token = self.word2i[""] 34 | 35 | self.padding_answer = self.answer2i[""] 36 | self.unknown_answer = self.answer2i[""] 37 | 38 | 39 | """ 40 | Input: String 41 | Output: List of tokens 42 | """ 43 | def encode_question(self, question): 44 | tokens = [self.start_token] 45 | for token in self.tokenizer.tokenize(question): 46 | if token not in self.word2i: 47 | token = '' 48 | tokens.append(self.word2i[token]) 49 | 50 | return tokens 51 | 52 | def decode_question(self, tokens): 53 | return ' '.join([self.i2word[tok] for tok in tokens]) 54 | 55 | def encode_answer(self, answer): 56 | if answer not in self.answer2i: 57 | return self.answer2i[''] 58 | return self.answer2i[answer] 59 | 60 | def decode_answer(self, answer_id): 61 | return self.i2answer[answer_id] 62 | 63 | def tokenize_question(self, question): 64 | return self.tokenizer.tokenize(question) 65 | 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- 
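The three data-provider modules above (batchifier, dataset, tokenizer) form the input pipeline used by the training script. The snippet below is only a usage sketch, assuming PYTHONPATH is set as described in the README, that dict.json was produced by create_dictionary.py, and that the CLEVR data sits in data/CLEVR_v1.0; it shows how questions and answers are mapped to indices before batching:

```
# Sketch: wiring CLEVRDataset and CLEVRTokenizer together (assumed local paths).
from clevr.data_provider.clevr_dataset import CLEVRDataset
from clevr.data_provider.clevr_tokenizer import CLEVRTokenizer

tokenizer = CLEVRTokenizer("dict.json")                     # word2i / answer2i mappings
dataset = CLEVRDataset("data/CLEVR_v1.0", which_set="val",  # no image_builder: questions only
                       games_to_load=100)

game = dataset.games[0]
tokens = tokenizer.encode_question(game.question)           # start token followed by word indices
print(game.question)
print(tokens)
print(tokenizer.decode_question(tokens))
print(tokenizer.decode_answer(tokenizer.encode_answer(game.answer)))

# CLEVRBatchifier.apply() additionally loads the images and pads the questions;
# it therefore needs an image_builder (h5 features or raw images), as shown in
# preprocess_data/test.py and train/train_clevr.py.
```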
/src/clevr/models/baseline_network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as tfc_layers 3 | 4 | from neural_toolbox import rnn 5 | 6 | from generic.tf_utils.abstract_network import ResnetModel 7 | from generic.tf_factory.image_factory import get_image_features, get_cbn 8 | from generic.tf_factory.attention_factory import get_attention 9 | from generic.tf_factory.fusion_factory import get_fusion_mechanism 10 | 11 | 12 | class CLEVRNetwork(ResnetModel): 13 | 14 | def __init__(self, config, num_words, num_answers, device='', reuse=False): 15 | ResnetModel.__init__(self, "clevr", device=device) 16 | 17 | with tf.variable_scope(self.scope_name, reuse=reuse): 18 | 19 | batch_size = None 20 | self._is_training = tf.placeholder(tf.bool, name="is_training") 21 | 22 | dropout_keep_scalar = float(config["dropout_keep_prob"]) 23 | dropout_keep = tf.cond(self._is_training, 24 | lambda: tf.constant(dropout_keep_scalar), 25 | lambda: tf.constant(1.0)) 26 | 27 | ##################### 28 | # QUESTION 29 | ##################### 30 | 31 | self._question = tf.placeholder(tf.int32, [batch_size, None], name='question') 32 | self._seq_length = tf.placeholder(tf.int32, [batch_size], name='seq_length') 33 | self._answer = tf.placeholder(tf.int64, [batch_size, num_answers], name='answer') 34 | 35 | word_emb = tfc_layers.embed_sequence( 36 | ids=self._question, 37 | vocab_size=num_words, 38 | embed_dim=config["question"]["word_embedding_dim"], 39 | scope="word_embedding", 40 | reuse=reuse) 41 | 42 | if config["question"]['glove']: 43 | self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove") 44 | word_emb = tf.concat([word_emb, self._glove], axis=2) 45 | 46 | word_emb = tf.nn.dropout(word_emb, dropout_keep) 47 | _, last_rnn_state = rnn.rnn_factory( 48 | inputs=word_emb, 49 | seq_length=self._seq_length, 50 | cell=config["question"]["cell"], 51 | num_hidden=config["question"]["rnn_state_size"], 52 | bidirectional=config["question"]["bidirectional"], 53 | max_pool=config["question"]["max_pool"], 54 | layer_norm=config["question"]["layer_norm"], 55 | reuse=reuse) 56 | 57 | ##################### 58 | # IMAGES 59 | ##################### 60 | 61 | self._image = tf.placeholder(tf.float32, [batch_size] + config['image']["dim"], name='image') 62 | 63 | cbn = None 64 | if "cbn" in config: 65 | cbn = get_cbn(config["cbn"], last_rnn_state, dropout_keep, self._is_training) 66 | 67 | self.image_out = get_image_features(image=self._image, 68 | is_training=self._is_training, 69 | config=config['image'], 70 | cbn=cbn) 71 | 72 | if len(self.image_out.get_shape()) > 2: 73 | with tf.variable_scope("image_pooling"): 74 | self.image_out = get_attention(self.image_out, last_rnn_state, 75 | is_training=self._is_training, 76 | config=config["pooling"], 77 | dropout_keep=dropout_keep, 78 | reuse=reuse) 79 | 80 | ##################### 81 | # FUSION 82 | ##################### 83 | 84 | self.visdiag_embedding = get_fusion_mechanism(input1=self.image_out, 85 | input2=last_rnn_state, 86 | config=config["fusion"], 87 | dropout_keep=dropout_keep) 88 | 89 | ##################### 90 | # CLASSIFIER 91 | ##################### 92 | 93 | with tf.variable_scope('mlp'): 94 | num_hiddens = config['classifier']['no_mlp_units'] 95 | 96 | self.out = tfc_layers.fully_connected(self.visdiag_embedding, num_hiddens, activation_fn=tf.nn.relu) 97 | self.out = tfc_layers.fully_connected(self.out, num_answers, activation_fn=None) 98 | 99
| self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.out, labels=self._answer) 100 | self.loss = tf.reduce_mean(self.cross_entropy) 101 | 102 | self.softmax = tf.nn.softmax(self.out, name='answer_prob') 103 | self.prediction = tf.argmax(self.out, axis=1, name='predicted_answer') # no need to compute the softmax 104 | 105 | self.success = tf.equal(self.prediction, tf.argmax(self._answer, axis=1)) # no need to compute the softmax 106 | 107 | with tf.variable_scope('accuracy'): 108 | self.accuracy = tf.equal(self.prediction, tf.argmax(self._answer, axis=1)) 109 | self.accuracy = tf.reduce_mean(tf.cast(self.accuracy, tf.float32)) 110 | 111 | print('Model... CLEVR (baseline) build!') 112 | 113 | def get_loss(self): 114 | return self.loss 115 | 116 | def get_error(self): 117 | return 1 - self.accuracy 118 | 119 | def get_accuracy(self): 120 | return self.accuracy 121 | -------------------------------------------------------------------------------- /src/clevr/models/film_network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as tfc_layers 3 | 4 | from generic.tf_utils.abstract_network import ResnetModel 5 | from generic.tf_factory.image_factory import get_image_features 6 | from generic.tf_factory.attention_factory import get_attention 7 | 8 | import neural_toolbox.rnn as rnn 9 | 10 | from neural_toolbox.film_stack import FiLM_Stack 11 | 12 | 13 | class FiLMCLEVRNetwork(ResnetModel): 14 | 15 | def __init__(self, config, num_words, num_answers, reuse=False, device=''): 16 | ResnetModel.__init__(self, "clevr", device=device) 17 | 18 | with tf.variable_scope(self.scope_name, reuse=reuse): 19 | batch_size = None 20 | self._is_training = tf.placeholder(tf.bool, name="is_training") 21 | 22 | dropout_keep_scalar = float(config["dropout_keep_prob"]) 23 | dropout_keep = tf.cond(self._is_training, 24 | lambda: tf.constant(dropout_keep_scalar), 25 | lambda: tf.constant(1.0)) 26 | 27 | ##################### 28 | # QUESTION 29 | ##################### 30 | 31 | self._question = tf.placeholder(tf.int32, [batch_size, None], name='question') 32 | self._seq_length = tf.placeholder(tf.int32, [batch_size], name='seq_length') 33 | self._answer = tf.placeholder(tf.int64, [batch_size], name='answer') 34 | 35 | word_emb = tfc_layers.embed_sequence( 36 | ids=self._question, 37 | vocab_size=num_words, 38 | embed_dim=config["question"]["word_embedding_dim"], 39 | scope="word_embedding", 40 | reuse=reuse) 41 | 42 | if config["question"]['glove']: 43 | self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove") 44 | word_emb = tf.concat([word_emb, self._glove], axis=2) 45 | 46 | word_emb = tf.nn.dropout(word_emb, dropout_keep) 47 | 48 | _, last_rnn_state = rnn.rnn_factory( 49 | inputs=word_emb, 50 | seq_length=self._seq_length, 51 | cell=config["question"]["cell"], 52 | num_hidden=config["question"]["rnn_state_size"], 53 | bidirectional=config["question"]["bidirectional"], 54 | max_pool=config["question"]["max_pool"], 55 | layer_norm=config["question"]["layer_norm"], 56 | reuse=reuse) 57 | 58 | last_rnn_state = tf.nn.dropout(last_rnn_state, dropout_keep) 59 | 60 | ##################### 61 | # IMAGES 62 | ##################### 63 | 64 | self._image = tf.placeholder(tf.float32, [batch_size] + config['image']["dim"], name='image') 65 | 66 | visual_features = get_image_features(image=self._image, 67 | is_training=self._is_training, 68 | config=config['image']) 69 | 70 | with 
tf.variable_scope("image_film_stack", reuse=reuse): 71 | film_stack = FiLM_Stack(image=visual_features, 72 | film_input=last_rnn_state, 73 | is_training=self._is_training, 74 | config=config["film_block"], 75 | reuse=reuse) 76 | 77 | visual_features = film_stack.get() 78 | 79 | # Pool Image Features 80 | with tf.variable_scope("image_pooling"): 81 | multimodal_features = get_attention(visual_features, last_rnn_state, 82 | is_training=self._is_training, 83 | config=config["pooling"], 84 | dropout_keep=dropout_keep, 85 | reuse=reuse) 86 | 87 | with tf.variable_scope("classifier"): 88 | self.hidden_state = tfc_layers.fully_connected(multimodal_features, 89 | num_outputs=config["classifier"]["no_mlp_units"], 90 | normalizer_fn=tfc_layers.batch_norm, 91 | normalizer_params={"center": True, "scale": True, 92 | "decay": 0.9, 93 | "is_training": self._is_training, 94 | "reuse": reuse}, 95 | activation_fn=tf.nn.relu, 96 | reuse=reuse, 97 | scope="classifier_hidden_layer") 98 | 99 | self.out = tfc_layers.fully_connected(self.hidden_state, 100 | num_outputs=num_answers, 101 | activation_fn=None, 102 | reuse=reuse, 103 | scope="classifier_softmax_layer") 104 | 105 | ##################### 106 | # Loss 107 | ##################### 108 | 109 | self.cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.out, labels=self._answer, name='cross_entropy') 110 | self.loss = tf.reduce_mean(self.cross_entropy) 111 | 112 | self.softmax = tf.nn.softmax(self.out, name='answer_prob') 113 | self.prediction = tf.argmax(self.out, axis=1, name='predicted_answer') # no need to compute the softmax 114 | 115 | with tf.variable_scope('accuracy'): 116 | self.accuracy = tf.equal(self.prediction, self._answer) 117 | self.accuracy = tf.reduce_mean(tf.cast(self.accuracy, tf.float32)) 118 | 119 | tf.summary.scalar('accuracy', self.accuracy) 120 | 121 | print('Model... 
build!') 122 | 123 | def get_loss(self): 124 | return self.loss 125 | 126 | def get_accuracy(self): 127 | return self.accuracy 128 | 129 | 130 | if __name__ == "__main__": 131 | 132 | import json 133 | with open("../../../config/clevr/config.film.json", 'r') as f_config: 134 | conf = json.load(f_config) 135 | 136 | FiLMCLEVRNetwork(conf["model"], num_words=354, num_answers=56) 137 | -------------------------------------------------------------------------------- /src/clevr/models/network_factory.py: -------------------------------------------------------------------------------- 1 | from clevr.models.baseline_network import CLEVRNetwork 2 | from clevr.models.film_network import FiLMCLEVRNetwork 3 | 4 | 5 | # stupid factory class to create networks 6 | 7 | def create_network(config, num_words, num_answers, reuse=False, device=''): 8 | 9 | network_type = config["type"] 10 | 11 | if network_type == "film": 12 | return FiLMCLEVRNetwork(config, num_words=num_words, num_answers=num_answers, reuse=reuse, device=device) 13 | elif network_type == "baseline": 14 | return CLEVRNetwork(config, num_words=num_words, num_answers=num_answers, reuse=reuse, device=device) 15 | else: 16 | assert False, "Invalid network_type: should be: film/cbn" 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/clevr/preprocess_data/create_dictionary.py: -------------------------------------------------------------------------------- 1 | from nltk.tokenize import TweetTokenizer 2 | import io 3 | import json 4 | import collections 5 | from clevr.data_provider.clevr_dataset import CLEVRDataset 6 | import argparse 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | parser = argparse.ArgumentParser('Creating dictionary..') 12 | 13 | parser.add_argument("-data_dir", type=str, help="Path to VQA dataset") 14 | parser.add_argument("-dict_file", type=str, default="dict.json", help="Name of the dictionary file") 15 | parser.add_argument("-min_occ", type=int, default=1, help='Minimum number of occurences to add word to dictionary (for Human Clevr)') 16 | args = parser.parse_args() 17 | 18 | dataset = CLEVRDataset(args.data_dir, which_set="train") 19 | games = dataset.games 20 | 21 | word2i = {'': 0, 22 | '': 1, 23 | '': 2, 24 | '': 3 25 | } 26 | 27 | answer2i = {'': 0, 28 | '': 1, 29 | '': 2, 30 | '': 3 31 | } 32 | 33 | answer2occ = dataset.answer_counter 34 | word2occ = collections.defaultdict(int) 35 | 36 | 37 | # Input words 38 | tknzr = TweetTokenizer(preserve_case=False) 39 | 40 | for game in games: 41 | input_tokens = tknzr.tokenize(game.question) 42 | for tok in input_tokens: 43 | word2occ[tok] += 1 44 | 45 | # parse the questions 46 | for word, occ in word2occ.items(): 47 | if occ >= args.min_occ: 48 | word2i[word] = len(word2i) 49 | 50 | # parse the answers 51 | for answer in answer2occ.keys(): 52 | answer2i[answer] = len(answer2i) 53 | 54 | print("Number of words): {}".format(len(word2i))) 55 | print("Number of answers: {}".format(len(answer2i))) 56 | 57 | with io.open(args.dict_file, 'w', encoding='utf8') as f_out: 58 | data = json.dumps({'word2i': word2i, 'answer2i': answer2i}) 59 | f_out.write(data) 60 | -------------------------------------------------------------------------------- /src/clevr/preprocess_data/extract_image_features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import tensorflow as tf 4 | from distutils.util import strtobool 5 | import numpy as np 6 | import argparse 7 | 8 | 
from generic.data_provider.image_loader import RawImageBuilder 9 | from generic.preprocess_data.extract_img_features import extract_features 10 | 11 | from neural_toolbox import resnet 12 | 13 | from clevr.data_provider.clevr_dataset import CLEVRDataset 14 | from clevr.data_provider.clevr_batchifier import CLEVRBatchifier 15 | 16 | 17 | 18 | parser = argparse.ArgumentParser('Feature extractor! ') 19 | 20 | parser.add_argument("-img_dir", type=str, required=True, help="Input Image folder") 21 | parser.add_argument("-data_dir", type=str, required=True,help="Dataset folder") 22 | parser.add_argument("-out_dir", type=str, required=True, help="Output directory for h5 files") 23 | parser.add_argument("-set_type", type=list, default=["val", "train", "test"], help='Select the dataset to dump') 24 | 25 | parser.add_argument("-ckpt", type=str, required=True, help="Path for network checkpoint: ") 26 | parser.add_argument("-resnet_version", type=int, default=101, choices=[50, 101, 152], help="Pick the resnet version [50/101/152]") 27 | parser.add_argument("-feature_name", type=str, default="block3/unit_22/bottleneck_v1", help="Pick the name of the network features") 28 | 29 | parser.add_argument("-subtract_mean", type=lambda x:bool(strtobool(x)), default="True", help="Preprocess the image by substracting the mean") 30 | parser.add_argument("-img_size", type=int, default=224, help="image size (pixels)") 31 | parser.add_argument("-batch_size", type=int, default=64, help="Batch size to extract features") 32 | 33 | parser.add_argument("-gpu_ratio", type=float, default=1., help="How many GPU ram is required? (ratio)") 34 | parser.add_argument("-no_thread", type=int, default=2, help="No thread to load batch") 35 | 36 | args = parser.parse_args() 37 | 38 | 39 | # define image 40 | if args.subtract_mean: 41 | channel_mean = np.array([123.68, 116.779, 103.939]) 42 | else: 43 | channel_mean = None 44 | 45 | 46 | # define the image loader 47 | source = 'image' 48 | images = tf.placeholder(tf.float32, [None, args.img_size, args.img_size, 3], name=source) 49 | image_builder = RawImageBuilder(args.img_dir, 50 | height=args.img_size, 51 | width=args.img_size, 52 | channel=channel_mean) 53 | 54 | # create network 55 | print("Create network...") 56 | ft_output = resnet.create_resnet(images, 57 | resnet_out=args.feature_name, 58 | resnet_version=args.resnet_version, 59 | is_training=False) 60 | 61 | 62 | extract_features( 63 | img_input = images, 64 | ft_output = ft_output, 65 | dataset_cstor = CLEVRDataset, 66 | dataset_args = {"folder": args.data_dir, "image_builder":image_builder}, 67 | batchifier_cstor = CLEVRBatchifier, 68 | out_dir = args.out_dir, 69 | set_type = args.set_type, 70 | network_ckpt=args.ckpt, 71 | batch_size = args.batch_size, 72 | no_threads = args.no_thread, 73 | gpu_ratio = args.gpu_ratio) 74 | 75 | -------------------------------------------------------------------------------- /src/clevr/preprocess_data/extract_raw_image.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from distutils.util import strtobool 3 | import numpy as np 4 | import argparse 5 | 6 | from generic.preprocess_data.extract_img_raw import extract_raw 7 | from generic.data_provider.image_loader import RawImageBuilder 8 | 9 | from clevr.data_provider.clevr_dataset import CLEVRDataset 10 | from clevr.data_provider.clevr_batchifier import CLEVRBatchifier 11 | 12 | 13 | 14 | parser = argparse.ArgumentParser('Feature extractor! 
') 15 | 16 | parser.add_argument("-img_dir", type=str, required=True, help="Input Image folder") 17 | parser.add_argument("-data_dir", type=str, required=True, help="Dataset folder") 18 | parser.add_argument("-out_dir", type=str, required=True, help="Output directory for h5 files") 19 | parser.add_argument("-set_type", type=list, default=["val", "train", "test"], help='Select the dataset to dump') 20 | 21 | parser.add_argument("-subtract_mean", type=lambda x:bool(strtobool(x)), default="True", help="Preprocess the image by substracting the mean") 22 | parser.add_argument("-img_size", type=int, required=True, help="image size (pixels)") 23 | 24 | parser.add_argument("-gpu_ratio", type=float, default=1., help="How many GPU ram is required? (ratio)") 25 | parser.add_argument("-no_thread", type=int, default=2, help="No thread to load batch") 26 | 27 | args = parser.parse_args() 28 | 29 | 30 | 31 | # define image properties 32 | if args.subtract_mean: 33 | channel_mean = np.array([123.68, 116.779, 103.939]) 34 | else: 35 | channel_mean = None 36 | 37 | source_name = 'image' 38 | image_builder = RawImageBuilder(args.img_dir, 39 | height=args.img_size, 40 | width=args.img_size, 41 | channel=channel_mean) 42 | image_shape=[args.img_size, args.img_size, 3] 43 | 44 | extract_raw( 45 | image_shape=image_shape, 46 | dataset_cstor=CLEVRDataset, 47 | dataset_args={"folder": args.data_dir, "image_builder": image_builder}, 48 | batchifier_cstor=CLEVRBatchifier, 49 | source_name=source_name, 50 | out_dir=args.out_dir, 51 | set_type=args.set_type, 52 | no_threads=args.no_thread, 53 | ) -------------------------------------------------------------------------------- /src/clevr/preprocess_data/test.py: -------------------------------------------------------------------------------- 1 | from multiprocessing.pool import ThreadPool 2 | from tqdm import tqdm 3 | 4 | from generic.data_provider.image_loader import h5FeatureBuilder 5 | from generic.data_provider.iterator import Iterator 6 | from generic.data_provider.nlp_utils import DummyTokenizer 7 | 8 | from clevr.data_provider.clevr_dataset import CLEVRDataset 9 | from clevr.data_provider.clevr_batchifier import CLEVRBatchifier 10 | 11 | if __name__ == "__main__": 12 | 13 | feat_dir = "/media/datas2/tmp" 14 | data_dir = "/home/sequel/fstrub/clevr_data" 15 | 16 | image_builder = h5FeatureBuilder(img_dir=feat_dir, bufferize=False) 17 | 18 | print("Load datasets...") 19 | dataset = CLEVRDataset(folder=data_dir, which_set="val", image_builder=image_builder) 20 | 21 | cpu_pool = ThreadPool(1) 22 | 23 | dummy_tokenizer = DummyTokenizer() 24 | 25 | batchifier = CLEVRBatchifier(tokenizer=dummy_tokenizer) 26 | iterator = Iterator(dataset, 27 | batch_size=64, 28 | pool=cpu_pool, 29 | batchifier=batchifier) 30 | 31 | for batch in tqdm(iterator): 32 | pass 33 | 34 | print("Done!") 35 | -------------------------------------------------------------------------------- /src/clevr/train/train_clevr.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | import tensorflow as tf 5 | from distutils.util import strtobool 6 | 7 | from generic.data_provider.iterator import Iterator 8 | from generic.tf_utils.evaluator import Evaluator 9 | from generic.tf_utils.optimizer import create_optimizer 10 | from generic.tf_utils.ckpt_loader import create_resnet_saver 11 | from generic.utils.config import load_config 12 | from generic.utils.thread_pool import create_cpu_pool
13 | from generic.data_provider.image_loader import get_img_builder 14 | 15 | from clevr.data_provider.clevr_tokenizer import CLEVRTokenizer 16 | from clevr.data_provider.clevr_dataset import CLEVRDataset 17 | from clevr.data_provider.clevr_batchifier import CLEVRBatchifier 18 | from clevr.models.network_factory import create_network 19 | 20 | 21 | if __name__ == '__main__': 22 | 23 | ############################### 24 | # LOAD CONFIG 25 | ############################# 26 | 27 | parser = argparse.ArgumentParser('CLEVR network baseline!') 28 | 29 | parser.add_argument("-data_dir", type=str, help="Directory with data") 30 | parser.add_argument("-out_dir", type=str, help="Directory in which experiments are stored") 31 | parser.add_argument("-config", type=str, help='Config file') 32 | parser.add_argument("-dict_file", type=str, default="dict.json", help="Dictionary file name") 33 | parser.add_argument("-img_dir", type=str, help='Directory with images') 34 | parser.add_argument("-load_checkpoint", type=str, help="Load model parameters from specified checkpoint") 35 | parser.add_argument("-continue_exp", type=lambda x: bool(strtobool(x)), default="False", help="Continue previously started experiment?") 36 | parser.add_argument("-gpu_ratio", type=float, default=0.95, help="How many GPU ram is required? (ratio)") 37 | parser.add_argument("-no_thread", type=int, default=2, help="No thread to load batch") 38 | parser.add_argument("-no_games_to_load", type=int, default=float("inf"), help="No games to use during training Default : all") 39 | 40 | args = parser.parse_args() 41 | 42 | config, xp_manager = load_config(args) 43 | logger = logging.getLogger() 44 | 45 | # Load config 46 | finetune = config["model"]["image"].get('finetune', list()) 47 | batch_size = config['optimizer']['batch_size'] 48 | no_epoch = config["optimizer"]["no_epoch"] 49 | 50 | ############################### 51 | # LOAD DATA 52 | ############################# 53 | 54 | # Load image 55 | image_builder, crop_builder = None, None 56 | logger.info('Loading images..') 57 | image_builder = get_img_builder(config['model']['image'], args.img_dir) 58 | use_resnet = image_builder.is_raw_image() 59 | 60 | # Load data 61 | logger.info('Loading data..') 62 | trainset = CLEVRDataset(args.data_dir, "train", image_builder, args.no_games_to_load) 63 | validset = CLEVRDataset(args.data_dir, "val", image_builder, args.no_games_to_load) 64 | testset = CLEVRDataset(args.data_dir, "test", image_builder, args.no_games_to_load) 65 | 66 | # Load dictionary 67 | logger.info('Loading dictionary..') 68 | tokenizer = CLEVRTokenizer(args.dict_file) 69 | 70 | # Build Network 71 | logger.info('Building network..') 72 | network = create_network(config["model"], 73 | num_words=tokenizer.no_words, 74 | num_answers=tokenizer.no_answers) 75 | 76 | # Build Optimizer 77 | logger.info('Building optimizer..') 78 | optimizer, outputs = create_optimizer(network, config["optimizer"], finetune=finetune) 79 | 80 | ############################### 81 | # START TRAINING 82 | ############################# 83 | 84 | # create a saver to store/load checkpoint 85 | saver = tf.train.Saver() 86 | resnet_saver = None 87 | 88 | # Retrieve only resnet variables 89 | if use_resnet: 90 | resnet_saver = create_resnet_saver([network]) 91 | 92 | # CPU/GPU option 93 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_ratio) 94 | 95 | with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, allow_soft_placement=True)) as sess: 96 | 97 | sources = 
network.get_sources(sess) 98 | logger.info("Sources: " + ', '.join(sources)) 99 | 100 | sess.run(tf.global_variables_initializer()) 101 | if use_resnet: 102 | resnet_version = config['model']["image"]['resnet_version'] 103 | resnet_saver.restore(sess, os.path.join(args.data_dir, 'resnet_v1_{}.ckpt'.format(resnet_version))) 104 | 105 | # Note: do not re-run the global initializer here; it would overwrite the restored ResNet weights 106 | if args.continue_exp or args.load_checkpoint is not None: 107 | start_epoch = xp_manager.load_checkpoint(sess, saver) 108 | else: 109 | start_epoch = 0 110 | 111 | # create training tools 112 | evaluator = Evaluator(sources, network.scope_name, network=network, tokenizer=tokenizer) 113 | batchifier = CLEVRBatchifier(tokenizer) 114 | xp_manager.configure_score_tracking("valid_accuracy", max_is_best=True) 115 | 116 | for t in range(start_epoch, no_epoch): 117 | logger.info('Epoch {}..'.format(t + 1)) 118 | 119 | # Create cpu pools (at each iteration otherwise threads may become zombie - python bug) 120 | cpu_pool = create_cpu_pool(args.no_thread, use_process=image_builder.require_multiprocess()) 121 | 122 | train_iterator = Iterator(trainset, 123 | batch_size=batch_size, pool=cpu_pool, 124 | batchifier=batchifier, 125 | shuffle=True) 126 | train_loss, train_accuracy = evaluator.process(sess, train_iterator, outputs=outputs + [optimizer]) 127 | 128 | valid_iterator = Iterator(validset, pool=cpu_pool, 129 | batch_size=batch_size*2, 130 | batchifier=batchifier, 131 | shuffle=False) 132 | valid_loss, valid_accuracy = evaluator.process(sess, valid_iterator, outputs=outputs) 133 | 134 | logger.info("Training loss : {}".format(train_loss)) 135 | logger.info("Training accuracy : {}".format(train_accuracy)) 136 | logger.info("Validation loss : {}".format(valid_loss)) 137 | logger.info("Validation accuracy: {}".format(valid_accuracy)) 138 | 139 | xp_manager.save_checkpoint(sess, saver, 140 | epoch=t, 141 | losses=dict( 142 | train_accuracy=train_accuracy, 143 | valid_accuracy=valid_accuracy, 144 | train_loss=train_loss, 145 | valid_loss=valid_loss, 146 | )) 147 | 148 | # Reload the best checkpoint (early stopping) 149 | xp_manager.load_checkpoint(sess, saver, load_best=True) 150 | cpu_pool = create_cpu_pool(args.no_thread, use_process=image_builder.require_multiprocess()) 151 | 152 | # Create Listener 153 | test_iterator = Iterator(testset, pool=cpu_pool, 154 | batch_size=batch_size*2, 155 | batchifier=batchifier, 156 | shuffle=False) 157 | 158 | # TODO: create listener to dump test results 159 | # evaluator.process(sess, test_iterator, [], listener=None) 160 | --------------------------------------------------------------------------------
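The TODO at the end of train_clevr.py leaves the test-set dump unimplemented. Since the CLEVR test split ships without answers, one simple option is to bypass the listener mechanism and decode the network's argmax prediction directly. The helper below is only a sketch (the function name and output file are hypothetical); it relies on the placeholders and the prediction op defined in the network classes above and on the batch keys produced by CLEVRBatchifier, assuming glove is disabled as in the provided configs:

```
# Hypothetical helper (sketch): dump test-set predictions to a JSON file.
import json

def dump_predictions(sess, network, iterator, tokenizer, out_file="test_predictions.json"):
    results = []
    for batch in iterator:
        feed = {
            network._question: batch["question"],
            network._seq_length: batch["seq_length"],
            network._image: batch["image"],
            network._is_training: False,
        }
        predicted = sess.run(network.prediction, feed_dict=feed)
        for game, answer_id in zip(batch["raw"], predicted):
            results.append({"question_index": game.id,
                            "predicted_answer": tokenizer.decode_answer(int(answer_id))})
    with open(out_file, "w") as f:
        json.dump(results, f)

# e.g., after reloading the best checkpoint:
# dump_predictions(sess, network, test_iterator, tokenizer)
```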