├── LICENSE
├── README.md
├── data
│   ├── download_data.py
│   ├── preprocess_image.py
│   └── preprocess_text.py
├── figures
│   ├── examples.png
│   └── model.png
├── layers.py
├── license.txt
├── poster.pdf
├── run.py
├── sparse_graph_model.py
├── torch_dataset.py
└── utils.py

/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Learning Conditioned Graph Structures for Interpretable Visual Question Answering
2 | 
3 | This code provides a PyTorch implementation of our graph learning method for Visual Question Answering, as described in [Learning Conditioned Graph Structures for Interpretable Visual Question Answering](https://arxiv.org/abs/1806.07243).
4 | 
5 | ### Model diagram
6 | ![](./figures/model.png)
7 | ### Examples of learned graph structures
8 | ![](./figures/examples.png)
9 | 
10 | ## Getting Started
11 | 
12 | ### Reference
13 | 
14 | If you use our code or any of the ideas from our paper, please cite:
15 | ```
16 | @article{learningconditionedgraph,
17 |     author = {Will Norcliffe-Brown and Efstathios Vafeias and Sarah Parisot},
18 |     title = {Learning Conditioned Graph Structures for Interpretable Visual Question Answering},
19 |     journal = {arXiv preprint arXiv:1806.07243},
20 |     year = {2018}
21 | }
22 | ```
23 | 
24 | ### Requirements
25 | 
26 | - [pytorch (0.3.1) (with CUDA)](https://pytorch.org/)
27 | - [zarr (2.2.0)](https://github.com/zarr-developers/zarr)
28 | - [tqdm](https://github.com/tqdm/tqdm)
29 | - [spacy](https://spacy.io/usage/)
30 | 
31 | ### Data
32 | 
33 | To download and unzip the required datasets, change to the data folder and run
34 | ```
35 | $ cd data; python download_data.py
36 | ```
37 | 
38 | To preprocess the image and text data, run the following commands (set `--data` to `trainval` or `test` for preprocess_image.py, and to `train`, `val` and/or `test` for preprocess_text.py, depending on which split you want to preprocess):
39 | ```
40 | $ python preprocess_image.py --data trainval; python preprocess_text.py --data train
41 | ```
42 | ### Pretrained model
43 | If you would like a pretrained model, one can be found here: [example model](https://drive.google.com/file/d/1nBwZIy8SPbV2bqGYYA97uCHnybDqTjRa/view?usp=sharing). This model achieved 66.2% accuracy on the test set.
44 | 
45 | 
46 | ### Training
47 | 
48 | To train a model on the training set with our default parameters, run
49 | ```
50 | $ python run.py --train
51 | ```
52 | and to train a model on the training and validation sets for evaluation on the test set, run
53 | ```
54 | $ python run.py --trainval
55 | ```
56 | Models can be validated via
57 | ```
58 | $ python run.py --eval --model_path path_to_your_model
59 | ```
60 | and a JSON file of test-set predictions can be produced with
61 | ```
62 | $ python run.py --test --model_path path_to_your_model
63 | ```
64 | To reproduce our results, train a model on the trainval set with the default parameters,
65 | run the test script and evaluate the resulting JSON on the [EvalAI website](https://evalai.cloudcv.org/).
66 | 
67 | ## Authors
68 | 
69 | * **Will Norcliffe-Brown**
70 | * **Sarah Parisot**
71 | * **Stathis Vafeias**
72 | 
73 | 
74 | ## License
75 | 
76 | This project is licensed under the Apache 2.0 license - see [Apache license](license.txt).
77 | 
78 | ## Acknowledgements
79 | 
80 | Our code is based on this implementation of the 2017 VQA challenge winner: [https://github.com/markdtw/vqa-winner-cvprw-2017](https://github.com/markdtw/vqa-winner-cvprw-2017)
81 | 
--------------------------------------------------------------------------------
/data/download_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 AimBrain Ltd.
2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | # download input questions (training, validation and test sets) 18 | os.system( 19 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip -P zip/') 20 | os.system( 21 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip -P zip/') 22 | os.system( 23 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip -P zip/') 24 | 25 | # download annotations (training and validation sets) 26 | os.system( 27 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip -P zip/') 28 | os.system( 29 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip -P zip/') 30 | 31 | # download pre-trained glove embeddings 32 | os.system('wget http://nlp.stanford.edu/data/glove.6B.zip -P zip/') 33 | 34 | # download rcnn extracted features (may take a while, both very large files) 35 | os.system( 36 | 'wget https://imagecaption.blob.core.windows.net/imagecaption/trainval_36.zip -P zip/') 37 | os.system( 38 | 'wget https://imagecaption.blob.core.windows.net/imagecaption/test2015_36.zip -P zip/') 39 | 40 | # extract them 41 | os.system('unzip zip/v2_Questions_Train_mscoco.zip -d raw/') 42 | os.system('unzip zip/v2_Questions_Val_mscoco.zip -d raw/') 43 | os.system('unzip zip/v2_Questions_Test_mscoco.zip -d raw/') 44 | os.system('unzip zip/v2_Annotations_Train_mscoco.zip -d raw/') 45 | os.system('unzip zip/v2_Annotations_Val_mscoco.zip -d raw/') 46 | os.system('unzip zip/glove.6B.zip -d ./') 47 | os.system('unzip zip/trainval_36.zip -d raw/') 48 | os.system('unzip zip/test2015_36.zip -d raw/') -------------------------------------------------------------------------------- /data/preprocess_image.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
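# Illustrative sketch (not part of the original repository): download_data.py
# above shells out to `wget` and `unzip`; on machines without those tools the
# same archives can be fetched with the Python standard library. A minimal
# sketch, assuming Python 3; the URL is one of those listed in download_data.py
# and the zip/ and raw/ directories follow its layout.
import os
import zipfile
import urllib.request

def fetch_and_extract(url, zip_dir='zip', out_dir='raw'):
    # download the archive once, then extract it into out_dir
    os.makedirs(zip_dir, exist_ok=True)
    os.makedirs(out_dir, exist_ok=True)
    archive = os.path.join(zip_dir, url.split('/')[-1])
    if not os.path.exists(archive):
        urllib.request.urlretrieve(url, archive)
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(out_dir)

# e.g. fetch_and_extract('https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip')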
14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | 19 | import os 20 | import argparse 21 | import base64 22 | import numpy as np 23 | import csv 24 | import sys 25 | import h5py 26 | import pandas as pd 27 | import zarr 28 | from tqdm import tqdm 29 | 30 | 31 | csv.field_size_limit(sys.maxsize) 32 | 33 | 34 | def features_to_zarr(phase): 35 | FIELDNAMES = ['image_id', 'image_w', 'image_h', 36 | 'num_boxes', 'boxes', 'features'] 37 | 38 | if phase == 'trainval': 39 | infiles = [ 40 | 'raw/trainval_36/trainval_resnet101_faster_rcnn_genome_36.tsv', 41 | ] 42 | elif phase == 'test': 43 | infiles = [ 44 | 'raw/test2015_36/test2015_resnet101_faster_rcnn_genome_36.tsv', 45 | ] 46 | else: 47 | raise SystemExit('Unrecognised phase') 48 | 49 | # Read the tsv and append to files 50 | boxes = zarr.open_group(phase + '_boxes.zarr', mode='w') 51 | features = zarr.open_group(phase + '.zarr', mode='w') 52 | image_size = {} 53 | for infile in infiles: 54 | with open(infile, "r") as tsv_in_file: 55 | reader = csv.DictReader( 56 | tsv_in_file, delimiter='\t', fieldnames=FIELDNAMES) 57 | print('Converting ' + infile + ' to zarr...') 58 | for item in tqdm(reader): 59 | item['image_id'] = str(item['image_id']) 60 | item['image_h'] = int(item['image_h']) 61 | item['image_w'] = int(item['image_w']) 62 | item['num_boxes'] = int(item['num_boxes']) 63 | for field in ['boxes', 'features']: 64 | encoded_str = base64.decodestring( 65 | item[field].encode('utf-8')) 66 | item[field] = np.frombuffer(encoded_str, 67 | dtype=np.float32).reshape((item['num_boxes'], -1)) 68 | # append to zarr files 69 | boxes.create_dataset(item['image_id'], data=item['boxes']) 70 | features.create_dataset(item['image_id'], data=item['features']) 71 | # image_size dict 72 | image_size[item['image_id']] = { 73 | 'image_h':item['image_h'], 74 | 'image_w':item['image_w'], 75 | } 76 | 77 | 78 | # convert dict to pandas dataframe 79 | 80 | 81 | # create image sizes csv 82 | print('Writing image sizes csv...') 83 | df = pd.DataFrame.from_dict(image_size) 84 | df = df.transpose() 85 | d = df.to_dict() 86 | dw = d['image_w'] 87 | dh = d['image_h'] 88 | d = [dw, dh] 89 | dwh = {} 90 | for k in dw.keys(): 91 | dwh[k] = np.array([d0[k] for d0 in d]) 92 | image_sizes = pd.DataFrame(dwh) 93 | image_sizes.to_csv(phase + '_image_size.csv') 94 | 95 | 96 | if __name__ == '__main__': 97 | parser = argparse.ArgumentParser( 98 | description='Preprocessing for VQA v2 image data') 99 | parser.add_argument('--data', nargs='+', help='trainval, and/or test, list of data phases to be processed', required=True) 100 | args, unparsed = parser.parse_known_args() 101 | if len(unparsed) != 0: 102 | raise SystemExit('Unknown argument: {}'.format(unparsed)) 103 | 104 | phase_list = args.data 105 | 106 | for phase in phase_list: 107 | # First download and extract 108 | 109 | if not os.path.exists(phase + '.zarr'): 110 | print('Converting features tsv to zarr file...') 111 | features_to_zarr(phase) 112 | 113 | print('Done') 114 | -------------------------------------------------------------------------------- /data/preprocess_text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
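# Illustrative sketch (not part of the original repository): how the zarr groups
# written by features_to_zarr() above can be read back. Assumes
# preprocess_image.py has already been run for the 'trainval' phase from the
# data/ directory.
import zarr

features = zarr.open_group('trainval.zarr', mode='r')      # per-image (num_boxes, feat_dim) arrays
boxes = zarr.open_group('trainval_boxes.zarr', mode='r')   # per-image (num_boxes, 4) box coordinates

image_id = list(features.keys())[0]   # keys are COCO image ids stored as strings
feats = features[image_id][:]         # numpy array, e.g. (36, 2048) for the 36-box RCNN features
bbox = boxes[image_id][:]             # numpy array, e.g. (36, 4)
print(image_id, feats.shape, bbox.shape)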
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | 19 | import os 20 | import json 21 | import collections 22 | import argparse 23 | import string 24 | from tqdm import tqdm 25 | from spacy.tokenizer import Tokenizer 26 | import en_core_web_sm 27 | 28 | try: 29 | import cPickle as pickle 30 | except: 31 | import pickle 32 | 33 | nlp = en_core_web_sm.load() 34 | tokenizer = Tokenizer(nlp.vocab) 35 | exclude = set(string.punctuation) 36 | 37 | 38 | def process_answers(q, phase, n_answers=3000): 39 | 40 | # find the n_answers most common answers 41 | counts = {} 42 | for row in q: 43 | counts[row['answer']] = counts.get(row['answer'], 0) + 1 44 | 45 | cw = sorted([(count, w) for w, count in counts.items()], reverse=True) 46 | 47 | vocab = [w for c, w in cw[:n_answers]] 48 | 49 | # a 0-indexed vocabulary translation table 50 | itow = {i: w for i, w in enumerate(vocab)} 51 | wtoi = {w: i for i, w in enumerate(vocab)} # inverse table 52 | pickle.dump({'itow': itow, 'wtoi': wtoi}, open(phase + '_a_dict.p', 'wb')) 53 | 54 | for row in q: 55 | accepted_answers = 0 56 | for w, c in row['answers']: 57 | if w in vocab: 58 | accepted_answers += c 59 | 60 | answers_scores = [] 61 | for w, c in row['answers']: 62 | if w in vocab: 63 | answers_scores.append((w, c / accepted_answers)) 64 | 65 | row['answers_w_scores'] = answers_scores 66 | 67 | json.dump(q, open('vqa_' + phase + '_final_3000.json', 'w')) 68 | 69 | 70 | def process_questions(q): 71 | # build question dictionary 72 | def build_vocab(questions): 73 | count_thr = 0 74 | # count up the number of times a word is used 75 | counts = {} 76 | for row in questions: 77 | for word in row['question_toked']: 78 | counts[word] = counts.get(word, 0) + 1 79 | cw = sorted([(count, w) for w, count in counts.items()], reverse=True) 80 | print('top words and their counts:') 81 | print('\n'.join(map(str, cw[:10]))) 82 | 83 | # print some stats 84 | total_words = sum(counts.values()) 85 | print('total words:', total_words) 86 | bad_words = [w for w, n in counts.items() if n <= count_thr] 87 | vocab = [w for w, n in counts.items() if n > count_thr] 88 | bad_count = sum(counts[w] for w in bad_words) 89 | print('number of bad words: %d/%d = %.2f%%' % 90 | (len(bad_words), len(counts), len(bad_words)*100.0/len(counts))) 91 | print('number of words in vocab would be %d' % (len(vocab), )) 92 | print('number of UNKs: %d/%d = %.2f%%' % 93 | (bad_count, total_words, bad_count*100.0/total_words)) 94 | 95 | return vocab 96 | 97 | vocab = build_vocab(q) 98 | # a 1-indexed vocab translation table 99 | itow = {i+1: w for i, w in enumerate(vocab)} 100 | wtoi = {w: i+1 for i, w in enumerate(vocab)} # inverse table 101 | pickle.dump({'itow': itow, 'wtoi': wtoi}, open(phase + '_q_dict.p', 'wb')) 102 | 103 | 104 | def tokenize_questions(qa, phase): 105 | qas = len(qa) 106 | for i, row in enumerate(tqdm(qa)): 107 | row['question_toked'] = [t.text if '?' 
not in t.text else t.text[:-1] 108 | for t in tokenizer(row['question'].lower())] # get spacey tokens and remove question marks 109 | if i == qas - 1: 110 | json.dump(qa, open('vqa_' + phase + '_toked.json', 'w')) 111 | 112 | 113 | def combine_qa(questions, annotations, phase): 114 | # Combine questions and answers in the same json file 115 | # 443757 questions 116 | data = [] 117 | for i, q in enumerate(tqdm(questions['questions'])): 118 | row = {} 119 | # load questions info 120 | row['question'] = q['question'] 121 | row['question_id'] = q['question_id'] 122 | row['image_id'] = str(q['image_id']) 123 | 124 | # load answers 125 | assert q['question_id'] == annotations[i]['question_id'] 126 | row['answer'] = annotations[i]['multiple_choice_answer'] 127 | 128 | answers = [] 129 | for ans in annotations[i]['answers']: 130 | answers.append(ans['answer']) 131 | row['answers'] = collections.Counter(answers).most_common() 132 | 133 | data.append(row) 134 | 135 | json.dump(data, open('vqa_' + phase + '_combined.json', 'w')) 136 | 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser( 140 | description='Preprocessing for VQA v2 text data') 141 | parser.add_argument('--data', nargs='+', help='train, val and/or test, list of data phases to be processed', required=True) 142 | parser.add_argument('--nanswers', default=3000, help='number of top answers to consider for classification.') 143 | args, unparsed = parser.parse_known_args() 144 | if len(unparsed) != 0: 145 | raise SystemExit('Unknown argument: {}'.format(unparsed)) 146 | 147 | phase_list = args.data 148 | 149 | for phase in phase_list: 150 | 151 | print('processing ' + phase + ' data') 152 | if phase != 'test': 153 | # Combine Q and A 154 | print('Combining question and answer...') 155 | question = json.load( 156 | open('raw/v2_OpenEnded_mscoco_' + phase + '2014_questions.json')) 157 | answers = json.load(open('raw/v2_mscoco_' + phase + '2014_annotations.json')) 158 | combine_qa(question, answers['annotations'], phase) 159 | 160 | # Tokenize 161 | print('Tokenizing...') 162 | t = json.load(open('vqa_' + phase + '_combined.json')) 163 | tokenize_questions(t, phase) 164 | else: 165 | print ('Tokenizing...') 166 | t = json.load(open('raw/v2_OpenEnded_mscoco_' + phase + '2015_questions.json')) 167 | t = t['questions'] 168 | tokenize_questions(t, phase) 169 | 170 | # Build dictionary for question and answers 171 | print('Building dictionary...') 172 | t = json.load(open('vqa_' + phase + '_toked.json')) 173 | if phase == 'train': 174 | process_questions(t) 175 | if phase != 'test': 176 | process_answers(t, phase, n_answers=args.nanswers) 177 | 178 | print('Done') 179 | -------------------------------------------------------------------------------- /figures/examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aimbrain/vqa-project/341122a267293017b55db4f033fbe81445af03ea/figures/examples.png -------------------------------------------------------------------------------- /figures/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aimbrain/vqa-project/341122a267293017b55db4f033fbe81445af03ea/figures/model.png -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 
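# Illustrative sketch (not part of the original repository): the record layout
# produced by preprocess_text.py above. After combine_qa, tokenize_questions and
# process_answers, each entry of vqa_train_final_3000.json looks roughly like the
# dict below (ids and counts are made up; scores are vote counts divided by the
# total count of in-vocabulary answers).
example_row = {
    'question': 'What color is the dog?',
    'question_id': 458752000,                                 # hypothetical id
    'image_id': '458752',                                     # stored as a string
    'question_toked': ['what', 'color', 'is', 'the', 'dog'],  # lower-cased, '?' stripped
    'answer': 'brown',                                        # the multiple_choice_answer
    'answers': [('brown', 7), ('tan', 2), ('beige', 1)],      # 10 annotator answers, counted
    'answers_w_scores': [('brown', 0.7), ('tan', 0.2), ('beige', 0.1)],
}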
2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import numpy as np 17 | 18 | from torch.nn.parameter import Parameter 19 | from torch.nn.modules.module import Module 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | 23 | 24 | class NeighbourhoodGraphConvolution(Module): 25 | ''' 26 | Implementation of: https://arxiv.org/pdf/1611.08402.pdf where we consider 27 | a fixed sized neighbourhood of nodes for each feature 28 | ''' 29 | 30 | def __init__(self, 31 | in_feat_dim, 32 | out_feat_dim, 33 | n_kernels, 34 | coordinate_dim, 35 | bias=False): 36 | super(NeighbourhoodGraphConvolution, self).__init__() 37 | ''' 38 | ## Variables: 39 | - in_feat_dim: dimensionality of input features 40 | - out_feat_dim: dimensionality of output features 41 | - n_kernels: number of Gaussian kernels to use 42 | - coordinate_dim : dimensionality of the pseudo coordinates 43 | - bias: whether to add a bias to convolutional kernels 44 | ''' 45 | 46 | # Set parameters 47 | self.n_kernels = n_kernels 48 | self.coordinate_dim = coordinate_dim 49 | self.in_feat_dim = in_feat_dim 50 | self.out_feat_dim = out_feat_dim 51 | self.bias = bias 52 | 53 | # Convolution filters weights 54 | self.conv_weights = nn.ModuleList([nn.Linear( 55 | in_feat_dim, out_feat_dim//n_kernels, bias=bias) for i in range(n_kernels)]) 56 | 57 | # Parameters of the Gaussian kernels 58 | self.mean_rho = Parameter(torch.Tensor(n_kernels, 1)) 59 | self.mean_theta = Parameter(torch.Tensor(n_kernels, 1)) 60 | self.precision_rho = Parameter(torch.Tensor(n_kernels, 1)) 61 | self.precision_theta = Parameter(torch.Tensor(n_kernels, 1)) 62 | 63 | self.init_parameters() 64 | 65 | def init_parameters(self): 66 | # Initialise Gaussian parameters 67 | self.mean_theta.data.uniform_(-np.pi, np.pi) 68 | self.mean_rho.data.uniform_(0, 1.0) 69 | self.precision_theta.data.uniform_(0.0, 1.0) 70 | self.precision_rho.data.uniform_(0.0, 1.0) 71 | 72 | def forward(self, neighbourhood_features, neighbourhood_pseudo_coord): 73 | ''' 74 | ## Inputs: 75 | - neighbourhood_features (batch_size, K, neighbourhood_size, in_feat_dim) 76 | - neighbourhood_pseudo_coord (batch_size, K, neighbourhood_size, coordinate_dim) 77 | ## Returns: 78 | - convolved_features (batch_size, K, neighbourhood_size, out_feat_dim) 79 | ''' 80 | 81 | # set parameters 82 | batch_size = neighbourhood_features.size(0) 83 | K = neighbourhood_features.size(1) 84 | neighbourhood_size = neighbourhood_features.size(2) 85 | 86 | # compute pseudo coordinate kernel weights 87 | weights = self.get_gaussian_weights(neighbourhood_pseudo_coord) 88 | weights = weights.view( 89 | batch_size*K, neighbourhood_size, self.n_kernels) 90 | 91 | # compute convolved features 92 | neighbourhood_features = neighbourhood_features.view( 93 | batch_size*K, neighbourhood_size, -1) 94 | convolved_features = self.convolution(neighbourhood_features, weights) 95 | convolved_features = convolved_features.view(-1, K, self.out_feat_dim) 96 | 97 | return 
convolved_features 98 | 99 | def get_gaussian_weights(self, pseudo_coord): 100 | ''' 101 | ## Inputs: 102 | - pseudo_coord (batch_size, K, K, pseudo_coord_dim) 103 | ## Returns: 104 | - weights (batch_size*K, neighbourhood_size, n_kernels) 105 | ''' 106 | 107 | # compute rho weights 108 | diff = (pseudo_coord[:, :, :, 0].contiguous().view(-1, 1) - self.mean_rho.view(1, -1))**2 109 | weights_rho = torch.exp(-0.5 * diff / 110 | (1e-14 + self.precision_rho.view(1, -1)**2)) 111 | 112 | # compute theta weights 113 | first_angle = torch.abs(pseudo_coord[:, :, :, 1].contiguous().view(-1, 1) - self.mean_theta.view(1, -1)) 114 | second_angle = torch.abs(2 * np.pi - first_angle) 115 | weights_theta = torch.exp(-0.5 * (torch.min(first_angle, second_angle)**2) 116 | / (1e-14 + self.precision_theta.view(1, -1)**2)) 117 | 118 | weights = weights_rho * weights_theta 119 | weights[(weights != weights).detach()] = 0 120 | 121 | # normalise weights 122 | weights = weights / torch.sum(weights, dim=1, keepdim=True) 123 | 124 | return weights 125 | 126 | def convolution(self, neighbourhood, weights): 127 | ''' 128 | ## Inputs: 129 | - neighbourhood (batch_size*K, neighbourhood_size, in_feat_dim) 130 | - weights (batch_size*K, neighbourhood_size, n_kernels) 131 | ## Returns: 132 | - convolved_features (batch_size*K, out_feat_dim) 133 | ''' 134 | # patch operator 135 | weighted_neighbourhood = torch.bmm( 136 | weights.transpose(1, 2), neighbourhood) 137 | 138 | # convolutions 139 | weighted_neighbourhood = [self.conv_weights[i](weighted_neighbourhood[:, i]) for i in range(self.n_kernels)] 140 | convolved_features = torch.cat([i.unsqueeze(1) for i in weighted_neighbourhood], dim=1) 141 | convolved_features = convolved_features.view(-1, self.out_feat_dim) 142 | 143 | return convolved_features 144 | 145 | 146 | class GraphLearner(Module): 147 | def __init__(self, in_feature_dim, combined_feature_dim, K, dropout=0.0): 148 | super(GraphLearner, self).__init__() 149 | 150 | ''' 151 | ## Variables: 152 | - in_feature_dim: dimensionality of input features 153 | - combined_feature_dim: dimensionality of the joint hidden embedding 154 | - K: number of graph nodes/objects on the image 155 | ''' 156 | 157 | # Parameters 158 | self.in_dim = in_feature_dim 159 | self.combined_dim = combined_feature_dim 160 | self.K = K 161 | 162 | # Embedding layers 163 | self.edge_layer_1 = nn.Linear(in_feature_dim, 164 | combined_feature_dim) 165 | self.edge_layer_2 = nn.Linear(combined_feature_dim, 166 | combined_feature_dim) 167 | 168 | # Regularisation 169 | self.dropout = nn.Dropout(p=dropout) 170 | self.edge_layer_1 = nn.utils.weight_norm(self.edge_layer_1) 171 | self.edge_layer_2 = nn.utils.weight_norm(self.edge_layer_2) 172 | 173 | def forward(self, graph_nodes): 174 | ''' 175 | ## Inputs: 176 | - graph_nodes (batch_size, K, in_feat_dim): input features 177 | ## Returns: 178 | - adjacency matrix (batch_size, K, K) 179 | ''' 180 | 181 | graph_nodes = graph_nodes.view(-1, self.in_dim) 182 | 183 | # layer 1 184 | h = self.edge_layer_1(graph_nodes) 185 | h = F.relu(h) 186 | 187 | # layer 2 188 | h = self.edge_layer_2(h) 189 | h = F.relu(h) 190 | 191 | # outer product 192 | h = h.view(-1, self.K, self.combined_dim) 193 | adjacency_matrix = torch.matmul(h, h.transpose(1, 2)) 194 | 195 | return adjacency_matrix 196 | 197 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 
2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aimbrain/vqa-project/341122a267293017b55db4f033fbe81445af03ea/poster.pdf -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | import os 19 | import json 20 | import argparse 21 | from tqdm import tqdm 22 | 23 | import torch 24 | import torch.nn as nn 25 | from torch.utils.data import DataLoader 26 | from torch.utils.data.sampler import RandomSampler 27 | from torch.optim.lr_scheduler import MultiStepLR 28 | 29 | from sparse_graph_model import Model 30 | from torch_dataset import * 31 | from utils import * 32 | 33 | def eval_model(args): 34 | 35 | """ 36 | Computes the VQA accuracy over the validation set 37 | using a pre-trained model 38 | """ 39 | 40 | # Check that the model path is accurate 41 | if args.model_path and os.path.isfile(args.model_path): 42 | print('Resuming from checkpoint %s' % (args.model_path)) 43 | else: 44 | raise SystemExit('Need to provide model path.') 45 | 46 | # Set random seed 47 | torch.manual_seed(1000) 48 | if torch.cuda.is_available(): 49 | torch.cuda.manual_seed(1000) 50 | else: 51 | raise SystemExit('No CUDA available, script requires cuda') 52 | 53 | # Load the validation set 54 | print('Loading data') 55 | dataset = VQA_Dataset(args.data_dir, args.emb, train=False) 56 | loader = DataLoader(dataset, batch_size=args.bsize, 57 | shuffle=False, num_workers=5, 58 | collate_fn=collate_fn) 59 | 60 | # Print data and model parameters 61 | print('Parameters:\n\t' 62 | 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' 63 | '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, 64 | dataset.feat_dim, 65 | args.hid, 66 | dataset.n_answers)) 67 | # Define the model 68 | model = Model(vocab_size=dataset.q_words, 69 | emb_dim=args.emb, 70 | feat_dim=dataset.feat_dim, 71 | hid_dim=args.hid, 72 | out_dim=dataset.n_answers, 73 | dropout=args.dropout, 74 | pretrained_wemb=dataset.pretrained_wemb, 75 | neighbourhood_size=args.neighbourhood_size) 76 | 77 | # move to CUDA 78 | model = model.cuda() 79 | 80 | # Restore pre-trained model 81 | ckpt = torch.load(args.model_path) 82 | model.load_state_dict(ckpt['state_dict']) 83 | model.train(False) 84 | 85 | # Compute accuracy 86 | result = [] 87 | correct = 0 88 | for step, next_batch in tqdm(enumerate(loader)): 89 | # move batch to cuda 90 | q_batch, _, vote_batch, i_batch, k_batch, qlen_batch = \ 91 | batch_to_cuda(next_batch, volatile=True) 92 | 93 | # get predictions 94 | output, _ = model(q_batch, 
i_batch, k_batch, qlen_batch) 95 | qid_batch = next_batch[3] 96 | _, oix = output.data.max(1) 97 | # record predictions 98 | for i, qid in enumerate(qid_batch): 99 | result.append({ 100 | 'question_id': int(qid.numpy()), 101 | 'answer': dataset.a_itow[oix[i]] 102 | }) 103 | # compute batch accuracy 104 | correct += total_vqa_score(output, vote_batch) 105 | 106 | # compute and print average accuracy 107 | acc = correct/dataset.n_questions*100 108 | print("accuracy: {} %".format(acc)) 109 | 110 | # save predictions 111 | json.dump(result, open('result.json', 'w')) 112 | print('Validation done') 113 | 114 | def train(args): 115 | 116 | """ 117 | Train a VQA model using the training set 118 | """ 119 | 120 | # set random seed 121 | torch.manual_seed(1000) 122 | if torch.cuda.is_available(): 123 | torch.cuda.manual_seed(1000) 124 | else: 125 | raise SystemExit('No CUDA available, script requires cuda') 126 | 127 | # Load the VQA training set 128 | print('Loading data') 129 | dataset = VQA_Dataset(args.data_dir, args.emb) 130 | loader = DataLoader(dataset, batch_size=args.bsize, 131 | shuffle=True, num_workers=5, collate_fn=collate_fn) 132 | 133 | # Load the VQA validation set 134 | dataset_test = VQA_Dataset(args.data_dir, args.emb, train=False) 135 | test_sampler = RandomSampler(dataset_test) 136 | loader_test = iter(DataLoader(dataset_test, 137 | batch_size=args.bsize, 138 | sampler=test_sampler, 139 | shuffle=False, 140 | num_workers=4, 141 | collate_fn=collate_fn)) 142 | 143 | n_batches = len(dataset)//args.bsize 144 | 145 | # Print data and model parameters 146 | print('Parameters:\n\t' 147 | 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' 148 | '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, 149 | dataset.feat_dim, 150 | args.hid, 151 | dataset.n_answers)) 152 | print('Initializing model') 153 | 154 | model = Model(vocab_size=dataset.q_words, 155 | emb_dim=args.emb, 156 | feat_dim=dataset.feat_dim, 157 | hid_dim=args.hid, 158 | out_dim=dataset.n_answers, 159 | dropout=args.dropout, 160 | neighbourhood_size=args.neighbourhood_size, 161 | pretrained_wemb=dataset.pretrained_wemb) 162 | 163 | criterion = nn.MultiLabelSoftMarginLoss() 164 | 165 | # Move it to GPU 166 | model = model.cuda() 167 | criterion = criterion.cuda() 168 | 169 | # Define the optimiser 170 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 171 | 172 | # Continue training from saved model 173 | start_ep = 0 174 | if args.model_path and os.path.isfile(args.model_path): 175 | print('Resuming from checkpoint %s' % (args.model_path)) 176 | ckpt = torch.load(args.model_path) 177 | start_ep = ckpt['epoch'] 178 | model.load_state_dict(ckpt['state_dict']) 179 | optimizer.load_state_dict(ckpt['optimizer']) 180 | 181 | # Update the learning rate 182 | for param_group in optimizer.param_groups: 183 | param_group['lr'] = args.lr 184 | 185 | # Learning rate scheduler 186 | scheduler = MultiStepLR(optimizer, milestones=[30], gamma=0.5) 187 | scheduler.last_epoch = start_ep - 1 188 | 189 | # Train iterations 190 | print('Start training.') 191 | for ep in range(start_ep, start_ep+args.ep): 192 | 193 | scheduler.step() 194 | ep_loss = 0.0 195 | ep_correct = 0.0 196 | ave_loss = 0.0 197 | ave_correct = 0.0 198 | losses = [] 199 | 200 | for step, next_batch in tqdm(enumerate(loader)): 201 | 202 | model.train() 203 | # Move batch to cuda 204 | q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ 205 | batch_to_cuda(next_batch) 206 | 207 | # forward pass 208 | output, adjacency_matrix = 
model( 209 | q_batch, i_batch, k_batch, qlen_batch) 210 | 211 | loss = criterion(output, a_batch) 212 | 213 | # Compute batch accuracy based on vqa evaluation 214 | correct = total_vqa_score(output, vote_batch) 215 | ep_correct += correct 216 | ep_loss += loss.data[0] 217 | ave_correct += correct 218 | ave_loss += loss.data[0] 219 | losses.append(loss.cpu().data[0]) 220 | 221 | # This is a 40 step average 222 | if step % 40 == 0 and step != 0: 223 | print(' Epoch %02d(%03d/%03d), ave loss: %.7f, ave accuracy: %.2f%%' % 224 | (ep+1, step, n_batches, ave_loss/40, 225 | ave_correct * 100 / (args.bsize*40))) 226 | 227 | ave_correct = 0 228 | ave_loss = 0 229 | 230 | # Compute gradient and do optimisation step 231 | optimizer.zero_grad() 232 | loss.backward() 233 | optimizer.step() 234 | 235 | # save model and compute validation accuracy every 400 steps 236 | if step % 400 == 0: 237 | epoch_loss = ep_loss / n_batches 238 | epoch_acc = ep_correct * 100 / (n_batches * args.bsize) 239 | 240 | save(model, optimizer, ep, epoch_loss, epoch_acc, 241 | dir=args.save_dir, name=args.name+'_'+str(ep+1)) 242 | 243 | # compute validation accuracy over a small subset of the validation set 244 | test_correct = 0 245 | model.train(False) 246 | 247 | for i in range(10): 248 | test_batch = next(loader_test) 249 | q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ 250 | batch_to_cuda(test_batch, volatile=True) 251 | output, _ = model(q_batch, i_batch, k_batch, qlen_batch) 252 | test_correct += total_vqa_score(output, vote_batch) 253 | 254 | model.train(True) 255 | acc = test_correct/(10*args.bsize)*100 256 | print("Validation accuracy: {:.2f} %".format(acc)) 257 | 258 | # save model and compute accuracy for epoch 259 | epoch_loss = ep_loss / n_batches 260 | epoch_acc = ep_correct * 100 / (n_batches * args.bsize) 261 | 262 | save(model, optimizer, ep, epoch_loss, epoch_acc, 263 | dir=args.save_dir, name=args.name+'_'+str(ep+1)) 264 | 265 | print('Epoch %02d done, average loss: %.3f, average accuracy: %.2f%%' % ( 266 | ep+1, epoch_loss, epoch_acc)) 267 | 268 | def test(args): 269 | 270 | """ 271 | Creates a result.json for predictions on 272 | the test set 273 | """ 274 | # Check that the model path is accurate 275 | if args.model_path and os.path.isfile(args.model_path): 276 | print('Resuming from checkpoint %s' % (args.model_path)) 277 | else: 278 | raise SystemExit('Need to provide model path.') 279 | 280 | torch.manual_seed(1000) 281 | if torch.cuda.is_available(): 282 | torch.cuda.manual_seed(1000) 283 | else: 284 | raise SystemExit('No CUDA available, script requires CUDA') 285 | 286 | print('Loading data') 287 | dataset = VQA_Dataset_Test(args.data_dir, args.emb, train=False) 288 | loader = DataLoader(dataset, batch_size=args.bsize, 289 | shuffle=False, num_workers=5, 290 | collate_fn=collate_fn) 291 | 292 | # Print data and model parameters 293 | print('Parameters:\n\t' 294 | 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' 295 | '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, 296 | dataset.feat_dim, 297 | args.hid, 298 | dataset.n_answers)) 299 | 300 | # Define model 301 | model = Model(vocab_size=dataset.q_words, 302 | emb_dim=args.emb, 303 | feat_dim=dataset.feat_dim, 304 | hid_dim=args.hid, 305 | out_dim=dataset.n_answers, 306 | dropout=args.dropout, 307 | pretrained_wemb=dataset.pretrained_wemb, 308 | neighbourhood_size=args.neighbourhood_size) 309 | 310 | # move to CUDA 311 | model = model.cuda() 312 | 313 | # Restore pre-trained model 314 | ckpt = 
torch.load(args.model_path) 315 | model.load_state_dict(ckpt['state_dict']) 316 | model.train(False) 317 | 318 | result = [] 319 | for step, next_batch in tqdm(enumerate(loader)): 320 | # Batch preparation 321 | q_batch, _, _, i_batch, k_batch, qlen_batch = \ 322 | batch_to_cuda(next_batch, volatile=True) 323 | 324 | # get predictions 325 | output, _ = model(q_batch, i_batch, k_batch, qlen_batch) 326 | qid_batch = next_batch[3] 327 | _, oix = output.data.max(1) 328 | # record predictions 329 | for i, qid in enumerate(qid_batch): 330 | result.append({ 331 | 'question_id': int(qid.numpy()), 332 | 'answer': dataset.a_itow[oix[i]] 333 | }) 334 | 335 | json.dump(result, open('result.json', 'w')) 336 | print('Testing done') 337 | 338 | def trainval(args): 339 | 340 | """ 341 | Train a VQA model using the training + validation set 342 | """ 343 | 344 | # set random seed 345 | torch.manual_seed(1000) 346 | if torch.cuda.is_available(): 347 | torch.cuda.manual_seed(1000) 348 | else: 349 | raise SystemExit('No CUDA available, script requires CUDA.') 350 | 351 | # load train+val sets for training 352 | print ('Loading data') 353 | dataset = VQA_Dataset_Test(args.data_dir, args.emb) 354 | loader = DataLoader(dataset, batch_size=args.bsize, 355 | shuffle=True, num_workers=5, 356 | collate_fn=collate_fn) 357 | n_batches = len(dataset)//args.bsize 358 | 359 | # Print data and model parameters 360 | print ('Parameters:\n\tvocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d\ 361 | \n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, dataset.feat_dim, 362 | args.hid, dataset.n_answers)) 363 | print ('Initializing model') 364 | 365 | model = Model(vocab_size=dataset.q_words, 366 | emb_dim=args.emb, 367 | feat_dim=dataset.feat_dim, 368 | hid_dim=args.hid, 369 | out_dim=dataset.n_answers, 370 | dropout=args.dropout, 371 | neighbourhood_size=args.neighbourhood_size, 372 | pretrained_wemb=dataset.pretrained_wemb) 373 | 374 | criterion = nn.MultiLabelSoftMarginLoss() 375 | 376 | # Move it to GPU 377 | model = model.cuda() 378 | criterion = criterion.cuda() 379 | 380 | # Define the optimizer 381 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 382 | 383 | # Continue training from saved model 384 | start_ep = 0 385 | if args.model_path and os.path.isfile(args.model_path): 386 | print ('Resuming from checkpoint %s' % (args.model_path)) 387 | ckpt = torch.load(args.model_path) 388 | start_ep = ckpt['epoch'] 389 | model.load_state_dict(ckpt['state_dict']) 390 | optimizer.load_state_dict(ckpt['optimizer']) 391 | 392 | # ensure you can load with new lr 393 | for param_group in optimizer.param_groups: 394 | param_group['lr'] = args.lr 395 | 396 | # learner rate scheduler 397 | scheduler = MultiStepLR(optimizer, milestones=[30], gamma=0.5) 398 | scheduler.last_epoch = start_ep - 1 399 | 400 | # Training script 401 | print ('Start training.') 402 | for ep in range(start_ep, start_ep+args.ep): 403 | scheduler.step() 404 | ep_loss = 0.0 405 | ep_correct = 0.0 406 | ave_loss = 0.0 407 | ave_correct = 0.0 408 | losses = [] 409 | for step, next_batch in tqdm(enumerate(loader)): 410 | model.train() 411 | # batch to gpu 412 | q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ 413 | batch_to_cuda(next_batch) 414 | 415 | # Do model forward 416 | output, adjacency_matrix = model( 417 | q_batch, i_batch, k_batch, qlen_batch) 418 | 419 | loss = criterion(output, a_batch) 420 | 421 | # compute accuracy based on vqa evaluation 422 | correct = total_vqa_score(output, vote_batch) 423 | 
            ep_correct += correct
            ep_loss += loss.data[0]
            ave_correct += correct
            ave_loss += loss.data[0]
            losses.append(loss.cpu().data[0])
            # This is a 40 step average
            if step % 40 == 0 and step != 0:
                print(' Epoch %02d(%03d/%03d), ave loss: %.7f, ave accuracy: %.2f%%' %
                      (ep+1, step, n_batches, ave_loss/40,
                       ave_correct * 100 / (args.bsize*40)))

                ave_correct = 0
                ave_loss = 0

            # compute gradient and do optim step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # save model and compute accuracy for epoch
        epoch_loss = ep_loss / n_batches
        epoch_acc = ep_correct * 100 / (n_batches * args.bsize)

        save(model, optimizer, ep, epoch_loss, epoch_acc,
             dir=args.save_dir, name=args.name+'_'+str(ep+1))

        print('Epoch %02d done, average loss: %.3f, average accuracy: %.2f%%' % (
            ep+1, epoch_loss, epoch_acc))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Conditional Graph Convolutions for VQA')
    parser.add_argument('--train', action='store_true',
                        help='set this to training mode.')
    parser.add_argument('--trainval', action='store_true',
                        help='set this to train+val mode.')
    parser.add_argument('--eval', action='store_true',
                        help='set this to evaluation mode.')
    parser.add_argument('--test', action='store_true',
                        help='set this to test mode.')
    parser.add_argument('--lr', metavar='', type=float,
                        default=1e-4, help='initial learning rate')
    parser.add_argument('--ep', metavar='', type=int,
                        default=40, help='number of epochs.')
    parser.add_argument('--bsize', metavar='', type=int,
                        default=64, help='batch size.')
    parser.add_argument('--hid', metavar='', type=int,
                        default=1024, help='hidden dimension')
    parser.add_argument('--emb', metavar='', type=int, default=300,
                        help='question embedding dimension')
    parser.add_argument('--neighbourhood_size', metavar='', type=int, default=16,
                        help='number of graph neighbours to consider')
    parser.add_argument('--data_dir', metavar='', type=str, default='./data',
                        help='path to data directory')
    parser.add_argument('--save_dir', metavar='', type=str, default='./save',
                        help='path to checkpoint directory')
    parser.add_argument('--name', metavar='', type=str,
                        default='model', help='model name')
    parser.add_argument('--dropout', metavar='', type=float, default=0.5,
                        help='probability of dropping out FC nodes during training')
    parser.add_argument('--model_path', metavar='', type=str,
                        help='trained model path.')
    args, unparsed = parser.parse_known_args()
    if len(unparsed) != 0:
        raise SystemExit('Unknown argument: {}'.format(unparsed))

    if args.train:
        train(args)
    if args.trainval:
        trainval(args)
    if args.eval:
        eval_model(args)
    if args.test:
        test(args)
    if not args.train and not args.eval and not args.trainval and not args.test:
        parser.print_help()
--------------------------------------------------------------------------------
/sparse_graph_model.py:
--------------------------------------------------------------------------------
# Copyright 2018 AimBrain Ltd.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | from torch.nn.utils.rnn import pack_padded_sequence 23 | 24 | from layers import NeighbourhoodGraphConvolution as GraphConvolution 25 | from layers import GraphLearner 26 | 27 | 28 | class Model(nn.Module): 29 | 30 | def __init__(self, 31 | vocab_size, 32 | emb_dim, 33 | feat_dim, 34 | hid_dim, 35 | out_dim, 36 | pretrained_wemb, 37 | dropout, 38 | n_kernels=8, 39 | neighbourhood_size=16): 40 | 41 | ''' 42 | ## Variables: 43 | - vocab_size: dimensionality of input vocabulary 44 | - emb_dim: question embedding size 45 | - feat_dim: dimensionality of input image features 46 | - out_dim: dimensionality of the output 47 | - dropout: dropout probability 48 | - n_kernels : number of Gaussian kernels for convolutions 49 | - bias: whether to add a bias to Gaussian kernels 50 | ''' 51 | 52 | super(Model, self).__init__() 53 | 54 | # Set parameters 55 | self.vocab_size = vocab_size 56 | self.emb_dim = emb_dim 57 | self.feat_dim = feat_dim 58 | self.hid_dim = hid_dim 59 | self.out_dim = out_dim 60 | self.neighbourhood_size = neighbourhood_size 61 | 62 | # initialize word embedding layer weight 63 | self.wembed = nn.Embedding(vocab_size, emb_dim) 64 | self.wembed.weight.data.copy_(torch.from_numpy(pretrained_wemb)) 65 | 66 | # question encoding 67 | self.q_lstm = nn.GRU(input_size=emb_dim, hidden_size=hid_dim) 68 | 69 | # graph learner 70 | self.adjacency_1 = GraphLearner(in_feature_dim=feat_dim + hid_dim, 71 | combined_feature_dim=512, 72 | K=36, 73 | dropout=dropout) 74 | 75 | # dropout layers 76 | self.dropout = nn.Dropout(p=dropout) 77 | self.dropout_q = nn.Dropout(p=dropout/2) 78 | 79 | # graph convolution layers 80 | self.graph_convolution_1 = \ 81 | GraphConvolution(feat_dim, hid_dim * 2, n_kernels, 2) 82 | self.graph_convolution_2 = \ 83 | GraphConvolution(hid_dim * 2, hid_dim, n_kernels, 2) 84 | 85 | # output classifier 86 | self.out_1 = nn.utils.weight_norm(nn.Linear(hid_dim, out_dim)) 87 | self.out_2 = nn.utils.weight_norm(nn.Linear(out_dim, out_dim)) 88 | 89 | def forward(self, question, image, K, qlen): 90 | ''' 91 | ## Inputs: 92 | - question (batch_size, max_qlen): input tokenised question 93 | - image (batch_size, K, feat_dim): input image features 94 | - K (int): number of image features/objects in the image 95 | - qlen (batch_size): vector describing the length (in words) of each input question 96 | ## Returns: 97 | - logits (batch_size, out_dim) 98 | ''' 99 | 100 | K = int(K[0].cpu().data.numpy()) 101 | 102 | # extract bounding boxes and compute centres 103 | bb = image[:, :, -4:].contiguous() 104 | bb_size = (bb[:, :, 2:]-bb[:, :, :2]) 105 | bb_centre = bb[:, :, :2] + 0.5*bb_size 106 | 107 | # apply dropout to image features 108 | image = self.dropout(image) 109 | 110 | # Compute pseudo coordinates 111 | pseudo_coord = self._compute_pseudo(bb_centre) 112 | 113 | # Compute question encoding 114 | emb = self.wembed(question) 115 | 
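        # Note (added comment): pack_padded_sequence below assumes each batch is
        # already sorted by descending question length; collate_fn in
        # torch_dataset.py performs that sort (batch.sort(key=lambda x: x[-1],
        # reverse=True)), so the qlen list passed into forward() is in
        # descending order. emb is (batch_size, max_qlen, emb_dim), hence
        # batch_first=True.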
packed = pack_padded_sequence(emb, qlen, batch_first=True) # questions have variable lengths 116 | _, hid = self.q_lstm(packed) 117 | qenc = hid[0].unsqueeze(1) 118 | qenc_repeat = qenc.repeat(1, K, 1) 119 | 120 | # Learn adjacency matrix 121 | image_qenc_cat = torch.cat((image, qenc_repeat), dim=-1) 122 | adjacency_matrix = self.adjacency_1(image_qenc_cat) 123 | 124 | # Graph convolution 1 125 | neighbourhood_image, neighbourhood_pseudo = self._create_neighbourhood(image, 126 | pseudo_coord, 127 | adjacency_matrix, 128 | self.neighbourhood_size, 129 | weight=True) 130 | hidden_graph_1 = self.graph_convolution_1( 131 | neighbourhood_image, neighbourhood_pseudo) 132 | hidden_graph_1 = F.relu(hidden_graph_1) 133 | hidden_graph_1 = self.dropout(hidden_graph_1) 134 | 135 | # graph convolution 2 136 | hidden_graph_1, neighbourhood_pseudo = self._create_neighbourhood(hidden_graph_1, 137 | pseudo_coord, 138 | adjacency_matrix, 139 | self.neighbourhood_size, 140 | weight=False) 141 | hidden_graph_2 = self.graph_convolution_2( 142 | hidden_graph_1, neighbourhood_pseudo) 143 | hidden_graph_2 = F.relu(hidden_graph_2) 144 | 145 | hidden_graph_2, _ = torch.max(hidden_graph_2, dim=1) 146 | h = F.relu(qenc).squeeze(1)*hidden_graph_2 147 | 148 | # Output classifier 149 | hidden_1 = self.out_1(h) 150 | hidden_1 = F.relu(hidden_1) 151 | hidden_1 = self.dropout(hidden_1) 152 | logits = self.out_2(hidden_1) 153 | 154 | return logits, adjacency_matrix 155 | 156 | def _create_neighbourhood_feat(self, image, top_ind): 157 | ''' 158 | ## Inputs: 159 | - image (batch_size, K, feat_dim) 160 | - top_ind (batch_size, K, neighbourhood_size) 161 | ## Returns: 162 | - neighbourhood_image (batch_size, K, neighbourhood_size, feat_dim) 163 | ''' 164 | 165 | batch_size = image.size(0) 166 | K = image.size(1) 167 | feat_dim = image.size(2) 168 | neighbourhood_size = top_ind.size(-1) 169 | image = image.unsqueeze(1).expand(batch_size, K, K, feat_dim) 170 | idx = top_ind.unsqueeze(-1).expand(batch_size, 171 | K, neighbourhood_size, feat_dim) 172 | return torch.gather(image, dim=2, index=idx) 173 | 174 | def _create_neighbourhood_pseudo(self, pseudo, top_ind): 175 | ''' 176 | ## Inputs: 177 | - pseudo_coord (batch_size, K, K, coord_dim) 178 | - top_ind (batch_size, K, neighbourhood_size) 179 | ## Returns: 180 | - neighbourhood_pseudo (batch_size, K, neighbourhood_size, coord_dim) 181 | ''' 182 | batch_size = pseudo.size(0) 183 | K = pseudo.size(1) 184 | coord_dim = pseudo.size(3) 185 | neighbourhood_size = top_ind.size(-1) 186 | idx = top_ind.unsqueeze(-1).expand(batch_size, 187 | K, neighbourhood_size, coord_dim) 188 | return torch.gather(pseudo, dim=2, index=idx) 189 | 190 | def _create_neighbourhood(self, 191 | features, 192 | pseudo_coord, 193 | adjacency_matrix, 194 | neighbourhood_size, 195 | weight=True): 196 | 197 | ''' 198 | 199 | Creates a neighbourhood system for each graph node/image object 200 | 201 | ## Inputs: 202 | - features (batch_size, K, feat_dim): input image features 203 | - pseudo_coord (batch_size, K, K, coord_dim): pseudo coordinates for graph convolutions 204 | - adjacency_matrix (batch_size, K, K): learned adjacency matrix 205 | - neighbourhood_size (int) 206 | - weight (bool): specify if the features should be weighted by the adjacency matrix values 207 | 208 | ## Returns: 209 | - neighbourhood_image (batch_size, K, neighbourhood_size, feat_dim) 210 | - neighbourhood_pseudo (batch_size, K, neighbourhood_size, coord_dim) 211 | ''' 212 | 213 | # Number of graph nodes 214 | K = features.size(1) 215 | 
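        # Note (added comment): adjacency_matrix is (batch_size, K, K); row i
        # scores how relevant every other object is to object i, conditioned on
        # the question (it was computed from image features concatenated with
        # the question encoding). Below, only the neighbourhood_size strongest
        # entries per row are kept and softmax-normalised, so each node attends
        # to a small weighted neighbourhood rather than a dense graph.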
        # extract top k neighbours for each node and normalise
        top_k, top_ind = torch.topk(
            adjacency_matrix, k=neighbourhood_size, dim=-1, sorted=False)
        top_k = torch.stack([F.softmax(top_k[:, k]) for k in range(K)]).transpose(0, 1)  # (batch_size, K, neighbourhood_size)

        # extract top k features and pseudo coordinates
        neighbourhood_image = \
            self._create_neighbourhood_feat(features, top_ind)
        neighbourhood_pseudo = \
            self._create_neighbourhood_pseudo(pseudo_coord, top_ind)

        # weight neighbourhood features with graph edge weights
        if weight:
            neighbourhood_image = top_k.unsqueeze(-1)*neighbourhood_image

        return neighbourhood_image, neighbourhood_pseudo

    def _compute_pseudo(self, bb_centre):
        '''
        Computes pseudo-coordinates from bounding box centre coordinates

        ## Inputs:
        - bb_centre (batch_size, K, coord_dim): bounding box centres
        ## Returns:
        - pseudo_coord (batch_size, K, K, coord_dim): pairwise offsets in polar form
        '''

        K = bb_centre.size(1)

        # Compute cartesian coordinates (batch_size, K, K, 2)
        pseudo_coord = bb_centre.view(-1, K, 1, 2) - \
            bb_centre.view(-1, 1, K, 2)

        # Convert to polar coordinates
        rho = torch.sqrt(
            pseudo_coord[:, :, :, 0]**2 + pseudo_coord[:, :, :, 1]**2)
        theta = torch.atan2(
            pseudo_coord[:, :, :, 0], pseudo_coord[:, :, :, 1])
        pseudo_coord = torch.cat(
            (torch.unsqueeze(rho, -1), torch.unsqueeze(theta, -1)), dim=-1)

        return pseudo_coord
--------------------------------------------------------------------------------
/torch_dataset.py:
--------------------------------------------------------------------------------
# Copyright 2018 AimBrain Ltd.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
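
# Note (added comment): judging from the loaders below, the dataset classes in
# this module expect the following files under data_dir:
#   train_q_dict.p, train_a_dict.p                        question/answer vocabularies
#   trainval.zarr, trainval_boxes.zarr                    image features and bounding boxes
#   trainval_image_size.csv                               image sizes used to normalise boxes
#   test.zarr, test_boxes.zarr, test_image_size.csv       test-set equivalents
#   vqa_train_final_3000.json, vqa_val_final_3000.json,
#   vqa_test_toked.json                                   tokenised questions and answers
#   glove.6B.<emb_dim>d.txt                               pre-trained GloVe vectors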
14 | 15 | from __future__ import absolute_import, division, print_function 16 | 17 | import os 18 | import json 19 | import numpy as np 20 | import zarr 21 | import pandas as pd 22 | from torch.utils.data import Dataset 23 | from torch.utils.data import dataloader 24 | 25 | try: 26 | import cPickle as pickle 27 | except: 28 | import pickle as pickle 29 | 30 | 31 | def collate_fn(batch): 32 | # put question lengths in descending order so that we can use packed sequences later 33 | batch.sort(key=lambda x: x[-1], reverse=True) 34 | return dataloader.default_collate(batch) 35 | 36 | 37 | class VQA_Dataset(Dataset): 38 | 39 | def __init__(self, data_dir, emb_dim=300, train=True): 40 | 41 | # Set parameters 42 | self.data_dir = data_dir # directory where the data is stored 43 | self.emb_dim = emb_dim # question embedding dimension 44 | self.train = train # train (True) or eval (False) mode 45 | self.seqlen = 14 # maximum question sequence length 46 | 47 | # Load training question dictionary 48 | q_dict = pickle.load( 49 | open(os.path.join(data_dir, 'train_q_dict.p'), 'rb')) 50 | self.q_itow = q_dict['itow'] 51 | self.q_wtoi = q_dict['wtoi'] 52 | self.q_words = len(self.q_itow) + 1 53 | 54 | # Load training answer dictionary 55 | a_dict = pickle.load( 56 | open(os.path.join(data_dir, 'train_a_dict.p'), 'rb')) 57 | self.a_itow = a_dict['itow'] 58 | self.a_wtoi = a_dict['wtoi'] 59 | self.n_answers = len(self.a_itow) + 1 60 | 61 | # Load image features and bounding boxes 62 | self.i_feat = zarr.open(os.path.join( 63 | data_dir, 'trainval.zarr'), mode='r') 64 | self.bbox = zarr.open(os.path.join( 65 | data_dir, 'trainval_boxes.zarr'), mode='r') 66 | self.sizes = pd.read_csv(os.path.join( 67 | data_dir, 'trainval_image_size.csv')) 68 | 69 | # Load questions 70 | if train: 71 | self.vqa = json.load( 72 | open(os.path.join(data_dir, 'vqa_train_final_3000.json'))) 73 | else: 74 | self.vqa = json.load( 75 | open(os.path.join(data_dir, 'vqa_val_final_3000.json'))) 76 | 77 | self.n_questions = len(self.vqa) 78 | 79 | print('Loading done') 80 | self.feat_dim = self.i_feat[list(self.i_feat.keys())[ 81 | 0]].shape[1] + 4 # + bbox 82 | self.init_pretrained_wemb(emb_dim) 83 | 84 | def init_pretrained_wemb(self, emb_dim): 85 | """ 86 | From blog.keras.io 87 | Initialises words embeddings with pre-trained GLOVE embeddings 88 | """ 89 | embeddings_index = {} 90 | f = open(os.path.join(self.data_dir, 'glove.6B.') + 91 | str(emb_dim) + 'd.txt') 92 | for line in f: 93 | values = line.split() 94 | word = values[0] 95 | coefs = np.asarray(values[1:], dtype=np.float32) 96 | embeddings_index[word] = coefs 97 | f.close() 98 | 99 | embedding_mat = np.zeros((self.q_words, emb_dim), dtype=np.float32) 100 | for word, i in self.q_wtoi.items(): 101 | embedding_v = embeddings_index.get(word) 102 | if embedding_v is not None: 103 | embedding_mat[i] = embedding_v 104 | 105 | self.pretrained_wemb = embedding_mat 106 | 107 | def __len__(self): 108 | return self.n_questions 109 | 110 | def __getitem__(self, idx): 111 | 112 | # question sample 113 | qlen = len(self.vqa[idx]['question_toked']) 114 | q = [0] * 100 115 | for i, w in enumerate(self.vqa[idx]['question_toked']): 116 | try: 117 | q[i] = self.q_wtoi[w] 118 | except: 119 | q[i] = 0 # validation questions may contain unseen word 120 | 121 | # soft label answers 122 | a = np.zeros(self.n_answers, dtype=np.float32) 123 | for w, c in self.vqa[idx]['answers_w_scores']: 124 | try: 125 | a[self.a_wtoi[w]] = c 126 | except: 127 | continue 128 | 129 | # number of votes for each answer 
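        # (added comment: the raw vote counts are kept separately from the soft
        # scores above because the VQA metric credits a prediction with
        # min(#annotators who gave that answer / 3, 1); total_vqa_score in
        # utils.py consumes these counts)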
130 | n_votes = np.zeros(self.n_answers, dtype=np.float32) 131 | for w, c in self.vqa[idx]['answers']: 132 | try: 133 | n_votes[self.a_wtoi[w]] = c 134 | except: 135 | continue 136 | 137 | # id of the question 138 | qid = self.vqa[idx]['question_id'] 139 | 140 | # image sample 141 | iid = self.vqa[idx]['image_id'] 142 | img = self.i_feat[str(iid)] 143 | bboxes = np.asarray(self.bbox[str(iid)]) 144 | imsize = self.sizes[str(iid)] 145 | 146 | if np.logical_not(np.isfinite(img)).sum() > 0: 147 | raise ValueError 148 | 149 | # number of image objects 150 | k = 36 151 | 152 | # scale bounding boxes by image dimensions 153 | for i in range(k): 154 | bbox = bboxes[i] 155 | bbox[0] /= imsize[0] 156 | bbox[1] /= imsize[1] 157 | bbox[2] /= imsize[0] 158 | bbox[3] /= imsize[1] 159 | bboxes[i] = bbox 160 | 161 | # format variables 162 | q = np.asarray(q) 163 | a = np.asarray(a).reshape(-1) 164 | n_votes = np.asarray(n_votes).reshape(-1) 165 | qid = np.asarray(qid).reshape(-1) 166 | i = np.concatenate([img, bboxes], axis=1) 167 | k = np.asarray(k).reshape(1) 168 | 169 | return q, a, n_votes, qid, i, k, qlen 170 | 171 | 172 | class VQA_Dataset_Test(Dataset): 173 | 174 | def __init__(self, data_dir, emb_dim=300, train=True): 175 | self.data_dir = data_dir 176 | self.emb_dim = emb_dim 177 | self.train = train 178 | self.seqlen = 14 # hard set based on paper 179 | 180 | q_dict = pickle.load( 181 | open(os.path.join(data_dir, 'train_q_dict.p'), 'rb')) 182 | self.q_itow = q_dict['itow'] 183 | self.q_wtoi = q_dict['wtoi'] 184 | self.q_words = len(self.q_itow) + 1 185 | 186 | a_dict = pickle.load( 187 | open(os.path.join(data_dir, 'train_a_dict.p'), 'rb')) 188 | self.a_itow = a_dict['itow'] 189 | self.a_wtoi = a_dict['wtoi'] 190 | self.n_answers = len(self.a_itow) + 1 191 | 192 | if train: 193 | self.vqa = json.load(open(os.path.join(data_dir, 'vqa_train_final_3000.json'))) + \ 194 | json.load( 195 | open(os.path.join(data_dir, 'vqa_val_final_3000.json'))) 196 | self.i_feat = zarr.open(os.path.join( 197 | data_dir, 'trainval.zarr'), mode='r') 198 | self.bbox = zarr.open(os.path.join( 199 | data_dir, 'trainval_boxes.zarr'), mode='r') 200 | self.sizes = pd.read_csv(os.path.join( 201 | data_dir, 'trainval_image_size.csv')) 202 | else: 203 | self.vqa = json.load( 204 | open(os.path.join(data_dir, 'vqa_test_toked.json'))) 205 | self.i_feat = zarr.open(os.path.join( 206 | data_dir, 'test.zarr'), mode='r') 207 | self.bbox = zarr.open(os.path.join( 208 | data_dir, 'test_boxes.zarr'), mode='r') 209 | self.sizes = pd.read_csv(os.path.join( 210 | data_dir, 'test_image_size.csv')) 211 | 212 | self.n_questions = len(self.vqa) 213 | 214 | print('Loading done') 215 | self.feat_dim = self.i_feat[list(self.i_feat.keys())[ 216 | 0]].shape[1] + 4 # + bbox 217 | self.init_pretrained_wemb(emb_dim) 218 | 219 | def init_pretrained_wemb(self, emb_dim): 220 | """From blog.keras.io""" 221 | embeddings_index = {} 222 | f = open(os.path.join(self.data_dir, 'glove.6B.') + 223 | str(emb_dim) + 'd.txt') 224 | for line in f: 225 | values = line.split() 226 | word = values[0] 227 | coefs = np.asarray(values[1:], dtype=np.float32) 228 | embeddings_index[word] = coefs 229 | f.close() 230 | 231 | embedding_mat = np.zeros((self.q_words, emb_dim), dtype=np.float32) 232 | for word, i in self.q_wtoi.items(): 233 | embedding_v = embeddings_index.get(word) 234 | if embedding_v is not None: 235 | embedding_mat[i] = embedding_v 236 | 237 | self.pretrained_wemb = embedding_mat 238 | 239 | def __len__(self): 240 | return self.n_questions 241 | 242 | def 
__getitem__(self, idx): 243 | 244 | # question sample 245 | qlen = len(self.vqa[idx]['question_toked']) 246 | q = [0] * 100 247 | for i, w in enumerate(self.vqa[idx]['question_toked']): 248 | try: 249 | q[i] = self.q_wtoi[w] 250 | except: 251 | q[i] = 0 # validation questions may contain unseen word 252 | 253 | # soft label answers 254 | if self.train: 255 | a = np.zeros(self.n_answers, dtype=np.float32) 256 | for w, c in self.vqa[idx]['answers_w_scores']: 257 | try: 258 | a[self.a_wtoi[w]] = c 259 | except: 260 | continue 261 | a = np.asarray(a).reshape(-1) 262 | else: 263 | # return 0's for unknown test set answers 264 | a = 0 265 | 266 | # votes 267 | if self.train: 268 | n_votes = np.zeros(self.n_answers, dtype=np.float32) 269 | for w, c in self.vqa[idx]['answers']: 270 | try: 271 | n_votes[self.a_wtoi[w]] = c 272 | except: 273 | continue 274 | n_votes = np.asarray(n_votes).reshape(-1) 275 | else: 276 | # return 0's for unknown test set answers 277 | n_votes = 0 278 | 279 | # id of the question 280 | qid = self.vqa[idx]['question_id'] 281 | 282 | # image sample 283 | iid = self.vqa[idx]['image_id'] 284 | img = self.i_feat[str(iid)] 285 | bboxes = np.asarray(self.bbox[str(iid)]) 286 | imsize = self.sizes[str(iid)] 287 | 288 | if np.logical_not(np.isfinite(img)).sum() > 0: 289 | raise ValueError 290 | 291 | # k sample 292 | k = 36 293 | 294 | # scale bounding boxes by image dimensions 295 | for i in range(k): 296 | bbox = bboxes[i] 297 | bbox[0] /= imsize[0] 298 | bbox[1] /= imsize[1] 299 | bbox[2] /= imsize[0] 300 | bbox[3] /= imsize[1] 301 | bboxes[i] = bbox 302 | 303 | # format 304 | q = np.asarray(q) 305 | qid = np.asarray(qid).reshape(-1) 306 | i = np.concatenate([img, bboxes], axis=1) 307 | k = np.asarray(k).reshape(1) 308 | 309 | return q, a, n_votes, qid, i, k, qlen 310 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
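
# Note (added comment): helpers shared by run.py. A dataset item is the tuple
# (q, a, n_votes, qid, i, k, qlen) produced by torch_dataset.py; batch_to_cuda
# below moves everything except qid (index 3) to the GPU, and run.py reads the
# question ids directly from the CPU batch.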
14 | 15 | import os 16 | import torch 17 | from torch.autograd import Variable 18 | 19 | 20 | def batch_to_cuda(batch, volatile=False): 21 | # moves dataset batch on GPU 22 | 23 | q = Variable(batch[0], volatile=volatile, requires_grad=False).cuda() 24 | a = Variable(batch[1], volatile=volatile, requires_grad=False).cuda() 25 | n_votes = Variable(batch[2], volatile=volatile, requires_grad=False).cuda() 26 | i = Variable(batch[4], volatile=volatile, requires_grad=False).cuda() 27 | k = Variable(batch[5], volatile=volatile, requires_grad=False).cuda() 28 | qlen = list(batch[6]) 29 | return q, a, n_votes, i, k, qlen 30 | 31 | 32 | def save(model, optimizer, ep, epoch_loss, epoch_acc, dir, name): 33 | # saves model and optimizer state 34 | 35 | tbs = { 36 | 'epoch': ep + 1, 37 | 'loss': epoch_loss, 38 | 'accuracy': epoch_acc, 39 | 'state_dict': model.state_dict(), 40 | 'optimizer': optimizer.state_dict() 41 | } 42 | torch.save(tbs, os.path.join(dir, name + '.pth.tar')) 43 | 44 | 45 | def total_vqa_score(output_batch, n_votes_batch): 46 | # computes the total vqa score as assessed by the challenge 47 | 48 | vqa_score = 0 49 | _, oix = output_batch.data.max(1) 50 | for i, pred in enumerate(oix): 51 | count = n_votes_batch[i,pred] 52 | vqa_score += min(count.cpu().data[0]/3, 1) 53 | return vqa_score 54 | --------------------------------------------------------------------------------
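
A minimal usage sketch (not part of the repository) for inspecting the result.json file that run.py --test writes; it assumes the test run has finished and result.json is in the working directory:

import json

# result.json is a list of {"question_id": int, "answer": str} records,
# one per test question, as written at the end of test() in run.py.
with open('result.json') as f:
    predictions = json.load(f)

print('%d answers predicted' % len(predictions))
print(predictions[0])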