├── LICENSE
├── README.md
├── data
│   ├── download_data.py
│   ├── preprocess_image.py
│   └── preprocess_text.py
├── figures
│   ├── examples.png
│   └── model.png
├── layers.py
├── license.txt
├── poster.pdf
├── run.py
├── sparse_graph_model.py
├── torch_dataset.py
└── utils.py

/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Learning Conditioned Graph Structures for Interpretable Visual Question Answering
2 | 
3 | This code provides a PyTorch implementation of our graph learning method for Visual Question Answering, as described in [Learning Conditioned Graph Structures for Interpretable Visual Question Answering](https://arxiv.org/abs/1806.07243).
4 | 
5 | ### Model diagram
6 | ![](./figures/model.png)
7 | ### Examples of learned graph structures
8 | ![](./figures/examples.png)
9 | 
10 | ## Getting Started
11 | 
12 | ### Reference
13 | 
14 | If you use our code or any of the ideas from our paper, please cite:
15 | ```
16 | @article{learningconditionedgraph,
17 |     author = {Will Norcliffe-Brown and Efstathios Vafeias and Sarah Parisot},
18 |     title = {Learning Conditioned Graph Structures for Interpretable Visual Question Answering},
19 |     journal = {arXiv preprint arXiv:1806.07243},
20 |     year = {2018}
21 | }
22 | ```
23 | 
24 | ### Requirements
25 | 
26 | - [pytorch (0.3.1) (with CUDA)](https://pytorch.org/)
27 | - [zarr (2.2.0)](https://github.com/zarr-developers/zarr)
28 | - [tqdm](https://github.com/tqdm/tqdm)
29 | - [spacy](https://spacy.io/usage/)
30 | 
31 | ### Data
32 | 
33 | To download and unzip the required datasets, change to the data folder and run
34 | ```
35 | $ cd data; python download_data.py
36 | ```
37 | 
38 | To preprocess the image and text data, run the following commands (set `--data` to `trainval` or `test` for preprocess_image.py, and to `train`, `val` and/or `test` for preprocess_text.py, depending on which split you want to preprocess):
39 | ```
40 | $ python preprocess_image.py --data trainval; python preprocess_text.py --data train
41 | ```
42 | ### Pretrained model
43 | If you would like a pretrained model, one can be found here: [example model](https://drive.google.com/file/d/1nBwZIy8SPbV2bqGYYA97uCHnybDqTjRa/view?usp=sharing). This model achieved 66.2% accuracy on the test set.
44 | 
45 | 
46 | ### Training
47 | 
48 | To train a model on the training set with our default parameters, run
49 | ```
50 | $ python run.py --train
51 | ```
52 | and to train a model on the training and validation sets for evaluation on the test set, run
53 | ```
54 | $ python run.py --trainval
55 | ```
56 | Models can be validated via
57 | ```
58 | $ python run.py --eval --model_path path_to_your_model
59 | ```
60 | and a JSON file of test-set predictions can be produced with
61 | ```
62 | $ python run.py --test --model_path path_to_your_model
63 | ```
64 | To reproduce our results, train a model on the trainval set with the default parameters,
65 | run the test script and evaluate the resulting JSON on the [EvalAI website](https://evalai.cloudcv.org/).
66 | 
67 | ## Authors
68 | 
69 | * **Will Norcliffe-Brown**
70 | * **Sarah Parisot**
71 | * **Stathis Vafeias**
72 | 
73 | 
74 | ## License
75 | 
76 | This project is licensed under the Apache 2.0 license - see [Apache license](license.txt).
77 | 
78 | ## Acknowledgements
79 | 
80 | Our code is based on this implementation of the 2017 VQA challenge winner: [https://github.com/markdtw/vqa-winner-cvprw-2017](https://github.com/markdtw/vqa-winner-cvprw-2017)
81 | 
--------------------------------------------------------------------------------
/data/download_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2018 AimBrain Ltd.
2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | # download input questions (training, validation and test sets) 18 | os.system( 19 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip -P zip/') 20 | os.system( 21 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip -P zip/') 22 | os.system( 23 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip -P zip/') 24 | 25 | # download annotations (training and validation sets) 26 | os.system( 27 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip -P zip/') 28 | os.system( 29 | 'wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip -P zip/') 30 | 31 | # download pre-trained glove embeddings 32 | os.system('wget http://nlp.stanford.edu/data/glove.6B.zip -P zip/') 33 | 34 | # download rcnn extracted features (may take a while, both very large files) 35 | os.system( 36 | 'wget https://imagecaption.blob.core.windows.net/imagecaption/trainval_36.zip -P zip/') 37 | os.system( 38 | 'wget https://imagecaption.blob.core.windows.net/imagecaption/test2015_36.zip -P zip/') 39 | 40 | # extract them 41 | os.system('unzip zip/v2_Questions_Train_mscoco.zip -d raw/') 42 | os.system('unzip zip/v2_Questions_Val_mscoco.zip -d raw/') 43 | os.system('unzip zip/v2_Questions_Test_mscoco.zip -d raw/') 44 | os.system('unzip zip/v2_Annotations_Train_mscoco.zip -d raw/') 45 | os.system('unzip zip/v2_Annotations_Val_mscoco.zip -d raw/') 46 | os.system('unzip zip/glove.6B.zip -d ./') 47 | os.system('unzip zip/trainval_36.zip -d raw/') 48 | os.system('unzip zip/test2015_36.zip -d raw/') -------------------------------------------------------------------------------- /data/preprocess_image.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
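# Illustrative sketch (not part of the original repository): download_data.py
# above shells out to `wget` and `unzip`; on machines without those tools the
# same archives can be fetched with the Python standard library. A minimal
# sketch, assuming Python 3; the URL is one of those listed in download_data.py
# and the zip/ and raw/ directories follow its layout.
import os
import zipfile
import urllib.request

def fetch_and_extract(url, zip_dir='zip', out_dir='raw'):
    # download the archive once, then extract it into out_dir
    os.makedirs(zip_dir, exist_ok=True)
    os.makedirs(out_dir, exist_ok=True)
    archive = os.path.join(zip_dir, url.split('/')[-1])
    if not os.path.exists(archive):
        urllib.request.urlretrieve(url, archive)
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(out_dir)

# e.g. fetch_and_extract('https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip')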
14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | 19 | import os 20 | import argparse 21 | import base64 22 | import numpy as np 23 | import csv 24 | import sys 25 | import h5py 26 | import pandas as pd 27 | import zarr 28 | from tqdm import tqdm 29 | 30 | 31 | csv.field_size_limit(sys.maxsize) 32 | 33 | 34 | def features_to_zarr(phase): 35 | FIELDNAMES = ['image_id', 'image_w', 'image_h', 36 | 'num_boxes', 'boxes', 'features'] 37 | 38 | if phase == 'trainval': 39 | infiles = [ 40 | 'raw/trainval_36/trainval_resnet101_faster_rcnn_genome_36.tsv', 41 | ] 42 | elif phase == 'test': 43 | infiles = [ 44 | 'raw/test2015_36/test2015_resnet101_faster_rcnn_genome_36.tsv', 45 | ] 46 | else: 47 | raise SystemExit('Unrecognised phase') 48 | 49 | # Read the tsv and append to files 50 | boxes = zarr.open_group(phase + '_boxes.zarr', mode='w') 51 | features = zarr.open_group(phase + '.zarr', mode='w') 52 | image_size = {} 53 | for infile in infiles: 54 | with open(infile, "r") as tsv_in_file: 55 | reader = csv.DictReader( 56 | tsv_in_file, delimiter='\t', fieldnames=FIELDNAMES) 57 | print('Converting ' + infile + ' to zarr...') 58 | for item in tqdm(reader): 59 | item['image_id'] = str(item['image_id']) 60 | item['image_h'] = int(item['image_h']) 61 | item['image_w'] = int(item['image_w']) 62 | item['num_boxes'] = int(item['num_boxes']) 63 | for field in ['boxes', 'features']: 64 | encoded_str = base64.decodestring( 65 | item[field].encode('utf-8')) 66 | item[field] = np.frombuffer(encoded_str, 67 | dtype=np.float32).reshape((item['num_boxes'], -1)) 68 | # append to zarr files 69 | boxes.create_dataset(item['image_id'], data=item['boxes']) 70 | features.create_dataset(item['image_id'], data=item['features']) 71 | # image_size dict 72 | image_size[item['image_id']] = { 73 | 'image_h':item['image_h'], 74 | 'image_w':item['image_w'], 75 | } 76 | 77 | 78 | # convert dict to pandas dataframe 79 | 80 | 81 | # create image sizes csv 82 | print('Writing image sizes csv...') 83 | df = pd.DataFrame.from_dict(image_size) 84 | df = df.transpose() 85 | d = df.to_dict() 86 | dw = d['image_w'] 87 | dh = d['image_h'] 88 | d = [dw, dh] 89 | dwh = {} 90 | for k in dw.keys(): 91 | dwh[k] = np.array([d0[k] for d0 in d]) 92 | image_sizes = pd.DataFrame(dwh) 93 | image_sizes.to_csv(phase + '_image_size.csv') 94 | 95 | 96 | if __name__ == '__main__': 97 | parser = argparse.ArgumentParser( 98 | description='Preprocessing for VQA v2 image data') 99 | parser.add_argument('--data', nargs='+', help='trainval, and/or test, list of data phases to be processed', required=True) 100 | args, unparsed = parser.parse_known_args() 101 | if len(unparsed) != 0: 102 | raise SystemExit('Unknown argument: {}'.format(unparsed)) 103 | 104 | phase_list = args.data 105 | 106 | for phase in phase_list: 107 | # First download and extract 108 | 109 | if not os.path.exists(phase + '.zarr'): 110 | print('Converting features tsv to zarr file...') 111 | features_to_zarr(phase) 112 | 113 | print('Done') 114 | -------------------------------------------------------------------------------- /data/preprocess_text.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
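# Illustrative sketch (not part of the original repository): how the zarr groups
# written by features_to_zarr() above can be read back. Assumes
# preprocess_image.py has already been run for the 'trainval' phase from the
# data/ directory.
import zarr

features = zarr.open_group('trainval.zarr', mode='r')      # per-image (num_boxes, feat_dim) arrays
boxes = zarr.open_group('trainval_boxes.zarr', mode='r')   # per-image (num_boxes, 4) box coordinates

image_id = list(features.keys())[0]   # keys are COCO image ids stored as strings
feats = features[image_id][:]         # numpy array, e.g. (36, 2048) for the 36-box RCNN features
bbox = boxes[image_id][:]             # numpy array, e.g. (36, 4)
print(image_id, feats.shape, bbox.shape)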
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | 19 | import os 20 | import json 21 | import collections 22 | import argparse 23 | import string 24 | from tqdm import tqdm 25 | from spacy.tokenizer import Tokenizer 26 | import en_core_web_sm 27 | 28 | try: 29 | import cPickle as pickle 30 | except: 31 | import pickle 32 | 33 | nlp = en_core_web_sm.load() 34 | tokenizer = Tokenizer(nlp.vocab) 35 | exclude = set(string.punctuation) 36 | 37 | 38 | def process_answers(q, phase, n_answers=3000): 39 | 40 | # find the n_answers most common answers 41 | counts = {} 42 | for row in q: 43 | counts[row['answer']] = counts.get(row['answer'], 0) + 1 44 | 45 | cw = sorted([(count, w) for w, count in counts.items()], reverse=True) 46 | 47 | vocab = [w for c, w in cw[:n_answers]] 48 | 49 | # a 0-indexed vocabulary translation table 50 | itow = {i: w for i, w in enumerate(vocab)} 51 | wtoi = {w: i for i, w in enumerate(vocab)} # inverse table 52 | pickle.dump({'itow': itow, 'wtoi': wtoi}, open(phase + '_a_dict.p', 'wb')) 53 | 54 | for row in q: 55 | accepted_answers = 0 56 | for w, c in row['answers']: 57 | if w in vocab: 58 | accepted_answers += c 59 | 60 | answers_scores = [] 61 | for w, c in row['answers']: 62 | if w in vocab: 63 | answers_scores.append((w, c / accepted_answers)) 64 | 65 | row['answers_w_scores'] = answers_scores 66 | 67 | json.dump(q, open('vqa_' + phase + '_final_3000.json', 'w')) 68 | 69 | 70 | def process_questions(q): 71 | # build question dictionary 72 | def build_vocab(questions): 73 | count_thr = 0 74 | # count up the number of times a word is used 75 | counts = {} 76 | for row in questions: 77 | for word in row['question_toked']: 78 | counts[word] = counts.get(word, 0) + 1 79 | cw = sorted([(count, w) for w, count in counts.items()], reverse=True) 80 | print('top words and their counts:') 81 | print('\n'.join(map(str, cw[:10]))) 82 | 83 | # print some stats 84 | total_words = sum(counts.values()) 85 | print('total words:', total_words) 86 | bad_words = [w for w, n in counts.items() if n <= count_thr] 87 | vocab = [w for w, n in counts.items() if n > count_thr] 88 | bad_count = sum(counts[w] for w in bad_words) 89 | print('number of bad words: %d/%d = %.2f%%' % 90 | (len(bad_words), len(counts), len(bad_words)*100.0/len(counts))) 91 | print('number of words in vocab would be %d' % (len(vocab), )) 92 | print('number of UNKs: %d/%d = %.2f%%' % 93 | (bad_count, total_words, bad_count*100.0/total_words)) 94 | 95 | return vocab 96 | 97 | vocab = build_vocab(q) 98 | # a 1-indexed vocab translation table 99 | itow = {i+1: w for i, w in enumerate(vocab)} 100 | wtoi = {w: i+1 for i, w in enumerate(vocab)} # inverse table 101 | pickle.dump({'itow': itow, 'wtoi': wtoi}, open(phase + '_q_dict.p', 'wb')) 102 | 103 | 104 | def tokenize_questions(qa, phase): 105 | qas = len(qa) 106 | for i, row in enumerate(tqdm(qa)): 107 | row['question_toked'] = [t.text if '?' 
not in t.text else t.text[:-1] 108 | for t in tokenizer(row['question'].lower())] # get spacey tokens and remove question marks 109 | if i == qas - 1: 110 | json.dump(qa, open('vqa_' + phase + '_toked.json', 'w')) 111 | 112 | 113 | def combine_qa(questions, annotations, phase): 114 | # Combine questions and answers in the same json file 115 | # 443757 questions 116 | data = [] 117 | for i, q in enumerate(tqdm(questions['questions'])): 118 | row = {} 119 | # load questions info 120 | row['question'] = q['question'] 121 | row['question_id'] = q['question_id'] 122 | row['image_id'] = str(q['image_id']) 123 | 124 | # load answers 125 | assert q['question_id'] == annotations[i]['question_id'] 126 | row['answer'] = annotations[i]['multiple_choice_answer'] 127 | 128 | answers = [] 129 | for ans in annotations[i]['answers']: 130 | answers.append(ans['answer']) 131 | row['answers'] = collections.Counter(answers).most_common() 132 | 133 | data.append(row) 134 | 135 | json.dump(data, open('vqa_' + phase + '_combined.json', 'w')) 136 | 137 | 138 | if __name__ == '__main__': 139 | parser = argparse.ArgumentParser( 140 | description='Preprocessing for VQA v2 text data') 141 | parser.add_argument('--data', nargs='+', help='train, val and/or test, list of data phases to be processed', required=True) 142 | parser.add_argument('--nanswers', default=3000, help='number of top answers to consider for classification.') 143 | args, unparsed = parser.parse_known_args() 144 | if len(unparsed) != 0: 145 | raise SystemExit('Unknown argument: {}'.format(unparsed)) 146 | 147 | phase_list = args.data 148 | 149 | for phase in phase_list: 150 | 151 | print('processing ' + phase + ' data') 152 | if phase != 'test': 153 | # Combine Q and A 154 | print('Combining question and answer...') 155 | question = json.load( 156 | open('raw/v2_OpenEnded_mscoco_' + phase + '2014_questions.json')) 157 | answers = json.load(open('raw/v2_mscoco_' + phase + '2014_annotations.json')) 158 | combine_qa(question, answers['annotations'], phase) 159 | 160 | # Tokenize 161 | print('Tokenizing...') 162 | t = json.load(open('vqa_' + phase + '_combined.json')) 163 | tokenize_questions(t, phase) 164 | else: 165 | print ('Tokenizing...') 166 | t = json.load(open('raw/v2_OpenEnded_mscoco_' + phase + '2015_questions.json')) 167 | t = t['questions'] 168 | tokenize_questions(t, phase) 169 | 170 | # Build dictionary for question and answers 171 | print('Building dictionary...') 172 | t = json.load(open('vqa_' + phase + '_toked.json')) 173 | if phase == 'train': 174 | process_questions(t) 175 | if phase != 'test': 176 | process_answers(t, phase, n_answers=args.nanswers) 177 | 178 | print('Done') 179 | -------------------------------------------------------------------------------- /figures/examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aimbrain/vqa-project/341122a267293017b55db4f033fbe81445af03ea/figures/examples.png -------------------------------------------------------------------------------- /figures/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aimbrain/vqa-project/341122a267293017b55db4f033fbe81445af03ea/figures/model.png -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 
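# Illustrative sketch (not part of the original repository): the record layout
# produced by preprocess_text.py above. After combine_qa, tokenize_questions and
# process_answers, each entry of vqa_train_final_3000.json looks roughly like the
# dict below (ids and counts are made up; scores are vote counts divided by the
# total count of in-vocabulary answers).
example_row = {
    'question': 'What color is the dog?',
    'question_id': 458752000,                                 # hypothetical id
    'image_id': '458752',                                     # stored as a string
    'question_toked': ['what', 'color', 'is', 'the', 'dog'],  # lower-cased, '?' stripped
    'answer': 'brown',                                        # the multiple_choice_answer
    'answers': [('brown', 7), ('tan', 2), ('beige', 1)],      # 10 annotator answers, counted
    'answers_w_scores': [('brown', 0.7), ('tan', 0.2), ('beige', 0.1)],
}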
2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import torch 16 | import numpy as np 17 | 18 | from torch.nn.parameter import Parameter 19 | from torch.nn.modules.module import Module 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | 23 | 24 | class NeighbourhoodGraphConvolution(Module): 25 | ''' 26 | Implementation of: https://arxiv.org/pdf/1611.08402.pdf where we consider 27 | a fixed sized neighbourhood of nodes for each feature 28 | ''' 29 | 30 | def __init__(self, 31 | in_feat_dim, 32 | out_feat_dim, 33 | n_kernels, 34 | coordinate_dim, 35 | bias=False): 36 | super(NeighbourhoodGraphConvolution, self).__init__() 37 | ''' 38 | ## Variables: 39 | - in_feat_dim: dimensionality of input features 40 | - out_feat_dim: dimensionality of output features 41 | - n_kernels: number of Gaussian kernels to use 42 | - coordinate_dim : dimensionality of the pseudo coordinates 43 | - bias: whether to add a bias to convolutional kernels 44 | ''' 45 | 46 | # Set parameters 47 | self.n_kernels = n_kernels 48 | self.coordinate_dim = coordinate_dim 49 | self.in_feat_dim = in_feat_dim 50 | self.out_feat_dim = out_feat_dim 51 | self.bias = bias 52 | 53 | # Convolution filters weights 54 | self.conv_weights = nn.ModuleList([nn.Linear( 55 | in_feat_dim, out_feat_dim//n_kernels, bias=bias) for i in range(n_kernels)]) 56 | 57 | # Parameters of the Gaussian kernels 58 | self.mean_rho = Parameter(torch.Tensor(n_kernels, 1)) 59 | self.mean_theta = Parameter(torch.Tensor(n_kernels, 1)) 60 | self.precision_rho = Parameter(torch.Tensor(n_kernels, 1)) 61 | self.precision_theta = Parameter(torch.Tensor(n_kernels, 1)) 62 | 63 | self.init_parameters() 64 | 65 | def init_parameters(self): 66 | # Initialise Gaussian parameters 67 | self.mean_theta.data.uniform_(-np.pi, np.pi) 68 | self.mean_rho.data.uniform_(0, 1.0) 69 | self.precision_theta.data.uniform_(0.0, 1.0) 70 | self.precision_rho.data.uniform_(0.0, 1.0) 71 | 72 | def forward(self, neighbourhood_features, neighbourhood_pseudo_coord): 73 | ''' 74 | ## Inputs: 75 | - neighbourhood_features (batch_size, K, neighbourhood_size, in_feat_dim) 76 | - neighbourhood_pseudo_coord (batch_size, K, neighbourhood_size, coordinate_dim) 77 | ## Returns: 78 | - convolved_features (batch_size, K, neighbourhood_size, out_feat_dim) 79 | ''' 80 | 81 | # set parameters 82 | batch_size = neighbourhood_features.size(0) 83 | K = neighbourhood_features.size(1) 84 | neighbourhood_size = neighbourhood_features.size(2) 85 | 86 | # compute pseudo coordinate kernel weights 87 | weights = self.get_gaussian_weights(neighbourhood_pseudo_coord) 88 | weights = weights.view( 89 | batch_size*K, neighbourhood_size, self.n_kernels) 90 | 91 | # compute convolved features 92 | neighbourhood_features = neighbourhood_features.view( 93 | batch_size*K, neighbourhood_size, -1) 94 | convolved_features = self.convolution(neighbourhood_features, weights) 95 | convolved_features = convolved_features.view(-1, K, self.out_feat_dim) 96 | 97 | return 
convolved_features 98 | 99 | def get_gaussian_weights(self, pseudo_coord): 100 | ''' 101 | ## Inputs: 102 | - pseudo_coord (batch_size, K, K, pseudo_coord_dim) 103 | ## Returns: 104 | - weights (batch_size*K, neighbourhood_size, n_kernels) 105 | ''' 106 | 107 | # compute rho weights 108 | diff = (pseudo_coord[:, :, :, 0].contiguous().view(-1, 1) - self.mean_rho.view(1, -1))**2 109 | weights_rho = torch.exp(-0.5 * diff / 110 | (1e-14 + self.precision_rho.view(1, -1)**2)) 111 | 112 | # compute theta weights 113 | first_angle = torch.abs(pseudo_coord[:, :, :, 1].contiguous().view(-1, 1) - self.mean_theta.view(1, -1)) 114 | second_angle = torch.abs(2 * np.pi - first_angle) 115 | weights_theta = torch.exp(-0.5 * (torch.min(first_angle, second_angle)**2) 116 | / (1e-14 + self.precision_theta.view(1, -1)**2)) 117 | 118 | weights = weights_rho * weights_theta 119 | weights[(weights != weights).detach()] = 0 120 | 121 | # normalise weights 122 | weights = weights / torch.sum(weights, dim=1, keepdim=True) 123 | 124 | return weights 125 | 126 | def convolution(self, neighbourhood, weights): 127 | ''' 128 | ## Inputs: 129 | - neighbourhood (batch_size*K, neighbourhood_size, in_feat_dim) 130 | - weights (batch_size*K, neighbourhood_size, n_kernels) 131 | ## Returns: 132 | - convolved_features (batch_size*K, out_feat_dim) 133 | ''' 134 | # patch operator 135 | weighted_neighbourhood = torch.bmm( 136 | weights.transpose(1, 2), neighbourhood) 137 | 138 | # convolutions 139 | weighted_neighbourhood = [self.conv_weights[i](weighted_neighbourhood[:, i]) for i in range(self.n_kernels)] 140 | convolved_features = torch.cat([i.unsqueeze(1) for i in weighted_neighbourhood], dim=1) 141 | convolved_features = convolved_features.view(-1, self.out_feat_dim) 142 | 143 | return convolved_features 144 | 145 | 146 | class GraphLearner(Module): 147 | def __init__(self, in_feature_dim, combined_feature_dim, K, dropout=0.0): 148 | super(GraphLearner, self).__init__() 149 | 150 | ''' 151 | ## Variables: 152 | - in_feature_dim: dimensionality of input features 153 | - combined_feature_dim: dimensionality of the joint hidden embedding 154 | - K: number of graph nodes/objects on the image 155 | ''' 156 | 157 | # Parameters 158 | self.in_dim = in_feature_dim 159 | self.combined_dim = combined_feature_dim 160 | self.K = K 161 | 162 | # Embedding layers 163 | self.edge_layer_1 = nn.Linear(in_feature_dim, 164 | combined_feature_dim) 165 | self.edge_layer_2 = nn.Linear(combined_feature_dim, 166 | combined_feature_dim) 167 | 168 | # Regularisation 169 | self.dropout = nn.Dropout(p=dropout) 170 | self.edge_layer_1 = nn.utils.weight_norm(self.edge_layer_1) 171 | self.edge_layer_2 = nn.utils.weight_norm(self.edge_layer_2) 172 | 173 | def forward(self, graph_nodes): 174 | ''' 175 | ## Inputs: 176 | - graph_nodes (batch_size, K, in_feat_dim): input features 177 | ## Returns: 178 | - adjacency matrix (batch_size, K, K) 179 | ''' 180 | 181 | graph_nodes = graph_nodes.view(-1, self.in_dim) 182 | 183 | # layer 1 184 | h = self.edge_layer_1(graph_nodes) 185 | h = F.relu(h) 186 | 187 | # layer 2 188 | h = self.edge_layer_2(h) 189 | h = F.relu(h) 190 | 191 | # outer product 192 | h = h.view(-1, self.K, self.combined_dim) 193 | adjacency_matrix = torch.matmul(h, h.transpose(1, 2)) 194 | 195 | return adjacency_matrix 196 | 197 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 
2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 
175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aimbrain/vqa-project/341122a267293017b55db4f033fbe81445af03ea/poster.pdf -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | import os 19 | import json 20 | import argparse 21 | from tqdm import tqdm 22 | 23 | import torch 24 | import torch.nn as nn 25 | from torch.utils.data import DataLoader 26 | from torch.utils.data.sampler import RandomSampler 27 | from torch.optim.lr_scheduler import MultiStepLR 28 | 29 | from sparse_graph_model import Model 30 | from torch_dataset import * 31 | from utils import * 32 | 33 | def eval_model(args): 34 | 35 | """ 36 | Computes the VQA accuracy over the validation set 37 | using a pre-trained model 38 | """ 39 | 40 | # Check that the model path is accurate 41 | if args.model_path and os.path.isfile(args.model_path): 42 | print('Resuming from checkpoint %s' % (args.model_path)) 43 | else: 44 | raise SystemExit('Need to provide model path.') 45 | 46 | # Set random seed 47 | torch.manual_seed(1000) 48 | if torch.cuda.is_available(): 49 | torch.cuda.manual_seed(1000) 50 | else: 51 | raise SystemExit('No CUDA available, script requires cuda') 52 | 53 | # Load the validation set 54 | print('Loading data') 55 | dataset = VQA_Dataset(args.data_dir, args.emb, train=False) 56 | loader = DataLoader(dataset, batch_size=args.bsize, 57 | shuffle=False, num_workers=5, 58 | collate_fn=collate_fn) 59 | 60 | # Print data and model parameters 61 | print('Parameters:\n\t' 62 | 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' 63 | '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, 64 | dataset.feat_dim, 65 | args.hid, 66 | dataset.n_answers)) 67 | # Define the model 68 | model = Model(vocab_size=dataset.q_words, 69 | emb_dim=args.emb, 70 | feat_dim=dataset.feat_dim, 71 | hid_dim=args.hid, 72 | out_dim=dataset.n_answers, 73 | dropout=args.dropout, 74 | pretrained_wemb=dataset.pretrained_wemb, 75 | neighbourhood_size=args.neighbourhood_size) 76 | 77 | # move to CUDA 78 | model = model.cuda() 79 | 80 | # Restore pre-trained model 81 | ckpt = torch.load(args.model_path) 82 | model.load_state_dict(ckpt['state_dict']) 83 | model.train(False) 84 | 85 | # Compute accuracy 86 | result = [] 87 | correct = 0 88 | for step, next_batch in tqdm(enumerate(loader)): 89 | # move batch to cuda 90 | q_batch, _, vote_batch, i_batch, k_batch, qlen_batch = \ 91 | batch_to_cuda(next_batch, volatile=True) 92 | 93 | # get predictions 94 | output, _ = model(q_batch, 
i_batch, k_batch, qlen_batch) 95 | qid_batch = next_batch[3] 96 | _, oix = output.data.max(1) 97 | # record predictions 98 | for i, qid in enumerate(qid_batch): 99 | result.append({ 100 | 'question_id': int(qid.numpy()), 101 | 'answer': dataset.a_itow[oix[i]] 102 | }) 103 | # compute batch accuracy 104 | correct += total_vqa_score(output, vote_batch) 105 | 106 | # compute and print average accuracy 107 | acc = correct/dataset.n_questions*100 108 | print("accuracy: {} %".format(acc)) 109 | 110 | # save predictions 111 | json.dump(result, open('result.json', 'w')) 112 | print('Validation done') 113 | 114 | def train(args): 115 | 116 | """ 117 | Train a VQA model using the training set 118 | """ 119 | 120 | # set random seed 121 | torch.manual_seed(1000) 122 | if torch.cuda.is_available(): 123 | torch.cuda.manual_seed(1000) 124 | else: 125 | raise SystemExit('No CUDA available, script requires cuda') 126 | 127 | # Load the VQA training set 128 | print('Loading data') 129 | dataset = VQA_Dataset(args.data_dir, args.emb) 130 | loader = DataLoader(dataset, batch_size=args.bsize, 131 | shuffle=True, num_workers=5, collate_fn=collate_fn) 132 | 133 | # Load the VQA validation set 134 | dataset_test = VQA_Dataset(args.data_dir, args.emb, train=False) 135 | test_sampler = RandomSampler(dataset_test) 136 | loader_test = iter(DataLoader(dataset_test, 137 | batch_size=args.bsize, 138 | sampler=test_sampler, 139 | shuffle=False, 140 | num_workers=4, 141 | collate_fn=collate_fn)) 142 | 143 | n_batches = len(dataset)//args.bsize 144 | 145 | # Print data and model parameters 146 | print('Parameters:\n\t' 147 | 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' 148 | '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, 149 | dataset.feat_dim, 150 | args.hid, 151 | dataset.n_answers)) 152 | print('Initializing model') 153 | 154 | model = Model(vocab_size=dataset.q_words, 155 | emb_dim=args.emb, 156 | feat_dim=dataset.feat_dim, 157 | hid_dim=args.hid, 158 | out_dim=dataset.n_answers, 159 | dropout=args.dropout, 160 | neighbourhood_size=args.neighbourhood_size, 161 | pretrained_wemb=dataset.pretrained_wemb) 162 | 163 | criterion = nn.MultiLabelSoftMarginLoss() 164 | 165 | # Move it to GPU 166 | model = model.cuda() 167 | criterion = criterion.cuda() 168 | 169 | # Define the optimiser 170 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 171 | 172 | # Continue training from saved model 173 | start_ep = 0 174 | if args.model_path and os.path.isfile(args.model_path): 175 | print('Resuming from checkpoint %s' % (args.model_path)) 176 | ckpt = torch.load(args.model_path) 177 | start_ep = ckpt['epoch'] 178 | model.load_state_dict(ckpt['state_dict']) 179 | optimizer.load_state_dict(ckpt['optimizer']) 180 | 181 | # Update the learning rate 182 | for param_group in optimizer.param_groups: 183 | param_group['lr'] = args.lr 184 | 185 | # Learning rate scheduler 186 | scheduler = MultiStepLR(optimizer, milestones=[30], gamma=0.5) 187 | scheduler.last_epoch = start_ep - 1 188 | 189 | # Train iterations 190 | print('Start training.') 191 | for ep in range(start_ep, start_ep+args.ep): 192 | 193 | scheduler.step() 194 | ep_loss = 0.0 195 | ep_correct = 0.0 196 | ave_loss = 0.0 197 | ave_correct = 0.0 198 | losses = [] 199 | 200 | for step, next_batch in tqdm(enumerate(loader)): 201 | 202 | model.train() 203 | # Move batch to cuda 204 | q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ 205 | batch_to_cuda(next_batch) 206 | 207 | # forward pass 208 | output, adjacency_matrix = 
model( 209 | q_batch, i_batch, k_batch, qlen_batch) 210 | 211 | loss = criterion(output, a_batch) 212 | 213 | # Compute batch accuracy based on vqa evaluation 214 | correct = total_vqa_score(output, vote_batch) 215 | ep_correct += correct 216 | ep_loss += loss.data[0] 217 | ave_correct += correct 218 | ave_loss += loss.data[0] 219 | losses.append(loss.cpu().data[0]) 220 | 221 | # This is a 40 step average 222 | if step % 40 == 0 and step != 0: 223 | print(' Epoch %02d(%03d/%03d), ave loss: %.7f, ave accuracy: %.2f%%' % 224 | (ep+1, step, n_batches, ave_loss/40, 225 | ave_correct * 100 / (args.bsize*40))) 226 | 227 | ave_correct = 0 228 | ave_loss = 0 229 | 230 | # Compute gradient and do optimisation step 231 | optimizer.zero_grad() 232 | loss.backward() 233 | optimizer.step() 234 | 235 | # save model and compute validation accuracy every 400 steps 236 | if step % 400 == 0: 237 | epoch_loss = ep_loss / n_batches 238 | epoch_acc = ep_correct * 100 / (n_batches * args.bsize) 239 | 240 | save(model, optimizer, ep, epoch_loss, epoch_acc, 241 | dir=args.save_dir, name=args.name+'_'+str(ep+1)) 242 | 243 | # compute validation accuracy over a small subset of the validation set 244 | test_correct = 0 245 | model.train(False) 246 | 247 | for i in range(10): 248 | test_batch = next(loader_test) 249 | q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ 250 | batch_to_cuda(test_batch, volatile=True) 251 | output, _ = model(q_batch, i_batch, k_batch, qlen_batch) 252 | test_correct += total_vqa_score(output, vote_batch) 253 | 254 | model.train(True) 255 | acc = test_correct/(10*args.bsize)*100 256 | print("Validation accuracy: {:.2f} %".format(acc)) 257 | 258 | # save model and compute accuracy for epoch 259 | epoch_loss = ep_loss / n_batches 260 | epoch_acc = ep_correct * 100 / (n_batches * args.bsize) 261 | 262 | save(model, optimizer, ep, epoch_loss, epoch_acc, 263 | dir=args.save_dir, name=args.name+'_'+str(ep+1)) 264 | 265 | print('Epoch %02d done, average loss: %.3f, average accuracy: %.2f%%' % ( 266 | ep+1, epoch_loss, epoch_acc)) 267 | 268 | def test(args): 269 | 270 | """ 271 | Creates a result.json for predictions on 272 | the test set 273 | """ 274 | # Check that the model path is accurate 275 | if args.model_path and os.path.isfile(args.model_path): 276 | print('Resuming from checkpoint %s' % (args.model_path)) 277 | else: 278 | raise SystemExit('Need to provide model path.') 279 | 280 | torch.manual_seed(1000) 281 | if torch.cuda.is_available(): 282 | torch.cuda.manual_seed(1000) 283 | else: 284 | raise SystemExit('No CUDA available, script requires CUDA') 285 | 286 | print('Loading data') 287 | dataset = VQA_Dataset_Test(args.data_dir, args.emb, train=False) 288 | loader = DataLoader(dataset, batch_size=args.bsize, 289 | shuffle=False, num_workers=5, 290 | collate_fn=collate_fn) 291 | 292 | # Print data and model parameters 293 | print('Parameters:\n\t' 294 | 'vocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d' 295 | '\n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, 296 | dataset.feat_dim, 297 | args.hid, 298 | dataset.n_answers)) 299 | 300 | # Define model 301 | model = Model(vocab_size=dataset.q_words, 302 | emb_dim=args.emb, 303 | feat_dim=dataset.feat_dim, 304 | hid_dim=args.hid, 305 | out_dim=dataset.n_answers, 306 | dropout=args.dropout, 307 | pretrained_wemb=dataset.pretrained_wemb, 308 | neighbourhood_size=args.neighbourhood_size) 309 | 310 | # move to CUDA 311 | model = model.cuda() 312 | 313 | # Restore pre-trained model 314 | ckpt = 
torch.load(args.model_path) 315 | model.load_state_dict(ckpt['state_dict']) 316 | model.train(False) 317 | 318 | result = [] 319 | for step, next_batch in tqdm(enumerate(loader)): 320 | # Batch preparation 321 | q_batch, _, _, i_batch, k_batch, qlen_batch = \ 322 | batch_to_cuda(next_batch, volatile=True) 323 | 324 | # get predictions 325 | output, _ = model(q_batch, i_batch, k_batch, qlen_batch) 326 | qid_batch = next_batch[3] 327 | _, oix = output.data.max(1) 328 | # record predictions 329 | for i, qid in enumerate(qid_batch): 330 | result.append({ 331 | 'question_id': int(qid.numpy()), 332 | 'answer': dataset.a_itow[oix[i]] 333 | }) 334 | 335 | json.dump(result, open('result.json', 'w')) 336 | print('Testing done') 337 | 338 | def trainval(args): 339 | 340 | """ 341 | Train a VQA model using the training + validation set 342 | """ 343 | 344 | # set random seed 345 | torch.manual_seed(1000) 346 | if torch.cuda.is_available(): 347 | torch.cuda.manual_seed(1000) 348 | else: 349 | raise SystemExit('No CUDA available, script requires CUDA.') 350 | 351 | # load train+val sets for training 352 | print ('Loading data') 353 | dataset = VQA_Dataset_Test(args.data_dir, args.emb) 354 | loader = DataLoader(dataset, batch_size=args.bsize, 355 | shuffle=True, num_workers=5, 356 | collate_fn=collate_fn) 357 | n_batches = len(dataset)//args.bsize 358 | 359 | # Print data and model parameters 360 | print ('Parameters:\n\tvocab size: %d\n\tembedding dim: %d\n\tfeature dim: %d\ 361 | \n\thidden dim: %d\n\toutput dim: %d' % (dataset.q_words, args.emb, dataset.feat_dim, 362 | args.hid, dataset.n_answers)) 363 | print ('Initializing model') 364 | 365 | model = Model(vocab_size=dataset.q_words, 366 | emb_dim=args.emb, 367 | feat_dim=dataset.feat_dim, 368 | hid_dim=args.hid, 369 | out_dim=dataset.n_answers, 370 | dropout=args.dropout, 371 | neighbourhood_size=args.neighbourhood_size, 372 | pretrained_wemb=dataset.pretrained_wemb) 373 | 374 | criterion = nn.MultiLabelSoftMarginLoss() 375 | 376 | # Move it to GPU 377 | model = model.cuda() 378 | criterion = criterion.cuda() 379 | 380 | # Define the optimizer 381 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 382 | 383 | # Continue training from saved model 384 | start_ep = 0 385 | if args.model_path and os.path.isfile(args.model_path): 386 | print ('Resuming from checkpoint %s' % (args.model_path)) 387 | ckpt = torch.load(args.model_path) 388 | start_ep = ckpt['epoch'] 389 | model.load_state_dict(ckpt['state_dict']) 390 | optimizer.load_state_dict(ckpt['optimizer']) 391 | 392 | # ensure you can load with new lr 393 | for param_group in optimizer.param_groups: 394 | param_group['lr'] = args.lr 395 | 396 | # learner rate scheduler 397 | scheduler = MultiStepLR(optimizer, milestones=[30], gamma=0.5) 398 | scheduler.last_epoch = start_ep - 1 399 | 400 | # Training script 401 | print ('Start training.') 402 | for ep in range(start_ep, start_ep+args.ep): 403 | scheduler.step() 404 | ep_loss = 0.0 405 | ep_correct = 0.0 406 | ave_loss = 0.0 407 | ave_correct = 0.0 408 | losses = [] 409 | for step, next_batch in tqdm(enumerate(loader)): 410 | model.train() 411 | # batch to gpu 412 | q_batch, a_batch, vote_batch, i_batch, k_batch, qlen_batch = \ 413 | batch_to_cuda(next_batch) 414 | 415 | # Do model forward 416 | output, adjacency_matrix = model( 417 | q_batch, i_batch, k_batch, qlen_batch) 418 | 419 | loss = criterion(output, a_batch) 420 | 421 | # compute accuracy based on vqa evaluation 422 | correct = total_vqa_score(output, vote_batch) 423 | 
            ep_correct += correct
            ep_loss += loss.data[0]
            ave_correct += correct
            ave_loss += loss.data[0]
            losses.append(loss.cpu().data[0])
            # This is a 40 step average
            if step % 40 == 0 and step != 0:
                print(' Epoch %02d(%03d/%03d), ave loss: %.7f, ave accuracy: %.2f%%' %
                      (ep+1, step, n_batches, ave_loss/40,
                       ave_correct * 100 / (args.bsize*40)))

                ave_correct = 0
                ave_loss = 0

            # compute gradient and do optim step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # save model and compute accuracy for epoch
        epoch_loss = ep_loss / n_batches
        epoch_acc = ep_correct * 100 / (n_batches * args.bsize)

        save(model, optimizer, ep, epoch_loss, epoch_acc,
             dir=args.save_dir, name=args.name+'_'+str(ep+1))

        print('Epoch %02d done, average loss: %.3f, average accuracy: %.2f%%' % (
            ep+1, epoch_loss, epoch_acc))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Conditional Graph Convolutions for VQA')
    parser.add_argument('--train', action='store_true',
                        help='set this to training mode.')
    parser.add_argument('--trainval', action='store_true',
                        help='set this to train+val mode.')
    parser.add_argument('--eval', action='store_true',
                        help='set this to evaluation mode.')
    parser.add_argument('--test', action='store_true',
                        help='set this to test mode.')
    parser.add_argument('--lr', metavar='', type=float,
                        default=1e-4, help='initial learning rate')
    parser.add_argument('--ep', metavar='', type=int,
                        default=40, help='number of epochs.')
    parser.add_argument('--bsize', metavar='', type=int,
                        default=64, help='batch size.')
    parser.add_argument('--hid', metavar='', type=int,
                        default=1024, help='hidden dimension')
    parser.add_argument('--emb', metavar='', type=int, default=300,
                        help='question embedding dimension')
    parser.add_argument('--neighbourhood_size', metavar='', type=int, default=16,
                        help='number of graph neighbours to consider')
    parser.add_argument('--data_dir', metavar='', type=str, default='./data',
                        help='path to data directory')
    parser.add_argument('--save_dir', metavar='', type=str, default='./save',
                        help='path to checkpoint directory')
    parser.add_argument('--name', metavar='', type=str,
                        default='model', help='model name')
    parser.add_argument('--dropout', metavar='', type=float, default=0.5,
                        help='probability of dropping out FC nodes during training')
    parser.add_argument('--model_path', metavar='', type=str,
                        help='trained model path.')
    args, unparsed = parser.parse_known_args()
    if len(unparsed) != 0:
        raise SystemExit('Unknown argument: {}'.format(unparsed))

    if args.train:
        train(args)
    if args.trainval:
        trainval(args)
    if args.eval:
        eval_model(args)
    if args.test:
        test(args)
    if not args.train and not args.eval and not args.trainval and not args.test:
        parser.print_help()
--------------------------------------------------------------------------------
/sparse_graph_model.py:
--------------------------------------------------------------------------------
# Copyright 2018 AimBrain Ltd.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from __future__ import division 16 | from __future__ import print_function 17 | from __future__ import absolute_import 18 | 19 | import torch 20 | import torch.nn as nn 21 | import torch.nn.functional as F 22 | from torch.nn.utils.rnn import pack_padded_sequence 23 | 24 | from layers import NeighbourhoodGraphConvolution as GraphConvolution 25 | from layers import GraphLearner 26 | 27 | 28 | class Model(nn.Module): 29 | 30 | def __init__(self, 31 | vocab_size, 32 | emb_dim, 33 | feat_dim, 34 | hid_dim, 35 | out_dim, 36 | pretrained_wemb, 37 | dropout, 38 | n_kernels=8, 39 | neighbourhood_size=16): 40 | 41 | ''' 42 | ## Variables: 43 | - vocab_size: dimensionality of input vocabulary 44 | - emb_dim: question embedding size 45 | - feat_dim: dimensionality of input image features 46 | - out_dim: dimensionality of the output 47 | - dropout: dropout probability 48 | - n_kernels : number of Gaussian kernels for convolutions 49 | - bias: whether to add a bias to Gaussian kernels 50 | ''' 51 | 52 | super(Model, self).__init__() 53 | 54 | # Set parameters 55 | self.vocab_size = vocab_size 56 | self.emb_dim = emb_dim 57 | self.feat_dim = feat_dim 58 | self.hid_dim = hid_dim 59 | self.out_dim = out_dim 60 | self.neighbourhood_size = neighbourhood_size 61 | 62 | # initialize word embedding layer weight 63 | self.wembed = nn.Embedding(vocab_size, emb_dim) 64 | self.wembed.weight.data.copy_(torch.from_numpy(pretrained_wemb)) 65 | 66 | # question encoding 67 | self.q_lstm = nn.GRU(input_size=emb_dim, hidden_size=hid_dim) 68 | 69 | # graph learner 70 | self.adjacency_1 = GraphLearner(in_feature_dim=feat_dim + hid_dim, 71 | combined_feature_dim=512, 72 | K=36, 73 | dropout=dropout) 74 | 75 | # dropout layers 76 | self.dropout = nn.Dropout(p=dropout) 77 | self.dropout_q = nn.Dropout(p=dropout/2) 78 | 79 | # graph convolution layers 80 | self.graph_convolution_1 = \ 81 | GraphConvolution(feat_dim, hid_dim * 2, n_kernels, 2) 82 | self.graph_convolution_2 = \ 83 | GraphConvolution(hid_dim * 2, hid_dim, n_kernels, 2) 84 | 85 | # output classifier 86 | self.out_1 = nn.utils.weight_norm(nn.Linear(hid_dim, out_dim)) 87 | self.out_2 = nn.utils.weight_norm(nn.Linear(out_dim, out_dim)) 88 | 89 | def forward(self, question, image, K, qlen): 90 | ''' 91 | ## Inputs: 92 | - question (batch_size, max_qlen): input tokenised question 93 | - image (batch_size, K, feat_dim): input image features 94 | - K (int): number of image features/objects in the image 95 | - qlen (batch_size): vector describing the length (in words) of each input question 96 | ## Returns: 97 | - logits (batch_size, out_dim) 98 | ''' 99 | 100 | K = int(K[0].cpu().data.numpy()) 101 | 102 | # extract bounding boxes and compute centres 103 | bb = image[:, :, -4:].contiguous() 104 | bb_size = (bb[:, :, 2:]-bb[:, :, :2]) 105 | bb_centre = bb[:, :, :2] + 0.5*bb_size 106 | 107 | # apply dropout to image features 108 | image = self.dropout(image) 109 | 110 | # Compute pseudo coordinates 111 | pseudo_coord = self._compute_pseudo(bb_centre) 112 | 113 | # Compute question encoding 114 | emb = self.wembed(question) 115 | 
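        # Note (added comment): pack_padded_sequence below assumes each batch is
        # already sorted by descending question length; collate_fn in
        # torch_dataset.py performs that sort (batch.sort(key=lambda x: x[-1],
        # reverse=True)), so the qlen list passed into forward() is in
        # descending order. emb is (batch_size, max_qlen, emb_dim), hence
        # batch_first=True.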
packed = pack_padded_sequence(emb, qlen, batch_first=True) # questions have variable lengths 116 | _, hid = self.q_lstm(packed) 117 | qenc = hid[0].unsqueeze(1) 118 | qenc_repeat = qenc.repeat(1, K, 1) 119 | 120 | # Learn adjacency matrix 121 | image_qenc_cat = torch.cat((image, qenc_repeat), dim=-1) 122 | adjacency_matrix = self.adjacency_1(image_qenc_cat) 123 | 124 | # Graph convolution 1 125 | neighbourhood_image, neighbourhood_pseudo = self._create_neighbourhood(image, 126 | pseudo_coord, 127 | adjacency_matrix, 128 | self.neighbourhood_size, 129 | weight=True) 130 | hidden_graph_1 = self.graph_convolution_1( 131 | neighbourhood_image, neighbourhood_pseudo) 132 | hidden_graph_1 = F.relu(hidden_graph_1) 133 | hidden_graph_1 = self.dropout(hidden_graph_1) 134 | 135 | # graph convolution 2 136 | hidden_graph_1, neighbourhood_pseudo = self._create_neighbourhood(hidden_graph_1, 137 | pseudo_coord, 138 | adjacency_matrix, 139 | self.neighbourhood_size, 140 | weight=False) 141 | hidden_graph_2 = self.graph_convolution_2( 142 | hidden_graph_1, neighbourhood_pseudo) 143 | hidden_graph_2 = F.relu(hidden_graph_2) 144 | 145 | hidden_graph_2, _ = torch.max(hidden_graph_2, dim=1) 146 | h = F.relu(qenc).squeeze(1)*hidden_graph_2 147 | 148 | # Output classifier 149 | hidden_1 = self.out_1(h) 150 | hidden_1 = F.relu(hidden_1) 151 | hidden_1 = self.dropout(hidden_1) 152 | logits = self.out_2(hidden_1) 153 | 154 | return logits, adjacency_matrix 155 | 156 | def _create_neighbourhood_feat(self, image, top_ind): 157 | ''' 158 | ## Inputs: 159 | - image (batch_size, K, feat_dim) 160 | - top_ind (batch_size, K, neighbourhood_size) 161 | ## Returns: 162 | - neighbourhood_image (batch_size, K, neighbourhood_size, feat_dim) 163 | ''' 164 | 165 | batch_size = image.size(0) 166 | K = image.size(1) 167 | feat_dim = image.size(2) 168 | neighbourhood_size = top_ind.size(-1) 169 | image = image.unsqueeze(1).expand(batch_size, K, K, feat_dim) 170 | idx = top_ind.unsqueeze(-1).expand(batch_size, 171 | K, neighbourhood_size, feat_dim) 172 | return torch.gather(image, dim=2, index=idx) 173 | 174 | def _create_neighbourhood_pseudo(self, pseudo, top_ind): 175 | ''' 176 | ## Inputs: 177 | - pseudo_coord (batch_size, K, K, coord_dim) 178 | - top_ind (batch_size, K, neighbourhood_size) 179 | ## Returns: 180 | - neighbourhood_pseudo (batch_size, K, neighbourhood_size, coord_dim) 181 | ''' 182 | batch_size = pseudo.size(0) 183 | K = pseudo.size(1) 184 | coord_dim = pseudo.size(3) 185 | neighbourhood_size = top_ind.size(-1) 186 | idx = top_ind.unsqueeze(-1).expand(batch_size, 187 | K, neighbourhood_size, coord_dim) 188 | return torch.gather(pseudo, dim=2, index=idx) 189 | 190 | def _create_neighbourhood(self, 191 | features, 192 | pseudo_coord, 193 | adjacency_matrix, 194 | neighbourhood_size, 195 | weight=True): 196 | 197 | ''' 198 | 199 | Creates a neighbourhood system for each graph node/image object 200 | 201 | ## Inputs: 202 | - features (batch_size, K, feat_dim): input image features 203 | - pseudo_coord (batch_size, K, K, coord_dim): pseudo coordinates for graph convolutions 204 | - adjacency_matrix (batch_size, K, K): learned adjacency matrix 205 | - neighbourhood_size (int) 206 | - weight (bool): specify if the features should be weighted by the adjacency matrix values 207 | 208 | ## Returns: 209 | - neighbourhood_image (batch_size, K, neighbourhood_size, feat_dim) 210 | - neighbourhood_pseudo (batch_size, K, neighbourhood_size, coord_dim) 211 | ''' 212 | 213 | # Number of graph nodes 214 | K = features.size(1) 215 | 
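        # Note (added comment): adjacency_matrix is (batch_size, K, K); row i
        # scores how relevant every other object is to object i, conditioned on
        # the question (it was computed from image features concatenated with
        # the question encoding). Below, only the neighbourhood_size strongest
        # entries per row are kept and softmax-normalised, so each node attends
        # to a small weighted neighbourhood rather than a dense graph.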
        # extract top k neighbours for each node and normalise
        top_k, top_ind = torch.topk(
            adjacency_matrix, k=neighbourhood_size, dim=-1, sorted=False)
        top_k = torch.stack([F.softmax(top_k[:, k]) for k in range(K)]).transpose(0, 1)  # (batch_size, K, neighbourhood_size)

        # extract top k features and pseudo coordinates
        neighbourhood_image = \
            self._create_neighbourhood_feat(features, top_ind)
        neighbourhood_pseudo = \
            self._create_neighbourhood_pseudo(pseudo_coord, top_ind)

        # weight neighbourhood features with graph edge weights
        if weight:
            neighbourhood_image = top_k.unsqueeze(-1)*neighbourhood_image

        return neighbourhood_image, neighbourhood_pseudo

    def _compute_pseudo(self, bb_centre):
        '''
        Computes pseudo-coordinates from bounding box centre coordinates

        ## Inputs:
        - bb_centre (batch_size, K, coord_dim): bounding box centres
        ## Returns:
        - pseudo_coord (batch_size, K, K, coord_dim): pairwise offsets in polar form
        '''

        K = bb_centre.size(1)

        # Compute cartesian coordinates (batch_size, K, K, 2)
        pseudo_coord = bb_centre.view(-1, K, 1, 2) - \
            bb_centre.view(-1, 1, K, 2)

        # Convert to polar coordinates
        rho = torch.sqrt(
            pseudo_coord[:, :, :, 0]**2 + pseudo_coord[:, :, :, 1]**2)
        theta = torch.atan2(
            pseudo_coord[:, :, :, 0], pseudo_coord[:, :, :, 1])
        pseudo_coord = torch.cat(
            (torch.unsqueeze(rho, -1), torch.unsqueeze(theta, -1)), dim=-1)

        return pseudo_coord
--------------------------------------------------------------------------------
/torch_dataset.py:
--------------------------------------------------------------------------------
# Copyright 2018 AimBrain Ltd.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
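
# Note (added comment): judging from the loaders below, the dataset classes in
# this module expect the following files under data_dir:
#   train_q_dict.p, train_a_dict.p                        question/answer vocabularies
#   trainval.zarr, trainval_boxes.zarr                    image features and bounding boxes
#   trainval_image_size.csv                               image sizes used to normalise boxes
#   test.zarr, test_boxes.zarr, test_image_size.csv       test-set equivalents
#   vqa_train_final_3000.json, vqa_val_final_3000.json,
#   vqa_test_toked.json                                   tokenised questions and answers
#   glove.6B.<emb_dim>d.txt                               pre-trained GloVe vectors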
14 | 15 | from __future__ import absolute_import, division, print_function 16 | 17 | import os 18 | import json 19 | import numpy as np 20 | import zarr 21 | import pandas as pd 22 | from torch.utils.data import Dataset 23 | from torch.utils.data import dataloader 24 | 25 | try: 26 | import cPickle as pickle 27 | except: 28 | import pickle as pickle 29 | 30 | 31 | def collate_fn(batch): 32 | # put question lengths in descending order so that we can use packed sequences later 33 | batch.sort(key=lambda x: x[-1], reverse=True) 34 | return dataloader.default_collate(batch) 35 | 36 | 37 | class VQA_Dataset(Dataset): 38 | 39 | def __init__(self, data_dir, emb_dim=300, train=True): 40 | 41 | # Set parameters 42 | self.data_dir = data_dir # directory where the data is stored 43 | self.emb_dim = emb_dim # question embedding dimension 44 | self.train = train # train (True) or eval (False) mode 45 | self.seqlen = 14 # maximum question sequence length 46 | 47 | # Load training question dictionary 48 | q_dict = pickle.load( 49 | open(os.path.join(data_dir, 'train_q_dict.p'), 'rb')) 50 | self.q_itow = q_dict['itow'] 51 | self.q_wtoi = q_dict['wtoi'] 52 | self.q_words = len(self.q_itow) + 1 53 | 54 | # Load training answer dictionary 55 | a_dict = pickle.load( 56 | open(os.path.join(data_dir, 'train_a_dict.p'), 'rb')) 57 | self.a_itow = a_dict['itow'] 58 | self.a_wtoi = a_dict['wtoi'] 59 | self.n_answers = len(self.a_itow) + 1 60 | 61 | # Load image features and bounding boxes 62 | self.i_feat = zarr.open(os.path.join( 63 | data_dir, 'trainval.zarr'), mode='r') 64 | self.bbox = zarr.open(os.path.join( 65 | data_dir, 'trainval_boxes.zarr'), mode='r') 66 | self.sizes = pd.read_csv(os.path.join( 67 | data_dir, 'trainval_image_size.csv')) 68 | 69 | # Load questions 70 | if train: 71 | self.vqa = json.load( 72 | open(os.path.join(data_dir, 'vqa_train_final_3000.json'))) 73 | else: 74 | self.vqa = json.load( 75 | open(os.path.join(data_dir, 'vqa_val_final_3000.json'))) 76 | 77 | self.n_questions = len(self.vqa) 78 | 79 | print('Loading done') 80 | self.feat_dim = self.i_feat[list(self.i_feat.keys())[ 81 | 0]].shape[1] + 4 # + bbox 82 | self.init_pretrained_wemb(emb_dim) 83 | 84 | def init_pretrained_wemb(self, emb_dim): 85 | """ 86 | From blog.keras.io 87 | Initialises words embeddings with pre-trained GLOVE embeddings 88 | """ 89 | embeddings_index = {} 90 | f = open(os.path.join(self.data_dir, 'glove.6B.') + 91 | str(emb_dim) + 'd.txt') 92 | for line in f: 93 | values = line.split() 94 | word = values[0] 95 | coefs = np.asarray(values[1:], dtype=np.float32) 96 | embeddings_index[word] = coefs 97 | f.close() 98 | 99 | embedding_mat = np.zeros((self.q_words, emb_dim), dtype=np.float32) 100 | for word, i in self.q_wtoi.items(): 101 | embedding_v = embeddings_index.get(word) 102 | if embedding_v is not None: 103 | embedding_mat[i] = embedding_v 104 | 105 | self.pretrained_wemb = embedding_mat 106 | 107 | def __len__(self): 108 | return self.n_questions 109 | 110 | def __getitem__(self, idx): 111 | 112 | # question sample 113 | qlen = len(self.vqa[idx]['question_toked']) 114 | q = [0] * 100 115 | for i, w in enumerate(self.vqa[idx]['question_toked']): 116 | try: 117 | q[i] = self.q_wtoi[w] 118 | except: 119 | q[i] = 0 # validation questions may contain unseen word 120 | 121 | # soft label answers 122 | a = np.zeros(self.n_answers, dtype=np.float32) 123 | for w, c in self.vqa[idx]['answers_w_scores']: 124 | try: 125 | a[self.a_wtoi[w]] = c 126 | except: 127 | continue 128 | 129 | # number of votes for each answer 
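        # (added comment: the raw vote counts are kept separately from the soft
        # scores above because the VQA metric credits a prediction with
        # min(#annotators who gave that answer / 3, 1); total_vqa_score in
        # utils.py consumes these counts)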
130 | n_votes = np.zeros(self.n_answers, dtype=np.float32) 131 | for w, c in self.vqa[idx]['answers']: 132 | try: 133 | n_votes[self.a_wtoi[w]] = c 134 | except: 135 | continue 136 | 137 | # id of the question 138 | qid = self.vqa[idx]['question_id'] 139 | 140 | # image sample 141 | iid = self.vqa[idx]['image_id'] 142 | img = self.i_feat[str(iid)] 143 | bboxes = np.asarray(self.bbox[str(iid)]) 144 | imsize = self.sizes[str(iid)] 145 | 146 | if np.logical_not(np.isfinite(img)).sum() > 0: 147 | raise ValueError 148 | 149 | # number of image objects 150 | k = 36 151 | 152 | # scale bounding boxes by image dimensions 153 | for i in range(k): 154 | bbox = bboxes[i] 155 | bbox[0] /= imsize[0] 156 | bbox[1] /= imsize[1] 157 | bbox[2] /= imsize[0] 158 | bbox[3] /= imsize[1] 159 | bboxes[i] = bbox 160 | 161 | # format variables 162 | q = np.asarray(q) 163 | a = np.asarray(a).reshape(-1) 164 | n_votes = np.asarray(n_votes).reshape(-1) 165 | qid = np.asarray(qid).reshape(-1) 166 | i = np.concatenate([img, bboxes], axis=1) 167 | k = np.asarray(k).reshape(1) 168 | 169 | return q, a, n_votes, qid, i, k, qlen 170 | 171 | 172 | class VQA_Dataset_Test(Dataset): 173 | 174 | def __init__(self, data_dir, emb_dim=300, train=True): 175 | self.data_dir = data_dir 176 | self.emb_dim = emb_dim 177 | self.train = train 178 | self.seqlen = 14 # hard set based on paper 179 | 180 | q_dict = pickle.load( 181 | open(os.path.join(data_dir, 'train_q_dict.p'), 'rb')) 182 | self.q_itow = q_dict['itow'] 183 | self.q_wtoi = q_dict['wtoi'] 184 | self.q_words = len(self.q_itow) + 1 185 | 186 | a_dict = pickle.load( 187 | open(os.path.join(data_dir, 'train_a_dict.p'), 'rb')) 188 | self.a_itow = a_dict['itow'] 189 | self.a_wtoi = a_dict['wtoi'] 190 | self.n_answers = len(self.a_itow) + 1 191 | 192 | if train: 193 | self.vqa = json.load(open(os.path.join(data_dir, 'vqa_train_final_3000.json'))) + \ 194 | json.load( 195 | open(os.path.join(data_dir, 'vqa_val_final_3000.json'))) 196 | self.i_feat = zarr.open(os.path.join( 197 | data_dir, 'trainval.zarr'), mode='r') 198 | self.bbox = zarr.open(os.path.join( 199 | data_dir, 'trainval_boxes.zarr'), mode='r') 200 | self.sizes = pd.read_csv(os.path.join( 201 | data_dir, 'trainval_image_size.csv')) 202 | else: 203 | self.vqa = json.load( 204 | open(os.path.join(data_dir, 'vqa_test_toked.json'))) 205 | self.i_feat = zarr.open(os.path.join( 206 | data_dir, 'test.zarr'), mode='r') 207 | self.bbox = zarr.open(os.path.join( 208 | data_dir, 'test_boxes.zarr'), mode='r') 209 | self.sizes = pd.read_csv(os.path.join( 210 | data_dir, 'test_image_size.csv')) 211 | 212 | self.n_questions = len(self.vqa) 213 | 214 | print('Loading done') 215 | self.feat_dim = self.i_feat[list(self.i_feat.keys())[ 216 | 0]].shape[1] + 4 # + bbox 217 | self.init_pretrained_wemb(emb_dim) 218 | 219 | def init_pretrained_wemb(self, emb_dim): 220 | """From blog.keras.io""" 221 | embeddings_index = {} 222 | f = open(os.path.join(self.data_dir, 'glove.6B.') + 223 | str(emb_dim) + 'd.txt') 224 | for line in f: 225 | values = line.split() 226 | word = values[0] 227 | coefs = np.asarray(values[1:], dtype=np.float32) 228 | embeddings_index[word] = coefs 229 | f.close() 230 | 231 | embedding_mat = np.zeros((self.q_words, emb_dim), dtype=np.float32) 232 | for word, i in self.q_wtoi.items(): 233 | embedding_v = embeddings_index.get(word) 234 | if embedding_v is not None: 235 | embedding_mat[i] = embedding_v 236 | 237 | self.pretrained_wemb = embedding_mat 238 | 239 | def __len__(self): 240 | return self.n_questions 241 | 242 | def 
__getitem__(self, idx): 243 | 244 | # question sample 245 | qlen = len(self.vqa[idx]['question_toked']) 246 | q = [0] * 100 247 | for i, w in enumerate(self.vqa[idx]['question_toked']): 248 | try: 249 | q[i] = self.q_wtoi[w] 250 | except: 251 | q[i] = 0 # validation questions may contain unseen word 252 | 253 | # soft label answers 254 | if self.train: 255 | a = np.zeros(self.n_answers, dtype=np.float32) 256 | for w, c in self.vqa[idx]['answers_w_scores']: 257 | try: 258 | a[self.a_wtoi[w]] = c 259 | except: 260 | continue 261 | a = np.asarray(a).reshape(-1) 262 | else: 263 | # return 0's for unknown test set answers 264 | a = 0 265 | 266 | # votes 267 | if self.train: 268 | n_votes = np.zeros(self.n_answers, dtype=np.float32) 269 | for w, c in self.vqa[idx]['answers']: 270 | try: 271 | n_votes[self.a_wtoi[w]] = c 272 | except: 273 | continue 274 | n_votes = np.asarray(n_votes).reshape(-1) 275 | else: 276 | # return 0's for unknown test set answers 277 | n_votes = 0 278 | 279 | # id of the question 280 | qid = self.vqa[idx]['question_id'] 281 | 282 | # image sample 283 | iid = self.vqa[idx]['image_id'] 284 | img = self.i_feat[str(iid)] 285 | bboxes = np.asarray(self.bbox[str(iid)]) 286 | imsize = self.sizes[str(iid)] 287 | 288 | if np.logical_not(np.isfinite(img)).sum() > 0: 289 | raise ValueError 290 | 291 | # k sample 292 | k = 36 293 | 294 | # scale bounding boxes by image dimensions 295 | for i in range(k): 296 | bbox = bboxes[i] 297 | bbox[0] /= imsize[0] 298 | bbox[1] /= imsize[1] 299 | bbox[2] /= imsize[0] 300 | bbox[3] /= imsize[1] 301 | bboxes[i] = bbox 302 | 303 | # format 304 | q = np.asarray(q) 305 | qid = np.asarray(qid).reshape(-1) 306 | i = np.concatenate([img, bboxes], axis=1) 307 | k = np.asarray(k).reshape(1) 308 | 309 | return q, a, n_votes, qid, i, k, qlen 310 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 AimBrain Ltd. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
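
# Note (added comment): helpers shared by run.py. A dataset item is the tuple
# (q, a, n_votes, qid, i, k, qlen) produced by torch_dataset.py; batch_to_cuda
# below moves everything except qid (index 3) to the GPU, and run.py reads the
# question ids directly from the CPU batch.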
14 | 15 | import os 16 | import torch 17 | from torch.autograd import Variable 18 | 19 | 20 | def batch_to_cuda(batch, volatile=False): 21 | # moves dataset batch on GPU 22 | 23 | q = Variable(batch[0], volatile=volatile, requires_grad=False).cuda() 24 | a = Variable(batch[1], volatile=volatile, requires_grad=False).cuda() 25 | n_votes = Variable(batch[2], volatile=volatile, requires_grad=False).cuda() 26 | i = Variable(batch[4], volatile=volatile, requires_grad=False).cuda() 27 | k = Variable(batch[5], volatile=volatile, requires_grad=False).cuda() 28 | qlen = list(batch[6]) 29 | return q, a, n_votes, i, k, qlen 30 | 31 | 32 | def save(model, optimizer, ep, epoch_loss, epoch_acc, dir, name): 33 | # saves model and optimizer state 34 | 35 | tbs = { 36 | 'epoch': ep + 1, 37 | 'loss': epoch_loss, 38 | 'accuracy': epoch_acc, 39 | 'state_dict': model.state_dict(), 40 | 'optimizer': optimizer.state_dict() 41 | } 42 | torch.save(tbs, os.path.join(dir, name + '.pth.tar')) 43 | 44 | 45 | def total_vqa_score(output_batch, n_votes_batch): 46 | # computes the total vqa score as assessed by the challenge 47 | 48 | vqa_score = 0 49 | _, oix = output_batch.data.max(1) 50 | for i, pred in enumerate(oix): 51 | count = n_votes_batch[i,pred] 52 | vqa_score += min(count.cpu().data[0]/3, 1) 53 | return vqa_score 54 | --------------------------------------------------------------------------------
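
A minimal usage sketch (not part of the repository) for inspecting the result.json file that run.py --test writes; it assumes the test run has finished and result.json is in the working directory:

import json

# result.json is a list of {"question_id": int, "answer": str} records,
# one per test question, as written at the end of test() in run.py.
with open('result.json') as f:
    predictions = json.load(f)

print('%d answers predicted' % len(predictions))
print(predictions[0])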